# Source snapshot: May 28, 2020, 10:00 AM
1import csv
2import datetime
3import hashlib
4import json
5import sys
6import time
7from pathlib import Path
8from urllib.error import HTTPError
9
10import data_extraction.dict_utils as dict_utils
11import data_extraction.map_wd_response as map_wd_response
12from data_extraction.constants import *
13from data_extraction.request_utils import send_http_request
14from shared.utils import (chunks, create_new_path, language_config_to_list,
15 setup_logger)
16from SPARQLWrapper import JSON, SPARQLWrapper
17
# Development mode: when True, extraction stops after DEV_CHUNK_LIMIT chunks
# per artwork type (can be overridden via the "-d <n>" CLI flag below).
DEV = True
DEV_CHUNK_LIMIT = 2  # Not entry but chunks of 50
# QIDs of superclasses already fetched by get_classes(); prevents re-fetching
# and unbounded recursion when resolving the superclass hierarchy.
already_extracted_superclass_ids = set()

# Module-level logger writing to logs/<GET_WIKIDATA_ITEMS_LOG_FILENAME>.
logger = setup_logger(
    "data_extraction.get_wikidata_items",
    Path(__file__).parent.parent.absolute()
    / "logs" / GET_WIKIDATA_ITEMS_LOG_FILENAME,
)
27
28
def extract_art_ontology():
    """ Extracts *.csv and *.JSON files for artworks from Wikidata """

    # QIDs crawled so far; shared across categories so each item is fetched once
    already_crawled_wikidata_items = set()

    for plural_name, class_qid in [
        (DRAWING[PLURAL], DRAWING[ID]),
        (SCULPTURE[PLURAL], SCULPTURE[ID]),
        (PAINTING[PLURAL], PAINTING[ID]),
    ]:
        extracted_artwork = extract_artworks(
            plural_name, class_qid, already_crawled_wikidata_items
        )

        # Persist each category both as CSV and as JSON
        generate_csv(
            plural_name,
            extracted_artwork,
            get_fields(plural_name),
            create_new_path(ARTWORK[PLURAL], plural_name, CSV),
        )
        generate_json(
            plural_name,
            extracted_artwork,
            create_new_path(ARTWORK[PLURAL], plural_name, JSON),
        )

    merged_artworks = merge_artworks()

    # One combined CSV over all categories, with an extra type column
    generate_csv(
        ARTWORK[PLURAL],
        merged_artworks,
        get_fields(ARTWORK[PLURAL]) + [TYPE],
        create_new_path(ARTWORK[PLURAL], file_type=CSV),
    )

    # Get motifs and main subjects
    motifs = extract_motifs_and_main_subjects(merged_artworks)

    # Get extracted genres, materials, etc.
    genres, materials, movements, artists, locations = bundle_extract_data_calls(
        [
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
        ],
        merged_artworks,
    )

    # Get distinct classes from artworks, motifs, etc.
    extracted_classes = get_distinct_extracted_classes(
        merged_artworks, motifs, genres, materials, movements, artists, locations,
    )

    # Resolve country QIDs to labels on locations, artworks and movements
    (
        locations,
        merged_artworks,
        movements,
    ) = get_country_labels_for_merged_artworks_and_locations(
        locations, merged_artworks, movements
    )

    # Resolve label-bearing artist attributes (gender, places, citizenship)
    artists = get_labels_for_artists(
        artists, [GENDER, PLACE_OF_BIRTH, PLACE_OF_DEATH, CITIZENSHIP]
    )

    # Resolve dimension unit QIDs to printable unit symbols
    unit_symbols = get_unit_symbols(get_unit_symbols_from_qid(merged_artworks))
    resolve_unit_id_to_unit_symbol(merged_artworks, unit_symbols)

    # Write everything to the JSON output files
    write_data_to_json(
        motifs,
        genres,
        extracted_classes,
        materials,
        movements,
        locations,
        merged_artworks,
        artists,
    )
110
111
def extract_artworks(
    type_name,
    wikidata_id,
    already_crawled_wikidata_items,
    languageKeys=[item[0] for item in language_config_to_list()],
):
    """Extracts artworks metadata from Wikidata and stores them in a dictionary.

    type_name -- e.g., 'drawings', will be used as filename
    wikidata_id -- e.g., 'wd:Q93184' Wikidata ID of a class; all instances of this class and all subclasses with label, artist, and image will be loaded.
    already_crawled_wikidata_items -- set of QIDs crawled in a previous category; it is
        extended in place so later categories skip these items
    languageKeys -- e.g, list('en', 'de')
        NOTE: evaluated once at import time (mutable default); it is never mutated here.

    Returns a list of dictionaries, one per artwork.

    Examples:
    extract_artworks('drawings', 'wd:Q93184', '('en', 'de'))
    extract_artworks('sculptures', 'wd:Q860861', '('en', 'de'))
    extract_artworks('paintings', 'wd:Q3305213', '('en', 'de'))
    """
    print(datetime.datetime.now(), "Starting with", type_name)

    extract_dicts = []
    chunk_count = 0
    item_count = 0
    artwork_ids = query_artwork_qids(type_name, wikidata_id)

    # Don't load items again, if they were loaded in another artwork category.
    # BUGFIX: the previous code removed elements from artwork_ids while
    # iterating over it, which silently skips the element following each
    # removal. Build a filtered list instead.
    artwork_ids = [
        artwork_id
        for artwork_id in artwork_ids
        if artwork_id not in already_crawled_wikidata_items
    ]

    print(
        f"{len(artwork_ids)} {type_name} entries are not loaded yet, starting now. Already crawled item count is {len(already_crawled_wikidata_items)}"
    )
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    artwork_id_chunks = chunks(artwork_ids, chunk_size)
    for chunk in artwork_id_chunks:
        # In DEV mode only extract a limited number of chunks per category
        if DEV and chunk_count == DEV_CHUNK_LIMIT:
            logger.info(
                f"DEV_CHUNK_LIMIT of {type_name} reached. End extraction for {type_name}"
            )
            break

        query_result = wikidata_entity_request(chunk)
        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
                # How to get image url
                # https://stackoverflow.com/questions/34393884/how-to-get-image-url-property-from-wikidata-item-by-api
                image = get_image_url_by_name(
                    result[CLAIMS][PROPERTY_NAME_TO_PROPERTY_ID[IMAGE]][0][MAINSNAK][
                        DATAVALUE
                    ][VALUE]
                )
            except Exception as error:
                # Items without QID or image are skipped entirely
                logger.error(
                    "Error on qid or image, skipping item. Error: {0}".format(
                        error)
                )
                continue

            label = dict_utils.try_get_label_or_description(
                result, LABEL[PLURAL], EN)
            description = dict_utils.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN)

            # All QID-reference attributes share one extraction helper
            (
                classes,
                artists,
                locations,
                genres,
                movements,
                materials,
                motifs,
                main_subjects,
            ) = get_attribute_values_with_try_get_func(
                result,
                [
                    CLASS[SINGULAR],
                    ARTIST[SINGULAR],
                    LOCATION[SINGULAR],
                    GENRE[SINGULAR],
                    MOVEMENT[SINGULAR],
                    MATERIAL[SINGULAR],
                    MOTIF[SINGULAR],
                    MAIN_SUBJECT[SINGULAR],
                ],
                dict_utils.try_get_qid_reference_list,
            )

            iconclasses = dict_utils.try_get_value_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[ICONCLASS[SINGULAR]]
            )
            inception = dict_utils.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[INCEPTION]
            )
            country = dict_utils.try_get_first_qid(
                result, PROPERTY_NAME_TO_PROPERTY_ID[COUNTRY])

            # Resolve dimensions
            # The units are qids which have to be resolved later
            # (see resolve_unit_id_to_unit_symbol)
            height, width, length, diameter = get_attribute_values_with_try_get_func(
                result, [HEIGHT, WIDTH, LENGTH,
                         DIAMETER], dict_utils.try_get_dimension_value,
            )
            (
                height_unit,
                width_unit,
                length_unit,
                diameter_unit,
            ) = get_attribute_values_with_try_get_func(
                result, [HEIGHT, WIDTH, LENGTH,
                         DIAMETER], dict_utils.try_get_dimension_unit,
            )

            artwork_dictionary = {
                ID: qid,
                CLASS[PLURAL]: classes,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                IMAGE: image,
                ARTIST[PLURAL]: artists,
                LOCATION[PLURAL]: locations,
                GENRE[PLURAL]: genres,
                MOVEMENT[PLURAL]: movements,
                INCEPTION: inception,
                MATERIAL[PLURAL]: materials,
                MOTIF[PLURAL]: motifs,
                COUNTRY: country,
                HEIGHT: height,
                HEIGHT_UNIT: height_unit,
                WIDTH: width,
                WIDTH_UNIT: width_unit,
                LENGTH: length,
                LENGTH_UNIT: length_unit,
                DIAMETER: diameter,
                DIAMETER_UNIT: diameter_unit,
                ICONCLASS[PLURAL]: iconclasses,
                MAIN_SUBJECT[PLURAL]: main_subjects,
            }

            # Add per-language label, description and wikipedia link columns
            for langkey in languageKeys:
                label_lang = dict_utils.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey
                )
                description_lang = dict_utils.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey
                )
                wikipedia_link_lang = dict_utils.try_get_wikipedia_link(
                    result, langkey)
                artwork_dictionary.update(
                    {
                        f"{LABEL[SINGULAR]}_{langkey}": label_lang,
                        f"{DESCRIPTION[SINGULAR]}_{langkey}": description_lang,
                        f"{WIKIPEDIA_LINK}_{langkey}": wikipedia_link_lang,
                    }
                )
            extract_dicts.append(artwork_dictionary)
            already_crawled_wikidata_items.add(qid)

        item_count += len(chunk)
        print(
            f"Status of {type_name}: {item_count}/{len(artwork_ids)}",
            end="\r",
            flush=True,
        )

        chunk_count += 1

    print(datetime.datetime.now(), "Finished with", type_name)
    return extract_dicts
285
286
def merge_artworks():
    """ Merges artworks from files 'paintings.json', 'drawings.json',
    'sculptures.json' (function extract_artworks) and
    stores them in a dictionary.

    Returns the merged, deduplicated list of artwork dictionaries; every
    entry is tagged with TYPE = artwork.
    """
    print(datetime.datetime.now(), "Starting with", "merging artworks")
    # QIDs seen so far, used for duplicate detection across the three files
    seen_qids = set()
    file_names = [
        f"{PAINTING[PLURAL]}.{JSON}",
        f"{DRAWING[PLURAL]}.{JSON}",
        f"{SCULPTURE[PLURAL]}.{JSON}",
    ]
    file_paths = [
        create_new_path(ARTWORK[PLURAL], subpath=file_name) for file_name in file_names
    ]
    extract_dicts = []

    for file_path in file_paths:
        # Renamed former locals 'input' and 'object' which shadowed builtins
        with open(file_path, encoding="utf-8") as input_file:
            artwork_objects = json.load(input_file)
        for artwork_object in artwork_objects:
            if artwork_object[ID] not in seen_qids:  # remove duplicates
                artwork_object[TYPE] = ARTWORK[SINGULAR]
                extract_dicts.append(artwork_object)
                seen_qids.add(artwork_object[ID])

    print(datetime.datetime.now(), "Finished with", "merging artworks")
    print()
    return extract_dicts
316
317
def wikidata_entity_request(
    qids,
    languageKeys=[item[0] for item in language_config_to_list()],
    props=[CLAIMS, DESCRIPTION[PLURAL], LABEL[PLURAL], SITELINKS],
    timeout=TIMEOUT,
    sleep_time=SLEEP_TIME,
    maxlag=MAX_LAG,
):
    """ Represents one artwork request for n-items
    The API specifies that 50 items can be loaded at once without needing additional permissions:
    https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
    """
    # Restrict sitelinks to the wikis of the configured languages
    wiki_sites = [f"{key}wiki" for key in languageKeys]
    parameters = {
        "action": "wbgetentities",
        "ids": "|".join(qids),
        "format": JSON,
        "languages": "|".join(languageKeys),
        "sitefilter": "|".join(wiki_sites),
        "props": "|".join(props),
        # if the server needs more than maxlag seconds to process
        # the query an error response is returned
        "maxlag": maxlag,
    }

    return send_http_request(
        parameters,
        HTTP_HEADER,
        WIKIDATA_API_URL,
        logger,
        initial_timeout=timeout,
        items=qids,
        timeout=timeout,
        sleep_time=sleep_time,
        maxlag=maxlag,
    )
356
357
def get_fields(type_name, languageKeys=[item[0] for item in language_config_to_list()]):
    """ Returns all fields / columns for a specific type, e. g. 'artworks' """
    # Columns shared by every entity type
    fields = [ID, CLASS[PLURAL], LABEL[SINGULAR], DESCRIPTION[SINGULAR], IMAGE]
    for langkey in languageKeys:
        fields.extend(
            [
                f"{LABEL[SINGULAR]}_{langkey}",
                f"{DESCRIPTION[SINGULAR]}_{langkey}",
                f"{WIKIPEDIA_LINK}_{langkey}",
            ]
        )

    artwork_types = (
        DRAWING[PLURAL],
        SCULPTURE[PLURAL],
        PAINTING[PLURAL],
        ARTWORK[PLURAL],
    )
    if type_name in artwork_types:
        fields.extend(
            [
                ARTIST[PLURAL],
                LOCATION[PLURAL],
                GENRE[PLURAL],
                MOVEMENT[PLURAL],
                INCEPTION,
                MATERIAL[PLURAL],
                MOTIF[PLURAL],
                COUNTRY,
                HEIGHT,
                HEIGHT_UNIT,
                WIDTH,
                WIDTH_UNIT,
                DIAMETER,
                DIAMETER_UNIT,
                LENGTH,
                LENGTH_UNIT,
                ICONCLASS[PLURAL],
                MAIN_SUBJECT[PLURAL],
            ]
        )
        fields.extend(f"{COUNTRY}_{langkey}" for langkey in languageKeys)
    elif type_name == ARTIST[PLURAL]:
        fields.extend(
            [
                GENDER,
                DATE_OF_BIRTH,
                DATE_OF_DEATH,
                PLACE_OF_BIRTH,
                PLACE_OF_DEATH,
                CITIZENSHIP,
                MOVEMENT[PLURAL],
                INFLUENCED_BY,
            ]
        )
        for langkey in languageKeys:
            fields.extend([f"{GENDER}_{langkey}", f"{CITIZENSHIP}_{langkey}"])
    elif type_name == MOVEMENT[PLURAL]:
        fields.append(INFLUENCED_BY)
    elif type_name == LOCATION[PLURAL]:
        fields.extend(
            [
                COUNTRY,
                WEBSITE,
                PART_OF,
                LATITUDE[ABBREVIATION],
                LONGITUDE[ABBREVIATION],
            ]
        )
        fields.extend(f"{COUNTRY}_{langkey}" for langkey in languageKeys)
    elif type_name == CLASS[PLURAL]:
        # Classes do not share the common column set; start from scratch
        fields = [ID, LABEL[SINGULAR], DESCRIPTION[SINGULAR], SUBCLASS_OF]
        for langkey in languageKeys:
            fields.extend(
                [
                    f"{LABEL[SINGULAR]}_{langkey}",
                    f"{DESCRIPTION[SINGULAR]}_{langkey}",
                ]
            )
    return fields
428
429
def extract_motifs_and_main_subjects(merged_artworks):
    """Collect distinct motif and main-subject QIDs and extract them as one subject set."""
    distinct_motifs = get_distinct_attribute_values_from_dict(
        MOTIF[PLURAL], merged_artworks)
    distinct_main_subjects = get_distinct_attribute_values_from_dict(
        MAIN_SUBJECT[PLURAL], merged_artworks
    )

    # Both attribute sets are fetched together in a single extraction pass
    combined_qids = distinct_motifs.union(distinct_main_subjects)
    return get_subject("motifs and main subjects", combined_qids)
440
441
def query_artwork_qids(type_name, wikidata_id):
    """ Extracts all artwork QIDs from the wikidata SPARQL endpoint https://query.wikidata.org/

    type_name -- e.g. 'drawings'; only used for console output
    wikidata_id -- e.g. 'wd:Q93184'; substituted for the $QID placeholder in the query file

    Returns a list of QID strings (e.g. ['Q1234', ...]).
    """
    artwork_ids_filepath = Path(
        __file__).parent.absolute() / ARTWORK_IDS_QUERY_FILENAME
    # Context manager closes the query file (previously open(...).read()
    # leaked the file handle)
    with open(artwork_ids_filepath, "r", encoding="utf8") as query_file:
        QID_BY_ARTWORK_TYPE_QUERY = query_file.read().replace("$QID", wikidata_id)

    sparql = SPARQLWrapper(WIKIDATA_SPARQL_URL, agent=AGENT_HEADER)

    sparql.setQuery(QID_BY_ARTWORK_TYPE_QUERY)
    sparql.setReturnFormat(JSON)

    # ToDo: refactor would be better without while True
    while True:
        try:
            query_result = sparql.query().convert()
            break
        except HTTPError as error:
            print(error)
            print("Waiting for 5 seconds")
            time.sleep(5)
            # BUGFIX: urllib's HTTPError carries the HTTP status in .code,
            # not .errno (.errno is typically None), so the old check
            # `error.errno != 403` never detected a blocked bot.
            if error.code != 403:
                continue
            else:
                print("Looks like the bot was blocked.")
                sys.exit(-1)

    # Strip the entity URL prefix so only the bare QID remains
    artwork_ids = list(
        map(
            lambda result: result["item"][VALUE].replace(
                WIKIDATA_ENTITY_URL, ""),
            query_result["results"]["bindings"],
        )
    )
    print(f"{type_name}: {len(artwork_ids)} ids from SPARQL query")

    return artwork_ids
482
483
def get_distinct_extracted_classes(
    merged_artworks, motifs, genres, materials, movements, artists, locations
):
    """Union the class QIDs of artworks and all subject datasets, then extract them."""
    class_qids = get_distinct_attribute_values_from_dict(
        CLASS[PLURAL], merged_artworks
    )
    class_qids = bundle_class_union_calls(
        class_qids,
        [motifs, genres, materials, movements, artists, locations],
    )
    return get_classes(CLASS[PLURAL], class_qids)
495
496
def get_country_labels_for_merged_artworks_and_locations(
    locations, merged_artworks, movements
):
    """Resolve country QIDs to labels; yields the three updated datasets in order."""
    datasets = [locations, merged_artworks, movements]
    country_id_sets = [
        get_distinct_attribute_values_from_dict(COUNTRY, dataset, True)
        for dataset in datasets
    ]

    # One label request for the union of all three datasets' country QIDs
    all_country_ids = set().union(*country_id_sets)
    country_labels_extracted = get_entity_labels(COUNTRY, all_country_ids)

    for dataset in datasets:
        yield resolve_entity_id_to_label(COUNTRY, dataset, country_labels_extracted)
511
512
def get_labels_for_artists(artists, prop_list):
    """Resolve each single-value artist attribute in prop_list from QID to label."""
    for prop in prop_list:
        qids = get_distinct_attribute_values_from_dict(prop, artists, True)
        labels = get_entity_labels(prop, qids)
        resolve_entity_id_to_label(prop, artists, labels)
    return artists
520
521
def get_image_url_by_name(image_name) -> str:
    """Build the upload.wikimedia.org URL for a Wikimedia Commons file name.

    Commons stores a file under /<first md5 hex char>/<first two md5 hex chars>/<name>,
    where the name has spaces replaced by underscores and the md5 is computed
    over that underscored name.
    """
    image_name = image_name.replace(" ", "_")
    # Renamed former local 'hash' which shadowed the builtin
    digest = hashlib.md5(image_name.encode("utf-8")).hexdigest()
    return "https://upload.wikimedia.org/wikipedia/commons/{0}/{1}/{2}".format(
        digest[0], digest[:2], image_name
    )
531
532
def get_attribute_values_with_try_get_func(result, item_list, try_get_func):
    """Yield try_get_func(result, pid) for each property name in item_list,
    after mapping the name to its Wikidata property id."""
    return (
        try_get_func(result, PROPERTY_NAME_TO_PROPERTY_ID[property_name])
        for property_name in item_list
    )
536
537
def get_unit_symbols_from_qid(merged_artworks):
    """Collect the distinct unit QIDs used by all four dimension attributes."""
    distinct_unit_qids = set()
    for unit_attribute in (HEIGHT_UNIT, WIDTH_UNIT, LENGTH_UNIT, DIAMETER_UNIT):
        distinct_unit_qids = distinct_unit_qids.union(
            get_distinct_attribute_values_from_dict(
                unit_attribute, merged_artworks, True)
        )
    return distinct_unit_qids
548
549
def get_distinct_attribute_values_from_dict(
    attribute_name, entry_dict, is_single_value_column=False
):
    """Return the set of distinct values of one attribute across all entries.

    is_single_value_column -- when True the attribute holds a scalar and empty
    strings are skipped; otherwise it holds a list whose elements are collected.
    """
    if is_single_value_column:
        return {
            entry[attribute_name]
            for entry in entry_dict
            if entry[attribute_name] != ""
        }
    return {value for entry in entry_dict for value in entry[attribute_name]}
564
565
def get_subject(
    type_name, qids, languageKeys=[item[0]
                                   for item in language_config_to_list()],
):
    """Fetch and map subject entities (genres, movements, artists, ...) for the given QIDs."""
    print(datetime.datetime.now(), f"Starting with {type_name}")
    print(f"Total {type_name} to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    for chunk in chunks(list(qids), chunk_size):
        response = wikidata_entity_request(chunk)

        if ENTITIES not in response:
            logger.error("Skipping chunk")
            continue

        for entity in response[ENTITIES].values():
            subject_dict = map_wd_response.try_map_response_to_subject(
                entity, type_name)
            if subject_dict is None:
                continue
            # Movements and artists additionally carry "influenced by" references
            if type_name in (MOVEMENT[PLURAL], ARTIST[PLURAL]):
                subject_dict[INFLUENCED_BY] = dict_utils.try_get_qid_reference_list(
                    entity, PROPERTY_NAME_TO_PROPERTY_ID[INFLUENCED_BY]
                )
            # Type-specific attribute mapping (the branches are mutually exclusive)
            if type_name == MOVEMENT[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_movement(entity))
            elif type_name == ARTIST[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_artist(entity))
            elif type_name == LOCATION[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_location(entity))
            extract_dicts.append(subject_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}",
              end="\r", flush=True)

    print(datetime.datetime.now(), f"Finished with {type_name}")
    return extract_dicts
610
611
def get_entity_labels(
    type_name, qids, languageKeys=[item[0]
                                   for item in language_config_to_list()],
):
    """Fetch the English and per-language labels for the given QIDs."""
    print(datetime.datetime.now(), f"Starting with {type_name} {LABEL[PLURAL]}")
    print(f"Total {type_name} {LABEL[PLURAL]} to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    for chunk in chunks(list(qids), chunk_size):
        # country entities take longer so timeout is increased
        response = wikidata_entity_request(
            chunk, props=[LABEL[PLURAL]], timeout=10)

        if ENTITIES not in response:
            logger.error("Skipping chunk")
            continue

        for entity in response[ENTITIES].values():
            try:
                qid = entity[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue

            entity_dict = {
                ID: qid,
                LABEL[SINGULAR]: dict_utils.try_get_label_or_description(
                    entity, LABEL[PLURAL], EN
                ),
            }
            for langkey in languageKeys:
                entity_dict[
                    f"{LABEL[SINGULAR]}_{langkey}"
                ] = dict_utils.try_get_label_or_description(
                    entity, LABEL[PLURAL], langkey
                )
            extract_dicts.append(entity_dict)

        item_count += len(chunk)
        print(
            f"Status of {type_name} {LABEL[PLURAL]}: {item_count}/{len(qids)}",
            end="\r",
            flush=True,
        )

    print(datetime.datetime.now(), f"Finished with {type_name} {LABEL[PLURAL]}")
    return extract_dicts
663
664
def get_classes(
    type_name, qids, languageKeys=[item[0]
                                   for item in language_config_to_list()],
):
    """Extract class entities (label, description, subclass_of) for the given QIDs.

    Recurses on superclasses not yet seen; the module-level set
    already_extracted_superclass_ids prevents re-fetching and infinite
    recursion on cyclic subclass graphs.
    """
    print(datetime.datetime.now(), f"Starting with {type_name}")
    if type_name == CLASS[PLURAL]:
        print(
            f"Total {type_name} to extract (only 'instance_of' of the provided qids): {len(qids)}"
        )
    else:
        print(
            f"Total {type_name} to extract (only 'subclass_of' of the provided qids): {len(qids)}"
        )
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    classes_id_chunks = chunks(list(qids), chunk_size)
    for chunk in classes_id_chunks:
        query_result = wikidata_entity_request(chunk)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue
            label = dict_utils.try_get_label_or_description(
                result, LABEL[PLURAL], EN)
            description = dict_utils.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN)
            subclass_of = dict_utils.try_get_qid_reference_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[SUBCLASS_OF]
            )
            class_dict = {
                ID: qid,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                SUBCLASS_OF: subclass_of,
            }

            for langkey in languageKeys:
                label_lang = dict_utils.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey
                )
                description_lang = dict_utils.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey
                )
                class_dict.update(
                    {
                        f"{LABEL[SINGULAR]}_{langkey}": label_lang,
                        f"{DESCRIPTION[SINGULAR]}_{langkey}": description_lang,
                    }
                )
            extract_dicts.append(class_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}",
              end="\r", flush=True)

    # Recurse on superclasses that have not been extracted yet
    superclasses_qids = get_distinct_attribute_values_from_dict(
        SUBCLASS_OF, extract_dicts
    )
    missing_superclass_qids = [
        superclass_id
        for superclass_id in superclasses_qids
        if superclass_id not in already_extracted_superclass_ids
    ]

    if not missing_superclass_qids:
        return extract_dicts
    # Mark all encountered superclasses as handled before recursing
    # (replaces a list comprehension that was used only for its side effects)
    already_extracted_superclass_ids.update(superclasses_qids)
    extract_dicts.extend(get_classes("subclasses", missing_superclass_qids))
    return extract_dicts
749
750
def get_unit_symbols(qids):
    """Fetch the unit symbol claim for each unit QID."""
    print(datetime.datetime.now(), "Starting with unit symbols")
    print(f"Total unit symbols to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    for chunk in chunks(list(qids), chunk_size):
        response = wikidata_entity_request(chunk, props=[CLAIMS], timeout=10)

        if ENTITIES not in response:
            logger.error("Skipping chunk")
            continue

        for entity in response[ENTITIES].values():
            try:
                qid = entity[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue

            unit_symbol = dict_utils.try_get_unit_symbol(
                entity, PROPERTY_NAME_TO_PROPERTY_ID[UNIT_SYMBOL]
            )
            extract_dicts.append({ID: qid, UNIT_SYMBOL: unit_symbol})

        item_count += len(chunk)
        print(
            f"Status of unit symbols: {item_count}/{len(qids)}", end="\r", flush=True)

    print(datetime.datetime.now(), "Finished with unit symbols")
    return extract_dicts
787
788
def resolve_unit_id_to_unit_symbol(artwork_dict, unit_symbols):
    """Replace unit QIDs in the four dimension-unit columns with their symbols.

    Mutates artwork_dict in place and returns it. Empty-string units stay empty.
    """
    unit_attributes = (HEIGHT_UNIT, WIDTH_UNIT, LENGTH_UNIT, DIAMETER_UNIT)
    symbol_by_qid = {entry[ID]: entry[UNIT_SYMBOL] for entry in unit_symbols}

    for artwork_object in artwork_dict:
        for unit_attribute in unit_attributes:
            unit_qid = artwork_object[unit_attribute]
            if unit_qid != "":
                artwork_object[unit_attribute] = symbol_by_qid[unit_qid]

    return artwork_dict
806
807
def resolve_entity_id_to_label(
    attribute_name,
    artwork_dict,
    labels,
    languageKeys=[item[0] for item in language_config_to_list()],
):
    """Replace the QID in attribute_name with its English label and add
    per-language label columns. Mutates artwork_dict in place and returns it."""
    # Index the label objects by QID for O(1) lookup
    label_by_qid = {label_obj[ID]: label_obj for label_obj in labels}

    for artwork_object in artwork_dict:
        entity_id = artwork_object[attribute_name]
        if entity_id != "":
            label_entry = label_by_qid[entity_id]
            artwork_object[attribute_name] = label_entry[f"{LABEL[SINGULAR]}_{EN}"]
            for langkey in languageKeys:
                artwork_object[f"{attribute_name}_{langkey}"] = label_entry[
                    f"{LABEL[SINGULAR]}_{langkey}"
                ]
        else:
            # No QID set: still emit empty per-language columns
            for langkey in languageKeys:
                artwork_object[f"{attribute_name}_{langkey}"] = ""

    return artwork_dict
834
835
def bundle_class_union_calls(distinct_classes, data_list):
    """Union the class QIDs of every dataset in data_list into distinct_classes.

    Non-mutating: returns a new set on each union, like the original.
    """
    for dataset in data_list:
        dataset_classes = get_distinct_attribute_values_from_dict(
            CLASS[PLURAL], dataset)
        distinct_classes = distinct_classes.union(dataset_classes)
    return distinct_classes
842
843
def bundle_extract_data_calls(name_list, merged_artworks):
    """Yield the extracted subject data for each attribute name in name_list."""
    for subject_name in name_list:
        distinct_qids = get_distinct_attribute_values_from_dict(
            subject_name, merged_artworks)
        yield get_subject(subject_name, distinct_qids)
848
849
def generate_csv(name, extract_dicts, fields, filename):
    """ Generates a csv file from a dictionary """
    filename.parent.mkdir(parents=True, exist_ok=True)
    csv_path = filename.with_suffix(f".{CSV}")
    with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(
            csv_file, fieldnames=fields, delimiter=";", quotechar='"'
        )
        writer.writeheader()
        writer.writerows(extract_dicts)
861
862
def generate_json(name, extract_dicts, filename):
    """ Generates a JSON file from a dictionary """
    if not extract_dicts:
        return
    filename.parent.mkdir(parents=True, exist_ok=True)
    with open(
        filename.with_suffix(f".{JSON}"), "w", newline="", encoding="utf-8"
    ) as json_file:
        # Tag every entry with its type before dumping
        for extract_dict in extract_dicts:
            extract_dict[TYPE] = name
        json_file.write(json.dumps(extract_dicts, ensure_ascii=False))
876
877
def write_data_to_json(
    motifs,
    genres,
    extracted_classes,
    materials,
    movements,
    locations,
    merged_artworks,
    artists,
):
    """Write each extracted dataset to its JSON output file."""
    # (entity name map, dataset) pairs, written in this order
    outputs = [
        (MOTIF, motifs),
        (GENRE, genres),
        (CLASS, extracted_classes),
        (MATERIAL, materials),
        (MOVEMENT, movements),
        (LOCATION, locations),
        (ARTWORK, merged_artworks),
        (ARTIST, artists),
    ]
    for entity, dataset in outputs:
        generate_json(entity[SINGULAR], dataset, create_new_path(entity[PLURAL]))
901
902
if __name__ == "__main__":
    # CLI: "-d" turns on development mode; an optional numeric second argument
    # overrides DEV_CHUNK_LIMIT (number of 50-item chunks per artwork type).
    if len(sys.argv) > 1 and sys.argv[1] == "-d":
        if len(sys.argv) > 2 and sys.argv[2].isdigit():
            DEV_CHUNK_LIMIT = int(sys.argv[2])
        print("DEV MODE: on, DEV_LIM={0}".format(DEV_CHUNK_LIMIT))
        DEV = True

    logger.info("Extracting Art Ontology")
    extract_art_ontology()