# Source snapshot: May 28, 2020, 10:00 AM
1import csv
2import datetime
3import hashlib
4import json
5import sys
6import time
7from pathlib import Path
8from urllib.error import HTTPError
9
10import data_extraction.dict_utils as dict_utils
11import data_extraction.map_wd_response as map_wd_response
12from data_extraction.constants import *
13from data_extraction.request_utils import send_http_request
14from shared.utils import (chunks, create_new_path, language_config_to_list,
15 setup_logger)
16from SPARQLWrapper import JSON, SPARQLWrapper
17
# Development mode: when True, extraction stops after DEV_CHUNK_LIMIT chunks
# per artwork type (can be overridden via the "-d <n>" CLI flag below).
DEV = True
DEV_CHUNK_LIMIT = 2  # Not entry but chunks of 50
# QIDs of superclasses already fetched by get_classes(); prevents re-fetching
# and unbounded recursion when resolving the superclass hierarchy.
already_extracted_superclass_ids = set()

# Module-level logger writing to logs/<GET_WIKIDATA_ITEMS_LOG_FILENAME>.
logger = setup_logger(
    "data_extraction.get_wikidata_items",
    Path(__file__).parent.parent.absolute()
    / "logs" / GET_WIKIDATA_ITEMS_LOG_FILENAME,
)
27
28
def extract_art_ontology():
    """ Extracts *.csv and *.JSON files for artworks from Wikidata """

    # QIDs crawled so far; shared across categories so each item is fetched once
    already_crawled_wikidata_items = set()

    for plural_name, class_qid in [
        (DRAWING[PLURAL], DRAWING[ID]),
        (SCULPTURE[PLURAL], SCULPTURE[ID]),
        (PAINTING[PLURAL], PAINTING[ID]),
    ]:
        extracted_artwork = extract_artworks(
            plural_name, class_qid, already_crawled_wikidata_items
        )

        # Persist each category both as CSV and as JSON
        generate_csv(
            plural_name,
            extracted_artwork,
            get_fields(plural_name),
            create_new_path(ARTWORK[PLURAL], plural_name, CSV),
        )
        generate_json(
            plural_name,
            extracted_artwork,
            create_new_path(ARTWORK[PLURAL], plural_name, JSON),
        )

    merged_artworks = merge_artworks()

    # One combined CSV over all categories, with an extra type column
    generate_csv(
        ARTWORK[PLURAL],
        merged_artworks,
        get_fields(ARTWORK[PLURAL]) + [TYPE],
        create_new_path(ARTWORK[PLURAL], file_type=CSV),
    )

    # Get motifs and main subjects
    motifs = extract_motifs_and_main_subjects(merged_artworks)

    # Get extracted genres, materials, etc.
    genres, materials, movements, artists, locations = bundle_extract_data_calls(
        [
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
        ],
        merged_artworks,
    )

    # Get distinct classes from artworks, motifs, etc.
    extracted_classes = get_distinct_extracted_classes(
        merged_artworks, motifs, genres, materials, movements, artists, locations,
    )

    # Resolve country QIDs to labels on locations, artworks and movements
    (
        locations,
        merged_artworks,
        movements,
    ) = get_country_labels_for_merged_artworks_and_locations(
        locations, merged_artworks, movements
    )

    # Resolve label-bearing artist attributes (gender, places, citizenship)
    artists = get_labels_for_artists(
        artists, [GENDER, PLACE_OF_BIRTH, PLACE_OF_DEATH, CITIZENSHIP]
    )

    # Resolve dimension unit QIDs to printable unit symbols
    unit_symbols = get_unit_symbols(get_unit_symbols_from_qid(merged_artworks))
    resolve_unit_id_to_unit_symbol(merged_artworks, unit_symbols)

    # Write everything to the JSON output files
    write_data_to_json(
        motifs,
        genres,
        extracted_classes,
        materials,
        movements,
        locations,
        merged_artworks,
        artists,
    )
110
111
def extract_artworks(
    type_name,
    wikidata_id,
    already_crawled_wikidata_items,
    languageKeys=[item[0] for item in language_config_to_list()],
):
    """Extracts artworks metadata from Wikidata and stores them in a dictionary.

    type_name -- e.g., 'drawings', will be used as filename
    wikidata_id -- e.g., 'wd:Q93184' Wikidata ID of a class; all instances of this class and all subclasses with label, artist, and image will be loaded.
    already_crawled_wikidata_items -- set of QIDs crawled in a previous category; it is
        extended in place so later categories skip these items
    languageKeys -- e.g, list('en', 'de')
        NOTE: evaluated once at import time (mutable default); it is never mutated here.

    Returns a list of dictionaries, one per artwork.

    Examples:
    extract_artworks('drawings', 'wd:Q93184', '('en', 'de'))
    extract_artworks('sculptures', 'wd:Q860861', '('en', 'de'))
    extract_artworks('paintings', 'wd:Q3305213', '('en', 'de'))
    """
    print(datetime.datetime.now(), "Starting with", type_name)

    extract_dicts = []
    chunk_count = 0
    item_count = 0
    artwork_ids = query_artwork_qids(type_name, wikidata_id)

    # Don't load items again, if they were loaded in another artwork category.
    # BUGFIX: the previous code removed elements from artwork_ids while
    # iterating over it, which silently skips the element following each
    # removal. Build a filtered list instead.
    artwork_ids = [
        artwork_id
        for artwork_id in artwork_ids
        if artwork_id not in already_crawled_wikidata_items
    ]

    print(
        f"{len(artwork_ids)} {type_name} entries are not loaded yet, starting now. Already crawled item count is {len(already_crawled_wikidata_items)}"
    )
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    artwork_id_chunks = chunks(artwork_ids, chunk_size)
    for chunk in artwork_id_chunks:
        # In DEV mode only extract a limited number of chunks per category
        if DEV and chunk_count == DEV_CHUNK_LIMIT:
            logger.info(
                f"DEV_CHUNK_LIMIT of {type_name} reached. End extraction for {type_name}"
            )
            break

        query_result = wikidata_entity_request(chunk)
        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
                # How to get image url
                # https://stackoverflow.com/questions/34393884/how-to-get-image-url-property-from-wikidata-item-by-api
                image = get_image_url_by_name(
                    result[CLAIMS][PROPERTY_NAME_TO_PROPERTY_ID[IMAGE]][0][MAINSNAK][
                        DATAVALUE
                    ][VALUE]
                )
            except Exception as error:
                # Items without QID or image are skipped entirely
                logger.error(
                    "Error on qid or image, skipping item. Error: {0}".format(
                        error)
                )
                continue

            label = dict_utils.try_get_label_or_description(
                result, LABEL[PLURAL], EN)
            description = dict_utils.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN)

            # All QID-reference attributes share one extraction helper
            (
                classes,
                artists,
                locations,
                genres,
                movements,
                materials,
                motifs,
                main_subjects,
            ) = get_attribute_values_with_try_get_func(
                result,
                [
                    CLASS[SINGULAR],
                    ARTIST[SINGULAR],
                    LOCATION[SINGULAR],
                    GENRE[SINGULAR],
                    MOVEMENT[SINGULAR],
                    MATERIAL[SINGULAR],
                    MOTIF[SINGULAR],
                    MAIN_SUBJECT[SINGULAR],
                ],
                dict_utils.try_get_qid_reference_list,
            )

            iconclasses = dict_utils.try_get_value_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[ICONCLASS[SINGULAR]]
            )
            inception = dict_utils.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[INCEPTION]
            )
            country = dict_utils.try_get_first_qid(
                result, PROPERTY_NAME_TO_PROPERTY_ID[COUNTRY])

            # Resolve dimensions
            # The units are qids which have to be resolved later
            # (see resolve_unit_id_to_unit_symbol)
            height, width, length, diameter = get_attribute_values_with_try_get_func(
                result, [HEIGHT, WIDTH, LENGTH,
                         DIAMETER], dict_utils.try_get_dimension_value,
            )
            (
                height_unit,
                width_unit,
                length_unit,
                diameter_unit,
            ) = get_attribute_values_with_try_get_func(
                result, [HEIGHT, WIDTH, LENGTH,
                         DIAMETER], dict_utils.try_get_dimension_unit,
            )

            artwork_dictionary = {
                ID: qid,
                CLASS[PLURAL]: classes,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                IMAGE: image,
                ARTIST[PLURAL]: artists,
                LOCATION[PLURAL]: locations,
                GENRE[PLURAL]: genres,
                MOVEMENT[PLURAL]: movements,
                INCEPTION: inception,
                MATERIAL[PLURAL]: materials,
                MOTIF[PLURAL]: motifs,
                COUNTRY: country,
                HEIGHT: height,
                HEIGHT_UNIT: height_unit,
                WIDTH: width,
                WIDTH_UNIT: width_unit,
                LENGTH: length,
                LENGTH_UNIT: length_unit,
                DIAMETER: diameter,
                DIAMETER_UNIT: diameter_unit,
                ICONCLASS[PLURAL]: iconclasses,
                MAIN_SUBJECT[PLURAL]: main_subjects,
            }

            # Add per-language label, description and wikipedia link columns
            for langkey in languageKeys:
                label_lang = dict_utils.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey
                )
                description_lang = dict_utils.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey
                )
                wikipedia_link_lang = dict_utils.try_get_wikipedia_link(
                    result, langkey)
                artwork_dictionary.update(
                    {
                        f"{LABEL[SINGULAR]}_{langkey}": label_lang,
                        f"{DESCRIPTION[SINGULAR]}_{langkey}": description_lang,
                        f"{WIKIPEDIA_LINK}_{langkey}": wikipedia_link_lang,
                    }
                )
            extract_dicts.append(artwork_dictionary)
            already_crawled_wikidata_items.add(qid)

        item_count += len(chunk)
        print(
            f"Status of {type_name}: {item_count}/{len(artwork_ids)}",
            end="\r",
            flush=True,
        )

        chunk_count += 1

    print(datetime.datetime.now(), "Finished with", type_name)
    return extract_dicts
285
286
def merge_artworks():
    """ Merges artworks from files 'paintings.json', 'drawings.json',
    'sculptures.json' (function extract_artworks) and
    stores them in a dictionary.

    Returns the merged, deduplicated list of artwork dictionaries; every
    entry is tagged with TYPE = artwork.
    """
    print(datetime.datetime.now(), "Starting with", "merging artworks")
    # QIDs seen so far, used for duplicate detection across the three files
    seen_qids = set()
    file_names = [
        f"{PAINTING[PLURAL]}.{JSON}",
        f"{DRAWING[PLURAL]}.{JSON}",
        f"{SCULPTURE[PLURAL]}.{JSON}",
    ]
    file_paths = [
        create_new_path(ARTWORK[PLURAL], subpath=file_name) for file_name in file_names
    ]
    extract_dicts = []

    for file_path in file_paths:
        # Renamed former locals 'input' and 'object' which shadowed builtins
        with open(file_path, encoding="utf-8") as input_file:
            artwork_objects = json.load(input_file)
        for artwork_object in artwork_objects:
            if artwork_object[ID] not in seen_qids:  # remove duplicates
                artwork_object[TYPE] = ARTWORK[SINGULAR]
                extract_dicts.append(artwork_object)
                seen_qids.add(artwork_object[ID])

    print(datetime.datetime.now(), "Finished with", "merging artworks")
    print()
    return extract_dicts
316
317
def wikidata_entity_request(
    qids,
    languageKeys=[item[0] for item in language_config_to_list()],
    props=[CLAIMS, DESCRIPTION[PLURAL], LABEL[PLURAL], SITELINKS],
    timeout=TIMEOUT,
    sleep_time=SLEEP_TIME,
    maxlag=MAX_LAG,
):
    """ Represents one artwork request for n-items
    The API specifies that 50 items can be loaded at once without needing additional permissions:
    https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
    """
    # Restrict sitelinks to the wikis of the configured languages
    wiki_sites = [f"{key}wiki" for key in languageKeys]
    parameters = {
        "action": "wbgetentities",
        "ids": "|".join(qids),
        "format": JSON,
        "languages": "|".join(languageKeys),
        "sitefilter": "|".join(wiki_sites),
        "props": "|".join(props),
        # if the server needs more than maxlag seconds to process
        # the query an error response is returned
        "maxlag": maxlag,
    }

    return send_http_request(
        parameters,
        HTTP_HEADER,
        WIKIDATA_API_URL,
        logger,
        initial_timeout=timeout,
        items=qids,
        timeout=timeout,
        sleep_time=sleep_time,
        maxlag=maxlag,
    )
356
357
def get_fields(type_name, languageKeys=[item[0] for item in language_config_to_list()]):
    """ Returns all fields / columns for a specific type, e. g. 'artworks' """
    # Columns shared by every entity type
    fields = [ID, CLASS[PLURAL], LABEL[SINGULAR], DESCRIPTION[SINGULAR], IMAGE]
    for langkey in languageKeys:
        fields.extend(
            [
                f"{LABEL[SINGULAR]}_{langkey}",
                f"{DESCRIPTION[SINGULAR]}_{langkey}",
                f"{WIKIPEDIA_LINK}_{langkey}",
            ]
        )

    artwork_types = (
        DRAWING[PLURAL],
        SCULPTURE[PLURAL],
        PAINTING[PLURAL],
        ARTWORK[PLURAL],
    )
    if type_name in artwork_types:
        fields.extend(
            [
                ARTIST[PLURAL],
                LOCATION[PLURAL],
                GENRE[PLURAL],
                MOVEMENT[PLURAL],
                INCEPTION,
                MATERIAL[PLURAL],
                MOTIF[PLURAL],
                COUNTRY,
                HEIGHT,
                HEIGHT_UNIT,
                WIDTH,
                WIDTH_UNIT,
                DIAMETER,
                DIAMETER_UNIT,
                LENGTH,
                LENGTH_UNIT,
                ICONCLASS[PLURAL],
                MAIN_SUBJECT[PLURAL],
            ]
        )
        fields.extend(f"{COUNTRY}_{langkey}" for langkey in languageKeys)
    elif type_name == ARTIST[PLURAL]:
        fields.extend(
            [
                GENDER,
                DATE_OF_BIRTH,
                DATE_OF_DEATH,
                PLACE_OF_BIRTH,
                PLACE_OF_DEATH,
                CITIZENSHIP,
                MOVEMENT[PLURAL],
                INFLUENCED_BY,
            ]
        )
        for langkey in languageKeys:
            fields.extend([f"{GENDER}_{langkey}", f"{CITIZENSHIP}_{langkey}"])
    elif type_name == MOVEMENT[PLURAL]:
        fields.append(INFLUENCED_BY)
    elif type_name == LOCATION[PLURAL]:
        fields.extend(
            [
                COUNTRY,
                WEBSITE,
                PART_OF,
                LATITUDE[ABBREVIATION],
                LONGITUDE[ABBREVIATION],
            ]
        )
        fields.extend(f"{COUNTRY}_{langkey}" for langkey in languageKeys)
    elif type_name == CLASS[PLURAL]:
        # Classes do not share the common column set; start from scratch
        fields = [ID, LABEL[SINGULAR], DESCRIPTION[SINGULAR], SUBCLASS_OF]
        for langkey in languageKeys:
            fields.extend(
                [
                    f"{LABEL[SINGULAR]}_{langkey}",
                    f"{DESCRIPTION[SINGULAR]}_{langkey}",
                ]
            )
    return fields
428
429
def extract_motifs_and_main_subjects(merged_artworks):
    """Collect distinct motif and main-subject QIDs and extract them as one subject set."""
    distinct_motifs = get_distinct_attribute_values_from_dict(
        MOTIF[PLURAL], merged_artworks)
    distinct_main_subjects = get_distinct_attribute_values_from_dict(
        MAIN_SUBJECT[PLURAL], merged_artworks
    )

    # Both attribute sets are fetched together in a single extraction pass
    combined_qids = distinct_motifs.union(distinct_main_subjects)
    return get_subject("motifs and main subjects", combined_qids)
440
441
def query_artwork_qids(type_name, wikidata_id):
    """ Extracts all artwork QIDs from the wikidata SPARQL endpoint https://query.wikidata.org/

    type_name -- e.g. 'drawings'; only used for console output
    wikidata_id -- e.g. 'wd:Q93184'; substituted for the $QID placeholder in the query file

    Returns a list of QID strings (e.g. ['Q1234', ...]).
    """
    artwork_ids_filepath = Path(
        __file__).parent.absolute() / ARTWORK_IDS_QUERY_FILENAME
    # Context manager closes the query file (previously open(...).read()
    # leaked the file handle)
    with open(artwork_ids_filepath, "r", encoding="utf8") as query_file:
        QID_BY_ARTWORK_TYPE_QUERY = query_file.read().replace("$QID", wikidata_id)

    sparql = SPARQLWrapper(WIKIDATA_SPARQL_URL, agent=AGENT_HEADER)

    sparql.setQuery(QID_BY_ARTWORK_TYPE_QUERY)
    sparql.setReturnFormat(JSON)

    # ToDo: refactor would be better without while True
    while True:
        try:
            query_result = sparql.query().convert()
            break
        except HTTPError as error:
            print(error)
            print("Waiting for 5 seconds")
            time.sleep(5)
            # BUGFIX: urllib's HTTPError carries the HTTP status in .code,
            # not .errno (.errno is typically None), so the old check
            # `error.errno != 403` never detected a blocked bot.
            if error.code != 403:
                continue
            else:
                print("Looks like the bot was blocked.")
                sys.exit(-1)

    # Strip the entity URL prefix so only the bare QID remains
    artwork_ids = list(
        map(
            lambda result: result["item"][VALUE].replace(
                WIKIDATA_ENTITY_URL, ""),
            query_result["results"]["bindings"],
        )
    )
    print(f"{type_name}: {len(artwork_ids)} ids from SPARQL query")

    return artwork_ids
482
483
def get_distinct_extracted_classes(
    merged_artworks, motifs, genres, materials, movements, artists, locations
):
    """Union the class QIDs of artworks and all subject datasets, then extract them."""
    class_qids = get_distinct_attribute_values_from_dict(
        CLASS[PLURAL], merged_artworks
    )
    class_qids = bundle_class_union_calls(
        class_qids,
        [motifs, genres, materials, movements, artists, locations],
    )
    return get_classes(CLASS[PLURAL], class_qids)
495
496
def get_country_labels_for_merged_artworks_and_locations(
    locations, merged_artworks, movements
):
    """Resolve country QIDs to labels; yields the three updated datasets in order."""
    datasets = [locations, merged_artworks, movements]
    country_id_sets = [
        get_distinct_attribute_values_from_dict(COUNTRY, dataset, True)
        for dataset in datasets
    ]

    # One label request for the union of all three datasets' country QIDs
    all_country_ids = set().union(*country_id_sets)
    country_labels_extracted = get_entity_labels(COUNTRY, all_country_ids)

    for dataset in datasets:
        yield resolve_entity_id_to_label(COUNTRY, dataset, country_labels_extracted)
511
512
def get_labels_for_artists(artists, prop_list):
    """Resolve each single-value artist attribute in prop_list from QID to label."""
    for prop in prop_list:
        qids = get_distinct_attribute_values_from_dict(prop, artists, True)
        labels = get_entity_labels(prop, qids)
        resolve_entity_id_to_label(prop, artists, labels)
    return artists
520
521
def get_image_url_by_name(image_name) -> str:
    """Build the upload.wikimedia.org URL for a Wikimedia Commons file name.

    Commons stores a file under /<first md5 hex char>/<first two md5 hex chars>/<name>,
    where the name has spaces replaced by underscores and the md5 is computed
    over that underscored name.
    """
    image_name = image_name.replace(" ", "_")
    # Renamed former local 'hash' which shadowed the builtin
    digest = hashlib.md5(image_name.encode("utf-8")).hexdigest()
    return "https://upload.wikimedia.org/wikipedia/commons/{0}/{1}/{2}".format(
        digest[0], digest[:2], image_name
    )
531
532
def get_attribute_values_with_try_get_func(result, item_list, try_get_func):
    """Yield try_get_func(result, pid) for each property name in item_list,
    after mapping the name to its Wikidata property id."""
    return (
        try_get_func(result, PROPERTY_NAME_TO_PROPERTY_ID[property_name])
        for property_name in item_list
    )
536
537
def get_unit_symbols_from_qid(merged_artworks):
    """Collect the distinct unit QIDs used by all four dimension attributes."""
    distinct_unit_qids = set()
    for unit_attribute in (HEIGHT_UNIT, WIDTH_UNIT, LENGTH_UNIT, DIAMETER_UNIT):
        distinct_unit_qids = distinct_unit_qids.union(
            get_distinct_attribute_values_from_dict(
                unit_attribute, merged_artworks, True)
        )
    return distinct_unit_qids
548
549
def get_distinct_attribute_values_from_dict(
    attribute_name, entry_dict, is_single_value_column=False
):
    """Return the set of distinct values of one attribute across all entries.

    is_single_value_column -- when True the attribute holds a scalar and empty
    strings are skipped; otherwise it holds a list whose elements are collected.
    """
    if is_single_value_column:
        return {
            entry[attribute_name]
            for entry in entry_dict
            if entry[attribute_name] != ""
        }
    return {value for entry in entry_dict for value in entry[attribute_name]}
564
565
def get_subject(
    type_name, qids, languageKeys=[item[0]
                                   for item in language_config_to_list()],
):
    """Fetch and map subject entities (genres, movements, artists, ...) for the given QIDs."""
    print(datetime.datetime.now(), f"Starting with {type_name}")
    print(f"Total {type_name} to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    for chunk in chunks(list(qids), chunk_size):
        response = wikidata_entity_request(chunk)

        if ENTITIES not in response:
            logger.error("Skipping chunk")
            continue

        for entity in response[ENTITIES].values():
            subject_dict = map_wd_response.try_map_response_to_subject(
                entity, type_name)
            if subject_dict is None:
                continue
            # Movements and artists additionally carry "influenced by" references
            if type_name in (MOVEMENT[PLURAL], ARTIST[PLURAL]):
                subject_dict[INFLUENCED_BY] = dict_utils.try_get_qid_reference_list(
                    entity, PROPERTY_NAME_TO_PROPERTY_ID[INFLUENCED_BY]
                )
            # Type-specific attribute mapping (the branches are mutually exclusive)
            if type_name == MOVEMENT[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_movement(entity))
            elif type_name == ARTIST[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_artist(entity))
            elif type_name == LOCATION[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_location(entity))
            extract_dicts.append(subject_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}",
              end="\r", flush=True)

    print(datetime.datetime.now(), f"Finished with {type_name}")
    return extract_dicts
610
611
def get_entity_labels(
    type_name, qids, languageKeys=[item[0]
                                   for item in language_config_to_list()],
):
    """Fetch the English and per-language labels for the given QIDs."""
    print(datetime.datetime.now(), f"Starting with {type_name} {LABEL[PLURAL]}")
    print(f"Total {type_name} {LABEL[PLURAL]} to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    for chunk in chunks(list(qids), chunk_size):
        # country entities take longer so timeout is increased
        response = wikidata_entity_request(
            chunk, props=[LABEL[PLURAL]], timeout=10)

        if ENTITIES not in response:
            logger.error("Skipping chunk")
            continue

        for entity in response[ENTITIES].values():
            try:
                qid = entity[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue

            entity_dict = {
                ID: qid,
                LABEL[SINGULAR]: dict_utils.try_get_label_or_description(
                    entity, LABEL[PLURAL], EN
                ),
            }
            for langkey in languageKeys:
                entity_dict[
                    f"{LABEL[SINGULAR]}_{langkey}"
                ] = dict_utils.try_get_label_or_description(
                    entity, LABEL[PLURAL], langkey
                )
            extract_dicts.append(entity_dict)

        item_count += len(chunk)
        print(
            f"Status of {type_name} {LABEL[PLURAL]}: {item_count}/{len(qids)}",
            end="\r",
            flush=True,
        )

    print(datetime.datetime.now(), f"Finished with {type_name} {LABEL[PLURAL]}")
    return extract_dicts
663
664
def get_classes(
    type_name, qids, languageKeys=[item[0]
                                   for item in language_config_to_list()],
):
    """Extract class entities (label, description, subclass_of) for the given QIDs.

    Recurses on superclasses not yet seen; the module-level set
    already_extracted_superclass_ids prevents re-fetching and infinite
    recursion on cyclic subclass graphs.
    """
    print(datetime.datetime.now(), f"Starting with {type_name}")
    if type_name == CLASS[PLURAL]:
        print(
            f"Total {type_name} to extract (only 'instance_of' of the provided qids): {len(qids)}"
        )
    else:
        print(
            f"Total {type_name} to extract (only 'subclass_of' of the provided qids): {len(qids)}"
        )
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    classes_id_chunks = chunks(list(qids), chunk_size)
    for chunk in classes_id_chunks:
        query_result = wikidata_entity_request(chunk)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue
            label = dict_utils.try_get_label_or_description(
                result, LABEL[PLURAL], EN)
            description = dict_utils.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN)
            subclass_of = dict_utils.try_get_qid_reference_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[SUBCLASS_OF]
            )
            class_dict = {
                ID: qid,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                SUBCLASS_OF: subclass_of,
            }

            for langkey in languageKeys:
                label_lang = dict_utils.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey
                )
                description_lang = dict_utils.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey
                )
                class_dict.update(
                    {
                        f"{LABEL[SINGULAR]}_{langkey}": label_lang,
                        f"{DESCRIPTION[SINGULAR]}_{langkey}": description_lang,
                    }
                )
            extract_dicts.append(class_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}",
              end="\r", flush=True)

    # Recurse on superclasses that have not been extracted yet
    superclasses_qids = get_distinct_attribute_values_from_dict(
        SUBCLASS_OF, extract_dicts
    )
    missing_superclass_qids = [
        superclass_id
        for superclass_id in superclasses_qids
        if superclass_id not in already_extracted_superclass_ids
    ]

    if not missing_superclass_qids:
        return extract_dicts
    # Mark all encountered superclasses as handled before recursing
    # (replaces a list comprehension that was used only for its side effects)
    already_extracted_superclass_ids.update(superclasses_qids)
    extract_dicts.extend(get_classes("subclasses", missing_superclass_qids))
    return extract_dicts
749
750
def get_unit_symbols(qids):
    """Fetch the unit symbol claim for each unit QID."""
    print(datetime.datetime.now(), "Starting with unit symbols")
    print(f"Total unit symbols to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    for chunk in chunks(list(qids), chunk_size):
        response = wikidata_entity_request(chunk, props=[CLAIMS], timeout=10)

        if ENTITIES not in response:
            logger.error("Skipping chunk")
            continue

        for entity in response[ENTITIES].values():
            try:
                qid = entity[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue

            unit_symbol = dict_utils.try_get_unit_symbol(
                entity, PROPERTY_NAME_TO_PROPERTY_ID[UNIT_SYMBOL]
            )
            extract_dicts.append({ID: qid, UNIT_SYMBOL: unit_symbol})

        item_count += len(chunk)
        print(
            f"Status of unit symbols: {item_count}/{len(qids)}", end="\r", flush=True)

    print(datetime.datetime.now(), "Finished with unit symbols")
    return extract_dicts
787
788
def resolve_unit_id_to_unit_symbol(artwork_dict, unit_symbols):
    """Replace unit QIDs in the four dimension-unit columns with their symbols.

    Mutates artwork_dict in place and returns it. Empty-string units stay empty.
    """
    unit_attributes = (HEIGHT_UNIT, WIDTH_UNIT, LENGTH_UNIT, DIAMETER_UNIT)
    symbol_by_qid = {entry[ID]: entry[UNIT_SYMBOL] for entry in unit_symbols}

    for artwork_object in artwork_dict:
        for unit_attribute in unit_attributes:
            unit_qid = artwork_object[unit_attribute]
            if unit_qid != "":
                artwork_object[unit_attribute] = symbol_by_qid[unit_qid]

    return artwork_dict
806
807
def resolve_entity_id_to_label(
    attribute_name,
    artwork_dict,
    labels,
    languageKeys=[item[0] for item in language_config_to_list()],
):
    """Replace the QID in attribute_name with its English label and add
    per-language label columns. Mutates artwork_dict in place and returns it."""
    # Index the label objects by QID for O(1) lookup
    label_by_qid = {label_obj[ID]: label_obj for label_obj in labels}

    for artwork_object in artwork_dict:
        entity_id = artwork_object[attribute_name]
        if entity_id != "":
            label_entry = label_by_qid[entity_id]
            artwork_object[attribute_name] = label_entry[f"{LABEL[SINGULAR]}_{EN}"]
            for langkey in languageKeys:
                artwork_object[f"{attribute_name}_{langkey}"] = label_entry[
                    f"{LABEL[SINGULAR]}_{langkey}"
                ]
        else:
            # No QID set: still emit empty per-language columns
            for langkey in languageKeys:
                artwork_object[f"{attribute_name}_{langkey}"] = ""

    return artwork_dict
834
835
def bundle_class_union_calls(distinct_classes, data_list):
    """Union the class QIDs of every dataset in data_list into distinct_classes.

    Non-mutating: returns a new set on each union, like the original.
    """
    for dataset in data_list:
        dataset_classes = get_distinct_attribute_values_from_dict(
            CLASS[PLURAL], dataset)
        distinct_classes = distinct_classes.union(dataset_classes)
    return distinct_classes
842
843
def bundle_extract_data_calls(name_list, merged_artworks):
    """Yield the extracted subject data for each attribute name in name_list."""
    for subject_name in name_list:
        distinct_qids = get_distinct_attribute_values_from_dict(
            subject_name, merged_artworks)
        yield get_subject(subject_name, distinct_qids)
848
849
def generate_csv(name, extract_dicts, fields, filename):
    """ Generates a csv file from a dictionary """
    filename.parent.mkdir(parents=True, exist_ok=True)
    csv_path = filename.with_suffix(f".{CSV}")
    with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(
            csv_file, fieldnames=fields, delimiter=";", quotechar='"'
        )
        writer.writeheader()
        writer.writerows(extract_dicts)
861
862
def generate_json(name, extract_dicts, filename):
    """ Generates a JSON file from a dictionary """
    if not extract_dicts:
        return
    filename.parent.mkdir(parents=True, exist_ok=True)
    with open(
        filename.with_suffix(f".{JSON}"), "w", newline="", encoding="utf-8"
    ) as json_file:
        # Tag every entry with its type before dumping
        for extract_dict in extract_dicts:
            extract_dict[TYPE] = name
        json_file.write(json.dumps(extract_dicts, ensure_ascii=False))
876
877
def write_data_to_json(
    motifs,
    genres,
    extracted_classes,
    materials,
    movements,
    locations,
    merged_artworks,
    artists,
):
    """Write each extracted dataset to its JSON output file."""
    # (entity name map, dataset) pairs, written in this order
    outputs = [
        (MOTIF, motifs),
        (GENRE, genres),
        (CLASS, extracted_classes),
        (MATERIAL, materials),
        (MOVEMENT, movements),
        (LOCATION, locations),
        (ARTWORK, merged_artworks),
        (ARTIST, artists),
    ]
    for entity, dataset in outputs:
        generate_json(entity[SINGULAR], dataset, create_new_path(entity[PLURAL]))
901
902
if __name__ == "__main__":
    # CLI: "-d" turns on development mode; an optional numeric second argument
    # overrides DEV_CHUNK_LIMIT (number of 50-item chunks per artwork type).
    if len(sys.argv) > 1 and sys.argv[1] == "-d":
        if len(sys.argv) > 2 and sys.argv[2].isdigit():
            DEV_CHUNK_LIMIT = int(sys.argv[2])
        print("DEV MODE: on, DEV_LIM={0}".format(DEV_CHUNK_LIMIT))
        DEV = True

    logger.info("Extracting Art Ontology")
    extract_art_ontology()