# · 6 years ago · Sep 10, 2019, 02:40 PM
1#! /usr/bin/env python3
2import pandas as pd
3from tqdm import tqdm
4import re
5import wikipediaapi
6import html
7from polyglot.text import Text
8from langdetect import detect
9from yandex_translate import YandexTranslate
10from ontology_tools.ontology_export import export
11
# Yandex Translate client shared by yandextranslate().
# NOTE(review): "nope" is presumably a redacted placeholder rather than a
# working API key -- confirm and supply a real key before running.
translate = YandexTranslate("nope")

# Pandas options: register tqdm's progress_apply on pandas objects
# (ontology_translate uses it to show a progress bar).
tqdm.pandas()
18
def yandextranslate(i):
    """Translate German text to English with the Yandex Translate client.

    Args:
        i: German text to translate.

    Returns:
        The translated text as a plain string, or an empty string when the
        response has no (or an empty) "text" entry.
    """
    # Read the translated strings directly.  Stringifying the whole value and
    # slicing off "['" / "']" would leak repr() escaping (e.g. \' or \\) into
    # the result and garble responses that hold more than one item.
    translated = translate.translate(i, "de-en").get("text")
    if not translated:
        return ""
    if isinstance(translated, str):
        return translated
    # presumably a list with one string per translated input -- TODO confirm
    return " ".join(translated)
24
25
def wikimagic(i):
    """Find an English name for a term via Wikipedia, falling back to Yandex.

    Lookup order:
      1. the English Wikipedia page titled ``i``;
      2. the English language link of the German Wikipedia page titled ``i``;
      3. ``yandextranslate(i)``.

    Args:
        i: Term to look up; used verbatim as the Wikipedia page title.

    Returns:
        The English page title with any "(disambiguation)" marker removed,
        or the Yandex translation when no usable Wikipedia page is found.
    """

    def clean(title):
        # Drop the "(disambiguation)" marker and the space it leaves behind.
        return re.sub(r"\(disambiguation\)", "", title).strip()

    # The term is used as the page title as-is: HTML-escaping it first would
    # turn e.g. "&" into "&amp;" and look up a different title.
    page = wikipediaapi.Wikipedia('en').page(i)
    try:
        # Does the page exist in English Wikipedia?
        if page.exists():
            return clean(page.title)

        # Otherwise look for the English version of the German article.
        page_de = wikipediaapi.Wikipedia('de').page(i)
        if page_de.exists():
            return clean(page_de.langlinks['en'].title)
    except KeyError:
        # E.g. the German article has no English language link.
        pass

    # Nothing usable on Wikipedia: use the Yandex Translate API.
    return yandextranslate(i)
56
57
def is_english(i):
    """Return True if ``i`` is detected as English and contains no German word.

    Each whitespace-separated word is checked with polyglot; if any of them
    is detected as German ("de") the result is False.  Otherwise the whole
    text is double-checked with langdetect and must be detected as "en".

    Args:
        i: Text to classify.

    Returns:
        True or False as described above.
    """
    # Look at every word before deciding: returning from inside the loop
    # would let the first word alone determine the result.  split() without
    # an argument also skips the empty tokens that repeated spaces produce.
    if any(Text(word).language.code == "de" for word in i.split()):
        return False
    # Double check with langdetect
    return detect(i) == "en"
70
71
# Progress message emitted when the script starts.
print("Translating the ontology")
73
74
def ontology_translate(df):
    """Fill the missing ``NAME_EN`` values of ``df`` from ``NAME_DE``.

    Reads the columns ``NAME_EN``, ``NAME_DE`` and ``PARENT``.  Rows that
    already have a ``NAME_EN`` are kept; the others are filled row by row
    (see ``_translate_row``).  The number of missing ``NAME_EN`` values is
    printed before and after.

    Args:
        df: DataFrame holding the ontology; its ``NAME_EN`` column is
            overwritten in place.

    Returns:
        The same ``df`` object.
    """
    print("MISSING VALUES BEFORE: " + str(df["NAME_EN"].isna().sum()))
    # Use tqdm's progress bar when tqdm.pandas() has registered it, and plain
    # apply otherwise, so the function also works without the progress hook.
    apply_rows = getattr(df, "progress_apply", df.apply)
    df["NAME_EN"] = apply_rows(_translate_row, axis=1)
    print("MISSING VALUES AFTER: " + str(df["NAME_EN"].isna().sum()))
    return df


def _translate_row(row):
    """Return the English name for one ontology row."""
    name_en = row["NAME_EN"]
    # Don't touch if already translated.  pd.isna treats None and NaN alike,
    # matching how the missing values are counted in the report.
    if not pd.isna(name_en):
        return name_en

    name_de = row["NAME_DE"]
    # No German name to work from: leave the value missing.
    if not isinstance(name_de, str):
        return name_en

    # Don't translate if the value...
    keep_german = (
        # ...consists of multiple acronyms separated by hyphen, plus,
        # whitespace, etc.
        re.match(r"^[a-zA-Z]{0,4}[ \-\\\/+]+[a-zA-Z]{1,4}$", name_de)
        # ...is a programming language
        or row["PARENT"] == "Programmiersprachen"
        # ...is a short word (at most 5 characters)
        or len(name_de) <= 5
        # ...is in English
        or is_english(name_de)
    )
    if keep_german:
        return name_de

    # Try some Wikipedia+Yandex magic as a last resort
    return wikimagic(name_de)
106
107# Write out CSV