· 6 years ago · Sep 10, 2019, 02:38 PM
1#! /usr/bin/env python3
import html
import os
import re

import pandas as pd
import wikipediaapi
from langdetect import detect
from polyglot.text import Text
from tqdm import tqdm
from yandex_translate import YandexTranslate

from ontology_tools.ontology_export import export
11
# Yandex Translate client used by yandextranslate().
# NOTE(review): the literal key below is committed in source and should be
# treated as leaked. Set YANDEX_TRANSLATE_API_KEY to override it, and rotate
# the embedded key when possible. The literal is kept only as a fallback so
# existing setups keep working.
translate = YandexTranslate(
    os.environ.get(
        "YANDEX_TRANSLATE_API_KEY",
        "trnsl.1.1.20190909T123309Z.6b172dfc062ad4c6.842894597a2b68c6dffd34bbef2b2e050cef7b92",
    )
)

# Pandas options: enable tqdm's pandas integration (progress_apply is used
# below) and widen the console output.
tqdm.pandas()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
20
21
def yandextranslate(i):
    """Translate ``i`` from German to English via the Yandex Translate client.

    Args:
        i: German text to translate.

    Returns:
        The translated text as a plain string, or an empty string when the
        response has no ``"text"`` entry.
    """
    response = translate.translate(i, "de-en")
    # The "text" entry is presumably a list of translated segments (the old
    # code stripped list brackets and quotes from its string form). Join the
    # segments directly: slicing the list's repr injected escape sequences for
    # quotes/backslashes and leaked "', '" between multiple segments.
    segments = response.get("text") or []
    if isinstance(segments, str):
        return segments
    return " ".join(str(segment) for segment in segments)
27
28
def _strip_disambiguation(title):
    """Drop a "(disambiguation)" marker and the whitespace it leaves behind."""
    return re.sub(r"\(disambiguation\)", "", title).strip()


def wikimagic(i):
    """Find an English name for the German term ``i``.

    Lookup order:
      1. the English Wikipedia page titled ``i``;
      2. the English language link of the German Wikipedia page titled ``i``;
      3. the Yandex translation of ``i``.

    Args:
        i: German term to translate.

    Returns:
        The English page title with any "(disambiguation)" marker removed,
        or the Yandex translation when Wikipedia offers nothing usable.
    """
    try:
        # The title is passed through unchanged. HTML-escaping it turned
        # characters such as "&" or "'" into entities, so pages with those
        # characters in their title could never be found.
        page = wikipediaapi.Wikipedia('en').page(i)
        if page.exists():
            return _strip_disambiguation(page.title)

        # No English page: look for the English version of the German article.
        page_de = wikipediaapi.Wikipedia('de').page(i)
        if page_de.exists():
            # Raises KeyError when the German article has no English link.
            return _strip_disambiguation(page_de.langlinks['en'].title)
    except KeyError:
        # Best effort: a missing language link (or any other KeyError raised
        # during the lookup) falls back to machine translation.
        return yandextranslate(i)

    # Neither Wikipedia edition has the page.
    return yandextranslate(i)
59
60
def is_english(i):
    """Return True when ``i`` looks English and contains no German word.

    Every whitespace-separated word is checked with polyglot; one word
    detected as German rejects the whole phrase. A phrase that passes is
    double-checked as a whole with langdetect.

    Args:
        i: The word or phrase to classify.

    Returns:
        True if no word is detected as German and langdetect classifies the
        full phrase as English, otherwise False.
    """
    # Inspect every word. The previous loop returned on its first iteration
    # in both branches, so only the first word was ever checked.
    # split() without an argument also skips empty tokens from repeated spaces.
    if any(Text(word).language.code == "de" for word in i.split()):
        return False

    # Double check the complete phrase with langdetect.
    return detect(i) == "en"
73
74
# Status message emitted when the module's top-level code is executed.
print("Translating the ontology")
76
77
# An optional group of up to four letters, one or more separators (space,
# hyphen, backslash, slash, plus) and a final group of one to four letters,
# e.g. "HTML / CSS".
_ACRONYM_COMBO_RE = re.compile(r"^[a-zA-Z]{0,4}[ \-\\\/+]+[a-zA-Z]{1,4}$")


def _translate_row(row):
    """Return the English name for a single ontology row."""
    name_en = row["NAME_EN"]
    # Don't touch rows that are already translated. pd.isna covers both NaN
    # and None, unlike the former isinstance(..., float) check.
    if not pd.isna(name_en):
        return name_en

    name_de = row["NAME_DE"]
    # Nothing to translate when the German name itself is missing.
    if not isinstance(name_de, str):
        return name_de

    # Keep the German value untranslated if it ...
    keep_german = (
        # ... consists of short acronyms joined by hyphen, plus, whitespace, etc.
        _ACRONYM_COMBO_RE.match(name_de)
        # ... is a programming language
        or row["PARENT"] == "Programmiersprachen"
        # ... is a short word (five characters or fewer)
        or len(name_de) <= 5
        # ... is already in English
        or is_english(name_de)
    )
    if keep_german:
        return name_de

    # Wikipedia lookup with Yandex fallback as a last resort.
    return wikimagic(name_de)


def ontology_translate(df):
    """Fill the missing ``NAME_EN`` values of an ontology DataFrame.

    Rows that already have an English name are left alone. For the others the
    German name is copied when it needs no translation (acronym combination,
    programming language, short word, already English); otherwise it is
    translated through ``wikimagic``. The number of missing values is printed
    before and after.

    Args:
        df: DataFrame with the columns ``NAME_DE``, ``NAME_EN`` and ``PARENT``.
            It is modified in place.

    Returns:
        The same DataFrame with ``NAME_EN`` updated.
    """
    print("MISSING VALUES BEFORE: " + str(df["NAME_EN"].isna().sum()))
    # Use tqdm's progress_apply when tqdm.pandas() has registered it, and
    # fall back to the plain apply otherwise.
    apply_rows = getattr(df, "progress_apply", None) or df.apply
    df["NAME_EN"] = apply_rows(_translate_row, axis=1)
    print("MISSING VALUES AFTER: " + str(df["NAME_EN"].isna().sum()))
    return df