· 6 years ago · Sep 10, 2019, 02:38 PM
1#! /usr/bin/env python3
2import pandas as pd
3from tqdm import tqdm
4import re
5import wikipediaapi
6import html
7from polyglot.text import Text
8from langdetect import detect
9from yandex_translate import YandexTranslate
10
import os

# Yandex Translate client used by yandextranslate().
# The API key can be supplied through the YANDEX_TRANSLATE_API_KEY environment
# variable; when it is unset, the key originally embedded in this script is
# used so that existing runs keep working unchanged.
# NOTE(review): a credential committed to source should be rotated, and the
# fallback literal removed once every environment sets the variable.
translate = YandexTranslate(
    os.environ.get(
        "YANDEX_TRANSLATE_API_KEY",
        "trnsl.1.1.20190909T123309Z.6b172dfc062ad4c6.842894597a2b68c6dffd34bbef2b2e050cef7b92",
    )
)

# Register tqdm's progress-bar hooks on pandas objects; ontology_translate()
# relies on this for DataFrame.progress_apply.
tqdm.pandas()

# Widen pandas' console output so large frames can be inspected in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
18
19
def yandextranslate(i):
    """Translate ``i`` from German to English with the Yandex Translate API.

    The response's ``"text"`` entry is a list of translated segments. The
    segments are joined directly instead of slicing the list's ``repr``, so
    quotes, backslashes and newlines in the translation are returned verbatim
    rather than as escape sequences.

    Args:
        i: German text to translate.

    Returns:
        The English translation, or an empty string when the response
        carries no ``"text"`` entry.
    """
    response = translate.translate(i, "de-en")
    segments = response.get("text") or []
    return " ".join(str(segment) for segment in segments)
25
26
def _strip_disambiguation(title):
    # Drop a "(disambiguation)" marker together with the whitespace around it,
    # e.g. "Mercury (disambiguation)" -> "Mercury".
    return re.sub(r"\s*\(disambiguation\)", "", title).strip()


def wikimagic(i):
    """Find an English name for ``i`` via Wikipedia, falling back to Yandex.

    Lookup order:
      1. An English Wikipedia page titled ``i``: its title is returned.
      2. A German Wikipedia page titled ``i``: the title of its English
         language link is returned.
      3. Otherwise (no page, or no English language link): the Yandex
         translation of ``i`` is returned.

    The title is passed to the Wikipedia client unmodified. HTML-escaping it
    would turn characters such as ``&`` or ``'`` into entities ("AT&T" ->
    "AT&amp;T") and make lookups of such titles miss.

    Args:
        i: German term to look up.

    Returns:
        The English page title with any "(disambiguation)" marker removed,
        or the Yandex translation when Wikipedia yields nothing.
    """
    # Does the page exist in English Wikipedia?
    page = wikipediaapi.Wikipedia('en').page(i)
    try:
        if page.exists():
            return _strip_disambiguation(page.title)

        # Otherwise look for the English counterpart of the German article.
        page_de = wikipediaapi.Wikipedia('de').page(i)
        if page_de.exists():
            return _strip_disambiguation(page_de.langlinks['en'].title)
    except KeyError:
        # Raised e.g. when the German page has no 'en' language link;
        # fall through to machine translation.
        pass

    # Last resort: Yandex Translate API.
    return yandextranslate(i)
57
58
def is_english(i):
    """Decide whether ``i`` should be treated as already-English text.

    Every space-separated word is checked with polyglot; a single word
    detected as German ("de") rejects the whole string. Only when no word is
    German is the full string double-checked with langdetect.

    Previously both branches inside the loop returned, so only the first word
    was ever inspected by polyglot.

    Args:
        i: The text to classify.

    Returns:
        True if no word is detected as German and langdetect classifies the
        whole string as "en"; False otherwise.
    """
    words = i.split(" ")
    if any(Text(word).language.code == "de" for word in words):
        return False
    # Double check the complete string with langdetect.
    return detect(i) == "en"
71
72
# Progress banner; this is a module-level statement, so it is printed as soon
# as the module body executes.
print("Translating the ontology")
74
75
# Two short alphabetic tokens (first up to 4 letters, possibly empty; second
# 1-4 letters) joined by spaces, hyphens, slashes, backslashes or plus signs.
_ACRONYM_PAIR_RE = re.compile(r"^[a-zA-Z]{0,4}[ \-\\\/+]+[a-zA-Z]{1,4}$")


def _translate_row(row):
    # Choose the NAME_EN value for a single ontology row.
    name_en = row["NAME_EN"]

    # Don't touch rows that already have a translation. Any float (NaN is a
    # float) counts as missing, as before; None and other pandas missing
    # markers now count as missing too, matching how Series.count() tallies
    # them in the before/after report.
    if not (isinstance(name_en, float) or pd.isna(name_en)):
        return name_en

    name_de = row["NAME_DE"]
    # Nothing to translate from (e.g. NaN): leave the value missing instead
    # of crashing in the string checks below.
    if not isinstance(name_de, str):
        return name_en

    # Keep the German name untranslated if the value...
    keep_german = (
        # ...consists of acronyms separated by hyphen, plus, whitespace, etc.
        _ACRONYM_PAIR_RE.match(name_de)
        # ...is a programming language
        or row["PARENT"] == "Programmiersprachen"
        # ...is a short word (5 characters or fewer)
        or len(name_de) <= 5
        # ...is already in English
        or is_english(name_de)
    )
    if keep_german:
        return name_de

    # Try some Wikipedia+Yandex magic as a last resort.
    return wikimagic(name_de)


def ontology_translate(df):
    """Fill missing ``NAME_EN`` values of the ontology frame.

    Rows that already have ``NAME_EN`` are left alone. For the others the
    German name is copied as-is when it looks like an acronym pair, belongs
    to the "Programmiersprachen" parent, is at most 5 characters long or is
    detected as English; otherwise it is translated through ``wikimagic``.
    The number of missing values is printed before and after.

    Args:
        df: DataFrame with the columns ``NAME_EN``, ``NAME_DE`` and
            ``PARENT``. It is modified in place: ``NAME_EN`` is rewritten
            and ``PARENT`` is dropped.

    Returns:
        The same DataFrame object, without the ``PARENT`` column.
    """
    names_before = df["NAME_EN"]
    print("MISSING VALUES BEFORE: " + str(len(names_before) - names_before.count()))

    # progress_apply is provided by tqdm.pandas() and shows a progress bar.
    df["NAME_EN"] = df.progress_apply(_translate_row, axis=1)

    names_after = df["NAME_EN"]
    print("MISSING VALUES AFTER: " + str(len(names_after) - names_after.count()))

    df.drop(["PARENT"], axis=1, inplace=True)
    return df
108
109# Write out CSV