FUK47YW1

· 6 years ago · Sep 10, 2019, 02:38 PM
1#! /usr/bin/env python3
2import pandas as pd
3from tqdm import tqdm
4import re
5import wikipediaapi
6import html
7from polyglot.text import Text
8from langdetect import detect
9from yandex_translate import YandexTranslate
10
import os

# Yandex Translate client shared by yandextranslate().
# The API key is taken from the YANDEX_TRANSLATE_API_KEY environment variable
# when it is set and non-empty.
# NOTE(review): the literal fallback below is a credential committed to
# source control; it is kept only so existing setups keep working. Rotate the
# key and drop the fallback once the environment variable is deployed.
translate = YandexTranslate(
    os.environ.get("YANDEX_TRANSLATE_API_KEY")
    or "trnsl.1.1.20190909T123309Z.6b172dfc062ad4c6.842894597a2b68c6dffd34bbef2b2e050cef7b92"
)
# Pandas options
# tqdm.pandas() provides the DataFrame.progress_apply used by ontology_translate()
tqdm.pandas()
# Widen the console output so large frames can be inspected while debugging
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
18
19
def yandextranslate(i):
    """Translate *i* from German to English with the Yandex Translate API.

    Uses the module-level ``translate`` client with the ``"de-en"``
    direction and reads the ``"text"`` entry of the response.

    The translated segments are read directly from the response instead of
    slicing the ``str()`` of the list: that representation goes through
    ``repr()``, which injects backslash escapes for text containing both
    quote characters, backslashes or newlines.

    Returns:
        The translated text, or an empty string when the response carries
        no ``"text"`` entry.
    """
    response = translate.translate(i, "de-en")
    segments = response.get("text")
    if not segments:
        return ""
    if isinstance(segments, str):
        # Defensive: tolerate a client that already returns a plain string
        return segments
    return " ".join(str(segment) for segment in segments)
25
26
def wikimagic(i):
    """Find an English name for the German term *i*, preferring Wikipedia.

    Lookup order:

    1. An English Wikipedia page titled *i*: its title is used.
    2. A German Wikipedia page titled *i*: the title of its English
       language link is used.
    3. Otherwise, or when a ``KeyError`` is raised during the lookup (for
       example when the German page has no English language link), the
       term is translated with ``yandextranslate``.

    A "(disambiguation)" marker is removed from Wikipedia titles and the
    result is stripped of surrounding whitespace.

    Returns:
        The English title or translation as a string.
    """
    # The title is passed through unchanged: HTML-escaping it would turn
    # e.g. "R&D" into "R&amp;D" and make the lookup miss existing pages.
    try:
        page = wikipediaapi.Wikipedia('en').page(i)
        if page.exists():
            title = page.title
        else:
            # No English page: follow the language link of the German article
            page_de = wikipediaapi.Wikipedia('de').page(i)
            title = page_de.langlinks['en'].title if page_de.exists() else None
    except KeyError:
        # If something goes wrong, fall back to the Yandex Translate API
        title = None

    if title is None:
        return yandextranslate(i)

    # "Foo (disambiguation)" -> "Foo" (strip() drops the space left behind)
    return re.sub(r"\(disambiguation\)", "", title).strip()
57
58
def is_english(i):
    """Tell whether *i* can be treated as English text.

    Every space-separated token is classified on its own with polyglot; a
    single token classified as German ("de") rejects the whole input.
    If no token is German, langdetect is asked about the complete string
    and the result is True only when it reports "en".
    """
    tokens = i.split(" ")
    if any(Text(token).language.code == "de" for token in tokens):
        return False
    # Double check with langdetect on the full string
    return detect(i) == "en"
71
72
# Progress banner, printed once when the module is executed
print("Translating the ontology")


# Two alphabetic tokens of at most four letters joined by spaces, hyphens,
# slashes, backslashes or plus signs (e.g. "HR / IT"); the first token may
# be empty.
_ACRONYM_PAIR_RE = re.compile(r"^[a-zA-Z]{0,4}[ \-\\\/+]+[a-zA-Z]{1,4}$")


def _translate_row(row):
    """Return the English name for one ontology row (see ontology_translate)."""
    current = row["NAME_EN"]
    # Don't touch if already translated
    if not pd.isna(current):
        return current

    name_de = row["NAME_DE"]
    # Nothing to translate from: keep the value missing instead of crashing
    if not isinstance(name_de, str):
        return current

    # Don't translate if the value...
    # ...consists of multiple acronyms separated by hyphen, plus, whitespace, etc.
    if _ACRONYM_PAIR_RE.match(name_de):
        return name_de
    # ...is a programming language
    if row["PARENT"] == "Programmiersprachen":
        return name_de
    # ...is a short word (five characters or fewer)
    if len(name_de) <= 5:
        return name_de
    # ...is already in English
    if is_english(name_de):
        return name_de

    # Try some Wikipedia+Yandex magic as a last resort
    return wikimagic(name_de)


def ontology_translate(df):
    """Fill the missing ``NAME_EN`` values of an ontology frame.

    Rows that already have a ``NAME_EN`` are left alone. For the others the
    German name is copied unchanged when it is an acronym pair, a child of
    "Programmiersprachen", at most five characters long, or detected as
    English; everything else goes through ``wikimagic``. Rows whose
    ``NAME_DE`` is not a string stay missing.

    The number of missing ``NAME_EN`` values is printed before and after.

    Args:
        df: DataFrame with the columns ``NAME_DE``, ``NAME_EN`` and
            ``PARENT``. It is modified in place.

    Returns:
        The same DataFrame, with ``NAME_EN`` filled and ``PARENT`` dropped.
    """
    print(f"MISSING VALUES BEFORE: {int(df['NAME_EN'].isna().sum())}")

    # Row-wise apply on an empty frame yields a DataFrame that cannot be
    # assigned to a single column, so there is nothing to do in that case.
    if not df.empty:
        # Use the tqdm progress bar when tqdm.pandas() has been registered,
        # plain apply otherwise.
        apply_rows = getattr(df, "progress_apply", df.apply)
        df["NAME_EN"] = apply_rows(_translate_row, axis=1)

    print(f"MISSING VALUES AFTER: {int(df['NAME_EN'].isna().sum())}")
    df.drop(["PARENT"], axis=1, inplace=True)
    return df
108
109# Write out CSV