· 7 years ago · Mar 16, 2019, 05:42 PM
1import sys
2import os
3import re
4import datetime
5
6
def get_email_list(name, domain_list, total_left):
    """Move the addresses matching a group's suffixes out of *total_left*.

    An address belongs to the group when it ends with at least one of the
    strings in *domain_list* (for example ``"@gmail.com"`` or ``".co.uk"``).

    Args:
        name: Label of the group; stored in the result and printed.
        domain_list: Iterable of suffix strings to match against.
        total_left: List of addresses still unassigned. It is updated in
            place: matching addresses are removed, the relative order of
            the remaining ones is kept.

    Returns:
        A pair ``(language, total_left)`` where ``language`` is the tuple
        ``(match_count, name, matched_addresses)`` and ``total_left`` is the
        same list object that was passed in, now without the matches.
    """
    # str.endswith accepts a tuple, so every suffix is checked in one call.
    suffixes = tuple(domain_list)

    # Single pass instead of list.remove() per hit (which was quadratic).
    email_list = []
    remaining = []
    for email in total_left:
        if email.endswith(suffixes):
            email_list.append(email)
        else:
            remaining.append(email)

    # Slice assignment keeps the caller's list object, as before.
    total_left[:] = remaining

    language = (len(email_list), name, email_list)
    print("Total left {} : {} {} ".format(len(total_left), language[1],
                                          language[0]))
    return language, total_left
17
18
# Suffixes whose addresses go into the "English" group. This group is
# extracted first and written to the report in chunks.
_ENGLISH_DOMAINS = [
    "@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in", ".ie", ".com.sg",
    ".co.uk", ".com.au", "@icloud.com", "@outlook.com", "@protonmail.com",
    "@aol.com", "@ymail.com", "@live.com", "@mac.com", "@msn.com",
    "@yahoo.ca", "@me.com", "@googlemail.com", "@yahoo.com.sg", "@yahoo.ie",
    "@btinternet.com", "@eircom.net", "@comcast.net",
]

# (group name, suffixes) in the order the groups are extracted. The order
# matters: an address is claimed by the first group whose suffix matches,
# and it also breaks ties when the groups are ranked by size.
_LANGUAGE_DOMAINS = [
    ("Chinese", [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
                 "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com"]),
    ("Korean", [".kr", "@naver.com", "@hanmail.net", "@nate.com",
                "@daum.net", "@korea.com", "@posco.com"]),
    ("Japanese", [".jp", "@ab.wakwak.com", "@nifty.com"]),
    ("German", [".de", ".at", ".ch", "@gmx.net"]),
    ("France", [".fr", ".nc", "@laposte.net", "@kedgebs.com"]),
    ("Italian", [".it"]),
    ("Russian", [".ru", "@yandex.com", "@ukr.net"]),
    ("Portugese", [".br", ".pt"]),
    ("Polish", [".pl"]),
    ("Swedish", [".se"]),
    ("Czech", [".cz"]),
    ("Croatia(Not available) ", [".hr"]),
    ("Hebrew", [".il"]),
    ("Indonesia", [".id", "@mcreasindo.com"]),
    ("Dutch", [".nl", "@chocoweb.com", "@vierbergen.net"]),
    ("Spanish", [".cl", ".ar", ".es", "@mapp-oea.org"]),
]


def _extract_email(line):
    """Reduce one raw input line to a lower-case address string."""
    line = line.lower().strip('\n')
    # Keep only what follows the last "(", e.g. "Name (a@b.com)" -> "a@b.com)".
    line = line.rpartition("(")[2]
    # "\u00e2\u00a0" is presumably a UTF-8 non-breaking space that was decoded
    # with a single-byte codec and then lower-cased; the bare non-breaking
    # space is removed too in case the file was decoded correctly.
    for junk in (")", " ", "\u00e2\u00a0", "\u00a0"):
        line = line.replace(junk, "")
    return line


def _partition(items, predicate):
    """Split *items* into ``(matching, rest)`` lists, keeping their order."""
    matching = []
    rest = []
    for item in items:
        if predicate(item):
            matching.append(item)
        else:
            rest.append(item)
    return matching, rest


def _english_sections(emails, chunk_size=1000, max_sections=4):
    """Return ``(title, addresses)`` pairs covering every entry of *emails*.

    Fewer than *chunk_size* addresses give one "English" section. Otherwise
    the list is cut into consecutive chunks of *chunk_size*; the last section
    (at most the *max_sections*-th) takes everything that remains.
    """
    if len(emails) < chunk_size:
        return [("English", emails)]

    count = min(len(emails) // chunk_size + 1, max_sections)
    sections = []
    for index in range(count):
        start = index * chunk_size
        if index == count - 1:
            title = "English {}-".format(start + 1)
            chunk = emails[start:]
        else:
            first = start + 1 if index else 0
            title = "English {}-{}".format(first, start + chunk_size)
            # Slices are contiguous, so no address falls between sections.
            chunk = emails[start:start + chunk_size]
        sections.append((title, chunk))
    return sections


def main():
    """Group the e-mail addresses of a text file by language and report them.

    Command line: ``EMAIL_FILE [NO_REVIEWER_FILE]``.

    Every line of EMAIL_FILE is normalised to a bare lower-case address and
    duplicates are dropped. Addresses listed (one per line) in the optional
    NO_REVIEWER_FILE are excluded. The rest is assigned to groups by address
    suffix with ``get_email_list``. English-group addresses containing a
    common Vietnamese name are moved to the Vietnamese group, and leftover
    addresses that look malformed are collected separately. The result is
    written to ``tripadvisor_YYYY_MM_DD.txt`` in the current directory.

    Exits via ``sys.exit`` when the input file argument is missing or does
    not name an existing file.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: python <script> EMAIL_FILE [NO_REVIEWER_FILE]")

    file_path = sys.argv[1]
    if not os.path.isfile(file_path):
        print("File path {} does not exist. Program Exiting..".format(
            file_path))
        sys.exit()

    # Optional exclusion list; a set gives constant-time membership tests.
    no_reviewer_emails = set()
    if len(sys.argv) > 2 and os.path.isfile(sys.argv[2]):
        with open(sys.argv[2]) as fp2:
            no_reviewer_emails = {line.lower().strip('\n') for line in fp2}

    with open(file_path) as fp:
        lines = [_extract_email(line) for line in fp]

    total_left = list(dict.fromkeys(lines))
    print("Total {} ".format(len(lines)))
    print("Total left {}. Duplicates removed.".format(len(total_left)))
    total_left = [x for x in sorted(total_left)
                  if x not in no_reviewer_emails]
    print("Total left {}. No Reviewers removed.".format(len(total_left)))

    english, total_left = get_email_list("English", _ENGLISH_DOMAINS,
                                         total_left)

    languages = []
    for group_name, domains in _LANGUAGE_DOMAINS:
        group, total_left = get_email_list(group_name, domains, total_left)
        languages.append(group)

    # Education addresses are reported in their own section and are not
    # ranked with the language groups.
    education, total_left = get_email_list(
        "Education", [".edu", ".edu.sg", ".edu.au"], total_left)

    vietnamese, total_left = get_email_list("Vietnamese", [".vn"],
                                            total_left)

    # Addresses on English providers that contain a common Vietnamese name
    # are moved to the Vietnamese group.
    vietnamese_name = ("nguyen", "hoang", "ngoc", "phuong")
    moved, english_emails = _partition(
        english[2],
        lambda email: any(part in email for part in vietnamese_name))
    vietnamese_emails = vietnamese[2] + moved

    english = (len(english_emails), english[1], english_emails)
    vietnamese = (len(vietnamese_emails), vietnamese[1], vietnamese_emails)
    print("Total left {} : {} {} : {} {} "
          .format(len(total_left), english[1], english[0], vietnamese[1],
                  vietnamese[0]))
    languages.append(vietnamese)

    # "@gmail.com" and "@aol.com" were already claimed by the English group,
    # so anything still containing "@gmail" or "@aol" has a different
    # ending. Entries without "@", or with a one-character label after a
    # dot, are flagged as well.
    one_char_ending = re.compile(r".*\..$")
    one_char_label = re.compile(r".*\..\..*")

    def is_bad(email):
        return ("@gmail" in email or "@aol" in email or "@" not in email
                or one_char_ending.search(email)
                or one_char_label.search(email))

    bad_email, total_left = _partition(total_left, is_bad)

    print("Total left {} : bad {} ".format(len(total_left), len(bad_email)))
    print("========================")

    # Largest group first; the sort is stable, so ties keep extraction order.
    languages.sort(key=lambda tup: tup[0], reverse=True)
    out_file_name = "tripadvisor_{}.txt" \
        .format(datetime.datetime.today().strftime('%Y_%m_%d'))

    with open(out_file_name, "w") as of:
        for rank, language in enumerate(languages, start=1):
            of.write("\n\n==== {}. {} ====\n".format(rank, language[1]))
            of.write("\n".join(language[2]))

        for title, chunk in _english_sections(english[2]):
            of.write("\n\n==== {} ====\n".format(title))
            of.write("\n".join(chunk))
        if len(english[2]) >= 3000:
            print("Contact IT. There are over 3000 emails!!!")

        of.write("\n\n==== Leftover ====\n")
        of.write("\n".join(total_left))
        of.write("\n\n==== Bad email ====\n")
        of.write("\n".join(bad_email))
        of.write("\n\n")
        of.write(" ".join(["@gmail.com", "@hotmail.com", "@yahoo.com",
                           ".co.in", ".ie", ".com.sg", ".co.uk", ".com.au",
                           ".de", ".fr", ".hk", "co.jp", "@outlook.com",
                           "@aol.com"]))

        of.write("\n")
        of.write("\n\n==== Education ====\n")
        of.write("\n".join(education[2]))
232
233
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
236
237 # domain_list = [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
238 # "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com"]