#import libraries
import requests
from airtable import Airtable
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings
import pickle
import gzip, pickletools
from configparser import ConfigParser

#get API key from config file and get data from Airtable - base and API key are in a hidden config folder
config = ConfigParser()
config.read('config/config.ini')
key = config.get('default', 'api_key')
base = config.get('default', 'base')
base_health = config.get('default', 'base_health')
base_cra = config.get('default', 'base_cra')
base_travel = config.get('default', 'base_travel')
print('Accessed the keys - PF')

airtable_main = Airtable(base, 'Page feedback', api_key=key)
airtable_health = Airtable(base_health, 'Page feedback', api_key=key)
airtable_cra = Airtable(base_cra, 'Page feedback', api_key=key)
airtable_travel = Airtable(base_travel, 'Page feedback', api_key=key)

record_list_main = airtable_main.get_all()
record_list_health = airtable_health.get_all()
record_list_cra = airtable_cra.get_all()
record_list_travel = airtable_travel.get_all()
print('Fetched the data - PF')

#convert data to Pandas dataframes
data_main = pd.DataFrame([record['fields'] for record in record_list_main])
data_health = pd.DataFrame([record['fields'] for record in record_list_health])
data_cra = pd.DataFrame([record['fields'] for record in record_list_cra])
data_travel = pd.DataFrame([record['fields'] for record in record_list_travel])

#If you want to experiment with this script without setting up an Airtable, you can load the tagged_feedback.csv file from the repo and convert it to a Pandas dataframe with this line of code: "data = pd.read_csv('tagged_feedback.csv')".

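# A minimal sketch of that offline path (it assumes tagged_feedback.csv carries the
# same columns as the Airtable export - Comment, Lookup_tags, URL, Model function,
# Tags confirmed, Lang); uncomment it and skip the Airtable pull above and the
# concat step below:
# data = pd.read_csv('tagged_feedback.csv')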

#combine the four bases into one dataframe (pandas 2.0 removed DataFrame.append, so use pd.concat)
data = pd.concat([data_main, data_health, data_cra, data_travel], ignore_index=True, sort=True)

# use creds to create a client to interact with the Google Drive API
scope = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive'
]
creds = ServiceAccountCredentials.from_json_keyfile_name('client_secret.json', scope)
client = gspread.authorize(creds)

# Find a spreadsheet by name and open the first sheet
sheet = client.open("Tier 1 - CronJob Models by URL ").sheet1

# Add URLs to dictionary as key, with model as value (first occurrence wins)
list_of_entries = sheet.get_all_records()
url_model_dict = {}
for entry in list_of_entries:
    if entry['URL'] not in url_model_dict:
        url_model_dict[entry['URL']] = entry['MODEL']

# Remove unnecessary columns
data = data[['Comment', 'Lookup_tags', 'URL', 'Model function', 'Tags confirmed', 'Lang']]

# Add a model column by retrieving each URL's model from the dictionary
data['model'] = data['URL'].map(url_model_dict)

#split dataframe for English comments
data_en = data[data['Lang'].str.contains("EN", na=False)]

#remove all rows that have null content - this, in effect, removes comments for which the tags haven't been confirmed by a human
data_en_topic = data_en.dropna()
data_en_topic = data_en_topic.drop_duplicates(subset="Comment")
data_en_topic = data_en_topic.reset_index(drop=True)

#converts the tags to a string (instead of a list) - needed for further processing - and puts it in a new column
data_en_topic['topics'] = [','.join(map(str, l)) for l in data_en_topic['Lookup_tags']]
data_en_topic['Model function'] = [','.join(map(str, l)) for l in data_en_topic['Model function']]  ####### might not be needed

# Check that the model looked up from the sheet matches the 'Model function' field
for i in range(len(data_en_topic)):
    if data_en_topic['Model function'][i] != data_en_topic['model'][i]:
        print("Found mismatched models - EN")
        # print(data_en_topic['Model function'][i])
        # print(data_en_topic['model'][i])
        # print(i)
        # print()

# Remove unnecessary columns
data_en_topic = data_en_topic.drop(columns=['Lookup_tags', 'Model function', 'Tags confirmed'])

# for i in range(len(data_en_topic)):
#     data_en_topic['model'][i] = data_en_topic['model'][i].lower()

#split dataframe for French comments - same steps as for English above
data_fr = data[data['Lang'].str.contains("FR", na=False)]
data_fr_topic = data_fr.dropna()
data_fr_topic = data_fr_topic.drop_duplicates(subset="Comment")
data_fr_topic = data_fr_topic.reset_index(drop=True)
data_fr_topic['topics'] = [','.join(map(str, l)) for l in data_fr_topic['Lookup_tags']]
data_fr_topic['Model function'] = [','.join(map(str, l)) for l in data_fr_topic['Model function']]

# Check that the model looked up from the sheet matches the 'Model function' field
for i in range(len(data_fr_topic)):
    if data_fr_topic['Model function'][i] != data_fr_topic['model'][i]:
        print("Found mismatched models - FR")
        # print(data_fr_topic['Model function'][i])
        # print(data_fr_topic['model'][i])
        # print(i)
        # print()

# Remove unnecessary columns
data_fr_topic = data_fr_topic.drop(columns=['Lookup_tags', 'Model function', 'Tags confirmed'])

# for i in range(len(data_fr_topic)):
#     data_fr_topic['model'][i] = data_fr_topic['model'][i].lower()

#get the different possible models
topics_en = list(data_en_topic['model'].unique())
topics_fr = list(data_fr_topic['model'].unique())

#create a dictionary (key = model, value = tagged feedback for that model)
sections_en = {topic: data_en_topic[data_en_topic['model'].str.contains(topic, na=False)] for topic in topics_en}
sections_fr = {topic: data_fr_topic[data_fr_topic['model'].str.contains(topic, na=False)] for topic in topics_fr}

#reset index for each model
for cat in sections_en:
    sections_en[cat] = sections_en[cat].reset_index(drop=True)

for cat in sections_fr:
    sections_fr[cat] = sections_fr[cat].reset_index(drop=True)

#import the MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

#convert the English tags to a one-hot label matrix (one column per tag)
cats_en = {}
for section in sections_en:
    mlb = MultiLabelBinarizer()
    mhv = mlb.fit_transform(sections_en[section]['topics'].apply(lambda x: set(x.split(','))))
    cats_en[section] = pd.DataFrame(mhv, columns=mlb.classes_)
    cats_en[section].insert(0, 'Feedback', sections_en[section]['Comment'])

#convert the French tags to a one-hot label matrix (one column per tag)
cats_fr = {}
for section in sections_fr:
    mlb = MultiLabelBinarizer()
    mhv = mlb.fit_transform(sections_fr[section]['topics'].apply(lambda x: set(x.split(','))))
    cats_fr[section] = pd.DataFrame(mhv, columns=mlb.classes_)
    cats_fr[section].insert(0, 'Feedback', sections_fr[section]['Comment'])

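# For illustration only (hypothetical tags): after MultiLabelBinarizer, a 'topics'
# column containing "passport,fees" and "fees" for two comments becomes one 0/1
# column per tag, with the raw comment kept in the 'Feedback' column:
#   Feedback           fees  passport
#   <first comment>       1         1
#   <second comment>      1         0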

#pre-process feedback for NLP

if not sys.warnoptions:
    warnings.simplefilter("ignore")

#function to clean the text of punctuation and special characters
def cleanPunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n", " ")
    return cleaned

#function to keep only alphabetic characters
def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

#function to stem feedback (English)
stemmer_en = SnowballStemmer("english")
def stemming_en(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer_en.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

#apply pre-processing functions to English feedback
for cat in cats_en:
    cats_en[cat]['Feedback'] = cats_en[cat]['Feedback'].str.lower()
    cats_en[cat]['Feedback'] = cats_en[cat]['Feedback'].apply(cleanPunc)
    cats_en[cat]['Feedback'] = cats_en[cat]['Feedback'].apply(keepAlpha)
    cats_en[cat]['Feedback'] = cats_en[cat]['Feedback'].apply(stemming_en)

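# For illustration only (hypothetical input): the English pre-processing chain
# lower -> cleanPunc -> keepAlpha -> stemming_en turns a comment like
# "Waiting times for passports?" into roughly "wait time for passport".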

#function to stem feedback (French)
stemmer_fr = SnowballStemmer("french")
def stemming_fr(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer_fr.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

#apply pre-processing functions to French feedback
#note: keepAlpha replaces accented characters with spaces, which affects French text
for cat in cats_fr:
    cats_fr[cat]['Feedback'] = cats_fr[cat]['Feedback'].str.lower()
    cats_fr[cat]['Feedback'] = cats_fr[cat]['Feedback'].apply(cleanPunc)
    cats_fr[cat]['Feedback'] = cats_fr[cat]['Feedback'].apply(keepAlpha)
    cats_fr[cat]['Feedback'] = cats_fr[cat]['Feedback'].apply(stemming_fr)

#import vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#get all English text to build the vectorizer, as a dictionary (one key per model)
all_text_en = {}
for cat in cats_en:
    all_text_en[cat] = cats_en[cat]['Feedback'].values.astype('U')

#perform TF-IDF vectorization for each English model (using dictionaries)
vects_en = {}
all_x_en = {}
for cat in all_text_en:
    vectorizer_en = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
    vects_en[cat] = vectorizer_en.fit(all_text_en[cat])
    all_x_en[cat] = vects_en[cat].transform(all_text_en[cat])

#split the English labels from the text - get all possible tags for each model
all_y_en = {}
categories_en = {}
for cat in all_x_en:
    all_y_en[cat] = cats_en[cat].drop(labels=['Feedback'], axis=1)
    categories_en[cat] = list(all_y_en[cat].columns.values)

#get all French text to build the vectorizer, as a dictionary (one key per model)
all_text_fr = {}
for cat in cats_fr:
    all_text_fr[cat] = cats_fr[cat]['Feedback'].values.astype('U')

#perform TF-IDF vectorization for each French model (using dictionaries)
vects_fr = {}
all_x_fr = {}
for cat in all_text_fr:
    vectorizer_fr = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
    vects_fr[cat] = vectorizer_fr.fit(all_text_fr[cat])
    all_x_fr[cat] = vects_fr[cat].transform(all_text_fr[cat])

#split the French labels from the text - get all possible tags for each model
all_y_fr = {}
categories_fr = {}
for cat in all_x_fr:
    all_y_fr[cat] = cats_fr[cat].drop(labels=['Feedback'], axis=1)
    categories_fr[cat] = list(all_y_fr[cat].columns.values)

#import models to train the algorithm
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

#create the English models - one binary Naive Bayes classifier per tag, per model function
model_en = {}
for cat in categories_en:
    model_en[cat] = {}
    for category in categories_en[cat]:
        NB_pipeline = Pipeline([
            ('clf', OneVsRestClassifier(MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None))),
        ])
        NB_pipeline.fit(all_x_en[cat], cats_en[cat][category])
        model_en[cat][category] = NB_pipeline

#create the French models - one binary Naive Bayes classifier per tag, per model function
model_fr = {}
for cat in categories_fr:
    model_fr[cat] = {}
    for category in categories_fr[cat]:
        NB_pipeline = Pipeline([
            ('clf', OneVsRestClassifier(MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None))),
        ])
        NB_pipeline.fit(all_x_fr[cat], cats_fr[cat][category])
        model_fr[cat][category] = NB_pipeline

#save to gzipped pickle files
def serialize(obj, file, protocol=-1):
    with gzip.open(file, "wb") as f:
        pickled = pickle.dumps(obj, protocol=protocol)
        optimized_pickle = pickletools.optimize(pickled)
        f.write(optimized_pickle)

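# A minimal counterpart sketch (not called here) showing how a downstream job could
# load these gzipped, optimized pickles back; this helper is an assumption, not part
# of the original cron job.
def deserialize(file):
    # Read the gzipped file and restore the pickled object
    with gzip.open(file, "rb") as f:
        return pickle.loads(f.read())
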

serialize(categories_en, 'data/categories_en.pickle')
serialize(categories_fr, 'data/categories_fr.pickle')
serialize(vects_en, 'data/vectorizer_en.pickle')
serialize(vects_fr, 'data/vectorizer_fr.pickle')
serialize(model_en, 'data/model_en.pickle')
serialize(model_fr, 'data/model_fr.pickle')

print('Feedback processing complete - PF')
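
# A hedged sketch (commented out) of how a downstream prediction job might use the
# saved artifacts to tag a new English comment; the chosen model function and the
# example text are hypothetical, and the real prediction step may differ.
# cat = topics_en[0]                                   # pick one model function
# new_text = stemming_en(keepAlpha(cleanPunc("example feedback text".lower())))
# x_new = vects_en[cat].transform([new_text])          # reuse the fitted vectorizer
# predicted_tags = [tag for tag in categories_en[cat]
#                   if model_en[cat][tag].predict(x_new)[0] == 1]
# print(predicted_tags)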