# Paste metadata: 6 years ago · Oct 29, 2019, 01:56 AM
1import json
2import requests
3import sqlite3
4import logger
5import os
6import datetime
7import pdb
8
9
# Connect to the SQLite database that stores claims and reviews.
name = "db"
path = 'data/' + name + '.sqlite'
con = sqlite3.connect(path)

# raw_input only exists on Python 2; fall back to input() on Python 3 so the
# prompt works under either interpreter.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Parameters for the search.
# Example values for reviewPublisherSiteFilter: dubawa.org, snopes.com,
# politifact.com, fullfact.org, africacheck.org, afp.factcheck.com,
# indiatoday.in, bbc.com, teyit.org
publisher_site = read_line('Enter publisher_site: ')
pageSize = 10000  # pageSize key: limit of max claims to search for
# SECURITY NOTE(review): an API key is committed in source. Prefer the API_KEY
# environment variable; the literal is kept only as a fallback so existing
# runs keep working. Rotate the key and remove the fallback.
api_key = os.environ.get('API_KEY', 'AIzaSyDfghJARYkNf_zm8n_yPu6SSJwET3ZGR1U')
query = ""  # query key (currently not being used)
request_string = 'https://factchecktools.googleapis.com/v1alpha1/claims:search'
# To search by free-text query instead of by publisher, replace the
# 'reviewPublisherSiteFilter' entry below with 'query': query.
response = requests.get(  # searching API
    request_string,
    params={
        'pageSize': pageSize,
        'reviewPublisherSiteFilter': publisher_site,
        'key': api_key,
    },
)

logger.important_message("publisher_site: " + publisher_site + ", query: " + query)

# Fail here with a clear HTTP error instead of later, when the response body
# turns out not to contain the expected payload.
response.raise_for_status()

# cursor object
cur = con.cursor()
# de-serialising response
response_json = response.json()

# Store the raw response as a JSON file next to the database.
print("storing as a json object")
x = datetime.datetime.now()  # x represents current time and date
# File name stamp is year-day-MonthName-hour:minute with no zero padding.
# The format is kept as-is because file_name is later stored in the database.
date_as_string = '-'.join([str(x.year), str(x.day), x.strftime('%B'),
                           str(x.hour) + ':' + str(x.minute)])
file_name = 'data/' + date_as_string + '_' + publisher_site + '.json'
with open(file_name, 'w') as outfile:
    json.dump(response_json, outfile)
45
46
def table_size(table_name, cursor=None):
    """Return the number of rows in the table called ``table_name``.

    table_name -- name of an existing table. It is concatenated into the SQL
                  text because SQLite cannot bind identifiers as parameters,
                  so only pass trusted, hard-coded names.
    cursor     -- optional sqlite3 cursor to run the query on; defaults to the
                  module-level ``cur``.
    """
    if cursor is None:
        cursor = cur
    # COUNT(*) lets SQLite do the counting instead of fetching every row into
    # Python just to take len() of the result.
    count_sql = "SELECT COUNT(*) FROM " + str(table_name)
    return cursor.execute(count_sql).fetchone()[0]
54
def extract_claim_review(claimReview):
    """Flatten a claimReview object into a dictionary of its fields.

    claimReview -- dict that may contain 'url', 'title', 'reviewDate',
                   'textualRating', 'languageCode' and a nested 'publisher'
                   dict with 'name' and 'site'.

    Returns a dict with the keys publisherSite, publisherName, url, title,
    reviewDate, textualRating and languageCode. Any field missing from the
    input is returned as None.
    """
    # A missing or null 'publisher' is treated as "no publisher information".
    publisher = claimReview.get('publisher') or {}
    return {
        'publisherSite': publisher.get('site'),
        'publisherName': publisher.get('name'),
        'url': claimReview.get('url'),
        'title': claimReview.get('title'),
        'reviewDate': claimReview.get('reviewDate'),
        'textualRating': claimReview.get('textualRating'),
        'languageCode': claimReview.get('languageCode'),
    }
93
def add_claim_instance(ID, values, cursor=None):
    """Insert one row into the claimInstance table.

    ID     -- claimInstanceID to store for the new row (chosen by the caller;
              it is not generated here).
    values -- dict with the keys 'text', 'claimant' and 'claimDate'.
    cursor -- optional sqlite3 cursor to execute on; defaults to the
              module-level ``cur``.

    The insert is not committed here.
    """
    if cursor is None:
        cursor = cur
    claim_instance_sql = ('INSERT INTO claimInstance '
                          '(claimInstanceID, claimText, claimant, claimDate) '
                          'VALUES (?, ?, ?, ?)')
    # Use the ID argument: the previous body read a global named
    # claimInstanceID and silently ignored the parameter.
    cursor.execute(claim_instance_sql, (ID, values['text'],
                                        values['claimant'], values['claimDate']))
102
def add_json_reference(claimText, filepath, nthClaim, cursor=None):
    """Insert one row into the claimJSONInstance table.

    claimText -- text of the claim.
    filepath  -- path stored in the row's filepath column.
    nthClaim  -- value stored in the row's nthClaim column.
    cursor    -- optional sqlite3 cursor to execute on; defaults to the
                 module-level ``cur``.

    The insert is not committed here.
    """
    if cursor is None:
        cursor = cur
    json_reference_sql = ('INSERT INTO claimJSONInstance '
                          '(claimText, filepath, nthClaim) VALUES (?, ?, ?)')
    cursor.execute(json_reference_sql, (claimText, filepath, nthClaim))
110
111
def add_review(reviewID, values, cursor=None):
    """Insert one row into the claimReview table.

    reviewID -- ID to store for the new row (chosen by the caller; it is not
                generated here).
    values   -- dict with the keys publisherSite, publisherName, url, title,
                reviewDate, textualRating and languageCode.
    cursor   -- optional sqlite3 cursor to execute on; defaults to the
                module-level ``cur``.

    The insert is not committed here.
    """
    if cursor is None:
        cursor = cur
    claim_review_sql = ('INSERT INTO claimReview (reviewID, publisherSite, publisherName,'
                        ' url, title, reviewDate, textualRating, languageCode)'
                        ' VALUES (?, ?, ?, ?, ?, ?, ?, ?)')
    # Read from the values argument: the previous body read a global named
    # review and silently ignored the parameter.
    cursor.execute(claim_review_sql, (
        reviewID,
        values['publisherSite'],
        values['publisherName'],
        values['url'],
        values['title'],
        values['reviewDate'],
        values['textualRating'],
        values['languageCode'],
    ))
123
def add_review_claim_relationship(claimInstanceID, reviewID, cursor=None):
    """Insert one row into the reviewClaimRelationship table.

    claimInstanceID -- value stored in the row's claimInstanceID column.
    reviewID        -- value stored in the row's reviewID column.
    cursor          -- optional sqlite3 cursor to execute on; defaults to the
                       module-level ``cur``.

    The insert is not committed here.
    """
    if cursor is None:
        cursor = cur
    review_claim_relationship_sql = ('INSERT INTO reviewClaimRelationship '
                                     '(claimInstanceID, reviewID) VALUES (?, ?)')
    cursor.execute(review_claim_relationship_sql, (claimInstanceID, reviewID))
131
132
133
# NOTE(review): the leading integers on every line below (134, 135, ...) are
# line-number residue from the paste this file was recovered from, and the
# original indentation has been flattened to a single space. The statements
# are reproduced unchanged; the numbers must be stripped and the nesting
# restored before this section can run.
#
# Main ingestion loop. For each claim in response_json['claims'] that has a
# 'text' field: look up stored claimInstance rows with the same claimText,
# compare the incoming claimant/claimDate with each stored row, insert a new
# claimInstance when nothing matched, then insert claimReview rows and
# reviewClaimRelationship links that are not stored yet.
# NOTE(review): response_json['claims'] raises KeyError when the response has
# no 'claims' key.
134# cycle through each claim; insert the claim, insert the claim review
135claim_index_number = 0;
136for claim in response_json['claims']:
137
138 if 'text' in claim:
139
140 # obtain variables for claim
141 text = claim['text']
142
143 if 'claimDate' in claim:
144 claimDate = claim['claimDate']
145 else:
146 claimDate = None
147
148 if 'claimant' in claim:
149 claimant = claim['claimant']
150 else:
151 claimant = None
152
153 # query the database to see if claimInstance already exists:
154 q = "SELECT * FROM claimInstance WHERE claimText = ?"
155 list_instances = cur.execute(q, (text,)).fetchall()
156 matching_number_of_rows = len(list_instances)
157 claim_exists = matching_number_of_rows > 0
158 if claim_exists:
159
160 # the new claimInstance will supercede an existing claimInstance if it contains an additional
161 # field to this existing claimInstance.
162 claimInstanceID = None # stores old ID, or used to store a new ID
163 claim_instance_already_exists = True # of whether or not the claimInstance exists
164
165 # for each of the existing claimInstances, if the claimInstance has a null claimant
166 # we assume that this is the only claimInstance, and supercede it.
167 i = 0 # counter variable
168 continue_cycle = True
169 superceded = False
170 identical_found = False
# Scan the stored rows until one branch clears continue_cycle or the rows run
# out. Because the query is SELECT *, the positional indexes below ([0], [2],
# [3]) depend on the column order of the claimInstance table.
171 while continue_cycle and i < len(list_instances):
172 claimInstance_i = list_instances[i]
173 instance_claimant = claimInstance_i[2] # the claimant of this existing claimInstance
174 instance_claimID = claimInstance_i[0] # the claimInstanceID of this existing claimInstance
175 instance_claimDate = claimInstance_i[3] # the claimDate of this existing claimInstance
176
# The numbered cases presumably enumerate which of (incoming claimDate,
# incoming claimant, stored claimDate, stored claimant) are null -- TODO confirm
# against the original design notes; cases 7 and 10 have no branch here.
# NOTE(review): every "SUPERCEDE" branch calls pdb.set_trace(), which stops in
# the interactive debugger. Those branches only set superceded = True; no
# stored row is updated in this section.
177 if claimDate == None:
178 # 1 - 4
179 if claimant == None:
180 logger.important_message("case 1-4" + str(text.encode("utf-8")))
# NOTE(review): this branch records the first stored ID but does not set
# identical_found, so the check after the loop appears to still insert a new
# claimInstance and overwrite claimInstanceID -- confirm this is intended.
181 claimInstanceID = list_instances[0][0]
182 continue_cycle = False
183 else:
184 if instance_claimDate == None:
185 # case 5
186 if instance_claimant == None:
187 logger.important_message("case 5" + str(text.encode("utf-8")))
188 print("SUPERCEDE")
189 pdb.set_trace()
190 superceded = True
191 # case 6
192 else:
193 if claimant == instance_claimant:
194 logger.important_message("case 6: " + str(text.encode("utf-8")))
195 identical_found = True
196 continue_cycle = False
197 claimInstanceID = instance_claimID
198 else:
199 # case 8
200 if instance_claimant != None:
201 if claimant == instance_claimant:
202 logger.important_message("case 8" + str(text.encode("utf-8")))
203 identical_found = True
204 continue_cycle = False
205 claimInstanceID = instance_claimID
206 else:
207 if claimant == None:
208 if instance_claimDate == None:
209 # case 9
210 if instance_claimant == None:
211 print("SUPERCEDE")
212 pdb.set_trace()
213 superceded = True
214 else:
215 # case 11
216 if instance_claimant == None:
217 if claimant == instance_claimant:
218 logger.important_message("case 11" + str(text.encode("utf-8")))
219 identical_found = True
220 continue_cycle = False
221 claimInstanceID = instance_claimID
222 # case 12
223 else:
224 if claimDate == instance_claimDate:
225 logger.important_message("case 12" + str(text.encode("utf-8")))
226 identical_found = True
227 continue_cycle = False
228 claimInstanceID = instance_claimID
229
230 else:
231 if instance_claimDate == None:
232 # case 13
233 if instance_claimant == None:
234 print("SUPERCEDE")
235 pdb.set_trace()
236 superceded = True
237 # case 14
238 else:
239 if claimant == instance_claimant:
240 print("SUPERCEDE")
241 pdb.set_trace()
242 superceded = True
243 else:
244 # case 15
245 if instance_claimant == None:
246 if claimDate == instance_claimDate:
247 print("SUPERCEDE")
248 pdb.set_trace()
249 superceded = True
250 # case 16
251 else:
252 if (claimDate == instance_claimDate) and (claimant == instance_claimant):
253 logger.important_message("case 16" + str(text.encode("utf-8")))
254 identical_found = True
255 continue_cycle = False
256 claimInstanceID = instance_claimID
257 i += 1
258
# NOTE(review): the condition below is true when NO stored row was identical
# and none was superseded, and the body inserts a new claimInstance. The
# comment on the next line and the "identical found" log text say the
# opposite of what the condition tests.
# The new ID is the current row count of claimInstance plus one.
259 # for cases where an identical is found
260 if (not superceded) and (not identical_found):
261 logger.important_message("identical found, index: " + str(claim_index_number))
262 claimInstanceID = table_size("claimInstance") + 1
263 claim_instance_already_exists = False
264 add_claim_instance(claimInstanceID, {'text': text, 'claimant': claimant,
265 'claimDate': claimDate})
266 continue_cycle = False
267
268 # check if the reviews of the claim already exist:
269 for claimReview in claim['claimReview']:
270 review = extract_claim_review(claimReview)
271 # TODO find way to check if urls direct to the same place
# NOTE(review): every column is compared with LIKE. In SQL a NULL operand
# makes LIKE evaluate to NULL, so a review with any None field can never match
# an existing row here; '%' and '_' inside the values act as wildcards.
272 check_review = ("SELECT * FROM claimReview WHERE publisherSite LIKE ? AND publisherName LIKE ? "
273 + "AND url LIKE ? AND title LIKE ? AND reviewDate LIKE ? AND textualRating LIKE ? AND languageCode LIKE ?")
274 review_query_result = cur.execute(check_review,
275 (review['publisherSite'], review['publisherName'],
276 review['url'], review['title'],
277 review['reviewDate'], review['textualRating'],
278 review['languageCode'])).fetchall()
279
280 # check to see if the review already is in the database:
281 if len(review_query_result) > 0: # case when the review already exists
282 # if the claimInstance of the review was new, trace the
283 # existing claimReview, find its reviewID, and then add a
284 # new reviewClaimRelationship
285 if not claim_instance_already_exists:
286 print("case 1.1")
287 try:
288 reviewID = review_query_result[0][0]
289 add_review_claim_relationship(claimInstanceID, reviewID)
290 except Exception as e:
291 print("FAILED case 1.1")
292 logger.error((str(e)))
293
294 # case when the review already exists, and claim instance
295 # already exists, so do nothing
296 else:
297 print("case 1.2")
298
299 else: # case when the review is different to any existing reviews
300
# Cases 2.1 and 2.2 run the same statements and differ only in the printed
# labels. Both call extract_claim_review again although `review` was already
# computed for this claimReview above.
301 if claim_instance_already_exists:
302 print("case 2.1")
303 try:
304 reviewID = table_size("claimReview") + 1
305 review = extract_claim_review(claimReview)
306 add_review(reviewID, review)
307 add_review_claim_relationship(claimInstanceID, reviewID)
308
309 except Exception as e:
310 print("FAILED case 2.1")
311 print("claimInstanceID: " + str(claimInstanceID))
312 logger.error((str(e)))
313
314 else:
315 print("case 2.2")
316 try:
317 reviewID = table_size("claimReview") + 1
318 review = extract_claim_review(claimReview)
319 add_review(reviewID, review)
320 add_review_claim_relationship(claimInstanceID, reviewID)
321 except Exception as e:
322 print("FAILED case 2.2")
323 logger.error((str(e)))
324
# No stored claimInstance has this text: insert the claim, its claimInstance,
# and each of its reviews with a relationship row. Any exception raised in
# this try block is printed and logged, and the script carries on.
325 else:
326 try:
327 # add the claim to the db
328 claim_sql = 'INSERT INTO claim (claimText) VALUES (?)'
329 cur.execute(claim_sql, (text,))
330 # add the claimInstance to the db
331 claimInstanceID = table_size("claimInstance") + 1
332 add_claim_instance(claimInstanceID, {'text': text, 'claimant': claimant,
333 'claimDate': claimDate})
334
335 # add the claim reviews for the given claim
336 for claimReview in claim['claimReview']:
337 review = extract_claim_review(claimReview)
338 reviewID = table_size("claimReview") + 1
339 add_review(reviewID, review)
340 add_review_claim_relationship(claimInstanceID, reviewID)
341 except Exception as e:
342 print(str(e))
343 logger.error((str(e)))
344
# Record which JSON dump (file_name) and which position in it this claim came
# from, then advance the position counter.
345 # add the json reference for the claim
346 add_json_reference(text, file_name, claim_index_number)
347 # increment claim_index_number for next claim to iterate over
348 claim_index_number += 1
349
350
# This is the only commit visible in this section; the inserts above are
# written to the database file here.
351# commit, then close connection with the database
352con.commit()
353con.close()