# Pasted on Dec 11, 2019, 02:30 PM (paste-site header, not part of the program)
import collections
import copy
import csv
import json
import os
import threading
import time
import traceback

import requests
from fuzzywuzzy import fuzz

from db_utils import MongoDB
10
# Not referenced anywhere else in this file.
RESET_CACHE = False

# Connection string handed to MongoDB() by get_documents().
# NOTE(review): this embeds a username and password in source control;
# consider loading it from the environment or a secrets store instead.
SIGMOID_MONGODB_LINK = 'mongodb://sigmoid:7cu6UoDSs2bHPGgl@13.233.203.170:52138/'
# Dataset ids that main() evaluates, one CSV report per id.
DATASET_IDS = [806, 130]
# Upper bound on the documents fetched per dataset (also passed to main()).
NO_OF_SAMPLES = 400
# When True, a field that is empty in both ground truth and prediction is
# scored as "don't care" (DC) and left out of the per-field total.
STRICT_MODE = True
17
18
class Maps:
    """Namespace for lookup tables shared by the evaluation functions."""

    # Label name -> key used for that label in the prediction dictionary.
    # main() rebuilds this for every dataset it processes.
    ENDPOINT_OUTPUT_MAP = dict()
21
22
# Upper-cased text is compared in two normalised forms. Both map the digit
# zero to the letter O (a common OCR confusion) and '-' to '/'.
# Form 1 turns every other separator into '/', form 2 drops it entirely.
_SEPARATORS_TO_SLASH = str.maketrans({
    ' ': '/', '\n': '/', ',': '/', '.': '/', '-': '/', '0': 'O',
})
_SEPARATORS_REMOVED = str.maketrans({
    ' ': None, '\n': None, ',': None, '.': None, '-': '/', '0': 'O',
})


def modify_prediction_dict(gt, prediction_dict):
    """Turn a raw OCR response into per-field predictions.

    A field counts as predicted when its ground-truth value occurs in the OCR
    text of the first entry of ``prediction_dict['ocrs']``, after both have
    been upper-cased and normalised (see the two translation tables above).

    :param gt: mapping of field key -> ground-truth string.
    :param prediction_dict: decoded endpoint response, or ``None`` when the
        request failed.
    :return: ``None`` when ``prediction_dict`` is ``None`` or contains an
        ``"error"`` key; otherwise a dict holding ``timeElapsed``, ``DORY_OCR``
        (the raw OCR text) and, for every key of ``gt``, either the
        ground-truth value (found) or ``"NO_MATCH"``.
    """
    # post_request() returns None on failure; treat that like an error reply
    # instead of crashing on the membership test below.
    if prediction_dict is None or "error" in prediction_dict:
        return None

    ocr_text = prediction_dict['ocrs'][0]['text']
    new_preds = {
        'timeElapsed': prediction_dict['timeElapsed'],
        'DORY_OCR': ocr_text,
    }

    # The OCR text does not change per field, so normalise it once.
    ocr_upper = ocr_text.upper()
    ocr_slashed = ocr_upper.translate(_SEPARATORS_TO_SLASH)
    ocr_compact = ocr_upper.translate(_SEPARATORS_REMOVED)

    for key, _value in gt.items():
        value_upper = _value.upper()
        found = (
            value_upper.translate(_SEPARATORS_TO_SLASH) in ocr_slashed
            or value_upper.translate(_SEPARATORS_REMOVED) in ocr_compact
        )
        new_preds[key] = _value if found else "NO_MATCH"

    return new_preds
86
87
def post_request(url, timeout=None):
    """Send one image URL to the document-OCR endpoint and return its result.

    :param url: image URL; sent both as ``url`` and as the only entry of
        ``urls`` in the request body.
    :param timeout: optional timeout in seconds forwarded to
        ``requests.post``. The default ``None`` waits indefinitely, which is
        what the call did before this parameter existed.
    :return: the ``result`` object of the decoded response (looked up at the
        top level first, then under ``response``), or the whole decoded body
        when neither is present. The round-trip time in seconds is stored in
        it under ``timeElapsed``. Returns ``None`` if the request, the JSON
        decoding or the bookkeeping raises; the traceback is printed.
    """
    request_dict = {
        "type": "documentOcr",
        "urls": [url],
        "url": url,
        "instance": {
            "id": "random",
            "callbackUrl": ""
        }}

    started_at = time.time()
    try:
        response = requests.post(url="http://api.prv:52106/ai/vision/signzy-ocr/image",
                                 data=json.dumps(request_dict),
                                 headers={
                                     'Content-Type': "application/json",
                                     'cache-control': "no-cache",
                                     'Postman-Token': "510186ab-3336-447d-836b-d9e43eebf3b8"
                                 },
                                 timeout=timeout)
        output = json.loads(response.text)
        time_elapsed = time.time() - started_at

        # The payload may sit at the top level, under "response", or the body
        # may carry no "result" at all; try the candidates in that order.
        try:
            result = output['result']
        except KeyError:
            try:
                result = output['response']['result']
            except KeyError:
                result = output

        result["timeElapsed"] = time_elapsed
        return result
    except Exception:
        # Best effort: one failing document must not abort the whole run.
        traceback.print_exc()
        return None
129
130
def evaluation_logic(key, gt_string, pred_string):
    """Decide whether a prediction matches its ground-truth value.

    The ground truth is canonicalised first: for ``B-GEN`` any value
    containing ``f`` becomes ``FEMALE``, otherwise any value containing ``m``
    becomes ``MALE``; for the date keys ``B-DOB``, ``B-DOI`` and ``B-DOE`` the
    separators ``.``, ``-`` and ``,`` become ``/``. Colons are then removed.
    Both strings are upper-cased and stripped of spaces and newlines before
    being compared. The prediction receives no other normalisation.

    :param key: field key selecting the canonicalisation rules.
    :param gt_string: ground-truth text.
    :param pred_string: predicted text.
    :return: ``True`` when ``fuzz.ratio`` of the two cleaned strings is 100.
    """
    if key == "B-GEN":
        lowered = gt_string.lower()
        if 'f' in lowered:
            gt_string = "FEMALE"
        elif 'm' in lowered:
            gt_string = "MALE"

    if key in ("B-DOB", "B-DOI", "B-DOE"):
        gt_string = gt_string.replace(".", "/").replace("-", "/").replace(",", "/")

    # NOTE(review): applied to every key here; confirm it was not meant to be
    # limited to the date fields above.
    gt_string = gt_string.replace(":", "")

    cleaned_gt = gt_string.upper().replace(' ', "").replace('\n', "")
    cleaned_pred = pred_string.upper().replace(' ', "").replace('\n', "")
    return fuzz.ratio(cleaned_gt, cleaned_pred) == 100
145
146
def evaluate(KEY_DICT, rows):
    """Run every row through the OCR endpoint and score each labelled field.

    Requests are issued on threads in batches: once more than 16 threads are
    in flight they are all joined before the next batch starts, and the last
    partial batch is joined after the loop.

    :param KEY_DICT: dict whose keys are the label names to evaluate (each
        must be a key of every row and of ``Maps.ENDPOINT_OUTPUT_MAP``).
    :param rows: list of dicts holding ``URL``, ``OCR`` and one entry per
        label name.
    :return: ``(EVALUATED_KEY_DICT, evaluated_rows)``. ``evaluated_rows`` has
        one dict per row that produced a usable prediction, with ``URL``,
        ``OCR``, ``OCR_DORY``, ``RESPONSE_TIME`` and, per field, ``_PRED``,
        ``_GT``, ``_SCORE`` and ``_PASS`` entries. ``EVALUATED_KEY_DICT`` maps
        every key that occurs in those dicts to ``""``.
    :raises AssertionError: if a row lacks ``URL``/``OCR`` or a field is
        missing from the processed prediction.
    """
    prediction_rows = dict()

    def thread_fn(url):
        prediction_rows[url] = post_request(url=url)

    def join_all(pending):
        for pending_thread in pending:
            pending_thread.join()

    threads = []
    for index, row in enumerate(rows):
        thread = threading.Thread(target=thread_fn, args=(row["URL"],))
        thread.start()
        threads.append(thread)

        if len(threads) > 16:
            print("Evaluating {0} Of {1} Documents".format(index, len(rows)))
            join_all(threads)
            threads = []
    # Wait for the final, possibly partial, batch.
    join_all(threads)

    evaluated_rows = []
    EVALUATED_KEY_DICT = dict()

    for row in rows:
        if "URL" not in row or "OCR" not in row:
            raise AssertionError("Every row needs 'URL' and 'OCR' entries")

        accuracy_dict = {"URL": row["URL"],
                         "OCR": row["OCR"]}

        ground_truth_dict = {
            Maps.ENDPOINT_OUTPUT_MAP[key]: row[key] for key in KEY_DICT
        }

        # post_request() yields None for a failed request; skip such rows
        # rather than handing None to modify_prediction_dict().
        raw_prediction = prediction_rows.get(row["URL"])
        if raw_prediction is None:
            continue

        prediction_dict = modify_prediction_dict(gt=ground_truth_dict,
                                                 prediction_dict=raw_prediction)
        if prediction_dict is None:
            continue

        accuracy_dict["OCR_DORY"] = prediction_dict["DORY_OCR"]
        accuracy_dict["RESPONSE_TIME"] = "{:.2f}".format(
            prediction_dict["timeElapsed"])

        for key, gt_value in ground_truth_dict.items():
            if key not in prediction_dict:
                raise AssertionError("Key: {0} not in API OUTPUT".format(key))
            pred_value = prediction_dict[key]

            TP = FP = FN = DC = False
            if len(gt_value) == 0 and len(pred_value) == 0 and STRICT_MODE:
                # Nothing expected and nothing predicted: don't care.
                DC = True
            # NOTE(review): `key` is already an endpoint output key; mapping
            # it again only works because main() builds an identity map.
            elif evaluation_logic(key=Maps.ENDPOINT_OUTPUT_MAP[key],
                                  gt_string=gt_value,
                                  pred_string=pred_value):
                TP = True
            elif len(gt_value) and len(pred_value) == 0:
                FN = True
            else:
                FP = True

            accuracy_dict[key + "_PRED"] = pred_value
            accuracy_dict[key + "_GT"] = gt_value
            # Key order matters: the dict is written verbatim into the CSV.
            accuracy_dict[key + "_SCORE"] = {"FP": int(FP),
                                             "TP": int(TP),
                                             "FN": int(FN),
                                             "DC": int(DC)}
            accuracy_dict[key + "_PASS"] = 1 if TP or DC else 0

        for column in accuracy_dict:
            EVALUATED_KEY_DICT[column] = ""

        evaluated_rows.append(accuracy_dict)

    return EVALUATED_KEY_DICT, evaluated_rows
232
233
def generate_csv(KEY_DICT, dataset_id, rows, name):
    """Write the evaluated rows and per-field accuracy metrics to a CSV file.

    The file is ``./out/<dataset_id><name>.csv`` (just ``<dataset_id>.csv``
    when ``name`` is ``None``); the ``./out`` directory is created if needed.
    It holds a header row, one line per evaluated row, three empty lines and
    then the metric table (TP, FP, FN, TotalCount, PRECISION, RECALL,
    F1 SCORE per field).

    :param KEY_DICT: dict whose keys are the column names; columns are
        written in reverse key order.
    :param dataset_id: dataset identifier used in the file name.
    :param rows: list of dicts with one entry per column. ``*_SCORE`` entries
        are dicts with ``TP``, ``FP``, ``FN`` and ``DC`` counts. The rows are
        not modified.
    :param name: optional suffix appended to the dataset id in the file name.
    """
    column_order = list(KEY_DICT.keys())
    column_order.reverse()

    # metric label -> [TP, FP, FN, DC, total]
    metrics = dict()

    table = [column_order]
    for row in rows:
        cells = []
        for label in column_order:
            cell = row[label]
            if "_SCORE" in label:
                totals = metrics.setdefault(label.replace("_SCORE", ""),
                                            [0, 0, 0, 0, 0])
                dont_care = cell["DC"]
                totals[0] += cell["TP"]
                totals[1] += cell["FP"]
                totals[2] += cell["FN"]
                totals[3] += dont_care
                # Don't-care samples are excluded from the total in strict mode.
                totals[4] += 1 - (0 if STRICT_MODE is False else dont_care)
                # The DC flag is not part of the report; drop it from a copy
                # so the caller's row stays intact.
                cell = {k: v for k, v in cell.items() if k != "DC"}
            cells.append(cell)
        table.append(cells)

    accuracy_metrics = [["Metric"], ["TP"], ["FP"], ["FN"], [
        "TotalCount"], ["PRECISION"], ["RECALL"], ["F1 SCORE"]]
    for metric_label, (TP, FP, FN, _dc_count, TOTAL) in metrics.items():
        PRECISION = TP / (TP + FP) if (TP + FP) else 0
        RECALL = TP / (TP + FN) if (TP + FN) else 0
        if PRECISION + RECALL:
            F1_SCORE = 2 * ((PRECISION * RECALL) / (PRECISION + RECALL))
        else:
            F1_SCORE = 0

        values = (metric_label, TP, FP, FN, TOTAL, PRECISION, RECALL, F1_SCORE)
        for metric_row, value in zip(accuracy_metrics, values):
            metric_row.append(value)

    table = table + [[], [], []] + accuracy_metrics

    os.makedirs('./out', exist_ok=True)
    file_stem = dataset_id if name is None else str(dataset_id) + name
    with open('./out/{0}.csv'.format(file_stem), 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(table)
316
317
def get_documents(_dataset_id):
    """Load the documents of a dataset together with their annotated values.

    :param _dataset_id: value matched against ``datasetId`` in the
        ``app_dataset`` collection.
    :return: ``(KEY_DICT, rows)``. ``KEY_DICT`` maps every label name of the
        dataset, plus ``URL`` and ``OCR``, to ``""``. ``rows`` has one dict
        per document (at most ``NO_OF_SAMPLES``) with the same keys: ``URL``
        is the document's ``persistUrl``, ``OCR`` its ``OCR`` text, and each
        label holds the space-joined OCR slices of its annotations (``""``
        when the document has none for that label).
    :raises AssertionError: if no ``app_dataset`` record matches; all dataset
        records are printed first to help pick a valid id.
    """
    mongo_db = MongoDB(db_name="sigmoid", db_link=SIGMOID_MONGODB_LINK)

    # Keep the last matching dataset record; None means nothing matched.
    field = None
    for field in mongo_db.get_bulk_records(table_name="app_dataset", key_dict={
        "datasetId": _dataset_id
    }):
        print(field)

    if field is None:
        for known_dataset in mongo_db.get_bulk_records(table_name="app_dataset", key_dict={}):
            print(known_dataset)
        raise AssertionError("DatasetName not Found")

    dataset_id = field['datasetId']

    documents = list(mongo_db.database["app_document"].find({
        "datasetId": dataset_id
    }).limit(NO_OF_SAMPLES))

    KEY_DICT = dict()

    labels = dict()
    for fields in mongo_db.get_bulk_records(table_name="app_label", key_dict={
        "datasetId": dataset_id
    }):
        labels[fields['labelId']] = {
            "labelType": fields['labelType'],
            "labelName": fields['name']
        }
        KEY_DICT[fields['name']] = ""

    # documentId -> labelId -> list of character spans into the OCR text.
    # NOTE(review): the query has no filter, so every annotation in the
    # collection is loaded; filter server-side if the schema allows it.
    annotations = collections.defaultdict(dict)
    for fields in mongo_db.get_bulk_records(table_name="app_annotation", key_dict={}):
        spans = annotations[fields["documentId"]].setdefault(fields["labelId"], [])
        spans.append({
            "startOffset": fields["startOffset"],
            "endOffset": fields["endOffset"]
        })

    KEY_DICT["URL"] = ""
    KEY_DICT["OCR"] = ""

    rows = []
    for document in documents:
        annotation = annotations[document["documentId"]]
        ocr_text = document['OCR']

        row = copy.deepcopy(KEY_DICT)
        row["URL"] = document['persistUrl']
        row["OCR"] = ocr_text

        for label_id, label in labels.items():
            label_spans = annotation.get(label_id)
            if label_spans is None:
                # No annotation for this label: keep the "" placeholder.
                continue

            row[label["labelName"]] = ' '.join(
                ocr_text[span["startOffset"]: span["endOffset"]]
                for span in label_spans
            )

        rows.append(row)

    print("\nkeys for ENDPOINT_OUTPUT_MAP: {0}".format(KEY_DICT.keys()))

    return KEY_DICT, rows
400
401
def main(max_documents=None, name=None):
    """Evaluate every dataset in ``DATASET_IDS`` and write one CSV per dataset.

    For each dataset the label keys are loaded, ``Maps.ENDPOINT_OUTPUT_MAP``
    is rebuilt as an identity map over those keys, the rows are evaluated and
    the report is written by ``generate_csv``.

    :param max_documents: optional cap on the number of rows evaluated per
        dataset; must be an ``int`` when given.
    :param name: optional suffix for the CSV file name.
    :raises AssertionError: if ``max_documents`` is given but is not an int.
    """
    for dataset_id in DATASET_IDS:
        Maps.ENDPOINT_OUTPUT_MAP = dict()
        KEY_DICT, rows = get_documents(dataset_id)

        # URL and OCR are row metadata, not labels to be scored.
        KEY_DICT.pop('URL', None)
        KEY_DICT.pop('OCR', None)

        for key in KEY_DICT:
            Maps.ENDPOINT_OUTPUT_MAP[key] = key

        if max_documents is not None:
            if not isinstance(max_documents, int):
                raise AssertionError("max_documents must be an int")
            rows = rows[:max_documents]

        EVALUATED_KEY_DICT, evaluated_rows = evaluate(KEY_DICT, rows)
        generate_csv(EVALUATED_KEY_DICT, dataset_id, evaluated_rows, name)
421
422
if __name__ == "__main__":
    import pickle
    import os

    # Collect every file below the checkpoint directory.
    saved_model_names = list()
    for root, _dirs, filenames in os.walk("/home/ai/ai/preciousdory/outs/trail2"):
        for filename in filenames:
            saved_model_names.append(os.path.join(root, filename))

    for saved_model_name in ['/home/ai/ai/preciousdory/outs/trail1/best_accuracy_so_far.pth'] + saved_model_names:
        print(f'Running Model: {saved_model_name}')

        # Record the checkpoint path in a pickle file before evaluating.
        # presumably read by the OCR service to pick the model - confirm.
        with open('/home/ai/ai/doryplusplus/saved_model.pickle', 'wb') as pickle_file:
            pickle.dump(saved_model_name, pickle_file)

        # Strip the real extension: replacing the substring '.pt' turned
        # 'best_accuracy_so_far.pth' into 'best_accuracy_so_farh'.
        report_suffix = os.path.splitext(os.path.basename(saved_model_name))[0]
        main(max_documents=NO_OF_SAMPLES, name=report_suffix)

        # NOTE(review): only the first checkpoint is evaluated; remove this
        # break to run every collected checkpoint.
        break
442
443
444
445
446
447
448
449
450
451
452
453import pymongo
454import time
455import json
456import pickle
457import bson
458
459
class MongoDB:
    """
    MongoDB instances provide implementations of high-level mongoclient interfaces.
    """
    # Optional pre-built client. When set on the class (or a subclass) it is
    # used as-is and no connection is opened.
    client = None
    # Clients opened by this class, keyed by connection string, so that
    # repeated instantiation reuses a connection instead of opening a new one.
    _clients = {}

    def __init__(self, db_link=None, db_name=None):
        """
        Binds the instance to a database.
        :param db_link: connection string; a falsy value selects
                        "mongodb://localhost:27017/".
        :param db_name: name of the database to use (required).
        :raises AssertionError: if db_name is None.
        """
        # Validate before connecting so a bad call opens no connection.
        if db_name is None:
            raise AssertionError("db_name is required")

        if self.client is None:
            link = db_link if db_link else "mongodb://localhost:27017/"
            if link not in MongoDB._clients:
                MongoDB._clients[link] = pymongo.MongoClient(link)
            self.client = MongoDB._clients[link]

        self.database = self.client[db_name]

    def retrieve_record(self, table_name, key_dict):
        """
        Retrieves a single record filtered by the key_dict
        :param table_name: the table the record is stored in.
        :param key_dict: the filter_dict
        :return: record_dict
        """
        return self.database[table_name].find_one(key_dict)

    def put_record(self, table_name, record_dict):
        """
        Inserts a record on the given table.
        :param table_name: the table to insert into.
        :param record_dict: the record to be inserted.
        :return: id of the inserted record
        """
        return self.database[table_name].insert_one(record_dict).inserted_id

    def update_record(self, table_name, key_dict, record_dict):
        """
        Updates the first record matching key_dict by setting the fields
        given in record_dict; other fields of the record are left untouched.

        :param table_name: the table the record is stored in.
        :param key_dict: the filter_dict
        :param record_dict: the fields to set on the matched record.
        :return: None
        """
        self.database[table_name].update_one(key_dict, {'$set': record_dict})

    def get_bulk_records(self, table_name, key_dict):
        """
        Retrieves a bulk of records filtered by the key_dict.
        :param table_name: the table the records are stored in.
        :param key_dict: the filter_dict
        :return: the result of find() on the table (iterable of records)
        """
        return self.database[table_name].find(key_dict)

    def remove(self, table_name, key_dict):
        """
        Deletes a single record filtered by the key_dict.
        :param table_name: the table the record is stored in.
        :param key_dict: the filter_dict
        :return: True if a record was deleted else False
        """
        return bool(self.database[table_name].delete_one(key_dict).deleted_count)

    def sort(self, table_name, key_dict, fields):
        """
        Retrieves the records filtered by the key_dict in sorted order.
        :param table_name: the table the records are stored in.
        :param key_dict: the filter_dict
        :param fields: sort specification passed unchanged to sort()
        :return: the sorted find() result
        """
        return self.database[table_name].find(key_dict).sort(fields)