# Paste 8tpF4122
#
# · 6 years ago · Dec 11, 2019, 02:30 PM
import collections
import copy
import csv
import json
import os
import threading
import time
import traceback

import requests
from fuzzywuzzy import fuzz

from db_utils import MongoDB
10
# NOTE(review): this flag is not read anywhere in the visible file; confirm
# whether it is still needed.
RESET_CACHE = False

# SECURITY NOTE(review): this connection string embeds a username and password
# directly in the source. Rotate the credential and load the link from the
# environment or a secrets store instead of committing it.
SIGMOID_MONGODB_LINK = 'mongodb://sigmoid:7cu6UoDSs2bHPGgl@13.233.203.170:52138/'
# Dataset ids processed by main(); one CSV report is written per id.
DATASET_IDS = [806, 130]
# Maximum number of documents fetched (and evaluated) per dataset.
NO_OF_SAMPLES = 400
# When True, a field that is empty in both the ground truth and the prediction
# is scored as "don't care" (DC) and left out of that field's total count.
STRICT_MODE = True
17
18
class Maps:
    """Namespace for lookup tables shared by the evaluation functions."""

    # Maps a dataset label name to the key under which that field appears in
    # the prediction dict. main() rebuilds it for every dataset.
    ENDPOINT_OUTPUT_MAP = dict()
21
22
# Single-pass equivalents of the original chained ``str.replace`` calls.
# "Slashed" form: every separator becomes '/', and the digit zero becomes the
# letter 'O' so that a zero/O confusion in the OCR does not prevent a match.
_SLASHED_TABLE = str.maketrans({
    ' ': '/', '\n': '/', ',': '/', '-': '/', '.': '/', '0': 'O',
})
# "Compact" form: whitespace, commas and dots are dropped entirely, '-' still
# becomes '/', and zero still becomes 'O'.
_COMPACT_TABLE = str.maketrans({
    ' ': None, '\n': None, ',': None, '.': None, '-': '/', '0': 'O',
})


def _normalise(text, table):
    """Upper-case *text* and apply one of the translation tables above."""
    return text.upper().translate(table)


def modify_prediction_dict(gt, prediction_dict):
    """Turn a raw OCR response into per-field predictions.

    A ground-truth value counts as predicted when its normalised form occurs
    as a substring of the normalised OCR text, in either the slashed or the
    compact normalisation.

    :param gt: mapping of field key -> ground-truth string.
    :param prediction_dict: response dict of the OCR endpoint, or ``None``
        when the request failed. Expected to carry ``ocrs[0]['text']`` and
        ``timeElapsed`` unless it contains an ``error`` key.
    :return: ``None`` when there is no usable response (``None`` input or an
        ``error`` key); otherwise a dict with ``timeElapsed``, ``DORY_OCR``
        and, for every key of *gt*, either the ground-truth value itself
        (found) or the string ``"NO_MATCH"``.
    """
    # A failed request is reported as None by the caller; treat it exactly
    # like an explicit error response instead of crashing on ``.keys()``.
    if prediction_dict is None or "error" in prediction_dict:
        return None

    ocr_text = prediction_dict['ocrs'][0]['text']
    new_preds = {
        'timeElapsed': prediction_dict['timeElapsed'],
        'DORY_OCR': ocr_text,
    }

    # The OCR text does not change between fields, so normalise it once.
    ocr_slashed = _normalise(ocr_text, _SLASHED_TABLE)
    ocr_compact = _normalise(ocr_text, _COMPACT_TABLE)

    for key, value in gt.items():
        found = (_normalise(value, _SLASHED_TABLE) in ocr_slashed
                 or _normalise(value, _COMPACT_TABLE) in ocr_compact)
        new_preds[key] = value if found else "NO_MATCH"

    return new_preds
86
87
def post_request(url, endpoint="http://api.prv:52106/ai/vision/signzy-ocr/image", timeout=None):
    """POST a document URL to the OCR endpoint and return the unwrapped result.

    :param url: URL of the document image to run OCR on.
    :param endpoint: OCR service URL; defaults to the original hard-coded one.
    :param timeout: passed straight to ``requests.post``. ``None`` (the
        default) waits indefinitely, which is the previous behaviour.
    :return: the response payload with a ``timeElapsed`` entry (seconds)
        added. The payload is ``body['result']`` when present, else
        ``body['response']['result']``, else the whole body. ``None`` is
        returned when anything fails; the traceback is printed.
    """
    request_dict = {
        "type": "documentOcr",
        "urls": [url],
        "url": url,
        "instance": {
            "id": "random",
            "callbackUrl": ""
        }}

    started_at = time.time()
    try:
        response = requests.post(url=endpoint,
                                 data=json.dumps(request_dict),
                                 headers={
                                     'Content-Type': "application/json",
                                     'cache-control': "no-cache",
                                     'Postman-Token': "510186ab-3336-447d-836b-d9e43eebf3b8"
                                 },
                                 timeout=timeout)
        body = json.loads(response.text)
        time_elapsed = time.time() - started_at

        # Unwrap the payload: try the two known envelopes, then fall back to
        # the body itself (e.g. an error response).
        try:
            output = body['result']
        except KeyError:
            try:
                output = body['response']['result']
            except KeyError:
                output = body

        output["timeElapsed"] = time_elapsed
        return output
    except Exception:
        # Best effort: one failed document must not abort the whole run, so
        # log the failure and report it to the caller as None.
        traceback.print_exc()

    return None
129
130
def evaluation_logic(key, gt_string, pred_string):
    """Decide whether a prediction matches its ground truth for one field.

    The ground truth is canonicalised first, depending on *key*:

    * ``B-GEN``: anything containing an ``f`` becomes ``FEMALE``, otherwise
      anything containing an ``m`` becomes ``MALE``.
    * ``B-DOB`` / ``B-DOI`` / ``B-DOE``: ``.``, ``-`` and ``,`` become ``/``.
    * every key: colons are removed.

    Both strings are then upper-cased and stripped of spaces and newlines.

    NOTE(review): the key-specific canonicalisation is applied to the ground
    truth only, never to the prediction. Confirm that predictions already
    arrive in canonical form.

    :param key: field label name (e.g. ``"B-GEN"``).
    :param gt_string: ground-truth value.
    :param pred_string: predicted value.
    :return: True when ``fuzz.ratio`` of the two cleaned strings is 100.
    """
    if key == "B-GEN":
        lowered = gt_string.lower()
        # 'f' is tested first because "female" also contains an 'm'.
        if 'f' in lowered:
            gt_string = "FEMALE"
        elif 'm' in lowered:
            gt_string = "MALE"

    if key in ("B-DOB", "B-DOI", "B-DOE"):
        for separator in (".", "-", ","):
            gt_string = gt_string.replace(separator, "/")

    gt_string = gt_string.replace(":", "")

    def squash(text):
        return text.upper().replace(' ', "").replace('\n', "")

    return fuzz.ratio(squash(gt_string), squash(pred_string)) == 100
145
146
def evaluate(KEY_DICT, rows):
    """Run OCR on every row's URL and score each field against ground truth.

    :param KEY_DICT: dict whose keys are the label names to evaluate; each
        must be a column of every row and a key of
        ``Maps.ENDPOINT_OUTPUT_MAP``.
    :param rows: list of dicts holding at least ``URL`` and ``OCR`` plus one
        entry per label in *KEY_DICT*.
    :return: ``(EVALUATED_KEY_DICT, evaluated_rows)``. ``evaluated_rows`` has
        one dict per successfully processed row, with ``<field>_PRED``,
        ``<field>_GT``, ``<field>_SCORE`` and ``<field>_PASS`` entries.
        ``EVALUATED_KEY_DICT`` maps every column name seen to ``""``. Rows
        whose request failed or returned an error are left out.
    :raises AssertionError: if a row lacks ``URL``/``OCR`` or a field is
        missing from the prediction.
    """
    prediction_rows = dict()

    def thread_fn(url):
        prediction_rows[url] = post_request(url=url)

    # Fire the requests from threads (the work is I/O bound). Once more than
    # 16 are in flight, wait for the whole batch before starting more.
    threads = list()
    for index, row in enumerate(rows, 0):
        thread = threading.Thread(target=thread_fn, args=(row["URL"], ))
        thread.start()
        threads.append(thread)

        if len(threads) > 16:
            print("Evaluating {0} Of {1} Documents".format(index, len(rows)))
            for running in threads:
                running.join()
            threads = list()

    # Wait for whatever is left of the last, partial batch.
    for running in threads:
        running.join()

    evaluated_rows = list()
    EVALUATED_KEY_DICT = dict()

    for row in rows:
        if "URL" not in row:
            raise AssertionError("row has no 'URL' column")
        if "OCR" not in row:
            raise AssertionError("row has no 'OCR' column")

        accuracy_dict = {"URL": row["URL"],
                         "OCR": row["OCR"]}

        ground_truth_dict = {Maps.ENDPOINT_OUTPUT_MAP[key]: row[key]
                             for key in KEY_DICT}

        raw_prediction = prediction_rows.get(row["URL"])
        if raw_prediction is None:
            # The request failed (post_request returned None); skip the row
            # the same way error responses are skipped below.
            continue

        prediction_dict = modify_prediction_dict(
            gt=ground_truth_dict, prediction_dict=raw_prediction)
        if prediction_dict is None:
            continue

        accuracy_dict["OCR_DORY"] = prediction_dict["DORY_OCR"]
        accuracy_dict["RESPONSE_TIME"] = "{:.2f}".format(
            prediction_dict["timeElapsed"])

        for key, gt_value in ground_truth_dict.items():
            if key not in prediction_dict:
                raise AssertionError("Key: {0} not in API OUTPUT".format(key))
            pred_value = prediction_dict[key]

            TP = FP = FN = DC = False
            if len(gt_value) == 0 and len(pred_value) == 0 and STRICT_MODE:
                # Nothing to find and nothing predicted: "don't care".
                DC = True
            elif evaluation_logic(key=Maps.ENDPOINT_OUTPUT_MAP[key],
                                  gt_string=gt_value,
                                  pred_string=pred_value):
                TP = True
            elif len(gt_value) and len(pred_value) == 0:
                FN = True
            else:
                FP = True

            accuracy_dict[key + "_PRED"] = pred_value
            accuracy_dict[key + "_GT"] = gt_value
            accuracy_dict[key + "_SCORE"] = {"FP": int(FP),
                                             "TP": int(TP),
                                             "FN": int(FN),
                                             "DC": int(DC)}
            accuracy_dict[key + "_PASS"] = 1 if TP or DC else 0

        # Record every column name so the CSV writer knows the header.
        for column in accuracy_dict:
            EVALUATED_KEY_DICT[column] = ""

        evaluated_rows.append(accuracy_dict)

    return EVALUATED_KEY_DICT, evaluated_rows
232
233
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


def generate_csv(KEY_DICT, dataset_id, rows, name):
    """Write the per-document results and per-field metrics to a CSV report.

    The file is ``./out/<dataset_id><name>.csv`` (just ``<dataset_id>.csv``
    when *name* is None). It holds a header row, one row per document, three
    empty rows, then a metrics table (TP, FP, FN, TotalCount, PRECISION,
    RECALL, F1 SCORE) with one column per field.

    :param KEY_DICT: dict whose keys are the column names; columns are
        written in reverse key order.
    :param dataset_id: dataset identifier used in the file name.
    :param rows: list of dicts with one entry per column. ``*_SCORE`` columns
        hold dicts with ``TP``/``FP``/``FN``/``DC`` counts. The rows are not
        modified.
    :param name: optional suffix appended to the file name.
    :return: None
    """
    ROW_ORDER = list(KEY_DICT.keys())
    ROW_ORDER.reverse()

    score_suffix = "_SCORE"
    # field name -> [TP, FP, FN, DC, TOTAL]
    metrics = dict()

    row_list = [ROW_ORDER]
    for row in rows:
        r = list()

        for label in ROW_ORDER:
            cell = row[label]

            # Only the generated "<field>_SCORE" columns carry score dicts.
            # Testing the suffix (not mere containment) keeps fields whose
            # own name contains "_SCORE" working.
            if label.endswith(score_suffix):
                metric_label = label[:-len(score_suffix)]
                DC = cell["DC"]

                totals = metrics.setdefault(metric_label, [0, 0, 0, 0, 0])
                totals[0] += cell["TP"]
                totals[1] += cell["FP"]
                totals[2] += cell["FN"]
                totals[3] += DC
                # "Don't care" samples are excluded from the total count,
                # except when strict mode is switched off.
                totals[4] += 1 - (0 if STRICT_MODE is False else DC)

                # The report omits the DC entry. Build a trimmed copy so the
                # caller's rows stay intact.
                cell = {k: v for k, v in cell.items() if k != "DC"}

            r.append(cell)

        row_list.append(r)

    accuracy_metrics = [["Metric"], ["TP"], ["FP"], ["FN"], [
        "TotalCount"], ["PRECISION"], ["RECALL"], ["F1 SCORE"]]
    for key, (TP, FP, FN, _dc, TOTAL) in metrics.items():
        PRECISION = _safe_ratio(TP, TP + FP)
        RECALL = _safe_ratio(TP, TP + FN)
        F1_SCORE = 2 * _safe_ratio(PRECISION * RECALL, PRECISION + RECALL)

        for line, value in zip(accuracy_metrics,
                               (key, TP, FP, FN, TOTAL,
                                PRECISION, RECALL, F1_SCORE)):
            line.append(value)

    row_list = row_list + [[], [], []] + accuracy_metrics

    # Make sure the output directory exists before opening the file.
    os.makedirs('./out', exist_ok=True)

    with open('./out/{0}.csv'.format(dataset_id if name is None else str(dataset_id) + name), 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerows(row_list)
316
317
def get_documents(_dataset_id):
    """Load a dataset's documents from MongoDB and build ground-truth rows.

    Each row has ``URL``, ``OCR`` and one entry per dataset label. A label's
    value is the space-joined OCR text of its annotated spans in that
    document, or ``""`` when the document has no annotation for it.

    :param _dataset_id: value of ``datasetId`` to look up in ``app_dataset``.
    :return: ``(KEY_DICT, rows)``. ``KEY_DICT`` maps every column name (label
        names plus ``URL`` and ``OCR``) to ``""``; ``rows`` is the list of
        row dicts, at most ``NO_OF_SAMPLES`` long.
    :raises AssertionError: if no dataset with that id exists. All known
        datasets are printed first to help pick a valid id.
    """
    mongo_db = MongoDB(db_name="sigmoid", db_link=SIGMOID_MONGODB_LINK)

    # Keep the last matching dataset record; None means "no match".
    field = None
    for field in mongo_db.get_bulk_records(table_name="app_dataset", key_dict={
        "datasetId": _dataset_id
    }):
        print(field)

    if field is None:
        for known in mongo_db.get_bulk_records(table_name="app_dataset", key_dict={}):
            print(known)
        raise AssertionError("DatasetName not Found")

    dataset_id = field['datasetId']

    documents = list(mongo_db.database["app_document"].find({
        "datasetId": dataset_id
    }).limit(NO_OF_SAMPLES))

    KEY_DICT = dict()

    labels = dict()
    for fields in mongo_db.get_bulk_records(table_name="app_label", key_dict={
        "datasetId": dataset_id
    }):
        labels[fields['labelId']] = {
            "labelType": fields['labelType'],
            "labelName": fields['name']
        }

        KEY_DICT[fields['name']] = ""

    # documentId -> labelId -> list of annotated character spans.
    # NOTE(review): the query has no filter, so every annotation in the
    # collection is read; consider narrowing it if the schema allows.
    annotations = collections.defaultdict(dict)
    for fields in mongo_db.get_bulk_records(table_name="app_annotation", key_dict={
    }):
        spans = annotations[fields["documentId"]].setdefault(fields["labelId"], list())
        spans.append({
            "startOffset": fields["startOffset"],
            "endOffset": fields["endOffset"]
        })

    KEY_DICT["URL"] = ""
    KEY_DICT["OCR"] = ""

    rows = list()
    for document in documents:
        annotation = annotations[document["documentId"]]
        ocr_text = document['OCR']

        # Start from the empty template so unannotated labels stay "".
        row = copy.deepcopy(KEY_DICT)
        row["URL"] = document['persistUrl']
        row["OCR"] = ocr_text

        for label_id, value in labels.items():
            try:
                label_annotation = annotation[label_id]
            except KeyError:
                continue

            # Cut every annotated span out of the OCR text and join them.
            row[value["labelName"]] = ' '.join(
                ocr_text[span["startOffset"]: span["endOffset"]]
                for span in label_annotation
            )

        rows.append(row)

    print("\nkeys for ENDPOINT_OUTPUT_MAP: {0}".format(KEY_DICT.keys()))

    return KEY_DICT, rows
400
401
def main(max_documents=None, name=None):
    """Evaluate every dataset in ``DATASET_IDS`` and write one CSV per dataset.

    :param max_documents: optional cap on the number of rows evaluated per
        dataset; ``None`` evaluates every fetched row.
    :param name: optional suffix for the report file name.
    :raises AssertionError: if *max_documents* is given but is not an int.
    """
    if max_documents is not None and not isinstance(max_documents, int):
        raise AssertionError("max_documents must be an int")

    for dataset_id in DATASET_IDS:
        Maps.ENDPOINT_OUTPUT_MAP = dict()
        KEY_DICT, rows = get_documents(dataset_id)

        # URL and OCR are bookkeeping columns, not fields to evaluate.
        KEY_DICT.pop('URL', None)
        KEY_DICT.pop('OCR', None)

        # The prediction keys are the label names, so map each to itself.
        Maps.ENDPOINT_OUTPUT_MAP.update((key, key) for key in KEY_DICT)

        if max_documents is not None:
            rows = rows[:max_documents]

        EVALUATED_KEY_DICT, evaluated_rows = evaluate(KEY_DICT, rows)
        generate_csv(EVALUATED_KEY_DICT, dataset_id, evaluated_rows, name)
421
422
if __name__ == "__main__":
    import os
    import pickle

    # Collect every file below the trail2 output directory.
    saved_model_names = [
        os.path.join(root, filename)
        for root, _dirs, filenames in os.walk("/home/ai/ai/preciousdory/outs/trail2")
        for filename in filenames
    ]

    for saved_model_name in ['/home/ai/ai/preciousdory/outs/trail1/best_accuracy_so_far.pth'] + saved_model_names:
        print(f'Running Model: {saved_model_name}')

        # Record the selected model path on disk.
        # NOTE(review): presumably the OCR service reads this file to pick
        # the model; confirm against the service.
        with open('/home/ai/ai/doryplusplus/saved_model.pickle', 'wb') as pickle_file:
            pickle.dump(saved_model_name, pickle_file)

        # Use the file name without its extension as the report suffix.
        # splitext handles both ".pt" and ".pth"; the old replace('.pt', '')
        # turned "best_accuracy_so_far.pth" into "best_accuracy_so_farh".
        report_suffix = os.path.splitext(os.path.basename(saved_model_name))[0]
        main(max_documents=NO_OF_SAMPLES, name=report_suffix)

        # Only the first model is evaluated; remove the break to run them all.
        break
442
443
444
445
446
447
448
449
450
451
452
453import pymongo
454import time
455import json
456import pickle
457import bson
458
459
class MongoDB:
    """
    Convenience wrapper around a single database of a pymongo MongoClient.
    """
    client = None

    def __init__(self, db_link=None, db_name=None):
        """
        Connects to the server and selects the database to work on.
        :param db_link: MongoDB connection URI; a falsy value selects
                        "mongodb://localhost:27017/".
        :param db_name: name of the database to use, must not be None.
        """
        if self.client is None:
            uri = db_link if db_link else "mongodb://localhost:27017/"
            self.client = pymongo.MongoClient(uri)

        assert db_name is not None
        self.database = self.client[db_name]

    def retrieve_record(self, table_name, key_dict):
        """
        Fetches one record matching the filter.
        :param table_name: collection to query.
        :param key_dict: the filter_dict
        :return: whatever find_one gives back for the filter
        """
        collection = self.database[table_name]
        return collection.find_one(key_dict)

    def put_record(self, table_name, record_dict):
        """
        Inserts one record into the collection.
        :param table_name: collection to insert into.
        :param record_dict: the record to be inserted.
        :return: inserted_id of the new record
        """
        outcome = self.database[table_name].insert_one(record_dict)
        return outcome.inserted_id

    def update_record(self, table_name, key_dict, record_dict):
        """
        Applies record_dict as a '$set' update to one record matching the
        filter.

        :param table_name: collection to update.
        :param key_dict: the filter_dict
        :param record_dict: the field values to set.
        :return: None
        """
        collection = self.database[table_name]
        collection.update_one(key_dict, {'$set': record_dict})

    def get_bulk_records(self, table_name, key_dict):
        """
        Queries all records matching the filter.
        :param table_name: collection to query.
        :param key_dict: the filter_dict
        :return: the result of find() for the filter
        """
        collection = self.database[table_name]
        return collection.find(key_dict)

    def remove(self, table_name, key_dict):
        """
        Deletes one record matching the filter.
        :param table_name: collection to delete from.
        :param key_dict: the filter_dict
        :return: True if record is deleted else False
        """
        outcome = self.database[table_name].delete_one(key_dict)
        return bool(outcome.deleted_count)

    def sort(self, table_name, key_dict, fields):
        """
        Queries records matching the filter and sorts the result.
        :param table_name: collection to query.
        :param key_dict: the filter_dict
        :param fields: sort specification handed to sort().
        :return: the sorted result of find() for the filter
        """
        matches = self.database[table_name].find(key_dict)
        return matches.sort(fields)