· 6 years ago · Apr 07, 2020, 02:34 PM
1import random
2import copy
3import hashlib
4import io
5import itertools
6import os
7import pickle
8import tempfile
9import time
10from abc import ABC, abstractmethod
11from datetime import datetime, timedelta
12from typing import Dict, List, Tuple
13
14import cv2
15import numpy as np
16import pause
17import pytesseract
18import requests
19from PIL import Image
20
21import invoiceparser.tesseract_parser as tp
22from business_objects import OCRMethod
23from invoiceparser.image_util import PIL_img_to_bytes, scale_image_to_filesize
24from logger_setup import getLogger
25from business_objects import Fragment, InvoiceDocument, YDirection
26
27from tempfile import TemporaryFile
28from utility import RenamingUnpickler
29
30logger = getLogger(__name__)
31
32
33
34"""
35====================================================================
36Base OCR
37====================================================================
38"""
39
# On-disk location of the pickled OCR result cache, stored next to this module.
CACHE_FILE_NAME = os.path.join(os.path.dirname(__file__), 'ocr_output_cache.pickle')
41
42
def _init_cache() -> Dict[Tuple[OCRMethod, str], List[Fragment]]:
    """Load the OCR output cache from disk, or return an empty dict if absent.

    Keys combine the OCR method with the image name + md5 of the image bytes;
    values are the fragment lists previously produced for that image.
    """
    # The original declared `global cache` here, but this function never
    # assigns to the module global — it only returns a value — so the
    # declaration was dead and has been removed.
    if not os.path.isfile(CACHE_FILE_NAME):
        return {}

    # RenamingUnpickler presumably remaps old module paths so stale pickles
    # survive refactors — TODO confirm against utility.RenamingUnpickler.
    with io.open(CACHE_FILE_NAME, 'rb') as f:
        return RenamingUnpickler(f).load()
51
def _save_cache():
    """Persist the in-memory OCR cache to CACHE_FILE_NAME as a pickle."""
    # `cache` is only read here (never rebound), so the original
    # `global cache` declaration was unnecessary and has been removed.
    with io.open(CACHE_FILE_NAME, 'wb') as f:
        pickle.dump(cache, f)
57
# Module-level OCR result cache, loaded once at import time and shared by
# every OCRBase subclass via get_fragment_list.
cache = _init_cache()
59
class OCRBase(ABC):
    """Abstract base for OCR engines with a shared, disk-backed result cache.

    Subclasses implement the actual OCR call (_get_fragment_list) and
    identify themselves via _get_method for cache keying.
    """

    # The original defined an empty `__init__(self): pass`, which is
    # redundant (object.__init__ suffices) and has been removed.

    def get_fragment_list(self, image_name: str, image_bytes: bytes, use_cache: bool = False) -> List[Fragment]:
        """Run OCR on an image, optionally consulting/updating the module cache.

        The cache key is (engine method, image name + md5 of the raw bytes),
        so a renamed or modified image misses the cache. Hits are deep-copied
        so callers cannot mutate the cached entries in place.
        """
        # No `global cache` needed: the dict is mutated, never rebound.
        md5hash = hashlib.md5(image_bytes).hexdigest()
        key = (self._get_method(), image_name + str(md5hash))

        if use_cache and key in cache:
            logger.debug('Found the image in the cache by key |%s|!', key)
            return copy.deepcopy(cache[key])

        fragments = self._get_fragment_list(image_name, image_bytes)

        # Only non-empty results are cached, so a transient OCR failure is
        # retried on the next call instead of being remembered forever.
        if use_cache and fragments:
            logger.debug('Adding %s to cache', '\n\t'.join(str(fragment) for fragment in fragments))
            cache[key] = fragments
            _save_cache()

        return fragments

    @abstractmethod
    def _get_fragment_list(self, image_name: str, image_bytes: bytes) -> List[Fragment]:
        """Perform the actual OCR call for this engine."""

    @abstractmethod
    def _get_method(self) -> OCRMethod:
        """Return the OCRMethod identifying this engine (used in cache keys)."""
92
93"""
94====================================================================
95Azure OCR
96====================================================================
97Pricing: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/
98API: https://westus.dev.cognitive.microsoft.com/docs/services/5adf991815e1060e6355ad44/operations/56f91f2e778daf14a499e1fc
99"""
100
101
class AzureOCR(OCRBase):
    """OCR via the Azure Cognitive Services v2.0 OCR endpoint.

    Requests are throttled to one every AZURE_PAUSE_TIME and images are
    downscaled before upload to fit the API's size and dimension limits.
    """

    # Minimum gap between consecutive requests, per the API contract.
    AZURE_PAUSE_TIME = timedelta(seconds=3)
    AZURE_OCR_URL = "https://westeurope.api.cognitive.microsoft.com/vision/v2.0/ocr"
    # SECURITY NOTE(review): hard-coded subscription key — should be moved to
    # configuration or an environment variable, not committed to source.
    AZURE_SUBSCRIPTION_KEY = "44f8dd20ad21451ead1d09c87cf36e9d"
    # API limits: request body up to 4 MB, largest dimension up to 4000 px.
    MAX_IMAGE_SIZE = 4 * 1024 * 1024
    MAX_DIMENSION = 4000

    def __init__(self):
        # Earliest moment the next request is allowed to be sent.
        self.next_azure_request_time = datetime.now()

    def _get_fragment_list(self, image_name: str, image_bytes: bytes) -> List[Fragment]:
        """Upload the (normalized) image to Azure OCR and parse the response.

        Raises requests.HTTPError on a non-200 reply.
        """
        image_bytes = AzureOCR.__normalize_image(image_bytes)
        logger.debug(len(image_bytes))

        logger.debug('Pausing until |%s|', self.next_azure_request_time)
        pause.until(self.next_azure_request_time)  # Need to throttle down as per API contract

        headers = {'Ocp-Apim-Subscription-Key': AzureOCR.AZURE_SUBSCRIPTION_KEY,
                   'Content-Type': 'application/octet-stream'}
        params = {'language': 'unk', 'detectOrientation': 'true'}
        response = requests.post(AzureOCR.AZURE_OCR_URL, headers=headers, params=params, data=image_bytes)
        if response.status_code != 200:
            # logger.warn is deprecated; logger.warning is the supported name.
            logger.warning('Bad response |%d|! Details:\n %s', response.status_code, response.json())
            response.raise_for_status()

        self.next_azure_request_time = datetime.now() + AzureOCR.AZURE_PAUSE_TIME

        return AzureOCR.__get_fragments_from_json(response.json())

    def _get_method(self) -> OCRMethod:
        """Identify this engine for cache keying."""
        return OCRMethod.AZURE

    @staticmethod
    def __wordbox_to_fragment(wordbox) -> Fragment:
        """Collapse one Azure 'line' object (bounding box + words) into a Fragment.

        Consecutive words where one ends and the next starts with a digit are
        joined without a space, repairing numbers the OCR split apart.
        """
        # boundingBox is 'x,y,width,height' — parse it in one pass instead of
        # re-splitting the same string for every coordinate.
        x, y, width, height = (int(part) for part in wordbox['boundingBox'].split(','))

        text = ' '  # seeded with a space so text[-1] is safe for the first word
        for word in [word['text'] for word in wordbox['words']]:
            # handling the case when a number's digits got separated
            if text[-1].isdigit() and word[0].isdigit():
                text = text + word
            else:
                text = text + ' ' + word
        text = text.strip()

        return Fragment(text, x, y, width, height)

    @staticmethod
    def __get_fragments_from_json(json_object) -> List[Fragment]:
        """Flatten the regions → lines structure of the API reply into Fragments."""
        logger.debug(json_object)
        wordboxes = [region['lines'] for region in json_object['regions']]
        wordboxes = list(itertools.chain.from_iterable(wordboxes))
        return [AzureOCR.__wordbox_to_fragment(wordbox) for wordbox in wordboxes]

    @staticmethod
    def __normalize_image(image_bytes: bytes) -> bytes:
        """Downscale the image so it fits Azure's byte-size and pixel limits.

        Returns the original bytes untouched when they already comply.
        """
        img = Image.open(io.BytesIO(image_bytes))

        # Ratio of the byte budget to the current encoded size.
        # NOTE(review): this byte ratio is applied to *linear* dimensions
        # below, which shrinks the pixel area by factor² — more aggressive
        # than strictly required, but deliberate (see comment below).
        sizefactor = float(AzureOCR.MAX_IMAGE_SIZE) / len(image_bytes)

        if img.height > img.width:
            pixelfactor = AzureOCR.MAX_DIMENSION / img.height
        else:
            pixelfactor = AzureOCR.MAX_DIMENSION / img.width

        if pixelfactor < 1.0 or sizefactor < 1.0:
            logger.info('Image is too large! Size: |%s, %d bytes| Dimensions: (%d, %d)', img.size, len(image_bytes), img.width, img.height)

            factor = min(sizefactor, pixelfactor)
            factor = factor * 0.9
            # This is hacky, but the size of the output image from PIL is not
            # deterministic. So if we need to resize, let's be aggressive about it.

            logger.info('Reducing image to |%f| of the original size!', factor)
            img = img.resize((int(img.width * factor), int(img.height * factor)))
            logger.info('New size of the image is |%s|', img.size)

            return PIL_img_to_bytes(img)
        else:
            return image_bytes
192
193"""
194====================================================================
195Tesseract OCR
196====================================================================
197"""
198
class TesseractOCR(OCRBase):
    """OCR via a local Tesseract installation, parsed from hOCR output."""

    def _get_fragment_list(self, image_name: str, image_bytes: bytes) -> List[Fragment]:
        """Binarize the image (Otsu threshold) and OCR it with Tesseract.

        The hOCR output is walked area → paragraph → line → word and each
        word becomes one Fragment.
        """
        # BUGFIX: cv2.imdecode's second argument is an imread *flag*, not a
        # color-conversion code. The original passed cv2.COLOR_BGR2GRAY (== 6,
        # i.e. IMREAD_ANYDEPTH | IMREAD_ANYCOLOR), which can yield a
        # single-channel image and break the BGR2GRAY conversion below.
        # IMREAD_COLOR guarantees a 3-channel BGR image.
        image = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Otsu picks the binarization threshold automatically.
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

        # pytesseract only accepts a file path, so round-trip through a
        # temporary PNG that is cleaned up automatically.
        with tempfile.TemporaryDirectory() as tmpdir:
            imgfname = os.path.join(tmpdir, "{}.png".format(os.getpid()))
            cv2.imwrite(imgfname, gray)
            hocr_str = pytesseract.pytesseract.run_and_get_output(imgfname, lang=None, config="hocr", extension='hocr')
            document = tp.HOCRDocument(hocr_str)

        page = document.pages[0]

        fragments = []
        for area in page.areas:
            for paragraph in area.paragraphs:
                for line in paragraph.lines:
                    for word in line.words:
                        # NOTE(review): the y coordinate is flipped against a
                        # hard-coded 4000 px height (cf. AzureOCR.MAX_DIMENSION),
                        # which looks wrong for images of any other height —
                        # TODO confirm against YDirection handling downstream.
                        fragments.append(
                            Fragment(
                                word.ocr_text,
                                word.coordinates[0],
                                4000 - word.coordinates[1],
                                word.coordinates[2] - word.coordinates[0],
                                word.coordinates[3] - word.coordinates[1]
                            )
                        )
        return fragments

    def _get_method(self) -> OCRMethod:
        """Identify this engine for cache keying."""
        return OCRMethod.TESSERACT
236
237"""
238====================================================================
239OCR.SPACE OCR
240====================================================================
241"""
242
class OCRSpaceOCR(OCRBase):
    """OCR via the ocr.space REST API, trying several API keys in random order."""

    # wayasam@gmail.com (paid), wayasam@gmail.com (free), vszm5@hotmail.com, vszm@inf.elte.hu Get more keys if needed
    # SECURITY NOTE(review): hard-coded API keys — should live in config/env.
    API_KEYS = ['PKMXB9465888A', 'f0a34a178b88957', '62f395656388957', '96b2d9106788957']
    # Images are downscaled to at most 1 MB before upload.
    MAX_SIZE_BYTES = 1024 * 1024 * 1

    def _get_fragment_list(self, image_name: str, image_bytes: bytes) -> List[Fragment]:
        """Upload the image and parse the text overlay; return [] if all keys fail.

        Keys are tried in random order to spread load across accounts; the
        first reply with HTTP 200 and OCRExitCode < 3 wins.
        """
        image_bytes = scale_image_to_filesize(image_bytes, OCRSpaceOCR.MAX_SIZE_BYTES)

        # The API needs a file upload whose suffix matches the image format.
        with TemporaryFile(suffix='.' + image_name.split('.')[-1]) as fp:
            fp.write(image_bytes)
            fp.flush()

            for api_key in random.sample(OCRSpaceOCR.API_KEYS, len(OCRSpaceOCR.API_KEYS)):
                fp.seek(0)  # rewind so every attempt uploads the whole file
                payload = { 'isOverlayRequired': True,
                            'apikey': api_key,
                            'detectOrientation': True,
                            'language': 'hun'
                          }

                with requests.Session() as session:
                    response = session.post('https://api.ocr.space/parse/image',
                                            files={'filename': fp},
                                            data=payload,
                                            headers={'Connection':'close'})

                json = response.json()

                if response.status_code == 200 and json["OCRExitCode"] < 3:
                    logger.info('Returning stuff: %s', json)
                    return self.__get_fragments_from_json(json)
                else:
                    # logger.warn is deprecated; logger.warning is the
                    # supported name.
                    logger.warning('Error occured: |%s|', response.text)

                logger.debug('Failed to get correct response with key |%s|, Code: |%d|, Response: |%s|',\
                    api_key, response.status_code, response.json())

        logger.error('Could not get response with any of the api keys. Image Name: |%s|, bytes length: |%d|',\
            image_name, len(image_bytes))
        return []

    def _get_method(self) -> OCRMethod:
        """Identify this engine for cache keying."""
        return OCRMethod.OCRSPACE

    @staticmethod
    def __line_to_fragment(line) -> Fragment:
        """Convert one overlay line into a Fragment spanning first → last word."""
        x = int(line['Words'][0]['Left'])
        y = int(line['Words'][0]['Top'])
        # Span from the first word's top-left corner to the last word's far
        # corner so the fragment covers the whole line of text.
        width = int(line['Words'][-1]['Left'] + line['Words'][-1]['Width'] - x)
        height = int(line['Words'][-1]['Top'] + line['Words'][-1]['Height'] - y)
        text = line['LineText'].strip()

        return Fragment(text, x, y, width, height)

    @staticmethod
    def __get_fragments_from_json(json_object) -> List[Fragment]:
        """Extract Fragments from a parse response; [] on malformed replies."""
        try:
            lines = json_object['ParsedResults'][0]['TextOverlay']['Lines']
            return [OCRSpaceOCR.__line_to_fragment(line) for line in lines]
        except (KeyError, IndexError, TypeError):
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt and SystemExit.
            logger.error('Could not parse json! |%s|', json_object)
            return []
312
def get_ocr_engine(method: OCRMethod) -> OCRBase:
    """Return a fresh OCR engine instance for *method*.

    Tesseract is the fallback for any method other than OCRSPACE and AZURE.
    """
    if method == OCRMethod.OCRSPACE:
        return OCRSpaceOCR()
    if method == OCRMethod.AZURE:
        return AzureOCR()
    return TesseractOCR()