# Pasted Nov 16, 2018, 02:20 PM (about 7 years ago)
1import sys
2import os
3from bs4 import BeautifulSoup
4from plot import plot
5import math
6import re
7import numpy as np
8import itertools
9import helpers
10import glob
# Never abbreviate printed NumPy arrays with "..."; always print every element.
np.set_printoptions(threshold=np.inf)
# Grab this much extra space around tables. The value is added on every side
# of an extract's bounding box, in the same units as the bbox coordinates.
padding = 20
14
'''
Tesseract hierarchy:

div.ocr_page
    div.ocr_carea
        p.ocr_par
            span.ocr_line
                span.ocrx_word
'''
24
'''
table definition:
    word separation index >= document median + 1 std
    word area index <= document median - 1 std
    never one line

text block:
    word separation index < document median + 1 std
    word area index within document median +/- (1 std / 2)
    never one line
    on the second pass, its width must be within 2 sigma of the median text block width

caption:
    best: starts with table|figure|fig|map followed by an optional period and a number on a dedicated line
    good: starts with table|figure|fig|map followed by an optional period and a number on a line with other text
    ok: starts with some words followed by a number in a text area with an average text height smaller than the average of other text areas on the page

'''
43
44# Determine how tabley a given area is by comparing its attributes to those of the entire document
45# Input is a page of areas, output is the same page, but with a 'type' and 'table_score' assigned
46# to each area
def classify_areas(page, doc_stats):
    """Assign a ``type`` and a ``table_score`` to every area of a page.

    Each area is compared with document-wide statistics to decide whether it
    looks like a table, a separator line, a text block, a caption, a
    header/footer ("decoration") or something else ("other").

    Args:
        page: dict with an ``'areas'`` list. Each area is a dict carrying the
            keys written by ``area_summary`` (``gaps``, ``lines``, ``words``,
            ``line_heights``, ``area``, ``word_*`` indexes, ``soup`` and the
            bounding box). The areas are modified in place.
        doc_stats: dict of document statistics; the keys read here are
            ``word_separation_median``, ``line_height_avg``,
            ``word_separation_index_median``/``_std``,
            ``word_area_index_median``/``_std``, ``word_height_avg`` and
            ``word_height_avg_std``.

    Returns:
        The same ``page`` object, for convenience.
    """
    areas = page['areas']
    if not areas:
        return page

    caption_start = re.compile(
        r'^(table|figure|fig|map)(\.)? \w{1,5}(\S)?(\w{1,5})?(\.)?',
        flags=re.IGNORECASE | re.MULTILINE)

    # Page extremes, used to spot single-line headers and footers.
    page_top = min(area['y1'] for area in areas)
    page_bottom = max(area['y2'] for area in areas)

    # Document-relative thresholds.
    separation_cutoff = (doc_stats['word_separation_index_median']
                         + doc_stats['word_separation_index_std'])
    coverage_median = doc_stats['word_area_index_median']
    coverage_std = doc_stats['word_area_index_std']
    sparse_cutoff = coverage_median - coverage_std
    half_std = coverage_std / float(2)
    small_text_cutoff = (doc_stats['word_height_avg']
                         - (doc_stats['word_height_avg_std'] / 4))

    for area in areas:
        # Keep only whitespace gaps wider than the median gap between words.
        area['gaps'] = [gap for gap in area['gaps']
                        if gap > doc_stats['word_separation_median']]

        # The table_score tracks how "table-y" an area is. Every remaining
        # gap is worth four points.
        area['table_score'] = 4 * len(area['gaps'])

        separation = area['word_separation_index']
        coverage = area['word_area_index']
        line_count = area['lines']

        # An area without lines has no mean line height; NaN makes the
        # "giant blank area" comparison below evaluate to False.
        heights = area['line_heights']
        mean_line_height = np.nanmean(heights) if len(heights) else float('nan')

        if (mean_line_height > doc_stats['line_height_avg'] + 100
                and area['area'] > 250000):
            # Giant blank areas are *probably* tables.
            area['type'] = 'table'
            area['table_score'] += 10
        elif (line_count == 1 and area['words'] == 0 and separation == 0
                and area['word_height_index'] == 0
                and area['word_height_avg'] == 0):
            # Separator line: one line, no words, all word indexes zero.
            area['type'] = 'line'
        elif (separation >= separation_cutoff and coverage <= sparse_cutoff
                and line_count > 1):
            # Widely separated words covering little of the area.
            area['type'] = 'table'
        elif (separation < separation_cutoff
                and coverage_median - half_std < coverage < coverage_median + half_std
                and line_count > 1):
            area['type'] = 'text block'
        elif line_count == 1 and (area['y1'] == page_top
                                  or area['y2'] == page_bottom):
            # Probably a header or footer.
            area['type'] = 'decoration'
        else:
            area['type'] = 'other'

        # Tally the individual table-like attributes (one point each).
        if separation >= separation_cutoff:
            area['table_score'] += 1
        if coverage <= sparse_cutoff:
            area['table_score'] += 1
        if line_count > 1:
            area['table_score'] += 1

    # Lines can be breaks between paragraphs or divider lines in tables.
    separators = [area for area in areas if area['type'] == 'line']

    for area in areas:
        # Any non-line area crossed by a separator line is treated as a
        # table; every crossing separator adds one point.
        if area['type'] != 'line':
            crossings = sum(1 for rule in separators
                            if helpers.rectangles_intersect(area, rule))
            if crossings:
                area['type'] = 'table'
                area['table_score'] += crossings

        # Short text blocks set in small type are captions, not body text.
        if (area['type'] == 'text block'
                and area['word_height_avg'] < small_text_cutoff
                and area['lines'] < 12):
            area['type'] = 'caption'

        # First OCR line of *this* area. It is reset for every area so that
        # an area without lines can never inherit the previous area's text.
        first_line = ''
        ocr_lines = area['soup'].find_all('span', 'ocr_line')
        if len(ocr_lines):
            # NOTE(review): replace(' ', ' ') is a no-op as written; it may
            # have been a double-space collapse originally - confirm.
            first_line = (ocr_lines[0].getText().strip()
                          .replace('\n', ' ').replace(' ', ' ').lower())

        if (area['type'] in ('text block', 'other')
                and caption_start.match(first_line)):
            area['type'] = 'caption'

    # Anything that collected enough points is a table after all.
    for area in areas:
        if area['type'] != 'table' and area['table_score'] > 10:
            area['type'] = 'table'

    return page
149
150# Summarize the area stats of a given document
def summarize_document(area_stats):
    """Summarize the per-area statistics of a whole document.

    Word-based statistics only use areas that contain at least one word and
    more than one line; line-height statistics use the lines of every area.

    Args:
        area_stats: list of area dicts carrying ``words``, ``lines``,
            ``word_distances``, ``word_separation_index``,
            ``word_height_index``, ``word_area_index``, ``word_height_avg``
            and ``line_heights``.

    Returns:
        dict with ``<name>_mean``, ``<name>_median`` and ``<name>_std`` for
        ``word_separation``, ``word_separation_index``, ``word_height_index``
        and ``word_area_index``; ``word_height_avg`` (a mean) with
        ``word_height_avg_median`` and ``word_height_avg_std``; and
        ``line_height_avg`` / ``line_height_std``. A statistic with no
        contributing values is NaN.
    """
    def stat(func, values):
        # NaN, without NumPy's "empty slice" warning, when nothing qualifies.
        return func(values) if len(values) else float('nan')

    # Don't use areas with 1 line or no words in creating summary statistics.
    usable = [area for area in area_stats
              if area['words'] > 0 and area['lines'] > 1]

    series = (
        ('word_separation',
         [np.nanmean(area['word_distances']) for area in usable]),
        ('word_separation_index',
         [area['word_separation_index'] for area in usable]),
        ('word_height_index',
         [area['word_height_index'] for area in usable]),
        ('word_area_index',
         [area['word_area_index'] for area in usable]),
    )

    summary = {}
    for name, values in series:
        summary[name + '_mean'] = stat(np.nanmean, values)
        summary[name + '_median'] = stat(np.nanmedian, values)
        summary[name + '_std'] = stat(np.nanstd, values)

    # The mean of the average word heights has no "_mean" suffix.
    word_height_avgs = [area['word_height_avg'] for area in usable]
    summary['word_height_avg'] = stat(np.nanmean, word_height_avgs)
    summary['word_height_avg_median'] = stat(np.nanmedian, word_height_avgs)
    summary['word_height_avg_std'] = stat(np.nanstd, word_height_avgs)

    # Flatten the line heights of all areas (no filtering here).
    line_heights = [height for area in area_stats
                    for height in area['line_heights']]
    summary['line_height_avg'] = stat(np.nanmean, line_heights)
    summary['line_height_std'] = stat(np.nanstd, line_heights)

    return summary
173
def line_word_height(line):
    """Return the average height of the words on one OCR line.

    Args:
        line: an hOCR ``ocr_line`` element; its ``ocrx_word`` spans are
            measured through the bounding box in their ``title`` attribute.

    Returns:
        0 when the line holds no words, otherwise the NaN-ignoring mean of
        the word heights (``y2 - y1``).
    """
    word_spans = line.find_all('span', 'ocrx_word')
    if len(word_spans) == 0:
        return 0

    boxes = [helpers.extractbbox(span.get('title')) for span in word_spans]
    return np.nanmean([box['y2'] - box['y1'] for box in boxes])
185
def area_summary(area):
    """Collect geometry and word-layout statistics for one hOCR area.

    Args:
        area: an hOCR ``ocr_carea`` element (BeautifulSoup tag). Its own
            bounding box and those of its ``ocr_line`` / ``ocrx_word``
            descendants are read from their ``title`` attributes via
            ``helpers.extractbbox``.

    Returns:
        dict with the keys
        ``soup`` (the element itself), ``x1``/``y1``/``x2``/``y2``,
        ``lines``, ``line_heights``, ``words``, ``area``, ``x_gaps`` (one
        flag per x column, 1 where a word covers it), ``words_in_line``
        (always empty), ``word_distances``, ``word_heights``,
        ``word_areas``, ``words_per_line``, ``gaps``,
        ``word_separation_index``, ``word_height_index``,
        ``word_height_avg`` and ``word_area_index``.
    """
    summary = {'soup': area}
    # Bounding box (x1, y1, x2, y2)
    summary.update(helpers.extractbbox(area.get('title')))
    left = summary['x1']
    width = summary['x2'] - left

    # Number of lines and the height of each
    ocr_lines = area.find_all('span', 'ocr_line')
    summary['lines'] = len(ocr_lines)
    summary['line_heights'] = []
    for line in ocr_lines:
        line_box = helpers.extractbbox(line.get('title'))
        summary['line_heights'].append(line_box['y2'] - line_box['y1'])

    # Number of words: non-empty, space-separated tokens of the area text.
    # NOTE(review): replace(' ', ' ') is a no-op as written; it may have been
    # a double-space collapse originally - confirm.
    text = area.getText().strip().replace('\n', ' ').replace(' ', ' ')
    summary['words'] = len([token for token in text.split(' ') if token])

    # Area
    summary['area'] = width * (summary['y2'] - summary['y1'])

    # One flag per x column of the area; set where any word covers it.
    summary['x_gaps'] = np.zeros(width, dtype=int)

    summary['words_in_line'] = []
    summary['word_distances'] = []
    summary['word_heights'] = []
    summary['word_areas'] = []
    summary['words_per_line'] = []

    for line in ocr_lines:
        # Parse every word box of the line once.
        boxes = [helpers.extractbbox(word.get('title'))
                 for word in line.find_all('span', 'ocrx_word')]

        # Record the number of words in this line
        summary['words_per_line'].append(len(boxes))

        for idx, box in enumerate(boxes):
            height = box['y2'] - box['y1']
            summary['word_heights'].append(height)
            summary['word_areas'].append((box['x2'] - box['x1']) * height)

            # Mark the columns covered by the word, clipped to the area so a
            # word box poking outside it can neither wrap around nor index
            # past the end of the array.
            start = max(0, box['x1'] - left)
            stop = min(width, box['x2'] - left)
            if start < stop:
                summary['x_gaps'][start:stop] = 1

            # Distance from the right edge of this word to the left edge of
            # the next word on the line (none for the last word).
            if idx + 1 < len(boxes):
                following = boxes[idx + 1]
                summary['word_distances'].append(math.sqrt(
                    math.pow(following['x1'] - box['x2'], 2)
                    + math.pow(following['y1'] - box['y1'], 2)))

    # Count whitespace gaps
    summary['gaps'] = helpers.get_gaps(summary['x_gaps'])

    has_words = summary['words'] != 0

    # Mean of the differences of the word distances (all the same == 0)
    summary['word_separation_index'] = (
        helpers.meanOfDifferences(summary['word_distances']) if has_words else 0)

    # Variation in the height of words in this area
    summary['word_height_index'] = (
        helpers.meanOfDifferences(summary['word_heights']) if has_words else 0)

    # Average word height of this area
    summary['word_height_avg'] = (
        np.nanmean(summary['word_heights']) if has_words else 0)

    # Share of the area covered by word boxes
    summary['word_area_index'] = (
        np.sum(summary['word_areas']) / float(summary['area']) if has_words else 0)

    return summary
256
257
258
def process_page(doc_stats, page):
    """Build the extraction bounding boxes for one classified page.

    Steps:
      1. find "indicator lines" (caption-like lines such as "Table 1");
      2. assign every ``table`` area to its nearest indicator line;
      3. when one caption claims areas on both sides of it, keep the more
         table-like one;
      4. build one extract per caption from the caption and its areas;
      5. grow each extract into neighbouring table-like areas without
         running into another extract;
      6. add extracts for uncaptioned ("orphan") tables that are not
         already covered by a captioned extract.

    Args:
        doc_stats: document statistics; ``word_height_avg`` and
            ``word_height_avg_std`` are read here.
        page: page dict with ``areas`` (as returned by ``classify_areas``),
            ``lines`` (hOCR line elements), ``page`` (page bounding box) and
            ``page_no``. Area dicts are modified in place (``caption`` is
            set; ``type`` may become ``'possible table'``).

    Returns:
        list of extract dicts. Captioned extracts carry ``name``,
        ``direction``, ``indicator_line`` and ``x1``/``y1``/``x2``/``y2``;
        orphan extracts are whatever ``helpers.union_extracts`` returns plus
        ``name='Unknown'`` and ``direction='None'``.
    """
    # Text clearly smaller than the document average is treated as small
    # print that belongs to a table or figure.
    small_text_cutoff = (doc_stats['word_height_avg']
                         - (doc_stats['word_height_avg_std'] / 4))

    def overlap_fraction(area, extract, low, high):
        # Share of the area's extent along one axis that the extract covers.
        shared = max(0, abs(min(area[high], extract[high])
                            - max(extract[low], area[low])))
        extent = area[high] - area[low]
        if extent == 0:
            # Degenerate area: callers only ask after establishing overlap,
            # so a zero-length extent lies entirely within the extract.
            return 1.0
        return float(shared) / extent

    def find_above_and_below(extract):
        # Index the areas that line up with the extract, per direction.
        # 'above' and 'left' are built back to front so that iteration
        # starts at the area nearest to the extract.
        out = {'above': [], 'below': [], 'left': [], 'right': []}
        for area_idx, area in enumerate(page['areas']):
            # Overlap in x space -> candidate for above/below
            if area['x1'] <= extract['x2'] and extract['x1'] <= area['x2']:
                if overlap_fraction(area, extract, 'x1', 'x2') >= 0.9:
                    if helpers.centroid(area)['y'] <= helpers.centroid(extract)['y']:
                        out['above'].insert(0, area_idx)
                    else:
                        out['below'].append(area_idx)
            # Otherwise overlap in y space -> candidate for left/right
            elif area['y1'] <= extract['y2'] and extract['y1'] <= area['y2']:
                if overlap_fraction(area, extract, 'y1', 'y2') >= 0.9:
                    if helpers.centroid(area)['x'] <= helpers.centroid(extract)['x']:
                        out['left'].insert(0, area_idx)
                    else:
                        out['right'].append(area_idx)
        return out

    def expand_extraction(extract_idx, props):
        # Grow one extract, direction by direction, into neighbouring areas.
        # `props` is unused; the relations are read from `extract_relations`.
        for direction, candidates in extract_relations[extract_idx].items():
            for area_idx in candidates:
                candidate = page['areas'][area_idx]
                grown = helpers.enlarge_extract(extracts[extract_idx], candidate)

                # Never grow into another extract. Once a direction is
                # blocked, nothing further out in it can be reached either.
                if any(helpers.rectangles_intersect(extracts[other_idx], grown)
                       for other_idx in extract_relations
                       if other_idx != extract_idx):
                    break

                kind = candidate['type']
                absorb = (
                    kind in ('caption', 'table', 'line')
                    or (kind == 'possible table'
                        and direction == extracts[extract_idx]['direction'])
                    or (kind in ('text block', 'other')
                        and candidate['word_height_avg'] < small_text_cutoff)
                )
                if not absorb:
                    break
                extracts[extract_idx].update(grown)

    # Find the captions/titles for charts, figures, maps, tables
    match_flags = re.IGNORECASE | re.MULTILINE
    # A target word plus a number (must span the whole line to count)
    dedicated_line = re.compile(r'(table|figure|fig|map)(\.)? \d+(\.)?', match_flags)
    # A target word and a number followed by a period at the start of a line
    caption_start = re.compile(r'(table|figure|fig|map)(\.)? \d+(\.)', match_flags)
    # Problematic tesseract matches
    garbled_caption = re.compile(
        r'^(table|figure|fig|map)(\.)? \w{1,5}(\S)?(\w{1,5})?(\.)?', match_flags)

    indicator_lines = []
    for line in page['lines']:
        # NOTE(review): replace(' ', ' ') is a no-op as written; it may have
        # been a double-space collapse originally - confirm.
        clean_line = line.getText().strip().replace('\n', ' ').replace(' ', ' ').lower()

        match = dedicated_line.match(clean_line)
        if not (match and match.group(0) == clean_line):
            match = caption_start.match(clean_line) or garbled_caption.match(clean_line)
        if not match:
            continue

        bbox = helpers.extractbbox(line.get('title'))
        bbox['name'] = match.group(0)
        print('  %s' % (bbox['name'].replace('.', ''),))
        indicator_lines.append(bbox)

    # Assign the nearest caption to each table and keep track of which
    # areas each caption received: caption_idx -> [area_idx, area_idx, ...]
    caption_areas = {}
    if indicator_lines:
        for area_idx, area in enumerate(page['areas']):
            if area['type'] != 'table':
                continue
            distances = [helpers.min_distance(area, line) for line in indicator_lines]
            nearest_caption = distances.index(min(distances))
            # TODO: Need to check if expanding to this caption would intersect
            # any text areas that don't intersect the caption
            area['caption'] = nearest_caption
            caption_areas.setdefault(nearest_caption, []).append(area_idx)

    # Sanity check the caption-area assignments: a caption should not own
    # tables on both sides of itself.
    for caption in list(caption_areas):
        # Only check if the caption is assigned to more than one area
        if len(caption_areas[caption]) <= 1:
            continue

        # Horizontal line across the whole page at the caption.
        # NOTE(review): y1 + (y2 - y1) is the caption's bottom edge (y2), not
        # its middle as the original comment said - confirm the intent.
        caption_line_y = (indicator_lines[caption]['y1']
                          + (indicator_lines[caption]['y2'] - indicator_lines[caption]['y1']))
        caption_line = {
            'x1': page['page']['x1'],
            'y1': caption_line_y,
            'x2': page['page']['x2'],
            'y2': caption_line_y,
        }

        # For every pair of areas, connect their centroids. If that segment
        # crosses the caption line, the two areas sit on opposite sides of
        # the caption, so only the more table-like one keeps it.
        for first, second in itertools.combinations(caption_areas[caption], 2):
            a = helpers.centroid(page['areas'][first])
            b = helpers.centroid(page['areas'][second])
            area_line = {'x1': a['x'], 'y1': a['y'], 'x2': b['x'], 'y2': b['y']}
            if not helpers.lines_intersect(caption_line, area_line):
                continue
            if page['areas'][first]['table_score'] > page['areas'][second]['table_score']:
                caption_areas[caption] = [idx for idx in caption_areas[caption]
                                          if idx != second]
            else:
                page['areas'][first]['type'] = 'possible table'
                caption_areas[caption] = [idx for idx in caption_areas[caption]
                                          if idx != first]

    # Extracts are bounding boxes that will be used to actually extract the tables
    extracts = []
    for caption, area_indexes in caption_areas.items():
        indicator = indicator_lines[caption]
        print(indicator)
        areas_centroid_y_mean = np.mean(
            [helpers.centroid(page['areas'][idx])['y'] for idx in area_indexes])
        indicator_centroid_y = helpers.centroid(indicator)['y']

        areas_of_interest = [page['areas'][idx] for idx in area_indexes]
        # Also include every area that the indicator line itself intersects
        for area in page['areas']:
            if helpers.rectangles_intersect(area, indicator):
                areas_of_interest.append(area)

        # The extract spans the caption and its table(s), plus padding
        extracts.append({
            'name': indicator['name'],
            'direction': 'below' if areas_centroid_y_mean > indicator_centroid_y else 'above',
            'indicator_line': indicator,
            'x1': min(a['x1'] for a in areas_of_interest) - padding,
            'y1': min(a['y1'] for a in areas_of_interest) - padding,
            'x2': max(a['x2'] for a in areas_of_interest) + padding,
            'y2': max(a['y2'] for a in areas_of_interest) + padding,
        })

    # Make sure each table was assigned a caption
    assigned_tables = []
    for area_indexes in caption_areas.values():
        assigned_tables = assigned_tables + area_indexes

    all_tables = [area_idx for area_idx, area in enumerate(page['areas'])
                  if area['type'] == 'table']

    unassigned_tables = set()
    if sorted(assigned_tables) == sorted(all_tables):
        print('all tables have a caption on page %s' % (page['page_no'],))
    else:
        unassigned_tables = set(all_tables).difference(assigned_tables)
        print('Not all tables have a caption on page %s' % (page['page_no'],))
        print('Not assigned -  %s' % (unassigned_tables,))

    # Tables without a caption go in a different pile: their starting
    # extract is just the table area(s), and such extracts may be merged.
    orphan_extracts = []
    for table in sorted(unassigned_tables):
        if page['areas'][table]['table_score'] > 5:
            orphan_extracts.append(
                helpers.expand_area(page['areas'][table], page['areas']))

    orphan_extracts = helpers.union_extracts(orphan_extracts)

    for extract in orphan_extracts:
        extract['name'] = 'Unknown'
        extract['direction'] = 'None'

    # Find all areas that line up with each extract, then grow the extracts
    extract_relations = {}
    for extract_idx, extract in enumerate(extracts):
        extract_relations[extract_idx] = find_above_and_below(extract)

    for extract_idx in range(len(extracts)):
        expand_extraction(extract_idx, extract_relations[extract_idx])

    for extract in orphan_extracts:
        # Find out if a good extraction already covers (>= 90% of) this area
        extract_poly = helpers.make_polygon(extract)
        covers = False
        for each in extracts:
            intersection = extract_poly.intersection(helpers.make_polygon(each))
            # Compare areas; the geometry object itself is not a number.
            if intersection.area >= (extract_poly.area * 0.9):
                covers = True
                break

        if not covers:
            extracts.append(extract)
            new_idx = len(extracts) - 1
            extract_relations[new_idx] = find_above_and_below(extract)
            expand_extraction(new_idx, extract_relations[new_idx])

    return extracts
645
646
647# Entry into table extraction
def extract_tables(document_path):
    """Entry into table extraction for one document directory.

    Reads every hOCR page in ``<document_path>/tesseract/*.html`` and, when
    present and non-trivial, the native text layer in
    ``<document_path>/text.txt``. It computes document statistics,
    classifies the areas of every page, and hands each extract found by
    ``process_page`` to ``helpers.extract_table``.

    Args:
        document_path: directory of the document (no trailing slash needed).

    Returns:
        None. Results are produced by ``helpers.extract_table``; progress is
        printed to stdout.
    """
    page_paths = glob.glob(document_path + '/tesseract/*.html')
    if not page_paths:
        # Without pages there is nothing to summarize or extract.
        print('No tesseract pages found in %s' % (document_path,))
        return

    # Check if a native text layer is available and load it
    text_layer = ''
    text_path = document_path + '/text.txt'
    has_text_layer = os.path.exists(text_path) and os.path.getsize(text_path) > 1
    if has_text_layer:
        with open(text_path) as t:
            text_layer = t.read()
    else:
        print('Does not have text layer')

    pages = []
    for page_path in page_paths:
        # Read in each tesseract page with BeautifulSoup so we can look at
        # the document holistically
        with open(page_path) as hocr:
            soup = BeautifulSoup(hocr.read(), 'html.parser')

        pages.append({
            'page_no': os.path.basename(page_path).replace('.html', '').replace('page_', ''),
            'soup': soup,
            'page': helpers.extractbbox(soup.find_all('div', 'ocr_page')[0].get('title')),
            'areas': [area_summary(area) for area in soup.find_all('div', 'ocr_carea')],
            'lines': [line for line in soup.find_all('span', 'ocr_line')],
        })

        # Record the OCR-identified text if a native text layer was unavailable
        if not has_text_layer:
            text_layer += soup.getText()

    # Identify all charts/tables/etc mentioned in the text layer. Knowing
    # that the text says "see table 4" lets us still guess that a distorted
    # caption ("teble 4") is table 4, from its position in the document.
    # NOTE(review): replace(' ', ' ') is a no-op as written; it may have been
    # a double-space collapse originally - confirm.
    text_layer = text_layer.strip().replace('\n', ' ').replace(' ', ' ').lower()
    mentions = re.findall(
        r'(table|figure|fig|map|appendix|app|appx|tbl)(\.)? (\d+)(\.)?',
        text_layer, flags=re.IGNORECASE)
    figures = sorted(set(
        ' '.join(' '.join(groups).replace('.', '').replace('figure', 'fig').split()).lower()
        for groups in mentions))

    # Group the mentioned numbers by kind ('table', 'fig', ...)
    figure_idx = {}
    for fig in figures:
        parts = fig.split(' ')
        try:
            number = int(parts[1])
        except (IndexError, ValueError):
            # Often the "number" is a string that cannot be parsed
            continue
        figure_idx.setdefault(parts[0], []).append(number)

    # Clean up for reformat
    for key in figure_idx:
        figure_idx[key] = helpers.clean_range(sorted(set(figure_idx[key])))

    # All areas of all pages, flattened
    area_stats = [area for page in pages for area in page['areas']]

    # Calculate summary stats for the document from all areas identified by Tesseract
    doc_stats = summarize_document(area_stats)

    # Classify and assign a table score to each area in each page
    pages = [classify_areas(page, doc_stats) for page in pages]

    # Record the widths of the areas classified as 'text block'
    text_block_widths = [area['x2'] - area['x1']
                         for page in pages for area in page['areas']
                         if area['type'] == 'text block']

    # Stats about the text blocks of the whole document, without outliers
    # (widths outside median +/- 2 std are dropped first).
    two_sigma = []
    if text_block_widths:
        width_median = np.nanmedian(text_block_widths)
        width_spread = np.nanstd(text_block_widths) * 2
        two_sigma = [val for val in text_block_widths
                     if width_median - width_spread < val < width_median + width_spread]

    # Update doc stats, then reclassify
    doc_stats['text_block_median'] = np.nanmedian(two_sigma)
    doc_stats['text_block_std'] = np.nanstd(two_sigma)

    # Not a text block if its width is outside of 2 sigma
    narrowest = doc_stats['text_block_median'] - (2 * doc_stats['text_block_std'])
    widest = doc_stats['text_block_median'] + (2 * doc_stats['text_block_std'])
    for page in pages:
        for area in page['areas']:
            width = area['x2'] - area['x1']
            if area['type'] == 'text block' and (width < narrowest or width > widest):
                area['type'] = 'other'

    # Most documents only contain one page height, but others mix landscape
    # and portrait pages. Figure out which is the most common.
    doc_stats['page_height'] = np.bincount(
        [page['page']['y2'] - page['page']['y1'] for page in pages]).argmax()
    doc_stats['page_width'] = np.bincount(
        [page['page']['x2'] - page['page']['x1'] for page in pages]).argmax()

    # Find out if a header or footer is present in the document - make sure
    # we don't include them in extracts
    doc_stats['header'], doc_stats['footer'] = helpers.get_header_footer(
        pages, doc_stats['page_height'], doc_stats['page_width'])

    # NOTE(review): new_pages is not read again in this function; the calls
    # are kept in case helpers.reclassify_areas has side effects - TODO confirm.
    new_pages = {}
    for page in pages:
        new_pages[page['page_no']] = {
            'areas': helpers.reclassify_areas(page['areas'], doc_stats['line_height_avg'] / 2)
        }

    for page in pages:
        page_lines = page['soup'].find_all('span', 'ocr_line')
        for new_area in new_pages[page['page_no']]['areas']:
            new_area['lines'] = [
                line for line in page_lines
                if helpers.rectangles_intersect(
                    new_area['geom'], helpers.extractbbox(line.get('title')))
            ]

    # NOTE(review): second, result-less pass that only fed a disabled debug
    # plot; kept for the same side-effect caution as above - TODO confirm.
    for page in pages:
        helpers.reclassify_areas(page['areas'], doc_stats['line_height_avg'] / 2)

    doc_stats['found_tables'] = figure_idx
    print('these tables were found --')
    for ttype in figure_idx:
        print('  %s %s' % (ttype, figure_idx[ttype]))

    for page in pages:
        page_extracts = process_page(doc_stats, page)

        # Mark an extract whose name was already used on this page
        found = []
        for e in page_extracts:
            if e['name'] in found:
                e['name'] = e['name'] + '*'
            found.append(e['name'])

        for table in page_extracts:
            helpers.extract_table(document_path, page['page_no'], table)