# Paste-site header: 6 years ago · Nov 07, 2019, 02:46 PM
from os.path import exists
from tempfile import mkdtemp, mkstemp
from shutil import rmtree
from binascii import b2a_hex
from os import write, close
from threading import Thread

from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import (
    LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar, LTCurve,
    LTLine, LTRect,
)

from kivy.lang import Builder
from kivy.clock import Clock

from kivy.graphics import Mesh, Color, Line
from kivy.graphics.tesselator import Tesselator

from kivy.uix.widget import Widget
from kivy.uix.recycleview import RecycleView
from kivy.uix.label import Label
from kivy.uix.image import Image
from kivy.uix.relativelayout import RelativeLayout
from kivy.uix.boxlayout import BoxLayout

from kivy.properties import (
    StringProperty, ListProperty, NumericProperty, AliasProperty,
    DictProperty, ObjectProperty, BooleanProperty, ColorProperty,
)
35
# KV rules for the widgets defined in this module.  KV is indentation
# sensitive: every property line must be nested under its rule header, and
# canvas instructions under ``canvas.before``.
Builder.load_string('''
#:import RGBA kivy.utils.rgba

<PDFDocumentWidget>:
    viewclass: 'PDFPageWidget'
    key_size: 'size'
    # async load is buggy at the moment
    # async_load: True

    RecycleGridLayout:
        spacing: 5
        cols: root.cols
        rows: root.rows
        size_hint: None, None
        size: self.minimum_size
        default_size_hint: None, None

<PDFPageWidget>:
    size_hint: None, None

    canvas.before:
        Color:
            rgba: RGBA('FFFFFF')
        Rectangle:
            size: self.size

<PDFLabelWidget,PDFImageWidget>:
    size_hint: None, None

<PDFImageWidget>:
    pos: self.bbox[:2]
    size: self.bbox[2] - self.x, self.bbox[3] - self.y

<PDFLabelWidget>:
    text_size: self.width, None
    height: self.texture_size[1]
    color: RGBA('000000')
    font_size: 8

<PDFCurveWidget>:
''')
77
78
class PDFDocumentWidget(RecycleView):
    """RecycleView listing the pages of the PDF file named by ``source``.

    Changing ``source`` re-runs :meth:`load`, which parses the file with
    pdfminer and fills ``data`` with one entry per page.  The page widgets
    read ``interpreter`` and ``device`` from this object to lay out their
    page.
    """

    # path of the pdf file to display
    source = StringProperty()
    # password handed to pdfminer when opening the document
    password = StringProperty()
    # forwarded to the RecycleGridLayout by the kv rule; None is the default,
    # so None must also be accepted when assigned explicitly
    cols = NumericProperty(None, allownone=True)
    rows = NumericProperty(None, allownone=True)
    # list of (level, title) pairs taken from the document outline
    _toc = ListProperty()
    # when True, page widgets run the pdfminer layout in a worker thread
    async_load = BooleanProperty(False)

    def __init__(self, **kwargs):
        super(PDFDocumentWidget, self).__init__(**kwargs)
        self._fp = None
        self._document = None
        self._tmpdir = None
        self.bind(source=self.load)
        # ``source`` given as a kwarg was set before the binding above
        # existed, so the first load has to be started by hand
        if self.source:
            self.load()

    def _close_file(self):
        """Close the currently open pdf file, if any."""
        if self._fp:
            self._fp.close()
            self._fp = None

    def load(self, *args):
        """(Re)load the document named by ``source``.

        Extra positional arguments are ignored; they are only accepted so
        the method can be used as a property-binding callback.  When
        ``source`` is empty or does not exist, the widget is reset to an
        empty state and the temporary image directory is removed.
        """
        # close the previous pdf file
        self._close_file()

        pdf_doc = self.source
        if not pdf_doc or not exists(pdf_doc):
            # legacy attributes: nothing in this file reads them, they are
            # kept in case code elsewhere does
            self.pages = []
            self._doc = []
            self._document = None
            self._toc = []
            # drop the pages of the previous document instead of leaving
            # them on screen
            self.data = []
            if self._tmpdir:
                rmtree(self._tmpdir)
                self._tmpdir = None
            # nothing to open: do not fall through to open()
            return

        try:
            # open the pdf file; it stays open because pages are read from
            # it later, and is closed by the next load()
            self._fp = fp = open(pdf_doc, 'rb')
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document
            # structure, supplying the password for initialization
            doc = PDFDocument(parser, password=self.password)
            # connect the parser and document objects
            parser.set_document(doc)

            self._document = doc
            self._parse_toc()
            self._create_tmpdir()
            self._parse_pages()
        except IOError as e:
            # the file doesn't exist or similar problem
            print(e)

    def _create_tmpdir(self):
        """Return the temporary directory for extracted images, creating
        it on first use.
        """
        if not self._tmpdir:
            self._tmpdir = mkdtemp()
        return self._tmpdir

    def _parse_toc(self):
        """Store the outline of the open document in ``_toc`` as a list of
        ``(level, title)`` pairs.

        Best effort: entries collected before a failure are kept, and a
        document without an outline yields an empty list.
        """
        toc = []
        doc = self._document
        if doc is None:
            self._toc = toc
            return

        try:
            for level, title, _dest, _action, _se in doc.get_outlines():
                toc.append((level, title))
        except PDFNoOutlines:
            # the document simply has no outline
            pass
        except Exception as e:
            # a broken outline must not prevent the pages from loading, but
            # report it instead of hiding it
            print(e)
        finally:
            self._toc = toc

    def _parse_pages(self):
        """Create the pdfminer interpreter/device pair and fill ``data``
        with one dict per page of the current document.
        """
        doc = self._document
        if not doc:
            self.data = []
            return

        rsrcmgr = PDFResourceManager()
        self.device = device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        self.interpreter = PDFPageInterpreter(rsrcmgr, device)

        # 'manager' is listed before 'page' because PDFPageWidget.on_page
        # reads ``manager`` (presumably the view applies keys in dict
        # order; verify against the RecycleView implementation).
        # 'size' is the upper-right corner of the MediaBox, i.e. the page
        # size when the box starts at the origin.
        self.data = [
            {
                'manager': self,
                'page': page,
                'size': page.attrs.get('MediaBox', [0, 0, 0, 0])[2:],
            }
            for page in PDFPage.create_pages(doc)
        ]
173
174
class PDFImageWidget(Image):
    """Image widget positioned through a ``bbox`` list property.

    The kv rule for this class reads ``bbox[:2]`` as the position and
    ``bbox[2]``/``bbox[3]`` minus ``x``/``y`` as the size.
    """

    # defaults to a 100 x 100 box at the origin
    bbox = ListProperty([0, 0, 100, 100])
177
178
class PDFLabelWidget(Label):
    """Label used for the text of a pdf page; styled by the kv rule of the
    same name.
    """

    # same default as PDFImageWidget.bbox; not read by any code in this file
    bbox = ListProperty([0, 0, 100, 100])
181
182
class PDFCurveWidget(Widget):
    """Widget drawing one path (line, rectangle or curve) on its canvas.

    The canvas is rebuilt on the next frame whenever one of the properties
    below changes.
    """

    # path vertices, either flat [x0, y0, x1, y1, ...] or a list of (x, y)
    points = ListProperty()
    line_width = NumericProperty()
    # draw the outline of the path
    stroke = BooleanProperty(False)
    # draw the interior of the path
    fill = BooleanProperty(False)
    # NOTE(review): stored and observed, but build() does not use it yet
    even_odd = BooleanProperty()
    # outline colour
    color = ColorProperty()
    # interior colour
    fill_color = ColorProperty()

    def __init__(self, **kwargs):
        super(PDFCurveWidget, self).__init__(**kwargs)
        build = Clock.create_trigger(self.build, 0)

        self.bind(
            points=build,
            line_width=build,
            stroke=build,
            fill=build,
            even_odd=build,
            color=build,
            fill_color=build
        )
        # properties passed as kwargs were set before the bindings above
        # existed, so the first build has to be requested explicitly
        build()

    def _flat_points(self):
        """Return ``points`` as a flat ``[x0, y0, x1, y1, ...]`` list,
        whether it was given flat or as a list of ``(x, y)`` pairs.
        """
        flat = []
        for point in self.points:
            if isinstance(point, (list, tuple)):
                flat.extend(point)
            else:
                flat.append(point)
        return flat

    def build(self, *args):
        """Redraw the canvas from the current property values.

        Extra positional arguments (the Clock's ``dt``) are ignored.
        """
        self.canvas.clear()
        points = self._flat_points()
        if not points:
            return

        with self.canvas:
            # a filled area needs at least three vertices (six coordinates)
            if self.fill and len(points) >= 6:
                Color(rgba=self.fill_color)
                tess = Tesselator()
                tess.add_contour(points)
                if tess.tesselate():
                    for vertices, indices in tess.meshes:
                        Mesh(
                            vertices=vertices,
                            indices=indices,
                            mode='triangle_fan'
                        )
                else:
                    print("mesh didn't tesselate!")

            if self.stroke:
                Color(rgba=self.color)
                Line(
                    points=points,
                    width=self.line_width
                )
232
233
class PDFPageWidget(RelativeLayout):
    """View class rendering one pdf page as child widgets.

    ``manager`` is the owning PDFDocumentWidget and ``page`` the pdfminer
    page to show.  Setting ``page`` runs the pdfminer layout; the resulting
    layout objects land in ``items`` and are turned into label, image and
    curve widgets.
    """

    # labels of the current page, keyed by (box_pos, box_size)
    labels = DictProperty()
    attributes = DictProperty()
    manager = ObjectProperty()
    page = ObjectProperty()
    # layout objects of the current page, as produced by pdfminer
    items = ListProperty()
    # image widgets of the current page
    images = ListProperty()

    def on_page(self, *args):
        """Lay out the newly assigned page, in a worker thread when the
        manager's ``async_load`` is set.
        """
        if self.page is None or self.manager is None:
            return

        if self.manager.async_load:
            Thread(target=self._load_page, kwargs={'threaded': True}).start()
        else:
            self._load_page()

    def _load_page(self, threaded=False):
        """Run the pdfminer layout for the current page and store the result
        in ``items``.

        With ``threaded`` set, the result is handed to the main thread
        through the Clock, because ``on_items`` creates widgets.
        NOTE(review): the interpreter and device are shared by all pages of
        the manager; concurrent worker threads are not serialised here.
        """
        page = self.page
        manager = self.manager
        manager.interpreter.process_page(page)
        items = list(manager.device.get_result())

        if threaded:
            Clock.schedule_once(lambda dt: self._set_items(page, items))
        else:
            self.items = items

    def _set_items(self, page, items):
        """Store ``items`` unless the widget was given another page while
        they were being computed.
        """
        if self.page is page:
            self.items = items

    def on_items(self, *args):
        """Replace the child widgets with ones built from ``items``."""
        self.clear_widgets()
        # forget the widgets of the previously shown page, otherwise
        # add_text() would reuse labels that were just removed
        self.labels = {}
        self.images = []
        self._render_content(self.items)

    def _render_content(self, lt_objs):
        """Iterate through the list of LT* objects and capture the text
        or image data contained in each
        """
        for lt_obj in lt_objs:
            if isinstance(lt_obj, LTChar):
                self.add_text(
                    text=lt_obj.get_text(),
                    box_pos=(lt_obj.x0, lt_obj.y0),
                    box_size=(lt_obj.width, lt_obj.height),
                )

            elif isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # one label per text box or line: this is very limited
                # style wise (no font, color or style), as pdfminer only
                # provides that information per character, which is
                # impractical to create individual labels for
                self.add_text(
                    text=lt_obj.get_text(),
                    box_pos=(lt_obj.x0, lt_obj.y0),
                    box_size=(lt_obj.width, lt_obj.height),
                )

            elif isinstance(lt_obj, LTImage):
                self.add_image(lt_obj)

            elif isinstance(lt_obj, LTFigure):
                # figures are containers: render what they contain
                self._render_content(lt_obj)

            # all of these are actually LTCurves, but all types here for
            # clarity
            elif isinstance(lt_obj, (LTLine, LTRect, LTCurve)):
                self.add_widget(
                    PDFCurveWidget(
                        points=lt_obj.pts or [],
                        line_width=lt_obj.linewidth or 1.0,
                        stroke=lt_obj.stroke,
                        fill=lt_obj.fill,
                        even_odd=lt_obj.evenodd,
                        # colors seem to be indices, to some dict i
                        # can't find in what pdfminer exposes
                        # NOTE(review): the stroke colour is white and the
                        # page background is white too
                        color='#FFFFFFFF',  # lt_obj.stroking_color or
                        fill_color='#00000000'  # lt_obj.non_stroking_color or
                    )
                )

    def save_image(self, lt_image):
        """Try to save the image data from this LTImage object, and
        return the file name, if successful (None otherwise)
        """
        stream = lt_image.stream
        if not stream:
            return None

        file_stream = stream.get_rawdata()
        if not file_stream:
            return None

        file_ext = self.determine_image_type(file_stream[0:4])
        if not file_ext:
            return None

        # file_ext already starts with a dot
        fd, fn = mkstemp(dir=self.manager._tmpdir, suffix=file_ext)
        try:
            write(fd, file_stream)
        finally:
            # release the descriptor even when the write fails
            close(fd)
        return fn

    @staticmethod
    def determine_image_type(stream_first_4_bytes):
        """Find out the image file type based on the magic number at the
        start of the given bytes.

        Returns the extension including the leading dot ('.jpeg', '.png',
        '.gif' or '.bmp'), or None when the bytes match none of them.
        """
        signatures = (
            (b'\xff\xd8', '.jpeg'),
            (b'\x89PNG', '.png'),
            (b'GIF8', '.gif'),
            (b'BM', '.bmp'),
        )
        for magic, file_type in signatures:
            if stream_first_4_bytes.startswith(magic):
                return file_type
        return None

    def add_text(self, text, box_pos, box_size, **kwargs):
        """Show ``text`` in a label at ``box_pos`` with ``box_size``.

        Text for a box that already has a label on this page is appended
        to that label instead of creating a second one.  Extra keyword
        arguments are passed to the new label.
        """
        key = (box_pos, box_size)
        label = self.labels.get(key)
        if not label:
            label = PDFLabelWidget(
                text=text, pos=box_pos, size=box_size, **kwargs
            )
            self.labels[key] = label
            self.add_widget(label)
        else:
            label.text += text

    def add_image(self, lt_image):
        """Save the data of ``lt_image`` to a file and show it at the
        image's bounding box; does nothing when the data cannot be saved.
        """
        source = self.save_image(lt_image)
        if source:
            # position and size come from ``bbox`` through the kv rule of
            # PDFImageWidget
            image = PDFImageWidget(
                source=source,
                bbox=list(lt_image.bbox)
            )
            self.add_widget(image)
            self.images.append(image)
368
369
if __name__ == '__main__':
    from sys import argv
    from kivy.base import runTouchApp
    from kivy.uix.scrollview import ScrollView

    # use the file named on the command line, otherwise the hard-coded
    # default file name
    fn = argv[1] if len(argv) > 1 else 'Дудников, А.В Русский язык.pdf'

    # single-column document view as the root widget
    root = PDFDocumentWidget(source=fn, cols=1)
    runTouchApp(root)