# Pasted 6 years ago · Nov 07, 2019, 11:08 PM
from os.path import exists
from tempfile import mkdtemp, mkstemp
from shutil import rmtree
from binascii import b2a_hex
from os import write, close
from threading import Thread

from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import (
    LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar, LTCurve,
    LTLine, LTRect,
)

from kivy.lang import Builder
from kivy.clock import Clock
import kivy.graphics
from kivy.graphics import Mesh, Color, Line
from kivy.graphics.tesselator import Tesselator

from kivy.uix.widget import Widget
from kivy.uix.recycleview import RecycleView
from kivy.uix.label import Label
from kivy.uix.image import Image
from kivy.uix.relativelayout import RelativeLayout
from kivy.uix.boxlayout import BoxLayout

from kivy.properties import (
    StringProperty, ListProperty, NumericProperty, AliasProperty,
    DictProperty, ObjectProperty, BooleanProperty, ColorProperty,
)
35
# KV rules for the PDF widgets defined in this file. KV is
# indentation-sensitive, so the rule bodies are nested explicitly.
Builder.load_string('''
#:import RGBA kivy.utils.rgba

<PDFDocumentWidget>:
    viewclass: 'PDFPageWidget'
    key_size: 'size'
    # async load is buggy at the moment
    # async_load: True

    RecycleGridLayout:
        spacing: 5
        cols: root.cols
        rows: root.rows
        size_hint: None, None
        size: self.minimum_size
        default_size_hint: None, None

<PDFPageWidget>:
    size_hint: None, None

    canvas.before:
        Color:
            rgba: RGBA('FFFFFF')
        Rectangle:
            size: self.size

<PDFLabelWidget,PDFImageWidget>:
    size_hint: None, None

<PDFImageWidget>:
    pos: self.bbox[:2]
    size: self.bbox[2] - self.x, self.bbox[3] - self.y

<PDFLabelWidget>:
    text_size: self.width, None
    height: self.texture_size[1]
    color: RGBA('000000')
    font_size: 8

<PDFCurveWidget>:
''')
77
78
class PDFDocumentWidget(RecycleView):
    """RecycleView listing the pages of the PDF file named by ``source``.

    Each entry of ``data`` is a dict with the keys ``manager`` (this
    widget), ``page`` (a pdfminer page object) and ``size`` (the last two
    values of the page's ``MediaBox``).  The shared pdfminer ``device`` and
    ``interpreter`` are stored on the instance for the page widgets to use.
    """

    source = StringProperty()
    # NOTE(review): ``password`` is declared but not passed to pdfminer
    # anywhere in this class -- confirm whether it should be.
    password = StringProperty()
    cols = NumericProperty(None)
    rows = NumericProperty(None)
    _toc = ListProperty()
    async_load = BooleanProperty(False)

    def __init__(self, **kwargs):
        super(PDFDocumentWidget, self).__init__(**kwargs)
        self._fp = None
        self._document = None
        self._tmpdir = None
        self.bind(source=self.load)
        # ``source`` given as a keyword was set before the binding above
        # existed, so the first load has to be triggered by hand.
        if self.source:
            self.load()

    def load(self, *args):
        """(Re)load the document named by ``source``.

        Closes the previously opened file, then parses the new one.  When
        ``source`` is empty or does not name an existing file, the widget
        is reset to an empty state instead.  An ``IOError`` raised while
        opening or parsing is printed and also resets the widget.
        """
        if self._fp:
            # close the previous pdf file
            self._fp.close()
            self._fp = None

        pdf_doc = self.source
        if not pdf_doc or not exists(pdf_doc):
            if pdf_doc:
                print('PDF file not found: {!r}'.format(pdf_doc))
            self._reset()
            return

        try:
            # open the pdf file
            self._fp = fp = open(pdf_doc, 'rb')
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser)
            # connect the parser and document objects
            parser.set_document(doc)
        except IOError as e:
            # the file could not be read
            print(e)
            self._reset()
            return

        self._document = doc
        self._parse_toc()
        self._create_tmpdir()
        self._parse_pages()

    def _reset(self):
        """Drop every trace of the current document, including its tmpdir."""
        # ``pages`` and ``_doc`` are not read in this file; they are still
        # assigned because the previous implementation set them here.
        self.pages = []
        self._doc = []
        self._document = None
        self._toc = []
        self.data = []
        if self._tmpdir:
            rmtree(self._tmpdir, ignore_errors=True)
            self._tmpdir = None

    def _create_tmpdir(self):
        """Return the temporary directory, creating it on first use."""
        if not self._tmpdir:
            self._tmpdir = mkdtemp()
        return self._tmpdir

    def _parse_toc(self):
        """Store the document outline in ``_toc`` as (level, title) pairs.

        A document without outlines yields an empty list.  Any other error
        is printed and the entries collected so far are kept.
        """
        toc = []
        doc = self._document
        try:
            for (level, title, dest, a, se) in doc.get_outlines():
                toc.append((level, title))
        except PDFNoOutlines:
            pass
        except Exception as e:
            # best effort: a broken outline must not prevent page display
            print('could not read the table of contents: {}'.format(e))
        self._toc = toc

    def _parse_pages(self):
        """Fill ``data`` with one entry per page of the current document."""
        doc = self._document
        if not doc:
            self.data = []
            return

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        self.device = device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, device)

        self.data = [
            {
                'manager': self,
                'page': page,
                'size': page.attrs.get('MediaBox', [0, 0, 0, 0])[2:],
            }
            for page in PDFPage.create_pages(doc)
        ]
173
174
class PDFImageWidget(Image):
    """Image whose ``bbox`` list property defaults to [0, 0, 100, 100]."""

    bbox = ListProperty([0, 0, 100, 100])
177
178
class PDFLabelWidget(Label):
    """Label whose ``bbox`` list property defaults to [0, 0, 100, 100]."""

    bbox = ListProperty([0, 0, 100, 100])
181
182
class PDFCurveWidget(Widget):
    """Widget drawing one path on its canvas, optionally filled and stroked.

    ``points`` may be a flat ``[x0, y0, x1, y1, ...]`` list or a list of
    ``(x, y)`` pairs.  The canvas is rebuilt (through a Clock trigger)
    whenever one of the drawing properties changes.
    """

    points = ListProperty()
    line_width = NumericProperty()
    stroke = BooleanProperty(False)
    fill = BooleanProperty(False)
    # NOTE(review): ``even_odd`` triggers a rebuild but is not applied to
    # the tesselation below.
    even_odd = BooleanProperty()
    color = ColorProperty()
    fill_color = ColorProperty()

    def __init__(self, **kwargs):
        super(PDFCurveWidget, self).__init__(**kwargs)
        build = Clock.create_trigger(self.build, 0)

        self.bind(
            points=build,
            line_width=build,
            stroke=build,
            fill=build,
            even_odd=build,
            color=build,
            fill_color=build
        )
        # Values passed as keyword arguments were set before the bindings
        # existed, so schedule the first build explicitly.
        build()

    def _flat_points(self):
        """Return ``points`` as a flat list of coordinates."""
        pts = self.points
        if pts and isinstance(pts[0], (list, tuple)):
            return [coord for point in pts for coord in point]
        return list(pts)

    def build(self, *args):
        """Clear the canvas and redraw the path from the current properties."""
        self.canvas.clear()
        points = self._flat_points()
        if not points:
            return

        with self.canvas:
            if self.fill:
                Color(rgba=self.fill_color)
                tess = Tesselator()
                tess.add_contour(points)
                if tess.tesselate():
                    for vertices, indices in tess.meshes:
                        Mesh(
                            vertices=vertices,
                            indices=indices,
                            mode='triangle_fan'
                        )
                else:
                    print("mesh didn't tesselate!")

            if self.stroke:
                Color(rgba=self.color)
                Line(
                    points=points,
                    width=self.line_width
                )
232
233
class PDFPageWidget(RelativeLayout):
    """Widget rendering the layout objects of one PDF page.

    ``manager`` is expected to expose ``async_load``, ``interpreter``,
    ``device`` and ``_tmpdir``.  Setting ``page`` runs the manager's
    interpreter on it and stores the resulting layout objects in ``items``;
    a change of ``items`` rebuilds the child widgets.
    """

    labels = DictProperty()
    attributes = DictProperty()
    manager = ObjectProperty()
    page = ObjectProperty()
    items = ListProperty()

    def on_page(self, *args):
        """Load the new page, in a thread when the manager asks for it."""
        if self.manager is None or self.page is None:
            return
        if self.manager.async_load:
            Thread(target=self._load_page).start()
        else:
            self._load_page()

    def _load_page(self):
        """Interpret the current page and publish its layout objects."""
        manager, page = self.manager, self.page
        manager.interpreter.process_page(page)
        layout = list(manager.device.get_result())

        if manager.async_load:
            # Running in a worker thread: hand the result to the Clock so
            # the widgets are rebuilt from its callback instead.
            Clock.schedule_once(lambda dt: self._set_items(page, layout), 0)
        else:
            self.items = layout

    def _set_items(self, page, layout):
        """Store ``layout`` unless another page was assigned meanwhile."""
        if self.page is page:
            self.items = layout

    def on_items(self, *args):
        """Rebuild every child widget from ``items``."""
        self.clear_widgets()
        # the cached labels belong to the widgets that were just removed
        self.labels = {}
        self._render_content(self.items)

    def _render_content(self, lt_objs):
        """Iterate through the list of LT* objects and capture the text
        or image data contained in each
        """
        for lt_obj in lt_objs:
            if isinstance(lt_obj, LTChar):
                self.add_text(
                    text=lt_obj.get_text(),
                    box_pos=(lt_obj.x0, lt_obj.y0),
                    box_size=(lt_obj.width, lt_obj.height),
                    # font_size=lt_obj.fontsize,
                    # font_name=lt_obj.fontname,
                )

            elif isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # One label per text box/line.  This is very limited style
                # wise: pdfminer does not provide font, color or style at
                # text box level, only for individual characters, which are
                # impractical to create direct labels for.
                self.add_text(
                    text=lt_obj.get_text(),
                    box_pos=(lt_obj.x0, lt_obj.y0),
                    box_size=(lt_obj.width, lt_obj.height),
                )

            elif isinstance(lt_obj, LTImage):
                self.add_image(lt_obj)

            elif isinstance(lt_obj, LTFigure):
                self._render_content(lt_obj)

            # all of these are actually LTCurves, but all types here for
            # clarity
            elif isinstance(lt_obj, (LTLine, LTRect, LTCurve)):
                self.add_widget(
                    PDFCurveWidget(
                        points=lt_obj.pts or [],
                        line_width=lt_obj.linewidth or 1.0,
                        stroke=lt_obj.stroke,
                        fill=lt_obj.fill,
                        even_odd=lt_obj.evenodd,
                        # colors seem to be indices, to some dict i
                        # can't find in what pdfminer exposes
                        color='#FFFFFFFF',  # lt_obj.stroking_color or
                        fill_color='#00000000'  # lt_obj.non_stroking_color or
                    )
                )

    def save_image(self, lt_image):
        """Try to save the image data from this LTImage object, and
        return the file name, if successful

        Returns None when the image has no stream, no raw data, or a type
        that ``determine_image_type`` does not recognise.
        """
        stream = lt_image.stream
        if not stream:
            return None
        raw = stream.get_rawdata()
        if not raw:
            return None
        # determine_image_type already returns the leading dot
        suffix = self.determine_image_type(raw[0:4])
        if not suffix:
            return None

        fd, fn = mkstemp(dir=self.manager._tmpdir, suffix=suffix)
        try:
            write(fd, raw)
        finally:
            # never leak the descriptor, even when the write fails
            close(fd)
        return fn

    @staticmethod
    def determine_image_type(stream_first_4_bytes):
        """Find out the image file type based on the magic number comparison of the first 4 (or 2) bytes

        Returns '.jpeg', '.png', '.gif' or '.bmp', or None when the bytes
        match none of them.
        """
        file_type = None
        bytes_as_hex = b2a_hex(stream_first_4_bytes)
        if bytes_as_hex.startswith(b'ffd8'):
            file_type = '.jpeg'
        elif bytes_as_hex == b'89504e47':
            file_type = '.png'
        elif bytes_as_hex == b'47494638':
            file_type = '.gif'
        elif bytes_as_hex.startswith(b'424d'):
            file_type = '.bmp'
        return file_type

    def add_text(self, text, box_pos, box_size, **kwargs):
        """Add ``text`` at the given box, extending an existing label that
        was created for the same position and size.
        """
        key = (box_pos, box_size)
        label = self.labels.get(key)
        if not label:
            label = PDFLabelWidget(text=text, pos=box_pos, size=box_size, **kwargs)
            self.labels[key] = label
            self.add_widget(label)
        else:
            label.text += text

    def add_image(self, lt_image):
        """Save the image held by ``lt_image`` and add a widget showing it.

        Nothing is added when the image could not be saved.
        """
        source = self.save_image(lt_image)
        if source:
            self.add_widget(
                PDFImageWidget(
                    source=source,
                    bbox=lt_image.bbox
                )
            )
367 self.images.append(image)
368
369
if __name__ == '__main__':
    # Only executed when the file is run as a script: imports the app
    # runner and creates a standalone document widget named ``root``.
    from sys import argv
    from kivy.base import runTouchApp
    from kivy.uix.scrollview import ScrollView

    # NOTE(review): 'c' is a hard-coded source and ``argv`` is unused --
    # presumably the path was meant to come from the command line; confirm.
    fn = 'c'
    root = PDFDocumentWidget(cols=1, source=fn)
376
377
378
379
380
381
from kivy.app import App
from kivy.lang import Builder
from kivy.uix.recycleview import RecycleView
from kivy.uix.screenmanager import ScreenManager, Screen
from kivy.uix.boxlayout import BoxLayout
from kivy.uix.gridlayout import GridLayout
from kivy.config import Config
from knopki import Knopki

# Fixed-size 640x960 window.
# NOTE(review): these options are set after other Kivy modules have been
# imported -- confirm that they still take effect at this point.
Config.set("graphics", 'resizable', 0)
Config.set("graphics", 'height', 960)
Config.set("graphics", 'width', 640)
# KV rules for the library application screens. KV is
# indentation-sensitive, so the rule bodies are nested explicitly.
Builder.load_string("""
#:import Knopki knopki
<Button1@Button>:
    background_color:255, 255, 255, 1
    size:300,150
    size_hint:None, None

<Image>:
    source:'фон.jpg'
    allow_stretch: True

<Label>:
    color:0,0,0,1
    font_size:30

<BoxLayout>:
    orientation:'vertical'
    spacing:200
    padding:170,200
    background_color:1,1,1,1

<GridLayout10@GridLayout>:
    cols:2
    spacing:10,40
    padding:15

<MainScreen>:
    name:"Menu"
    Image:
    BoxLayout:
        Button1:
            on_press:root.manager.current="Subjects"
            text:"Subjects"
        Button1:
            text:"Special"
            on_press:root.manager.current="Special"


<SubjectsScreen>:
    name:"Subjects"
    Image:
    BoxLayout:
        Button1:
            text:"10 class"
            on_press:root.manager.current="SubjectsScreen10"
        Button1:
            text:"11 class"
            on_press:root.manager.current="SubjectsScreen11"

<SubjectsScreen10>:
    name:"SubjectsScreen10"
    Image:
    GridLayout10:
        Button1:
            text:"Геометрия"
        Button1:
            text:"Алгебра"
        Button1:
            text:"Химия"
        Button1:
            text:"Физика"
        Button1:
            text:"Русская литература"
        Button1:
            text:"Русский язык"
            on_press:root.russ_yaz()
        Button1:
            text:"География"
        Button1:
            text:"Биология"
        Button1:
            text:"Белорусский язык"

<SubjectsScreen11>:
    name:"SubjectsScreen11"
    Image:
    GridLayout10:
        Button1:
            text:"Алгебра"
        Button1:
            text:"Геометрия"
        Button1:
            text:"Физика"
        Button1:
            text:"Химия"
        Button1:
            text:"Русская литература"
        Button1:
            text:"Русский язык"
            # the button itself has no russ_yaz; call the app's method
            on_press:app.russ_yaz()

        Button1:
            text:"География"
        Button1:
            text:"Биология"
        Button1:
            text:"Белорусский язык"
        Button1:
            text:"Астрономия"

<SpecialSubjectsScreen>:
    name:"Special"
    Image:
    AnchorLayout:
        Button1:
            text:"No material(Back)"
            on_press: root.manager.current='Menu'
""")
502
503
class MainScreen(Screen):
    """Screen for the <MainScreen> KV rule; adds no Python behaviour."""
506
507
class SpecialSubjectsScreen(Screen):
    """Screen for the <SpecialSubjectsScreen> KV rule; adds no Python behaviour."""
510
511
class SubjectsScreen(Screen):
    """Screen for the <SubjectsScreen> KV rule; adds no Python behaviour."""
514
515
class SubjectsScreen10(Screen):
    """Screen for the <SubjectsScreen10> KV rule."""

    def russ_yaz(self):
        """Show the Russian language textbook inside the running app.

        The PDF is displayed in a screen of its own, added to this
        screen's manager on first use and made current.  No second
        application loop is started.
        """
        fn = 'Дудников, А.В Русский язык.pdf'
        manager = self.manager
        if not manager.has_screen(fn):
            viewer = Screen(name=fn)
            viewer.add_widget(PDFDocumentWidget(source=fn, cols=1))
            manager.add_widget(viewer)
        manager.current = fn
522
523
524
class SubjectsScreen11(Screen):
    """Screen for the <SubjectsScreen11> KV rule; adds no Python behaviour."""
527
528
# Module-level screen manager holding every screen of the application,
# registered in the order listed.
sm = ScreenManager()
for _screen in (
    MainScreen(name='Menu'),
    SpecialSubjectsScreen(name='Special'),
    SubjectsScreen(name="Subjects"),
    SubjectsScreen10(name="SubjectsScreen10"),
    SubjectsScreen11(name="SubjectsScreen11"),
):
    sm.add_widget(_screen)
del _screen
535
536
class LibraryApp(App):
    """Application whose root widget is the module-level screen manager."""

    def russ_yaz(self):
        """Show the Russian language textbook inside the running app.

        The PDF is displayed in a screen of its own, added to the root
        screen manager on first use and made current.  No second
        application loop is started.
        """
        fn = 'Дудников, А.В Русский язык.pdf'
        manager = self.root
        if not manager.has_screen(fn):
            viewer = Screen(name=fn)
            viewer.add_widget(PDFDocumentWidget(source=fn, cols=1))
            manager.add_widget(viewer)
        manager.current = fn

    def build(self):
        """Return the module-level screen manager as the root widget."""
        return sm
544
545
546
if __name__ == '__main__':
    LibraryApp().run()

    # ``runTouchApp`` and ``root`` are only defined when the file is run
    # as a script, so this call must stay under the same guard.
    runTouchApp(root)
551runTouchApp(root)