# Timestamp: Nov 08, 2019, 01:46 AM (listed as "6 years ago").
1from os.path import exists
2from tempfile import mkdtemp, mkstemp
3from shutil import rmtree
4from binascii import b2a_hex
5from os import write, close
6from threading import Thread
7from time import sleep
8
9from kivy.config import Config
10from kivy.graphics.vertex_instructions import Line
11from pdfminer.pdfpage import PDFPage
12from pdfminer.pdfparser import PDFParser
13from pdfminer.converter import PDFPageAggregator
14from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
15from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
16from pdfminer.layout import (
17 LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar, LTCurve,
18 LTLine, LTRect,
19)
20
21from kivy.lang import Builder
22from kivy.clock import Clock
23import kivy.graphics
24from kivy.graphics import Mesh, Color
25from kivy.graphics.tesselator import Tesselator
26
27from kivy.uix.widget import Widget
28from kivy.uix.recycleview import RecycleView
29from kivy.uix.label import Label
30from kivy.uix.image import Image
31from kivy.uix.relativelayout import RelativeLayout
32from kivy.uix.boxlayout import BoxLayout
33
34from kivy.properties import (
35 StringProperty, ListProperty, NumericProperty, AliasProperty,
36 DictProperty, ObjectProperty, BooleanProperty, ColorProperty,
37)
38
# Window configuration for the PDF viewer: fixed-size 640x960 window.
Config.set("graphics", 'resizable', 0)
Config.set("graphics", 'height', 960)
Config.set("graphics", 'width', 640)

# kv rules for the PDF widgets.  kv is indentation-sensitive, so the rule
# bodies must be nested under their headers exactly as laid out here.
Builder.load_string('''
#:import RGBA kivy.utils.rgba

<PDFDocumentWidget>:
    viewclass: 'PDFPageWidget'
    key_size: 'size'
    # async load is buggy at the moment
    # async_load: True

    RecycleGridLayout:
        spacing: 5
        cols: root.cols
        rows: root.rows
        size_hint: None, None
        size: self.minimum_size
        default_size_hint: None, None

<PDFPageWidget>:
    size_hint: None, None

    canvas.before:
        Color:
            rgba: RGBA('FFFFFF')
        Rectangle:
            size: self.size

<PDFLabelWidget,PDFImageWidget>:
    size_hint: None, None

<PDFImageWidget>:
    pos: self.bbox[:2]
    size: self.bbox[2] - self.x, self.bbox[3] - self.y

<PDFLabelWidget>:
    text_size: self.width, None
    height: self.texture_size[1]
    color: RGBA('000000')
    font_size: 4

<PDFCurveWidget>:
''')
83
84
class PDFDocumentWidget(RecycleView):
    """RecycleView whose data items are the pages of a PDF file.

    Setting ``source`` (re)loads the document: the file is opened, parsed
    with pdfminer, its outline is stored in ``_toc`` and one data entry per
    page is published through ``data`` for the view class to render.

    Properties:
        source: path of the PDF file to display.
        password: password handed to pdfminer when opening the document.
        cols / rows: forwarded to the grid layout by the kv rule.
        async_load: read by the page widgets to decide whether to lay out
            a page in a background thread.
    """
    source = StringProperty()
    password = StringProperty()
    cols = NumericProperty(None)
    rows = NumericProperty(None)
    _toc = ListProperty()
    async_load = BooleanProperty(False)

    def __init__(self, **kwargs):
        super(PDFDocumentWidget, self).__init__(**kwargs)
        self._fp = None
        self._document = None
        self._tmpdir = None
        self.bind(source=self.load)
        # `source` given as a constructor kwarg is set before the bind
        # above, so it has to be loaded explicitly.
        if self.source:
            self.load()

    def load(self, *args):
        """Open ``self.source`` and publish its pages.

        When the source is empty or does not exist, all document state is
        reset (including the temporary image directory) and the view is
        emptied.  I/O errors while opening the file are printed, not raised.
        """
        if self._fp:
            # close the previous pdf file and forget the stale handle
            self._fp.close()
            self._fp = None

        pdf_doc = self.source
        if not pdf_doc or not exists(pdf_doc):
            self.pages = []
            self._doc = []
            self._document = None
            self._toc = []
            self.data = []
            if self._tmpdir:
                rmtree(self._tmpdir)
                self._tmpdir = None
            # nothing to open: do not fall through to open() below
            return

        try:
            # open the pdf file; it stays open because pages are parsed
            # lazily by the page widgets
            self._fp = fp = open(pdf_doc, 'rb')
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document
            # structure, supplying the configured password
            doc = PDFDocument(parser, password=self.password)
            # connect the parser and document objects
            parser.set_document(doc)

            self._document = doc
            self._parse_toc()
            self._create_tmpdir()
            self._parse_pages()
        except IOError as e:
            # the file doesn't exist or similar problem
            print(e)

    def _create_tmpdir(self):
        """Return the temporary directory for extracted images, creating
        it on first use."""
        if not self._tmpdir:
            self._tmpdir = mkdtemp()
        return self._tmpdir

    def _parse_toc(self):
        """Store the outline of the open document in ``_toc`` as a list of
        ``(level, title)`` tuples; the list is empty when there is none."""
        toc = []
        doc = self._document
        try:
            for (level, title, dest, a, se) in doc.get_outlines():
                toc.append((level, title))
        except PDFNoOutlines:
            # the document simply has no table of contents
            pass
        except Exception as e:
            # outline parsing stays best-effort, but is no longer silent
            print('could not read the PDF outline: {}'.format(e))
        finally:
            self._toc = toc

    def _parse_pages(self):
        """Build one RecycleView data entry per page of the open document
        and set up the shared pdfminer interpreter/device the page widgets
        use."""
        doc = self._document
        if not doc:
            self.data = []
            return

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        self.device = device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, device)

        self.data = [
            {
                'manager': self,
                'page': page,
                # last two MediaBox entries are used as the page size
                'size': page.attrs.get('MediaBox', [0, 0, 0, 0])[2:],
            }
            for page in PDFPage.create_pages(doc)
        ]
179
180
class PDFImageWidget(Image):
    """Image widget carrying a ``bbox`` list (default ``[0, 0, 100, 100]``)."""

    bbox = ListProperty([0, 0, 100, 100])
183
184
class PDFLabelWidget(Label):
    """Label widget carrying a ``bbox`` list (default ``[0, 0, 100, 100]``)."""

    bbox = ListProperty([0, 0, 100, 100])
187
188
class PDFCurveWidget(Widget):
    """Draw a single path as an optional filled shape and/or outline.

    Properties:
        points: the path, either a flat ``[x1, y1, x2, y2, ...]`` list or
            a list of ``(x, y)`` pairs.
        line_width: outline width.
        stroke / fill: which of outline and filled interior to draw.
        even_odd: stored but not used by the drawing code.
        color / fill_color: outline and interior colours.
    """
    points = ListProperty()
    line_width = NumericProperty()
    stroke = BooleanProperty(False)
    fill = BooleanProperty(False)
    even_odd = BooleanProperty()
    color = ColorProperty()
    fill_color = ColorProperty()

    def __init__(self, **kwargs):
        super(PDFCurveWidget, self).__init__(**kwargs)
        build = Clock.create_trigger(self.build, 0)

        self.bind(
            points=build,
            line_width=build,
            stroke=build,
            fill=build,
            even_odd=build,
            color=build,
            fill_color=build
        )
        # Values passed as constructor kwargs were applied before the bind
        # above and so never fire it; schedule the first draw explicitly.
        build()

    def _flat_points(self):
        """Return ``points`` as a flat list of coordinates."""
        flat = []
        for item in self.points:
            if isinstance(item, (list, tuple)):
                flat.extend(item)
            else:
                flat.append(item)
        return flat

    def build(self, tess=None, *args):
        """Redraw the canvas from the current property values.

        ``tess`` only absorbs the first positional argument supplied by the
        Clock trigger and is not used.
        """
        self.canvas.clear()
        if not self.points:
            return

        with self.canvas:
            if self.fill:
                Color(rgba=self.fill_color)
                tesselator = Tesselator()
                # the tesselator wants a flat list of coordinates
                tesselator.add_contour(self._flat_points())
                if tesselator.tesselate():
                    for vertices, indices in tesselator.meshes:
                        Mesh(
                            vertices=vertices,
                            indices=indices,
                            mode='triangle_fan'
                        )
                else:
                    print("mesh didn't tesselate!")

            if self.stroke:
                Color(rgba=self.color)
                Line(
                    points=self.points,
                    width=self.line_width
                )
238
239
class PDFPageWidget(RelativeLayout):
    """Render one PDF page as child widgets.

    When ``page`` is set, the page is laid out with the interpreter/device
    owned by ``manager`` and the resulting layout objects are stored in
    ``items``; every change of ``items`` rebuilds the children.

    Properties:
        labels: text widgets of the current page keyed by
            ``(box_pos, box_size)``.
        attributes: not used by the code in this class.
        manager: the document widget that provides ``interpreter``,
            ``device``, ``async_load`` and ``_tmpdir``.
        page: the pdfminer page to display.
        items: layout objects produced for ``page``.
        images: image widgets created by ``add_image``.
    """
    labels = DictProperty()
    attributes = DictProperty()
    manager = ObjectProperty()
    page = ObjectProperty()
    items = ListProperty()
    images = ListProperty()

    def on_page(self, *args):
        """Lay out the new page, in a thread when the manager asks for it."""
        if self.page is None or self.manager is None:
            return
        if self.manager.async_load:
            Thread(target=self._load_page).start()
        else:
            self._load_page()

    def _load_page(self):
        """Run the pdfminer interpreter on ``page`` and publish the result."""
        manager = self.manager
        manager.interpreter.process_page(self.page)
        result = manager.device.get_result()
        if manager.async_load:
            # This runs in a worker thread; hand the result to the Kivy
            # clock so the widget tree is only modified from the main loop.
            # NOTE(review): interpreter and device are shared by all pages,
            # so concurrent loads can still interfere with each other.
            Clock.schedule_once(lambda dt: self._apply_items(result), 0)
        else:
            self._apply_items(result)

    def _apply_items(self, result):
        """Store the layout result, which triggers ``on_items``."""
        self.items = result

    def on_items(self, *args):
        """Drop the current children and rebuild them from ``items``."""
        self.clear_widgets()
        # the bookkeeping must not keep pointing at the removed widgets
        self.labels = {}
        self.images = []
        self._render_content(self.items)

    def _render_content(self, lt_objs):
        """Iterate through the list of LT* objects and create a widget for
        the text, image or curve data contained in each
        """
        for lt_obj in lt_objs:
            if isinstance(lt_obj, LTChar):
                self.add_text(
                    text=lt_obj.get_text(),
                    box_pos=(lt_obj.x0, lt_obj.y0),
                    box_size=(lt_obj.width, lt_obj.height),
                )

            elif isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # One label per text box/line.  This is very limited style
                # wise: pdfminer only exposes font information per
                # character, which is impractical to create direct labels
                # for.
                self.add_text(
                    text=lt_obj.get_text(),
                    box_pos=(lt_obj.x0, lt_obj.y0),
                    box_size=(lt_obj.width, lt_obj.height),
                )

            elif isinstance(lt_obj, LTImage):
                self.add_image(lt_obj)

            elif isinstance(lt_obj, LTFigure):
                # figures are containers: render their children
                self._render_content(lt_obj)

            # all of these are actually LTCurves, but all types here for
            # clarity
            elif isinstance(lt_obj, (LTLine, LTRect, LTCurve)):
                self.add_widget(
                    PDFCurveWidget(
                        points=lt_obj.pts or [],
                        line_width=lt_obj.linewidth or 1.0,
                        stroke=lt_obj.stroke,
                        fill=lt_obj.fill,
                        even_odd=lt_obj.evenodd,
                        # colors seem to be indices, to some dict i
                        # can't find in what pdfminer exposes
                        color='#FFFFFFFF',  # lt_obj.stroking_color or
                        fill_color='#00000000'  # lt_obj.non_stroking_color or
                    )
                )

    def save_image(self, lt_image):
        """Try to save the image data from this LTImage object, and
        return the file name if successful, else None
        """
        if not lt_image.stream:
            return None
        file_stream = lt_image.stream.get_rawdata()
        if not file_stream:
            return None
        file_ext = self.determine_image_type(file_stream[0:4])
        if not file_ext:
            return None
        # determine_image_type() already includes the leading dot
        suffix = '.' + file_ext.lstrip('.')
        fd, fn = mkstemp(dir=self.manager._tmpdir, suffix=suffix)
        try:
            write(fd, file_stream)
        finally:
            close(fd)
        return fn

    @staticmethod
    def determine_image_type(stream_first_4_bytes):
        """Find out the image file type based on the magic number comparison
        of the first 4 (or 2) bytes; returns the extension including the
        leading dot, or None when the type is not recognised"""
        bytes_as_hex = b2a_hex(stream_first_4_bytes)
        signatures = (
            (b'ffd8', '.jpeg'),
            (b'89504e47', '.png'),
            (b'47494638', '.gif'),
            (b'424d', '.bmp'),
        )
        for magic, file_type in signatures:
            if bytes_as_hex.startswith(magic):
                return file_type
        return None

    def add_text(self, text, box_pos, box_size, **kwargs):
        """Show ``text`` in the box at ``box_pos`` with ``box_size``.

        A second call for the same box appends to the existing label
        instead of stacking another widget on top of it.
        """
        key = (box_pos, box_size)
        label = self.labels.get(key)
        if label is None:
            label = PDFLabelWidget(
                text=text, pos=box_pos, size=box_size, **kwargs
            )
            self.labels[key] = label
            self.add_widget(label)
        else:
            label.text += text

    def add_image(self, lt_image):
        """Save the image data of ``lt_image`` to a file and show it at the
        object's bounding box; does nothing when the data can't be saved."""
        source = self.save_image(lt_image)
        if source:
            image = PDFImageWidget(
                source=source,
                bbox=lt_image.bbox
            )
            self.add_widget(image)
            self.images.append(image)
374
375
if __name__ == '__main__':
    # Names that are only brought into scope when the file runs as a script.
    from kivy.base import runTouchApp
    from kivy.uix.scrollview import ScrollView
    from sys import argv

    fn = 'c'
382
383from kivy.app import App
384from kivy.lang import Builder
385from kivy.uix.recycleview import RecycleView
386from kivy.uix.screenmanager import ScreenManager, Screen
387from kivy.uix.boxlayout import BoxLayout
388from kivy.uix.gridlayout import GridLayout
389from kivy.config import Config
390
# Window configuration for the library menu (resizable, 640x960).
Config.set("graphics", 'resizable', 1)
Config.set("graphics", 'height', 960)
Config.set("graphics", 'width', 640)

# kv rules for the menu screens.  kv is indentation-sensitive, so children
# and properties must be nested under their parent exactly as laid out here.
# NOTE(review): in <SubjectsScreen11> the handler `on_press:self.russ_yaz`
# refers to the button itself, and neither the button nor SubjectsScreen11
# defines `russ_yaz` in this file -- confirm the intended target.
Builder.load_string("""
<Button1@Button>:
    background_color:255, 255, 255, 1
    size:300,150
    size_hint:None, None

<Image>:
    source:'фон.jpg'
    allow_stretch: True

<Label>:
    color:0,0,0,1
    font_size:30

<BoxLayout>:
    orientation:'vertical'
    spacing:200
    padding:170,200
    background_color:1,1,1,1

<GridLayout10@GridLayout>:
    cols:2
    spacing:10,40
    padding:15

<MainScreen>:
    name:"Menu"
    Image
    BoxLayout:
        Button1:
            on_press:root.manager.current="Subjects"
            text:"Subjects"
        Button1:
            text:"Special"
            on_press:root.manager.current="Special"


<SubjectsScreen>:
    name:"Subjects"
    Image
    BoxLayout:
        Button1:
            text:"10 class"
            on_press:root.manager.current="SubjectsScreen10"
        Button1:
            text:"11 class"
            on_press:root.manager.current="SubjectsScreen11"

<SubjectsScreen10>:
    name:"SubjectsScreen10"
    Image
    GridLayout10:
        Button1:
            text:"Геометрия"
        Button1:
            text:"Алгебра"
        Button1:
            text:"Химия"
        Button1:
            text:"Физика"
        Button1:
            text:"Русская литература"
        Button1:
            text:"Русский язык"
            on_press: root.russ_yaz()
            on_press: root.close()
        Button1:
            text:"География"
        Button1:
            text:"Биология"
        Button1:
            text:"Белорусский язык"

<SubjectsScreen11>:
    name:"SubjectsScreen11"
    Image
    GridLayout10:
        Button1:
            text:"Алгебра"
        Button1:
            text:"Геометрия"
        Button1:
            text:"Физика"
        Button1:
            text:"Химия"
        Button1:
            text:"Русская литература"
        Button1:
            text:"Русский язык"
            on_press:self.russ_yaz

        Button1:
            text:"География"
        Button1:
            text:"Биология"
        Button1:
            text:"Белорусский язык"
        Button1:
            text:"Астрономия"

<SpecialSubjectsScreen>:
    name:"Special"
    Image
    AnchorLayout:
        Button1:
            text:"No material(Back)"
            on_press: root.manager.current='Menu'
""")
502
503
class MainScreen(Screen):
    """Screen subclass with no Python-side behaviour; see the kv rule of
    the same name."""
506
507
class SpecialSubjectsScreen(Screen):
    """Screen subclass with no Python-side behaviour; see the kv rule of
    the same name."""
510
511
class SubjectsScreen(Screen):
    """Screen subclass with no Python-side behaviour; see the kv rule of
    the same name."""
514
515
class SubjectsScreen10(Screen):
    """Screen with the handlers used by its kv rule: ``russ_yaz`` and
    ``close``."""

    def russ_yaz(self):
        """Show the Russian-language textbook PDF in a PDFDocumentWidget."""
        # Imported here so the method does not depend on an import that
        # only happens when the file is run as a script.
        from kivy.base import runTouchApp

        fn = 'Дудников, А.В Русский язык.pdf'
        root = PDFDocumentWidget(source=fn, cols=1)
        runTouchApp(root)

    def close(self):
        """Stop the running application, if there is one."""
        app = App.get_running_app()
        if app is not None:
            app.stop()
524
525
526
class SubjectsScreen11(Screen):
    """Screen subclass with no Python-side behaviour; see the kv rule of
    the same name."""
529
530
# Module-level screen manager holding one instance of every screen,
# registered in this order.
sm = ScreenManager()
for screen_cls, screen_name in (
    (MainScreen, 'Menu'),
    (SpecialSubjectsScreen, 'Special'),
    (SubjectsScreen, "Subjects"),
    (SubjectsScreen10, "SubjectsScreen10"),
    (SubjectsScreen11, "SubjectsScreen11"),
):
    sm.add_widget(screen_cls(name=screen_name))
537
538
class LibraryApp(App):
    """Application whose root widget is the module-level screen manager."""

    def build(self):
        """Return the module-level ``sm`` as the root widget."""
        return sm
544
545
# Script entry point.  The module-level `runTouchApp(root)` call that used
# to follow was dropped: no module-level `root` exists, so it could only
# raise NameError once the app had exited.
if __name__ == '__main__':
    LibraryApp().run()