# Paste metadata: 6 years ago · Nov 08, 2019, 03:10 AM
1from os.path import exists
2from tempfile import mkdtemp, mkstemp
3from shutil import rmtree
4from binascii import b2a_hex
5from os import write, close
6from threading import Thread
7from time import sleep
8
9from kivy.config import Config
10import kivy.graphics
11from kivy.graphics.vertex_instructions import Line
12from pdfminer.pdfpage import PDFPage
13from pdfminer.pdfparser import PDFParser
14from pdfminer.converter import PDFPageAggregator
15from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
16from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
17from pdfminer.layout import (
18 LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar, LTCurve,
19 LTLine, LTRect,
20)
21
22from kivy.lang import Builder
23from kivy.clock import Clock
24import kivy.graphics
25from kivy.graphics import Mesh, Color
26from kivy.graphics.tesselator import Tesselator
27
28from kivy.uix.widget import Widget
29from kivy.uix.recycleview import RecycleView
30from kivy.uix.label import Label
31from kivy.uix.image import Image
32from kivy.uix.relativelayout import RelativeLayout
33from kivy.uix.boxlayout import BoxLayout
34
35from kivy.properties import (
36 StringProperty, ListProperty, NumericProperty, AliasProperty,
37 DictProperty, ObjectProperty, BooleanProperty, ColorProperty,
38)
39
# Window options for the viewer: not resizable, 640 wide by 960 high.
Config.set("graphics", 'resizable', 0)
Config.set("graphics", 'height', 960)
Config.set("graphics", 'width', 640)

# kv rules for the PDF widgets defined below. The nesting of the rules is
# significant to the kv parser, so each property/child is indented under the
# rule (or child widget) it belongs to.
Builder.load_string('''
#:import RGBA kivy.utils.rgba

<PDFDocumentWidget>:
    viewclass: 'PDFPageWidget'
    key_size: 'size'
    # async load is buggy at the moment
    # async_load: True

    RecycleGridLayout:
        spacing: 5
        cols: root.cols
        rows: root.rows
        size_hint: None, None
        size: self.minimum_size
        default_size_hint: None, None

<PDFPageWidget>:
    size_hint: None, None

    canvas.before:
        Color:
            rgba: RGBA('FFFFFF')
        Rectangle:
            size: self.size

<PDFLabelWidget,PDFImageWidget>:
    size_hint: None, None

<PDFImageWidget>:
    pos: self.bbox[:2]
    size: self.bbox[2] - self.x, self.bbox[3] - self.y

<PDFLabelWidget>:
    text_size: self.width, None
    height: self.texture_size[1]
    color: RGBA('000000')
    font_size: 4

<PDFCurveWidget>:
''')
84
85
class PDFDocumentWidget(RecycleView):
    """RecycleView listing the pages of the PDF file named by ``source``.

    Loading a file parses it with pdfminer and fills ``data`` with one dict
    per page (``manager``, ``page``, ``size``). The widget also owns the
    shared pdfminer ``device``/``interpreter`` pair and a temporary
    directory (``_tmpdir``) that page widgets use for extracted images.
    """

    source = StringProperty()
    password = StringProperty()
    cols = NumericProperty(None)
    rows = NumericProperty(None)
    _toc = ListProperty()
    async_load = BooleanProperty(False)

    def __init__(self, **kwargs):
        super(PDFDocumentWidget, self).__init__(**kwargs)
        self._fp = None
        self._document = None
        self._tmpdir = None
        self.bind(source=self.load)
        # ``source`` passed as a kwarg was applied before the binding above
        # existed, so load it explicitly.
        if self.source:
            self.load()

    def load(self, *args):
        """(Re)load the document named by ``source``.

        Also used as the observer of ``source``; the positional arguments
        kivy passes to observers are accepted and ignored.

        An empty or non-existing path clears the current document, the page
        data and the temporary image directory. I/O errors while opening or
        reading the file are printed and leave the widget empty.
        """
        if self._fp:
            # close the previous pdf file
            self._fp.close()
            self._fp = None

        # whatever happens next, the previously parsed document is stale
        self._document = None

        pdf_doc = self.source
        if not pdf_doc or not exists(pdf_doc):
            # ``pages`` and ``_doc`` are not read in this class; they are
            # still reset here in case other code relies on them.
            self.pages = []
            self._doc = []
            self._toc = []
            self.data = []
            if self._tmpdir:
                rmtree(self._tmpdir)
                self._tmpdir = None
            # nothing to open: do not fall through to open() below
            return

        try:
            # open the pdf file; it stays open because pages are
            # interpreted later, when their widgets are displayed
            self._fp = fp = open(pdf_doc, 'rb')
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document
            # structure, supplying the password for encrypted files
            doc = PDFDocument(parser, password=self.password)
            # connect the parser and document objects
            parser.set_document(doc)

            self._document = doc
            self._parse_toc()
            self._create_tmpdir()
            self._parse_pages()
        except IOError as e:
            # the file could not be opened or read
            print(e)
            self._document = None
            self.data = []

    def _create_tmpdir(self):
        """Create the temporary image directory on first use and return it."""
        if not self._tmpdir:
            self._tmpdir = mkdtemp()
        return self._tmpdir

    def _parse_toc(self):
        """Store the outline of the open document in ``_toc``.

        ``_toc`` becomes a list of ``(level, title)`` tuples; it is empty
        when the document has no outline. Entries collected before an
        unexpected failure are kept.
        """
        toc = []
        try:
            outlines = self._document.get_outlines()
            for level, title, _dest, _action, _se in outlines:
                toc.append((level, title))
        except PDFNoOutlines:
            # a document without outline is not an error
            pass
        except Exception as e:
            # the table of contents is best-effort: report and carry on
            print('could not read the pdf outline: {!r}'.format(e))
        finally:
            self._toc = toc

    def _parse_pages(self):
        """Fill ``data`` with one entry per page of the open document.

        Also (re)creates the ``device`` and ``interpreter`` attributes that
        page widgets use to lay out their page on demand.
        """
        doc = self._document
        if not doc:
            self.data = []
            return

        rsrcmgr = PDFResourceManager()
        self.device = device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        self.interpreter = PDFPageInterpreter(rsrcmgr, device)

        self.data = [
            {
                'manager': self,
                'page': page,
                # last two MediaBox entries are used as the page size
                'size': page.attrs.get('MediaBox', [0, 0, 0, 0])[2:],
            }
            for page in PDFPage.create_pages(doc)
        ]
180
181
class PDFImageWidget(Image):
    """Image placed on a page from a bounding box.

    The kv rule for this class derives ``pos`` and ``size`` from ``bbox``.
    """

    # bounding box as [x0, y0, x1, y1]
    bbox = ListProperty([0, 0, 100, 100])
184
185
class PDFLabelWidget(Label):
    """Label used for a piece of page text; styled by its kv rule."""

    # four-number box, same default as PDFImageWidget.bbox
    bbox = ListProperty([0, 0, 100, 100])
188
189
class PDFCurveWidget(Widget):
    """Draws one curve (line, rectangle or path) on its own canvas.

    The canvas is rebuilt, at most once per frame, whenever one of the
    drawing properties changes.
    """

    points = ListProperty()
    line_width = NumericProperty()
    stroke = BooleanProperty(False)
    fill = BooleanProperty(False)
    even_odd = BooleanProperty()
    color = ColorProperty()
    fill_color = ColorProperty()

    def __init__(self, **kwargs):
        super(PDFCurveWidget, self).__init__(**kwargs)
        build = Clock.create_trigger(self.build, 0)

        self.bind(
            points=build,
            line_width=build,
            stroke=build,
            fill=build,
            even_odd=build,
            color=build,
            fill_color=build
        )
        # Values passed as kwargs were applied before the bindings above
        # existed, so schedule the first draw explicitly.
        build()

    def build(self, tess=None, *args):
        """Redraw the curve from the current property values.

        :param tess: unused; kept for interface compatibility. When called
            through the Clock trigger this receives the elapsed time.
        """
        self.canvas.clear()
        points = self.points
        if not points:
            return

        # Accept both [(x, y), ...] and an already flat [x, y, ...] list;
        # the tesselator is fed a flat list of coordinates.
        if isinstance(points[0], (list, tuple)):
            flat_points = [coord for point in points for coord in point]
        else:
            flat_points = list(points)

        with self.canvas:
            if self.fill:
                Color(rgba=self.fill_color)
                tesselator = Tesselator()
                tesselator.add_contour(flat_points)
                if tesselator.tesselate():
                    for vertices, indices in tesselator.meshes:
                        Mesh(
                            vertices=vertices,
                            indices=indices,
                            mode='triangle_fan'
                        )
                else:
                    print("mesh didn't tesselate!")

            if self.stroke:
                Color(rgba=self.color)
                Line(
                    points=flat_points,
                    width=self.line_width
                )
239
240
class PDFPageWidget(RelativeLayout):
    """Displays one PDF page as child widgets.

    When ``page`` is set, the page is laid out with the interpreter/device
    owned by ``manager`` and the resulting layout objects are turned into
    labels (text), image widgets and curve widgets.
    """

    # labels created by add_text, keyed by (box_pos, box_size)
    labels = DictProperty()
    attributes = DictProperty()
    # the PDFDocumentWidget providing interpreter, device and _tmpdir
    manager = ObjectProperty()
    page = ObjectProperty()
    # layout objects of the current page
    items = ListProperty()
    # image widgets created by add_image
    images = ListProperty()

    def on_page(self, *args):
        """Lay out the new page, in a thread when the manager asks for it."""
        if not self.page or not self.manager:
            return

        if self.manager.async_load:
            # NOTE(review): the kv rules in this file mark async load as
            # buggy; it is only used when explicitly enabled.
            Thread(target=self._load_page).start()
        else:
            self._load_page()

    def _load_page(self):
        """Run the shared interpreter on ``page`` and store the result."""
        self.manager.interpreter.process_page(self.page)
        self.items = self.manager.device.get_result()

    def on_items(self, *args):
        """Rebuild the child widgets from ``items``."""
        self.clear_widgets()
        # forget widgets of the previous content, otherwise add_text would
        # append to labels that are no longer displayed
        self.labels = {}
        self.images = []
        self._render_content(self.items)

    def _render_content(self, lt_objs):
        """Iterate through the list of LT* objects and create a widget for
        the text, image or curve data contained in each.
        """
        for lt_obj in lt_objs:
            if isinstance(lt_obj, LTChar):
                self.add_text(
                    text=lt_obj.get_text(),
                    box_pos=(lt_obj.x0, lt_obj.y0),
                    box_size=(lt_obj.width, lt_obj.height),
                )

            elif isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # One label per text box/line: this is limited style wise,
                # as font/color information is only available on the
                # individual characters, which are impractical to create
                # separate labels for.
                self.add_text(
                    text=lt_obj.get_text(),
                    box_pos=(lt_obj.x0, lt_obj.y0),
                    box_size=(lt_obj.width, lt_obj.height),
                )

            elif isinstance(lt_obj, LTImage):
                self.add_image(lt_obj)

            elif isinstance(lt_obj, LTFigure):
                # figures are containers: render what they contain
                self._render_content(lt_obj)

            # all of these are actually LTCurves, but all types here for
            # clarity
            elif isinstance(lt_obj, (LTLine, LTRect, LTCurve)):
                self.add_widget(
                    PDFCurveWidget(
                        points=lt_obj.pts or [],
                        line_width=lt_obj.linewidth or 1.0,
                        stroke=lt_obj.stroke,
                        fill=lt_obj.fill,
                        even_odd=lt_obj.evenodd,
                        # fixed colors: the layout object's own colors
                        # are not used here
                        color='#FFFFFFFF',
                        fill_color='#00000000'
                    )
                )

    def save_image(self, lt_image):
        """Try to save the image data from this LTImage object.

        :return: the name of the file written in the manager's temporary
            directory, or None when the image has no data or its type is
            not recognized.
        """
        stream = lt_image.stream
        if not stream:
            return None

        raw_data = stream.get_rawdata()
        if not raw_data:
            return None

        file_ext = self.determine_image_type(raw_data[0:4])
        if not file_ext:
            return None

        # file_ext already starts with a dot
        fd, fn = mkstemp(dir=self.manager._tmpdir, suffix=file_ext)
        try:
            write(fd, raw_data)
        finally:
            # never leak the descriptor, even if the write fails
            close(fd)
        return fn

    @staticmethod
    def determine_image_type(stream_first_4_bytes):
        """Find out the image file type based on the magic number
        comparison of the first 4 (or 2) bytes.

        :return: '.jpeg', '.png', '.gif' or '.bmp', or None when the
            bytes match none of them.
        """
        file_type = None
        bytes_as_hex = b2a_hex(stream_first_4_bytes)
        if bytes_as_hex.startswith(b'ffd8'):
            file_type = '.jpeg'
        elif bytes_as_hex == b'89504e47':
            file_type = '.png'
        elif bytes_as_hex == b'47494638':
            file_type = '.gif'
        elif bytes_as_hex.startswith(b'424d'):
            file_type = '.bmp'
        return file_type

    def add_text(self, text, box_pos, box_size, **kwargs):
        """Show ``text`` in a label at ``box_pos`` with ``box_size``.

        Text added again for the same position and size is appended to the
        existing label. Extra keyword arguments are passed to the label
        when it is created.
        """
        key = (box_pos, box_size)
        label = self.labels.get(key)
        if not label:
            label = PDFLabelWidget(
                text=text, pos=box_pos, size=box_size, **kwargs
            )
            self.labels[key] = label
            self.add_widget(label)
        else:
            label.text += text

    def add_image(self, lt_image):
        """Save the image data of ``lt_image`` and display it in its
        bounding box; does nothing when the image could not be saved.
        """
        source = self.save_image(lt_image)
        if source:
            image = PDFImageWidget(
                source=source,
                bbox=lt_image.bbox
            )
            self.add_widget(image)
            self.images.append(image)
375
376
# Script-only setup: the names below are bound only when this file is run
# directly, not when it is imported.
if __name__ == '__main__':
    from sys import argv
    # runTouchApp is also called by SubjectsScreen10.russ_yaz
    from kivy.base import runTouchApp
    from kivy.uix.scrollview import ScrollView

    # NOTE(review): argv, ScrollView and this module-level fn are not
    # referenced by the code visible in this file - confirm before removing.
    fn = 'c'
383
384from kivy.app import App
385from kivy.lang import Builder
386from kivy.uix.recycleview import RecycleView
387from kivy.uix.screenmanager import ScreenManager, Screen
388from kivy.uix.boxlayout import BoxLayout
389from kivy.uix.gridlayout import GridLayout
390from kivy.config import Config
391
# Window options for the library app. 'resizable' is set to 1 here, after
# having been set to 0 earlier in this file.
Config.set("graphics", 'resizable', 1)
Config.set("graphics", 'height', 960)
Config.set("graphics", 'width', 640)

# kv rules for the menu screens. The nesting of the rules is significant to
# the kv parser, so each property/child is indented under its parent.
# NOTE(review): the <Image>, <Label> and <BoxLayout> rules apply to those
# classes themselves, not to app-specific subclasses.
# NOTE(review): in <SubjectsScreen11> the handler `self.russ_yaz` refers to
# the button and is not called; SubjectsScreen11 defines no russ_yaz either.
Builder.load_string("""
<Button1@Button>:
    background_color:255, 255, 255, 1
    size:300,150
    size_hint:None, None

<Image>:
    source:'фон.jpg'
    allow_stretch: True

<Label>:
    color:0,0,0,1
    font_size:30

<BoxLayout>:
    orientation:'vertical'
    spacing:200
    padding:170,200
    background_color:1,1,1,1

<GridLayout10@GridLayout>:
    cols:2
    spacing:10,40
    padding:15

<MainScreen>:
    name:"Menu"
    Image
    BoxLayout:
        Button1:
            on_press:root.manager.current="Subjects"
            text:"Subjects"
        Button1:
            text:"Special"
            on_press:root.manager.current="Special"


<SubjectsScreen>:
    name:"Subjects"
    Image
    BoxLayout:
        Button1:
            text:"10 class"
            on_press:root.manager.current="SubjectsScreen10"
        Button1:
            text:"11 class"
            on_press:root.manager.current="SubjectsScreen11"

<SubjectsScreen10>:
    name:"SubjectsScreen10"
    Image
    GridLayout10:
        Button1:
            text:"Геометрия"
        Button1:
            text:"Алгебра"
        Button1:
            text:"Химия"
        Button1:
            text:"Физика"
        Button1:
            text:"Русская литература"
        Button1:
            text:"Русский язык"
            on_press: root.russ_yaz()
            on_press: root.close()
        Button1:
            text:"География"
        Button1:
            text:"Биология"
        Button1:
            text:"Белорусский язык"

<SubjectsScreen11>:
    name:"SubjectsScreen11"
    Image
    GridLayout10:
        Button1:
            text:"Алгебра"
        Button1:
            text:"Геометрия"
        Button1:
            text:"Физика"
        Button1:
            text:"Химия"
        Button1:
            text:"Русская литература"
        Button1:
            text:"Русский язык"
            on_press:self.russ_yaz

        Button1:
            text:"География"
        Button1:
            text:"Биология"
        Button1:
            text:"Белорусский язык"
        Button1:
            text:"Астрономия"

<SpecialSubjectsScreen>:
    name:"Special"
    Image
    AnchorLayout:
        Button1:
            text:"No material(Back)"
            on_press: root.manager.current='Menu'
""")
503
504
class MainScreen(Screen):
    """Menu screen; its content is declared in the <MainScreen> kv rule."""
507
508
class SpecialSubjectsScreen(Screen):
    """"Special" screen; its content is declared in the
    <SpecialSubjectsScreen> kv rule.
    """
511
512
class SubjectsScreen(Screen):
    """Class-selection screen; its content is declared in the
    <SubjectsScreen> kv rule.
    """
515
516
class SubjectsScreen10(Screen):
    """Subject list for the 10th class; buttons are declared in the
    <SubjectsScreen10> kv rule, which calls the methods below.
    """

    def russ_yaz(self):
        """Open the Russian language textbook in a PDF viewer.

        NOTE(review): this starts the viewer with runTouchApp while the
        library app is presumably already running - confirm this nesting
        is intended.
        """
        # Imported here because at module level runTouchApp is only
        # imported when the file is run as a script.
        from kivy.base import runTouchApp

        fn = 'Дудников, А.В Русский язык.pdf'
        root = PDFDocumentWidget(source=fn, cols=1)
        runTouchApp(root)

    def close(self):
        """Stop the running application, if there is one."""
        app = App.get_running_app()
        if app is not None:
            app.stop()
526
527
class SubjectsScreen11(Screen):
    """Subject list for the 11th class; its content is declared in the
    <SubjectsScreen11> kv rule.
    """
530
531
# Module-level screen manager holding one instance of every screen; it is
# returned by LibraryApp.build as the root widget.
sm = ScreenManager()
for _screen_cls, _screen_name in (
    (MainScreen, 'Menu'),
    (SpecialSubjectsScreen, 'Special'),
    (SubjectsScreen, "Subjects"),
    (SubjectsScreen10, "SubjectsScreen10"),
    (SubjectsScreen11, "SubjectsScreen11"),
):
    # screens are created and added one at a time, in this order
    sm.add_widget(_screen_cls(name=_screen_name))
del _screen_cls, _screen_name
538
539
class LibraryApp(App):
    """Application whose root widget is the module-level ScreenManager."""

    def build(self):
        """Return the already populated screen manager ``sm``."""
        root_widget = sm
        return root_widget
544
545
if __name__ == '__main__':
    # run the library app only when the file is executed as a script
    app = LibraryApp()
    app.run()