· 6 years ago · Jun 24, 2019, 04:18 PM
1<?php
2/**************************************************************************************************************
3
4 NAME
5 PdfToText.phpclass
6
7 DESCRIPTION
8 A class for extracting text from Pdf files.
9 Usage is very simple : just instantiate a PdfToText object, specifying an input filename, then use the
10 Text property to retrieve PDF textual contents :
11
12 $pdf = new PdfToText ( 'sample.pdf' ) ;
13 echo $pdf -> Text ; // or : echo ( string ) $pdf ;
14
15 Or :
16
17 $pdf = new PdfToText ( ) ;
18 // Modify any property here before loading the file ; for example :
19 // $pdf -> BlockSeparator = " " ;
20 $pdf -> Load ( 'sample.pdf' ) ;
21 echo $pdf -> Text ;
22
23 AUTHOR
24 Christian Vigh, 04/2016.
25
26 HISTORY
27 [Version : 1.6.7] [Date : 2017/05/31] [Author : CV]
28 . Added CID fonts
29 . Changed the way CID font maps are searched and handled
30
31 (...)
32
33 [Version : 1.0] [Date : 2016/04/16] [Author : CV]
34 Initial version.
35
36 **************************************************************************************************************/
37
38
39/*==============================================================================================================
40
41 class PdfToTextException et al -
42 Implements an exception thrown when an error is encountered while decoding PDF files.
43
44 ==============================================================================================================*/
45
/**
 * PdfToTextException -
 *      Common ancestor of the PdfToText exception classes declared below ; catching this type catches
 *      any of them.
 */
class PdfToTextException extends Exception
{
    // Static flag, false by default
    public static $IsObject = false ;
}
52
53
/**
 * PdfToTextDecodingException -
 *      Thrown when unexpected data is encountered while analyzing PDF contents.
 *      The exception message has the following form :
 *              Pdf decoding error[ (object #<id>)] : <message>
 */
class PdfToTextDecodingException extends PdfToTextException
{
    /**
     * @param string    $message        Description of the problem.
     * @param mixed     $object_id      Id of the PDF object involved ; when false (the default), the
     *                                  "(object #...)" part is left out of the message.
     */
    public function __construct ( $message, $object_id = false )
    {
        $location = ( $object_id === false ) ? '' : " (object #$object_id)" ;

        parent::__construct ( "Pdf decoding error{$location} : $message" ) ;
    }
}
70
71
/**
 * PdfToTextDecryptionException -
 *      Thrown when something unexpected is encountered while processing encrypted data.
 *      The exception message has the following form :
 *              Pdf decryption error[ (object #<id>)] : <message>
 */
class PdfToTextDecryptionException extends PdfToTextException
{
    /**
     * @param string    $message        Description of the problem.
     * @param mixed     $object_id      Id of the PDF object involved ; when false (the default), the
     *                                  "(object #...)" part is left out of the message.
     */
    public function __construct ( $message, $object_id = false )
    {
        $location = ( $object_id === false ) ? '' : " (object #$object_id)" ;

        parent::__construct ( "Pdf decryption error{$location} : $message" ) ;
    }
}
88
89
/**
 * PdfToTextTimeoutException -
 *      Thrown when the PDFOPT_ENFORCE_EXECUTION_TIME or PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME option is set
 *      and the script took longer than the allowed execution time limit.
 */
class PdfToTextTimeoutException extends PdfToTextException
{
    // True when the time limit was reached because of too many invocations of the Load() method ;
    // false when it was reached while processing one single PDF file
    public $GlobalTimeout ;

    /**
     * @param string    $message        Description of the problem.
     * @param boolean   $global         Value stored in the GlobalTimeout property ; when false, the message
     *                                  mentions that the limit applies to one single file.
     * @param mixed     $php_setting    Php time limit, reported in seconds in the message.
     * @param mixed     $class_setting  Class time limit, reported in seconds in the message.
     */
    public function __construct ( $message, $global, $php_setting, $class_setting )
    {
        $scope = ( $global ) ? '' : 'for one single file ' ;

        $this -> GlobalTimeout = $global ;

        parent::__construct
           (
            "PdfToText max execution time reached {$scope}" .
            "(php limit = {$php_setting}s, class limit = {$class_setting}s) : $message"
           ) ;
    }
}
113
114
/**
 * PdfToTextFormException -
 *      Thrown if the xml template passed to the GetFormData() method contains an error.
 *      The supplied message is prefixed with "Pdf form template error : ".
 */
class PdfToTextFormException extends PdfToTextException
{
    /**
     * @param string    $message        Description of the template error.
     */
    public function __construct ( $message )
    {
        parent::__construct ( 'Pdf form template error' . " : $message" ) ;
    }
}
128
129
/**
 * PdfToTextCaptureException -
 *      Thrown if the xml template passed to the SetCaptures() method contains an error.
 *      The supplied message is prefixed with "Pdf capture template error : ".
 */
class PdfToTextCaptureException extends PdfToTextException
{
    /**
     * @param string    $message        Description of the template error.
     */
    public function __construct ( $message )
    {
        parent::__construct ( 'Pdf capture template error' . " : $message" ) ;
    }
}
143
144
145
146/*==============================================================================================================
147
148 Custom error reporting functions.
149
150 ==============================================================================================================*/
// warning -
//      Raises a user-level warning (E_USER_WARNING) carrying the specified message.
//      The function is only declared when no warning() function exists yet.
if ( ! function_exists ( 'warning' ) )
{
    function warning ( $message )
    {
        trigger_error ( $message, E_USER_WARNING ) ;
    }
}
158
159
// error -
//      Reports an error. An Exception object is thrown as is ; a string is raised as a user-level
//      error (E_USER_ERROR) ; any other kind of value is silently ignored.
//      The function is only declared when no error() function exists yet.
if ( ! function_exists ( 'error' ) )
{
    function error ( $message )
    {
        if ( $message instanceof \Exception )
            throw $message ;

        if ( is_string ( $message ) )
            trigger_error ( $message, E_USER_ERROR ) ;
    }
}
170
171
172/*==============================================================================================================
173
174 Backward-compatibility issues.
175
176 ==============================================================================================================*/
177
// hex2bin -
//      Replacement for the builtin hex2bin() function, which appeared only in PHP 5.4.0.
//      The input is converted two hexadecimal digits at a time ; each pair gives one output byte.
if ( ! function_exists ( 'hex2bin' ) )
{
    function hex2bin ( $hexstring )
    {
        $decoded = '' ;

        for ( $position = 0, $size = strlen ( $hexstring ) ; $position < $size ; $position += 2 )
            $decoded .= pack ( 'H*', substr ( $hexstring, $position, 2 ) ) ;

        return ( $decoded ) ;
    }
}
201
202
203/*==============================================================================================================
204
205 class PdfObjectBase -
206 Base class for all PDF objects defined here.
207
208 ==============================================================================================================*/
209abstract class PdfObjectBase // extends Object
210 {
211 // Possible encoding types for streams inside objects ; "unknown" means that the object contains no stream
212 const PDF_UNKNOWN_ENCODING = 0 ; // No stream decoding type could be identified
213 const PDF_ASCIIHEX_ENCODING = 1 ; // AsciiHex encoding - not tested
214 const PDF_ASCII85_ENCODING = 2 ; // Ascii85 encoding - not tested
215 const PDF_FLATE_ENCODING = 3 ; // Flate/deflate encoding
216 const PDF_TEXT_ENCODING = 4 ; // Stream data appears in clear text - no decoding required
217 const PDF_LZW_ENCODING = 5 ; // Not implemented yet
218 const PDF_RLE_ENCODING = 6 ; // Runtime length encoding ; not implemented yet
219 const PDF_DCT_ENCODING = 7 ; // JPEG images
220 const PDF_CCITT_FAX_ENCODING = 8 ; // CCITT Fax encoding - not implemented yet
221 const PDF_JBIG2_ENCODING = 9 ; // JBIG2 filter encoding (black/white) - not implemented yet
222 const PDF_JPX_ENCODING = 10 ; // JPEG2000 encoding - not implemented yet
223
224 // Regular expression used for recognizing references to a font (this list is far from being exhaustive, as it seems
225 // that you can specify almost everything - however, trying to recognize everything would require to develop a complete
226 // parser)
227 protected static $FontSpecifiers = '
228 (/F \d+ (\.\d+)? ) |
229 (/R \d+) |
230 (/f-\d+-\d+) |
231 (/[CT]\d+_\d+) |
232 (/TT \d+) |
233 (/OPBaseFont \d+) |
234 (/OPSUFont \d+) |
235 (/[0-9a-zA-Z]) |
236 (/F\w+) |
237 (/[A-Za-z][A-Za-z0-9]* ( [\-+] [A-Za-z][A-Za-z0-9]* ))
238 ' ;
239
240 // Maps alien Unicode characters such as special spaces, letters with ligatures to their ascii string equivalent
241 protected static $UnicodeToSimpleAscii = false ;
242
243
244 /*--------------------------------------------------------------------------------------------------------------
245
246 Constructor -
247 Performs static initializations such as the Unicode to Ascii table.
248
249 *-------------------------------------------------------------------------------------------------------------*/
public function __construct ( )
{
    // The Unicode-to-Ascii table is static : once loaded (ie, no longer false), there is nothing to do
    if ( self::$UnicodeToSimpleAscii !== false )
        return ;

    // The map file is expected to define the $unicode_to_ansi variable in the local scope ;
    // if it does not, an empty table is used
    $map_file = dirname ( __FILE__ ) . "/Maps/unicode-to-ansi.map" ;
    include ( $map_file ) ;

    if ( isset ( $unicode_to_ansi ) )
        self::$UnicodeToSimpleAscii = $unicode_to_ansi ;
    else
        self::$UnicodeToSimpleAscii = array ( ) ;
}
261
262
263 /*--------------------------------------------------------------------------------------------------------------
264
265 NAME
266 CodePointToUtf8 - Encodes a Unicode codepoint to UTF8.
267
268 PROTOTYPE
269 $char = $this -> CodePointToUtf8 ( $code ) ;
270
271 DESCRIPTION
272 Encodes a Unicode codepoint to UTF8, trying to handle all possible cases.
273
274 PARAMETERS
275 $code (integer) -
276 Unicode code point to be translated.
277
278 RETURN VALUE
279 A string that contains the UTF8 bytes representing the Unicode code point.
280
281 *-------------------------------------------------------------------------------------------------------------*/
protected function CodePointToUtf8 ( $code )
{
    // Encodes $code to UTF8. The value is processed as a sequence of 16-bit words, the least significant
    // word first ; each word is either replaced by its entry in the $UnicodeToSimpleAscii table, or
    // converted to UTF8 through a numeric html entity.
    // Returns the resulting string, most significant word first.

    // A zero code cannot be translated : use the placeholder to signal this situation. The placeholder
    // is run through sprintf() only if it contains a '%' character.
    if ( ! $code )
    {
        if ( strpos ( PdfToText::$Utf8Placeholder, '%' ) === false )
            return ( PdfToText::$Utf8Placeholder ) ;

        return ( sprintf ( PdfToText::$Utf8Placeholder, $code ) ) ;
    }

    $result = '' ;

    while ( $code )
    {
        $word = ( $code & 0xFFFF ) ;

        if ( isset ( self::$UnicodeToSimpleAscii [ $word ] ) )
            $text = self::$UnicodeToSimpleAscii [ $word ] ;
        else
            $text = mb_convert_encoding ( "&#$word;", 'UTF-8', 'HTML-ENTITIES' ) ;

        // Words are extracted starting from the lowest one, so each new piece goes BEFORE what has
        // already been collected (appending would reverse the text, and must never repeat $result)
        $result = $text . $result ;

        // Drop the 16 bits that have just been processed. The divisor must be 0x10000 : dividing by
        // 0xFFFF gives a wrong high word as soon as (high word + low word) reaches 0xFFFF.
        // A truncated division is used rather than a shift so that negative values still end the loop.
        $code = ( int ) ( $code / 0x10000 ) ;
    }

    return ( $result ) ;
}
316
317
318 /*--------------------------------------------------------------------------------------------------------------
319
320 DecodeRawName -
321 Decodes a string that may contain constructs such as '#xy', where 'xy' are hex digits.
322
323 *-------------------------------------------------------------------------------------------------------------*/
public static function DecodeRawName ( $str )
{
    // Replaces every '#xy' construct, where 'xy' are two hex digits, with the character whose code is xy.
    // Only real '#xy' constructs are touched : a '#' that is not followed by two hex digits and any '%'
    // character present in the input are returned unchanged (turning '#' into '%' and calling
    // rawurldecode() would alter both).
    return
       (
        preg_replace_callback
           (
            '/\#([0-9A-Fa-f]{2})/',
            function ( $match )
               {
                return ( chr ( hexdec ( $match [1] ) ) ) ;
                },
            $str
            )
        ) ;
}
328
329
330 /*--------------------------------------------------------------------------------------------------------------
331
332 NAME
333 GetEncodingType - Gets an object encoding type.
334
335 PROTOTYPE
336 $type = $this -> GetEncodingType ( $object_id, $object_data ) ;
337
338 DESCRIPTION
339 When an object is a stream, returns its encoding type.
340
341 PARAMETERS
342 $object_id (integer) -
343 PDF object number.
344
345 $object_data (string) -
346 Object contents.
347
348 RETURN VALUE
349 Returns one of the following values :
350
351 - PdfToText::PDF_ASCIIHEX_ENCODING :
352 Hexadecimal encoding of the binary values.
353 Decoding algorithm was taken from the unknown contributor and not tested so far, since I
354 couldn't find a PDF file with such an encoding type.
355
356 - PdfToText::PDF_ASCII85_ENCODING :
357 Obscure encoding format.
358 Decoding algorithm was taken from the unknown contributor and not tested so far, since I
359 couldn't find a PDF file with such an encoding type.
360
361 - PdfToText::PDF_FLATE_ENCODING :
362 gzip/deflate encoding.
363
364 - PdfToText::PDF_TEXT_ENCODING :
365 Stream data is unencoded (ie, it is pure ascii).
366
367 - PdfToText::PDF_UNKNOWN_ENCODING :
368 The object data does not specify any encoding at all. It can happen on objects that do not have
369 a "stream" part.
370
371 - PdfToText::PDF_DCT_ENCODING :
372 a lossy filter based on the JPEG standard.
373
374 The following constants are defined but not yet implemented ; an exception will be thrown if they are
375 encountered somewhere in the PDF file :
376
377 - PDF_LZW_ENCODING :
378 a filter based on LZW Compression; it can use one of two groups of predictor functions for more
379 compact LZW compression : Predictor 2 from the TIFF 6.0 specification and predictors (filters)
380 from the PNG specification
381
382 - PDF_RLE_ENCODING :
383 a simple compression method for streams with repetitive data using the run-length encoding
384 algorithm and the image-specific filters.
385
386 PDF_CCITT_FAX_ENCODING :
387 a lossless bi-level (black/white) filter based on the Group 3 or Group 4 CCITT (ITU-T) fax
388 compression standard defined in ITU-T T.4 and T.6.
389
390 PDF_JBIG2_ENCODING :
391 a lossy or lossless bi-level (black/white) filter based on the JBIG2 standard, introduced in
392 PDF 1.4.
393
394 PDF_JPX_ENCODING :
395 a lossy or lossless filter based on the JPEG 2000 standard, introduced in PDF 1.5.
396
397 *-------------------------------------------------------------------------------------------------------------*/
protected function GetEncodingType ( $object_id, $object_data )
{
    // Search the object data for the first filter name, either in its long or abbreviated form
    $found = preg_match ( '# / (?P<encoding> (ASCIIHexDecode) | (AHx) | (ASCII85Decode) | (A85) | (FlateDecode) | (Fl) | (DCTDecode) | (DCT) | ' .
                          '(LZWDecode) | (LZW) | (RunLengthDecode) | (RL) | (CCITTFaxDecode) | (CCF) | (JBIG2Decode) | (JPXDecode) ) \b #imsx',
                          $object_data, $match ) ;

    // No filter name : the data is considered to be clear text
    if ( ! $found )
        return ( self::PDF_TEXT_ENCODING ) ;

    // Filter names (in lowercase) that translate to an encoding type
    $encodings = array
       (
        'asciihexdecode'    =>  self::PDF_ASCIIHEX_ENCODING,
        'ahx'               =>  self::PDF_ASCIIHEX_ENCODING,
        'ascii85decode'     =>  self::PDF_ASCII85_ENCODING,
        'a85'               =>  self::PDF_ASCII85_ENCODING,
        'flatedecode'       =>  self::PDF_FLATE_ENCODING,
        'fl'                =>  self::PDF_FLATE_ENCODING,
        'dctdecode'         =>  self::PDF_DCT_ENCODING,
        'dct'               =>  self::PDF_DCT_ENCODING,
        'lzwdecode'         =>  self::PDF_LZW_ENCODING,
        'lzw'               =>  self::PDF_LZW_ENCODING
        ) ;

    $filter = strtolower ( $match [ 'encoding' ] ) ;

    if ( isset ( $encodings [ $filter ] ) )
        return ( $encodings [ $filter ] ) ;

    // The remaining names accepted by the regular expression (CCITTFaxDecode, CCF, RunLengthDecode, RL,
    // JBIG2Decode and JPXDecode) are reported as unknown, with a warning when the debug level is above 1
    if ( PdfToText::$DEBUG > 1 )
        warning ( "Encoding type \"{$match [ 'encoding' ]}\" not yet implemented for pdf object #$object_id." ) ;

    return ( self::PDF_UNKNOWN_ENCODING ) ;
}
439
440
441 /*--------------------------------------------------------------------------------------------------------------
442
443 NAME
444 GetObjectReferences - Gets object references from a specified construct.
445
446 PROTOTYPE
447 $status = $this -> GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids ) ;
448
449 DESCRIPTION
450 Certain parameter specifications are followed by an object reference of the form :
451 x 0 R
452 but it can also be an array of references :
453 [x1 0 R x2 0 R ... xn 0 R]
454 Those kind of constructs can occur after parameters such as : /Pages, /Contents, /Kids...
455 This method extracts the object references found in such a construct.
456
457 PARAMETERS
458 $object_id (integer) -
459 Id of the object to be analyzed.
460
461 $object_data (string) -
462 Object contents.
463
464 $searched_string (string) -
465 String to be searched, that must be followed by an object or an array of object references.
466 This parameter can contain constructs used in regular expressions. Note however that the '#'
467 character must be escaped, since it is used as a delimiter in the regex that is applied on
468 object data.
469
470 $object_ids (array of integers) -
471 Returns on output the ids of the pdf object that have been found after the searched string.
472
473 RETURN VALUE
474 True if the searched string has been found and is followed by an object or array of object references,
475 false otherwise.
476
477 *-------------------------------------------------------------------------------------------------------------*/
protected function GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids )
{
    // The output list is always reset, whatever the outcome
    $object_ids = array ( ) ;

    // First form : the searched string is followed by an array of references, "[x1 0 R x2 0 R ...]"
    if ( preg_match ( "#$searched_string \s* \\[ (?P<objects> [^\]]+ ) \\]#ix", $object_data, $match ) )
    {
        // An array that contains no reference at all is a failure
        if ( ! preg_match_all ( '/(?P<object> \d+) \s+ \d+ \s+ R/x', $match [ 'objects' ], $references ) )
            return ( false ) ;

        foreach ( $references [ 'object' ] as $reference )
            $object_ids [] = ( int ) $reference ;

        return ( true ) ;
    }

    // Second form : the searched string is followed by one single reference, "x 0 R"
    if ( preg_match ( "#$searched_string \s+ (?P<object> \d+) \s+ \d+ \s+ R#ix", $object_data, $match ) )
    {
        $object_ids [] = ( int ) $match [ 'object' ] ;

        return ( true ) ;
    }

    return ( false ) ;
}
504
505
506 /*--------------------------------------------------------------------------------------------------------------
507
508 NAME
509 GetStringParameter - Retrieve a string flag value.
510
511 PROTOTYPE
512 $result = $this -> GetStringParameter ( $parameter, $object_data ) ;
513
514 DESCRIPTION
515 Retrieves the value of a string parameter ; for example :
516
517 /U (parameter value)
518
519 or :
520
521 /U <hexdigits>
522
523 PARAMETERS
524 $parameter (string) -
525 Parameter name.
526
527 $object_data (string) -
528 Object containing the parameter.
529
530 RETURN VALUE
531 The parameter value.
532
533 NOTES
534 description
535
536 *-------------------------------------------------------------------------------------------------------------*/
protected function GetStringParameter ( $parameter, $object_data )
{
    // Returns the value of a string parameter, which can be specified either as a literal string,
    // "/U (value)", or as a hexadecimal string, "/U <hexdigits>". An empty string is returned if the
    // parameter is not found under one of these two forms.

    // Literal string : escape sequences are interpreted by ProcessEscapedString()
    if ( preg_match ( '#' . $parameter . ' \s* \( \s* (?P<value> [^)]+) \)#ix', $object_data, $match ) )
        return ( $this -> ProcessEscapedString ( $match [ 'value' ] ) ) ;

    // Not a hexadecimal string either : no value
    if ( ! preg_match ( '#' . $parameter . ' \s* \< \s* (?P<value> [^>]+) \>#ix', $object_data, $match ) )
        return ( '' ) ;

    // Whitespace is allowed anywhere between the hex digits and must be ignored, otherwise the digits
    // would be paired incorrectly
    $hexdigits = preg_replace ( '/\s+/', '', $match [ 'value' ] ) ;

    // When the number of digits is odd, the missing last digit is assumed to be 0 ("<414>" means
    // 0x41 0x40, not 0x41 0x04)
    if ( strlen ( $hexdigits ) % 2 )
        $hexdigits .= '0' ;

    $result = '' ;

    for ( $i = 0, $count = strlen ( $hexdigits ) ; $i < $count ; $i += 2 )
        $result .= chr ( hexdec ( substr ( $hexdigits, $i, 2 ) ) ) ;

    return ( $result ) ;
}
554
555
556 /*--------------------------------------------------------------------------------------------------------------
557
558 GetUTCDate -
559 Reformats an Adobe UTC date to a format that can be understood by the strtotime() function.
560 Dates are specified in the following format :
561 D:20150521154000Z
562 D:20160707182114+02
563 which are both recognized by strtotime(). However, another format can be specified :
564 D:20160707182114+02'00'
565 which is not recognized by strtotime(), so we have to get rid of the '00' part.
566
567 *-------------------------------------------------------------------------------------------------------------*/
protected function GetUTCDate ( $date )
{
    // Reformats an Adobe date : the leading "D:" (in any case) is removed, together with everything
    // that starts at the first apostrophe. An empty value is returned as is.
    if ( ! $date )
        return ( $date ) ;

    // strncasecmp() is used instead of reading $date [0] and $date [1] individually, so that a
    // one-character value does not cause an access past the end of the string
    if ( strncasecmp ( $date, 'D:', 2 ) === 0 )
        $date = substr ( $date, 2 ) ;

    // Get rid of the trailing "'mm'" part of the timezone offset
    $quote_index = strpos ( $date, "'" ) ;

    if ( $quote_index !== false )
        $date = substr ( $date, 0, $quote_index ) ;

    return ( $date ) ;
}
581
582
583 /*--------------------------------------------------------------------------------------------------------------
584
585 IsCharacterMap -
586 Checks if the specified text contents represent a character map definition or not.
587
588 *-------------------------------------------------------------------------------------------------------------*/
protected function IsCharacterMap ( $decoded_data )
{
    // One single regex looks for any of the keywords that identify a character map ; the result of
    // preg_match() is returned as is
    $is_map = preg_match ( '#(begincmap)|(beginbfrange)|(beginbfchar)|(/Differences)#ix', $decoded_data ) ;

    return ( $is_map ) ;
}
594
595
596 /*--------------------------------------------------------------------------------------------------------------
597
598 IsFont -
599 Checks if the current object contents specify a font declaration.
600
601 *-------------------------------------------------------------------------------------------------------------*/
protected function IsFont ( $object_data )
{
    // Any object that mentions /BaseFont is a font declaration
    if ( stripos ( $object_data, '/BaseFont' ) !== false )
        return ( true ) ;

    // Font descriptors are not font declarations...
    if ( preg_match ( '#/Type \s* /FontDescriptor#ix', $object_data ) )
        return ( false ) ;

    // ... but an object whose type starts with /Font is
    return ( ( bool ) preg_match ( '#/Type \s* /Font#ix', $object_data ) ) ;
}
611
612
613 /*--------------------------------------------------------------------------------------------------------------
614
615 IsFormData -
616 Checks if the current object contents specify references to form data.
617
618 *-------------------------------------------------------------------------------------------------------------*/
protected function IsFormData ( $object_data )
{
    // Looks for an "R (datasets)" construct ; the result of preg_match() is returned as is
    $has_datasets = preg_match ( '#\bR \s* \( \s* datasets \s* \)#imsx', $object_data ) ;

    return ( $has_datasets ) ;
}
626
627
628 /*--------------------------------------------------------------------------------------------------------------
629
630 IsFontMap -
631 Checks if the code contains things like :
632 <</F1 26 0 R/F2 22 0 R/F3 18 0 R>>
633 which maps font 1 (when specified with the /Fx instruction) to object 26, 2 to object 22 and 3 to
634 object 18, respectively, in the above example.
635
636 *-------------------------------------------------------------------------------------------------------------*/
protected function IsFontMap ( $object_data )
{
    // Hex-escaped characters are decoded before the font specifiers are searched
    $unescaped_data = self::UnescapeHexCharacters ( $object_data ) ;

    // A font map is a "<< ... >>" construct whose first item is one of the recognized font specifiers
    $font_map_re = '#<< \s* ( ' . self::$FontSpecifiers . ' ) \s+ .* >>#imsx' ;

    return ( preg_match ( $font_map_re, $unescaped_data ) ? true : false ) ;
}
646
647
648 /*--------------------------------------------------------------------------------------------------------------
649
650 IsImage -
651 Checks if the code contains things like :
652 /Subtype/Image
653
654 *-------------------------------------------------------------------------------------------------------------*/
protected function IsImage ( $object_data )
{
    // An image object is identified by a "/Subtype /Image" entry (the match is case-sensitive)
    return ( preg_match ( '#/Subtype \s* /Image#msx', $object_data ) ? true : false ) ;
}
662
663
664 /*--------------------------------------------------------------------------------------------------------------
665
666 IsObjectStream -
667 Checks if the code contains an object stream (/Type/ObjStm)
668 /Subtype/Image
669
670 *-------------------------------------------------------------------------------------------------------------*/
protected function IsObjectStream ( $object_data )
{
    // An object stream is identified by a "/Type /ObjStm" entry (the match is case-insensitive)
    return ( preg_match ( '#/Type \s* /ObjStm#isx', $object_data ) ? true : false ) ;
}
678
679
680 /*--------------------------------------------------------------------------------------------------------------
681
682 NAME
683 IsPageHeaderOrFooter - Check if the specified stream contents denote a page header or footer.
684
685 PROTOTYPE
686 $status = $this -> IsPageHeaderOrFooter ( $stream_data ) ;
687
688 DESCRIPTION
689 Checks if the specified decoded stream contents denotes header or footer data.
690
691 PARAMETERS
692 $stream_data (string) -
693 Decoded stream contents.
694
695 *-------------------------------------------------------------------------------------------------------------*/
protected function IsPageHeaderOrFooter ( $stream_data )
{
    // Explicit pagination artifact : "/Type /Pagination /Subtype /Header" (or /Footer)
    $is_pagination = preg_match ( '#/Type \s* /Pagination \s* /Subtype \s*/((Header)|(Footer))#ix', $stream_data ) ;

    // Contents attached to the top or the bottom of the page.
    // NOTE(review): the final [^]] requires one character other than ']' after the keyword, so a
    // construct such as "/Attached [/Top]" is not matched - to be confirmed whether this is intended.
    return
       (
        $is_pagination ||
        preg_match ( '#/Attached \s* \[ .*? /((Top)|(Bottom)) [^]]#ix', $stream_data )
        ) ;
}
705
706
707 /*--------------------------------------------------------------------------------------------------------------
708
709 NAME
710 IsText - Check if the specified object contents denote a text stream.
711
712 PROTOTYPE
713 $status = $this -> IsText ( $object_data, $decoded_stream_data ) ;
714
715 DESCRIPTION
716 Checks if the specified object contents denote a text stream.
717
718 PARAMETERS
719 $object_data (string) -
720 Object data, ie the contents located between the "obj" and "endobj" keywords.
721
722 $decoded_stream_data (string) -
723 The flags specified in the object data are not sufficient to be sure that we have a block of
724 drawing instructions. We must also check for certain common instructions to be present.
725
726 RETURN VALUE
727 True if the specified contents MAY be text contents, false otherwise.
728
729 NOTES
730 I do not consider this method as bullet-proof. There may arise some cases where non-text blocks can be
731 mistakenly considered as text blocks, so it is subject to evolve in the future.
732
733 *-------------------------------------------------------------------------------------------------------------*/
protected function IsText ( $object_data, $decoded_stream_data )
{
    // Returns true if the decoded stream contains at least one of the common drawing instructions
    // (BT, Tf, Td, TJ, Tj, Tm, Do or cm), false otherwise.
    //
    // $object_data is kept for interface compatibility but does not influence the result : the flag
    // checks (/Filter, /Length, /Type...) that used to be performed on it selected between two branches
    // that ran exactly the same test on the stream data, so they only cost two useless regex scans.
    return ( preg_match ( '/\\b(BT|Tf|Td|TJ|Tj|Tm|Do|cm)\\b/', $decoded_stream_data ) ? true : false ) ;
}
747
748
749 /*--------------------------------------------------------------------------------------------------------------
750
751 NAME
752 PregStrReplace - Replace string(s) using regular expression(s)
753
754 PROTOTYPE
755 $result = PdfToText::PregStrReplace ( $pattern, $replacement, $subject, $limit = -1,
756 &$match_count = null )
757
758 DESCRIPTION
759 This function behaves like a mix of str_replace() and preg_replace() ; it allows to search for strings
760 using regular expressions, but the replacements are plain-text strings and no reference to a capture
761 specified in the regular expression will be interpreted.
762 This is useful when processing templates, which can contain constructs such as "\00" or "$", which are
763 interpreted by preg_replace() as references to captures.
764
765 The function has the same parameters as preg_replace().
766
767 RETURN VALUE
768 Returns the substituted text.
769
770 *-------------------------------------------------------------------------------------------------------------*/
public static function PregStrReplace ( $pattern, $replacement, $subject, $limit = -1, &$match_count = null )
{
    // Replaces the text matched by the supplied regular expression(s) with plain-text replacement(s) :
    // unlike preg_replace(), constructs such as "$0" or "\1" in $replacement are inserted literally.
    //
    // $pattern         Regex, or array of regexes, applied one after the other.
    // $replacement     Replacement string ; an array with one entry per pattern is accepted when
    //                  $pattern is an array.
    // $subject         Input string.
    // $limit           Maximum number of replacements, counted over all the patterns ; a value less
    //                  than 1 means "no limit".
    // $match_count     Receives the number of replacements that have been performed.
    //
    // Returns the substituted text ; $subject is returned unchanged (after a warning) when the
    // $pattern and $replacement parameters are inconsistent.
    $match_count = 0 ;

    // Make sure that $pattern and $replacement become arrays of the same size
    if ( is_array ( $pattern ) )
    {
        if ( is_array ( $replacement ) )
        {
            if ( count ( $pattern ) !== count ( $replacement ) )
            {
                warning ( "The \$replacement parameter should have the same number of element as \$pattern." ) ;
                return ( $subject ) ;
            }
        }
        // One single replacement string for all the patterns ; the first argument of array_fill() is
        // the start index, which must be 0 and not the replacement string itself
        else
            $replacement = array_fill ( 0, count ( $pattern ), $replacement ) ;
    }
    else
    {
        if ( is_array ( $replacement ) )
        {
            warning ( "Expected string for the \$replacement parameter." ) ;
            return ( $subject ) ;
        }

        $pattern = array ( $pattern ) ;
        $replacement = array ( $replacement ) ;
    }

    // Patterns and replacements are paired by position, whatever their original keys
    $pattern = array_values ( $pattern ) ;
    $replacement = array_values ( $replacement ) ;

    // Upper limit
    if ( $limit < 1 )
        $limit = PHP_INT_MAX ;

    $current_subject = $subject ;
    $count = 0 ;

    // Loop through each supplied pattern, until the replacement limit is reached
    for ( $i = 0, $pattern_count = count ( $pattern ) ; $i < $pattern_count && $count < $limit ; $i ++ )
    {
        if ( ! preg_match_all ( $pattern [$i], $current_subject, $matches, PREG_OFFSET_CAPTURE ) )
            continue ;

        $result = '' ;
        $last_offset = 0 ;

        foreach ( $matches [0] as $match )
        {
            // Limit reached : leave the remaining matches untouched. The loop is simply left, so
            // that the replacements already done for this pattern are kept in the final result.
            if ( $count >= $limit )
                break ;

            $offset = ( int ) $match [1] ;

            // Append data from the last seen offset up to the current one
            if ( $last_offset < $offset )
                $result .= substr ( $current_subject, $last_offset, $offset - $last_offset ) ;

            // Append the replacement string for this match, then skip the matched text
            $result .= $replacement [$i] ;
            $last_offset = $offset + strlen ( $match [0] ) ;

            $count ++ ;
        }

        // Append the last part of the subject that has not been processed, and use the string built
        // so far as the subject for the next pattern
        $result .= substr ( $current_subject, $last_offset ) ;
        $current_subject = $result ;
    }

    $match_count = $count ;

    return ( $current_subject ) ;
}
850
851
852 /*--------------------------------------------------------------------------------------------------------------
853
854 NAME
855 ProcessEscapedCharacter - Interprets a character after a backslash in a string.
856
857 PROTOTYPE
858 $ch = $this -> ProcessEscapedCharacter ( $ch ) ;
859
860 DESCRIPTION
861 Interprets a character after a backslash in a string and returns the interpreted value.
862
863 PARAMETERS
864 $ch (char) -
865 Character to be escaped.
866
867 RETURN VALUE
868 The escaped character.
869
870 NOTES
871 This method does not process octal sequences.
872
873 *-------------------------------------------------------------------------------------------------------------*/
protected function ProcessEscapedCharacter ( $ch )
{
    // Characters that stand for a control character when they follow a backslash
    $control_characters = array
       (
        'n'     =>  "\n",
        'r'     =>  "\r",
        'f'     =>  "\f",
        't'     =>  "\t",
        'b'     =>  "\x08",         // Backspace
        'v'     =>  "\x0B"          // Vertical tab
        ) ;

    if ( isset ( $control_characters [ $ch ] ) )
        return ( $control_characters [ $ch ] ) ;

    // Everything else stands for itself : this includes the characters that normally need to be
    // escaped - '(', ')', '[', ']' and '\' - as well as any character that did not need it
    return ( $ch ) ;
}
898
899
900 /*--------------------------------------------------------------------------------------------------------------
901
902 NAME
903 ProcessEscapedString - Processes a string which can have escaped characters.
904
905 PROTOTYPE
906 $result = $this -> ProcessEscapedString ( $str, $process_octal_escapes = false ) ;
907
908 DESCRIPTION
909 Processes a string which may contain escape sequences.
910
911 PARAMETERS
912 $str (string) -
913 String to be processed.
914
915 $process_octal_escapes (boolean) -
916 When true, octal escape sequences such as \037 are processed.
917
918 RETURN VALUE
919 The processed input string.
920
921 *-------------------------------------------------------------------------------------------------------------*/
protected function ProcessEscapedString ( $str, $process_octal_escapes = false )
{
    $size = strlen ( $str ) ;
    $zero = ord ( '0' ) ;
    $output = '' ;
    $cursor = 0 ;           // Start of the text that has not been copied to the output yet

    // Jump from one backslash to the next
    for ( $slash = strpos ( $str, '\\', $cursor ) ; $slash !== false ; $slash = strpos ( $str, '\\', $cursor ) )
    {
        // A backslash that ends the string escapes nothing : it is copied with the remaining text
        if ( $slash + 1 >= $size )
            break ;

        // Copy the plain text located before the backslash
        $output .= substr ( $str, $cursor, $slash - $cursor ) ;

        $escaped = $str [ $slash + 1 ] ;
        $is_octal = ( $process_octal_escapes && $escaped >= '0' && $escaped <= '7' ) ;

        // Single-character escape sequence
        if ( ! $is_octal )
        {
            $output .= $this -> ProcessEscapedCharacter ( $escaped ) ;
            $cursor = $slash + 2 ;

            continue ;
        }

        // Octal escape sequence : one digit, followed by at most two more
        $value = ord ( $escaped ) - $zero ;
        $next = $slash + 2 ;

        for ( $extra = 0 ; $next < $size && $extra < 2 && $str [ $next ] >= '0' && $str [ $next ] <= '7' ; $extra ++ )
        {
            $value = ( $value * 8 ) + ( ord ( $str [ $next ] ) - $zero ) ;
            $next ++ ;
        }

        $output .= chr ( $value ) ;
        $cursor = $next ;
    }

    // Copy what remains after the last escape sequence
    $output .= substr ( $str, $cursor ) ;

    return ( $output ) ;
}
971
972
973 /*--------------------------------------------------------------------------------------------------------------
974
975 NAME
976 Unescape - Processes escape sequences from the specified string.
977
978 PROTOTYPE
979 $value = $this -> Unescape ( $text ) ;
980
981 DESCRIPTION
982 Processes escape sequences within the specified text. The recognized escape sequences are like the
983 C-language ones : \b (backspace), \f (form feed), \r (carriage return), \n (newline), \t (tab),
984 plus octal escapes of one to three digits (\ddd). All other characters prefixed by "\" are returned as is.
985
986 PARAMETERS
987 $text (string) -
988 Text to be unescaped.
989
990 RETURN VALUE
991 Returns the unescaped value of $text.
992
993 *-------------------------------------------------------------------------------------------------------------*/
public static function Unescape ( $text )
{
    // Replaces the backslash sequences found in $text and returns the resulting string :
    // - \b, \t, \f, \r and \n become the corresponding control character ;
    // - \ddd (one to three octal digits) becomes the byte having that value ;
    // - any other escaped character is returned as is, without its backslash ;
    // - a backslash that ends the string is kept unchanged.
    //
    // Backspace is written "\x08" on purpose : PHP double-quoted strings have no "\b" escape,
    // so "\b" would yield a backslash followed by the letter "b".
    static $control_characters = array(
        'b' => "\x08",
        't' => "\t",
        'f' => "\f",
        'r' => "\r",
        'n' => "\n"
    );

    $text   = (string) $text;
    $length = strlen($text);
    $result = '';
    $ord0   = ord('0');

    for ($i = 0; $i < $length; $i++) {
        $ch = $text[$i];

        // Ordinary character, or backslash with nothing after it
        if ($ch !== '\\' || $i + 1 >= $length) {
            $result .= $ch;

            continue;
        }

        $nch = $text[++$i];

        if (isset($control_characters[$nch])) {
            $result .= $control_characters[$nch];
        } elseif ($nch >= '0' && $nch <= '7') {
            // Octal escape notation : $i is on the first digit, at most two more are consumed
            $ord = ord($nch) - $ord0;
            $end = min($length, $i + 3);

            while ($i + 1 < $end && $text[$i + 1] >= '0' && $text[$i + 1] <= '7') {
                $ord = ($ord * 8) + (ord($text[++$i]) - $ord0);
            }

            // Three octal digits can exceed 255 ; only the low-order byte is kept, which is
            // what chr() did implicitly
            $result .= chr($ord & 0xFF);
        } else {
            $result .= $nch;
        }
    }

    return ($result);
}
1044
1045
1046 /*--------------------------------------------------------------------------------------------------------------
1047
1048 NAME
1049 UnescapeHexCharacters - Unescapes characters in the #xy notation.
1050
1051 PROTOTYPE
1052 $result = $this -> UnescapeHexCharacters ( $data ) ;
1053
1054 DESCRIPTION
1055 Some specifications contain hex characters specified as #xy. For the moment, I have met such a construct in
1056 font aliases such as :
1057 /C2#5F0 25 0 R
1058 where "#5F" stands for "_", giving :
1059 /C2_0 25 0 R
1060 Hope that such constructs do not happen in other places...
1061
1062 PARAMETERS
1063 $data (string) -
1064 String to be unescaped.
1065
1066 RETURN VALUE
1067 The input string with all the hex character representations replaced with their ascii equivalent.
1068
1069 *-------------------------------------------------------------------------------------------------------------*/
public static function UnescapeHexCharacters ( $data )
{
    // Replaces every "#xy" sequence (x and y being hexadecimal digits, in any case) with the
    // character whose code is 0xXY, and returns the resulting string.
    // $data is returned unchanged when it contains the word "stream" or no "#xy" sequence.
    if (strpos($data, 'stream') !== false) {
        return ($data);
    }

    if (! preg_match_all('/\#[0-9a-f]{2}/i', $data, $matches)) {
        return ($data);
    }

    // One entry per distinct "#xy" sequence found in the input
    $map = array();

    foreach (array_unique($matches[0]) as $hex) {
        $map[$hex] = chr(hexdec(substr($hex, 1)));
    }

    // strtr() scans the input only once and never re-examines text it has just produced :
    // "#2341" therefore gives "#41", not "A", as successive str_replace() calls would
    return (strtr($data, $map));
}
1093
1094
1095 /*--------------------------------------------------------------------------------------------------------------
1096
1097 ValidatePhpName -
1098 Checks that the specified name (declared in the XML template) is a valid PHP name.
1099
1100 *-------------------------------------------------------------------------------------------------------------*/
public static function ValidatePhpName ( $name )
{
    // Returns $name stripped from its surrounding whitespace.
    // A valid name is a letter or underscore followed by any number of letters, digits or
    // underscores ; anything else is reported through error() with a PdfToTextFormException.
    $name = trim($name);

    if (preg_match('/^ [a-z_][a-z0-9_]* $/ix', $name)) {
        return ($name);
    }

    error(new PdfToTextFormException("Invalid PHP name \"$name\"."));

    return ($name);
}
1110 }
1111
1112
1113/*==============================================================================================================
1114
1115 PdfToText class -
1116 A class for extracting text from Pdf files.
1117
1118 ==============================================================================================================*/
1119class PdfToText extends PdfObjectBase
1120 {
1121 // Current version of the class
1122 const VERSION = "1.6.7" ;
1123
1124 // Pdf processing options
1125 const PDFOPT_NONE = 0x00000000 ; // No extra option
1126 const PDFOPT_REPEAT_SEPARATOR = 0x00000001 ; // Repeats the Separator property if the offset between two text blocks (in array notation)
1127 // is greater than $this -> MinSpaceWidth
1128 const PDFOPT_GET_IMAGE_DATA = 0x00000002 ; // Retrieve raw image data in the $ths -> ImageData array
1129 const PDFOPT_DECODE_IMAGE_DATA = 0x00000004 ; // Creates a jpeg resource for each image
1130 const PDFOPT_IGNORE_TEXT_LEADING = 0x00000008 ; // Ignore text leading values
1131 const PDFOPT_NO_HYPHENATED_WORDS = 0x00000010 ; // Join hyphenated words that are split on two lines
1132 const PDFOPT_AUTOSAVE_IMAGES = 0x00000020 ; // Autosave images ; the ImageFileTemplate property will need to be defined
1133 const PDFOPT_ENFORCE_EXECUTION_TIME = 0x00000040 ; // Enforces the max_execution_time PHP setting when processing a file. A PdfTexterTimeoutException
1134 // will be thrown if processing of a single file reaches (time_limit - 1 second) by default
1135 // The MaxExecutionTime property can be set to modify this default value.
1136 const PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME = 0x00000080 ; // Same as PDFOPT_ENFORCE_EXECUTION_TIME, but for all calls to the Load() method of the PdfToText class
1137 // The MaxGlobalExecutionTime static property can be set to modify the default time limit
1138 const PDFOPT_IGNORE_HEADERS_AND_FOOTERS = 0x00000300 ; // Ignore headers and footers
1139
1140 const PDFOPT_RAW_LAYOUT = 0x00000000 ; // Layout rendering : raw (default)
1141 const PDFOPT_BASIC_LAYOUT = 0x00000400 ; // Layout rendering : basic
1142
1143 const PDFOPT_LAYOUT_MASK = 0x00000C00 ; // Mask to isolate the targeted layout
1144
1145 const PDFOPT_ENHANCED_STATISTICS = 0x00001000 ; // Compute statistics on PDF language instructions
1146 const PDFOPT_DEBUG_SHOW_COORDINATES = 0x00002000 ; // Include text coordinates ; implies the PDFOPT_BASIC_LAYOUT option
1147 // This option can be useful if you want to use capture areas and get information about
1148 // their coordinates
1149 const PDFOPT_CAPTURE = 0x00004000 ; // Indicates that the caller wants to capture some text and use the SetCaptures() method
1150 // It currently enables the PDFOPT_BASIC_LAYOUT option
1151 const PDFOPT_LOOSE_X_CAPTURE = 0x00008000 ; // Includes in captures text fragments whose dimensions may exceed the captured area dimensions
1152 const PDFOPT_LOOSE_Y_CAPTURE = 0x00010000 ; // (currently not used)
1153
1154 // When boolean true, outputs debug information about fonts, character maps and drawing contents.
1155 // When integer > 1, outputs additional information about other objects.
1156 public static $DEBUG = false ;
1157
1158 // Current filename
1159 public $Filename = false ;
1160 // Extracted text
1161 public $Text = '' ;
1162 // Document pages (array of strings)
1163 public $Pages = array ( ) ;
1164 // Document images (array of PdfImage objects)
1165 public $Images = array ( ) ;
1166 protected $ImageCount = 0 ;
1167 // Raw data for document images
1168 public $ImageData = array ( ) ;
1169 // ImageAutoSaveFileTemplate :
1170 // Template for the file names to be generated when extracting images, if the PDFOPT_AUTOSAVE_IMAGES has been specified.
1171 // Can contain any path, plus the following printf()-like modifiers :
1172 // . "%p" : Path of the original PDF file.
1173 // . "%f" : Filename part of the original PDF file.
1174 // . "%d" : A sequential number, starting from 1, used when generating filenames. The format can contain a width specifier,
1175 // such as "%3d", which will generate 3-digit sequential numbers left-filled with zeroes.
1176 // . "%s" : Image suffix, which is automatically chosen based on the underlying image type.
1177 public $ImageAutoSaveFileTemplate = "%p/%f.%d.%s" ;
1178 // Auto-save image file format
1179 public $ImageAutoSaveFormat = IMG_JPEG ;
1180 // Auto-saved image file names
1181 public $AutoSavedImageFiles = array ( ) ;
1182 // Text chunk separator (used to separate blocks of text specified as an array notation)
1183 public $BlockSeparator = '' ;
1184 // Separator used to separate text groups where the offset value is less than -1000 thousandths of a text unit
1185 // (eg : [(1)-1822(2)] will add a separator between the characters "1" and "2")
1186 // Note that such values are expressed in thousandths of text units and subtracted from the current position. A
1187 // negative value means adding more space between the two text units it separates.
1188 public $Separator = ' ' ;
1189 // Separator to be used between pages in the $Text property
1190 public $PageSeparator = "\n" ;
1191 // Minimum value (in 1/1000 of text units) that separates two text chunks that can be considered as a real space
1192 public $MinSpaceWidth = 200 ;
1193 // Pdf options
1194 public $Options = self::PDFOPT_NONE ;
1195 // Maximum number of pages to extract from the PDF. A zero value means "extract everything"
1196 // If this number is negative, then the pages to be extracted are counted from the last page. For example, a value of -2
1197 // extracts the last two pages
1198 public $MaxSelectedPages = false ;
1199 // Maximum number of images to be extracted. A value of zero means "extract everything". A non-zero value gives
1200 // the number of images to extract.
1201 public $MaxExtractedImages = false ;
1202 // Location of the CID tables directory
1203 public static $CIDTablesDirectory ;
1204 // Location of the Font metrics directory, for the Adobe standard 14 fonts
1205 public static $FontMetricsDirectory ;
1206 // Standard Adobe font names, and their corresponding file in $FontMetricsDirectory
1207 public static $AdobeStandardFontMetrics = array
1208 (
1209 'courier' => 'courier.fm',
1210 'courier-bold' => 'courierb.fm',
1211 'courier-oblique' => 'courieri.fm',
1212 'courier-boldoblique' => 'courierbi.fm',
1213 'helvetica' => 'helvetica.fm',
1214 'helvetica-bold' => 'helveticab.fm',
1215 'helvetica-oblique' => 'helveticai.fm',
1216 'helvetica-boldoblique' => 'helveticabi.fm',
1217 'symbol' => 'symbol.fm',
1218 'times-roman' => 'times.fm',
1219 'times-bold' => 'timesb.fm',
1220 'times-bolditalic' => 'timesbi.fm',
1221 'times-italic' => 'timesi.fm',
1222 'zapfdingbats' => 'zapfdingbats.fm'
1223 ) ;
1224 // Author information
1225 public $Author = '' ;
1226 public $CreatorApplication = '' ;
1227 public $ProducerApplication = '' ;
1228 public $CreationDate = '' ;
1229 public $ModificationDate = '' ;
1230 public $Title = '' ;
1231 public $Subject = '' ;
1232 public $Keywords = '' ;
1233 protected $GotAuthorInformation = false ;
1234 // Unique and arbitrary file identifier, as specified in the PDF file
1235 // Well, in fact, there are two IDs, but the PDF specification does not mention the goal of the second one
1236 public $ID = '' ;
1237 public $ID2 = '' ;
1238 // End of line string
1239 public $EOL = PHP_EOL ;
1240 // String to be used when no Unicode translation is possible
1241 public static $Utf8Placeholder = '' ;
1242 // Information about memory consumption implied by the file currently being loaded
1243 public $MemoryUsage,
1244 $MemoryPeakUsage ;
1245 // Offset of the document start (%PDF-x.y)
1246 public $DocumentStartOffset ;
1247 // Debug statistics
1248 public $Statistics = array ( ) ;
1249 // Max execution time settings. A positive value means "don't exceed that number of seconds".
1250 // A negative value means "Don't exceed PHP setting max_execution_time - that number of seconds". If the result
1251 // is negative, then the default will be "max_execution_time - 1".
1252 // For those limits to be enforced, you need to specify either the PDFOPT_ENFORCE_EXECUTION_TIME or
1253 // PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME options, or both
1254 public $MaxExecutionTime = -1 ;
1255 public static $MaxGlobalExecutionTime = -1 ;
1256 // This property is expressed in percents ; it gives the extra percentage to add to the values computed by
1257 // the PdfTexterFont::GetStringWidth() method.
1258 // This is basically used when computing text positions and string lengths with the PDFOPT_BASIC_LAYOUT option :
1259 // the computed string length is shorter than its actual length (because of extra spacing determined by character
1260 // kerning in the font data). To determine whether two consecutive blocks of text should be separated by a space,
1261 // we empirically add this extra percentage to the computed string length. The default is -5%.
1262 public $ExtraTextWidth = -5 ;
1263
1264 // Marker stuff. The unprocessed marker list is a sequential array of markers, which will later be dispatched into
1265 // indexed arrays during their first reference
1266 protected $UnprocessedMarkerList = array ( 'font' => array ( ) ) ;
1267 protected $TextWithFontMarkers = array ( ) ;
1268
1269 // Internal variables used when the PDFOPT_ENFORCE_* options are specified
1270 protected static $PhpMaxExecutionTime ;
1271 protected static $GlobalExecutionStartTime ;
1272 protected static $AllowedGlobalExecutionTime ;
1273 protected $ExecutionStartTime ;
1274 protected $AllowedExecutionTime ;
1275
1276 // Font mappings
1277 protected $FontTable = false ;
1278 // Extra Adobe standard font mappings (for character names of the form "/axxx" for example)
1279 protected $AdobeExtraMappings = array ( ) ;
1280 // Page map object
1281 protected $PageMap ;
1282 // Page locations (start and end offsets)
1283 protected $PageLocations ;
1284 // Encryption data
1285 public $IsEncrypted = false ;
1286 protected $EncryptionData = false ;
1287 // A flag coming from the constructor options, telling if enhanced statistics are enabled
1288 protected $EnhancedStatistics ;
1289
1290 // Document text fragments, with their absolute (x,y) position, approximate width and height
1291 protected $DocumentFragments ;
1292
1293 // Form data
protected $FormData ;
protected $FormDataObjectNumbers ;
protected $FormDataDefinitions ;
// $FormDataObjects is the name assigned by __load() ; it was not declared, so the assignment
// created a dynamic property (deprecated since PHP 8.2).
protected $FormDataObjects ;
// Misspelled declaration, kept only in case some code still refers to it
// NOTE(review): no use of this name is visible in this part of the file - confirm, then remove.
protected $FormaDataObjects ;
1298
1299 // Capture data
1300 public $CaptureDefinitions ;
1301 protected $CaptureObject ;
1302
1303 // Indicates whether global static initializations have been made
1304 // This is mainly used for variables such as $Utf8PlaceHolder, which is initialized to a different value
1305 private static $StaticInitialized = false ;
1306
1307 // Drawing instructions that are to be ignored and removed from a text stream before processing, for performance
1308 // reasons (it is faster to call preg_replace() once to remove them than calling the __next_instruction() and
1309 // __next_token() methods to process an input stream containing such useless instructions)
1310 // This is an array of regular expressions where the following constructs are replaced at runtime during static
1311 // initialization :
1312 // %n - Will be replaced with a regex matching a decimal number.
1313 private static $IgnoredInstructionTemplatesLayout = array
1314 (
1315 '%n{6} ( (c) ) \s+',
1316 '%n{4} ( (re) | (y) | (v) | (k) | (K) ) \s+',
1317 '%n{3} ( (scn) | (SCN) | (r) | (rg) | (RG) | (sc) | (SC) ) \s+',
1318 '%n{2} ( (m) | (l) ) \s+',
1319 '%n ( (w) | (M) | (g) | (G) | (J) | (j) | (d) | (i) | (sc) | (SC) | (Tc) | (Tw) | (scn) | (Tr) | (Tz) | (Ts) ) \s+',
1320 '\b ( (BDC) | (EMC) ) \s+',
1321 '\/( (Cs \d+) | (CS \d+) | (G[Ss] \d+) | (Fm \d+) | (Im \d+) | (PlacedGraphic) ) \s+ \w+ \s*',
1322 '\/( (Span) | (Artifact) | (Figure) | (P) ) \s* << .*? >> [ \t\r\n>]*',
1323 '\/ ( (PlacedGraphic) | (Artifact) ) \s+',
1324 '\d+ \s+ ( (scn) | (SCN) )',
1325 '\/MC \d+ \s+',
1326 '^ \s* [fhS] \r? \n',
1327 '^W \s+ n \r? \n',
1328 '(f | W) \* \s+',
1329 '^[fhnS] \s+',
1330 '-?0 (\. \d+)? \s+ T[cw]',
1331 '\bBI \s+ .*? \bID \s+ .*? \bEI',
1332 '\/ \w+ \s+ ( (cs) | (CS) | (ri) | (gs) )',
1333 // Hazardous replaces ?
1334 '( [Ww] \s+ ){3,}',
1335 ' \[\] \s+ [Shs] \s+'
1336 ) ;
1337 // Additional instructions to be stripped when no particular page layout has been requested
1338 private static $IgnoredInstructionTemplatesNoLayout = array
1339 (
1340 '%n{6} ( (cm) ) \s+',
1341// '\b ( (BT) | (ET) ) \s+',
1342 '^ \s* [Qq] \r? \n',
1343 '^ \s* (\b [a-zA-Z] \s+)+',
1344 '\s* (\b [a-zA-Z] \s+)+$',
1345 '^[qQ] \s+',
1346 '^q \s+ [hfS] \n',
1347 '( [Qfhnq] \s+ ){2,}'
1348 ) ;
1349 // Replacement regular expressions for %something constructs specified in the $IgnoredInstructions array
1350 private static $ReplacementConstructs = array
1351 (
1352 '%n' => '( [+\-]? ( ( [0-9]+ ( \. [0-9]* )? ) | ( \. [0-9]+ ) ) \s+ )'
1353 ) ;
1354 // The final regexes that are built during static initialization by the __build_ignored_instructions() method
1355 private static $IgnoredInstructionsNoLayout = array ( ) ;
1356 private static $IgnoredInstructionsLayout = array ( ) ;
1357 private $IgnoredInstructions = array ( ) ;
1358
1359 // Map id buffer - for avoiding unnecessary calls to GetFontByMapId
1360 private $MapIdBuffer = array ( ) ;
1361
1362 // Same for MapCharacter()
1363 private $CharacterMapBuffer = array ( ) ;
1364
1365 // Font objects buffer - used by __assemble_text_fragments()
1366 private $FontObjectsBuffer = array ( ) ;
1367
1368 // Regex used for removing hyphens - we have to take care of different line endings : "\n" for Unix, "\r\n"
1369 // for Windows, and "\r" for pure Mac files.
1370 // Note that we replace an hyphen followed by an end-of-line then by non-space characters with the non-space
1371 // characters, so the word gets joined on the same line. Spaces after the end of the word (on the next line)
1372 // are removed, in order for the next word to appear at the beginning of the second line.
1373 private static $RemoveHyphensRegex = '#
1374 (
1375 -
1376 [ \t]* ( (\r\n) | \n | \r )+ [ \t\r\n]*
1377 )
1378 ([^ \t\r\n]+)
1379 \s*
1380 #msx' ;
1381
1382 // A small list of Unicode character ranges that are related to languages written from right to left
1383 // For performance reasons, everything is mapped to a range here, even if it includes codepoints that do not map to anything
1384 // (this class is not a Unicode codepoint validator, but a Pdf text extractor...)
1385 // The UTF-16 version is given as comments ; only the UTF-8 translation is used here
1386 // To be completed !
1387 private static $RtlCharacters = array
1388 (
1389 // This range represents the following languages :
1390 // - Hebrew (0590..05FF)
1391 // - Arabic (0600..06FF)
1392 // - Syriac (0700..074F)
1393 // - Supplement for Arabic (0750..077F)
1394 // - Thaana (0780..07BF)
1395 // - N'ko (07C0..07FF)
1396 // - Samaritan (0800..083F)
1397 // - Mandaic (0840..085F)
1398 // array ( 0x00590, 0x0085F ),
1399 // Hebrew supplement (I suppose ?) + other characters
1400 // array ( 0x0FB1D, 0x0FEFC ),
1401 // Mende kikakui
1402 // array ( 0x1E800, 0x1E8DF ),
1403 // Adlam
1404 // array ( 0x1E900, 0x1E95F ),
1405 // Others
1406 // array ( 0x10800, 0x10C48 ),
1407 // array ( 0x1EE00, 0x1EEBB )
1408 "\xD6" => array ( array ( "\x90", "\xBF" ) ),
1409 "\xD7" => array ( array ( "\x80", "\xBF" ) ),
1410 "\xD8" => array ( array ( "\x80", "\xBF" ) ),
1411 "\xD9" => array ( array ( "\x80", "\xBF" ) ),
1412 "\xDA" => array ( array ( "\x80", "\xBF" ) ),
1413 "\xDB" => array ( array ( "\x80", "\xBF" ) ),
1414 "\xDC" => array ( array ( "\x80", "\xBF" ) ),
1415 "\xDD" => array ( array ( "\x80", "\xBF" ) ),
1416 "\xDE" => array ( array ( "\x80", "\xBF" ) ),
1417 "\xDF" => array ( array ( "\x80", "\xBF" ) )
1418 /*
1419 "\xE0" => array
1420 (
1421 array ( "\xA0\x80", "\xA0\xBF" ),
1422 array ( "\xA1\x80", "\xA1\x9F" )
1423 ),
1424 "\xEF" => array
1425 (
1426 array ( "\xAC\x9D", "\xAC\xBF" ),
1427 array ( "\xAD\x80", "\xAD\xBF" ),
1428 array ( "\xAE\x80", "\xAE\xBF" ),
1429 array ( "\xAF\x80", "\xAF\xBF" ),
1430 array ( "\xB0\x80", "\xB0\xBF" ),
1431 array ( "\xB1\x80", "\xB1\xBF" ),
1432 array ( "\xB2\x80", "\xB2\xBF" ),
1433 array ( "\xB3\x80", "\xB3\xBF" ),
1434 array ( "\xB4\x80", "\xB4\xBF" ),
1435 array ( "\xB5\x80", "\xB5\xBF" ),
1436 array ( "\xB6\x80", "\xB6\xBF" ),
1437 array ( "\xB7\x80", "\xB7\xBF" ),
1438 array ( "\xB8\x80", "\xB8\xBF" ),
1439 array ( "\xB9\x80", "\xB9\xBF" ),
1440 array ( "\xBA\x80", "\xBA\xBF" ),
1441 array ( "\xBB\x80", "\xBB\xBC" )
1442 )
1443 */
1444 ) ;
1445
1446 // UTF-8 prefixes for RTL characters as keys, and number of characters that must follow the prefix as values
private static $RtlCharacterPrefixLengths = array
(
    // Every lead byte listed here introduces a two-byte UTF-8 sequence, hence one following byte.
    // The keys are meant to mirror the keys of $RtlCharacters ; "\xDD" was missing from this list
    // although $RtlCharacters declares a range for it.
    // NOTE(review): the code consuming this table is outside the visible part of the file -
    // confirm that no caller relied on "\xDD" being absent.
    "\xD6" => 1,
    "\xD7" => 1,
    "\xD8" => 1,
    "\xD9" => 1,
    "\xDA" => 1,
    "\xDB" => 1,
    "\xDC" => 1,
    "\xDD" => 1,
    "\xDE" => 1,
    "\xDF" => 1
    /*
    "\xE0" => 2,
    "\xEF" => 2
    */
) ;
1463
1464 // A string that contains all the RTL character prefixes above
1465 private static $RtlCharacterPrefixes ;
1466
1467 // As usual, caching a little bit the results of the IsRtlCharacter() method is welcome. Each item will have the value true if the
1468 // character is RTL, or false if LTR.
1469 private $RtlCharacterBuffer = array ( ) ;
1470
1471 // A subset of a character classification array that avoids too many calls to the ctype_* functions or too many
1472 // character comparisons.
1473 // This array is used only for heavily solicited parts of code
1474 const CTYPE_ALPHA = 0x01 ; // Letter
1475 const CTYPE_DIGIT = 0x02 ; // Digit
1476 const CTYPE_XDIGIT = 0x04 ; // Hex digit
1477 const CTYPE_ALNUM = 0x08 ; // Letter or digit
1478 const CTYPE_LOWER = 0x10 ; // Lower- or upper-case letters
1479 const CTYPE_UPPER = 0x20 ;
1480
1481 private static $CharacterClasses = false ;
1482
1483 // Stuff specific to the current PHP version
1484 private static $HasMemoryGetUsage ;
1485 private static $HasMemoryGetPeakUsage ;
1486
1487
1488 /*--------------------------------------------------------------------------------------------------------------
1489
1490 CONSTRUCTOR
1491 $pdf = new PdfToText ( $filename = null, $options = PDFOPT_NONE, $user_password = false, $owner_password = false ) ;
1492
1493 DESCRIPTION
1494 Builds a PdfToText object and optionally loads the specified file's contents.
1495
1496 PARAMETERS
1497 $filename (string) -
1498 Optional PDF filename whose text contents are to be extracted.
1499
1500 $options (integer) -
1501 A combination of PDFOPT_* flags. This can be any of the following :
1502
1503 - PDFOPT_REPEAT_SEPARATOR :
1504 Text constructs specified as an array are separated by an offset which is expressed as
1505 thousands of text units ; for example :
1506
1507 [(1)-2000(2)]
1508
1509 will be rendered as the text "1 2" ("1" and "2" being separated by two spaces) if the
1510 "Separator" property is set to a space (the default) and this flag is specified.
1511 When not specified, the text will be rendered as "1 2".
1512
1513 - PDFOPT_NONE :
1514 None of the above options will apply.
1515
1516 *-------------------------------------------------------------------------------------------------------------*/
public function __construct ( $filename = null, $options = self::PDFOPT_NONE, $user_password = false, $owner_password = false )
{
    // Builds the object and, when $filename is given, immediately loads that file :
    // - $filename       : optional PDF file to load ;
    // - $options        : combination of PDFOPT_* flags, stored in the Options property ;
    // - $user_password,
    //   $owner_password : passed as is to Load().
    // The first instantiation also performs the class-wide (static) initializations.

    // We need the mbstring PHP extension here...
    if (! function_exists('mb_convert_encoding')) {
        error("You must enable the mbstring PHP extension to use this class.");
    }

    // Perform static initializations if needed
    if (! self::$StaticInitialized) {
        // In debug mode, initialize the utf8 placeholder only if it is still set to its default
        // value, the empty string
        if (self::$DEBUG && self::$Utf8Placeholder == '') {
            self::$Utf8Placeholder = '[Unknown character : 0x%08X]';
        }

        // Build the list of regular expressions from the list of ignored instruction templates
        self::__build_ignored_instructions();

        // Check if some functions are supported or not
        self::$HasMemoryGetUsage     = function_exists('memory_get_usage');
        self::$HasMemoryGetPeakUsage = function_exists('memory_get_peak_usage');

        // Location of the directories containing CID fonts and font metrics
        self::$CIDTablesDirectory   = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'CIDTables';
        self::$FontMetricsDirectory = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'FontMetrics';

        // The string that contains all the Rtl character prefixes in UTF-8
        self::$RtlCharacterPrefixes = implode('', array_keys(self::$RtlCharacterPrefixLengths));

        // Build the character classes (used only for testing letters and digits)
        if (self::$CharacterClasses === false) {
            // Start from a real array : writing an index into the initial "false" value relies
            // on array auto-vivification, which is deprecated since PHP 8.1
            self::$CharacterClasses = array();

            for ($ord = 0; $ord < 256; $ord++) {
                $ch = chr($ord);

                if ($ch >= '0' && $ch <= '9') {
                    $class = self::CTYPE_DIGIT | self::CTYPE_XDIGIT | self::CTYPE_ALNUM;
                } elseif ($ch >= 'A' && $ch <= 'Z') {
                    $class = self::CTYPE_ALPHA | self::CTYPE_UPPER | self::CTYPE_ALNUM;

                    if ($ch <= 'F') {
                        $class |= self::CTYPE_XDIGIT;
                    }
                } elseif ($ch >= 'a' && $ch <= 'z') {
                    $class = self::CTYPE_ALPHA | self::CTYPE_LOWER | self::CTYPE_ALNUM;

                    if ($ch <= 'f') {
                        $class |= self::CTYPE_XDIGIT;
                    }
                } else {
                    $class = 0;
                }

                self::$CharacterClasses[$ch] = $class;
            }
        }

        // Global execution time limit
        self::$PhpMaxExecutionTime = (int) ini_get('max_execution_time');

        // Paranoia : default max script execution time to 120 seconds
        if (! self::$PhpMaxExecutionTime) {
            self::$PhpMaxExecutionTime = 120;
        }

        // Set the start of the first execution
        self::$GlobalExecutionStartTime = microtime(true);

        // A positive limit is absolute ; otherwise it is an offset added to the PHP limit
        if (self::$MaxGlobalExecutionTime > 0) {
            self::$AllowedGlobalExecutionTime = self::$MaxGlobalExecutionTime;
        } else {
            self::$AllowedGlobalExecutionTime = self::$PhpMaxExecutionTime + self::$MaxGlobalExecutionTime;
        }

        // Adjust in case of inconsistent values
        if (self::$AllowedGlobalExecutionTime < 0 || self::$AllowedGlobalExecutionTime > self::$PhpMaxExecutionTime) {
            self::$AllowedGlobalExecutionTime = self::$PhpMaxExecutionTime - 1;
        }

        self::$StaticInitialized = true;
    }

    parent::__construct();

    $this->Options = $options;

    if ($filename) {
        $this->Load($filename, $user_password, $owner_password);
    }
}
1602
1603
public function __tostring ( )
{
    // Converting the object to a string gives the contents of the Text property
    $text = $this->Text;

    return ($text);
}
1606
1607
1608 /**************************************************************************************************************
1609 **************************************************************************************************************
1610 **************************************************************************************************************
1611 ****** ******
1612 ****** ******
1613 ****** PUBLIC METHODS ******
1614 ****** ******
1615 ****** ******
1616 **************************************************************************************************************
1617 **************************************************************************************************************
1618 **************************************************************************************************************/
1619
1620 /*--------------------------------------------------------------------------------------------------------------
1621
1622 NAME
1623 Load - Loads text contents from a PDF file.
1624 LoadFromString - Loads PDF contents from a string.
1625
1626 PROTOTYPE
1627 $text = $pdf -> Load ( $filename, $user_password = false, $owner_password = false ) ;
1628 $text = $pdf -> LoadFromString ( $contents, $user_password = false, $owner_password = false ) ;
1629
1630 DESCRIPTION
1631 The Load() method extracts text contents from the specified PDF file. Once processed, text contents will
1632 be available through the "Text" property.
1633 The LoadFromString() method performs the same operation on PDF contents already loaded into memory.
1634
1635 PARAMETERS
1636 $filename (string) -
1637 PDF filename whose text contents are to be extracted.
1638
1639 $contents (string) -
1640 String containing PDF contents.
1641
1642 $user_password (string) -
1643 User password used for decrypting PDF contents.
1644
1645 $owner_password (string) -
1646 Owner password.
1647
1648 *-------------------------------------------------------------------------------------------------------------*/
1649 private $__memory_peak_usage_start,
1650 $__memory_usage_start ;
1651
public function Load ( $filename, $user_password = false, $owner_password = false )
{
    // Reads the contents of $filename, then hands them over to __load() together with both
    // passwords, and returns what __load() returns.
    // Errors are reported through error() with a PdfToTextDecodingException when a local file
    // does not exist or when the contents cannot be read.

    // Memory figures taken before any processing (0 when the PHP function is not available)
    $this->__memory_usage_start      = self::$HasMemoryGetUsage     ? memory_get_usage(true)      : 0;
    $this->__memory_peak_usage_start = self::$HasMemoryGetPeakUsage ? memory_get_peak_usage(true) : 0;

    // Check if the file exists, but only if the name does not start with a "scheme://" prefix
    if (! preg_match('#^ [^:]+ ://#ix', $filename) && ! file_exists($filename)) {
        error(new PdfToTextDecodingException("File \"$filename\" does not exist."));
    }

    // Load its contents. The former FILE_BINARY argument was dropped : it landed in the
    // $use_include_path parameter, had no effect there, and the constant is deprecated since
    // PHP 8.1. The warning is silenced on purpose, the failure being reported just below.
    $contents = @file_get_contents($filename);

    if ($contents === false) {
        error(new PdfToTextDecodingException("Unable to open \"$filename\"."));
    }

    return ($this->__load($filename, $contents, $user_password, $owner_password));
}
1669
1670
public function LoadFromString ( $contents, $user_password = false, $owner_password = false )
{
    // Same as Load(), for PDF contents that are already in memory : __load() receives an
    // empty filename, and its return value is returned to the caller.
    $usage_at_start = 0;
    $peak_at_start  = 0;

    // Memory figures taken before any processing (they stay at 0 when the PHP function is not available)
    if (self::$HasMemoryGetUsage) {
        $usage_at_start = memory_get_usage(true);
    }

    if (self::$HasMemoryGetPeakUsage) {
        $peak_at_start = memory_get_peak_usage(true);
    }

    $this->__memory_usage_start      = $usage_at_start;
    $this->__memory_peak_usage_start = $peak_at_start;

    $text = $this->__load('', $contents, $user_password, $owner_password);

    return ($text);
}
1678
1679
/*
 * __load -
 *	Does the actual job of analyzing PDF contents. The processing steps are :
 *	- Locate the "%PDF-x.y" signature and retrieve the PDF version
 *	- Reset every per-document property, then normalize the option flags
 *	- Collect all the "obj ... endobj" constructs, expanding object streams
 *	- Classify each object (font, character map, font map, form data, image, text...)
 *	- Map the collected text objects to pages and extract the text of the selected pages
 *	- Compute page locations, the Text property, memory usage and statistics
 *
 *	$filename is only used for error messages and for setting the Filename property ;
 *	$contents holds the PDF data. $user_password and $owner_password are not referenced
 *	in this method body.
 *
 *	Returns the extracted text (the value of the Text property).
 *	Decoding problems are reported by passing a PdfToTextDecodingException to error().
 */
private function __load ( $filename, $contents, $user_password = false, $owner_password = false )
{
	// Search for the start of the document ("%PDF-x.y")
	$start_offset = strpos ( $contents, '%PDF' ) ;

	if ( $start_offset === false )		// Not a pdf document !
		error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ;
	else					// May be a PDF document
		$this -> DocumentStartOffset = $start_offset ;

	// Check that this is a PDF file with a valid version number
	if ( ! preg_match ( '/ %PDF- (?P<version> \d+ (\. \d+)*) /ix', $contents, $match, 0, $start_offset ) )
		error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ;

	$this -> PdfVersion = $match [ 'version' ] ;

	// Initializations
	// NOTE(review): when called from LoadFromString(), $filename is an empty string ; check what realpath() returns
	// in that case and whether it is the intended value for the Filename property.
	$this -> Text = '' ;
	$this -> FontTable = new PdfTexterFontTable ( ) ;
	$this -> Filename = realpath ( $filename ) ;
	$this -> Pages = array ( ) ;
	$this -> Images = array ( ) ;
	$this -> ImageData = array ( ) ;
	$this -> ImageCount = 0 ;
	$this -> AutoSavedImageFiles = array ( ) ;
	$this -> PageMap = new PdfTexterPageMap ( ) ;
	$this -> PageLocations = array ( ) ;
	$this -> Author = '' ;
	$this -> CreatorApplication = '' ;
	$this -> ProducerApplication = '' ;
	$this -> CreationDate = '' ;
	$this -> ModificationDate = '' ;
	$this -> Title = '' ;
	$this -> Subject = '' ;
	$this -> Keywords = '' ;
	$this -> GotAuthorInformation = false ;
	$this -> ID = '' ;
	$this -> ID2 = '' ;
	$this -> EncryptionData = false ;
	$this -> EnhancedStatistics = ( ( $this -> Options & self::PDFOPT_ENHANCED_STATISTICS ) != 0 ) ;

	// Also reset cached information that may come from previous runs
	// NOTE(review): "FomDataDefinitions" looks like a misspelling of "FormDataDefinitions" - verify against the
	// property declaration before renaming anything.
	$this -> MapIdBuffer = array ( ) ;
	$this -> RtlCharacterBuffer = array ( ) ;
	$this -> CharacterMapBuffer = array ( ) ;
	$this -> FontObjectsBuffer = array ( ) ;
	$this -> FormData = array ( ) ;
	$this -> FormDataObjectNumbers = false ;
	$this -> FomDataDefinitions = array ( ) ;
	$this -> FormDataObjects = array ( ) ;
	$this -> CaptureDefinitions = false ;
	$this -> CaptureObject = false ;
	$this -> DocumentFragments = array ( ) ;

	// Enable the PDFOPT_BASIC_LAYOUT option if the PDFOPT_CAPTURE flag is specified
	if ( $this -> Options & self::PDFOPT_CAPTURE )
		$this -> Options |= self::PDFOPT_BASIC_LAYOUT ;

	// Enable the PDFOPT_BASIC_LAYOUT option if PDFOPT_DEBUG_SHOW_COORDINATES is specified
	if ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES )
		$this -> Options |= self::PDFOPT_BASIC_LAYOUT ;

	// Debug statistics
	$this -> Statistics = array
	(
		'TextSize'		=> 0,		// Total size of drawing instructions ("text" objects)
		'OptimizedTextSize'	=> 0,		// Optimized text size, with useless instructions removed
		'Distributions'		=> array	// Statistics about handled instructions distribution - Works only with the page layout option in debug mode
		(
			'operand'	=> 0,
			'Tm'		=> 0,
			'Td'		=> 0,
			'TD'		=> 0,
			"'"		=> 0,
			'TJ'		=> 0,
			'Tj'		=> 0,
			'Tf'		=> 0,
			'TL'		=> 0,
			'T*'		=> 0,
			'('		=> 0,
			'<'		=> 0,
			'['		=> 0,
			'cm'		=> 0,
			'BT'		=> 0,
			'template'	=> 0,
			'ignored'	=> 0,
			'space'		=> 0
		)
	) ;

	// Per-instance execution time limit
	$this -> ExecutionStartTime = microtime ( true ) ;

	if ( $this -> MaxExecutionTime > 0 )
		$this -> AllowedExecutionTime = $this -> MaxExecutionTime ;
	else
		$this -> AllowedExecutionTime = self::$PhpMaxExecutionTime + $this -> MaxExecutionTime ;

	// Adjust in case of inconsistent values
	if ( $this -> AllowedExecutionTime < 0 || $this -> AllowedExecutionTime > self::$PhpMaxExecutionTime )
		$this -> AllowedExecutionTime = self::$PhpMaxExecutionTime - 1 ;

	// Systematically set the DECODE_IMAGE_DATA flag if the AUTOSAVE_IMAGES flag has been specified
	if ( $this -> Options & self::PDFOPT_AUTOSAVE_IMAGES )
		$this -> Options |= self::PDFOPT_DECODE_IMAGE_DATA ;

	// Systematically set the GET_IMAGE_DATA flag if DECODE_IMAGE_DATA is specified (debug mode only)
	if ( self::$DEBUG && $this -> Options & self::PDFOPT_DECODE_IMAGE_DATA )
		$this -> Options |= self::PDFOPT_GET_IMAGE_DATA ;

	// Since page layout options take 2 bits, but not all of the 4 possible values are allowed, make sure that an invalid
	// value will default to PDFOPT_RAW_LAYOUT value.
	// Both comparisons must be "!==" : the former "! $layout_option === ..." form applied the negation first, then
	// compared a boolean with an integer, so the test never succeeded and invalid values were never corrected.
	$layout_option = $this -> Options & self::PDFOPT_LAYOUT_MASK ;

	if ( $layout_option !== self::PDFOPT_RAW_LAYOUT && $layout_option !== self::PDFOPT_BASIC_LAYOUT )
	{
		$layout_option = self::PDFOPT_RAW_LAYOUT ;
		$this -> Options = ( $this -> Options & ~self::PDFOPT_LAYOUT_MASK ) | self::PDFOPT_RAW_LAYOUT ;
	}

	// Page layout options need more instructions to be retained - select the appropriate list of useless instructions.
	// This is done once the layout option has been normalized, so that the list always agrees with the layout in use.
	if ( $this -> Options & self::PDFOPT_BASIC_LAYOUT )
		$this -> IgnoredInstructions = self::$IgnoredInstructionsLayout ;
	else
		$this -> IgnoredInstructions = self::$IgnoredInstructionsNoLayout ;

	// Author information needs to be processed after, because it may reference objects that occur later in the PDF stream
	$author_information_object_id = false ;

	// Extract pdf objects that are enclosed by the "obj" and "endobj" keywords
	$pdf_objects = array ( ) ;
	$contents_offset = $this -> DocumentStartOffset ;
	$contents_length = strlen ( $contents ) ;

	while ( $contents_offset < $contents_length &&
			preg_match ( '/(?P<re> (?P<object_id> \d+) \s+ \d+ \s+ obj (?P<object> .*?) endobj )/imsx', $contents, $match, PREG_OFFSET_CAPTURE, $contents_offset ) )
	{
		$object_number = $match [ 'object_id' ] [0] ;
		$object_data = $match [ 'object' ] [0] ;

		// Handle the special case of object streams (compound objects)
		// They are not added in the $pdf_objects array, because they could be mistakenly processed as relevant information,
		// such as font definitions, etc.
		// Instead, only the objects they are embedding are stored in this array.
		if ( $this -> IsObjectStream ( $object_data ) )
		{
			// Ignore ill-formed object streams
			if ( ( $object_stream_matches = $this -> DecodeObjectStream ( $object_number, $object_data ) ) !== false )
			{
				// Add this list of objects to the list of known objects
				for ( $j = 0, $object_stream_count = count ( $object_stream_matches [ 'object_id' ] ) ; $j < $object_stream_count ; $j ++ )
					$pdf_objects [ $object_stream_matches [ 'object_id' ] [$j] ] = $object_stream_matches [ 'object' ] [$j] ;
			}
		}
		// Normal (non-compound) object
		else
			$pdf_objects [ $object_number ] = $object_data ;

		// Update current offset through PDF contents
		$contents_offset = $match [ 're' ] [1] + strlen ( $match [ 're' ] [0] ) ;
	}

	// We pay particular attention to the errors reported by the last preg_match() call here, since we need to be really
	// sure why we stopped finding further PDF objects in the supplied contents.
	// The cases below have no "break" statement : they rely on error() not returning.
	$preg_error = preg_last_error ( ) ;

	switch ( $preg_error )
	{
		case PREG_NO_ERROR :
			break ;

		case PREG_INTERNAL_ERROR :
			error ( new PdfToTextDecodingException ( "PDF object extraction : the preg_match_all() function encountered an internal error." ) ) ;

		case PREG_BACKTRACK_LIMIT_ERROR :
			error ( new PdfToTextDecodingException ( "PDF object extraction : backtrack limit reached (you may have to modify the pcre.backtrack_limit " .
					"setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.backtrack_limit' ) . ")." ) ) ;

		case PREG_JIT_STACKLIMIT_ERROR :
			error ( new PdfToTextDecodingException ( "PDF object extraction : JIT stack limit reached (you may disable this feature by setting the pcre.jit " .
					"setting of your PHP.ini file to 0)." ) ) ;

		case PREG_RECURSION_LIMIT_ERROR :
			error ( new PdfToTextDecodingException ( "PDF object extraction : recursion limit reached (you may have to modify the pcre.recursion_limit " .
					"setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.recursion_limit' ) . ")." ) ) ;

		case PREG_BAD_UTF8_ERROR :
			error ( new PdfToTextDecodingException ( "PDF object extraction : bad UTF8 character encountered." ) ) ;

		case PREG_BAD_UTF8_OFFSET_ERROR :
			error ( new PdfToTextDecodingException ( "PDF object extraction : the specified offset does not start at the beginning of a valid UTF8 codepoint." ) ) ;

		default :
			error ( new PdfToTextDecodingException ( "PDF object extraction : unkown PREG error #$preg_error" ) ) ;
	}

	// Extract trailer information, which may contain the ID of an object specifying encryption flags
	$this -> GetTrailerInformation ( $contents, $pdf_objects ) ;
	unset ( $contents ) ;

	// Character maps encountered so far
	$cmaps = array ( ) ;

	// An array that will store object ids as keys and text contents as values
	$text = array ( ) ;

	// Loop through the objects
	foreach ( $pdf_objects as $object_number => $object_data )
	{
		// Some additional objects may be uncovered after processing (in an object containing compacted objects for example)
		// so add them to the list if necessary
		if ( ! isset ( $pdf_objects [ $object_number ] ) )
			$pdf_objects [ $object_number ] = $object_data ;

		// Try to catch information related to page mapping - but don't discard the object since it can contain additional information
		$this -> PageMap -> Peek ( $object_number, $object_data, $pdf_objects ) ;

		// Check if the object contains authoring information - it can appear encoded or unencoded
		if ( ! $this -> GotAuthorInformation )
			$author_information_object_id = $this -> PeekAuthorInformation ( $object_number, $object_data ) ;

		// Also catch the object encoding type
		$type = $this -> GetEncodingType ( $object_number, $object_data ) ;
		$stream_match = null ;

		if ( strpos ( $object_data, 'stream' ) === false ||
				! preg_match ( '#[^/] stream \s+ (?P<stream> .*?) endstream#imsx', $object_data, $stream_match ) )
		{
			// Some font definitions are in clear text in an object, some are encoded in a stream within the object
			// We process here the unencoded ones
			if ( $this -> IsFont ( $object_data ) )
			{
				$this -> FontTable -> Add ( $object_number, $object_data, $pdf_objects, $this -> AdobeExtraMappings ) ;
				continue ;
			}
			// Some character maps may also be in clear text
			else if ( $this -> IsCharacterMap ( $object_data ) )
			{
				$cmap = PdfTexterCharacterMap::CreateInstance ( $object_number, $object_data, $this -> AdobeExtraMappings ) ;

				if ( $cmap )
					$cmaps [] = $cmap ;

				continue ;
			}
			// Check if there is an association between font number and object number
			else if ( $this -> IsFontMap ( $object_data ) )
			{
				$this -> FontTable -> AddFontMap ( $object_number, $object_data ) ;
			}
			// Retrieve form data if present
			else if ( $this -> IsFormData ( $object_data ) )
			{
				$this -> RetrieveFormData ( $object_number, $object_data, $pdf_objects ) ;
			}
			// Ignore other objects that do not contain an encoded stream
			else
			{
				if ( self::$DEBUG > 1 )
					echo "\n----------------------------------- UNSTREAMED #$object_number\n$object_data" ;

				continue ;
			}
		}
		// Extract image data, if any
		else if ( $this -> IsImage ( $object_data ) )
		{
			$this -> AddImage ( $object_number, $stream_match [ 'stream' ], $type, $object_data ) ;
			continue ;
		}
		// Check if there is an association between font number and object number
		else if ( $this -> IsFontMap ( $object_data ) )
		{
			$this -> FontTable -> AddFontMap ( $object_number, $object_data ) ;

			if ( ! $stream_match )
				continue ;
		}

		// Check if the stream contains data (yes, I have found a sample that had streams of length 0...)
		// In other words : ignore empty streams
		if ( stripos ( $object_data, '/Length 0' ) !== false )
			continue ;

		// Isolate stream data and try to find its encoding type
		if ( isset ( $stream_match [ 'stream' ] ) )
			$stream_data = ltrim ( $stream_match [ 'stream' ], "\r\n" ) ;
		else
			continue ;

		// Ignore this stream if the object does not contain an encoding type (/FLATEDECODE, /ASCIIHEX or /ASCII85)
		if ( $type == self::PDF_UNKNOWN_ENCODING )
		{
			if ( self::$DEBUG > 1 )
				echo "\n----------------------------------- UNENCODED #$object_number :\n$object_data" ;

			continue ;
		}

		// Decode the encoded stream
		$decoded_stream_data = $this -> DecodeData ( $object_number, $stream_data, $type, $object_data ) ;

		// Second chance to peek author information, this time on a decoded stream data
		if ( ! $this -> GotAuthorInformation )
			$author_information_object_id = $this -> PeekAuthorInformation ( $object_number, $decoded_stream_data ) ;

		// Check for character maps
		if ( $this -> IsCharacterMap ( $decoded_stream_data ) )
		{
			$cmap = PdfTexterCharacterMap::CreateInstance ( $object_number, $decoded_stream_data, $this -> AdobeExtraMappings ) ;

			if ( $cmap )
				$cmaps [] = $cmap ;
		}
		// Font definitions
		else if ( $this -> IsFont ( $decoded_stream_data ) )
		{
			$this -> FontTable -> Add ( $object_number, $decoded_stream_data, $pdf_objects, $this -> AdobeExtraMappings ) ;
		}
		// Retrieve form data if present
		else if ( $this -> IsFormData ( $object_data ) )
		{
			$this -> RetrieveFormData ( $object_number, $decoded_stream_data, $pdf_objects ) ;
		}
		// Plain text (well, in fact PDF drawing instructions)
		else if ( $this -> IsText ( $object_data, $decoded_stream_data ) )
		{
			$text_data = false ;

			// Check if we need to ignore page headers and footers
			if ( $this -> Options & self::PDFOPT_IGNORE_HEADERS_AND_FOOTERS )
			{
				if ( ! $this -> IsPageHeaderOrFooter ( $decoded_stream_data ) )
				{
					$text [ $object_number ] =
					$text_data = $decoded_stream_data ;
				}
				// However, they may be mixed with actual text contents so we need to separate them...
				else
				{
					$this -> ExtractTextData ( $object_number, $decoded_stream_data, $remainder, $header, $footer ) ;

					// We still need to check again that the extracted text portion contains something useful
					if ( $this -> IsText ( $object_data, $remainder ) )
					{
						$text [ $object_number ] =
						$text_data = $remainder ;
					}
				}
			}
			else
			{
				$text [ $object_number ] =
				$text_data = $decoded_stream_data ;
			}

			// The current object may be a text object that have been defined as an XObject in some other object
			// In this case, we have to keep it since it may be referenced by a /TPLx construct from within
			// another text object
			if ( $text_data )
				$this -> PageMap -> AddTemplateObject ( $object_number, $text_data ) ;
		}
		// This may be here the opportunity to look into the $FormData property and replace object ids with their corresponding data
		else
		{
			$found = false ;

			foreach ( $this -> FormData as &$form_entry )
			{
				if ( is_integer ( $form_entry [ 'values' ] ) && $object_number == $form_entry [ 'values' ] )
				{
					$form_entry [ 'values' ] = $decoded_stream_data ;
					$found = true ;
				}
				else if ( is_integer ( $form_entry [ 'form' ] ) && $object_number == $form_entry [ 'form' ] )
				{
					$form_entry [ 'form' ] = $decoded_stream_data ;
					$found = true ;
				}
			}

			// Break the reference so that the last FormData entry cannot be modified by accident later on
			unset ( $form_entry ) ;

			if ( ! $found && self::$DEBUG > 1 )
				echo "\n----------------------------------- UNRECOGNIZED #$object_number :\n$decoded_stream_data\n" ;
		}
	}

	// Form data object numbers
	$this -> FormDataObjectNumbers = array_keys ( $this -> FormData ) ;

	// Associate character maps with declared fonts
	foreach ( $cmaps as $cmap )
		$this -> FontTable -> AddCharacterMap ( $cmap ) ;

	// Current font defaults to -1, which means : take the first available font as the current one.
	// Sometimes it may happen that text drawing instructions do not set a font at all (PdfPro for example)
	$current_font = -1 ;

	// Build the page catalog
	$this -> Pages = array ( ) ;
	$this -> PageMap -> MapObjects ( $text ) ;

	// Add font mappings local to each page
	$mapped_fonts = $this -> PageMap -> GetMappedFonts ( ) ;
	$this -> FontTable -> AddPageFontMap ( $mapped_fonts ) ;

	// Extract text from the collected text elements
	foreach ( $this -> PageMap -> Pages as $page_number => $page_objects )
	{
		// Checks if this page is selected
		if ( ! $this -> IsPageSelected ( $page_number ) )
			continue ;

		$this -> Pages [ $page_number ] = '' ;

		// Raw layout : the text of each object is simply appended to the page contents
		if ( $layout_option === self::PDFOPT_RAW_LAYOUT )
		{
			foreach ( $page_objects as $page_object )
			{
				if ( isset ( $text [ $page_object ] ) )
				{
					$new_text = $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ;
					$object_text = $this -> ExtractText ( $page_number, $page_object, $new_text, $current_font ) ;
					$this -> Pages [ $page_number ] .= $object_text ;
				}
				else if ( self::$DEBUG > 1 )
					echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ;
			}
		}
		// New style (basic) layout rendering
		else if ( $layout_option === self::PDFOPT_BASIC_LAYOUT )
		{
			$page_fragments = array ( ) ;

			foreach ( $page_objects as $page_object )
			{
				if ( isset ( $text [ $page_object ] ) )
				{
					$new_text = $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ;
					$this -> ExtractTextWithLayout ( $page_fragments, $page_number, $page_object, $new_text, $current_font ) ;
				}
				else if ( self::$DEBUG > 1 )
					echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ;
			}

			// $page_width and $page_height are not set before this call ; presumably __assemble_text_fragments()
			// fills them in as output parameters - TODO confirm against its declaration
			$this -> Pages [ $page_number ] = $this -> __assemble_text_fragments ( $page_number, $page_fragments, $page_width, $page_height ) ;

			// NOTE(review): the 'page-width' key uses a dash whereas 'page_height' uses an underscore ; both are kept
			// as is because code outside of this method may rely on these exact keys.
			$this -> DocumentFragments [ $page_number ] = array
			(
				'fragments'	=> $page_fragments,
				'page-width'	=> $page_width,
				'page_height'	=> $page_height
			) ;
		}
	}

	// Retrieve author information
	if ( $this -> GotAuthorInformation )
		$this -> RetrieveAuthorInformation ( $author_information_object_id, $pdf_objects ) ;

	// Build the page locations (ie, starting and ending byte offsets in the Text property)
	$offset = 0 ;
	$page_separator = utf8_encode ( $this -> PageSeparator ) ;
	$page_separator_length = strlen ( $page_separator ) ;

	foreach ( $this -> Pages as $page_number => &$page )
	{
		// If hyphenated words are unwanted, then remove them
		if ( $this -> Options & self::PDFOPT_NO_HYPHENATED_WORDS )
			$page = preg_replace ( self::$RemoveHyphensRegex, '$4$2', $page ) ;

		$length = strlen ( $page ) ;
		$this -> PageLocations [ $page_number ] = array ( 'start' => $offset, 'end' => $offset + $length - 1 ) ;
		$offset += $length + $page_separator_length ;
	}

	// Break the reference so that the last page cannot be modified by accident later on
	unset ( $page ) ;

	// And finally, the Text property
	$this -> Text = implode ( $page_separator, $this -> Pages ) ;

	// Free memory
	$this -> MapIdBuffer = array ( ) ;
	$this -> RtlCharacterBuffer = array ( ) ;
	$this -> CharacterMapBuffer = array ( ) ;

	// Compute memory occupied for this file
	$memory_usage_end = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ;
	$memory_peak_usage_end = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ;

	$this -> MemoryUsage = $memory_usage_end - $this -> __memory_usage_start ;
	$this -> MemoryPeakUsage = $memory_peak_usage_end - $this -> __memory_peak_usage_start ;

	// Adjust the "Distributions" statistics
	if ( $this -> Options & self::PDFOPT_ENHANCED_STATISTICS )
	{
		$instruction_count = 0 ;
		$statistics = array ( ) ;

		// Count the total number of instructions
		foreach ( $this -> Statistics [ 'Distributions' ] as $count )
			$instruction_count += $count ;

		// Now transform the Distributions entries into an associative array containing the instruction counts
		// ('count') and their relative percentage
		foreach ( $this -> Statistics [ 'Distributions' ] as $name => $count )
		{
			if ( $instruction_count )
				$percent = round ( ( 100.0 / $instruction_count ) * $count, 2 ) ;
			else
				$percent = 0 ;

			$statistics [ $name ] = array
			(
				'instruction'	=> $name,
				'count'		=> $count,
				'percent'	=> $percent
			) ;
		}

		// Set the new 'Distributions' array and sort it by instruction count in reverse order
		$this -> Statistics [ 'Distributions' ] = $statistics ;
		uksort ( $this -> Statistics [ 'Distributions' ], array ( $this, '__sort_distributions' ) ) ;
	}

	// All done, return
	return ( $this -> Text ) ;
}
2209
2210
/*
 * __sort_distributions -
 *	Key comparison callback handed to uksort() by __load() : $a and $b are keys of the
 *	'Distributions' statistics array. The result is positive when the entry for $b has the
 *	higher 'count', which puts the highest counts first.
 */
public function __sort_distributions ( $a, $b )
{
	$distributions = $this -> Statistics [ 'Distributions' ] ;
	$count_of_b = $distributions [$b] [ 'count' ] ;
	$count_of_a = $distributions [$a] [ 'count' ] ;

	return ( $count_of_b - $count_of_a ) ;
}
2213
2214
2215
2216 /*--------------------------------------------------------------------------------------------------------------
2217
2218 NAME
2219 AddAdobeExtraMappings - Adds extra mappings for standard Adobe fonts.
2220
2221 PROTOTYPE
2222 $pdf -> AddAdobeExtraMappings ( $mappings ) ;
2223
2224 DESCRIPTION
	Adobe supports 4 predefined fonts (Standard, Mac, WinAnsi and PDF). All the characters in these fonts
	are identified by a character name, a little bit like HTML entities ; for example, 'one' will be the
	character '1', 'acircumflex' will be 'â', etc.
2228 There are thousands of character names defined by Adobe (see https://mupdf.com/docs/browse/source/pdf/pdf-glyphlist.h.html).
2229 Some of them are not in this list ; this is the case for example of the 'ax' character names, where 'x'
2230 is a decimal number. When such a character is specified in a /Differences array, then there is somewhere
2231 a CharProc[] array giving an object id for each of those characters.
2232 The referenced object(s) in turn contain drawing instructions to draw the glyph. At no point you could
2233 guess what is the corresponding Unicode character for this glyph, since the information is not contained
2234 in the PDF file.
2235 The AddAdobeExtraMappings() method allows you to specify such correspondences. Specify an array as the
2236 $mappings parameter, whose keys are the Adobe character name (for example, "a127") and values the
2237 corresponding Unicode values (see the description of the $mappings parameter for more information).
2238
2239 PARAMETERS
2240 $mappings (associative array) -
2241 Associative array whose keys are Adobe character names. The array values can take several forms :
2242 - A character
2243 - An integer value
2244 - An array of up to four character or integer values.
2245 Internally, every specified value is converted to an array of four integer values, one for
2246 each of the standard Adobe character sets (Standard, Mac, WinAnsi and PDF). The following
2247 rules apply :
	- If the input value is a single character, the output array corresponding to the Adobe character
	  name will be a set of 4 elements, each one being the ordinal value of the supplied
	  character.
2251 - If the input value is an integer, the output array will be a set of 4 identical values
2252 - If the input value is an array :
	. Arrays with less than 4 elements will be padded, using the last array item for padding
2254 . Arrays with more than 4 elements will be silently truncated
2255 . Each array value can either be a character or a numeric value.
2256
2257 NOTES
2258 In this current implementation, the method applies the mappings to ALL Adobe default fonts. That is,
2259 you cannot have one mapping for one Adobe font referenced in the PDF file, then a second mapping for
2260 a second Adobe font, etc.
2261
2262 *-------------------------------------------------------------------------------------------------------------*/
/*
 * AddAdobeExtraMappings -
 *	Records extra character-name-to-code mappings in the AdobeExtraMappings property.
 *	$mappings is an associative array whose keys are Adobe character names ; each value is
 *	converted to an array of exactly 4 integers before being stored under its key :
 *	- A string gives 4 times the ordinal value of its first byte
 *	- A numeric value gives 4 times its integer value
 *	- An array is read in element order, whatever its keys are ; only the first 4 elements are
 *	  used, and the last one is repeated when there are less than 4
 *	Empty arrays and values of any other type are reported by passing a PdfToTextException
 *	to error().
 */
public function AddAdobeExtraMappings ( $mappings )
{
	// Loop through each mapping
	foreach ( $mappings as $key => $value )
	{
		// Character value : we retain its ordinal value as the 4 values of the output array
		if ( is_string ( $value ) )
		{
			$ord = ord ( $value ) ;
			$items = array ( $ord, $ord, $ord, $ord ) ;
		}
		// Numeric value : the output array will contain 4 times the supplied value
		else if ( is_numeric ( $value ) )
		{
			$value = ( int ) $value ;
			$items = array ( $value, $value, $value, $value ) ;
		}
		// Array value : make sure we will have an output array of 4 values
		else if ( is_array ( $value ) )
		{
			$items = array ( ) ;

			// Reindex the supplied array : accessing it by position would otherwise fail on arrays
			// having string keys or non-consecutive integer keys
			$codes = array_values ( $value ) ;

			// Collect the supplied values, converting characters to their ordinal values if necessary
			for ( $i = 0, $count = count ( $codes ) ; $i < $count && $i < 4 ; $i ++ )
			{
				$code = $codes [$i] ;

				if ( is_string ( $code ) )
					$items [] = ord ( $code ) ;
				else
					$items [] = ( int ) $code ;
			}

			// Ensure that we have 4 values ; fill the missing ones with the last seen value if necessary
			$count = count ( $items ) ;

			// The statements that follow rely on error() not returning
			if ( ! $count )
				error ( new PdfToTextException ( "Adobe extra mapping \"$key\" has no values." ) ) ;

			$last_value = $items [ $count - 1 ] ;

			for ( $i = $count ; $i < 4 ; $i ++ )
				$items [] = $last_value ;
		}
		else
			error ( new PdfToTextException ( "Invalid value \"$value\" for Adobe extra mapping \"$key\"." ) ) ;

		// Add this current mapping to the Adobe extra mappings array
		$this -> AdobeExtraMappings [ $key ] = $items ;
	}
}
2314
2315
2316 /*--------------------------------------------------------------------------------------------------------------
2317
2318 NAME
2319 GetPageFromOffset - Returns a page number from a text offset.
2320
2321 PROTOTYPE
	$page = $pdf -> GetPageFromOffset ( $offset ) ;
2323
2324 DESCRIPTION
2325 Given a byte offset in the Text property, returns its page number in the pdf document.
2326
2327 PARAMETERS
2328 $offset (integer) -
2329 Offset, in the Text property, whose page number is to be retrieved.
2330
2331 RETURN VALUE
2332 Returns a page number in the pdf document, or false if the specified offset does not exist.
2333
2334 *-------------------------------------------------------------------------------------------------------------*/
/*
 * GetPageFromOffset -
 *	Looks up the PageLocations property for the page whose 'start'..'end' range (both
 *	bounds included) contains $offset.
 *	Returns the matching page number, or false when $offset is false or when no page
 *	range contains it.
 */
public function GetPageFromOffset ( $offset )
{
	$found_page = false ;

	// A false offset typically comes from a failed search : there is nothing to look up
	if ( $offset !== false )
	{
		foreach ( $this -> PageLocations as $page_number => $bounds )
		{
			$starts_before = ( $offset >= $bounds [ 'start' ] ) ;

			if ( $starts_before && $offset <= $bounds [ 'end' ] )
			{
				$found_page = $page_number ;
				break ;
			}
		}
	}

	return ( $found_page ) ;
}
2348
2349
2350 /*--------------------------------------------------------------------------------------------------------------
2351
2352 NAME
2353 text_strpos, text_stripos - Search for an occurrence of a string.
2354
2355 PROTOTYPE
2356 $result = $pdf -> text_strpos ( $search, $start = 0 ) ;
2357 $result = $pdf -> text_stripos ( $search, $start = 0 ) ;
2358
2359 DESCRIPTION
2360 These methods behave as the strpos/stripos PHP functions, except that :
2361 - They operate on the text contents of the pdf file (Text property)
2362 - They return an array containing the page number and text offset. $result [0] will be set to the page
2363 number of the searched text, and $result [1] to its offset in the Text property
2364
2365 PARAMETERS
2366 $search (string) -
2367 String to be searched.
2368
2369 $start (integer) -
2370 Start offset in the pdf text contents.
2371
2372 RETURN VALUE
2373 Returns an array of two values containing the page number and text offset if the searched string has
2374 been found, or false otherwise.
2375
2376 *-------------------------------------------------------------------------------------------------------------*/
/*
 * text_strpos -
 *	Case-sensitive search for the first occurrence of $search in the Text property, starting
 *	at character offset $start.
 *	Returns false when the string is not found ; otherwise, returns an array whose item [0] is
 *	the page number given by GetPageFromOffset() and item [1] the character offset returned
 *	by mb_strpos().
 */
public function text_strpos ( $search, $start = 0 )
{
	$offset = mb_strpos ( $this -> Text, $search, $start, 'UTF-8' ) ;

	if ( $offset === false )
		return ( false ) ;

	// mb_strpos() returns a character offset, whereas page locations are byte offsets (they are built
	// using strlen()) : convert the offset before looking up the page, otherwise the wrong page could be
	// reported when multibyte characters precede the match
	$byte_offset = strlen ( mb_substr ( $this -> Text, 0, $offset, 'UTF-8' ) ) ;

	return ( array ( $this -> GetPageFromOffset ( $byte_offset ), $offset ) ) ;
}
2386
2387
/*
 * text_stripos -
 *	Case-insensitive search for the first occurrence of $search in the Text property, starting
 *	at character offset $start.
 *	Returns false when the string is not found ; otherwise, returns an array whose item [0] is
 *	the page number given by GetPageFromOffset() and item [1] the character offset returned
 *	by mb_stripos().
 */
public function text_stripos ( $search, $start = 0 )
{
	$offset = mb_stripos ( $this -> Text, $search, $start, 'UTF-8' ) ;

	if ( $offset === false )
		return ( false ) ;

	// mb_stripos() returns a character offset, whereas page locations are byte offsets (they are built
	// using strlen()) : convert the offset before looking up the page, otherwise the wrong page could be
	// reported when multibyte characters precede the match
	$byte_offset = strlen ( mb_substr ( $this -> Text, 0, $offset, 'UTF-8' ) ) ;

	return ( array ( $this -> GetPageFromOffset ( $byte_offset ), $offset ) ) ;
}
2397
2398
2399
2400
2401 /*--------------------------------------------------------------------------------------------------------------
2402
2403 NAME
2404 document_strpos, document_stripos - Search for all occurrences of a string.
2405
2406 PROTOTYPE
2407 $result = $pdf -> document_strpos ( $search, $group_by_page = false ) ;
2408 $result = $pdf -> document_stripos ( $search, $group_by_page = false ) ;
2409
2410 DESCRIPTION
2411 Searches for ALL occurrences of a given string in the pdf document. The value of the $group_by_page
2412 parameter determines how the results are returned :
2413 - When true, the returned value will be an associative array whose keys will be page numbers and values
2414 arrays of offset of the found string within the page
2415 - When false, the returned value will be an array of arrays containing two entries : the page number
2416 and the text offset.
2417
2418 For example, if a pdf document contains the string "here" at character offset 100 and 200 in page 1, and
2419 position 157 in page 3, the returned value will be :
2420 - When $group_by_page is false :
2421 [ [ 1, 100 ], [ 1, 200 ], [ 3, 157 ] ]
2422 - When $group_by_page is true :
2423 [ 1 => [ 100, 200 ], 3 => [ 157 ] ]
2424
2425 PARAMETERS
2426 $search (string) -
2427 String to be searched.
2428
2429 $group_by_page (boolean) -
2430 Indicates whether the found offsets should be grouped by page number or not.
2431
2432 RETURN VALUE
	Returns an array of page numbers/character offsets (see Description above), which is empty if the
	specified string does not appear in the document, or false if the searched string is empty.
2435
2436 *-------------------------------------------------------------------------------------------------------------*/
/*
 * document_strpos -
 *	Case-sensitive search for all the occurrences of $text in the Text property.
 *	Returns false when $text is empty. Otherwise returns an array (empty when nothing was found)
 *	which is :
 *	- a list of [ page number, character offset ] pairs when $group_by_page is false
 *	- an array of character offset lists indexed by page number when $group_by_page is true
 */
public function document_strpos ( $text, $group_by_page = false )
{
	// The length must be counted in characters, since it is added to the character offsets returned by
	// mb_strpos() ; a byte count would skip text after a match on a multibyte search string
	$length = mb_strlen ( $text, 'UTF-8' ) ;

	if ( ! $length )
		return ( false ) ;

	$result = array ( ) ;
	$index = 0 ;		// Character offset where the next search starts
	$last_index = 0 ;	// Character offset of the previous match
	$byte_index = 0 ;	// Byte offset that corresponds to $last_index

	while ( ( $index = mb_strpos ( $this -> Text, $text, $index, 'UTF-8' ) ) !== false )
	{
		// Page locations are byte offsets (they are built using strlen()) : keep track of the byte offset of
		// the current match by measuring the text located between the previous match and this one
		$byte_index += strlen ( mb_substr ( $this -> Text, $last_index, $index - $last_index, 'UTF-8' ) ) ;
		$last_index = $index ;

		$page = $this -> GetPageFromOffset ( $byte_index ) ;

		if ( $group_by_page )
			$result [ $page ] [] = $index ;
		else
			$result [] = array ( $page, $index ) ;

		$index += $length ;
	}

	return ( $result ) ;
}
2461
2462
/*
 * document_stripos -
 *	Case-insensitive search for all the occurrences of $text in the Text property.
 *	Returns false when $text is empty. Otherwise returns an array (empty when nothing was found)
 *	which is :
 *	- a list of [ page number, character offset ] pairs when $group_by_page is false
 *	- an array of character offset lists indexed by page number when $group_by_page is true
 */
public function document_stripos ( $text, $group_by_page = false )
{
	// The length must be counted in characters, since it is added to the character offsets returned by
	// mb_stripos() ; a byte count would skip text after a match on a multibyte search string
	$length = mb_strlen ( $text, 'UTF-8' ) ;

	if ( ! $length )
		return ( false ) ;

	$result = array ( ) ;
	$index = 0 ;		// Character offset where the next search starts
	$last_index = 0 ;	// Character offset of the previous match
	$byte_index = 0 ;	// Byte offset that corresponds to $last_index

	while ( ( $index = mb_stripos ( $this -> Text, $text, $index, 'UTF-8' ) ) !== false )
	{
		// Page locations are byte offsets (they are built using strlen()) : keep track of the byte offset of
		// the current match by measuring the text located between the previous match and this one
		$byte_index += strlen ( mb_substr ( $this -> Text, $last_index, $index - $last_index, 'UTF-8' ) ) ;
		$last_index = $index ;

		$page = $this -> GetPageFromOffset ( $byte_index ) ;

		if ( $group_by_page )
			$result [ $page ] [] = $index ;
		else
			$result [] = array ( $page, $index ) ;

		$index += $length ;
	}

	return ( $result ) ;
}
2487
2488
2489 /*--------------------------------------------------------------------------------------------------------------
2490
2491 NAME
2492 text_match, document_match - Search string using regular expressions.
2493
2494 PROTOTYPE
2495 $status = $pdf -> text_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ;
2496 $status = $pdf -> document_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ;
2497
2498 DESCRIPTION
2499 text_match() calls the preg_match() PHP function on the pdf text contents, to locate the first occurrence
2500 of text that matches the specified regular expression.
2501 document_match() calls the preg_match_all() function to locate all occurrences that match the specified
2502 regular expression.
2503 Note that both methods add the PREG_OFFSET_CAPTURE flag when calling preg_match/preg_match_all so you
2504 should be aware that all captured results are an array containing the following entries :
2505 - Item [0] is the captured string
2506 - Item [1] is its text offset
2507 - The text_match() and document_match() methods add an extra array item (index 2), which contains the
2508 page number where the matched text resides
2509
2510 PARAMETERS
2511 $pattern (string) -
2512 Regular expression to be searched.
2513
2514 $match (any) -
2515 Output captures. See preg_match/preg_match_all.
2516
2517 $flags (integer) -
2518 PCRE flags. See preg_match/preg_match_all.
2519
2520 $offset (integer) -
2521 Start offset. See preg_match/preg_match_all.
2522
2523 RETURN VALUE
2524 Returns the number of matched occurrences, or false if the specified regular expression is invalid.
2525
2526 *-------------------------------------------------------------------------------------------------------------*/
public function text_match($pattern, &$match = null, $flags = 0, $offset = 0)
{
    // PREG_OFFSET_CAPTURE is always added, so every capture is an array ( string, offset )
    $captures = null;
    $status = preg_match($pattern, $this->Text, $captures, $flags | PREG_OFFSET_CAPTURE, $offset);

    // $match is only written on success ; it is left untouched when nothing matched
    if ($status) {
        // Entries are updated through their key rather than through a by-reference loop variable,
        // so that no reference to the last capture survives the loop.
        foreach (array_keys($captures) as $key) {
            // Item [2] : page that contains the offset of the captured string
            $captures[$key][2] = $this->GetPageFromOffset($captures[$key][1]);
        }

        $match = $captures;
    }

    return ($status);
}
2542
2543
// document_match -
//	Runs preg_match_all() on the document text (PREG_OFFSET_CAPTURE is always added) and appends,
//	as item [2] of every captured entry, the page given by GetPageFromOffset() for its offset.
//	$matches is only written when at least one occurrence was found.
//	Returns the value returned by preg_match_all().
public function document_match($pattern, &$matches = null, $flags = 0, $offset = 0)
{
    $groups = null;
    $status = preg_match_all($pattern, $this->Text, $groups, $flags | PREG_OFFSET_CAPTURE, $offset);

    if ($status) {
        // Two levels of arrays are walked ; each innermost entry is an array ( string, offset ).
        // Entries are updated through their keys so that no by-reference loop variable is left
        // pointing to the last element once the loops are over.
        foreach ($groups as $group_key => $group) {
            foreach ($group as $entry_key => $entry) {
                $groups[$group_key][$entry_key][2] = $this->GetPageFromOffset($entry[1]);
            }
        }

        $matches = $groups;
    }

    return ($status);
}
2562
2563
2564 /*--------------------------------------------------------------------------------------------------------------
2565
2566 HasFormData -
		Returns true if the PDF file contains form data, false otherwise.
2568
2569 *-------------------------------------------------------------------------------------------------------------*/
public function HasFormData()
{
    // Form data is present as soon as the FormData property holds at least one entry
    $form_count = count($this->FormData);

    return ($form_count !== 0);
}
2574
2575
2576 /*--------------------------------------------------------------------------------------------------------------
2577
2578 GetFormCount -
2579 Returns the number of top-level forms contained in the PDF file.
2580
2581 *-------------------------------------------------------------------------------------------------------------*/
public function GetFormCount()
{
    // The form count is the number of entries held by the FormData property
    $form_count = count($this->FormData);

    return ($form_count);
}
2586
2587
2588 /*--------------------------------------------------------------------------------------------------------------
2589
2590 NAME
2591 GetFormData - Returns form data, if any
2592
2593 PROTOTYPE
2594 $object = $pdf -> GetFormData ( $template = null, $form_index = 0 ) ;
2595
2596 DESCRIPTION
2597 Retrieves form data if present.
2598
2599 PARAMETERS
2600 $template (string) -
2601 An XML file describing form data using human-readable names for field values.
2602 If not specified, the inline form definitions will be used, together with the field names
2603 specified in the PDF file.
2604
2605 $form_index (integer) -
2606 Form index in the PDF file. So far, I really don't know if a PDF file can have multiple forms.
2607
2608 RETURN VALUE
2609 An object derived from the PdfToTextFormData class.
2610
2611 *-------------------------------------------------------------------------------------------------------------*/
public function GetFormData($template = null, $form_index = 0)
{
    // Form data objects are built once per form index, then reused.
    // NOTE(review): a cached object is returned even when a different $template is supplied on a
    // later call - confirm this is intended.
    if (isset($this->FormDataObjects[$form_index])) {
        return ($this->FormDataObjects[$form_index]);
    }

    // Valid indexes range from 0 to count - 1 (an index equal to the count is out of range too)
    if ($form_index < 0 || $form_index >= count($this->FormDataObjectNumbers)) {
        error(new PdfToTextFormException("Invalid form index #$form_index."));
    }

    $form_data = $this->FormData[$this->FormDataObjectNumbers[$form_index]];

    // Field definitions come either from the supplied XML template or, when there is none,
    // from the inline form definition alone
    if ($template) {
        if (! file_exists($template)) {
            error(new PdfToTextFormException("Form data template file \"$template\" not found."));
        }

        $xml_data = file_get_contents($template);
        $definitions = new PdfToTextFormDefinitions($xml_data, $form_data['form']);
    } else {
        $definitions = new PdfToTextFormDefinitions(null, $form_data['form']);
    }

    // NOTE(review): $definitions is built from a single form but indexed by $form_index - verify
    // against PdfToTextFormDefinitions that this is the expected index.
    $object = $definitions[$form_index]->GetFormDataFromPdfObject($form_data['values']);

    // Store under the form index itself : the lookup at the top of this method uses that key, so
    // appending would file the object under the wrong index when forms are requested out of order
    $this->FormDataDefinitions[$form_index] = $definitions;
    $this->FormDataObjects[$form_index] = $object;

    return ($object);
}
2642
2643
2644 /*--------------------------------------------------------------------------------------------------------------
2645
2646 NAME
2647 MarkTextLike - Marks output text.
2648
2649 PROTOTYPE
2650 $pdf -> MarkTextLike ( $regex, $marker_start, $marker_end ) ;
2651
2652 DESCRIPTION
2653 Sometimes it may be convenient, when you want to extract only a portion of text, to say : "I want to
2654 extract text between this title and this title". The MarkTextLike() method provides some support for
2655 such a task. Imagine you have documents that have the same structure, all starting with an "Introduction"
2656 title :
2657
2658 Introduction
2659 ...
2660 some text
2661 ...
2662 Some other title
2663 ...
2664
2665 By calling the MarkTextLike() method such as in the example below :
2666
			$pdf -> MarkTextLike ( '/\bIntroduction\b/', '<M>', '</M>' ) ;
2668
2669 then you will get as output :
2670
2671 <M>Introduction</M>
2672 ...
2673 some text
2674 ...
2675 <M>Some other title</M>
2676
2677 Adding such markers in the output will allow you to easily extract the text between the chapters
2678 "Introduction" and "Some other title", using a regular expression.
2679
2680 The font name used for the first string matched by the specified regular expression will be searched
2681 later to add markers around all the text portions using this font.
2682
2683
2684 PARAMETERS
2685 $regex (string) -
2686 A regular expression to match the text to be matched. Subsequent portions of text using the
2687 same font will be surrounded by the marker start/end strings.
2688
2689 $marker_start, $marker_end (string) -
2690 Markers to surround the string when a match is found.
2691
2692 *-------------------------------------------------------------------------------------------------------------*/
public function MarkTextLike($regex, $marker_start, $marker_end)
{
    // The request is only recorded here, in the 'font' list of unprocessed markers
    $marker = array(
        'regex' => $regex,
        'start' => $marker_start,
        'end'   => $marker_end
    );

    $this->UnprocessedMarkerList['font'][] = $marker;
}
2702
2703
2704 /*--------------------------------------------------------------------------------------------------------------
2705
2706 NAME
2707 SetCaptures, SetCapturesFromString - Defines document parts to be captured.
2708
2709 PROTOTYPE
2710 $pdf -> SetCaptures ( $xml_file ) ;
2711 $pdf -> SetCapturesFromString ( $xml_data ) ;
2712
2713 DESCRIPTION
2714 Defines document parts to be captured.
2715 SetCaptures() takes the definitions for the areas to be captured from an XML file, while
2716 SetCapturesFromString() takes them from a string representing xml capture definitions.
2717
2718 NOTES
2719 - See file README.md for an explanation on the format of the XML capture definition file.
2720 - The SetCaptures() methods must be called before the Load() method.
2721
2722 *-------------------------------------------------------------------------------------------------------------*/
public function SetCaptures($xml_file)
{
    // A missing definition file is reported through error(), which is defined outside this method
    $file_found = file_exists($xml_file);

    if (! $file_found) {
        error(new PdfToTextException("File \"$xml_file\" does not exist."));
    }

    // The file contents are handled exactly like definitions supplied as a string
    $this->SetCapturesFromString(file_get_contents($xml_file));
}
2733
2734
// SetCapturesFromString -
//	Defines the document parts to be captured from a string holding xml capture definitions.
//	The PDFOPT_BASIC_LAYOUT option is turned on as a side effect.
public function SetCapturesFromString($xml_data)
{
    // Capture areas require the PDFOPT_BASIC_LAYOUT option
    $this->Options = $this->Options | self::PDFOPT_BASIC_LAYOUT;

    $definitions = new PdfToTextCaptureDefinitions($xml_data);
    $this->CaptureDefinitions = $definitions;
}
2742
2743
2744 /*--------------------------------------------------------------------------------------------------------------
2745
2746 NAME
2747 GetCaptures - Returns captured data.
2748
2749 PROTOTYPE
2750 $object = $pdf -> GetCaptures ( $full = false ) ;
2751
2752 PARAMETERS
2753 $full (boolean) -
			When true, the whole captures, together with their definitions, are returned. When false,
2755 only a basic object containing the capture names and their values is returned.
2756
2757 DESCRIPTION
2758 Returns the object that contains captured data.
2759
2760 RETURN VALUE
2761 An object of type PdfToTextCaptures, or false if an error occurred.
2762
2763 *-------------------------------------------------------------------------------------------------------------*/
public function GetCaptures($full = false)
{
    // The capture object is built on first use only
    if (! $this->CaptureObject) {
        // Without capture definitions there is nothing to build the capture object from :
        // report the failure with false instead of calling a method on an empty value
        if (! $this->CaptureDefinitions) {
            return (false);
        }

        $this->CaptureDefinitions->SetPageCount(count($this->Pages));
        $this->CaptureObject = $this->CaptureDefinitions->GetCapturedObject($this->DocumentFragments);
    }

    // $full : return the capture object itself ; otherwise return what its ToCaptures() method gives
    if ($full) {
        return ($this->CaptureObject);
    }

    return ($this->CaptureObject->ToCaptures());
}
2777
2778
2779 /**************************************************************************************************************
2780 **************************************************************************************************************
2781 **************************************************************************************************************
2782 ****** ******
2783 ****** ******
2784 ****** INTERNAL METHODS ******
2785 ****** ******
2786 ****** ******
2787 **************************************************************************************************************
2788 **************************************************************************************************************
2789 **************************************************************************************************************/
2790
2791 /*--------------------------------------------------------------------------------------------------------------
2792
2793 NAME
2794 AddImage - Adds an image from the PDF stream to the current object.
2795
2796 PROTOTYPE
2797 $this -> AddImage ( $object_id, $stream_data, $type, $object_data ) ;
2798
2799 DESCRIPTION
2800 Adds an image from the PDF stream to the current object.
2801 If the PDFOPT_GET_IMAGE_DATA flag is enabled, image data will be added to the ImageData property.
2802 If the PDFOPT_DECODE_IMAGE_DATA flag is enabled, a jpeg resource will be created and added into the
2803 Images array property.
2804
2805 PARAMETERS
2806 $object_id (integer) -
2807 Pdf object id.
2808
2809 $stream_data (string) -
2810 Contents of the unprocessed stream data containing the image.
2811
2812 $type (integer) -
2813 One of the PdfToText::PDF_*_ENCODING constants.
2814
2815 *-------------------------------------------------------------------------------------------------------------*/
protected function AddImage($object_id, $stream_data, $type, $object_data)
{
    // Raw image data is kept only when debugging is active AND the PDFOPT_GET_IMAGE_DATA flag is
    // set, and only for DCT-encoded (jpeg) streams. The ImageData property is overwritten.
    $keep_raw_data = (self::$DEBUG && $this->Options & self::PDFOPT_GET_IMAGE_DATA);

    if ($keep_raw_data && $type == self::PDF_DCT_ENCODING) {
        $this->ImageData = array('type' => 'jpeg', 'data' => $stream_data);
    }

    // Nothing more to do unless image decoding has been requested...
    if (! ($this->Options & self::PDFOPT_DECODE_IMAGE_DATA)) {
        return;
    }

    // ... and the optional limit on the number of extracted images has not been reached
    if ($this->MaxExtractedImages && ! ($this->ImageCount < $this->MaxExtractedImages)) {
        return;
    }

    $image = $this->DecodeImage(
        $object_id,
        $stream_data,
        $type,
        $object_data,
        $this->Options & self::PDFOPT_AUTOSAVE_IMAGES
    );

    // Images that could not be decoded are not counted
    if ($image === false) {
        return;
    }

    $this->ImageCount++;

    // Without the PDFOPT_AUTOSAVE_IMAGES flag, the image object simply stays in memory
    if (! ($this->Options & self::PDFOPT_AUTOSAVE_IMAGES)) {
        $this->Images[] = $image;

        return;
    }

    // With the flag set, the image is saved to a generated filename, released, and only the
    // name of the output file is remembered
    $output_filename = $this->__get_output_image_filename();

    $image->SaveAs($output_filename, $this->ImageAutoSaveFormat);
    unset($image);

    $this->AutoSavedImageFiles[] = $output_filename;
}
2857
2858
2859 /*--------------------------------------------------------------------------------------------------------------
2860
2861 NAME
2862 DecodeData - Decodes stream data.
2863
2864 PROTOTYPE
		$data = $this -> DecodeData ( $object_id, $stream_data, $type, $object_data ) ;
2866
2867 DESCRIPTION
		Decodes stream data (binary data located between the "stream" and "endstream" directives) according to the
2869 specified encoding type, given in the surrounding object parameters.
2870
2871 PARAMETERS
2872 $object_id (integer) -
2873 Id of the object containing the data.
2874
2875 $stream_data (string) -
2876 Contents of the binary stream.
2877
2878 $type (integer) -
2879 One of the PDF_*_ENCODING constants, as returned by the GetEncodingType() method.
2880
2881 RETURN VALUE
2882 Returns the decoded stream data.
2883
2884 *-------------------------------------------------------------------------------------------------------------*/
protected function DecodeData($object_id, $stream_data, $type, $object_data)
{
    // Every supported encoding type returns from its own case ; an encoding type that is not
    // listed yields an empty string. $object_data is not used by this method.
    switch ($type) {
        case self::PDF_FLATE_ENCODING:
            // Objects in password-protected Pdf files SHOULD be encrypted but plain, unencrypted
            // ones may be met too : inflating is always tried first, decryption comes second
            $inflated = @gzuncompress($stream_data);

            if ($inflated !== false) {
                return ($inflated);
            }

            if (! $this->IsEncrypted) {
                if (self::$DEBUG > 1) {
                    warning(new PdfToTextDecodingException("Invalid gzip data.", $object_id));
                }

                return (false);
            }

            $decrypted = $this->EncryptionData->Decrypt($object_id, $stream_data);

            if ($decrypted === false && self::$DEBUG > 1) {
                warning(new PdfToTextDecodingException("Unable to decrypt object contents.", $object_id));
            }

            return ($decrypted);

        case self::PDF_LZW_ENCODING:
            return ($this->__decode_lzw($stream_data));

        case self::PDF_ASCIIHEX_ENCODING:
            return ($this->__decode_ascii_hex($stream_data));

        case self::PDF_ASCII85_ENCODING:
            $decoded = $this->__decode_ascii_85($stream_data);

            if ($decoded === false) {
                return ($decoded);
            }

            // Dumb check : the decoded data may itself be gzipped, even though the object flags
            // should normally also specify /FlateDecode in that case
            $inflated = @gzuncompress($decoded);

            return (($inflated !== false) ? $inflated : $decoded);

        case self::PDF_TEXT_ENCODING:
            // Plain text : nothing to decode
            return ($stream_data);
    }

    return ('');
}
2939
2940
2941 // __decode_lzw -
2942 // Decoding function for LZW encrypted data. This function is largely inspired by the TCPDF one but has been rewritten
2943 // for a performance gain of 30-35%.
private function __decode_lzw($data)
{
    // Returns the decoded string. Codes 0..255 stand for the byte of the same value, 256 is the
    // clear-table marker and 257 the end-of-data marker ; new dictionary entries start at 258.

    // Initial dictionary : 256 entries, each index mapping to its one-character string
    static $InitialDictionary = null;

    // Code widths - when the next free dictionary index reaches one of these keys, the following
    // codes are read using the corresponding number of bits
    static $DictionaryLengths = array(
        511  => 10,
        1023 => 11,
        2047 => 12
    );

    if ($InitialDictionary === null) {
        $InitialDictionary = array_map('chr', range(0, 255));
    }

    // Decoded string to be returned
    $result = '';

    // Convert the input to a string of '0' and '1' characters, 8 per input byte
    $bit_string = '';
    $byte_count = strlen($data);

    for ($i = 0; $i < $byte_count; $i++) {
        $bit_string .= sprintf('%08b', ord($data[$i]));
    }

    $bit_count = $byte_count * 8;

    // Initialize dictionary
    $bit_length = 9;
    $dictionary_index = 258;
    $dictionary = $InitialDictionary;

    // Previous code. Starting in the "table just cleared" state makes the very first data code
    // a plain first entry, even when the stream does not open with a clear-table marker.
    $previous_index = 256;

    // Start index in bit string
    $start_index = 0;

    // Read $bit_length bits at a time. Only complete codes are read, so the padding bits that
    // may follow the last code are never mistaken for a code when the EOD marker is missing.
    while ($start_index + $bit_length <= $bit_count) {
        $index = bindec(substr($bit_string, $start_index, $bit_length));

        // EOD marker
        if ($index === 257) {
            break;
        }

        // Move to next bit position
        $start_index += $bit_length;

        // Clear-table marker : reset dictionary and bit length
        if ($index === 256) {
            $bit_length = 9;
            $dictionary_index = 258;
            $previous_index = 256;
            $dictionary = $InitialDictionary;

            continue;
        }

        // First code after a clear : output it, nothing is added to the dictionary yet
        if ($previous_index === 256) {
            // Corrupted data : the first code can only be a single byte
            if (! isset($dictionary[$index])) {
                break;
            }

            $result .= $dictionary[$index];
            $previous_index = $index;

            continue;
        }

        if ($index < $dictionary_index) {
            // Known code : output its string ; the new entry is the previous string followed by
            // the first character of the current one
            $result .= $dictionary[$index];
            $dictionary_value = $dictionary[$previous_index] . $dictionary[$index][0];
            $previous_index = $index;
        } else {
            // Code not in the dictionary yet : it stands for the entry that is about to be
            // created, i.e. the previous string followed by its own first character
            $dictionary_value = $dictionary[$previous_index] . $dictionary[$previous_index][0];
            $result .= $dictionary_value;

            // The code just decoded becomes the previous code, as in the other branch. Without
            // this, every entry created afterwards is built from a stale string.
            $previous_index = $dictionary_index;
        }

        // Update dictionary
        $dictionary[$dictionary_index++] = $dictionary_value;

        // Change bit length whenever we reach an index limit
        if (isset($DictionaryLengths[$dictionary_index])) {
            $bit_length = $DictionaryLengths[$dictionary_index];
        }
    }

    // All done, return
    return ($result);
}
3049
3050
3051 // __decode_ascii_hex -
3052 // Decoder for /AsciiHexDecode streams.
private function __decode_ascii_hex($input)
{
    // Returns the decoded bytes, or an empty string when the data holds a character that is not
    // a hexadecimal digit or when the '>' end-of-data marker is missing.

    // Position of a digit in this string (case-insensitive search) = value of the digit
    static $hex_digits = '0123456789ABCDEF';

    $input_length = strlen($input);
    $output = '';
    $high_nibble = false;		// High-order digit waiting for its low-order digit, if any
    $is_comment = false;
    $found_eod = false;

    for ($i = 0; $i < $input_length; $i++) {
        $c = $input[$i];

        // End-of-data marker. It is checked before anything else, comments included.
        if ($c === '>') {
            $found_eod = true;
            break;
        }

        // A comment runs until the end of the line
        if ($is_comment) {
            if ($c === "\r" || $c === "\n") {
                $is_comment = false;
            }

            continue;
        }

        // Whitespace is ignored. Double quotes are required here : within single quotes, '\n'
        // is a backslash followed by an 'n', which never equals a single character.
        if ($c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ') {
            continue;
        }

        if ($c === '%') {
            $is_comment = true;
            continue;
        }

        $nibble = stripos($hex_digits, $c);

        // Not a hexadecimal digit : corrupted data
        if ($nibble === false) {
            return ('');
        }

        if ($high_nibble === false) {
            $high_nibble = $nibble;
        } else {
            $output .= chr(($high_nibble << 4) | $nibble);
            $high_nibble = false;
        }
    }

    // The data must be terminated by '>'
    if (! $found_eod) {
        return ('');
    }

    // Odd number of digits : the missing last digit counts as 0
    if ($high_nibble !== false) {
        $output .= chr($high_nibble << 4);
    }

    return ($output);
}
3109
3110
3111 // __decode_ascii_85 -
3112 // Decoder for /Ascii85Decode streams.
private function __decode_ascii_85($data)
{
    // Returns the decoded bytes, or false when the data is empty or holds a character that is
    // neither part of the Ascii85 alphabet nor whitespace.

    // Ordinal values of the first ('!') and last ('u') characters used in Ascii85 encoding
    static $first_ord = 33;
    static $last_ord = 117;
    // A 'z' in the input data means "sequence of 4 nuls"
    static $z_exception = "\0\0\0\0";
    // Powers of 85, from 4 to 0
    static $exp85 = array(52200625, 614125, 7225, 85, 1);

    // Ignore empty data
    if ($data === '') {
        return (false);
    }

    $data_length = strlen($data);
    $ords = array();
    $ord_count = 0;
    $result = '';

    // Ascii85 data may start with '<~' : start past this construct if present.
    // strncmp() does not read beyond the end of a one-character input.
    $start = (strncmp($data, '<~', 2) === 0) ? 2 : 0;

    // Loop through input characters, until the '~' of the '~>' end marker
    for ($i = $start; $i < $data_length && $data[$i] !== '~'; $i++) {
        $ch = $data[$i];
        $code = ord($ch);

        if ($code >= $first_ord && $code <= $last_ord) {
            // Most common case : current character is in the range of the Ascii85 encoding
            $ords[$ord_count++] = $code - $first_ord;
        } elseif ($ch === 'z' && ! $ord_count) {
            // 'z' is replaced with a sequence of null bytes ; only accepted between two groups
            $result .= $z_exception;
        } elseif ($ch === "\0" || $ch === "\t" || $ch === ' ' || $ch === "\r" || $ch === "\n" || $ch === "\f") {
            // Spaces are ignored
            continue;
        } else {
            // Other characters : corrupted data...
            return (false);
        }

        // 5 base-85 digits collected : convert their 32-bit value to 4 bytes
        if ($ord_count == 5) {
            $ord_count = 0;
            $sum = 0;

            for ($j = 0; $j < 5; $j++) {
                $sum = ($sum * 85) + $ords[$j];
            }

            for ($j = 3; $j >= 0; $j--) {
                $result .= chr(($sum >> ($j * 8)) & 0xFF);
            }
        }
    }

    // Incomplete last group : n digits stand for n - 1 bytes. The last digit is rounded up by
    // one before conversion, then only the n - 1 high-order bytes are kept.
    if ($ord_count) {
        $sum = 0;

        for ($i = 0; $i < $ord_count; $i++) {
            $digit = $ords[$i];

            if ($i == $ord_count - 1) {
                $digit++;
            }

            $sum += $digit * $exp85[$i];
        }

        for ($i = 0; $i < $ord_count - 1; $i++) {
            $result .= chr(($sum >> ((3 - $i) * 8)) & 0xFF);
        }
    }

    // All done, return
    return ($result);
}
3182
3183
3184 /*--------------------------------------------------------------------------------------------------------------
3185
3186 NAME
3187 DecodeImage - Returns decoded image contents.
3188
3189 PROTOTYPE
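		$image = $this -> DecodeImage ( $object_id, $stream_data, $type, $object_data, $autosave ) ;

	    DESCRIPTION
		Builds an image object from the contents of an image stream, according to the encoding type of the stream.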
3194
3195 PARAMETERS
3196 $object_id (integer) -
3197 Pdf object number.
3198
3199 $stream_data (string) -
3200 Object data.
3201
3202 $type (integer) -
3203 One of the PdfToText::PDF_*_ENCODING constants.
3204
3205 $autosave (boolean) -
3206 When autosave is selected, images will not be decoded into memory unless they have a format
3207 different from JPEG. This is intended to save memory.
3208
3209 RETURN VALUE
		Returns an image object (PdfJpegImage, PdfFaxImage, or the object built by PdfInlinedImage::CreateInstance()), or false if the image could not be created or its encoding type is not currently supported.
3211
3212 *-------------------------------------------------------------------------------------------------------------*/
protected function DecodeImage($object_id, $stream_data, $type, $object_data, $autosave)
{
    // One image class per supported encoding type ; $object_id is not used by this method

    // Normal JPEG image
    if ($type == self::PDF_DCT_ENCODING) {
        return (new PdfJpegImage($stream_data, $autosave));
    }

    // CCITT fax image
    if ($type == self::PDF_CCITT_FAX_ENCODING) {
        return (new PdfFaxImage($stream_data));
    }

    // Image data in an inflated stream : PdfInlinedImage::CreateInstance() decides whether
    // an image can be built from it
    if ($type == self::PDF_FLATE_ENCODING) {
        $image = PdfInlinedImage::CreateInstance($stream_data, $object_data, $autosave);

        if ($image) {
            return ($image);
        }
    }

    // Unsupported encoding type, or no image could be built
    return (false);
}
3241
3242
3243 /*--------------------------------------------------------------------------------------------------------------
3244
3245 NAME
3246 DecodeObjectStream - Decodes an object stream.
3247
3248 PROTOTYPE
3249 $array = $this -> DecodeObjectStream ( $object_id, $object_data ) ;
3250
3251 DESCRIPTION
3252 Decodes an object stream. An object stream is yet another PDF object type that contains itself several
3253 objects not defined using the "x y obj ... endobj" syntax.
3254 As far as I understood, object streams data is contained within stream/endstream delimiters, and is
3255 gzipped.
3256 Object streams start with a set of object id/offset pairs separated by a space ; catenated object data
3257 immediately follows the last space ; for example :
3258
3259 1167 0 1168 114 <</DA(/Helv 0 Tf 0 g )/DR<</Encoding<</PDFDocEncoding 1096 0 R>>/Font<</Helv 1094 0 R/ZaDb 1095 0 R>>>>/Fields[]>>[/ICCBased 1156 0 R]
3260
3261 The above example specifies two objects :
3262 . Object #1167, which starts at offset 0 and ends before the second object, at offset #113 in
3263 the data. The contents are :
3264 <</DA(/Helv 0 Tf 0 g )/DR<</Encoding<</PDFDocEncoding 1096 0 R>>/Font<</Helv 1094 0 R/ZaDb 1095 0 R>>>>/Fields[]>>
3265 . Object #1168, which starts at offset #114 and continues until the end of the object stream.
3266 It contains the following data :
3267 [/ICCBased 1156 0 R]
3268
3269 PARAMETERS
3270 $object_id (integer) -
3271 Pdf object number.
3272
3273 $object_data (string) -
3274 Object data.
3275
3276 RETURN VALUE
3277 Returns false if any error occurred (mainly for syntax reasons).
3278 Otherwise, returns an associative array containing the following elements :
3279 - object_id :
3280 Array of all the object ids contained in the object stream.
3281 - object :
3282 Array of corresponding object data.
3283
3284 The reason for this format is that it is identical to the array returned by the preg_match() function
		used in the Load() method for finding objects in a PDF file (ie, a regex that matches "x y obj/endobj"
3286 constructs).
3287
3288 *-------------------------------------------------------------------------------------------------------------*/
protected function DecodeObjectStream($object_id, $object_data)
{
    // Returns array ( 'object_id' => ids, 'object' => contents ), or false when the object has no
    // stream or when its decoded data does not start with integer values.
    // NOTE(review): the /N and /First entries of the object stream dictionary are not consulted ;
    // the leading run of integers is taken as the id/offset table. A first embedded object that
    // is itself a bare integer would be absorbed into that table - confirm whether this matters.

    // Stay prepared to find one day a sample declared as an object stream but not having data
    // delimited by stream/endstream tags
    if (! preg_match('#[^/] stream ( (\r? \n) | \r ) (?P<stream> .*?) endstream#imsx', $object_data, $stream_match)) {
        if (self::$DEBUG > 1) {
            error(new PdfToTextDecodingException("Found object stream without gzipped data", $object_id));
        }

        return (false);
    }

    // Decode the stream data of this object
    $stream_data = $stream_match['stream'];
    $type = $this->GetEncodingType($object_id, $object_data);
    $decoded_data = $this->DecodeData($object_id, $stream_data, $type, $object_data);

    if (self::$DEBUG > 1) {
        echo "\n----------------------------------- OBJSTREAM #$object_id\n$decoded_data";
    }

    // Object streams data start with a series of object id/offset pairs.
    // Note : on Windows platforms, the default stack size is 1Mb. The following regular expression
    // will make Apache crash in most cases, so you have to enable the following lines in your
    // http.ini file to set a stack size of 8Mb, as for Unix systems :
    //	Include conf/extra/httpd-mpm.conf
    //	ThreadStackSize 8388608
    if (! preg_match('/^ \s* (?P<series> (\d+ \s* )+ )/x', $decoded_data, $series_match)) {
        if (self::$DEBUG > 1) {
            error(new PdfToTextDecodingException("Object stream does not start with integer object id/offset pairs.", $object_id));
        }

        return (false);
    }

    // Extract the series of object id/offset pairs and the stream object data.
    // The object data starts after the WHOLE match : the expression also consumes the whitespace
    // that may precede the first integer, and that whitespace is not part of the 'series' group.
    // Skipping only the length of 'series' would shift every object by that amount.
    $series = explode(' ', rtrim(preg_replace('/\s+/', ' ', $series_match['series'])));
    $data = substr($decoded_data, strlen($series_match[0]));

    // $series should contain an even number of values
    if (count($series) % 2) {
        if (self::$DEBUG) {
            warning(new PdfToTextDecodingException("Object stream should start with an even number of integer values.", $object_id));
        }

        array_pop($series);
    }

    // Extract every individual object
    $objects = array('object_id' => array(), 'object' => array());

    for ($i = 0, $count = count($series); $i < $count; $i += 2) {
        // A dedicated variable is used so that the $object_id parameter is not overwritten
        $embedded_object_id = (int) $series[$i];
        $offset = (int) $series[$i + 1];

        if (isset($series[$i + 3])) {
            // There is a "next" object : this one stops right before the offset of the next one
            $object_contents = substr($data, $offset, (int) $series[$i + 3] - $offset);
        } else {
            // Last object : extract everything until the end
            $object_contents = substr($data, $offset);
        }

        $objects['object_id'][] = $embedded_object_id;
        $objects['object'][] = $object_contents;
    }

    return ($objects);
}
3358
3359
3360 /*--------------------------------------------------------------------------------------------------------------
3361
3362 NAME
3363 ExtractTextData - Extracts text, header & footer information from a text object.
3364
3365 PROTOTYPE
3366 $this -> ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer ) ;
3367
3368 DESCRIPTION
3369 Extracts text, header & footer information from a text object. The extracted text contents will be
3370 stripped from any header/footer information.
3371
3372 PARAMETERS
3373 $text (string) -
3374 Variable that will receive text contents.
3375
3376 $header, $footer (string) -
3377 Variables that will receive header and footer information.
3378
3379 *-------------------------------------------------------------------------------------------------------------*/
protected function ExtractTextData($object_id, $stream_contents, &$text, &$header, &$footer)
{
    // A header or footer is normally introduced by a construct such as :
    //	<< /Type /Pagination ... [/Bottom] ... >>	(or [/Top])
    // followed by text-drawing instructions enclosed between the BDC and EMC instructions.
    // A first version of the expression used ".*?" between "<<" and ">>" ; it captured too much.
    // In the following example :
    //	<</MCID 0>> ...(several kb of drawing instructions)... << ... [/Bottom] ... >> BDC (footer instructions) EMC
    // everything was captured, from the initial "<</MCID 0>>" to the final "EMC", so that regular
    // page contents were taken for page bottom contents.
    // "[^>]*?" is used instead. It works better, but a header/footer declaration holding a nested
    // construct will not be recognized :
    //	<< /Type /Pagination ... [/Bottom] ... << (some nested contents) >> ... >>
    // The expression uses the "x" modifier, so the whitespace and line breaks it contains are
    // not significant.
    static $header_or_footer_re = '#
(?P<contents>
<< [^>]*? \[ \s* / (?P<location> (Bottom) | (Top) ) \s* \] [^>]*? >> \s*
BDC .*? EMC
)
#imsx' ;

    $header = '';
    $footer = '';

    $found = preg_match_all($header_or_footer_re, $stream_contents, $matches, PREG_OFFSET_CAPTURE);

    // No header or footer : the whole stream is text
    if (! $found) {
        $text = $stream_contents;

        return;
    }

    // Captures are array ( string, offset ) pairs ; when a location is matched several times,
    // the last occurrence wins
    foreach ($matches['contents'] as $i => $captured) {
        $location = $matches['location'][$i][0];

        if (strcasecmp($location, 'Bottom') === 0) {
            $footer = $captured[0];
        } else {
            $header = $captured[0];
        }
    }

    // The text is what remains once header and footer constructs have been removed
    $text = preg_replace($header_or_footer_re, '', $stream_contents);
}
3421
3422
/*--------------------------------------------------------------------------------------------------------------

    NAME
	ExtractText - extracts text from a pdf stream.

    PROTOTYPE
	$text = $this -> ExtractText ( $page_number, $object_id, $data, &$current_font ) ;

    DESCRIPTION
	Extracts text from decoded stream contents.

    PARAMETERS
	$page_number (integer) -
		Page number that contains the text to be extracted.

	$object_id (integer) -
		Object id of this text block.

	$data (string) -
		Stream contents.

	$current_font (integer) -
		Id of the current font, which should be found in the $this->FontTable property, if everything
		went ok.
		This parameter is required, since text blocks may not specify a new font resource id and reuse
		the one that was set before. It is updated in place whenever the stream selects another font,
		and reset to -1 when an "ET" instruction is met.

    RETURN VALUE
	Returns the decoded text, after it has been passed through __rtl_process().

    NOTES
	The PDF language can be seen as a stack-driven language ; for example, the instruction defining a text
	matrix ( "Tm" ) expects 6 floating-point values from the stack :

		0 0 0 0 x y Tm

	It can also specify specific operators, such as /Rx, which sets font number "x" to be the current font,
	or even "<< >>" constructs that we can ignore during our process of extracting textual data.
	Actually, we only want to handle a very small subset of the Adobe drawing language ; these are :
	- "Tm" instructions, that specify, among others, the x and y coordinates of the next text to be output
	- "/R" instructions, that specify which font is to be used for the next text output. This is useful
	  only if the font has an associated character map.
	- "/F", same as "/R", but use a font map id instead of a direct object id.
	- Text, specified either using a single notation ( "(sometext)" ) or the array notation
	  ( "[(...)d1(...)d2...(...)]" ), which allows for specifying inter-character spacing.
	- "Tf" instructions, that specify the font size. This is to be able to compute approximately the
	  number of empty lines between two successive Y coordinates in "Tm" instructions
	- "TL" instructions, that define the text leading to be used by "T*"

	This is why the process of text extraction has been decomposed into three steps :
	- The first one, the lowest-level step, is a tokenizer that extracts individual elements, such as "Tm",
	  "TJ", "/Rx" or "510.77". This is handled by the __next_token() method.
	- The second one, __next_instruction(), collects tokens. It pushes every floating-point value onto the
	  stack, until an instruction is met.
	- The third one, ExtractText(), processes data returned by __next_instruction(), and actually performs
	  the (restricted) parsing of text drawing instructions.

 *-------------------------------------------------------------------------------------------------------------*/
protected function ExtractText ( $page_number, $object_id, $data, &$current_font )
{
	$new_data = $this -> __strip_useless_instructions ( $data ) ;

	if ( self::$DEBUG )
	{
		echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
		echo $data ;
		echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
		echo $new_data ;
	}

	$data = $new_data ;

	// Index into the specified block of text-drawing instructions
	$data_index = 0 ;

	$data_length = strlen ( $data ) ;		// Data length
	$result = '' ;					// Resulting string

	// Y-coordinate of the last seen "Tm" instruction
	$last_goto_y = 0 ;
	$last_goto_x = 0 ;

	// Y-coordinate of the last seen "Td" or "TD" relative positioning instruction
	$last_relative_goto_y = 0 ;

	// When true, the current text should be output on the same line as the preceding one
	$use_same_line = false ;

	// Instruction preceding the current one
	$last_instruction = true ;

	// Current font size
	$current_font_size = 0 ;

	// Active template
	$current_template = '' ;

	// Various pre-computed variables
	$separator_length = strlen ( $this -> Separator ) ;

	// Current font map width, in bytes, plus a flag saying whether the current font is mapped or not
	$this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;

	// Extra newlines to add before the current text
	$extra_newlines = 0 ;

	// Text leading used by T*
	$text_leading = 0 ;

	// Set to true if a separator needs to be inserted
	$needs_separator = false ;

	// A flag to tell if we should "forget" the last instruction
	$discard_last_instruction = false ;

	// A flag that tells whether the Separator and BlockSeparator properties are identical
	$same_separators = ( $this -> Separator == $this -> BlockSeparator ) ;

	// Instruction count (used for handling execution timeouts)
	$instruction_count = 0 ;

	// Unprocessed markers
	$unprocessed_marker_count = count ( $this -> UnprocessedMarkerList [ 'font' ] ) ;

	// Loop through instructions
	while ( ( $instruction = $this -> __next_instruction ( $page_number, $data, $data_length, $data_index, $current_template ) ) !== false )
	{
		$fragment = '' ;

		$instruction_count ++ ;

		// Timeout handling - don't test for every instruction processed
		if ( ! ( $instruction_count % 100 ) )
		{
			// Global timeout handling
			if ( $this -> Options & self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME )
			{
				$now = microtime ( true ) ;

				if ( $now - self::$GlobalExecutionStartTime > self::$MaxGlobalExecutionTime )
					error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ;
			}

			// Per-instance timeout handling
			if ( $this -> Options & self::PDFOPT_ENFORCE_EXECUTION_TIME )
			{
				$now = microtime ( true ) ;

				if ( $now - $this -> ExecutionStartTime > $this -> MaxExecutionTime )
					error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ;
			}
		}

		// Character position after the current instruction
		$data_index = $instruction [ 'next' ] ;

		// Process current instruction
		switch ( $instruction [ 'instruction' ] )
		{
			// Raw text (enclosed by parentheses) or array text (enclosed within square brackets)
			// is returned as a single instruction
			case 'text' :
				// Empty arrays of text may be encountered - ignore them
				if ( ! count ( $instruction [ 'values' ] ) )
					break ;

				// Check if we have to insert a newline
				if ( ! $use_same_line )
				{
					$fragment .= $this -> EOL ;
					$needs_separator = false ;
				}
				// Roughly simulate spacing between lines by inserting newline characters
				else if ( $extra_newlines > 0 )
				{
					$fragment .= str_repeat ( $this -> EOL, $extra_newlines ) ;
					$extra_newlines = 0 ;
					$needs_separator = false ;
				}
				else
					$needs_separator = true ;

				// Add a separator if necessary
				if ( $needs_separator )
				{
					// If the Separator and BlockSeparator properties are the same (and not empty), only add a block
					// separator if the current result does not end with it.
					// The test must be made against $result : $fragment is always empty at this point, so testing
					// it would never prevent a duplicate separator.
					if ( $same_separators )
					{
						if ( $this -> Separator != '' && substr ( $result, - $separator_length ) != $this -> BlockSeparator )
							$fragment .= $this -> BlockSeparator ;
					}
					else
						$fragment .= $this -> BlockSeparator ;
				}

				$needs_separator = true ;
				$value_index = 0 ;

				// Fonts having character maps will require some special processing
				if ( $current_font_mapped )
				{
					// Loop through each text value
					foreach ( $instruction [ 'values' ] as $text )
					{
						$is_hex = ( $text [0] == '<' ) ;
						$length = strlen ( $text ) - 1 ;	// Index of the closing delimiter
						$handled = false ;

						// Characters are encoded within angle brackets ( "<>" ).
						// Note that several characters can be specified within the same angle brackets, so we have to take
						// into account the width we detected in the begincodespacerange construct
						if ( $is_hex )
						{
							for ( $i = 1 ; $i < $length ; $i += $current_font_map_width )
							{
								$value = substr ( $text, $i, $current_font_map_width ) ;
								$ch = hexdec ( $value ) ;

								if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
									$newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
								// Font id -1 is the value set after an "ET" instruction : no map to apply
								else if ( $current_font == -1 )
								{
									$newchar = chr ( $ch ) ;
								}
								else
								{
									$newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch ) ;
									$this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ;
								}

								$fragment .= $newchar ;
							}

							$handled = true ;
						}
						// Yes ! double-byte codes can also be specified as plain text within parentheses !
						// However, we have to be really careful here ; the sequence :
						//	(Be)
						// can mean the string "Be" or the Unicode character 0x4265 ('B' = 0x42, 'e' = 0x65).
						// When the font map width is 4 hex digits, characters are taken two bytes at a time.
						else if ( $current_font_map_width == 4 )
						{
							$temp_result = '' ;

							for ( $i = 1 ; $i < $length ; $i ++ )
							{
								// Each character in the pair may be a backslash, which escapes the next character so we must skip it
								// This code needs to be reviewed ; the same code is duplicated to handle escaped characters in octal notation
								if ( $text [$i] != '\\' )
									$ch1 = $text [$i] ;
								else
								{
									$i ++ ;

									if ( $text [$i] < '0' || $text [$i] > '7' )
										$ch1 = $this -> ProcessEscapedCharacter ( $text [$i] ) ;
									else
									{
										$oct = '' ;
										$digit_count = 0 ;

										while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 )
										{
											$oct .= $text [$i ++] ;
											$digit_count ++ ;
										}

										$ch1 = chr ( octdec ( $oct ) ) ;
										$i -- ;
									}
								}

								$i ++ ;

								if ( $text [$i] != '\\' )
									$ch2 = $text [$i] ;
								else
								{
									$i ++ ;

									if ( $text [$i] < '0' || $text [$i] > '7' )
										$ch2 = $this -> ProcessEscapedCharacter ( $text [$i] ) ;
									else
									{
										$oct = '' ;
										$digit_count = 0 ;

										while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 )
										{
											$oct .= $text [$i ++] ;
											$digit_count ++ ;
										}

										$ch2 = chr ( octdec ( $oct ) ) ;
										$i -- ;
									}
								}

								// Build the 2-bytes character code
								$ch = ( ord ( $ch1 ) << 8 ) | ord ( $ch2 ) ;

								if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
									$newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
								else
								{
									$newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch, true ) ;
									$this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ;
								}

								// Yes !!! for characters encoded with two bytes, we can find the following construct :
								//	0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")"
								// which must be expanded as : (Car)
								// We have here the escape sequences "\(" and "\)", but the backslash is encoded on two bytes
								// (although the MSB is nul), while the escaped character is encoded on 1 byte. Waiting
								// for the next quirk to happen...
								if ( $newchar == '\\' && isset ( $text [ $i + 2 ] ) )
								{
									$newchar = $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] ) ;
									$i ++ ;		// this time we processed 3 bytes, not 2
								}

								$temp_result .= $newchar ;
							}

							// Append the characters decoded two bytes at a time
							$fragment .= $temp_result ;
							$handled = true ;
						}

						// Character strings within parentheses.
						// For every text value, use the character map table for substitutions
						if ( ! $handled )
						{
							for ( $i = 1 ; $i < $length ; $i ++ )
							{
								$ch = $text [$i] ;

								// Set to true to optimize calls to MapCharacters
								// Currently does not work with pobox@dizy.sk/infoma.pdf (a few characters differ)
								$use_map_buffer = false ;

								// ... but don't forget to handle escape sequences "\n" and "\r" for characters
								// 10 and 13
								if ( $ch == '\\' )
								{
									$ch = $text [++$i] ;

									// Escaped character
									if ( $ch < '0' || $ch > '7' )
										$ch = $this -> ProcessEscapedCharacter ( $ch ) ;
									// However, an octal form can also be specified ; in this case we have to take into account
									// the character width for the current font (if the character width is 4 hex digits, then we
									// will encounter constructs such as "\000\077").
									// The method used here is dirty : we build a regex to match octal character representations on a substring
									// of the text
									else
									{
										$width = $current_font_map_width / 2 ;		// Convert to byte count
										$subtext = substr ( $text, $i - 1 ) ;
										$regex = "#^ (\\\\ [0-7]{3}){1,$width} #imsx" ;

										$status = preg_match ( $regex, $subtext, $octal_matches ) ;

										if ( $status )
										{
											$octal_values = explode ( '\\', substr ( $octal_matches [0], 1 ) ) ;
											$ord = 0 ;

											foreach ( $octal_values as $octal_value )
												$ord = ( $ord << 8 ) + octdec ( $octal_value ) ;

											$ch = chr ( $ord ) ;
											$i += strlen ( $octal_matches [0] ) - 2 ;
										}
									}

									$use_map_buffer = false ;
								}

								// Add substituted character to the output result
								$ord = ord ( $ch ) ;

								if ( ! $use_map_buffer )
									$newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
								else
								{
									if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
										$newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
									else
									{
										$newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
										$this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ;
									}
								}

								$fragment .= $newchar ;
							}
						}

						// Handle offsets between blocks of characters
						if ( isset ( $instruction [ 'offsets' ] [ $value_index ] ) &&
								- ( $instruction [ 'offsets' ] [ $value_index ] ) > $this -> MinSpaceWidth )
							$fragment .= $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ;

						$value_index ++ ;
					}
				}
				// For fonts having no associated character map, we simply encode the string in UTF8
				// after the C-like escape sequences have been processed
				// Note that <xxxx> constructs can be encountered here, so we have to process them as well
				else
				{
					foreach ( $instruction [ 'values' ] as $text )
					{
						$is_hex = ( $text [0] == '<' ) ;
						$length = strlen ( $text ) - 1 ;

						// Some text within parentheses may have a backslash followed by a newline, to indicate some continuation line.
						// Example :
						//	(this is a sentence \
						//	 continued on the next line)
						// Remove such constructs because we don't care about them
						$text = str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text ) ;

						// Characters are encoded within angle brackets ( "<>" )
						if ( $is_hex )
						{
							for ( $i = 1 ; $i < $length ; $i += 2 )
							{
								$ch = hexdec ( substr ( $text, $i, 2 ) ) ;

								$fragment .= $this -> CodePointToUtf8 ( $ch ) ;
							}
						}
						// Characters are plain text
						else
						{
							$text = self::Unescape ( $text ) ;

							for ( $i = 1, $length = strlen ( $text ) - 1 ; $i < $length ; $i ++ )
							{
								$ch = $text [$i] ;
								$ord = ord ( $ch ) ;

								if ( $ord < 127 )
									$newchar = $ch ;
								else
								{
									if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
										$newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
									else
									{
										$newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
										$this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ;
									}
								}

								$fragment .= $newchar ;
							}
						}

						// Handle offsets between blocks of characters
						if ( isset ( $instruction [ 'offsets' ] [ $value_index ] ) &&
								abs ( $instruction [ 'offsets' ] [ $value_index ] ) > $this -> MinSpaceWidth )
							$fragment .= $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ;

						$value_index ++ ;
					}
				}

				// Process the markers which do not have an associated font yet - this will be done by matching
				// the current text fragment against one of the regular expressions defined.
				// If a match occurs, then all the subsequent text fragments using the same font will be surrounded by markers
				for ( $j = 0 ; $j < $unprocessed_marker_count ; $j ++ )
				{
					$marker = $this -> UnprocessedMarkerList [ 'font' ] [$j] ;

					if ( preg_match ( $marker [ 'regex' ], trim ( $fragment ) ) )
					{
						$this -> TextWithFontMarkers [ $current_font ] = array
						   (
							'font' => $current_font,
							'height' => $current_font_size,
							'regex' => $marker [ 'regex' ],
							'start' => $marker [ 'start' ],
							'end' => $marker [ 'end' ]
						    ) ;

						// Remove the marker with array_splice() so that the list is reindexed : a plain unset()
						// would leave a hole at index $j, and since the loop above walks indexes 0 to count - 1,
						// the next pass would hit an undefined entry and never reach the last marker.
						$unprocessed_marker_count -- ;
						array_splice ( $this -> UnprocessedMarkerList [ 'font' ], $j, 1 ) ;

						break ;
					}
				}

				// Check if we need to add markers around this text fragment
				if ( isset ( $this -> TextWithFontMarkers [ $current_font ] ) &&
						$this -> TextWithFontMarkers [ $current_font ] [ 'height' ] == $current_font_size )
				{
					$fragment = $this -> TextWithFontMarkers [ $current_font ] [ 'start' ] .
							$fragment .
							$this -> TextWithFontMarkers [ $current_font ] [ 'end' ] ;
				}

				$result .= $fragment ;

				break ;

			// An "nl" instruction means TJ, Tj, T* or "'"
			case 'nl' :
				if ( ! $instruction [ 'conditional' ] )
				{
					if ( $instruction [ 'leading' ] && $text_leading && $current_font_size )
					{
						$count = ( int ) ( ( $text_leading - $current_font_size ) / $current_font_size ) ;

						// Always output at least one newline ; this also covers negative values (negative
						// leading or font size), which str_repeat() rejects
						if ( $count < 1 )
							$count = 1 ;
					}
					else
						$count = 1 ;

					// NOTE(review): this uses PHP_EOL while the 'text' case uses $this -> EOL - confirm whether
					// that difference is intended
					$extra = str_repeat ( PHP_EOL, $count ) ;
					$result .= $extra ;
					$needs_separator = false ;
					$last_goto_y -= ( $count * $text_leading ) ;	// Approximation on y-coord change
					$last_relative_goto_y = 0 ;
				}

				break ;

			// "Tm", "Td" or "TD" : Output text on the same line, if the "y" coordinates are equal
			case 'goto' :
				// Some text is positioned using 'Tm' instructions ; however they can be immediately followed by 'Td' instructions
				// which give a relative positioning ; so consider that the last instruction wins
				if ( $instruction [ 'relative' ] )
				{
					// Try to put a separator if the x coordinate is non-zero
					//if ( $instruction [ 'x' ] - $last_goto_x >= $current_font_size )
					//	$result .= $this -> Separator ;

					$discard_last_instruction = true ;
					$extra_newlines = 0 ;
					$use_same_line = ( ( $last_relative_goto_y - abs ( $instruction [ 'y' ] ) ) <= $current_font_size ) ;
					$last_relative_goto_y = abs ( $instruction [ 'y' ] ) ;
					$last_goto_x = $instruction [ 'x' ] ;

					// Moving down by more than one font height : start a new line
					if ( - $instruction [ 'y' ] > $current_font_size )
					{
						$use_same_line = false ;

						if ( $last_relative_goto_y )
							$extra_newlines = ( int ) ( $current_font_size / $last_relative_goto_y ) ;
						else
							$extra_newlines = 0 ;
					}
					// No vertical move at all : stay on the same line
					else if ( ! $instruction [ 'y' ] )
					{
						$use_same_line = true ;
						$extra_newlines = 0 ;
					}

					break ;
				}
				else
					$last_relative_goto_y = 0 ;

				$y = $last_goto_y + $last_relative_goto_y ;

				if ( $instruction [ 'y' ] == $y || abs ( $instruction [ 'y' ] - $y ) < $current_font_size )
				{
					$use_same_line = true ;
					$extra_newlines = 0 ;
				}
				else
				{
					// Compute the number of newlines we have to insert between the current and the next lines
					if ( $current_font_size )
						$extra_newlines = ( int ) ( ( $y - $instruction [ 'y' ] - $current_font_size ) / $current_font_size ) ;

					$use_same_line = ( $last_goto_y == 0 ) ;
				}

				$last_goto_y = $instruction [ 'y' ] ;
				break ;

			// Set font size
			case 'fontsize' :
				$current_font_size = $instruction [ 'size' ] ;
				break ;

			// "/Rx" : sets the current font
			case 'resource' :
				$current_font = $instruction [ 'resource' ] ;

				$this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
				break ;

			// "/TPLx" : references a template, which can contain additional font aliases
			case 'template' :
				if ( $this -> PageMap -> IsValidXObjectName ( $instruction [ 'token' ] ) )
					$current_template = $instruction [ 'token' ] ;

				break ;

			// 'TL' : text leading to be used for the next "T*" in the flow
			case 'leading' :
				if ( ! ( $this -> Options & self::PDFOPT_IGNORE_TEXT_LEADING ) )
					$text_leading = $instruction [ 'size' ] ;

				break ;

			// 'ET' : we have to reset a few things here
			case 'ET' :
				$current_font = -1 ;
				$current_font_map_width = 2 ;
				break ;
		}

		// Remember last instruction - this will help us into determining whether we should put the next text
		// on the current or following line
		if ( ! $discard_last_instruction )
			$last_instruction = $instruction ;

		$discard_last_instruction = false ;
	}

	return ( $this -> __rtl_process ( $result ) ) ;
}
4056
4057
4058
// __next_instruction -
//	Retrieves the next instruction from the drawing text block.
//	Tokens are read with __next_token(), starting at offset $index of $data ; numeric tokens are collected
//	as operands until a recognized instruction is met.
//	Returns an associative array that always contains the 'instruction' and 'next' entries ('next' being the
//	offset in $data where the caller should resume), plus instruction-specific entries, or false when the end
//	of input has been reached.
//	$current_template is only used to resolve font specifiers ; it is received by value, so a template
//	reference is reported to the caller through the returned 'template' instruction.
private function __next_instruction ( $page_number, $data, $data_length, $index, $current_template )
{
	// Text instruction waiting to be returned by the next call (see below)
	static $last_instruction = false ;

	// Constructs such as "(text) '" are split into two instructions : a newline, which is returned
	// immediately, and the text itself, which was put aside and is returned by this call.
	if ( $last_instruction )
	{
		$result = $last_instruction ;
		$last_instruction = false ;

		return ( $result ) ;
	}

	// Whether we should compute enhanced statistics
	$enhanced_statistics = $this -> EnhancedStatistics ;

	// Holds the floating-point values encountered so far
	$number_stack = array ( ) ;

	// Loop through the stream of tokens
	while ( ( $part = $this -> __next_token ( $page_number, $data, $data_length, $index ) ) !== false )
	{
		$token = $part [0] ;
		$next_index = $part [1] ;

		// Floating-point number : push it onto the stack
		if ( ( $token [0] >= '0' && $token [0] <= '9' ) || $token [0] == '-' || $token [0] == '+' || $token [0] == '.' )
		{
			$number_stack [] = $token ;
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;
		}
		// 'Tm' instruction : return a "goto" instruction with the x and y coordinates (5th and 6th operands).
		// Missing operands (malformed stream) default to 0 instead of raising an undefined offset.
		else if ( $token == 'Tm' )
		{
			$x = isset ( $number_stack [4] ) ? $number_stack [4] : 0 ;
			$y = isset ( $number_stack [5] ) ? $number_stack [5] : 0 ;

			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ;

			return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => false, 'token' => $token ) ) ;
		}
		// 'Td' or 'TD' instructions : return a goto instruction with the x and y coordinates (1st and 2nd args)
		else if ( $token == 'Td' || $token == 'TD' )
		{
			$x = isset ( $number_stack [0] ) ? $number_stack [0] : 0 ;
			$y = isset ( $number_stack [1] ) ? $number_stack [1] : 0 ;

			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ $token ] ++ ;

			return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => true, 'token' => $token ) ) ;
		}
		// Output text "'" instruction, with conditional newline
		else if ( $token [0] == "'" )
		{
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ;

			return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ;
		}
		// Same as above
		else if ( $token == 'TJ' || $token == 'Tj' )
		{
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ $token ] ++ ;

			return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ;
		}
		// Set font size
		else if ( $token == 'Tf' )
		{
			$size = isset ( $number_stack [0] ) ? $number_stack [0] : 0 ;

			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ;

			return ( array ( 'instruction' => 'fontsize', 'next' => $next_index, 'size' => $size, 'token' => $token ) ) ;
		}
		// Text leading (spacing used by T*)
		else if ( $token == 'TL' )
		{
			$size = isset ( $number_stack [0] ) ? $number_stack [0] : 0 ;

			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ;

			return ( array ( 'instruction' => 'leading', 'next' => $next_index, 'size' => $size, 'token' => $token ) ) ;
		}
		// Position to next line
		else if ( $token == 'T*' )
		{
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ;

			return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => true, 'token' => $token ) ) ;
		}
		// Draw object ("Do"). To prevent different text shapes to appear on the same line, we return a "newline" instruction
		// here. Note that the shape position is not taken into account here, and shapes will be processed in the order they
		// appear in the pdf file (which is likely to be different from their position on a graphic screen).
		else if ( $token == 'Do' )
		{
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;

			return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => false, 'token' => $token ) ) ;
		}
		// Raw text output
		else if ( $token [0] == '(' )
		{
			// Peek at the token that follows the text (it may be false at end of input)
			$next_part = $this -> __next_token ( $page_number, $data, $data_length, $next_index ) ;
			$instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ;
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '(' ] ++ ;

			// Text followed by "'" : emit the newline first, and keep the text for the next call
			if ( $next_part !== false && $next_part [0] == "'" )
			{
				$last_instruction = $instruction ;
				return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ;
			}
			else
				return ( $instruction ) ;
		}
		// Hex digits within angle brackets
		else if ( $token [0] == '<' )
		{
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '<' ] ++ ;

			// Only tokens whose first character after '<' is alphanumeric are treated as text ; anything else
			// (such as a "<<" construct) is skipped without touching the operand stack
			if ( isset ( $token [1] ) && ( self::$CharacterClasses [ $token [1] ] & self::CTYPE_ALNUM ) )
			{
				$next_part = $this -> __next_token ( $page_number, $data, $data_length, $next_index ) ;
				$instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ;

				if ( $next_part !== false && $next_part [0] == "'" )
				{
					$last_instruction = $instruction ;
					return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ;
				}
				else
					return ( $instruction ) ;
			}
		}
		// Text specified as an array of individual raw text elements, and individual interspaces between characters
		else if ( $token [0] == '[' )
		{
			$values = $this -> __extract_chars_from_array ( $token ) ;
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '[' ] ++ ;
			$instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => $values [0], 'offsets' => $values [1], 'token' => $token ) ;

			return ( $instruction ) ;
		}
		// Token starts with a slash : maybe a font specification
		else if ( preg_match ( '#^ ( ' . self::$FontSpecifiers . ' ) #ix', $token ) )
		{
			$key = "$page_number:$current_template:$token" ;
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;

			// Font lookups are cached per page/template/token
			if ( isset ( $this -> MapIdBuffer [ $key ] ) )
				$id = $this -> MapIdBuffer [ $key ] ;
			else
			{
				$id = $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $token ) ;

				$this -> MapIdBuffer [ $key ] = $id ;
			}

			return ( array ( 'instruction' => 'resource', 'next' => $next_index, 'resource' => $id, 'token' => $token ) ) ;
		}
		// Template reference, such as /TPL1. Each reference has initially been replaced by !PDFTOTEXT_TEMPLATE_TPLx during substitution
		// by ProcessTemplateReferences(), because templates not only specify text to be replaced, but also font aliases
		// -and this is the place where we catch font aliases in this case
		else if ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P<template> \w+) /ix', $token, $match ) )
		{
			$current_template = '/' . $match [ 'template' ] ;
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'template' ] ++ ;

			return ( array ( 'instruction' => 'template', 'next' => $next_index, 'token' => $current_template ) ) ;
		}
		// 'cm' is only counted for statistics ; however, like any other ignored instruction, it consumes its
		// operands, which must not remain on the stack and be mistaken for those of a later Tm/Td/Tf/TL
		else if ( $token === 'cm' )
		{
			$number_stack = array ( ) ;
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'cm' ] ++ ;
		}
		else if ( $token === 'BT' )
		{
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'BT' ] ++ ;

			return ( array ( 'instruction' => 'BT', 'next' => $next_index, 'token' => $token ) ) ;
		}
		else if ( $token == 'ET' )		// Nothing special to count here
		{
			return ( array ( 'instruction' => 'ET', 'next' => $next_index, 'token' => $token ) ) ;
		}
		// Other instructions : we're not that much interested in them, so clear the number stack and consider
		// that the current parameters, floating-point values, have been processed
		else
		{
			$number_stack = array ( ) ;
			$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
		}

		$index = $next_index ;
	}

	// End of input
	return ( false ) ;
}
4259
4260
// __next_token :
//	Retrieves the next token from the drawing instructions stream.
//
//	$page_number is not used by this method. $data is the instruction stream, $data_length its length
//	in bytes and $index the offset at which scanning starts (leading whitespace is skipped and counted
//	in the 'space' distribution when enhanced statistics are enabled).
//
//	Returns false when the end of input is reached or when a "<<" / "<" construct has no closing
//	delimiter ; otherwise returns array ( $token, $next_index ), where $next_index is the offset of the
//	first character following the token :
//	- "[...]"   : returned up to the first closing bracket met outside parentheses, CR/LF removed
//	- "(...)"   : returned up to the first unescaped closing parenthesis
//	- "<<...>>" : returned verbatim
//	- "<...>"   : returned with all whitespace removed
//	- "'"       : returned as a 1-character keyword
//	- other     : a number, a keyword/name, or a single character.
private function  __next_token ( $page_number, $data, $data_length, $index )
{
	// Skip spaces
	$count = 0 ;

	while ( $index < $data_length && ( $data [ $index ] == ' ' || $data [ $index ] == "\t" || $data [ $index ] == "\r" || $data [ $index ] == "\n" ) )
	{
		$index ++ ;
		$count ++ ;
	}

	$enhanced_statistics = $this -> EnhancedStatistics ;
	$enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'space' ] += $count ;

	// End of input
	if ( $index >= $data_length )
		return ( false ) ;

	// The current character will tell us what to do
	$ch = $data [ $index ] ;

	switch ( $ch )
	{
		// Opening square bracket : we have to find the closing one, taking care of escape sequences
		// that can also specify a square bracket, such as "\]"
		case "[" :
			$pos = $index + 1 ;
			$parent = 0 ;		// Nesting level of parentheses
			$result = $ch ;

			while ( $pos < $data_length )
			{
				$nch = $data [ $pos ++ ] ;

				switch ( $nch )
				{
					case '(' :
						$parent ++ ;
						$result .= $nch ;
						break ;

					case ')' :
						$parent -- ;
						$result .= $nch ;
						break ;

					// Copy the escaped character verbatim - unless the backslash is the very last
					// character of the stream, in which case there is nothing left to read
					case '\\' :
						$result .= $nch ;

						if ( $pos < $data_length )
							$result .= $data [ $pos ++ ] ;

						break ;

					// A closing bracket only terminates the array when we are not inside parentheses
					case ']' :
						$result .= ']' ;

						if ( ! $parent )
							break 2 ;

						break ;

					// Line breaks are dropped from the token
					case "\n" :
					case "\r" :
						break ;

					default :
						$result .= $nch ;
				}
			}

			return ( array ( $result, $pos ) ) ;

		// Parenthesis : Again, we have to find the closing parenthesis, taking care of escape sequences
		// such as "\)"
		case "(" :
			$pos = $index + 1 ;
			$result = $ch ;

			while ( $pos < $data_length )
			{
				$nch = $data [ $pos ++ ] ;

				if ( $nch == '\\' )
				{
					$result .= $nch ;

					// A backslash ending the stream has nothing to escape
					if ( $pos >= $data_length )
						break ;

					$after = $data [ $pos ] ;

					// Character references specified as \xyz, where "xyz" are octal digits
					if ( $after >= '0' && $after <= '7' )
					{
						while ( $pos < $data_length && $data [ $pos ] >= '0' && $data [ $pos ] <= '7' )
							$result .= $data [ $pos ++ ] ;
					}
					// Regular character escapes
					else
						$result .= $data [ $pos ++ ] ;
				}
				else if ( $nch == ')' )
				{
					$result .= ')' ;
					break ;
				}
				else
					$result .= $nch ;
			}

			return ( array ( $result, $pos ) ) ;

		// A construction of the form : "<< something >>", or a string of hex digits "<...>"
		case '<' :
			if ( ! isset ( $data [ $index + 1 ] ) )
				return ( false ) ;

			if ( $data [ $index + 1 ] == '<' )
			{
				$pos = strpos ( $data, '>>', $index + 2 ) ;

				if ( $pos === false )
					return ( false ) ;

				return ( array ( substr ( $data, $index, $pos - $index + 2 ), $pos + 2 ) ) ;
			}
			else
			{
				// Start searching right after the opening bracket, so that an empty hex string ("<>")
				// is terminated by its own closing bracket and not by a later one
				$pos = strpos ( $data, '>', $index + 1 ) ;

				if ( $pos === false )
					return ( false ) ;

				// There can be spaces and newlines inside a series of hex digits, so remove them...
				$result = preg_replace ( '/\s+/', '', substr ( $data, $index, $pos - $index + 1 ) ) ;

				return ( array ( $result, $pos + 1 ) ) ;
			}

		// Tick character : consider it as a keyword, in the same way as the "TJ" or "Tj" keywords
		case "'" :
			return ( array ( "'", $index + 1 ) ) ;

		// Other cases : this may be either a floating-point number or a keyword
		default :
			$index ++ ;
			$value = $ch ;

			if ( isset ( $data [ $index ] ) )
			{
				// Number : an optional sign or dot, followed by digits and dots
				if ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_DIGIT ) ||
						$ch == '-' || $ch == '+' || $ch == '.' )
				{
					while ( $index < $data_length &&
							( ( self::$CharacterClasses [ $data [ $index ] ] & self::CTYPE_DIGIT ) ||
								$data [ $index ] == '.' ) )
						$value .= $data [ $index ++ ] ;
				}
				// Keyword, "/name" or "!substitution" : collect alphanumerics and a few punctuation characters
				else if ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALPHA ) ||
						$ch == '/' || $ch == '!' )
				{
					$ch = $data [ $index ] ;

					while ( $index < $data_length &&
							( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM ) ||
								$ch == '*' || $ch == '-' || $ch == '_' || $ch == '.' || $ch == '+' ) )
					{
						$value .= $ch ;
						$index ++ ;

						if ( isset ( $data [ $index ] ) )
							$ch = $data [ $index ] ;
					}
				}
			}

			return ( array ( $value, $index ) ) ;
	}
}
4455
4456
/*--------------------------------------------------------------------------------------------------------------

    NAME
	ExtractTextWithLayout - Extracts text, trying to render the page layout.

    PROTOTYPE
	$this -> ExtractTextWithLayout ( &$page_fragments, $page_number, $object_id, $data, &$current_font ) ;

    DESCRIPTION
	Scans decoded stream contents and appends one entry to $page_fragments for every text-drawing
	construct that decodes to a non-empty string, together with the coordinates computed from the
	current text matrix (Tm) and transformation matrix (cm).

    PARAMETERS
	$page_fragments (array) -
		List of text fragments, updated in place. Each appended fragment is an associative array with
		the keys 'x', 'y', 'page', 'template', 'font', 'font-height' and 'text' ; additional keys
		('CTM', 'Tm', 'New Tm', 'Real font height', 'Page width', 'Page height') are added when
		self::$DEBUG is set.

	$page_number (integer) -
		Page number that contains the text to be extracted.

	$object_id (integer) -
		Object id of this text block. Only used in debug output.

	$data (string) -
		Stream contents. They are first filtered by __strip_useless_instructions().

	$current_font (integer) -
		Id of the current font, which should be found in the $this->FontTable property, if anything
		went ok.
		This parameter is required, since text blocks may not specify a new font resource id and reuse
		the one that was set before. It is updated whenever a Tf instruction is met.

    RETURN VALUE
	None ; results are returned through the $page_fragments and $current_font parameters.

 *-------------------------------------------------------------------------------------------------------------*/
protected function  ExtractTextWithLayout ( &$page_fragments, $page_number, $object_id, $data, &$current_font )
{
	// Characters that can start a numeric operand
	static		$numeric_starts		=  array
	(
		'+' => true, '-' => true, '.' => true, '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
		'5' => true, '6' => true, '7' => true, '8' => true, '9' => true
	) ;
	// Initial (default) transformation matrix, kept as a 6-elements array as in the PDF specifications.
	// Elements 0 and 3 are the x/y scaling factors, elements 4 and 5 the x/y position ; elements 1 and 2
	// are stored but only used by __matrix_multiply().
	static		$IdentityMatrix		=  array ( 1, 0, 0, 1, 0, 0 ) ;
	// Default values for operands missing from the operand stack (malformed instruction streams)
	static		$zero_operands		=  array ( 0, 0, 0, 0, 0, 0 ) ;

	// Remove useless instructions
	$new_data	=  $this -> __strip_useless_instructions ( $data ) ;

	if  ( self::$DEBUG )
	{
		echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
		echo $data ;
		echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
		echo $new_data ;
	}

	$data				=  $new_data ;
	$data_length			=  strlen ( $data ) ;		// Data length

	$page_fragment_count		=  count ( $page_fragments ) ;

	// Index into the specified block of text-drawing instructions
	$data_index			=  0 ;

	// Text matrices
	$CTM				=
	$Tm				=  $IdentityMatrix ;

	// Nesting level of BT..ET instructions (Begin text/End text) - they are not nestable but be prepared to meet buggy PDFs
	$BT_nesting_level		=  0 ;

	// Current font data
	$current_font_height		=  0 ;

	// Current template and font name ; the font map width and the "font is mapped" flag are returned
	// by GetFontAttributes()
	$current_template		=  '' ;
	$current_font_name		=  '' ;
	$this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;

	// Operand stack
	$operand_stack			=  array ( ) ;

	// Number of tokens processed so far
	$token_count			=  0 ;

	// Page attributes
	$page_attributes		=  $this -> PageMap -> PageAttributes [ $page_number ] ;

	// Graphics context stack - well, we only store here the current transformation and text matrices
	$graphic_stack			=  array ( ) ;
	$graphic_stack_size		=  0 ;

	// Global/local execution time measurements
	$tokens_between_timechecks	=  1000 ;
	$enforce_global_execution_time	=  $this -> Options & self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME ;
	$enforce_local_execution_time	=  $this -> Options & self::PDFOPT_ENFORCE_EXECUTION_TIME ;
	$enforce_execution_time		=  $enforce_global_execution_time | $enforce_local_execution_time ;

	// Whether we should compute enhanced statistics
	$enhanced_statistics		=  $this -> EnhancedStatistics ;

	// Text leading value set by the TL instruction
	$text_leading			=  0.0 ;

	// Output parameters of __next_token_ex()
	$token				=  '' ;
	$next_index			=  0 ;

	// Loop through the stream of tokens
	while  ( $this -> __next_token_ex ( $page_number, $data, $data_length, $data_index, $token, $next_index )  !==  false )
	{
		$token_start	=  $token [0] ;
		$token_count ++ ;

		// Use the real token length : deriving it from the stream offsets is only correct when the token
		// is preceded by exactly one whitespace character
		$length		=  strlen ( $token ) ;

		// Check if we need to enforce execution time checking, to prevent PHP from terminating our script without any hope
		// of catching the error
		if  ( $enforce_execution_time  &&  ! ( $token_count % $tokens_between_timechecks ) )
		{
			$now	=  microtime ( true ) ;

			if  ( $enforce_global_execution_time )
			{
				if  ( $now - self::$GlobalExecutionStartTime  >  self::$MaxGlobalExecutionTime )
					error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ;
			}

			// Per-instance timeout handling
			if  ( $enforce_local_execution_time )
			{
				if  ( $now - $this -> ExecutionStartTime  >  $this -> MaxExecutionTime )
					error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ;
			}
		}

		/****************************************************************************************************************

			The order of the testings is important for maximum performance : put the most common cases first.
			A study on over 1000 PDF files has shown the following :

			- Instruction operands appear 24.5 million times
			- Tx instructions (including Tf, Tm, ', ", etc.) : 24M
			- (), <> and [] constructs for drawing text : 17M
			- Other : peanuts...
			- Ignored instructions : 0.5M (these are the instructions without interest for text extraction and that
			  could not be removed by the __strip_useless_instructions() method).

			Of course, white spaces appear more than 100M times between instructions. However, it gets hard to remove
			most of them without compromising the result of __strip_useless_instructions.

		 ***************************************************************************************************************/
		// Numeric or flag for an instruction
		if  ( $token_start  ==  '/'  ||  isset ( $numeric_starts [ $token_start ] ) )
		{
			$operand_stack []	=  $token ;

			$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;
		}
		// A 2-characters "Tx" or a 1-character quote/doublequote instruction
		else if  ( ( $length  ===  2  &&  $token_start  ===  'T' )  ||  ( $length  ===  1  &&  ( $token_start  ===  "'"  ||  $token_start  ===  '"' ) ) )
		{
			// Operands that are missing from the stack are read as zero, without raising notices
			$operands	=  $operand_stack + $zero_operands ;

			switch  ( ( $length  ===  1 ) ?  $token [0] : $token [1] )
			{
				// Tj instruction
				case	'j' :
					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Tj' ] ++ ;
					break ;

				// Tm instruction : replace the whole text matrix
				case	'm' :
					$Tm [0]		=  ( double ) $operands [0] ;
					$Tm [1]		=  ( double ) $operands [1] ;
					$Tm [2]		=  ( double ) $operands [2] ;
					$Tm [3]		=  ( double ) $operands [3] ;
					$Tm [4]		=  ( double ) $operands [4] ;
					$Tm [5]		=  ( double ) $operands [5] ;

					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ;
					break ;

				// Tf instruction : select a font and its height
				case	'f' :
					$current_font_name	=  ( isset ( $operand_stack [0] ) ) ?  $operand_stack [0] : null ;
					$key			=  "$page_number:$current_template:$current_font_name" ;

					// We have to map a font specifier (such /TT0, C0-1, etc.) into an object id.
					// Check first if we already met this font
					if  ( isset ( $this -> MapIdBuffer [ $key ] ) )
						$current_font	=  $this -> MapIdBuffer [ $key ] ;
					// Otherwise retrieve its corresponding object number and put it in our font cache
					else
					{
						$current_font	=  $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $current_font_name ) ;

						$this -> MapIdBuffer [ $key ]	=  $current_font ;
					}

					$current_font_height	=  ( double ) $operands [1] ;
					$this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ;
					break ;

				// Td instruction : relative move, scaled by the absolute value of the text matrix scaling factors
				case	'd' :
					$Tm [4]		+=  ( double ) $operands [0] * abs ( $Tm [0] ) ;
					$Tm [5]		+=  ( double ) $operands [1] * abs ( $Tm [3] ) ;

					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Td' ] ++ ;
					break ;

				// TJ instruction
				case	'J' :
					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'TJ' ] ++ ;
					break ;

				// TD instruction : relative move that also updates the text leading
				// NOTE(review): the PDF reference defines "tx ty TD" as setting the leading to -ty ; here the
				// new y position is subtracted from the current leading instead - confirm this is intended
				case	'D' :
					$Tm [4]		+=  ( double ) $operands [0] * $Tm [0] ;
					$Tm [5]		+=  ( double ) $operands [1] * $Tm [3] ;
					$text_leading	-=  $Tm [5] ;

					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'TD' ] ++ ;
					break ;

				// T* instruction : go to the start of the next line
				case	'*' :
					$Tm [4]		 =  0.0 ;
					$Tm [5]		-=  $text_leading ;

					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ;
					break ;

				// TL instruction - Set text leading, which is later used by the T* and ' instructions
				case	'L' :
					$text_leading	=  ( double ) $operands [0] ;
					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ;
					break ;

				// ' instruction : go to next line and display text
				case	"'" :
					$offset		=  $current_font_height * abs ( $Tm [3] ) ;

					// Update the coordinates of the last text block found so far - provided that there is
					// one, otherwise we would create a bogus fragment at index -1
					if  ( $page_fragment_count  >  0 )
					{
						$page_fragments [ $page_fragment_count - 1 ] [ 'x' ]	+=  $text_leading ;
						$page_fragments [ $page_fragment_count - 1 ] [ 'y' ]	-=  $offset ;
					}

					// And don't forget to update the y coordinate of the current text matrix
					$Tm [5]		-=  $offset ;

					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ;
					break ;

				// " instruction : not implemented
				case	'"' :
					if  ( self::$DEBUG )
						warning ( "Instruction $token not yet implemented." ) ;

					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '"' ] ++ ;
					break ;

				// Other : ignore them
				default :
					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
			}

			$operand_stack	=  array ( ) ;
		}
		// cm instruction : replace the current transformation matrix
		else if  ( $token  ===  'cm' )
		{
			$operands	=  $operand_stack + $zero_operands ;
			$CTM		=  array
			(
				( double ) $operands [0], ( double ) $operands [1], ( double ) $operands [2],
				( double ) $operands [3], ( double ) $operands [4], ( double ) $operands [5]
			) ;
			$operand_stack	=  array ( ) ;

			$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'cm' ] ++ ;
		}
		// q/Q instructions (save/restore graphic context)
		else if  ( $token  ===  'q' )
		{
			$graphic_stack [ $graphic_stack_size ++ ]	=  array ( $CTM, $Tm ) ;
			$operand_stack					=  array ( ) ;
		}
		else if  ( $token  ===  'Q' )
		{
			if  ( $graphic_stack_size )
				list ( $CTM, $Tm )	=  $graphic_stack [ -- $graphic_stack_size ] ;
			else if  ( self::$DEBUG )
				warning ( "Tried to restore graphics context from an empty stack." ) ;

			$operand_stack	=  array ( ) ;
		}
		// Text array in the [...] notation. Well, in fact, even non-array constructs are returned as an array by the
		// __next_token_ex() function, for the sake of simplicity
		else if  ( $token_start  ===  '[' )
		{
			$text	=  $this -> __decode_text ( $token, $current_font, $current_font_mapped, $current_font_map_width ) ;

			if  ( $text  !==  '' )
			{
				$r		=  $this -> __matrix_multiply ( $Tm, $CTM, $page_attributes [ 'width' ], $page_attributes [ 'height' ] ) ;
				$fragment	=  array
				(
					'x'		=>  ( $r [4]  <  0 ) ?  0.0 : $r [4],
					'y'		=>  ( $r [5]  <  0 ) ?  0.0 : $r [5],
					'page'		=>  $page_number,
					'template'	=>  $current_template,
					'font'		=>  $current_font_name,
					'font-height'	=>  abs ( $current_font_height * $Tm [3] ),
					'text'		=>  $text,
				) ;

				// Add debug information when needed
				if  ( self::$DEBUG )
				{
					$fragment	=  array_merge
					(
						$fragment,
						array
						(
							'CTM'			=>  $CTM,
							'Tm'			=>  $Tm,
							'New Tm'		=>  $r,
							'Real font height'	=>  $current_font_height,
							'Page width'		=>  $page_attributes [ 'width' ],
							'Page height'		=>  $page_attributes [ 'height' ]
						)
					) ;
				}

				// Add this text fragment to the list
				$page_fragments []	=  $fragment ;
				$page_fragment_count ++ ;

				$operand_stack		=  array ( ) ;
			}
		}
		// BT instruction : save the current matrices, they will be restored by the matching ET
		else if  ( $token  ===  'BT' )
		{
			$BT_nesting_level ++ ;
			$operand_stack					=  array ( ) ;
			$graphic_stack [ $graphic_stack_size ++ ]	=  array ( $CTM, $Tm ) ;

			$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'BT' ] ++ ;
		}
		// ET instruction
		else if  ( $token  ===  'ET' )
		{
			if  ( $BT_nesting_level )
			{
				$BT_nesting_level -- ;

				if  ( ! $BT_nesting_level  &&  $graphic_stack_size )
					list ( $CTM, $Tm )	=  $graphic_stack [ -- $graphic_stack_size ] ;
			}

			$operand_stack	=  array ( ) ;
		}
		// Template reference, in its substituted "!PDFTOTEXT_TEMPLATE_xxx" form
		else if  ( $token_start  ===  '!' )
		{
			if  ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P<template> \w+) /ix', $token, $match ) )
			{
				$name	=  '/' . $match [ 'template' ] ;
				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'template' ] ++ ;

				if  ( $this -> PageMap -> IsValidXObjectName ( $name ) )
					$current_template	=  $name ;
			}
			else
				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;

			$operand_stack	=  array ( ) ;
		}
		// Other instructions
		else
		{
			$operand_stack	=  array ( ) ;
			$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
		}

		// Update current index in instruction stream
		$data_index	=  $next_index ;
	}
}


// __matrix_multiply -
//	Combines matrix $ma with matrix $mb and returns the resulting 6-elements array.
//	PDF transformation matrices are 3x3 matrices containing the following values :
//
//		| sx rx 0 |
//		| ry sy 0 |
//		| tx ty 1 |
//
//	However, we do not care about the 3rd column, which is always hardcoded. Transformation
//	matrices here are implemented as 6-elements arrays :
//
//		[ sx, rx, ry, sy, tx, ty ]
//
//	Only the sign of the sx/sy elements of both matrices is taken into account. When element 0
//	(resp. 3) of the result is negative, the absolute value of its x (resp. y) position is returned.
//	The $page_width and $page_height parameters are currently not used.
//
//	NOTE(review): elements 2 and 3 of the result use the sign of $ma [3] where a true matrix product
//	would use $ma [2] - left unchanged, to be confirmed.
private function  __matrix_multiply ( $ma, $mb, $page_width, $page_height )
{
	// Scaling text is only appropriate for rendering graphics ; in our case, we just have to render
	// basic text without any consideration about its width or height ; so adjust the sx/sy parameters
	// accordingly
	$scale_1x	=  ( $ma [0]  >  0 ) ?  1 : -1 ;
	$scale_1y	=  ( $ma [3]  >  0 ) ?  1 : -1 ;
	$scale_2x	=  ( $mb [0]  >  0 ) ?  1 : -1 ;
	$scale_2y	=  ( $mb [3]  >  0 ) ?  1 : -1 ;

	// Perform the matrix multiplication
	$r	=  array ( ) ;
	$r [0]	=  ( $scale_1x * $scale_2x ) + ( $ma [1]   * $mb [2]   ) ;
	$r [1]	=  ( $scale_1x * $mb [1]   ) + ( $ma [1]   * $scale_2y ) ;
	$r [2]	=  ( $scale_1y * $scale_2x ) + ( $scale_1y * $mb [2]   ) ;
	$r [3]	=  ( $scale_1y * $mb [1]   ) + ( $scale_1y * $scale_2y ) ;
	$r [4]	=  ( $ma [4]   * $scale_2x ) + ( $ma [5]   * $mb [2]   ) + $mb [4] ;
	$r [5]	=  ( $ma [4]   * $mb [1]   ) + ( $ma [5]   * $scale_2y ) + $mb [5] ;

	// Positions computed with a negative scaling factor are returned as absolute values
	if  ( $r [0]  <  0 )
		$r [4]	=  abs ( $r [4] ) ;

	if  ( $r [3]  <  0 )
		$r [5]	=  abs ( $r [5] ) ;

	return ( $r ) ;
}


// __next_token_ex :
//	Reviewed version of __next_token, adapted to ExtractTextWithLayout.
//	Both functions will be unified when this one will be stabilized.
//
//	Scanning starts at offset $index of $data (whose length is $data_length) ; $page_number is not
//	used by this method. Returns false when the end of input is reached or when a "<<" / "<" construct
//	has no closing delimiter. Otherwise returns true, after setting :
//	- $token      : the token found. Text constructs are always returned in array notation : "(...)"
//			becomes "[(...)]" and "<...>" becomes "[<...>]" (with whitespace removed)
//	- $next_index : the offset of the first character that follows the token in $data.
private function  __next_token_ex ( $page_number, $data, $data_length, $index, &$token, &$next_index )
{
	// Skip spaces
	$count	=  0 ;

	while  ( $index  <  $data_length  &&  ( $data [ $index ]  ==  ' '  ||  $data [ $index ]  ==  "\t"  ||  $data [ $index ]  ==  "\r"  ||  $data [ $index ]  ==  "\n" ) )
	{
		$index ++ ;
		$count ++ ;
	}

	$enhanced_statistics	=  $this -> EnhancedStatistics ;
	$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'space' ]  +=  $count ;

	// End of input
	if  ( $index  >=  $data_length )
		return ( false ) ;

	// The current character will tell us what to do
	$ch	=  $data [ $index ] ;

	switch  ( $ch )
	{
		// Opening square bracket : we have to find the closing one, taking care of escape sequences
		// that can also specify a square bracket, such as "\]"
		case	"[" :
			$next_index	=  $index + 1 ;
			$parent		=  0 ;		// Nesting level of parentheses
			$token		=  '[' ;

			while  ( $next_index  <  $data_length )
			{
				$nch	=  $data [ $next_index ++ ] ;

				switch  ( $nch )
				{
					case	'(' :
						$parent ++ ;
						$token	.=  $nch ;
						break ;

					case	')' :
						$parent -- ;
						$token	.=  $nch ;
						break ;

					// Copy the escaped character verbatim - unless the backslash is the very last
					// character of the stream, in which case there is nothing left to read
					case	'\\' :
						$token	.=  $nch ;

						if  ( $next_index  <  $data_length )
							$token	.=  $data [ $next_index ++ ] ;

						break ;

					// A closing bracket only terminates the array when we are not inside parentheses
					case	']' :
						$token	.=  ']' ;

						if  ( ! $parent )
							break 2 ;

						break ;

					// Line breaks are dropped from the token
					case	"\n" :
					case	"\r" :
						break ;

					default :
						$token	.=  $nch ;
				}
			}

			$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '[' ] ++ ;

			return ( true ) ;

		// Parenthesis : Again, we have to find the closing parenthesis, taking care of escape sequences
		// such as "\)"
		case	"(" :
			$next_index	=  $index + 1 ;
			$token		=  '[' . $ch ;

			while  ( $next_index  <  $data_length )
			{
				$nch	=  $data [ $next_index ++ ] ;

				if  ( $nch  ===  '\\' )
				{
					$token	.=  $nch ;

					// A backslash ending the stream has nothing to escape
					if  ( $next_index  >=  $data_length )
						break ;

					$after	=  $data [ $next_index ] ;

					// Character references specified as \xyz, where "xyz" are octal digits
					if  ( $after  >=  '0'  &&  $after  <=  '7' )
					{
						while  ( $next_index  <  $data_length  &&  $data [ $next_index ]  >=  '0'  &&  $data [ $next_index ]  <=  '7' )
							$token	.=  $data [ $next_index ++ ] ;
					}
					// Regular character escapes
					else
						$token	.=  $data [ $next_index ++ ] ;
				}
				else if  ( $nch  ===  ')' )
				{
					$token	.=  ')' ;
					break ;
				}
				else
					$token	.=  $nch ;
			}

			$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '(' ] ++ ;
			$token	.=  ']' ;

			return ( true ) ;

		// A construction of the form : "<< something >>", or a string of hex digits "<...>"
		case	'<' :
			if  ( ! isset ( $data [ $index + 1 ] ) )
				return ( false ) ;

			if  ( $data [ $index + 1 ]  ===  '<' )
			{
				$next_index	=  strpos ( $data, '>>', $index + 2 ) ;

				if  ( $next_index  ===  false )
					return ( false ) ;

				$token		 =  substr ( $data, $index, $next_index - $index + 2 ) ;
				$next_index	+=  2 ;

				return ( true ) ;
			}

			// Start searching right after the opening bracket, so that an empty hex string ("<>")
			// is terminated by its own closing bracket and not by a later one
			$next_index	=  strpos ( $data, '>', $index + 1 ) ;

			if  ( $next_index  ===  false )
				return ( false ) ;

			$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '<' ] ++ ;

			// There can be spaces and newlines inside a series of hex digits, so remove them...
			$result		=  preg_replace ( '/\s+/', '', substr ( $data, $index, $next_index - $index + 1 ) ) ;

			$token		=  "[$result]" ;
			$next_index ++ ;

			return ( true ) ;

		// Tick characters : consider them as keywords, in the same way as the "TJ" or "Tj" keywords.
		// The next index is computed from the position of the quote itself, whatever the amount of
		// whitespace that preceded it.
		case	"'" :
		case	'"' :
			$token		=  $ch ;
			$next_index	=  $index + 1 ;

			return ( true ) ;

		// Other cases : this may be either a floating-point number or a keyword
		default :
			$next_index	=  ++ $index ;
			$token		=  $ch ;

			if  ( isset ( $data [ $next_index ] ) )
			{
				// Number : digits, signs and dots
				if  ( ( $ch  >=  '0'  &&  $ch  <=  '9' )  ||  $ch  ==  '-'  ||  $ch  ==  '+'  ||  $ch  ==  '.' )
				{
					while  ( $next_index  <  $data_length  &&
							( ( $data [ $next_index ]  >=  '0'  &&  $data [ $next_index ]  <=  '9' )  ||
								$data [ $next_index ]  ===  '-'  ||  $data [ $next_index ]  ===  '+'  ||  $data [ $next_index ]  ===  '.' ) )
						$token	.=  $data [ $next_index ++ ] ;
				}
				// Keyword, "/name" or "!substitution" : collect alphanumerics and a few punctuation characters
				else if  ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALPHA )  ||
						$ch  ==  '/'  ||  $ch  ==  '!' )
				{
					$ch	=  $data [ $next_index ] ;

					while  ( $next_index  <  $data_length  &&
							( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM )  ||
								$ch  ==  '*'  ||  $ch  ==  '-'  ||  $ch  ==  '_'  ||  $ch  ==  '.'  ||  $ch  ==  '+' ) )
					{
						$token	.=  $ch ;
						$next_index ++ ;

						if  ( isset ( $data [ $next_index ] ) )
							$ch	=  $data [ $next_index ] ;
					}
				}
			}

			return ( true ) ;
	}
}
5106
5107
5108 // __decode_text -
5109 // Text decoding function when the PDFOPT_BASIC_LAYOUT flag is specified.
5110 private function __decode_text ( $data, $current_font, $current_font_mapped, $current_font_map_width )
5111 {
5112 list ( $text_values, $offsets ) = $this -> __extract_chars_from_array ( $data ) ;
5113 $value_index = 0 ;
5114 $result = '' ;
5115
5116 // Fonts having character maps will require some special processing
5117 if ( $current_font_mapped )
5118 {
5119 // Loop through each text value
5120 foreach ( $text_values as $text )
5121 {
5122 $is_hex = ( $text [0] == '<' ) ;
5123 $length = strlen ( $text ) - 1 ;
5124 $handled = false ;
5125
5126 // Characters are encoded within angle brackets ( "<>" ).
5127 // Note that several characters can be specified within the same angle brackets, so we have to take
5128 // into account the width we detected in the begincodespancerange construct
5129 if ( $is_hex )
5130 {
5131 for ( $i = 1 ; $i < $length ; $i += $current_font_map_width )
5132 {
5133 $value = substr ( $text, $i, $current_font_map_width ) ;
5134 $ch = hexdec ( $value ) ;
5135
5136 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
5137 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
5138 else
5139 {
5140 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch ) ;
5141 $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ;
5142 }
5143
5144 $result .= $newchar ;
5145 }
5146
5147 $handled = true ;
5148 }
5149 // Yes ! double-byte codes can also be specified as plain text within parentheses !
5150 // However, we have to be really careful here ; the sequence :
5151 // (Be)
5152 // can mean the string "Be" or the Unicode character 0x4265 ('B' = 0x42, 'e' = 0x65)
5153 // We first look if the character map contains an entry for Unicode codepoint 0x4265 ;
5154 // if not, then we have to consider that it is regular text to be taken one character by
5155 // one character. In this case, we fall back to the "if ( ! $handled )" condition
5156 else if ( $current_font_map_width == 4 )
5157 {
5158 $temp_result = '' ;
5159
5160 for ( $i = 1 ; $i < $length ; $i ++ )
5161 {
5162 // Each character in the pair may be a backslash, which escapes the next character so we must skip it
5163 // This code needs to be reviewed ; the same code is duplicated to handle escaped characters in octal notation
5164 if ( $text [$i] != '\\' )
5165 $ch1 = $text [$i] ;
5166 else
5167 {
5168 $i ++ ;
5169
5170 if ( $text [$i] < '0' || $text [$i] > '7' )
5171 $ch1 = $this -> ProcessEscapedCharacter ( $text [$i] ) ;
5172 else
5173 {
5174 $oct = '' ;
5175 $digit_count = 0 ;
5176
5177 while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 )
5178 {
5179 $oct .= $text [$i ++] ;
5180 $digit_count ++ ;
5181 }
5182
5183 $ch1 = chr ( octdec ( $oct ) ) ;
5184 $i -- ;
5185 }
5186 }
5187
5188 $i ++ ;
5189
5190 if ( $text [$i] != '\\' )
5191 $ch2 = $text [$i] ;
5192 else
5193 {
5194 $i ++ ;
5195
5196 if ( $text [$i] < '0' || $text [$i] > '7' )
5197 $ch2 = $this -> ProcessEscapedCharacter ( $text [$i] ) ;
5198 else
5199 {
5200 $oct = '' ;
5201 $digit_count = 0 ;
5202
5203 while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 )
5204 {
5205 $oct .= $text [$i ++] ;
5206 $digit_count ++ ;
5207 }
5208
5209 $ch2 = chr ( octdec ( $oct ) ) ;
5210 $i -- ;
5211 }
5212 }
5213
5214 // Build the 2-bytes character code
5215 $ch = ( ord ( $ch1 ) << 8 ) | ord ( $ch2 ) ;
5216
5217 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
5218 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
5219 else
5220 {
5221 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch, true ) ;
5222 $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ;
5223 }
5224
5225 // Yes !!! for characters encoded with two bytes, we can find the following construct :
5226 // 0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")"
5227 // which must be expanded as : (Car)
5228 // We have here the escape sequences "\(" and "\)", but the backslash is encoded on two bytes
5229 // (although the MSB is nul), while the escaped character is encoded on 1 byte. waiting
5230 // for the next quirk to happen...
5231 if ( $newchar == '\\' )
5232 {
5233 $newchar = $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] ) ;
5234 $i ++ ; // this time we processed 3 bytes, not 2
5235 }
5236
5237 $temp_result .= $newchar ;
5238 }
5239
5240 // Happens only if we were unable to translate a character using the current character map
5241 $result .= $temp_result ;
5242 $handled = true ;
5243 }
5244
5245 // Character strings within parentheses.
5246 // For every text value, use the character map table for substitutions
5247 if ( ! $handled )
5248 {
5249 for ( $i = 1 ; $i < $length ; $i ++ )
5250 {
5251 $ch = $text [$i] ;
5252
5253 // Set to true to optimize calls to MapCharacters
5254 // Currently does not work with pobox@dizy.sk/infoma.pdf (a few characters differ)
5255 $use_map_buffer = false ;
5256
5257 // ... but don't forget to handle escape sequences "\n" and "\r" for characters
5258 // 10 and 13
5259 if ( $ch == '\\' )
5260 {
5261 $ch = $text [++$i] ;
5262
5263 // Escaped character
5264 if ( $ch < '0' || $ch > '7' )
5265 $ch = $this -> ProcessEscapedCharacter ( $ch ) ;
5266 // However, an octal form can also be specified ; in this case we have to take into account
5267 // the character width for the current font (if the character width is 4 hex digits, then we
5268 // will encounter constructs such as "\000\077").
5269 // The method used here is dirty : we build a regex to match octal character representations on a substring
5270 // of the text
5271 else
5272 {
5273 $width = $current_font_map_width / 2 ; // Convert to byte count
5274 $subtext = substr ( $text, $i - 1 ) ;
5275 $regex = "#^ (\\\\ [0-7]{3}){1,$width} #imsx" ;
5276
5277 $status = preg_match ( $regex, $subtext, $octal_matches ) ;
5278
5279 if ( $status )
5280 {
5281 $octal_values = explode ( '\\', substr ( $octal_matches [0], 1 ) ) ;
5282 $ord = 0 ;
5283
5284 foreach ( $octal_values as $octal_value )
5285 $ord = ( $ord << 8 ) + octdec ( $octal_value ) ;
5286
5287 $ch = chr ( $ord ) ;
5288 $i += strlen ( $octal_matches [0] ) - 2 ;
5289 }
5290 }
5291
5292 $use_map_buffer = false ;
5293 }
5294
5295 // Add substituted character to the output result
5296 $ord = ord ( $ch ) ;
5297
5298 if ( ! $use_map_buffer )
5299 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
5300 else
5301 {
5302 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
5303 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
5304 else
5305 {
5306 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
5307 $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ;
5308 }
5309 }
5310
5311 $result .= $newchar ;
5312 }
5313 }
5314
5315 // Handle offsets between blocks of characters
5316 if ( isset ( $offsets [ $value_index ] ) &&
5317 - ( $offsets [ $value_index ] ) > $this -> MinSpaceWidth )
5318 $result .= $this -> __get_character_padding ( $offsets [ $value_index ] ) ;
5319
5320 $value_index ++ ;
5321 }
5322 }
5323 // For fonts having no associated character map, we simply encode the string in UTF8
5324 // after the C-like escape sequences have been processed
5325 // Note that <xxxx> constructs can be encountered here, so we have to process them as well
5326 else
5327 {
5328 foreach ( $text_values as $text )
5329 {
5330 $is_hex = ( $text [0] == '<' ) ;
5331 $length = strlen ( $text ) - 1 ;
5332
5333 // Some text within parentheses may have a backslash followed by a newline, to indicate some continuation line.
5334 // Example :
5335 // (this is a sentence \
5336 // continued on the next line)
5337 // Funny isn't it ? so remove such constructs because we don't care
5338 $text = str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text ) ;
5339
5340 // Characters are encoded within angle brackets ( "<>" )
5341 if ( $is_hex )
5342 {
5343 for ( $i = 1 ; $i < $length ; $i += 2 )
5344 {
5345 $ch = hexdec ( substr ( $text, $i, 2 ) ) ;
5346
5347 $result .= $this -> CodePointToUtf8 ( $ch ) ;
5348 }
5349 }
5350 // Characters are plain text
5351 else
5352 {
5353 $text = self::Unescape ( $text ) ;
5354
5355 for ( $i = 1, $length = strlen ( $text ) - 1 ; $i < $length ; $i ++ )
5356 {
5357 $ch = $text [$i] ;
5358 $ord = ord ( $ch ) ;
5359
5360 if ( $ord < 127 )
5361 $newchar = $ch ;
5362 else
5363 {
5364 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
5365 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
5366 else
5367 {
5368 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
5369 $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ;
5370 }
5371 }
5372
5373 $result .= $newchar ;
5374 }
5375 }
5376
5377 // Handle offsets between blocks of characters
5378 if ( isset ( $offsets [ $value_index ] ) &&
5379 abs ( $offsets [ $value_index ] ) > $this -> MinSpaceWidth )
5380 $result .= $this -> __get_character_padding ( $offsets [ $value_index ] ) ;
5381
5382 $value_index ++ ;
5383 }
5384 }
5385
5386 // All done, return
5387 return ( $result ) ;
5388 }
5389
5390
// __assemble_text_fragments -
//	Assembles the text fragments collected by the ExtractTextWithLayout function into the text of one page.
//	The fragments are given a 'width' entry, sorted by (y,x) position and grouped by line ; each line is then
//	output fragment after fragment, with a block separator inserted whenever a fragment starts to the right of
//	the end of the previous one, and an end of line appended after each line.
//
//	$page_number is the key used to read the page attributes from PageMap -> PageAttributes.
//	$fragments is received by reference : every entry gets its 'width' entry and the array is sorted in place.
//	$page_width and $page_height are output parameters ; they receive the 'width' and 'height' page attributes
//	when these are set and non-zero, otherwise a value derived from the fragment coordinates.
//
//	Returns the assembled text, or an empty string when there is no fragment.
private function __assemble_text_fragments ( $page_number, &$fragments, &$page_width, &$page_height )
   {
	// No fragment no cry...
	if ( ! count ( $fragments ) )
		return ( '' ) ;

	// Compute the width of each fragment
	foreach ( $fragments as &$fragment )
		$this -> __compute_fragment_width ( $fragment ) ;

	// $fragment is still a reference to the last array entry : break it now, otherwise every later assignment
	// to $fragment in this function would silently overwrite one entry of the caller's array
	unset ( $fragment ) ;

	// Sort the fragments and group them by line
	usort ( $fragments, array ( $this, '__sort_page_fragments' ) ) ;
	$line_fragments = $this -> __group_line_fragments ( $fragments ) ;

	// Retrieve the page attributes
	$page_attributes = $this -> PageMap -> PageAttributes [ $page_number ] ;

	// Some buggy PDF do not specify page width or page height so, during the processing of text fragments,
	// page width & height will be set to the largest x/y coordinate
	if ( isset ( $page_attributes [ 'width' ] ) && $page_attributes [ 'width' ] )
		$page_width = $page_attributes [ 'width' ] ;
	else
	   {
		$page_width = 0 ;

		foreach ( $fragments as $fragment )
		   {
			$end_x = $fragment [ 'x' ] + $fragment [ 'width' ] ;

			if ( $end_x > $page_width )
				$page_width = $end_x ;
		   }
	   }

	// Fragments are sorted by decreasing y, so the first one holds the largest y coordinate
	if ( isset ( $page_attributes [ 'height' ] ) && $page_attributes [ 'height' ] )
		$page_height = $page_attributes [ 'height' ] ;
	else
		$page_height = $fragments [0] [ 'y' ] ;

	// Block separator
	$separator = ( $this -> BlockSeparator ) ? $this -> BlockSeparator : ' ' ;

	// Unprocessed marker count
	$unprocessed_marker_count = count ( $this -> UnprocessedMarkerList [ 'font' ] ) ;

	// Add page information if the PDFOPT_DEBUG_SHOW_COORDINATES option has been specified
	if ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES )
		$result = "[Page : $page_number, width = $page_width, height = $page_height]" . $this -> EOL ;
	else
		$result = '' ;

	// Loop through each line of fragments
	for ( $i = 0, $line_count = count ( $line_fragments ) ; $i < $line_count ; $i ++ )
	   {
		$current_x = 0 ;

		// Loop through each fragment of the current line
		for ( $j = 0, $line_fragment_count = count ( $line_fragments [$i] ) ; $j < $line_fragment_count ; $j ++ )
		   {
			$fragment = $line_fragments [$i] [$j] ;

			// Process the markers which do not have an associated font yet - this will be done by matching
			// the current text fragment against one of the regular expressions defined.
			// If a match occurs, then all the subsequent text fragments using the same font will be given markers
			for ( $k = 0 ; $k < $unprocessed_marker_count ; $k ++ )
			   {
				$marker = $this -> UnprocessedMarkerList [ 'font' ] [$k] ;

				if ( preg_match ( $marker [ 'regex' ], $fragment [ 'text' ] ) )
				   {
					$this -> TextWithFontMarkers [ $fragment [ 'font' ] ] = array
					   (
						'font'		=> $fragment [ 'font' ],
						'height'	=> $fragment [ 'font-height' ],
						'regex'		=> $marker [ 'regex' ],
						'start'		=> $marker [ 'start' ],
						'end'		=> $marker [ 'end' ]
					    ) ;

					// Remove the marker, then reindex the list : this loop addresses entries 0..count-1,
					// so a hole left by unset() would hide the last marker and expose a missing index
					unset ( $this -> UnprocessedMarkerList [ 'font' ] [$k] ) ;
					$this -> UnprocessedMarkerList [ 'font' ] = array_values ( $this -> UnprocessedMarkerList [ 'font' ] ) ;
					$unprocessed_marker_count -- ;

					break ;
				   }
			   }

			// Add debug info if needed
			if ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES )
				$result .= $this -> __debug_get_coordinates ( $fragment ) ;

			// Add a separator between two fragments, if needed
			if ( $j && $current_x < floor ( $fragment [ 'x' ] ) )		// Accept small rounding errors
				$result .= $separator ;

			// Check if we need to add markers around this text fragment
			if ( isset ( $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] ) &&
					$this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'height' ] == $fragment [ 'font-height' ] )
			   {
				$fragment_text = $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'start' ] .
						 $fragment [ 'text' ] .
						 $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'end' ] ;
			   }
			else
				$fragment_text = $fragment [ 'text' ] ;

			// Add the current fragment to the result
			$result .= $fragment_text ;

			// Update current x-position
			$current_x = $fragment [ 'x' ] + $fragment [ 'width' ] ;
		   }

		// Add a line break between each line
		$result .= $this -> EOL ;
	   }

	// All done, return
	return ( $result ) ;
   }
5517
5518
// __sort_page_fragments -
//	usort() callback that orders page fragments by their (y,x) coordinates : largest y first, then
//	smallest x first.
//	Always returns -1, 0 or 1. A raw coordinate difference must not be returned, because usort() casts the
//	callback result to an integer and would treat any difference smaller than 1 as "equal".
public function __sort_page_fragments ( $a, $b )
   {
	// Primary key : y coordinate, in decreasing order
	if ( $a [ 'y' ] != $b [ 'y' ] )
		return ( ( $a [ 'y' ] < $b [ 'y' ] ) ? 1 : -1 ) ;

	// Secondary key : x coordinate, in increasing order
	if ( $a [ 'x' ] != $b [ 'x' ] )
		return ( ( $a [ 'x' ] < $b [ 'x' ] ) ? -1 : 1 ) ;

	return ( 0 ) ;
   }
5533
5534
// __sort_line_fragments -
//	usort() callback that orders the fragments of one line by increasing x coordinate.
//	Always returns -1, 0 or 1 : usort() casts the callback result to an integer, so returning the raw
//	difference of two coordinates would make fragments less than one unit apart compare as equal.
public function __sort_line_fragments ( $a, $b )
   {
	if ( $a [ 'x' ] == $b [ 'x' ] )
		return ( 0 ) ;

	return ( ( $a [ 'x' ] < $b [ 'x' ] ) ? -1 : 1 ) ;
   }
5541
5542
// __group_line_fragments -
//	Groups page fragments per line, allowing a certain variation in the y-position.
//	The fragments are walked in the order received. A fragment is added to the current line as long as its
//	y coordinate plus its font height is at least the y coordinate of the fragment that opened the line ;
//	otherwise it opens a new line.
//	Returns an array of lines, each line being an array of fragments sorted by increasing x coordinate.
//	An empty input yields an empty array.
private function __group_line_fragments ( $fragments )
   {
	$result = array ( ) ;
	$fragment_count = count ( $fragments ) ;

	// Nothing to group : do not try to read a first fragment that does not exist
	if ( ! $fragment_count )
		return ( $result ) ;

	$last_y_coordinate = $fragments [0] [ 'y' ] ;
	$current_fragments = array ( $fragments [0] ) ;

	for ( $i = 1 ; $i < $fragment_count ; $i ++ )
	   {
		$fragment = $fragments [$i] ;

		if ( $fragment [ 'y' ] + $fragment [ 'font-height' ] >= $last_y_coordinate )
			$current_fragments [] = $fragment ;
		else
		   {
			// This fragment is too low to belong to the current line : close the line and open a new one
			$last_y_coordinate = $fragment [ 'y' ] ;
			usort ( $current_fragments, array ( $this, '__sort_line_fragments' ) ) ;
			$result [] = $current_fragments ;
			$current_fragments = array ( $fragment ) ;
		   }
	   }

	// The line being built always holds at least one fragment at this point
	usort ( $current_fragments, array ( $this, '__sort_line_fragments' ) ) ;
	$result [] = $current_fragments ;

	return ( $result ) ;
   }
5575
5576
// __compute_fragment_width -
//	Stores the width of the specified text fragment in its 'width' entry, then returns the font object
//	associated with the fragment.
//	Font objects returned by FontTable -> GetFontObject() are kept in the FontObjectsBuffer property, keyed
//	by "page:template:font", to avoid repeated lookups.
//	The width is obtained from the font object's GetStringWidth() method ; it is 0 when the font object
//	evaluates to false.
private function __compute_fragment_width ( &$fragment )
   {
	$buffer_key = $fragment [ 'page' ] . ':' . $fragment [ 'template' ] . ':' . $fragment [ 'font' ] ;

	// First time this page/template/font combination is seen : look it up and remember it
	if ( ! isset ( $this -> FontObjectsBuffer [ $buffer_key ] ) )
	   {
		$this -> FontObjectsBuffer [ $buffer_key ] = $this -> FontTable -> GetFontObject
		   (
			$fragment [ 'page' ],
			$fragment [ 'template' ],
			$fragment [ 'font' ]
		    ) ;
	   }

	$font_object = $this -> FontObjectsBuffer [ $buffer_key ] ;

	// A width can only be computed when a font object is available
	if ( $font_object )
		$fragment [ 'width' ] = $font_object -> GetStringWidth ( $fragment [ 'text' ], $this -> ExtraTextWidth ) ;
	else
		$fragment [ 'width' ] = 0 ;

	return ( $font_object ) ;
   }
5599
5600
// __debug_get_coordinates -
//	Builds the debug string describing the specified text fragment : a newline followed by
//	"[x:.., y:.., w: .., h:.., font:..]", where the x, y, width and font-height entries are rounded to
//	3 decimals.
private function __debug_get_coordinates ( $fragment )
   {
	$items = array
	   (
		'x:' . round ( $fragment [ 'x' ], 3 ),
		'y:' . round ( $fragment [ 'y' ], 3 ),
		'w: ' . round ( $fragment [ 'width' ], 3 ),
		'h:' . round ( $fragment [ 'font-height' ], 3 ),
		'font:' . $fragment [ 'font' ]
	    ) ;

	return ( "\n[" . implode ( ', ', $items ) . ']' ) ;
   }
5608
5609
/*--------------------------------------------------------------------------------------------------------------

    NAME
	GetTrailerInformation - Retrieves trailer information.

    PROTOTYPE
	$this -> GetTrailerInformation ( $contents, $pdf_objects ) ;

    DESCRIPTION
	Looks for the first "trailer << ... >>" construct in the PDF contents and extracts from it :
	- The two parts of the file ID (/ID entry), stored in the ID and ID2 properties
	- The id of the object containing encryption data (/Encrypt entry), if any
	- The encryption data itself, built by PdfEncryptionData::GetInstance() and stored in the
	  EncryptionData property ; the IsEncrypted property is set accordingly
	Nothing is changed when no trailer is found. When the object referenced by /Encrypt is missing,
	encryption data is left untouched and an error is reported only in debug mode.

    PARAMETERS
	$contents (string) -
		PDF file contents.

	$pdf_objects (array) -
		Array indexed by object id ; the entry for the /Encrypt object is handed to
		PdfEncryptionData::GetInstance().

 *-------------------------------------------------------------------------------------------------------------*/
protected function GetTrailerInformation ( $contents, $pdf_objects )
   {
	// Matches the "/ID [ <id1> <id2> ]" entry of the trailer
	static $id_regex = '#
				/ID \s* \[ \s*
					< (?P<id1> [^>]+) >
					\s*
					< (?P<id2> [^>]+) >
					\s* \]
			    #imsx' ;

	$matches = array ( ) ;

	// Be paranoid : check if there is trailer information
	if ( ! preg_match ( '/trailer \s* << (?P<trailer> .+?) >>/imsx', $contents, $matches ) )
		return ;

	$trailer_data = $matches [ 'trailer' ] ;

	// Get the unique file id from the trailer data
	if ( preg_match ( $id_regex, $trailer_data, $matches ) )
	   {
		$this -> ID = $matches [ 'id1' ] ;
		$this -> ID2 = $matches [ 'id2' ] ;
	   }

	// Without an /Encrypt flag, there is nothing more to retrieve
	if ( ! preg_match ( '#/Encrypt \s+ (?P<object> \d+)#ix', $trailer_data, $matches ) )
		return ;

	$encrypt_object_id = $matches [ 'object' ] ;

	if ( isset ( $pdf_objects [ $encrypt_object_id ] ) )
	   {
		// Parse encryption information
		$this -> EncryptionData = PdfEncryptionData::GetInstance ( $this -> ID, $encrypt_object_id, $pdf_objects [ $encrypt_object_id ] ) ;
		$this -> IsEncrypted = ( $this -> EncryptionData !== false ) ;
	   }
	else if ( self::$DEBUG )
		error ( new PdfToTextDecodingException ( "Object #$encrypt_object_id, which should contain encryption data, is missing." ) ) ;
   }
5670
5671
// __build_ignored_instructions :
//	Turns the template regular expressions into usable ones : in each template, the keys of the
//	self::$ReplacementConstructs array are replaced with their values, then the result is enclosed in
//	'/' delimiters with the 'msx' modifiers.
//	Templates from self::$IgnoredInstructionTemplatesLayout are appended to both self::$IgnoredInstructionsLayout
//	and self::$IgnoredInstructionsNoLayout ; templates from self::$IgnoredInstructionTemplatesNoLayout are
//	appended to self::$IgnoredInstructionsNoLayout only.
private function __build_ignored_instructions ( )
   {
	$constructs = array_keys ( self::$ReplacementConstructs ) ;
	$expansions = array_values ( self::$ReplacementConstructs ) ;

	// Instructions ignored whether layout is rendered or not
	foreach ( self::$IgnoredInstructionTemplatesLayout as $template )
	   {
		$expanded = str_replace ( $constructs, $expansions, $template ) ;
		$regex = '/' . $expanded . '/msx' ;

		self::$IgnoredInstructionsLayout [] = $regex ;
		self::$IgnoredInstructionsNoLayout [] = $regex ;
	   }

	// Instructions ignored only when layout is not rendered
	foreach ( self::$IgnoredInstructionTemplatesNoLayout as $template )
	   {
		$expanded = str_replace ( $constructs, $expansions, $template ) ;

		self::$IgnoredInstructionsNoLayout [] = '/' . $expanded . '/msx' ;
	   }
   }
5696
5697
// __convert_utf16 :
//	Some strings found in a pdf file can be encoded in UTF16 (author information, for example).
//	A string is considered to be UTF16 when its first two bytes are a byte order mark (FE FF or FF FE) ;
//	it is then converted to UTF8 with mb_convert_encoding(). Any other string is returned unchanged.
private function __convert_utf16 ( $text )
   {
	// Too short to hold a byte order mark
	if ( ! isset ( $text [0], $text [1] ) )
		return ( $text ) ;

	$byte_order_mark = ( ord ( $text [0] ) << 8 ) | ord ( $text [1] ) ;

	if ( $byte_order_mark == 0xFEFF || $byte_order_mark == 0xFFFE )
		$text = mb_convert_encoding ( $text, 'UTF-8', 'UTF-16' ) ;

	return ( $text ) ;
   }
5714
5715
// __extract_chars_from_array -
//	Extracts characters enclosed either within parentheses (character codes) or angle brackets (hex value)
//	from an array, together with the numeric offsets found between them.
//	The first and last characters of $array (the enclosing brackets) are not examined.
//	Example :
//
//		[<0D>-40<02>-36<03>-39<0E>-36<0F>-36<0B>-37<10>-37<10>-35(abc)]
//
//	will return an array whose first entry holds the following strings :
//
//		<0D>, <02>, <03>, <0E>, <0F>, <0B>, <10>, <10>, (abc)
//
//	and whose second entry holds the offsets, as doubles :
//
//		-40, -36, -39, -36, -36, -37, -37, -35
//
//	Within a string, a backslash and the character that follows it are copied as is, so that an escaped
//	closing delimiter does not end the string.
//	Whitespace, or any other character that is neither a string delimiter nor part of a number, is skipped
//	without generating an offset entry ; otherwise offsets would no longer line up with the strings.
private function __extract_chars_from_array ( $array )
   {
	$length = strlen ( $array ) - 1 ;
	$result = array ( ) ;
	$offsets = array ( ) ;

	for ( $i = 1 ; $i < $length ; $i ++ )		// Start with character right after the opening bracket
	   {
		$ch = $array [$i] ;

		if ( $ch == '(' )
			$endch = ')' ;
		else if ( $ch == '<' )
			$endch = '>' ;
		else
		   {
			// Not a string : collect a number, if any
			$value = '' ;

			while ( $i < $length && ( ( $array [$i] >= '0' && $array [$i] <= '9' ) ||
					$array [$i] == '-' || $array [$i] == '+' || $array [$i] == '.' ) )
				$value .= $array [$i ++] ;

			if ( $value !== '' )
			   {
				$offsets [] = ( double ) $value ;
				$i -- ;		// The loop increment will step onto the character that ended the number
			   }

			continue ;
		   }

		// Collect the string contents up to the closing delimiter
		$char = $ch ;
		$i ++ ;

		while ( $i < $length && $array [$i] != $endch )
		   {
			if ( $array [$i] == '\\' )
				$char .= '\\' . $array [++ $i] ;
			else
				$char .= $array [$i] ;

			$i ++ ;
		   }

		$result [] = $char . $endch ;
	   }

	return ( array ( $result, $offsets ) ) ;
   }
5779
5780
// __extract_chars_from_block -
//	Extracts characters from a text block (enclosed in parentheses), decoding backslash escape sequences.
//	Returns an array of character ordinals if the $as_array parameter is true, or a string if false.
//
//	$start_index is the index of the first character to process (0 when false).
//	$length is the index at which processing stops, this index being excluded (the string length when false).
//
//	Recognized escape sequences are \n, \r, \t, \b, \f, \v and octal codes of one to three digits ; for an
//	octal code, only the low-order byte of the value is kept. A backslash followed by any other character
//	yields that character, and a backslash that ends the string is kept as is.
private function __extract_chars_from_block ( $text, $start_index = false, $length = false, $as_array = false )
   {
	$result = ( $as_array ) ? array ( ) : '' ;

	if ( $start_index === false )
		$start_index = 0 ;

	if ( $length === false )
		$length = strlen ( $text ) ;

	$ord0 = ord ( '0' ) ;

	for ( $i = $start_index ; $i < $length ; $i ++ )
	   {
		$ch = $text [$i] ;

		if ( $ch == '\\' && isset ( $text [ $i + 1 ] ) )
		   {
			$ch2 = $text [ ++ $i ] ;

			switch ( $ch2 )
			   {
				case 'n' : $ch = "\n"   ; break ;
				case 'r' : $ch = "\r"   ; break ;
				case 't' : $ch = "\t"   ; break ;
				case 'b' : $ch = "\x08" ; break ;
				case 'f' : $ch = "\f"   ; break ;
				case 'v' : $ch = "\v"   ; break ;

				default :
					if ( $ch2 >= '0' && $ch2 <= '7' )
					   {
						// The value of a digit is its character code minus the code of '0'
						$ord = ord ( $ch2 ) - $ord0 ;
						$digit_count = 1 ;

						// An octal escape holds at most 3 digits ; a 4th digit is an ordinary character
						while ( $digit_count < 3 && isset ( $text [ $i + 1 ] ) &&
								$text [ $i + 1 ] >= '0' && $text [ $i + 1 ] <= '7' )
						   {
							$ord = ( $ord * 8 ) + ord ( $text [ ++ $i ] ) - $ord0 ;
							$digit_count ++ ;
						   }

						$ch = chr ( $ord & 0xFF ) ;
					   }
					else
						$ch = $ch2 ;
			   }
		   }

		if ( $as_array )
			$result [] = ord ( $ch ) ;
		else
			$result .= $ch ;
	   }

	return ( $result ) ;
   }
5847
5848
// __get_character_padding :
//	Returns the padding to be inserted for an offset specified between two character groups in an array
//	notation for displaying text.
//	An empty string is returned unless the offset is less than or equal to -MinSpaceWidth. Otherwise the
//	padding is the Separator property, repeated when the PDFOPT_REPEAT_SEPARATOR option is set ; the result
//	goes through self::Unescape() and is then converted from ISO-8859-1 to UTF8.
private function __get_character_padding ( $char_offset )
   {
	if ( $char_offset > - $this -> MinSpaceWidth )
		return ( '' ) ;

	$padding = $this -> Separator ;

	if ( $this -> Options & self::PDFOPT_REPEAT_SEPARATOR )
	   {
		// If the MinSpaceWidth property is less than 1000 (text units), consider it has the value 1000
		// so that an exuberant number of spaces will not be repeated
		$space_width = ( $this -> MinSpaceWidth < 1000 ) ? 1000 : $this -> MinSpaceWidth ;
		$repeat_count = ( int ) abs ( round ( $char_offset / $space_width, 0 ) ) ;

		// A rounded count of zero still gives one separator
		if ( $repeat_count )
			$padding = str_repeat ( $this -> Separator, $repeat_count ) ;
	   }

	$padding = self::Unescape ( $padding ) ;

	// utf8_encode() is deprecated as of PHP 8.2 ; mb_convert_encoding() performs the same ISO-8859-1 to UTF8
	// conversion and is preferred whenever the mbstring extension is available
	if ( function_exists ( 'mb_convert_encoding' ) )
		return ( mb_convert_encoding ( $padding, 'UTF-8', 'ISO-8859-1' ) ) ;

	return ( utf8_encode ( $padding ) ) ;
   }
5877
5878
// __get_output_image_filename -
//	Returns a real filename based on the template supplied by the ImageAutoSaveFileTemplate property.
//	The template is scanned once from left to right, and the following constructs are expanded :
//		%%	A single percent sign
//		%p	Directory of the original PDF file (Filename property)
//		%f	Name of the original PDF file, without its directory and suffix
//		%s	Image file suffix for the ImageAutoSaveFormat property, or "unknown"
//		%d	Value of the ImageCount property
//		%xd	Same, zero-padded to a width of "x" digits
//	Any other text, including an unrecognized or unterminated "%" construct, is copied as is. Expanded text
//	is never scanned again, so a directory or file name containing a percent sign is left untouched.
private function __get_output_image_filename ( )
   {
	static $suffixes = array
	   (
		IMG_JPEG	=> 'jpg',
		IMG_JPG		=> 'jpg',
		IMG_GIF		=> 'gif',
		IMG_PNG		=> 'png',
		IMG_WBMP	=> 'wbmp',
		IMG_XPM		=> 'xpm'
	    ) ;

	$template = $this -> ImageAutoSaveFileTemplate ;
	$length = strlen ( $template ) ;
	$parts = pathinfo ( $this -> Filename ) ;
	$basename = ( isset ( $parts [ 'basename' ] ) ) ? $parts [ 'basename' ] : '' ;
	$dirname = ( isset ( $parts [ 'dirname' ] ) ) ? $parts [ 'dirname' ] : '' ;

	if ( isset ( $parts [ 'filename' ] ) )
		$filename = $parts [ 'filename' ] ;
	else						// for PHP versions < 5.2
	   {
		// The name is what precedes the last dot of the basename
		$index = strrpos ( $basename, '.' ) ;
		$filename = ( $index === false ) ? $basename : substr ( $basename, 0, $index ) ;
	   }

	$result = '' ;

	for ( $i = 0 ; $i < $length ; $i ++ )
	   {
		$ch = $template [$i] ;

		// Ordinary character, or percent sign ending the template : copy as is
		if ( $ch != '%' || $i + 1 >= $length )
		   {
			$result .= $ch ;
			continue ;
		   }

		// Percent sign found : check the character after
		$ch = $template [ ++ $i ] ;

		switch ( $ch )
		   {
			// "%%" : Replace it with a single percent
			case '%' :
				$result .= '%' ;
				break ;

			// "%p" : Path of the original PDF file
			case 'p' :
				$result .= $dirname ;
				break ;

			// "%f" : Filename part of the original PDF file, without its suffix
			case 'f' :
				$result .= $filename ;
				break ;

			// "%s" : Output image file suffix, determined by the ImageAutoSaveFormat property
			case 's' :
				if ( isset ( $suffixes [ $this -> ImageAutoSaveFormat ] ) )
					$result .= $suffixes [ $this -> ImageAutoSaveFormat ] ;
				else
					$result .= 'unknown' ;

				break ;

			// "%d" : Sequential index
			case 'd' :
				$result .= $this -> ImageCount ;
				break ;

			// Other : may be "%xd", where "x" are digits expressing the width of the final sequential index
			default :
				$first_index = $i ;
				$width = 0 ;

				while ( $i < $length && ctype_digit ( $template [$i] ) )
				   {
					$width = ( $width * 10 ) + ord ( $template [$i] ) - ord ( '0' ) ;
					$i ++ ;
				   }

				if ( $i > $first_index && $i < $length && $template [$i] == 'd' )
					$result .= sprintf ( "%0{$width}d", $this -> ImageCount ) ;
				else
				   {
					// Not a known construct : keep the percent sign and resume scanning right after it
					$result .= '%' ;
					$i = $first_index - 1 ;
				   }
		   }
	   }

	// All done, return
	return ( $result ) ;
   }
5991
5992
// __rtl_process -
//	Processes the contents of a page when it contains characters belonging to an RTL language.
//	The text is returned unchanged when it contains none of the bytes listed in self::$RtlCharacterPrefixes
//	(or when the first such byte is a nul character).
//	Otherwise carriage returns are removed and each line is processed separately : every run of consecutive
//	RTL characters is reversed, then every sequence of consecutive RTL words and separators (spaces and
//	punctuation) is put in reverse order. Other text keeps its position, including the text that follows
//	the last RTL character of a line.
//	The processed lines are joined back with "\n".
private function __rtl_process ( $text )
   {
	$length = strlen ( $text ) ;
	$pos = strcspn ( $text, self::$RtlCharacterPrefixes ) ;

	// The text does not contain any of the UTF-8 prefixes that may introduce RTL contents :
	// simply return it as is
	if ( $pos == $length || $text [$pos] === "\x00" )
		return ( $text ) ;

	// Extract each individual line, and get rid of carriage returns if any
	$lines = explode ( "\n", str_replace ( "\r", '', $text ) ) ;
	$new_lines = array ( ) ;

	// Loop through lines
	foreach ( $lines as $line )
	   {
		// Check if the current line contains potential RTL characters
		$pos = strcspn ( $line, self::$RtlCharacterPrefixes ) ;
		$length = strlen ( $line ) ;

		// If not, simply store it as is
		if ( $pos == $length )
		   {
			$new_lines [] = $line ;
			continue ;
		   }

		// Otherwise, it gets a little bit more complicated ; we have :
		// - To process each series of RTL characters and put them in reverse order
		// - Mark spaces and punctuation as "RTL separators", without reversing them (ie, a string like " ." remains " .", not ". ")
		// - Other sequences of non-RTL characters must be preserved as is and are not subject to reordering
		// The $words array is used to store arrays of two elements :
		// - The first one is a boolean indicating whether it concerns RTL characters (true) or not (false)
		// - The second one is the string itself
		$words = array ( ) ;

		// Start of the string is not an RTL sequence ; we can add it to our $words array
		if ( $pos )
		   {
			$word = substr ( $line, 0, $pos ) ;
			$words [] = array ( $this -> __is_rtl_separator ( $word ), $word ) ;
		   }

		$in_rtl = true ;

		// Loop through remaining characters of the current line
		while ( $pos < $length )
		   {
			// Character at the current position may be RTL character
			if ( $in_rtl )
			   {
				$rtl_text = '' ;
				$rtl_char = '' ;
				$rtl_char_length = 0 ;
				$found_rtl = false ;

				// Collect all the consecutive RTL characters, which represent a word, and put the letters in reverse order
				while ( $pos < $length && $this -> __is_rtl_character ( $line, $pos, $rtl_char, $rtl_char_length ) )
				   {
					$rtl_text = $rtl_char . $rtl_text ;
					$pos += $rtl_char_length ;
					$found_rtl = true ;
				   }

				// ... but make sure that we found a valid RTL sequence ; if not, the byte at the current
				// position is stored alone as non-RTL text, which guarantees that the scan moves forward
				if ( $found_rtl )
					$words [] = array ( true, $rtl_text ) ;
				else
					$words [] = array ( false, $line [ $pos ++ ] ) ;

				// For now, we are no more in a series of RTL characters
				$in_rtl = false ;
			   }
			// Non-RTL characters : collect them until either the end of the current line or the next RTL character
			else
			   {
				$next_pos = $pos + strcspn ( $line, self::$RtlCharacterPrefixes, $pos ) ;

				if ( $next_pos >= $length )
				   {
					// Trailing text : take everything that remains. Moving $pos to the end of the line
					// terminates the loop once the word has been stored below ; leaving the loop
					// right here would lose that text
					$word = substr ( $line, $pos ) ;
					$pos = $length ;
				   }
				else
				   {
					$word = substr ( $line, $pos, $next_pos - $pos ) ;
					$pos = $next_pos ;
					$in_rtl = true ;
				   }

				// Don't forget to make the distinction between a sequence of spaces and punctuations, and a real
				// piece of text. Space/punctuation strings surrounded by RTL words will be inverted with them
				$words [] = array ( $this -> __is_rtl_separator ( $word ), $word ) ;
			   }
		   }

		// Now we have an array, $words, whose first entry of each element indicates whether the second entry is an RTL string
		// or not (this includes strings that contain only spaces and punctuation).
		// We have to gather all the consecutive array items whose first entry is true, then invert their order.
		// Non-RTL strings are not affected by this process.
		$stacked_rtl_words = array ( ) ;
		$new_words = array ( ) ;

		foreach ( $words as $word )
		   {
			// RTL word : put it onto the stack
			if ( $word [0] )
				$stacked_rtl_words [] = $word [1] ;
			// Non-RTL word : add it as is to the output array, $new_words
			else
			   {
				// But if RTL words were stacked before, invert them and add them to the output array
				if ( count ( $stacked_rtl_words ) )
				   {
					$new_words = array_merge ( $new_words, array_reverse ( $stacked_rtl_words ) ) ;
					$stacked_rtl_words = array ( ) ;
				   }

				$new_words [] = $word [1] ;
			   }
		   }

		// Process any remaining RTL words that may have been stacked and not yet processed
		if ( count ( $stacked_rtl_words ) )
			$new_words = array_merge ( $new_words, array_reverse ( $stacked_rtl_words ) ) ;

		// That's ok, we have processed one more line
		$new_lines [] = implode ( '', $new_words ) ;
	   }

	// All done, return a catenation of all the lines processed so far
	return ( implode ( "\n", $new_lines ) ) ;
   }
6132
6133
// __is_rtl_character -
//	Checks if the sequence starting at $pos in string $text is a character belonging to an RTL language.
//	The byte at $pos must be a key of self::$RtlCharacterPrefixLengths, which gives the number of bytes
//	expected after it ; these bytes must then fall within one of the ranges listed for that prefix in
//	self::$RtlCharacters (bounds included, compared with strcmp).
//	If yes, returns true and sets $rtl_char to the whole byte sequence for that character, and
//	$rtl_char_length to the length of this sequence.
//	If no, returns false and leaves both output parameters untouched.
private function __is_rtl_character ( $text, $pos, &$rtl_char, &$rtl_char_length )
   {
	$prefix = $text [ $pos ] ;

	// Not the start of a potential RTL sequence
	if ( ! isset ( self::$RtlCharacterPrefixLengths [ $prefix ] ) )
		return ( false ) ;

	// Get the bytes that follow the prefix
	$trailing_length = self::$RtlCharacterPrefixLengths [ $prefix ] ;
	$trailing_bytes = substr ( $text, $pos + 1, $trailing_length ) ;

	// Search through the ranges defined for this prefix
	foreach ( self::$RtlCharacters [ $prefix ] as $range )
	   {
		$below_range = ( strcmp ( $range [0], $trailing_bytes ) > 0 ) ;
		$above_range = ( strcmp ( $range [1], $trailing_bytes ) < 0 ) ;

		if ( $below_range || $above_range )
			continue ;

		$rtl_char = $prefix . $trailing_bytes ;
		$rtl_char_length = $trailing_length + 1 ;

		return ( true ) ;
	   }

	return ( false ) ;
   }
6170
6171
// __is_rtl_separator -
//	RTL words are separated by spaces and punctuation signs that are specified as LTR characters.
//	However, such sequences, which are separators between words, must be considered as being part
//	of an RTL sequence of words and therefore be reversed with them.
//	This function returns true when the supplied string only contains characters from the separator list
//	(space, tab and , . ; : / ! - _ = +), which is also the case of an empty string, and false otherwise.
//	Strings already recognized as separators are remembered in a static array.
private function __is_rtl_separator ( $text )
   {
	static $known_separators = array ( ) ;
	static $separators = " \t,.;:/!-_=+" ;

	if ( ! isset ( $known_separators [ $text ] ) )
	   {
		// The string is a separator if its leading run of separator characters covers it entirely
		if ( strspn ( $text, $separators ) != strlen ( $text ) )
			return ( false ) ;

		$known_separators [ $text ] = true ;
	   }

	return ( true ) ;
   }
6196
6197
// __strip_useless_instructions :
//	Replaces with a space every part of a text stream that matches one of the regular expressions held by
//	the IgnoredInstructions property, and returns the result.
//	The size of the stream before and after this operation is added to the 'TextSize' and
//	'OptimizedTextSize' entries of the Statistics property, respectively.
private function __strip_useless_instructions ( $data )
   {
	$this -> Statistics [ 'TextSize' ] += strlen ( $data ) ;

	$stripped_data = preg_replace ( $this -> IgnoredInstructions, ' ', $data ) ;

	$this -> Statistics [ 'OptimizedTextSize' ] += strlen ( $stripped_data ) ;

	return ( $stripped_data ) ;
   }
6210
6211
/*--------------------------------------------------------------------------------------------------------------

    NAME
	IsPageSelected - Checks if a page is selected for output.

    PROTOTYPE
	$status = $this -> IsPageSelected ( $page ) ;

    DESCRIPTION
	Checks if the specified page is to be selected for output, according to the MaxSelectedPages property :
	- When it is zero (or empty), every page is selected
	- When it is positive, pages 1 to MaxSelectedPages are selected
	- When it is negative, only the last -MaxSelectedPages pages are selected, the total number of pages
	  being the number of entries in PageMap -> Pages

    PARAMETERS
	$page (integer) -
		Page to be checked.

    RETURN VALUE
	True if the page is to be selected for output, false otherwise.

 *-------------------------------------------------------------------------------------------------------------*/
protected function IsPageSelected ( $page )
   {
	$limit = $this -> MaxSelectedPages ;

	// No restriction at all
	if ( ! $limit )
		return ( true ) ;

	// Only the first pages are wanted
	if ( $limit > 0 )
		return ( $page <= $limit ) ;

	// Negative limit : only the last pages are wanted
	$page_count = count ( $this -> PageMap -> Pages ) ;

	return ( $page > $page_count + $limit ) ;
   }
6242
6243
6244 /*--------------------------------------------------------------------------------------------------------------
6245
6246 NAME
6247 PeekAuthorInformation - Gets author information from the specified object data.
6248
6249 PROTOTYPE
6250 $this -> PeekAuthorInformation ( $object_id, $object_data ) ;
6251
6252 DESCRIPTION
	    Checks whether the specified object data contains author information, which is detected through the
	    presence of the /Author or /CreationDate keywords. When found, sets the GotAuthorInformation property and returns the object id ; returns false otherwise.
6255
6256 PARAMETERS
6257 $object_id (integer) -
6258 Object id of this text block.
6259
6260 $object_data (string) -
6261 Stream contents.
6262
6263 *-------------------------------------------------------------------------------------------------------------*/
protected function PeekAuthorInformation ( $object_id, $object_data )
{
    // The object is considered to hold author information as soon as it contains one of
    // the /Author or /CreationDate keywords
    $has_author = ( strpos ( $object_data, '/Author' ) !== false ) ;

    if ( ! $has_author && strpos ( $object_data, '/CreationDate' ) === false )
        return ( false ) ;

    // Remember that author information has been seen, and tell the caller which object holds it
    $this -> GotAuthorInformation = true ;

    return ( $object_id ) ;
}
6274
6275
6276 /*--------------------------------------------------------------------------------------------------------------
6277
6278 NAME
6279 RetrieveAuthorInformation - Extracts author information
6280
6281 PROTOTYPE
	    $this -> RetrieveAuthorInformation ( $object_id, $pdf_objects ) ;
6283
6284 DESCRIPTION
6285 Extracts the author information. Handles the case where flag values refer to existing objects.
6286
6287 PARAMETERS
6288 $object_id (integer) -
6289 Id of the object containing the author information.
6290
6291 $pdf_objects (array) -
6292 Array whose keys are the PDF object ids, and values their corresponding contents.
6293
6294 *-------------------------------------------------------------------------------------------------------------*/
protected function RetrieveAuthorInformation ( $object_id, $pdf_objects )
{
    // Matches an information keyword followed by the opening delimiter of its value :
    // "(" for a literal string, "<" for a string written as hex digits
    static $re = '#
                (?P<info>
                    /
                    (?P<keyword> (Author) | (Creator) | (Producer) | (Title) | (CreationDate) | (ModDate) | (Keywords) | (Subject) )
                    \s*
                    (?P<opening> [(<])
                 )
                #imsx' ;
    // Matches an information keyword whose value is an indirect reference ("x y R") to another object
    static $object_re = '#
                (?P<info>
                    /
                    (?P<keyword> (Author) | (Creator) | (Producer) | (Title) | (CreationDate) | (ModDate) | (Keywords) | (Subject) )
                    \s*
                    (?P<object_ref>
                        (?P<object> \d+)
                        \s+
                        \d+
                        \s+
                        R
                     )
                 )
                #imsx' ;

    // Some buggy PDF files may designate an information object that does not exist : nothing to extract then
    if ( ! isset ( $pdf_objects [ $object_id ] ) )
        return ;

    // Retrieve the object data corresponding to the specified object id
    $object_data = $pdf_objects [ $object_id ] ;

    // Pre-process keywords whose values refer to existing objects : the reference is replaced with the
    // (trimmed) contents of the referenced object, or with an empty string if that object does not exist.
    // The substitution is performed match by match ; a global search/replace of the bare "x y R" text
    // would also rewrite the tail of longer references (replacing "5 0 R" corrupts "15 0 R").
    $resolved_data = preg_replace_callback
       (
        $object_re,
        function ( $match ) use ( $pdf_objects )
           {
            $referenced_id = $match [ 'object' ] ;
            $contents = isset ( $pdf_objects [ $referenced_id ] ) ?
                    trim ( $pdf_objects [ $referenced_id ] ) : '' ;

            return ( '/' . $match [ 'keyword' ] . ' ' . $contents ) ;
            },
        $object_data
        ) ;

    // preg_replace_callback() returns null on PCRE failure ; keep the original data in that case
    if ( $resolved_data !== null )
        $object_data = $resolved_data ;

    // Locate every "/Keyword (" or "/Keyword <" construct, together with its byte offset
    if ( ! preg_match_all ( $re, $object_data, $matches, PREG_OFFSET_CAPTURE ) )
        return ;

    $object_length = strlen ( $object_data ) ;

    for ( $i = 0, $count = count ( $matches [ 'keyword' ] ) ; $i < $count ; $i ++ )
    {
        $keyword = $matches [ 'keyword' ] [$i] [0] ;
        $opening = $matches [ 'opening' ] [$i] [0] ;
        // The value starts right after the opening delimiter
        $start_index = $matches [ 'info' ] [$i] [1] + strlen ( $matches [ 'info' ] [$i] [0] ) ;

        // Text between parentheses : the text is written as is
        if ( $opening == '(' )
        {
            // Since the value can contain any character, including "\" or "(", the real closing
            // parenthesis has to be found by tracking the nesting level
            $parent_level = 1 ;
            $value = '' ;

            for ( $j = $start_index ; $j < $object_length ; $j ++ )
            {
                $ch = $object_data [$j] ;

                if ( $ch == '\\' )
                {
                    // Escape sequences are kept verbatim (backslash + next character) ; a lone
                    // backslash at the very end of the data is kept without reading past the end
                    $value .= '\\' ;

                    if ( ++ $j < $object_length )
                        $value .= $object_data [$j] ;
                }
                else if ( $ch == '(' )
                {
                    $value .= '(' ;
                    $parent_level ++ ;
                }
                else if ( $ch == ')' )
                {
                    $parent_level -- ;

                    if ( ! $parent_level )
                        break ;

                    $value .= ')' ;
                }
                else
                    $value .= $ch ;
            }
        }
        // Text within angle brackets, written as hex digits
        else
        {
            $end_index = strpos ( $object_data, '>', $start_index ) ;

            // Unterminated hex string : take everything up to the end of the object data
            if ( $end_index === false )
                $end_index = $object_length ;

            // White space (including plain spaces) is allowed between hex digits and must be ignored
            $hexdigits = substr ( $object_data, $start_index, $end_index - $start_index ) ;
            $hexdigits = str_replace ( array ( "\0", "\t", "\n", "\f", "\r", ' ' ), '', $hexdigits ) ;

            // The PDF format states that a missing final digit is assumed to be 0
            if ( strlen ( $hexdigits ) % 2 )
                $hexdigits .= '0' ;

            // hex2bin() fails with a warning on non-hex input (for example when "<" was in fact the
            // start of a "<<" dictionary) ; use an empty value in that case
            if ( strspn ( $hexdigits, '0123456789abcdefABCDEF' ) === strlen ( $hexdigits ) )
                $value = hex2bin ( $hexdigits ) ;
            else
                $value = '' ;
        }

        $value = $this -> __convert_utf16 ( $this -> __extract_chars_from_block ( $value ) ) ;

        // Store the value into the property corresponding to the keyword ; dates go through GetUTCDate()
        switch ( strtolower ( $keyword ) )
        {
            case 'author' : $this -> Author = $value ; break ;
            case 'creator' : $this -> CreatorApplication = $value ; break ;
            case 'producer' : $this -> ProducerApplication = $value ; break ;
            case 'title' : $this -> Title = $value ; break ;
            case 'keywords' : $this -> Keywords = $value ; break ;
            case 'subject' : $this -> Subject = $value ; break ;
            case 'creationdate' : $this -> CreationDate = $this -> GetUTCDate ( $value ) ; break ;
            case 'moddate' : $this -> ModificationDate = $this -> GetUTCDate ( $value ) ; break ;
        }
    }

    if ( self::$DEBUG )
    {
        echo "\n----------------------------------- AUTHOR INFORMATION\n" ;
        echo ( "Author : " . $this -> Author . "\n" ) ;
        echo ( "Creator application : " . $this -> CreatorApplication . "\n" ) ;
        echo ( "Producer application : " . $this -> ProducerApplication . "\n" ) ;
        echo ( "Title : " . $this -> Title . "\n" ) ;
        echo ( "Subject : " . $this -> Subject . "\n" ) ;
        echo ( "Keywords : " . $this -> Keywords . "\n" ) ;
        echo ( "Creation date : " . $this -> CreationDate . "\n" ) ;
        echo ( "Modification date : " . $this -> ModificationDate . "\n" ) ;
    }
}
6419
6420
6421 /*--------------------------------------------------------------------------------------------------------------
6422
6423 NAME
6424 RetrieveFormData - Retrieves raw form data
6425
6426 PROTOTYPE
	    $this -> RetrieveFormData ( $object_id, $object_data, $pdf_objects ) ;
6428
6429 DESCRIPTION
6430 Retrieves raw form data (form definition and field values definition).
6431
6432 PARAMETERS
6433 $object_id (integer) -
	    	Id of the object containing the form data.
6435
6436 $object_data (string) -
6437 Object contents.
6438
6439 $pdf_objects (array) -
6440 Array whose keys are the PDF object ids, and values their corresponding contents.
6441
6442 NOTES
6443 This function only memorizes the contents of form data definitions. The actual data will be processed
6444 only if the GetFormData() function is called.
6445
6446 *-------------------------------------------------------------------------------------------------------------*/
protected function RetrieveFormData ( $object_id, $object_data, $pdf_objects )
{
    // Retrieve the object that contains the field values, referenced after the "(datasets)" entry.
    // The reference may be absent from the object data : false is used then, instead of reading
    // an undefined match entry.
    if ( preg_match ( '#\b R \s* \( \s* datasets \s* \) \s* (?P<object> \d+) \s+ \d+ \s+ R#imsx', $object_data, $field_match ) )
        $field_object = $field_match [ 'object' ] ;
    else
        $field_object = false ;

    if ( $field_object === false || ! isset ( $pdf_objects [ $field_object ] ) )
    {
        if ( self::$DEBUG )
            warning ( "Field definitions object #$field_object not found in object #$object_id." ) ;

        return ;
    }

    // Retrieve the object that contains the form definition, referenced after the "(form)" entry
    if ( preg_match ( '#\b R \s* \( \s* form \s* \) \s* (?P<object> \d+) \s+ \d+ \s+ R#imsx', $object_data, $form_match ) )
        $form_object = $form_match [ 'object' ] ;
    else
        $form_object = false ;

    if ( $form_object === false || ! isset ( $pdf_objects [ $form_object ] ) )
    {
        if ( self::$DEBUG )
            warning ( "Form definitions object #$form_object not found in object #$object_id." ) ;

        return ;
    }

    // Add this entry to form data information ; only the object ids are memorized here
    $this -> FormData [ $object_id ] = array
       (
        'values' => ( integer ) $field_object,
        'form' => ( integer ) $form_object
        ) ;
}
6479
6480
6481 }
6482
6483
6484/**************************************************************************************************************
6485 **************************************************************************************************************
6486 **************************************************************************************************************
6487 ****** ******
6488 ****** ******
6489 ****** FONT TABLE MANAGEMENT ******
6490 ****** ******
6491 ****** ******
6492 **************************************************************************************************************
6493 **************************************************************************************************************
6494 **************************************************************************************************************/
6495
6496/*==============================================================================================================
6497
6498 PdfTexterFontTable class -
6499 The PdfTexterFontTable class is not supposed to be used outside the context of the PdfToText class.
6500 Its purposes are to hold a list of font definitions taken from a pdf document, along with their
6501 associated character mapping tables, if any.
	This is why no provision has been made to design this class as a general purpose class ; its utility
	exists only in the scope of the PdfToText class.
6504
6505 ==============================================================================================================*/
class PdfTexterFontTable extends PdfObjectBase
{
    // Font table : PdfTexterFont objects, indexed by font id (object id, or resource number for /Rx constructs)
    public $Fonts = array ( ) ;
    // Key of the default font (first entry of the font table when a font is added through the regular
    // path of the Add() method) ; false as long as no such font has been added
    private $DefaultFont = false ;
    // Font mapping between a font number and an object number
    private $FontMap = array ( ) ;
    // A character map buffer is used to store results from previous calls to the MapCharacter() method of the
    // FontTable object. It dramatically reduces the number of calls needed, from one call for each character
    // defined in the pdf stream, to one call on each DISTINCT character defined in the PDF stream.
    // As an example, imagine a PDF file that contains 200K characters, but only 150 distinct ones. The
    // MapCharacter method will be called 150 times, instead of 200 000...
    private $CharacterMapBuffer = array ( ) ;


    // Constructor -
    //  Only calls the parent constructor.
    public function __construct ( )
    {
        parent::__construct ( ) ;
    }


    // Add -
    //  Adds the current font declaration to the font table. Handles special cases where font id is not
    //  given by the object id, but rather by <</Rx...>> constructs.
    //  $object_id is the id of the font object, $font_definition its contents, $pdf_objects the array of
    //  object contents indexed by object id, and $extra_mappings is handed over to the PdfTexterFont constructor.
    public function Add ( $object_id, $font_definition, $pdf_objects, $extra_mappings )
    {
        if ( PdfToText::$DEBUG )
        {
            echo "\n----------------------------------- FONT #$object_id\n" ;
            echo $font_definition ;
        }

        $font_type = PdfTexterFont::FONT_ENCODING_STANDARD ;
        $cmap_id = 0 ;
        $secondary_cmap_id = 0 ;
        $font_variant = false ;

        // Font resource id specification
        if ( preg_match ( '#<< \s* (?P<rscdefs> /R\d+ .*) >>#ix', $font_definition, $match ) )
        {
            $resource_definitions = $match [ 'rscdefs' ] ;

            preg_match_all ( '#/R (?P<font_id> \d+) #ix', $resource_definitions, $id_matches ) ;
            preg_match_all ( '#/ToUnicode \s* (?P<cmap_id> \d+)#ix', $resource_definitions, $cmap_matches ) ;

            foreach ( $id_matches [ 'font_id' ] as $i => $font_id )
            {
                // There may be fewer /ToUnicode entries than /Rx entries : use 0 ("no character map",
                // as in the regular path below) instead of reading an undefined match entry
                $cmap_id = isset ( $cmap_matches [ 'cmap_id' ] [$i] ) ? $cmap_matches [ 'cmap_id' ] [$i] : 0 ;

                // The 4th and 5th constructor parameters are the secondary character map id and the pdf
                // objects ; $extra_mappings is the 6th one and must not be passed as the secondary map id.
                // No object array is supplied here, since the font id is a resource number, not an object id.
                $this -> Fonts [ $font_id ] = new PdfTexterFont ( $font_id, $cmap_id, PdfTexterFont::FONT_ENCODING_UNICODE_MAP,
                                    0, null, $extra_mappings ) ;
            }

            // Note : this path does not set the default font
            return ;
        }
        // Experimental implementation of CID fonts
        else if ( preg_match ( '#/(Base)?Encoding \s* /Identity-H#ix', $font_definition ) )
        {
            if ( preg_match ( '#/BaseFont \s* /(?P<font> [^\s/]+)#ix', $font_definition, $match ) )
                $font_variant = $match [ 'font' ] ;

            $font_type = PdfTexterFont::FONT_ENCODING_CID_IDENTITY_H ;
        }
        // Font has an associated Unicode map (using the /ToUnicode keyword)
        else if ( preg_match ( '#/ToUnicode \s* (?P<cmap> \d+)#ix', $font_definition, $match ) )
        {
            $cmap_id = $match [ 'cmap' ] ;
            $font_type = PdfTexterFont::FONT_ENCODING_UNICODE_MAP ;

            // An /Encoding object may complement the Unicode map
            if ( preg_match ( '#/Encoding \s* (?P<cmap> \d+)#ix', $font_definition, $secondary_match ) )
                $secondary_cmap_id = $secondary_match [ 'cmap' ] ;
        }
        // Font has an associated character map (using a cmap id)
        else if ( preg_match ( '#/Encoding \s* (?P<cmap> \d+) \s+ \d+ #ix', $font_definition, $match ) )
        {
            $cmap_id = $match [ 'cmap' ] ;
            $font_type = PdfTexterFont::FONT_ENCODING_PDF_MAP ;
        }
        // Font uses the Windows Ansi encoding
        else if ( preg_match ( '#/(Base)?Encoding \s* /WinAnsiEncoding#ix', $font_definition ) )
        {
            $font_type = PdfTexterFont::FONT_ENCODING_WINANSI ;

            // Base font names ending with "Cyr" select the ISO8859-5 (cyrillic) variant
            if ( preg_match ( '# /BaseFont \s* / [a-z0-9_]+ \+ [a-z0-9_]+? Cyr #imsx', $font_definition ) )
                $font_type |= PdfTexterFont::FONT_VARIANT_ISO8859_5 ;
        }
        // Font uses the Mac Roman encoding
        else if ( preg_match ( '#/(Base)?Encoding \s* /MacRomanEncoding#ix', $font_definition ) )
            $font_type = PdfTexterFont::FONT_ENCODING_MAC_ROMAN ;

        $this -> Fonts [ $object_id ] = new PdfTexterFont ( $object_id, $cmap_id, $font_type, $secondary_cmap_id, $pdf_objects, $extra_mappings, $font_variant ) ;

        // Arbitrarily set the default font to the first font encountered in the pdf file
        if ( $this -> DefaultFont === false )
        {
            reset ( $this -> Fonts ) ;
            $this -> DefaultFont = key ( $this -> Fonts ) ;
        }
    }


    // AddFontMap -
    //  Process things like :
    //      <</F1 26 0 R/F2 22 0 R/F3 18 0 R>>
    //  which maps font 1 (when specified with the /Fx instruction) to object 26,
    //  2 to object 22 and 3 to object 18, respectively, in the above example.
    //  Found also a strange way of specifying a font mapping :
    //      <</f-0-0 5 0 R etc.
    //  And yet another one :
    //      <</C0_0 5 0 R
    public function AddFontMap ( $object_id, $object_data )
    {
        $object_data = self::UnescapeHexCharacters ( $object_data ) ;

        // The same object can hold different notations for font associations
        if ( preg_match_all ( '# (?P<font> ' . self::$FontSpecifiers . ' ) \s+ (?P<object> \d+) #imsx', $object_data, $matches ) )
        {
            foreach ( $matches [ 'font' ] as $i => $font )
                $this -> FontMap [ $font ] = $matches [ 'object' ] [$i] ;
        }
    }


    // AddPageFontMap -
    //  Adds font aliases to the current font map, in the form : "page:xobject:font".
    //  The associated value is the 'object' entry of each supplied map entry.
    public function AddPageFontMap ( $map )
    {
        foreach ( $map as $map_entry )
        {
            $alias = $map_entry [ 'page' ] . ':' . $map_entry [ 'xobject-name' ] . ':' . $map_entry [ 'font-name' ] ;

            $this -> FontMap [ $alias ] = $map_entry [ 'object' ] ;
        }
    }


    // AddCharacterMap -
    //  Associates a character map to the font declarations that referenced it, either as their main
    //  or as their secondary character map. Returns true if at least one font referenced it.
    public function AddCharacterMap ( $cmap )
    {
        $status = false ;

        // We loop through all fonts, since the same character map can be referenced by several font definitions
        foreach ( $this -> Fonts as $font )
        {
            if ( $font -> CharacterMapId == $cmap -> ObjectId )
            {
                $font -> CharacterMap = $cmap ;
                $status = true ;
            }
            else if ( $font -> SecondaryCharacterMapId == $cmap -> ObjectId )
            {
                $cmap -> Secondary = true ;
                $font -> SecondaryCharacterMap = $cmap ;
                $status = true ;
            }
        }

        return ( $status ) ;
    }


    // GetFontAttributes -
    //  Sets $font_map_width to the width, in hex digits, of a character code for the specified font and
    //  $font_mapped to whether the font has a character map or not. Returns the same value as $font_mapped.
    //  The $page_number and $template parameters are not used by this method.
    public function GetFontAttributes ( $page_number, $template, $font, &$font_map_width, &$font_mapped )
    {
        // Font considered as global to the document
        if ( isset ( $this -> Fonts [ $font ] ) )
            $key = $font ;
        // Font not found : try to use the first one declared in the document
        else
        {
            reset ( $this -> Fonts ) ;
            $key = key ( $this -> Fonts ) ;
        }

        // Font has an associated character map
        if ( $key && $this -> Fonts [ $key ] -> CharacterMap )
        {
            $font_map_width = $this -> Fonts [ $key ] -> CharacterMap -> HexCharWidth ;
            $font_mapped = true ;

            return ( true ) ;
        }

        // No character map : characters are specified as two hex digits
        $font_map_width = 2 ;
        $font_mapped = false ;

        return ( false ) ;
    }


    // GetFontByMapId -
    //  Returns the font id (object id) associated with the specified mapped id, or -1 if there is none.
    //  The page-specific alias ("page:template:id") takes precedence over the global one.
    public function GetFontByMapId ( $page_number, $template, $id )
    {
        $page_alias = "$page_number:$template:$id" ;

        if ( isset ( $this -> FontMap [ $page_alias ] ) )
            return ( $this -> FontMap [ $page_alias ] ) ;

        if ( isset ( $this -> FontMap [ $id ] ) )
            return ( $this -> FontMap [ $id ] ) ;

        return ( -1 ) ;
    }


    // GetFontObject -
    //  Returns the PdfTexterFont object for the given page, template and font id (in the form of "/something"),
    //  or false if the id is not mapped or if the mapped font is not in the font table.
    public function GetFontObject ( $page_number, $template, $id )
    {
        $page_alias = "$page_number:$template:$id" ;

        if ( isset ( $this -> FontMap [ $page_alias ] ) )
            $font_object = $this -> FontMap [ $page_alias ] ;
        else if ( isset ( $this -> FontMap [ $id ] ) )
            $font_object = $this -> FontMap [ $id ] ;
        else
            return ( false ) ;

        if ( isset ( $this -> Fonts [ $font_object ] ) )
            return ( $this -> Fonts [ $font_object ] ) ;

        return ( false ) ;
    }


    // MapCharacter -
    //  Returns the character associated to the specified one, for the specified font.
    //  $return_false_on_failure is handed over to the MapCharacter() method of the font object.
    public function MapCharacter ( $font, $ch, $return_false_on_failure = false )
    {
        // Use the first declared font as the default font, if none defined. This is resolved BEFORE
        // looking into the buffer, because results are stored under the resolved font id ; probing the
        // buffer with -1 would never hit.
        if ( $font == -1 )
            $font = $this -> DefaultFont ;

        if ( isset ( $this -> CharacterMapBuffer [ $font ] [ $ch ] ) )
            return ( $this -> CharacterMapBuffer [ $font ] [ $ch ] ) ;

        $cache = true ;

        if ( isset ( $this -> Fonts [ $font ] ) )
        {
            $font_object = $this -> Fonts [ $font ] ;

            $code = $font_object -> MapCharacter ( $ch, $return_false_on_failure ) ;

            // A character map can ask not to buffer its results, through its Cache property
            if ( $font_object -> CharacterMap )
                $cache = $font_object -> CharacterMap -> Cache ;
        }
        // Unknown font : consider the character as a code point
        else
        {
            $code = $this -> CodePointToUtf8 ( $ch ) ;
        }

        if ( $cache )
            $this -> CharacterMapBuffer [ $font ] [ $ch ] = $code ;

        return ( $code ) ;
    }
}
6775
6776
6777/**************************************************************************************************************
6778 **************************************************************************************************************
6779 **************************************************************************************************************
6780 ****** ******
6781 ****** ******
6782 ****** FONT MANAGEMENT ******
6783 ****** ******
6784 ****** ******
6785 **************************************************************************************************************
6786 **************************************************************************************************************
6787 **************************************************************************************************************/
6788
6789/*==============================================================================================================
6790
6791 PdfTexterFont class -
6792 The PdfTexterFont class is not supposed to be used outside the context of the PdfToText class.
	It holds an optional character mapping table associated with this font.
	No provision has been made to design this class as a general purpose class ; its utility exists only in
6795 the scope of the PdfToText class.
6796
6797 ==============================================================================================================*/
6798class PdfTexterFont extends PdfObjectBase
6799 {
6800 // Font encoding types, for fonts that are neither associated with a Unicode character map nor a PDF character map
6801 const FONT_ENCODING_STANDARD = 0 ; // No character map, use the standard character set
6802 const FONT_ENCODING_WINANSI = 1 ; // No character map, use the Windows Ansi character set
6803 const FONT_ENCODING_MAC_ROMAN = 2 ; // No character map, use the MAC OS Roman character set
6804 const FONT_ENCODING_UNICODE_MAP = 3 ; // Font has an associated unicode character map
6805 const FONT_ENCODING_PDF_MAP = 4 ; // Font has an associated PDF character map
6806 const FONT_ENCODING_CID_IDENTITY_H = 5 ; // CID font : IDENTITY-H
6807
6808 // Font variants
6809 const FONT_VARIANT_STANDARD = 0x0000 ;
6810 const FONT_VARIANT_ISO8859_5 = 0x1000 ; // Cyrillic
6811
6812 const FONT_VARIANT_MASK = 0xF000 ;
6813 const FONT_VARIANT_SHIFT = 12 ;
6814
6815 // Font resource id (may be an object id, overridden by <</Rx...>> constructs
6816 public $Id ;
6817 // Font type and variant
6818 public $FontType ;
6819 public $FontVariant ;
6820 // Character map id, specified by the /ToUnicode flag
6821 public $CharacterMapId ;
6822 // Secondary character map id, specified by the /Encoding flag and that can contain a /Differences flag
6823 public $SecondaryCharacterMapId ;
6824 // Optional character map, that may be set by the PdfToText::Load method just before processing text drawing blocks
6825 public $CharacterMap = null ;
6826 public $SecondaryCharacterMap = null ;
6827 // Character widths
6828 public $CharacterWidths = array ( ) ;
6829 // Default character width, if not present in the $CharacterWidths array
6830 public $DefaultWidth = 0 ;
6831 private $GotWidthInformation = false ;
6832 // A buffer for remembering character widths
6833 protected $CharacterWidthsBuffer = array ( ) ;
6834
6835
6836 // Constructor -
6837 // Builds a PdfTexterFont object, using its resource id and optional character map id.
public function __construct ( $resource_id, $cmap_id, $font_type, $secondary_cmap_id = null, $pdf_objects = null, $extra_mappings = null, $font_variant = false )
{
    // Builds a PdfTexterFont object :
    // - $resource_id        : font resource id (object id, or resource number)
    // - $cmap_id            : id of the main character map
    // - $font_type          : one of the FONT_ENCODING_* constants, optionally or'ed with a FONT_VARIANT_* constant
    // - $secondary_cmap_id  : id of the secondary character map
    // - $pdf_objects        : array of object contents, indexed by object id (may be null)
    // - $extra_mappings     : handed over to the PdfTexterEncodingMap constructor
    // - $font_variant       : handed over to the PdfTexterIdentityHCIDMap constructor
    parent::__construct ( ) ;

    $this -> Id = $resource_id ;
    $this -> CharacterMapId = $cmap_id ;
    $this -> SecondaryCharacterMapId = $secondary_cmap_id ;
    // $font_type holds the encoding type in its low bits and the variant in bits 12 to 15
    $this -> FontType = $font_type & ~self::FONT_VARIANT_MASK ;
    $this -> FontVariant = ( $font_type >> self::FONT_VARIANT_SHIFT ) & 0x0F ;

    // Instantiate the appropriate character map for this font
    switch ( $this -> FontType )
    {
        case self::FONT_ENCODING_WINANSI :
            $this -> CharacterMap = new PdfTexterAdobeWinAnsiMap ( $resource_id, $this -> FontVariant ) ;
            break ;

        case self::FONT_ENCODING_MAC_ROMAN :
            $this -> CharacterMap = new PdfTexterAdobeMacRomanMap ( $resource_id, $this -> FontVariant ) ;
            break ;

        case self::FONT_ENCODING_CID_IDENTITY_H :
            $this -> CharacterMap = new PdfTexterIdentityHCIDMap ( $resource_id, $font_variant ) ;
            break ;

        case self::FONT_ENCODING_PDF_MAP :
            // The character map object may be missing from the object array : null is passed then,
            // without reading an undefined entry
            $cmap_data = isset ( $pdf_objects [ $cmap_id ] ) ? $pdf_objects [ $cmap_id ] : null ;
            $this -> CharacterMap = new PdfTexterEncodingMap ( $cmap_id, $cmap_data, $extra_mappings ) ;
            break ;

        // For these two types, no character map is instantiated here
        case self::FONT_ENCODING_UNICODE_MAP :
        case self::FONT_ENCODING_STANDARD :
            break ;

        default :
            if ( PdfToText::$DEBUG )
                warning ( "Unknown font type #$font_type found for object #$resource_id, character map #$cmap_id." ) ;
    }

    // Get font data ; include font descriptor information if present.
    // $pdf_objects can be null or may not contain the font object : use empty data in that case.
    $font_data = isset ( $pdf_objects [ $resource_id ] ) ? $pdf_objects [ $resource_id ] : '' ;

    if ( preg_match ( '/FontDescriptor \s+ (?P<id> \d+) \s+ \d+ \s+ R/imsx', $font_data, $match ) )
    {
        $descriptor_id = $match [ 'id' ] ;

        // Don't care about searching this in that object, or that in this object - simply catenate the font descriptor
        // with the font definition
        if ( isset ( $pdf_objects [ $descriptor_id ] ) )
            $font_data .= $pdf_objects [ $descriptor_id ] ;
    }

    // Type1 fonts belong to the Adobe 14 standard fonts available. Information about the character widths is never embedded in the PDF
    // file, but must be taken from external data (in the FontMetrics directory).
    // The font name is only used when the /BaseFont entry has really been found ; otherwise $match would still
    // hold the results of a previous match.
    if ( preg_match ( '#/SubType \s* /Type1#ix', $font_data ) &&
         preg_match ( '#/BaseFont \s* / ([\w]+ \+)? (?P<font> [^\s\[</]+)#ix', $font_data, $match ) )
    {
        $lc_font_name = strtolower ( $match [ 'font' ] ) ;

        // Do that only if a font metrics file exists...
        if ( isset ( PdfToText::$AdobeStandardFontMetrics [ $lc_font_name ] ) )
        {
            $metrics_file = PdfToText::$FontMetricsDirectory . '/' . PdfToText::$AdobeStandardFontMetrics [ $lc_font_name ] ;

            if ( file_exists ( $metrics_file ) )
            {
                // The metrics file is expected to define a $charwidths array in the current scope
                include ( $metrics_file ) ;

                if ( isset ( $charwidths ) )
                {
                    // Build the CharacterWidths table
                    foreach ( $charwidths as $char => $width )
                        $this -> CharacterWidths [ chr ( $char ) ] = ( float ) $width ;

                    $this -> GotWidthInformation = true ;
                }
            }
        }
    }

    // Retrieve the character widths for this font. This means :
    // - Retrieving the /FirstChar, /LastChar and /Widths entries from the font definition. /Widths is an array of individual character
    //   widths, between the /FirstChar and /LastChar entries. A value of zero in this array means "Use the default width"...
    // - ... which is given by the /MissingWidth parameter, normally put in the font descriptor whose object id is given by the
    //   /FontDescriptor entry of the font definition
    // Well, to be considered, given the number of buggy PDFs around the world, we won't care about the /LastChar entry and we won't
    // check whether the /Widths array contains (LastChar - FirstChar + 1) integer values...
    // Get the entries
    $first_char = false ;
    $widths = false ;
    $missing_width = false ;

    if ( preg_match ( '#/FirstChar \s+ (?P<char> \d+)#imsx', $font_data, $match ) )
        $first_char = $match [ 'char' ] ;

    if ( preg_match ( '#/Widths \s* \[ (?P<widths> [^\]]+) \]#imsx', $font_data, $match ) )
        $widths = $match [ 'widths' ] ;

    if ( preg_match ( '#/MissingWidth \s+ (?P<missing> \d+)#imsx', $font_data, $match ) )
        $missing_width = $match [ 'missing' ] ;

    // It would not make sense if one of the two entries /FirstChar and /Widths was missing
    // So ensure they are all there (note that /MissingWidth can be absent)
    if ( $first_char === false || ! $widths )
        return ;

    if ( $missing_width !== false )
        $this -> DefaultWidth = ( float ) $missing_width ;

    // Here comes a really tricky part :
    // - The PDF file can contain CharProcs (example names : /a0, /a1, etc.) for which we have no
    //   Unicode equivalent
    // - The caller may have called the AddAdobeExtraMappings method, to provide a mapping between
    //   those char codes (/a0, /a1, etc.) and a Unicode equivalent
    // - Each "charproc" listed in the /Differences array has a specific code, such as :
    //      [0/a1/a2/a3...]
    //   which maps /a1 to code 0, /a2 to code 1, and so on
    // - However, the GetStringWidth() method provides real Unicode characters
    // Consequently, we have to map each CharProc character (/a1, /a2, etc.) to the Unicode value
    // that may have been specified using the AddAdobeExtraMappings() method.
    // The first step below collects the name list of CharProcs.
    $charprocs = false ;

    if ( isset ( $this -> CharacterMap -> Encodings ) &&
         preg_match ( '# /CharProcs \s* << (?P<list> .*?) >>#imsx', $font_data, $match ) )
    {
        preg_match_all ( '#/ (?P<char> \w+) \s+ \d+ \s+ \d+ \s+ R#msx', $match [ 'list' ], $char_matches ) ;

        // Flipped so that the CharProc names become keys, for fast lookups
        $charprocs = array_flip ( $char_matches [ 'char' ] ) ;
    }

    // The /FontMatrix entry defines the scaling to be used for the character widths (among other things).
    // Its first element is generally a fractional number, such as in [0.001 0 0 0.001 0 0] : the whole
    // number must be captured, decimal part included. Capturing only the leading integer digits gives 0
    // for such a matrix, which turns every character width into 0.
    if ( preg_match ( '#/FontMatrix \s* \[ \s* (?P<multiplier> \d* \.? \d+)#imsx', $font_data, $match ) )
        $multiplier = 1000 * ( float ) $match [ 'multiplier' ] ;
    else
        $multiplier = 1 ;

    $widths = explode ( ' ', trim ( preg_replace ( '/\s+/', ' ', $widths ) ) ) ;

    foreach ( $widths as $i => $width )
    {
        $value = ( float ) trim ( $width ) ;
        // Widths are listed in sequence, starting from the /FirstChar character code
        $chr_index = $first_char + $i ;

        // Tricky thing part 2 :
        // If one of the CharProc characters is listed in the /Differences array then...
        if ( $charprocs && isset ( $this -> CharacterMap -> DifferencesByPosition [ $chr_index ] ) )
        {
            $chname = $this -> CharacterMap -> DifferencesByPosition [ $chr_index ] ;

            // ... if this CharProcs character is defined in the encoding table (possibly because
            // it was complemented through a call to the AddAdobeExtraMappings() method), then we
            // will use its Unicode counterpart instead of the character ID coming from the
            // /Differences array)
            if ( isset ( $charprocs [ $chname ] ) && isset ( $this -> CharacterMap -> Encodings [ $chname ] ) )
                $chr_index = $this -> CharacterMap -> Encodings [ $chname ] [2] ;
        }

        // A zero width means "use the default width"
        $this -> CharacterWidths [ chr ( $chr_index ) ] = ( $value ) ? ( $value * $multiplier ) : $this -> DefaultWidth ;
    }

    $this -> GotWidthInformation = true ;
}
7008
7009
7010 // MapCharacter -
7011 // Returns the substitution string value for the specified character, if the current font has an
7012 // associated character map, or the original character encoded in utf8, if not.
public function MapCharacter ( $ch, $return_false_on_failure = false )
{
    if ( $this -> CharacterMap )
    {
        // Check the main character map first, then the secondary one (if any)
        $in_primary = isset ( $this -> CharacterMap [ $ch ] ) ;
        $in_secondary = ( $this -> SecondaryCharacterMap && isset ( $this -> SecondaryCharacterMap [ $ch ] ) ) ;

        // Since a /ToUnicode map can have an associated /Encoding map with a /Differences list, an entry
        // of the secondary map always wins, whether the character is defined in the main map or not
        if ( $in_secondary )
            return ( $this -> SecondaryCharacterMap [ $ch ] ) ;

        // Most common case : the character is only defined in the main character map
        if ( $in_primary )
            return ( $this -> CharacterMap [ $ch ] ) ;
    }

    // No character map, or character not found in any map : either report the failure, or
    // consider the character as a code point to be converted to utf8
    return ( $return_false_on_failure ? false : $this -> CodePointToUtf8 ( $ch ) ) ;
}
7051
7052
/*--------------------------------------------------------------------------------------------------------------

    NAME
	GetStringWidth - Computes the width of a string from the font character widths.

    PROTOTYPE
	$width	=  $font -> GetStringWidth ( $text, $extra_percent ) ;

    DESCRIPTION
	Adds up the width of each byte of $text, then divides the total by ( 100 - $extra_percent ).
	The width of a character comes from the CharacterWidths array when it has an entry for it, or from
	the DefaultWidth property otherwise. Every width that has been resolved once is remembered in the
	CharacterWidthsBuffer array, so that the same tests are not repeated for the same character.

    PARAMETERS
	$text (string) -
		String whose width is to be measured ; it is scanned byte per byte.

	$extra_percent (double) -
		Percentage subtracted from 100 to obtain the divisor applied to the total width.
		The divisor is never allowed to go below 50.

    RETURN VALUE
	The computed width, or false if the font does not contain any character width information.
	NOTE(review): the previous version of this header announced both "1/100" and "1/1000" of points
	as the unit of the result - to be confirmed against the callers.

 *-------------------------------------------------------------------------------------------------------------*/
public function  GetStringWidth ( $text, $extra_percent )
{
	// Nothing can be computed without width information
	if ( ! $this -> GotWidthInformation )
		return ( false ) ;

	$total		=  0 ;
	$byte_count	=  strlen ( $text ) ;

	for ( $pos = 0 ; $pos < $byte_count ; $pos ++ )
	{
		$ch	=  $text [ $pos ] ;

		// First time this character is seen : resolve its width once and remember it
		if ( ! isset ( $this -> CharacterWidthsBuffer [ $ch ] ) )
		{
			$this -> CharacterWidthsBuffer [ $ch ]	=  ( isset ( $this -> CharacterWidths [ $ch ] ) ) ?
									$this -> CharacterWidths [ $ch ] :
									$this -> DefaultWidth ;
		}

		$total	+=  $this -> CharacterWidthsBuffer [ $ch ] ;
	}

	// Apply the extra percentage ; the divisor is arbitrarily limited to a minimum of 50
	$divisor	=  100 - $extra_percent ;

	if ( $divisor < 50 )
		$divisor	=  50 ;

	return ( $total / $divisor ) ;
}
7122 }
7123
7124
/*==============================================================================================================

    PdfTexterCharacterMap -
	Base class for the character maps used by fonts.
	The PdfTexterCharacterMap class is not supposed to be used outside the context of the PdfToText class ;
	no provision has been made to design it as a general purpose class.
	Character maps are read-only : they can be queried through the array access operator, but any
	attempt to set or unset an entry is reported as an error.

  ==============================================================================================================*/
abstract class  PdfTexterCharacterMap	extends		PdfObjectBase
					implements	ArrayAccess, Countable
{
	// Object id of the character map
	public $ObjectId ;
	// Number of hex digits in a character represented in hexadecimal notation
	public $HexCharWidth ;
	// Set to true if the values returned by the array access operator can safely be cached
	public $Cache = false ;


	// Constructor -
	//	Remembers the id of the object that defines this character map.
	public function  __construct ( $object_id )
	{
		parent::__construct ( ) ;

		$this -> ObjectId	=  $object_id ;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    CreateInstance -
		Creates a PdfTexterCharacterMap instance of the correct type :
		- a PdfTexterUnicodeMap when the definitions contain one of the begincmap, beginbfchar or
		  beginbfrange keywords
		- a PdfTexterEncodingMap when they contain a /Differences entry
		Returns false when the definitions match none of the above.

	 *-------------------------------------------------------------------------------------------------------------*/
	public static function  CreateInstance ( $object_id, $definitions, $extra_mappings )
	{
		$has_cmap_keywords	=  preg_match ( '# (begincmap) | (beginbfchar) | (beginbfrange) #ix', $definitions ) ;

		if ( $has_cmap_keywords )
			return ( new PdfTexterUnicodeMap ( $object_id, $definitions ) ) ;

		if ( stripos ( $definitions, '/Differences' )  ===  false )
			return ( false ) ;

		return ( new PdfTexterEncodingMap ( $object_id, $definitions, $extra_mappings ) ) ;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    Interface implementations.
		Only the write operations are implemented here, and both are rejected ; the read operations
		are left to the derived classes.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  offsetSet ( $offset, $value )
	{
		error ( new PdfToTextDecodingException ( "Unsupported operation." ) ) ;
	}


	public function  offsetUnset ( $offset )
	{
		error ( new PdfToTextDecodingException ( "Unsupported operation." ) ) ;
	}
}
7182
7183
7184
/*==============================================================================================================

    PdfTexterUnicodeMap -
	A class for fonts having a character map specified with the /ToUnicode parameter.
	The map is built from the beginbfchar/endbfchar and beginbfrange/endbfrange constructs found in the
	cmap definitions.

  ==============================================================================================================*/
class  PdfTexterUnicodeMap	extends  PdfTexterCharacterMap
{
	// Id of the character map (specified by the /Rx flag), or -1 if the definitions do not provide one
	public $Id ;

	// Character ranges coming from beginbfrange/endbfrange constructs of the form :
	//	<low> <high> <start>
	// Each entry is an array ( low, high, start ). Constructs of the form :
	//	<x> <y> [ <subst_x> <subst_x+1> ... <subst_y> ]
	// are stored in the $DirectMap array instead, because they are conceptually the same thing as
	// substitutions defined with the beginbfchar/endbfchar construct.
	// Ranges are stored as is rather than expanded because some pdf files declare huge ranges, such as :
	//	beginbfrange
	//	<1000> <1FFF> <1000>
	//	<2000> <2FFF> <2000>
	//	...
	//	endbfrange
	// Expanding them into a one-to-one associative array could exhaust all of the available memory.
	// A dichotomic search is performed instead for each character reference not yet seen ; once found,
	// the substitution is added to the $DirectMap array so that subsequent accesses are direct.
	protected $RangeMap = array ( ) ;
	private $RangeCount = 0 ;				// Avoid unnecessary calls to the count() function
	private $RangeMin = PHP_INT_MAX,			// Min and max values of the character ranges
		$RangeMax = -1 ;
	// Character substitution table : beginbfchar entries, array-form ranges, single-character ranges and
	// range lookups that have already been resolved
	protected $DirectMap = array ( ) ;


	// Constructor -
	//	Analyzes the text contents of a CMAP and extracts mappings from the beginbfchar/endbfchar and
	//	beginbfrange/endbfrange constructs.
	//	$object_id is the id of the object holding the cmap, $definitions its textual contents.
	public function  __construct ( $object_id, $definitions )
	{
		parent::__construct ( $object_id ) ;

		if ( PdfToText::$DEBUG )
		{
			echo "\n----------------------------------- UNICODE CMAP #$object_id\n" ;
			echo $definitions;
		}

		// Retrieve the cmap id, if any
		preg_match ( '# /CMapName \s* /R (?P<num> \d+) #ix', $definitions, $match ) ;
		$this -> Id	=  isset ( $match [ 'num' ] ) ?  $match [ 'num' ] : -1 ;

		// Get the codespace range, which will give us the width of a character specified in hexadecimal notation
		preg_match ( '# begincodespacerange \s+ <\s* (?P<low> [0-9a-f]+) \s*> \s* <\s* (?P<high> [0-9a-f]+) \s*> \s*endcodespacerange #ix', $definitions, $match ) ;

		if ( isset ( $match [ 'low' ] ) )
			$this -> HexCharWidth	=  max ( strlen ( $match [ 'low' ] ), strlen ( $match [ 'high' ] ) ) ;
		else
			$this -> HexCharWidth	=  0 ;

		// Load the mappings ; each step reports the widest character code (in hex digits) it has seen
		$bfchar_width		=  $this -> LoadBfChars ( $definitions ) ;
		$bfrange_width		=  $this -> LoadBfRanges ( $definitions, $object_id ) ;
		$max_found_char_width	=  max ( $bfchar_width, $bfrange_width ) ;

		// The codes really used by the map prevail over the width announced by the codespace range
		if ( $max_found_char_width  &&  $max_found_char_width  !=  $this -> HexCharWidth )
		{
			if ( PdfToText::$DEBUG )
				warning ( "Character map #$object_id : specified code width ({$this -> HexCharWidth}) differs from actual width ($max_found_char_width)." ) ;

			$this -> HexCharWidth	=  $max_found_char_width ;
		}
	}


	// TranslateUndocumented -
	//	Some maps use codepoints that do not exist in Unicode but still correspond to something ; when
	//	the PdfTexterAdobeUndocumentedUnicodeMap table knows $codepoint, its replacement is returned.
	//	Otherwise, $codepoint is returned unchanged.
	private function  TranslateUndocumented ( $codepoint )
	{
		if ( isset ( PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $codepoint ] ) )
			return ( PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $codepoint ] ) ;

		return ( $codepoint ) ;
	}


	// LoadBfChars -
	//	Processes the beginbfchar/endbfchar constructs and stores their mappings in the DirectMap array.
	//	Returns the length of the longest source character code found, in hex digits (0 if none).
	private function  LoadBfChars ( $definitions )
	{
		$max_width	=  0 ;

		if ( ! preg_match_all ( '/ beginbfchar \s* (?P<chars> .*?) endbfchar /imsx', $definitions, $char_matches ) )
			return ( $max_width ) ;

		foreach ( $char_matches [ 'chars' ]  as  $char_list )
		{
			// beginbfchar / endbfchar constructs can behave as a kind of beginbfrange/endbfrange ; example :
			//	<21> <0009 0020 000d>
			// is handled as :
			//	. Map character #21 to #0009
			//	. Map character #22 to #0020
			//	. Map character #23 to #000D
			// The normal constructs would be :
			//	<21> <0009>
			//	<22> <0020>
			//	<23> <000D>
			preg_match_all ( '/< \s* (?P<item> .*?) \s* >/msx', $char_list, $item_matches ) ;

			$items		=  $item_matches [ 'item' ] ;
			$item_count	=  count ( $items ) ;

			// Items go by pairs (source code, destination) ; a trailing source code that has no
			// destination is ignored instead of being mapped to codepoint 0
			for  ( $i = 0 ; $i + 1  <  $item_count ; $i += 2 )
			{
				$char		=  hexdec ( $items [$i] ) ;
				$char_width	=  strlen ( $items [$i] ) ;
				$map		=  explode ( ' ', preg_replace ( '/\s+/', ' ', $items [ $i + 1 ] ) ) ;

				if ( $char_width  >  $max_width )
					$max_width	=  $char_width ;

				for  ( $j = 0, $map_count = count ( $map ) ; $j  <  $map_count ; $j ++ )
					$this -> DirectMap [ $char + $j ]	=  $this -> TranslateUndocumented ( hexdec ( $map [$j] ) ) ;
			}
		}

		return ( $max_width ) ;
	}


	// LoadBfRanges -
	//	Processes the beginbfrange/endbfrange constructs. Ranges of the form "<x> <y> <z>" go to the
	//	RangeMap array (or to DirectMap when x equals y), ranges of the form "<x> <y> [ ... ]" go to
	//	the DirectMap array. The RangeMap array is then sorted by range start so that it can be
	//	searched by dichotomy.
	//	Returns the length of the longest source character code found, in hex digits (0 if none).
	private function  LoadBfRanges ( $definitions, $object_id )
	{
		$max_width	=  0 ;

		if ( ! preg_match_all ( '/ beginbfrange \s* (?P<ranges> .*?) endbfrange /imsx', $definitions, $range_matches ) )
			return ( $max_width ) ;

		foreach ( $range_matches [ 'ranges' ]  as  $range_list )
		{
			$start_index	=  0 ;

			// There are two forms of syntax in a beginbfrange..endbfrange construct
			// 1) "<x> <y> <z>", which maps character ids x through y to z through (z+y-x)
			// 2) "<x> <y> [<a1> <a2> ... <an>]", which maps character x to a1, x+1 to a2, up to y, which is mapped to an
			// All the values are hex digits.
			// We loop through the range definitions by first identifying the <x> and <y>, and the character that
			// follows them, which is either a "<" for notation 1), or a "[" for notation 2).
			while  ( preg_match ( '# < \s* (?P<from> [0-9a-f]+) \s* > \s* < \s* (?P<to> [0-9a-f]+) \s* > \s* (?P<nextchar> .) #imsx',
						$range_list, $range_match, PREG_OFFSET_CAPTURE, $start_index ) )
			{
				$from			=  hexdec ( $range_match [ 'from' ] [0] ) ;
				$to			=  hexdec ( $range_match [ 'to' ] [0] ) ;
				$next_char		=  $range_match [ 'nextchar' ] [0] ;
				$next_char_index	=  $range_match [ 'nextchar' ] [1] ;
				$char_width		=  strlen ( $range_match [ 'from' ] [0] ) ;

				if ( $char_width  >  $max_width )
					$max_width	=  $char_width ;

				// Unless a complete range definition is recognized below, the scan resumes right after the
				// character that follows <x> <y>. This guarantees that the loop always moves forward, even
				// when error() returns after having reported a malformed range.
				$resume_index		=  $next_char_index + 1 ;

				// Form 1) : catch the third hex value after <x> and <y>
				if ( $next_char  ==  '<' )
				{
					if ( preg_match ( '/ \s* (?P<start> [0-9a-f]+) (?P<tail> \s* > \s*) /imsx', $range_list, $start_match, PREG_OFFSET_CAPTURE, $next_char_index + 1 ) )
					{
						$subst	=  $this -> TranslateUndocumented ( hexdec ( $start_match [ 'start' ] [0] ) ) ;

						// Don't create a range if <x> and <y> are the same
						if ( $from  !=  $to )
						{
							$this -> RangeMap []	=  array ( $from, $to, $subst ) ;

							// Adjust min and max values for the ranges stored in this character map - to avoid unnecessary testing
							if ( $from  <  $this -> RangeMin )
								$this -> RangeMin	=  $from ;

							if ( $to  >  $this -> RangeMax )
								$this -> RangeMax	=  $to ;
						}
						else
							$this -> DirectMap [ $from ]	=  $subst ;

						$resume_index	=  $start_match [ 'tail' ] [1] + 1 ;
					}
					else
						error ( "Character range $from..$to not followed by an hexadecimal value in Unicode map #$object_id." ) ;
				}
				// Form 2) : catch all the hex values between square brackets after <x> and <y>
				else if ( $next_char  ==  '[' )
				{
					if ( preg_match ( '/ (?P<values> [\s<>0-9a-f]+ ) (?P<tail> \] \s*)/imsx', $range_list, $array_match, PREG_OFFSET_CAPTURE, $next_char_index + 1 ) )
					{
						preg_match_all ( '/ < \s* (?P<num> [0-9a-f]+) \s* > /imsx', $array_match [ 'values' ] [0], $array_values ) ;

						$values		=  $array_values [ 'num' ] ;
						$value_count	=  count ( $values ) ;

						// Never read past the end of the supplied values : when the array is shorter than the
						// range, the remaining characters are left unmapped instead of being mapped to codepoint 0.
						// NOTE(review): unlike the other constructs, these values are not translated through
						// PdfTexterAdobeUndocumentedUnicodeMap - kept as is, to be confirmed.
						for  ( $i = $from, $count = 0 ; $i  <=  $to  &&  $count  <  $value_count ; $i ++, $count ++ )
							$this -> DirectMap [$i]		=  hexdec ( $values [ $count ] ) ;

						if ( PdfToText::$DEBUG  &&  $count  <  $to - $from + 1 )
							warning ( "Character map #$object_id : range $from..$to is followed by an array of only $count value(s)." ) ;

						$resume_index	=  $array_match [ 'tail' ] [1] + 1 ;
					}
					else
						error ( "Character range $from..$to not followed by an array of hexadecimal values in Unicode map #$object_id." ) ;
				}
				else
					error ( "Unexpected character '$next_char' in Unicode map #$object_id." ) ;

				$start_index	=  $resume_index ;
			}
		}

		// Sort the ranges by their starting offsets
		$this -> RangeCount	=  count ( $this -> RangeMap ) ;

		if ( $this -> RangeCount  >  1 )
			usort ( $this -> RangeMap, array ( $this, '__rangemap_cmpfunc' ) ) ;

		return ( $max_width ) ;
	}


	// __rangemap_cmpfunc -
	//	Comparison callback for usort() : orders two range entries by their starting offset.
	//	An explicit comparison is used rather than a subtraction, because the offsets come from
	//	hexdec(), which can return floats, while usort() expects an integer result.
	public function  __rangemap_cmpfunc ( $a, $b )
	{
		if ( $a [0]  ==  $b [0] )
			return ( 0 ) ;

		return ( ( $a [0]  <  $b [0] ) ?  -1 : 1 ) ;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    Interface implementations.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Number of entries in the direct substitution table (unresolved ranges are not counted)
	public function  count ( )
	{
		return ( count ( $this -> DirectMap ) ) ;
	}


	// A character exists in this map if it has a direct entry or if it belongs to one of the ranges
	public function  offsetExists ( $offset )
	{
		return ( $this -> offsetGetSafe ( $offset )  !==  false ) ;
	}


	// offsetGetSafe -
	//	Returns the substitution for character $offset, or false if this map does not define it.
	//	The substitution is passed through CodePointToUtf8() when $translate is true ; otherwise, the
	//	numeric value stored in the map is returned.
	public function  offsetGetSafe ( $offset, $translate = true )
	{
		// Character already has an entry (character reference => substituted character)
		if ( isset ( $this -> DirectMap [ $offset ] ) )
		{
			$value	=  $this -> DirectMap [ $offset ] ;

			return ( ( $translate ) ?  $this -> CodePointToUtf8 ( $value ) : $value ) ;
		}

		// No direct entry ; the ranges are worth searching only if the character lies within their overall bounds
		if ( ! $this -> RangeCount  ||  $offset  <  $this -> RangeMin  ||  $offset  >  $this -> RangeMax )
			return ( false ) ;

		// Use a dichotomic search through character ranges
		$low	=  0 ;
		$high	=  count ( $this -> RangeMap ) - 1 ;

		while  ( $low  <=  $high )
		{
			$middle		=  ( $low + $high )  >>  1 ;
			$range		=  $this -> RangeMap [ $middle ] ;

			if ( $offset  <  $range [0] )
				$high	=  $middle - 1 ;
			else if ( $offset  >  $range [1] )
				$low	=  $middle + 1 ;
			else
			{
				$result		=  $range [2] + $offset - $range [0] ;
				$code		=  ( $translate ) ?  $this -> CodePointToUtf8 ( $result ) : $result ;

				// Store the character in the DirectMap property so that it will be directly
				// retrieved during subsequent accesses
				$this -> DirectMap [ $offset ]	=  $result ;

				return ( $code ) ;
			}
		}

		return ( false ) ;
	}


	// Array access : returns the substitution for character $offset, or the character itself converted
	// by CodePointToUtf8() when this map does not define it
	public function  offsetGet ( $offset )
	{
		$code	=  $this -> offsetGetSafe ( $offset ) ;

		if ( $code  ===  false )
			$code	=  $this -> CodePointToUtf8 ( $offset ) ;

		return ( $code ) ;
	}
}
7479
7480
/*==============================================================================================================

    PdfTexterEncodingMap -
	A class for fonts having a character map specified with the /Encoding parameter.

  ==============================================================================================================*/
class  PdfTexterEncodingMap	extends  PdfTexterCharacterMap
{
	// Possible encodings (there is a 5th one, MacExpertEncoding, but used for "expert fonts" ; no need to deal
	// with it here since we only want to extract text)
	// Note that the values of these constants are direct indices to the second dimension of the $Encodings table
	const	PDF_STANDARD_ENCODING		=  0 ;
	const	PDF_MAC_ROMAN_ENCODING		=  1 ;
	const	PDF_WIN_ANSI_ENCODING		=  2 ;
	const	PDF_DOC_ENCODING		=  3 ;

	// Correspondence between a character name and its character code in each encoding, in the
	// following order : Standard, Mac, Windows, Pdf.
	// $GlobalEncodings is loaded once from the Maps/adobe-charsets.map file ; $Encodings is the
	// per-instance table, completed with the extra mappings supplied to the constructor.
	private static $GlobalEncodings = false ;
	public $Encodings ;
	// Encoding type (one of the PDF_*_ENCODING constants)
	public $Encoding ;
	// Indicates whether this character map is a secondary one used for Unicode maps ; this must be set at
	// a higher level by the PdfTexterFont because at the time a character map is instantiated, we do not know
	// yet whether it will be a primary (normal) map, or a map secondary to an existing Unicode map
	public $Secondary ;
	// Differences array (a character substitution table to the standard encodings)
	public $Map = array ( ) ;
	// A secondary map for the Differences array, which only contains the differences ; this is used
	// for Unicode fonts that also have an associated /Differences parameter, which should not include the
	// whole standard Adobe character map but only the differences of encodings
	public $SecondaryMap = array ( ) ;
	// Differences by position number (position => character name, without its variant suffix)
	public $DifferencesByPosition = array ( ) ;


	// Constructor -
	//	Determines the base encoding named in the definitions (WinAnsiEncoding when none is found),
	//	builds the initial character map from the encoding table, then applies the entries of the
	//	/Differences array to the Map, SecondaryMap and DifferencesByPosition properties.
	//	$extra_mappings is an array of additional character names, merged into the encoding table.
	public function  __construct ( $object_id, $definitions, $extra_mappings )
	{
		// Ignore character variants whose names end with these suffixes
		static	$IgnoredVariants	=  array
		   (
			'/\.scalt$/',
			'/\.sc$/',
			'/\.fitted$/',
			'/\.oldstyle$/',
			'/\.taboldstyle$/',
			'/\.alt$/',
			'/alt$/',
		    ) ;

		parent::__construct ( $object_id ) ;

		// Load the default Adobe character sets, if not already done
		if ( self::$GlobalEncodings  ===  false )
		{
			$charset_file		=  dirname ( __FILE__ ) . '/Maps/adobe-charsets.map' ;
			include ( $charset_file ) ;
			self::$GlobalEncodings	=  ( isset ( $adobe_charsets ) ) ?  $adobe_charsets : array ( ) ;
		}

		$this -> Encodings	=  array_merge ( self::$GlobalEncodings, $extra_mappings ) ;

		// Fonts using default Adobe character sets and hexadecimal representations are one-byte long
		$this -> HexCharWidth	=  2 ;

		if ( PdfToText::$DEBUG )
		{
			echo "\n----------------------------------- ENCODING CMAP #$object_id\n" ;
			echo $definitions;
		}

		// Retrieve text encoding
		preg_match ( '# / (?P<encoding> (WinAnsiEncoding) | (PDFDocEncoding) | (MacRomanEncoding) | (StandardEncoding) ) #ix',
				$definitions, $encoding_match ) ;

		if ( ! isset ( $encoding_match [ 'encoding' ] ) )
			$encoding_match [ 'encoding' ]	=  'WinAnsiEncoding' ;

		switch ( strtolower ( $encoding_match [ 'encoding' ] ) )
		{
			case	'pdfdocencoding'	:  $this -> Encoding	=  self::PDF_DOC_ENCODING 	; break ;
			case	'macromanencoding'	:  $this -> Encoding	=  self::PDF_MAC_ROMAN_ENCODING ; break ;
			case	'standardencoding'	:  $this -> Encoding	=  self::PDF_STANDARD_ENCODING	; break ;
			case	'winansiencoding'	:
			default				:  $this -> Encoding	=  self::PDF_WIN_ANSI_ENCODING ;
		}

		// Build a virgin character map using the detected encoding
		foreach ( $this -> Encodings  as  $code_array )
		{
			$char			=  $code_array [ $this -> Encoding ] ;
			$this -> Map [ $char ]	=  $char ;
		}

		// Extract the Differences array : use the array that follows the /Differences keyword when there is
		// one, so that another array located before it in the definitions is not mistaken for it ;
		// otherwise, fall back to the first array found
		if ( ! preg_match ( '/ \/Differences \s* \[ \s* (?P<contents> [^\]]*?) \s* \] /ix', $definitions, $match ) )
			preg_match ( '/ \[ \s* (?P<contents> [^\]]*?) \s* \] /x', $definitions, $match ) ;

		if ( ! isset ( $match [ 'contents' ] ) )
			return ;

		// Put a slash before each position number so that numbers and names can be split the same way
		$data		=  trim ( preg_replace ( '/\s+(\d+)/', '/$1', $match [ 'contents' ] ) ) ;
		$items		=  explode ( '/', $data ) ;
		$index		=  0 ;

		foreach ( $items  as  $raw_item )
		{
			$raw_item	=  trim ( $raw_item ) ;

			// The split yields empty tokens (for example before a leading slash, or for an empty array) ;
			// they are neither a position nor a name, and must not consume a position
			if ( $raw_item  ===  '' )
				continue ;

			$item		=  PdfToText::DecodeRawName ( $raw_item ) ;

			// Integer value : index of next character in map
			if ( is_numeric ( $item ) )
			{
				$index	=  ( int ) $item ;
				continue ;
			}

			// String value : a character name, as defined by Adobe. Remove its variant part.
			$item		=  preg_replace ( $IgnoredVariants, '', trim ( $item ) ) ;

			// Keyword (character name) exists in the encoding table
			if ( isset ( $this -> Encodings [ $item ] ) )
			{
				$code	=  $this -> Encodings [ $item ] [ $this -> Encoding ] ;
			}
			// Not defined ; check if this is the "/gxx" notation, where "xx" is a number
			else if ( preg_match ( '/g (?P<value> \d+)/x', $item, $name_match ) )
			{
				// In my current state of investigations, the /g notation has the following characteristics :
				// - The value 29 must be added to the number after the "/g" string (why ???)
				// - The value after the "/g" string can be greater than 255, meaning that it could be Unicode codepoint
				// This has to be carefully watched before revision
				$code	=  ( ( int ) $name_match [ 'value' ] ) + 29 ;
			}
			// Some characters can be specified by the "/uni" prefix followed by a sequence of hex digits,
			// which is not described by the PDF specifications. This sequence gives a Unicode code point.
			else if ( preg_match ( '/uni (?P<value> [0-9a-f]+)/ix', $item, $name_match ) )
			{
				$code	=  ( int ) hexdec ( $name_match [ 'value' ] ) ;
			}
			// Otherwise, put a question mark instead
			else
			{
				if ( PdfToText::$DEBUG )
					warning ( "Unknown character name found in a /Differences[] array : [$item]" ) ;

				$code	=  ord ( '?' ) ;
			}

			$this -> Map [ $index ]				=  $code ;
			$this -> SecondaryMap [ $index ]		=  $code ;
			$this -> DifferencesByPosition [ $index ]	=  $item ;

			$index ++ ;
		}
	}


	/*--------------------------------------------------------------------------------------------------------------

	    Interface implementations.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Number of entries in the full character map
	public function  count ( )
	{
		return ( count ( $this -> Map ) ) ;
	}


	// A primary map is checked against the full character map ; a secondary map, against the differences only
	public function  offsetExists ( $offset )
	{
		if ( ! $this -> Secondary )
			return ( isset ( $this -> Map [ $offset ] ) ) ;

		return ( isset ( $this -> SecondaryMap [ $offset ] ) ) ;
	}


	// Array access -
	//	Primary map : returns the substitution for character $offset (or the character itself when it has
	//	no entry), after a last translation through the WinAnsi or MacRoman table or through the encoding
	//	table, converted by CodePointToUtf8().
	//	Secondary map : returns the converted entry of the differences map, or false if there is none.
	public function  offsetGet ( $offset )
	{
		if ( ! $this -> Secondary )
		{
			$ord	=  ( isset ( $this -> Map [ $offset ] ) ) ?  $this -> Map [ $offset ] : $offset ;

			// Check for final character translations (concerns only a few number of characters)
			if ( $this -> Encoding  ==  self::PDF_WIN_ANSI_ENCODING  &&  isset ( PdfTexterAdobeWinAnsiMap::$WinAnsiCharacterMap [0] [ $ord ] ) )
				$ord	=  PdfTexterAdobeWinAnsiMap::$WinAnsiCharacterMap [0] [ $ord ] ;
			else if ( $this -> Encoding  ==  self::PDF_MAC_ROMAN_ENCODING  &&  isset ( PdfTexterAdobeMacRomanMap::$MacRomanCharacterMap [0] [ $ord ] ) )
				$ord	=  PdfTexterAdobeMacRomanMap::$MacRomanCharacterMap [0] [ $ord ] ;
			// As far as I have been able to see, the values expressed by the /Differences tag were the only ones used within the
			// Pdf document ; however, handle the case where some characters do not belong to the characters listed by /Differences,
			// and use the official Adobe encoding maps when necessary
			else if ( isset ( $this -> Encodings [ $ord ] [ $this -> Encoding ] ) )
				$ord	=  $this -> Encodings [ $ord ] [ $this -> Encoding ] ;

			return ( $this -> CodePointToUtf8 ( $ord ) ) ;
		}

		if ( isset ( $this -> SecondaryMap [ $offset ] ) )
			return ( $this -> CodePointToUtf8 ( $this -> SecondaryMap [ $offset ] ) ) ;

		return ( false ) ;
	}
}
7698
7699
7700/**************************************************************************************************************
7701 **************************************************************************************************************
7702 **************************************************************************************************************
7703 ****** ******
7704 ****** ******
7705 ****** CHARACTER MAP MANAGEMENT ******
7706 ****** ******
7707 ****** ******
7708 **************************************************************************************************************
7709 **************************************************************************************************************
7710 **************************************************************************************************************/
7711
/*==============================================================================================================

    class PdfTexterAdobeMap -
	Abstract class to handle Adobe-specific fonts.
	A derived class supplies a table of character maps, one per font variant ; the array access
	operator then translates characters using the map of the variant selected at construction time.

  ==============================================================================================================*/
abstract class  PdfTexterAdobeMap	extends  PdfTexterCharacterMap
{
	// Font variant ; one of the PdfTexterFont::FONT_VARIANT_* constants
	public $Variant ;
	// Character maps supplied by the derived class, indexed by font variant
	public $Map ;


	// Constructor -
	//	Stores the font variant and the character maps, and reports an error if no map is defined
	//	for the requested variant.
	public function  __construct ( $object_id, $font_variant, $map )
	{
		parent::__construct ( $object_id ) ;

		$this -> HexCharWidth	=  2 ;
		$this -> Variant	=  $font_variant ;
		$this -> Map		=  $map ;

		if ( ! isset ( $map [ $font_variant ] ) )
			error ( new PdfToTextDecodingException ( "Undefined font variant #$font_variant." ) ) ;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    Interface implementations.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Number of entries in the map of the current font variant.
	// (the Map property must be referenced as "-> Map" : "-> $Map" would be a variable property access
	// through an undefined local variable)
	public function  count ( )
	{
		return ( count ( $this -> Map [ $this -> Variant ] ) ) ;
	}


	public function  offsetExists ( $offset )
	{
		return ( isset ( $this -> Map [ $this -> Variant ] [ $offset ] ) ) ;
	}


	// Returns the substitution defined for character $offset in the current variant, or the character
	// itself when there is none, converted by CodePointToUtf8()
	public function  offsetGet ( $offset )
	{
		if ( isset ( $this -> Map [ $this -> Variant ] [ $offset ] ) )
			$ord	=  $this -> Map [ $this -> Variant ] [ $offset ] ;
		else
			$ord	=  $offset ;

		return ( $this -> CodePointToUtf8 ( $ord ) ) ;
	}
}
7762
7763
7764/*==============================================================================================================
7765
7766 class PdfTexterAdobeWinAnsiMap -
7767 Abstract class to handle Adobe-specific Win Ansi fonts.
7768
7769 ==============================================================================================================*/
7770class PdfTexterAdobeWinAnsiMap extends PdfTexterAdobeMap
7771 {
7772 // Windows Ansi mapping to Unicode. Only substitutions that have no direct equivalent are listed here
7773 // Source : https://msdn.microsoft.com/en-us/goglobal/cc305145.aspx
7774 // Only characters from 0x80 to 0x9F have no direct translation
public static $WinAnsiCharacterMap = array
(
    // Table 0 - Western WinAnsi (Windows-1252).
    // Only codes within 0x80..0x9F are listed ; 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no entry in this table.
    0 => array
    (
        0x80 => 0x20AC, 0x82 => 0x201A, 0x83 => 0x0192, 0x84 => 0x201E,
        0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02C6,
        0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039, 0x8C => 0x0152,
        0x8E => 0x017D, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
        0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
        0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
        0x9C => 0x0153, 0x9E => 0x017E, 0x9F => 0x0178
    ),
    // Table 1 - Cyrillic.
    // The 0xC0..0xFF => U+0410..U+044F layout below is the one of Windows-1251 (in ISO 8859-5, the capital
    // letters start at 0xB0, not 0xC0).
    // NOTE(review): presumably selected through the $font_variant constructor argument - confirm in the parent class.
    1 => array
    (
        // Typographic double quotes are flattened to the plain ASCII double quote
        0x93 => 0x0022, 0x94 => 0x0022,
        // Capital letters, U+0410 through U+042F
        0xC0 => 0x0410, 0xC1 => 0x0411, 0xC2 => 0x0412, 0xC3 => 0x0413,
        0xC4 => 0x0414, 0xC5 => 0x0415, 0xC6 => 0x0416, 0xC7 => 0x0417,
        0xC8 => 0x0418, 0xC9 => 0x0419, 0xCA => 0x041A, 0xCB => 0x041B,
        0xCC => 0x041C, 0xCD => 0x041D, 0xCE => 0x041E, 0xCF => 0x041F,
        0xD0 => 0x0420, 0xD1 => 0x0421, 0xD2 => 0x0422, 0xD3 => 0x0423,
        0xD4 => 0x0424, 0xD5 => 0x0425, 0xD6 => 0x0426, 0xD7 => 0x0427,
        0xD8 => 0x0428, 0xD9 => 0x0429, 0xDA => 0x042A, 0xDB => 0x042B,
        0xDC => 0x042C, 0xDD => 0x042D, 0xDE => 0x042E, 0xDF => 0x042F,
        // Small letters, U+0430 through U+044F
        0xE0 => 0x0430, 0xE1 => 0x0431, 0xE2 => 0x0432, 0xE3 => 0x0433,
        0xE4 => 0x0434, 0xE5 => 0x0435, 0xE6 => 0x0436, 0xE7 => 0x0437,
        0xE8 => 0x0438, 0xE9 => 0x0439, 0xEA => 0x043A, 0xEB => 0x043B,
        0xEC => 0x043C, 0xED => 0x043D, 0xEE => 0x043E, 0xEF => 0x043F,
        0xF0 => 0x0440, 0xF1 => 0x0441, 0xF2 => 0x0442, 0xF3 => 0x0443,
        0xF4 => 0x0444, 0xF5 => 0x0445, 0xF6 => 0x0446, 0xF7 => 0x0447,
        0xF8 => 0x0448, 0xF9 => 0x0449, 0xFA => 0x044A, 0xFB => 0x044B,
        0xFC => 0x044C, 0xFD => 0x044D, 0xFE => 0x044E, 0xFF => 0x044F
    )
) ;
7879
// Builds the map for the object $object_id and the font variant $font_variant ; the static WinAnsi
// translation tables are simply handed over to the parent constructor.
public function __construct ( $object_id, $font_variant )
{
    parent::__construct
    (
        $object_id,
        $font_variant,
        self::$WinAnsiCharacterMap
    ) ;
}
7884 }
7885
7886
7887/*==============================================================================================================
7888
7889 class PdfTexterAdobeMacRomanMap -
    Class handling Adobe-specific Mac Roman fonts (the class itself is concrete, not abstract).
7891
7892 ==============================================================================================================*/
class PdfTexterAdobeMacRomanMap extends PdfTexterAdobeMap
{
    // Translation table from Mac OS Roman codes 0x80..0xFF to Unicode code points.
    // Reference : ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
    // A single table (index 0) is defined.
    public static $MacRomanCharacterMap = array
    (
        0 => array
        (
            // 0x80..0x8F
            0x80 => 0x00C4,     // LATIN CAPITAL LETTER A WITH DIAERESIS
            0x81 => 0x00C5,     // LATIN CAPITAL LETTER A WITH RING ABOVE
            0x82 => 0x00C7,     // LATIN CAPITAL LETTER C WITH CEDILLA
            0x83 => 0x00C9,     // LATIN CAPITAL LETTER E WITH ACUTE
            0x84 => 0x00D1,     // LATIN CAPITAL LETTER N WITH TILDE
            0x85 => 0x00D6,     // LATIN CAPITAL LETTER O WITH DIAERESIS
            0x86 => 0x00DC,     // LATIN CAPITAL LETTER U WITH DIAERESIS
            0x87 => 0x00E1,     // LATIN SMALL LETTER A WITH ACUTE
            0x88 => 0x00E0,     // LATIN SMALL LETTER A WITH GRAVE
            0x89 => 0x00E2,     // LATIN SMALL LETTER A WITH CIRCUMFLEX
            0x8A => 0x00E4,     // LATIN SMALL LETTER A WITH DIAERESIS
            0x8B => 0x00E3,     // LATIN SMALL LETTER A WITH TILDE
            0x8C => 0x00E5,     // LATIN SMALL LETTER A WITH RING ABOVE
            0x8D => 0x00E7,     // LATIN SMALL LETTER C WITH CEDILLA
            0x8E => 0x00E9,     // LATIN SMALL LETTER E WITH ACUTE
            0x8F => 0x00E8,     // LATIN SMALL LETTER E WITH GRAVE

            // 0x90..0x9F
            0x90 => 0x00EA,     // LATIN SMALL LETTER E WITH CIRCUMFLEX
            0x91 => 0x00EB,     // LATIN SMALL LETTER E WITH DIAERESIS
            0x92 => 0x00ED,     // LATIN SMALL LETTER I WITH ACUTE
            0x93 => 0x00EC,     // LATIN SMALL LETTER I WITH GRAVE
            0x94 => 0x00EE,     // LATIN SMALL LETTER I WITH CIRCUMFLEX
            0x95 => 0x00EF,     // LATIN SMALL LETTER I WITH DIAERESIS
            0x96 => 0x00F1,     // LATIN SMALL LETTER N WITH TILDE
            0x97 => 0x00F3,     // LATIN SMALL LETTER O WITH ACUTE
            0x98 => 0x00F2,     // LATIN SMALL LETTER O WITH GRAVE
            0x99 => 0x00F4,     // LATIN SMALL LETTER O WITH CIRCUMFLEX
            0x9A => 0x00F6,     // LATIN SMALL LETTER O WITH DIAERESIS
            0x9B => 0x00F5,     // LATIN SMALL LETTER O WITH TILDE
            0x9C => 0x00FA,     // LATIN SMALL LETTER U WITH ACUTE
            0x9D => 0x00F9,     // LATIN SMALL LETTER U WITH GRAVE
            0x9E => 0x00FB,     // LATIN SMALL LETTER U WITH CIRCUMFLEX
            0x9F => 0x00FC,     // LATIN SMALL LETTER U WITH DIAERESIS

            // 0xA0..0xAF
            0xA0 => 0x2020,     // DAGGER
            0xA1 => 0x00B0,     // DEGREE SIGN
            0xA2 => 0x00A2,     // CENT SIGN
            0xA3 => 0x00A3,     // POUND SIGN
            0xA4 => 0x00A7,     // SECTION SIGN
            0xA5 => 0x2022,     // BULLET
            0xA6 => 0x00B6,     // PILCROW SIGN
            0xA7 => 0x00DF,     // LATIN SMALL LETTER SHARP S
            0xA8 => 0x00AE,     // REGISTERED SIGN
            0xA9 => 0x00A9,     // COPYRIGHT SIGN
            0xAA => 0x2122,     // TRADE MARK SIGN
            0xAB => 0x00B4,     // ACUTE ACCENT
            0xAC => 0x00A8,     // DIAERESIS
            0xAD => 0x2260,     // NOT EQUAL TO
            0xAE => 0x00C6,     // LATIN CAPITAL LETTER AE
            0xAF => 0x00D8,     // LATIN CAPITAL LETTER O WITH STROKE

            // 0xB0..0xBF
            0xB0 => 0x221E,     // INFINITY
            0xB1 => 0x00B1,     // PLUS-MINUS SIGN
            0xB2 => 0x2264,     // LESS-THAN OR EQUAL TO
            0xB3 => 0x2265,     // GREATER-THAN OR EQUAL TO
            0xB4 => 0x00A5,     // YEN SIGN
            0xB5 => 0x00B5,     // MICRO SIGN
            0xB6 => 0x2202,     // PARTIAL DIFFERENTIAL
            0xB7 => 0x2211,     // N-ARY SUMMATION
            0xB8 => 0x220F,     // N-ARY PRODUCT
            0xB9 => 0x03C0,     // GREEK SMALL LETTER PI
            0xBA => 0x222B,     // INTEGRAL
            0xBB => 0x00AA,     // FEMININE ORDINAL INDICATOR
            0xBC => 0x00BA,     // MASCULINE ORDINAL INDICATOR
            0xBD => 0x03A9,     // GREEK CAPITAL LETTER OMEGA
            0xBE => 0x00E6,     // LATIN SMALL LETTER AE
            0xBF => 0x00F8,     // LATIN SMALL LETTER O WITH STROKE

            // 0xC0..0xCF
            0xC0 => 0x00BF,     // INVERTED QUESTION MARK
            0xC1 => 0x00A1,     // INVERTED EXCLAMATION MARK
            0xC2 => 0x00AC,     // NOT SIGN
            0xC3 => 0x221A,     // SQUARE ROOT
            0xC4 => 0x0192,     // LATIN SMALL LETTER F WITH HOOK
            0xC5 => 0x2248,     // ALMOST EQUAL TO
            0xC6 => 0x2206,     // INCREMENT
            0xC7 => 0x00AB,     // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
            0xC8 => 0x00BB,     // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
            0xC9 => 0x2026,     // HORIZONTAL ELLIPSIS
            0xCA => 0x00A0,     // NO-BREAK SPACE
            0xCB => 0x00C0,     // LATIN CAPITAL LETTER A WITH GRAVE
            0xCC => 0x00C3,     // LATIN CAPITAL LETTER A WITH TILDE
            0xCD => 0x00D5,     // LATIN CAPITAL LETTER O WITH TILDE
            0xCE => 0x0152,     // LATIN CAPITAL LIGATURE OE
            0xCF => 0x0153,     // LATIN SMALL LIGATURE OE

            // 0xD0..0xDF
            0xD0 => 0x2013,     // EN DASH
            0xD1 => 0x2014,     // EM DASH
            0xD2 => 0x201C,     // LEFT DOUBLE QUOTATION MARK
            0xD3 => 0x201D,     // RIGHT DOUBLE QUOTATION MARK
            0xD4 => 0x2018,     // LEFT SINGLE QUOTATION MARK
            0xD5 => 0x2019,     // RIGHT SINGLE QUOTATION MARK
            0xD6 => 0x00F7,     // DIVISION SIGN
            0xD7 => 0x25CA,     // LOZENGE
            0xD8 => 0x00FF,     // LATIN SMALL LETTER Y WITH DIAERESIS
            0xD9 => 0x0178,     // LATIN CAPITAL LETTER Y WITH DIAERESIS
            0xDA => 0x2044,     // FRACTION SLASH
            0xDB => 0x20AC,     // EURO SIGN
            0xDC => 0x2039,     // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
            0xDD => 0x203A,     // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
            0xDE => 0xFB01,     // LATIN SMALL LIGATURE FI
            0xDF => 0xFB02,     // LATIN SMALL LIGATURE FL

            // 0xE0..0xEF
            0xE0 => 0x2021,     // DOUBLE DAGGER
            0xE1 => 0x00B7,     // MIDDLE DOT
            0xE2 => 0x201A,     // SINGLE LOW-9 QUOTATION MARK
            0xE3 => 0x201E,     // DOUBLE LOW-9 QUOTATION MARK
            0xE4 => 0x2030,     // PER MILLE SIGN
            0xE5 => 0x00C2,     // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
            0xE6 => 0x00CA,     // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
            0xE7 => 0x00C1,     // LATIN CAPITAL LETTER A WITH ACUTE
            0xE8 => 0x00CB,     // LATIN CAPITAL LETTER E WITH DIAERESIS
            0xE9 => 0x00C8,     // LATIN CAPITAL LETTER E WITH GRAVE
            0xEA => 0x00CD,     // LATIN CAPITAL LETTER I WITH ACUTE
            0xEB => 0x00CE,     // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
            0xEC => 0x00CF,     // LATIN CAPITAL LETTER I WITH DIAERESIS
            0xED => 0x00CC,     // LATIN CAPITAL LETTER I WITH GRAVE
            0xEE => 0x00D3,     // LATIN CAPITAL LETTER O WITH ACUTE
            0xEF => 0x00D4,     // LATIN CAPITAL LETTER O WITH CIRCUMFLEX

            // 0xF0..0xFF
            0xF0 => 0xF8FF,     // Apple logo
            0xF1 => 0x00D2,     // LATIN CAPITAL LETTER O WITH GRAVE
            0xF2 => 0x00DA,     // LATIN CAPITAL LETTER U WITH ACUTE
            0xF3 => 0x00DB,     // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
            0xF4 => 0x00D9,     // LATIN CAPITAL LETTER U WITH GRAVE
            0xF5 => 0x0131,     // LATIN SMALL LETTER DOTLESS I
            0xF6 => 0x02C6,     // MODIFIER LETTER CIRCUMFLEX ACCENT
            0xF7 => 0x02DC,     // SMALL TILDE
            0xF8 => 0x00AF,     // MACRON
            0xF9 => 0x02D8,     // BREVE
            0xFA => 0x02D9,     // DOT ABOVE
            0xFB => 0x02DA,     // RING ABOVE
            0xFC => 0x00B8,     // CEDILLA
            0xFD => 0x02DD,     // DOUBLE ACUTE ACCENT
            0xFE => 0x02DB,     // OGONEK
            0xFF => 0x02C7      // CARON
        )
    ) ;


    // Builds the map for the object $object_id and the font variant $font_variant ; the static Mac Roman
    // translation table is handed over to the parent constructor.
    public function __construct ( $object_id, $font_variant )
    {
        parent::__construct
        (
            $object_id,
            $font_variant,
            self::$MacRomanCharacterMap
        ) ;
    }
}
8038
8039
8040/*==============================================================================================================
8041
8042 class PdfTexterAdobeUndocumentedUnicodeMap -
8043 Sometimes, Unicode maps translate character ids to something in the range 0xF000..0xF0FF (or maybe more).
8044 These mapped characters do not correspond to anything else in Unicode, but rather to a special character
8045 set.
    This class is not meant to be instantiated by anything here, but rather used for its $UnicodeMap property.
    Note that the $UnicodeMap array is not complete.
8048
8049 ==============================================================================================================*/
class PdfTexterAdobeUndocumentedUnicodeMap extends PdfTexterAdobeMap
{
    // Translation table from the private 0xF0xx codes to plain characters (incomplete).
    // Unlike the tables of the sibling Adobe maps, this one is flat : it is not indexed by variant number.
    public static $UnicodeMap = array
    (
        // '0' through '9'
        0xF0F0 => 0x30, 0xF0EF => 0x31, 0xF0EE => 0x32, 0xF0ED => 0x33, 0xF0EC => 0x34,
        0xF0EB => 0x35, 0xF0EA => 0x36, 0xF0E9 => 0x37, 0xF0E8 => 0x38, 0xF0E7 => 0x39,

        // 'A' through 'Z'
        0xF0DF => 0x41, 0xF0DE => 0x42, 0xF0DD => 0x43, 0xF0DC => 0x44, 0xF0DB => 0x45,
        0xF0DA => 0x46, 0xF0D9 => 0x47, 0xF0D8 => 0x48, 0xF0D7 => 0x49, 0xF0D6 => 0x4A,
        0xF0D5 => 0x4B, 0xF0D4 => 0x4C, 0xF0D3 => 0x4D, 0xF0D2 => 0x4E, 0xF0D1 => 0x4F,
        0xF0D0 => 0x50, 0xF0CF => 0x51, 0xF0CE => 0x52, 0xF0CD => 0x53, 0xF0CC => 0x54,
        0xF0CB => 0x55, 0xF0CA => 0x56, 0xF0C9 => 0x57, 0xF0C8 => 0x58, 0xF0C7 => 0x59,
        0xF0C6 => 0x5A,

        // 'a' through 'z'
        0xF0BF => 0x61, 0xF0BE => 0x62, 0xF0BD => 0x63, 0xF0BC => 0x64, 0xF0BB => 0x65,
        0xF0BA => 0x66, 0xF0B9 => 0x67, 0xF0B8 => 0x68, 0xF0B7 => 0x69, 0xF0B6 => 0x6A,
        0xF0B5 => 0x6B, 0xF0B4 => 0x6C, 0xF0B3 => 0x6D, 0xF0B2 => 0x6E, 0xF0B1 => 0x6F,
        0xF0B0 => 0x70, 0xF0AF => 0x71, 0xF0AE => 0x72, 0xF0AD => 0x73, 0xF0AC => 0x74,
        0xF0AB => 0x75, 0xF0AA => 0x76, 0xF0A9 => 0x77, 0xF0A8 => 0x78, 0xF0A7 => 0x79,
        0xF0A6 => 0x7A,

        // Punctuation, space and accented letters
        0xF0F1 => 0x2F,         // '/'
        0xF0E6 => 0x3A,         // ':'
        0xF0F3 => 0x2D,         // '-'
        0xF0F8 => 0x28,         // '('
        0xF0F7 => 0x29,         // ')'
        0xF0F2 => 0x2E,         // '.'
        0xF020 => 0x20,         // Space
        0xF0F9 => 0x27,         // "'"
        0xF037 => 0xE9,         // é
        0xF038 => 0xE8,         // è
    ) ;


    // Hands the translation table over to the parent constructor.
    // NOTE(review): the sibling maps pass an array of tables indexed by variant while this one passes a flat
    // table ; the class comment says this class is not meant to be instantiated - confirm before relying on it.
    public function __construct ( $object_id, $font_variant )
    {
        parent::__construct
        (
            $object_id,
            $font_variant,
            self::$UnicodeMap
        ) ;
    }
}
8135
8136
8137/*==============================================================================================================
8138
8139 PdfTexterCIDMap -
8140 A class for mapping (or trying to...) CID fonts.
8141
8142 ==============================================================================================================*/
abstract class PdfTexterCIDMap extends PdfTexterCharacterMap
{
    // CID maps are associative arrays whose keys are the font CIDs (currently expressed as numeric values) and
    // whose values are the corresponding UTF8 representations. The following special values can also be used
    // instead of a UTF8 string :
    // UNKNOWN_CID :
    //    The CID has no known UTF8 counterpart. It is translated to an empty string or, when PdfToText::$DEBUG
    //    is true, to the string "[UID:abcd]", where "abcd" is the hex representation of the CID. This way, new
    //    CID tables can be built using this information.
    const UNKNOWN_CID = -1 ;
    // ALT_CID :
    //    The CID is a prefix : it produces no output by itself, but the CID that follows it is looked up in
    //    the $map [ 'alt' ] [ prefix cid ] table instead of being translated through the 'plain' table.
    //    This dates from the first interpretation of CID fonts and is highly subject to change.
    const ALT_CID = -2 ;


    // CID font map file ; the file is a PHP script that must define an array of the form :
    //    $map = array
    //       (
    //        'plain' => array
    //           (
    //            $cid1 => $utf1,
    //            ...
    //           )
    //       ) ;
    // The value is false when no map file could be located.
    protected $MapFile ;
    // Map, loaded into memory (an empty array when no map could be loaded)
    protected $Map ;
    // Map cache, indexed by "map name:font variant" - the interest is to avoid unnecessary includes
    private static $CachedMaps = array ( ) ;

    // CID of the last ALT_CID prefix seen by offsetGet(), or false if there is no pending prefix
    private $LastAltOffset = false ;


    /*--------------------------------------------------------------------------------------------------------------

        Constructor -
            Loads the CID map that corresponds to the specified map name and font variant.
            If the map file contains a definition such as :

                $map = 'IDENTITY-H-GQJGLM.cid' ;

            then the specified file (searched in PdfToText::$CIDTablesDirectory) will be loaded instead. Only
            one level of indirection is supported.

        PARAMETERS
            $object_id (integer) -
                Id of the pdf object, passed as is to the parent constructor.

            $map_name (string) -
                Map name, such as 'IDENTITY-H'.

            $font_variant (string) -
                Font variant, such as "ABCD+Italic-Arial". See __get_cid_file().

     *-------------------------------------------------------------------------------------------------------------*/
    public function __construct ( $object_id, $map_name, $font_variant )
    {
        // Initialize parent objects
        parent::__construct ( $object_id ) ;
        $this -> HexCharWidth = 4 ;    // So far, CIDs are 2-bytes long

        // Since alternate characters can be prefixed by an ALT_CID entry, two calls to the array access operator
        // may be needed to retrieve one character : the result of a lookup depends on the previous one.
        // This is why we have to tell the upper layers not to cache the results.
        $this -> Cache = false ;

        $map_index = "$map_name:$font_variant" ;

        // If this font has already been loaded somewhere, then reuse its information
        if ( isset ( self::$CachedMaps [ $map_index ] ) )
        {
            $map  = self::$CachedMaps [ $map_index ] [ 'map' ] ;
            $file = self::$CachedMaps [ $map_index ] [ 'file' ] ;
        }
        // Otherwise, locate and load the map file
        else
        {
            $file = $this -> __get_cid_file ( $map_name, $font_variant ) ;

            // No CID map found : CID numbers will not be translated
            if ( $file === false || ! file_exists ( $file ) )
            {
                if ( PdfToText::$DEBUG )
                    warning ( new PdfToTextDecodingException ( "Could not find CID table \"$map_name\" in directory \"" . PdfToText::$CIDTablesDirectory . "\"." ) ) ;
            }
            else
            {
                // The included script is expected to define the $map variable in the current scope
                include ( $file ) ;

                // We authorize one CID map to contain the name of another CID map file, instead of the map itself
                if ( isset ( $map ) && is_string ( $map ) )
                {
                    $file = PdfToText::$CIDTablesDirectory . "/$map" ;

                    // Forget the file name, so that a missing or empty target cannot leave a string in place of
                    // the map
                    unset ( $map ) ;

                    if ( file_exists ( $file ) )
                        include ( $file ) ;
                }

                // Only a real array is accepted as a map : anything else would break count() and the array
                // access methods
                if ( isset ( $map ) && is_array ( $map ) )
                    self::$CachedMaps [ $map_index ] = array ( 'file' => $file, 'map' => $map ) ;
                else
                {
                    unset ( $map ) ;

                    if ( PdfToText::$DEBUG )
                        warning ( new PdfToTextDecodingException ( "CID \"$file\" does not contain any definition." ) ) ;
                }
            }
        }

        // Save map info for this CID font
        $this -> MapFile = $file ;
        $this -> Map     = ( isset ( $map ) ) ? $map : array ( ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        __get_cid_file -
            Searches in the CIDTables directory for the CID map that best matches the specified map name (usually,
            IDENTITY-H) and the optional font variant.

            If a font variant has been specified, like "ABCD+Italic-Arial", then the CID tables directory will be
            searched for the following files, in the following order :
            - IDENTITY-H-ABCD+Italic-Arial.cid
            - IDENTITY-H-ABCD+Italic.cid
            - IDENTITY-H-ABCD.cid
            - If none found, then IDENTITY-H-empty.cid will be used and a warning will be issued in debug mode.

            Whether a font variant has been specified or not, "$map_name.cid" is the last file that is tried.

        RETURN VALUE
            The path of the first existing file, or false if none of the candidates exists.

     *-------------------------------------------------------------------------------------------------------------*/
    private function __get_cid_file ( $map_name, $font_variant )
    {
        $files = array ( ) ;

        // Search for font variants, if any
        if ( $font_variant )
        {
            if ( preg_match ( '/^ (?P<name> [a-z_][a-z_0-9]*) (?P<rest> [\-+] .*) $/imsx' , $font_variant, $match ) )
            {
                $basename = '-' . $match [ 'name' ] ;

                if ( preg_match_all ( '/ (?P<sep> [\-+]) (?P<name> [^\-+]+) /ix', $match [ 'rest' ], $other_matches ) )
                {
                    // Start with all the parts of the variant name, then drop the trailing parts one by one,
                    // until only the base name remains
                    for ( $i = count ( $other_matches [ 'name' ] ) ; $i >= 0 ; $i -- )
                    {
                        $new_file = $basename ;

                        // Append the first $i parts, each one preceded by its own separator
                        for ( $j = 0 ; $j < $i ; $j ++ )
                            $new_file .= $other_matches [ 'sep' ] [$j] . $other_matches [ 'name' ] [$j] ;

                        $files [] = array ( PdfToText::$CIDTablesDirectory . "/$map_name$new_file.cid", 'standard' ) ;
                    }
                }
            }

            // Last one will be the empty CID font
            $files [] = array ( PdfToText::$CIDTablesDirectory . "/IDENTITY-H-empty.cid", 'empty' ) ;
        }

        // Add the specified map file
        $files [] = array ( PdfToText::$CIDTablesDirectory . "/$map_name.cid", 'default' ) ;

        // The first existing file in the list should be the appropriate one
        foreach ( $files as $file )
        {
            if ( ! file_exists ( $file [0] ) )
                continue ;

            if ( PdfToText::$DEBUG )
            {
                if ( $file [1] === 'empty' )
                    warning ( new PdfToTextDecodingException ( "Using empty IDENTITY-H definition for map \"$map_name\", variant \"$font_variant\"." ) ) ;
                else if ( $file [1] === 'default' )
                    warning ( new PdfToTextDecodingException ( "Using default IDENTITY-H definition for map \"$map_name\"." ) ) ;
            }

            return ( $file [0] ) ;
        }

        // No CID font found
        return ( false ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        Interface implementations.

     *-------------------------------------------------------------------------------------------------------------*/

    // NOTE(review): this counts the top-level entries of the map ('plain', 'alt'), not the number of CIDs of
    // the 'plain' table that offsetExists() and offsetGet() work on - confirm what callers expect.
    public function count ( )
    { return ( count ( $this -> Map ) ) ; }


    // Tells whether the 'plain' table has an entry for the specified CID
    public function offsetExists ( $offset )
    { return ( isset ( $this -> Map [ 'plain' ] [ $offset ] ) ) ; }


    // Translates the specified CID. The result depends on the previous call when the previous CID was an
    // ALT_CID prefix. The returned value is :
    // - An empty string if the CID is not in the 'plain' table, if it is an ALT_CID prefix, or if it is unknown
    //   and debug mode is off
    // - A "[UID:xxxx]" or "[CIDn: x]" marker (which is also echoed) for unknown CIDs when debug mode is on
    // - The value found in the 'plain' table or, after an ALT_CID prefix, in the 'alt' table of that prefix
    public function offsetGet ( $offset )
    {
        // CIDs which are not in the 'plain' table produce nothing and cancel any pending ALT_CID prefix
        if ( ! isset ( $this -> Map [ 'plain' ] [ $offset ] ) )
        {
            $this -> LastAltOffset = false ;

            return ( '' ) ;
        }

        $ch = $this -> Map [ 'plain' ] [ $offset ] ;

        switch ( $ch )
        {
            case self::UNKNOWN_CID :
                $this -> LastAltOffset = false ;

                if ( ! PdfToText::$DEBUG )
                    return ( '' ) ;

                $marker = '[UID:' . sprintf ( '%04x', $offset ) . "]" ;
                echo ( $marker ) ;

                return ( $marker ) ;

            case self::ALT_CID :
                // Remember the prefix : the next CID will be searched in its 'alt' table
                $this -> LastAltOffset = ( int ) $offset ;

                return ( '' ) ;
        }

        // Regular character, not preceded by an ALT_CID prefix
        if ( $this -> LastAltOffset === false )
            return ( $ch ) ;

        // Character preceded by an ALT_CID prefix : the prefix is consumed, whatever the outcome
        $alt_offset            = $this -> LastAltOffset ;
        $this -> LastAltOffset = false ;

        if ( ! isset ( $this -> Map [ 'alt' ] [ $alt_offset ] [ $offset ] ) )
            return ( '' ) ;

        $ch2 = $this -> Map [ 'alt' ] [ $alt_offset ] [ $offset ] ;

        if ( $ch2 == self::UNKNOWN_CID )
        {
            // As for the 'plain' table, an unknown CID must not leak the UNKNOWN_CID marker value to the caller
            if ( ! PdfToText::$DEBUG )
                return ( '' ) ;

            echo ( "[CID{$alt_offset}:" . sprintf ( '%04x', $offset ) . "]" ) ;

            $ch2 = "[CID{$alt_offset}: $offset]" ;
        }

        return ( $ch2 ) ;
    }
}
8386
8387
8388
8389/*==============================================================================================================
8390
8391 PdfTexterIdentityHCIDMap -
8392 A class for mapping IDENTITY-H CID fonts (or trying to...).
8393
8394 ==============================================================================================================*/
class PdfTexterIdentityHCIDMap extends PdfTexterCIDMap
{
    // Loads the 'IDENTITY-H' CID map for the pdf object $object_id and the font variant $font_variant ;
    // everything else is handled by the parent class.
    public function __construct ( $object_id, $font_variant )
    {
        parent::__construct
        (
            $object_id,
            'IDENTITY-H',
            $font_variant
        ) ;
    }
}
8402
8403
8404
8405/*==============================================================================================================
8406
8407 PdfTexterPageMap -
8408 A class for detecting page objects mappings and retrieving page number for a specified object.
8409 There is a quadruple level of indirection here :
8410
8411 - The first level contains a /Type /Catalog parameter, with a /Pages one that references an object which
8412 contains a /Count and /Kids. I don't know yet if the /Pages parameter can reference more than one
8413 object using the array notation. However, the class is designed to handle such situations.
    - The object containing the /Kids parameter references objects which, in turn, list the objects contained
      in one single page.
8416 - Each object referenced in /Kids has a /Type/Page parameter, together with /Contents, which lists the
8417 objects of the current page.
8418
8419 Object references are of the form : "x y R", where "x" is the object number.
8420
8421 Of course, anything can be in any order, otherwise it would not be funny ! Consider the following
8422 example :
8423
8424 (1) 5 0 obj
8425 << ... /Pages 1 0 R ... >>
8426 endobj
8427
8428 (2) 1 0 obj
8429 << ... /Count 1 /Kids[6 0 R] ... /Type/Pages ... >>
8430 endobj
8431
8432 (3) 6 0 obj
8433 << ... /Type/Page ... /Parent 1 0 R ... /Contents [10 0 R 11 0 R ... x 0 R]
8434 endobj
8435
8436 Object #5 says that object #1 contains the list of page contents (in this example, there is only one page,
8437 referenced by object #6).
8438 Object #6 says that the objects #10, #11 through #x are contained into the same page.
8439 The quadruple indirection comes when you are handling one of the objects referenced in object #6 and you
8440 need to retrieve their page number...
8441
8442 Of course, you cannot rely on the fact that all objects appear in logical order.
8443
8444 And, of course #2, there may be no page catalog at all ! in such cases, objects containing drawing
8445 instructions will have to be considered as a single page, whose number will be sequential.
8446
8447 And, of course #3, as this is the case with the official PDF 1.7 Reference from Adobe, there can be a
8448 reference to a non-existing object which was meant to contain the /Kids parameter (!). In this case,
8449 taking the ordinal number of objects of type (3) gives the page number minus one.
8450
8451 One mystery is that the PDF 1.7 Reference file contains 1310 pages but only 1309 are recognized here...
8452
8453 ==============================================================================================================*/
8454class PdfTexterPageMap extends PdfObjectBase
8455 {
// Object ids collected from the /Pages parameter of the /Type/Catalog objects.
// There should be only one catalog, but nothing can be taken for granted when handling pdf contents : this is
// why an array is used here.
protected $PageCatalogs = array ( ) ;
// /Type/Pages entries, which tell which objects describe the individual pages (such entries can be nested).
// Each entry holds the 'object', 'parent', 'count' and 'kids' values collected by Peek().
protected $PageKids = array ( ) ;
// Terminal entries : they directly give the ids of the objects belonging to a page
public $PageContents = array ( ) ;
// The three arrays above are indexed by object id and filled with the data collected by the Peek() method.

// Objects that could be referenced from other text objects as XObjects, using the /TPLx notation.
// Indexed by object id ; values are the object text data supplied to AddTemplateObject().
protected $TemplateObjects = array ( ) ;

// Page numbers (keys) and list of the objects contained in each page (values). This array is created by the
// MapCatalog() method, once the Peek() method has collected page contents & object information.
public $Pages = array ( ) ;
// Page attributes
public $PageAttributes = array ( ) ;

// Cache of the resource mappings already analyzed by GetResourceMappings().
// A /Resources parameter can either refer to an object (/Resources 2 0 R) or supply inline mappings
// (/Resources << ... >>) ; since the same object can be referenced by many /Resources parameters throughout
// the pdf file, the results of the first analysis are kept for later references.
private $ResourceMappingCache = array ( ) ;
// List of XObject names - Used by the IsValidTemplate() function
private $XObjectNames = array ( ) ;
8481
8482
8483 /*--------------------------------------------------------------------------------------------------------------
8484
8485 CONSTRUCTOR
    Creates a PdfTexterPageMap object. Actually, nothing significant is performed here, as this class' goal
    is to be used internally by PdfTexter.
8488
8489 *-------------------------------------------------------------------------------------------------------------*/
// Nothing specific to do here : the construction is entirely delegated to the parent class.
public function __construct ( )
{
    parent::__construct ( ) ;
}
8494
8495
8496 /*--------------------------------------------------------------------------------------------------------------
8497
8498 NAME
    AddTemplateObject - Adds an object that could be referenced as a template.
8500
8501 PROTOTYPE
8502 $pagemap -> AddTemplateObject ( $object_id, $object_text_data ) ;
8503
8504 DESCRIPTION
8505 Adds an object that may be referenced as a template from another text object, using the /TPLx notation.
8506
8507 PARAMETERS
8508 $object_id (integer) -
8509 Id of the object that may contain a resource mapping entry.
8510
    $object_text_data (string) -
8512 Object contents.
8513
8514 *-------------------------------------------------------------------------------------------------------------*/
// Records the text data of an object that may later be referenced as a template (/TPLx notation).
// Registering the same object id again replaces the data previously stored for it.
public function AddTemplateObject ( $object_id, $object_text_data )
{
    $templates                 =& $this -> TemplateObjects ;
    $templates [ $object_id ]  =  $object_text_data ;
}
8519
8520
8521 /*--------------------------------------------------------------------------------------------------------------
8522
8523 NAME
8524 GetResourceMappings - Gets resource mappings specified after a /Resources parameter.
8525
8526 PROTOTYPE
8527 $result = $this -> GetResourceMappings ( $object_id, $object_data, $parameter, $pdf_object_list ) ;
8528
8529 DESCRIPTION
8530 Most of the time, objects containing a page description (/Type /Page) also contain a /Resources parameter,
8531 which may be followed by one of the following constructs :
8532 - A reference to an object, such as :
8533 /Resources 2 0 R
8534 - Or an inline set of parameters, such as font or xobject mappings :
8535 /Resources << /Font<</F1 10 0 R ...>> /XObject <</Im0 27 0 R ...>>
    This method extracts alias/object mappings for the parameter specified by $parameter (it can be for
    example '/Font' or '/XObject') and returns these mappings as an associative array.
8538
8539 PARAMETERS
8540 $object_id (integer) -
8541 Id of the object that may contain a resource mapping entry.
8542
8543 $object_data (string) -
8544 Object contents.
8545
8546 $parameter (string) -
8547 Parameter defining resource mapping, for example /Font or /XObject.
8548
8549 $pdf_object_list (associative array) -
8550 Array of object id/object data associations, for all objects defined in the pdf file.
8551
8552 RETURN VALUE
8553 The list of resource mappings for the specified parameter, as an associative array, whose keys are the
8554 resource aliases and values are the corresponding object ids.
8555 The method returns an empty array if the specified object does not contain resource mappings or does
8556 not contain the specified parameter.
8557
8558 *-------------------------------------------------------------------------------------------------------------*/
// Extracts the alias => object id mappings defined for $parameter ('/Font', '/XObject'...) by the /Resources
// parameter found in $object_data.
// - $object_id is the id of the object that owns $object_data ; it does not influence the result
// - $pdf_object_list gives the contents of all the pdf objects, indexed by object id ; it is used when
//   /Resources (or /XObject) is an object reference ("x y R") instead of an inline "<< ... >>" construct
// Returns an associative array whose keys are the resource aliases (such as "/F1") and whose values are the
// corresponding object ids, or an empty array if no mapping could be found.
protected function GetResourceMappings ( $object_id, $object_data, $parameter, $pdf_object_list )
{
    // Id of the object referenced by /Resources ; remains false for inline mappings, which are never cached
    $resource_id = false ;

    // The /Resources parameter refers to an existing PDF object
    if ( preg_match ( '#/Resources \s* (?P<object_id> \d+) \s+ \d+ \s+ R#ix', $object_data, $match ) )
    {
        $resource_id = $match [ 'object_id' ] ;

        // The cache is indexed by the id of the REFERENCED object, so that all the pages sharing the same
        // resource object reuse the results of the first analysis
        if ( isset ( $this -> ResourceMappingCache [ $resource_id ] [ $parameter ] ) )
            return ( $this -> ResourceMappingCache [ $resource_id ] [ $parameter ] ) ;

        // Check that the object that is referred to exists
        if ( ! isset ( $pdf_object_list [ $resource_id ] ) )
            return ( array ( ) ) ;

        $data = $pdf_object_list [ $resource_id ] ;
    }
    // The /Resources parameter is followed by inline mappings
    else if ( preg_match ( '#/Resources \s* <#ix', $object_data, $match, PREG_OFFSET_CAPTURE ) )
    {
        // Keep everything that starts at the opening angle bracket
        $data = substr ( $object_data, $match [0] [1] + strlen ( $match [0] [0] ) - 1 ) ;
    }
    else
        return ( array ( ) ) ;

    // Whatever we will be analyzing (an object contents or inline contents following the /Resources parameter),
    // the text will be enclosed within double angle brackets (<< ... >>)

    // A small kludge for /XObject which specify an object reference ("15 0 R") instead of XObjects mappings
    // ("<< ...>>" )
    if ( $parameter === '/XObject' && preg_match ( '#/XObject \s+ (?P<obj> \d+) \s+ \d+ \s+ R#ix', $data, $match ) )
    {
        // A reference to a non-existing object cannot provide any mapping
        if ( ! isset ( $pdf_object_list [ $match [ 'obj' ] ] ) )
            return ( array ( ) ) ;

        $data = '/XObject ' . $pdf_object_list [ $match [ 'obj' ] ] ;
    }

    // The parameter name is quoted so that it is always matched literally
    $quoted_parameter = preg_quote ( $parameter, '#' ) ;

    if ( ! preg_match ( "#$quoted_parameter \s* << \s* (?P<mappings> .*?) \s* >>#imsx", $data, $match ) )
        return ( array ( ) ) ;

    preg_match_all ( '# (?P<mapping> / [^\s]+) \s+ (?P<object_id> \d+) \s+ \d+ \s+ R#ix', $match [ 'mappings' ], $matches ) ;

    // Mapping extraction loop
    $mappings = array ( ) ;

    foreach ( $matches [ 'mapping' ] as $index => $alias )
        $mappings [ $alias ] = $matches [ 'object_id' ] [ $index ] ;

    // Put results for referenced objects in cache
    if ( $resource_id !== false )
        $this -> ResourceMappingCache [ $resource_id ] [ $parameter ] = $mappings ;

    return ( $mappings ) ;
}
8614
8615
8616 /*--------------------------------------------------------------------------------------------------------------
8617
8618 NAME
8619 Peek - Peeks page information from a pdf object.
8620
8621 PROTOTYPE
    $pagemap -> Peek ( $object_id, $object_data, $pdf_objects ) ;
8623
8624 DESCRIPTION
8625 Retrieves page information which can be of type (1), (2) or (3), as described in the class comments.
8626
8627 PARAMETERS
8628 $object_id (integer) -
8629 Id of the current pdf object.
8630
8631 $object_data (string) -
8632 Pdf object contents.
8633
8634 $pdf_objects (associative array) -
8635 Objects defined in the pdf file, as an associative array whose keys are object numbers and
8636 values object data.
8637 This parameter is used for /Type/Page objects which have a /Resource parameter that references
8638 an existing object instead of providing font mappings and other XObject mappings inline,
8639 enclosed within double angle brackets (<< /Font ... >>).
8640
8641 *-------------------------------------------------------------------------------------------------------------*/
public function Peek ( $object_id, $object_data, $pdf_objects )
{
	// Classifies one PDF object from its /Type entry and records what it describes in one of
	// $this -> PageCatalogs, $this -> PageKids or $this -> PageContents. Objects of any other type are ignored.

	// Page catalog (/Type/Catalog and /Pages x 0 R)
	if ( preg_match ( '#/Type \s* /Catalog#ix', $object_data ) && $this -> GetObjectReferences ( $object_id, $object_data, '/Pages', $references ) )
		$this -> PageCatalogs = array_merge ( $this -> PageCatalogs, $references ) ;
	// Page tree node, listing the objects that make up its pages (/Type/Pages and /Count x /Kids[x1 0 R ... xn 0 R])
	else if ( preg_match ( '#/Type \s* /Pages#ix', $object_data ) )
	{
		if ( $this -> GetObjectReferences ( $object_id, $object_data, '/Kids', $references ) )
		{
			// Sometimes, a reference can be the one of an object that contains the real reference ; in the following example,
			// the actual page contents are not in object 4, but in object 5
			//	/Kids 4 0 R
			//	...
			//	4 0 obj
			//	[5 0 R]
			//	endobj
			$new_references = array ( ) ;

			foreach ( $references as $reference )
			{
				// Only an object made of a single bracketed array is treated as an indirection
				if ( ! isset ( $pdf_objects [ $reference ] ) ||
				     ! preg_match ( '/^ \s* (?P<ref> \[ [^]]+ \]) \s*$/imsx', $pdf_objects [ $reference ] ) )
				{
					$new_references [] = $reference ;
				}
				else
				{
					$this -> GetObjectReferences ( $reference, $pdf_objects [ $reference ], '', $sub_references ) ;
					$new_references = array_merge ( $new_references, $sub_references ) ;
				}
			}

			// Get kid count (knowing that sometimes, it is missing...)
			preg_match ( '#/Count \s+ (?P<count> \d+)#ix', $object_data, $match ) ;
			$page_count = ( isset ( $match [ 'count' ] ) ) ? ( int ) $match [ 'count' ] : false ;

			// Get parent object id
			preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ;
			$parent = ( isset ( $match [ 'parent' ] ) ) ? ( int ) $match [ 'parent' ] : false ;

			$this -> PageKids [ $object_id ] = array
			(
				'object' => $object_id,
				'parent' => $parent,
				'count'  => $page_count,
				'kids'   => $new_references
			) ;
		}
	}
	// Object listing the other objects that are contained in this page (/Type/Page and /Contents[x1 0 R ... xn 0 R])
	else if ( preg_match ( '#/Type \s* /Page\b#ix', $object_data ) )
	{
		if ( $this -> GetObjectReferences ( $object_id, $object_data, '/Contents', $references ) )
		{
			preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ;
			$parent   = ( isset ( $match [ 'parent' ] ) ) ? ( int ) $match [ 'parent' ] : false ;
			$fonts    = $this -> GetResourceMappings ( $object_id, $object_data, '/Font', $pdf_objects ) ;
			$xobjects = $this -> GetResourceMappings ( $object_id, $object_data, '/XObject', $pdf_objects ) ;

			// Find the width and height of the page (/MediaBox parameter).
			// Rectangle coordinates are PDF numbers, so they may be signed and/or real (eg, [0 0 595.28 841.89]) ;
			// an integer-only pattern would reject such boxes and silently fall back to the default size below.
			$number = '[+-]? (?: \d+ (?: \. \d* )? | \. \d+ )' ;
			$mediabox_re = '#/MediaBox \s* \[ \s* (?P<x1> ' . $number . ' ) \s+ (?P<y1> ' . $number . ' ) \s+ ' .
			               '(?P<x2> ' . $number . ' ) \s+ (?P<y2> ' . $number . ' ) \s* \]#imsx' ;

			if ( preg_match ( $mediabox_re, $object_data, $match ) )
			{
				$width  = ( float ) $match [ 'x2' ] - ( float ) $match [ 'x1' ] + 1 ;
				$height = ( float ) $match [ 'y2' ] - ( float ) $match [ 'y1' ] + 1 ;
			}
			// Otherwise, fix an arbitrary width and length (but this should never happen, because all pdf files are correct, isn't it?)
			else
			{
				$width  = 595 ;
				$height = 850 ;
			}

			// Yes ! some /Contents parameters may designate another object which contains references to the real text contents
			// in the form : [x 0 R y 0 R etc.], so we have to dig into it...
			$new_references = array ( ) ;

			foreach ( $references as $reference )
			{
				// We just need to check that the object contains something like :
				//	[x 0 R y 0 R ...]
				// and nothing more
				if ( isset ( $pdf_objects [ $reference ] ) && preg_match ( '#^\s* \[ [^]]+ \]#x', $pdf_objects [ $reference ] ) &&
				     $this -> GetObjectReferences ( $reference, $pdf_objects [ $reference ], '', $nested_references ) )
					$new_references = array_merge ( $new_references, $nested_references ) ;
				else
					$new_references [] = $reference ;
			}

			$this -> PageContents [ $object_id ] = array
			(
				'object'   => $object_id,
				'parent'   => $parent,
				'contents' => $new_references,
				'fonts'    => $fonts,
				'xobjects' => $xobjects,
				'width'    => $width,
				'height'   => $height
			) ;
		}
	}
	// None of the above, but object contains /XObject's and maybe more...
	// Note that such entries carry no 'width'/'height' keys.
	else if ( preg_match ( '#/Type \s* /XObject\b#ix', $object_data ) )
	{
		preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ;
		$parent   = ( isset ( $match [ 'parent' ] ) ) ? ( int ) $match [ 'parent' ] : false ;
		$fonts    = $this -> GetResourceMappings ( $object_id, $object_data, '/Font', $pdf_objects ) ;
		$xobjects = $this -> GetResourceMappings ( $object_id, $object_data, '/XObject', $pdf_objects ) ;

		$this -> GetObjectReferences ( $object_id, $object_data, '/Contents', $references ) ;

		$this -> PageContents [ $object_id ] = array
		(
			'object'   => $object_id,
			'parent'   => $parent,
			'contents' => $references,
			'fonts'    => $fonts,
			'xobjects' => $xobjects
		) ;
	}
}
8764
8765
8766 /*--------------------------------------------------------------------------------------------------------------
8767
8768 NAME
8769 ProcessTemplateReferences - Replace template references with actual text contents.
8770
8771 PROTOTYPE
		$text = $pagemap -> ProcessTemplateReferences ( $page_number, $text_data ) ;
8773
8774 DESCRIPTION
8775 Replaces template references of the form "/TPLx Do" with the actual text contents.
8776
8777 PARAMETERS
8778 $page_number (integer) -
8779 Page number of the object that contains the supplied object data.
8780
8781 $text_data (string)
8782 Text drawing instructions that are to be processed.
8783
8784 RETURN VALUE
8785 Returns the original text, where all template references have been replaced with the contents of the
8786 object they refer to.
8787
8788 *-------------------------------------------------------------------------------------------------------------*/
public function ProcessTemplateReferences ( $page_number, $text_data )
{
	// Replaces the template references ("/name Do") found in $text_data for the given page and returns the
	// resulting text ; the text is returned unchanged when the page is unknown or has no xobjects.

	// Nothing to do for a page that has not been mapped
	if ( ! isset ( $this -> Pages [ $page_number ] ) )
		return ( $text_data ) ;

	// Loop through the PageContents array to find which one(s) may be subject to template reference replacements
	foreach ( $this -> PageContents as $page_contents )
	{
		// Only entries that belong to the requested page AND declare xobjects can hold template references.
		// empty() is used instead of count() because some entries only hold a 'page' key (MapKids creates such
		// entries for kids that Peek() did not record), and count() on a missing key fails on PHP 8.
		if ( ! isset ( $page_contents [ 'page' ] ) || $page_contents [ 'page' ] != $page_number || empty ( $page_contents [ 'xobjects' ] ) )
			continue ;

		$template_searches     = array ( ) ;
		$template_replacements = array ( ) ;

		$this -> __get_replacements ( $page_contents, $template_searches, $template_replacements ) ;
		$text_data = self::PregStrReplace ( $template_searches, $template_replacements, $text_data ) ;
	}

	return ( $text_data ) ;
}
8813
8814
8815 // __get_replacements -
8816 // Recursively gets the search/replacement strings for template references.
private function __get_replacements ( $page_contents, &$searches, &$replacements, $objects_seen = array ( ) )
{
	// Appends to $searches/$replacements one regex/replacement pair per xobject of $page_contents that has an
	// entry in $this -> TemplateObjects, then recurses into the xobjects' own xobjects.
	// $objects_seen is passed by value : it holds the objects met on the current path, to stop on circular references.

	// Entries without an 'xobjects' key have nothing to contribute
	if ( empty ( $page_contents [ 'xobjects' ] ) )
		return ;

	foreach ( $page_contents [ 'xobjects' ] as $template_name => $template_object )
	{
		if ( ! isset ( $this -> TemplateObjects [ $template_object ] ) || isset ( $objects_seen [ $template_object ] ) )
			continue ;

		$template = $this -> TemplateObjects [ $template_object ] ;

		// The name comes from the PDF file : quote it so that characters such as '.', '+' or the '#' delimiter
		// are matched literally instead of being interpreted as (or breaking) the regular expression
		$searches []     = '#(' . preg_quote ( $template_name, '#' ) . ' \s+ Do\b )#msx' ;
		$replacements [] = '!PDFTOTEXT_TEMPLATE_' . substr ( $template_name, 1 ) . ' ' . $template ;
		$objects_seen [ $template_object ] = $template_object ;

		if ( isset ( $this -> PageContents [ $template_object ] ) )
			$this -> __get_replacements ( $this -> PageContents [ $template_object ], $searches, $replacements, $objects_seen ) ;
	}
}
8833
8834
8835
8836 /*--------------------------------------------------------------------------------------------------------------
8837
8838 NAME
		MapObjects - Builds a correspondence between object and page numbers.

	    PROTOTYPE
		$pagemap -> MapObjects ( $objects ) ;

	    DESCRIPTION
		Builds a correspondence between object and page numbers. The page number corresponding to an object id
		will then be available using the array notation.
8847
8848 NOTES
8849 This method behaves as if there could be more than one page catalog in the same file, but I've not yet
8850 encountered this case.
8851
8852 *-------------------------------------------------------------------------------------------------------------*/
public function MapObjects ( $objects )
{
	// Fills $this -> Pages (through MapKids) from the data collected by Peek().
	// $objects is the object-id => object-data array ; only its keys are used, when everything has to be put on page 1.

	// Page numbering starts at 1. The counter is passed by reference to MapKids, so it must be initialized
	// before ANY call : left undefined, the first page would be stored under a null key instead of 1.
	$current_page = 1 ;

	// PDF files created short after the birth of Earth may have neither a page catalog nor page contents descriptions
	if ( ! count ( $this -> PageCatalogs ) )
	{
		// Later, during Pleistocene, references to page kids started to appear...
		if ( count ( $this -> PageKids ) )
		{
			foreach ( array_keys ( $this -> PageKids ) as $catalog )
				$this -> MapKids ( $catalog, $current_page ) ;
		}
		// Neither catalog nor kids : group everything onto one page
		else
			$this -> Pages [1] = array_keys ( $objects ) ;
	}
	// This is the ideal situation : there is a catalog that allows us to gather indirectly all page data
	else
	{
		foreach ( $this -> PageCatalogs as $catalog )
		{
			if ( isset ( $this -> PageKids [ $catalog ] ) )
				$this -> MapKids ( $catalog, $current_page ) ;
			// Well, almost ideal : it may happen that the page catalog refers to a non-existing object :
			// in this case, we behave the same as if there were no page catalog at all : group everything
			// onto one page
			else
				$this -> Pages [1] = array_keys ( $objects ) ;
		}
	}
}
8886
8887
8888 /*--------------------------------------------------------------------------------------------------------------
8889
8890 NAME
		MapKids - Establishes a correspondence between page kids and a current page number.

	    PROTOTYPE
		$pagemap -> MapKids ( $catalog, $page ) ;
8895
8896 DESCRIPTION
8897 Tries to assign a page number to all page description objects that have been collected by the Peek()
8898 method.
8899 Also creates the Pages associative array, whose keys are page numbers and whose values are the ids of
8900 the objects that the page contains.
8901
8902 EXAMPLE
8903 The following example gives an overview of a possible layout for page catalogs ; it describes which
8904 objects contain what.
8905 Lines starting with "#x", where "x" is a number, stands for a PDF object definition, which will start
8906 with "x 0 obj" in the PDF file.
8907 Whenever numbers are referenced (other than those prefixed with a "#"), it means "reference to the
8908 specified object.
8909 For example, "54" will refer to object #54, and will be given as "54 0 R" in the PDF file.
8910 The numbers at the beginning of each line are just "step numbers", which will be referenced in the
8911 explanations after the example :
8912
8913 (01) #1 : /Type/Catalog /Pages 54
8914 (02) -> #54 : /Type/Pages /Kids[3 28 32 58] /Count 5
8915 (03) -> #3 : /Type/Page /Parent 54 /Contents[26]
8916 (04) -> #26 : page contents
8917 (05) -> #28 : /Type/Page /Parent 54 /Contents[30 100 101 102 103 104]
8918 (06) -> #30 : page contents
8919 (07) -> #32 : /Type/Page /Parent 54 /Contents[34]
8920 (08) -> #34 : page contents
8921 (09) -> #58 : /Type/Pages /Parent 54 /Count 2 /Kids[36 40]
8922 (10) -> #36 : /Type/Page /Parent 58 /Contents[38]
8923 (11) -> #38 : page contents
8924 (12) -> #40 : /Type/Page /Parent 58 /Contents[42]
8925 (13) -> #42 : page contents
8926
8927 Explanations :
8928 (01) Object #1 contains the page catalog ; it states that a further description of the page
8929 contents is given by object #54.
8930 Note that it could reference multiple page descriptions, such as : /Pages [54 68 99...]
		     (although I have not met this case so far)
		(02) Object #54 in turn says that it has "kids", described by objects #3, #28, #32 and #58. It
8933 also says that it has 5 pages (/Count parameter) ; but wait... the /Kids parameter references
8934 4 objects while the /Count parameter states that we have 5 pages : what happens ? we will
8935 discover it in the explanations below.
8936 (03) Object #3 states that it is aimed for page description (/Type/Page) ; the page contents
8937 will be found in object #26, specified after the /Contents parameter. Note that here again,
8938 multiple objects could be referenced by the /Contents parameter but, in our case, there is
8939 only one, 26. Object #3 also says that its parent object (in the page catalog) is object
8940 #54, defined in (01).
8941 Since this is the first page we met, it will have page number 1.
8942 (04) ... object #26 contains the Postscript instructions to draw page #1
8943 (05) Object #28 has the same type as #3 ; its page contents can be located in object #30 (06)
8944 The same applies for object #32 (07), whose page contents are given by object #34 (08).
8945 So, (05) and (07) will be pages 2 and 3, respectively.
8946 (09) Now, it starts to become interesting : object #58 does not directly lead to an object
8947 containing Postscript instructions as did objects #3, #28 and #32 whose parent is #54, but
8948 to yet another page catalog which contains 2 pages (/Count 2), described by objects #36 and
8949 #40. It's not located at the same position as object #54 in the hierarchy, so it shows that
8950 page content descriptions can be recursively nested.
8951 (10) Object #36 says that we will find the page contents in object #38 (which will be page 4)
8952 (12) ... and object #40 says that we will find the page contents in object #42 (and our final
8953 page, 5)
8954
8955 *-------------------------------------------------------------------------------------------------------------*/
protected function MapKids ( $catalog, &$page )
{
	// Walks the page tree node $catalog (an entry of $this -> PageKids) in order, assigning the page number
	// $page to each leaf and incrementing it. $page is updated in place so that numbering continues across calls.
	if ( ! isset ( $this -> PageKids [ $catalog ] ) )
		return ;

	// Each kid is classified on its own. Deciding from the first kid only would drop every leaf page of a
	// node such as /Kids [58 3 28], where #58 is a nested /Pages node and #3, #28 are pages, and would
	// read an undefined offset when the kid list is empty.
	foreach ( $this -> PageKids [ $catalog ] [ 'kids' ] as $item )
	{
		// The kid is itself a /Pages /Kids[] node ; this adds a level of indirection, and we have to
		// recursively process it
		if ( isset ( $this -> PageKids [ $item ] ) )
		{
			$this -> MapKids ( $item, $page ) ;
			continue ;
		}

		// Otherwise the kid stands for one page. A kid that Peek() did not record in PageContents still
		// consumes a page number, with an empty list of contents.
		$this -> PageContents [ $item ] [ 'page' ] = $page ;
		$this -> Pages [ $page ] = ( isset ( $this -> PageContents [ $item ] [ 'contents' ] ) ) ?
						$this -> PageContents [ $item ] [ 'contents' ] : array ( ) ;

		// Page dimensions are only known for entries that carry them (XObject entries do not)
		if ( isset ( $this -> PageContents [ $item ] [ 'width' ] ) )
		{
			$this -> PageAttributes [ $page ] = array
			(
				'width'  => $this -> PageContents [ $item ] [ 'width' ],
				'height' => $this -> PageContents [ $item ] [ 'height' ]
			) ;
		}

		$page ++ ;
	}
}
9002
9003
9004 /*--------------------------------------------------------------------------------------------------------------
9005
9006 NAME
9007 GetMappedFonts - Retrieves the mapped fonts per page
9008
9009 PROTOTYPE
9010 $array = $pagemap -> GetMappedFonts ( ) ;
9011
9012 DESCRIPTION
		Gets the mapped fonts, per page. XObjects are traversed, to retrieve additional font aliases defined
9014 by them.
9015 This function is used by the PdfTexter class to add additional entries to the FontMap object,
9016 ensuring that each reference to a font remains local to a page.
9017
9018 RETURN VALUE
9019 Returns an array of associative arrays which have the following entries :
9020 - 'page' :
9021 Page number.
9022 - 'xobject-name' :
9023 XObject name, that can define further font aliases. This entry is set to the empty string for
9024 global font aliases.
9025 - 'font-name' :
9026 Font name (eg, "/F1", "/C1_0", etc.).
9027 - 'object' :
9028 Object defining the font attributes, such as character map, etc.
9029
9030 *-------------------------------------------------------------------------------------------------------------*/
public function GetMappedFonts ( )
{
	// Returns one array ( 'page', 'xobject-name', 'font-name', 'object' ) per font alias, for every page
	// reachable from the page catalogs. Page-level aliases have an empty 'xobject-name'.
	$mapped_fonts = array ( ) ;
	$current_page = 0 ;

	foreach ( $this -> PageCatalogs as $catalog )
	{
		if ( ! isset ( $this -> PageKids [ $catalog ] ) )
			continue ;

		// Page trees can be nested (a kid may itself be a /Pages node) : flatten the tree first so that
		// every leaf gets its own page number, instead of counting a whole sub-tree as one page
		$page_objects = array ( ) ;
		$this -> __flatten_page_tree ( $catalog, $page_objects ) ;

		foreach ( $page_objects as $page_object )
		{
			$current_page ++ ;

			if ( ! isset ( $this -> PageContents [ $page_object ] ) )
				continue ;

			$page_contents   = $this -> PageContents [ $page_object ] ;
			$page_fonts      = ( isset ( $page_contents [ 'fonts'    ] ) ) ? $page_contents [ 'fonts'    ] : array ( ) ;
			$page_xobjects   = ( isset ( $page_contents [ 'xobjects' ] ) ) ? $page_contents [ 'xobjects' ] : array ( ) ;
			$associations    = array ( ) ;
			$xobjects_mapped = false ;

			foreach ( $page_fonts as $font_name => $font_object )
			{
				$mapped_fonts [] = array
				(
					'page'         => $current_page,
					'xobject-name' => '',
					'font-name'    => $font_name,
					'object'       => $font_object
				) ;

				$associations [ ":$font_name" ] = $font_object ;

				// XObject aliases are emitted right after the first page-level font. One traversal is enough :
				// further ones would add nothing, since $associations already holds every alias found.
				if ( ! $xobjects_mapped )
				{
					$this -> __map_recursive ( $current_page, $page_xobjects, $mapped_fonts, $associations ) ;
					$xobjects_mapped = true ;
				}
			}

			// A page may define no font of its own and still draw XObjects that do
			if ( ! $xobjects_mapped )
				$this -> __map_recursive ( $current_page, $page_xobjects, $mapped_fonts, $associations ) ;
		}
	}

	return ( $mapped_fonts ) ;
}


// __flatten_page_tree -
//	Appends to $page_objects, in order, the ids of the leaf kids found under the page tree node $node.
//	$visited (passed by value) holds the nodes met on the current path, to stop on circular /Kids references.
private function __flatten_page_tree ( $node, &$page_objects, $visited = array ( ) )
{
	if ( isset ( $visited [ $node ] ) || ! isset ( $this -> PageKids [ $node ] ) )
		return ;

	$visited [ $node ] = true ;

	foreach ( $this -> PageKids [ $node ] [ 'kids' ] as $kid )
	{
		if ( isset ( $this -> PageKids [ $kid ] ) )
			$this -> __flatten_page_tree ( $kid, $page_objects, $visited ) ;
		else
			$page_objects [] = $kid ;
	}
}
9073
9074
9075 // __map_recursive -
9076 // Recursively collects font aliases for XObjects.
private function __map_recursive ( $page_number, $xobjects, &$mapped_fonts, &$associations, $objects_seen = array ( ) )
{
	// Appends to $mapped_fonts the font aliases defined by each xobject of $xobjects (name => object id) that
	// has an entry in $this -> PageContents, registers the xobject name in $this -> XObjectNames, then
	// recurses into the xobject's own xobjects.
	// $associations ("xobject-name:font-name" => font object) prevents the same alias from being added twice.
	// $objects_seen (passed by value) holds the xobjects met on the current path : without it, xobjects that
	// reference each other would recurse forever.
	if ( empty ( $xobjects ) )
		return ;

	foreach ( $xobjects as $xobject_name => $xobject_value )
	{
		if ( ! isset ( $this -> PageContents [ $xobject_value ] ) || isset ( $objects_seen [ $xobject_value ] ) )
			continue ;

		$xobject = $this -> PageContents [ $xobject_value ] ;

		// Entries may lack these keys (eg, entries that only hold a 'page' number)
		$xobject_fonts  = ( isset ( $xobject [ 'fonts'    ] ) ) ? $xobject [ 'fonts'    ] : array ( ) ;
		$nested_objects = ( isset ( $xobject [ 'xobjects' ] ) ) ? $xobject [ 'xobjects' ] : array ( ) ;

		foreach ( $xobject_fonts as $font_name => $font_object )
		{
			$key = "$xobject_name:$font_name" ;

			if ( isset ( $associations [ $key ] ) )
				continue ;

			$mapped_fonts [] = array
			(
				'page'         => $page_number,
				'xobject-name' => $xobject_name,
				'font-name'    => $font_name,
				'object'       => $font_object
			) ;

			$associations [ $key ] = $font_object ;
		}

		$this -> XObjectNames [ $xobject_name ] = 1 ;

		$path = $objects_seen ;
		$path [ $xobject_value ] = true ;

		$this -> __map_recursive ( $page_number, $nested_objects, $mapped_fonts, $associations, $path ) ;
	}
}
9104
9105
9106
9107 /*--------------------------------------------------------------------------------------------------------------
9108
9109 NAME
		IsValidXObjectName - Checks if the specified name is the one of a valid XObject.
9111
9112 PROTOTYPE
9113 $status = $pagemap -> IsValidXObjectName ( $name ) ;
9114
9115 DESCRIPTION
9116 Checks if the specified name is a valid XObject defining its own set of font aliases.
9117
9118 PARAMETERS
9119 $name (string) -
9120 Name of the XObject to be checked.
9121
9122 RETURN VALUE
9123 Returns true if the specified XObject exists and defines its own set of font aliases, false otherwise.
9124
9125 *-------------------------------------------------------------------------------------------------------------*/
public function IsValidXObjectName ( $name )
{
	// A name is valid when it has an entry in $this -> XObjectNames
	$is_registered = isset ( $this -> XObjectNames [ $name ] ) ;

	return ( $is_registered ) ;
}
9128 }
9129
9130
9131/**************************************************************************************************************
9132 **************************************************************************************************************
9133 **************************************************************************************************************
9134 ****** ******
9135 ****** ******
9136 ****** IMAGE MANAGEMENT ******
9137 ****** ******
9138 ****** ******
9139 **************************************************************************************************************
9140 **************************************************************************************************************
9141 **************************************************************************************************************/
9142
9143/*==============================================================================================================
9144
9145 class PdfImage -
9146 Holds image data coming from pdf.
9147
9148 ==============================================================================================================*/
abstract class PdfImage extends PdfObjectBase
{
	// Image resource that can be used to process image data, using the php imagexxx() functions ;
	// false when no resource has been created, or after it has been destroyed
	public $ImageResource = false ;
	// Original image data, as supplied to the constructor
	protected $ImageData ;
	// True when the constructor was asked NOT to create an image resource ; in that case, only the original
	// image data is available, and SaveAs() writes it as is when JPEG output is requested
	protected $NoResourceCreated ;


	/*--------------------------------------------------------------------------------------------------------------

	    CONSTRUCTOR
		Creates a PdfImage object with a resource that can be used with imagexxx() php functions.

	    PARAMETERS
		$image_data (string) -
			Image data ; always kept in the ImageData property.

		$no_resource_created (boolean) -
			When true, CreateImageResource() is not called and ImageResource remains false.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $image_data, $no_resource_created = false )
	{
		$this -> ImageData         = $image_data ;
		$this -> NoResourceCreated = $no_resource_created ;

		if ( ! $no_resource_created )
			$this -> ImageResource = $this -> CreateImageResource ( $image_data ) ;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    DESTRUCTOR
		Destroys the associated image resource.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __destruct ( )
	{
		$this -> DestroyImageResource ( ) ;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
		CreateImageResource - creates an image resource from the supplied image data.

	    PROTOTYPE
		$resource = $this -> CreateImageResource ( $data ) ;

	    DESCRIPTION
		Creates an image resource from the supplied image data.
		Whatever the input format, the internal format will be the one used by the gd library.

	    PARAMETERS
		$data (string) -
			Image data.

	 *-------------------------------------------------------------------------------------------------------------*/
	abstract protected function CreateImageResource ( $image_data ) ;


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
		DestroyImageResource - Destroys the allocated image resource.

	    PROTOTYPE
		$this -> DestroyImageResource ( ) ;

	    DESCRIPTION
		Destroys the allocated image resource, using the libgd imagedestroy() function. This method can be
		overridden by derived class if the underlying image resource does not come from the gd lib.
		The ImageResource property is reset to false afterwards, so that calling this method more than once
		(eg, explicitly, then from the destructor) never operates on an already destroyed resource.

	 *-------------------------------------------------------------------------------------------------------------*/
	protected function DestroyImageResource ( )
	{
		if ( ! $this -> ImageResource )
			return ;

		// imagedestroy() only matters for real resources : since PHP 8.0, gd images are objects that are
		// released automatically, and the function does nothing
		if ( is_resource ( $this -> ImageResource ) )
			imagedestroy ( $this -> ImageResource ) ;

		$this -> ImageResource = false ;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
		SaveAs - Saves the current image to a file.

	    PROTOTYPE
		$pdfimage -> SaveAs ( $output_file, $image_type = IMG_JPEG ) ;

	    DESCRIPTION
		Saves the current image resource to the specified output file, in the specified format.
		When there is no image resource but the original data has been kept without creating one, the data
		is written as is if JPEG output is requested.

	    PARAMETERS
		$output_file (string) -
			Output filename. A null value sends the image to the standard output instead.

		$image_type (integer) -
			Output format. Can be any of the predefined php constants IMG_*.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function SaveAs ( $output_file, $image_type = IMG_JPEG )
	{
		if ( ! $this -> ImageResource )
		{
			if ( $this -> NoResourceCreated && $image_type == IMG_JPEG )
			{
				// The gd functions below write to the standard output when the filename is null ; do the
				// same here, since file_put_contents() cannot work without a filename
				if ( $output_file === null )
					echo $this -> ImageData ;
				else
					file_put_contents ( $output_file, $this -> ImageData ) ;
			}
			else if ( PdfToText::$DEBUG )
				warning ( new PdfToTextDecodingException ( "No image resource allocated." ) ) ;

			return ;
		}

		// Bitmask of the image formats supported by the installed gd library
		$image_types = imagetypes ( ) ;

		switch ( $image_type )
		{
			case IMG_JPEG :
			case IMG_JPG :
				if ( ! ( $image_types & IMG_JPEG ) && ! ( $image_types & IMG_JPG ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support JPG images." ) ) ;

				imagejpeg ( $this -> ImageResource, $output_file, 100 ) ;
				break ;

			case IMG_GIF :
				if ( ! ( $image_types & IMG_GIF ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support GIF images." ) ) ;

				imagegif ( $this -> ImageResource, $output_file ) ;
				break ;

			case IMG_PNG :
				if ( ! ( $image_types & IMG_PNG ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support PNG images." ) ) ;

				imagepng ( $this -> ImageResource, $output_file, 0 ) ;
				break ;

			case IMG_WBMP :
				if ( ! ( $image_types & IMG_WBMP ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support WBMP images." ) ) ;

				imagewbmp ( $this -> ImageResource, $output_file ) ;
				break ;

			// NOTE(review): the IMG_XPM type is written with imagexbm(), which produces XBM data - confirm this is intended
			case IMG_XPM :
				if ( ! ( $image_types & IMG_XPM ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support XPM images." ) ) ;

				imagexbm ( $this -> ImageResource, $output_file ) ;
				break ;

			default :
				error ( new PdfToTextDecodingException ( "Unknown image type #$image_type." ) ) ;
		}
	}


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
		Output - Sends the image to the standard output.

	    PROTOTYPE
		$pdfimage -> Output ( ) ;

	    DESCRIPTION
		Calls SaveAs() with a null filename and the default (JPEG) image type.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function Output ( )
	{
		$this -> SaveAs ( null ) ;
	}
}
9309
9310
9311
9312/*==============================================================================================================
9313
9314 class PdfJpegImage -
9315 Handles encoded JPG images.
9316
9317 ==============================================================================================================*/
class PdfJpegImage extends PdfImage
{
	// Builds the image from JPEG data ; $autosave is handed to the parent constructor as its
	// "no resource created" flag, so no image resource is created when it is true.
	public function __construct ( $image_data, $autosave )
	{
		$skip_resource = $autosave ;

		parent::__construct ( $image_data, $skip_resource ) ;
	}


	// Lets imagecreatefromstring() build the image resource from the supplied data, and returns its result as is.
	protected function CreateImageResource ( $image_data )
	{
		$resource = imagecreatefromstring ( $image_data ) ;

		return ( $resource ) ;
	}
}
9331
9332
9333/*==============================================================================================================
9334
9335 class PdfInlinedImage -
9336 Decodes raw image data in objects having the /FlateDecode flag.
9337
9338 ==============================================================================================================*/
9339class PdfInlinedImage extends PdfImage
9340 {
9341 // Supported color schemes
9342 const COLOR_SCHEME_RGB = 1 ;
9343 const COLOR_SCHEME_CMYK = 2 ;
9344 const COLOR_SCHEME_GRAY = 3 ;
9345
9346 // Color scheme names, for debugging only
9347 private static $DecoderNames = array
9348 (
9349 self::COLOR_SCHEME_RGB => 'RGB',
9350 self::COLOR_SCHEME_CMYK => 'CMYK',
9351 self::COLOR_SCHEME_GRAY => 'Gray'
9352 ) ;
9353
9354 // Currently implemented image decoders
9355 private static $Decoders = array
9356 (
9357 self::COLOR_SCHEME_RGB => array
9358 (
9359 8 => '__decode_rgb8'
9360 ),
9361 self::COLOR_SCHEME_GRAY => array
9362 (
9363 8 => '__decode_gray8'
9364 ),
9365 self::COLOR_SCHEME_CMYK => array
9366 (
9367 8 => '__decode_cmyk8'
9368 ),
9369 ) ;
9370
9371 // Image width and height
9372 public $Width,
9373 $Height ;
9374 // Color scheme
9375 public $ColorScheme ;
9376 // Number of bits per color component
9377 public $BitsPerComponent ;
9378 // Decoding function, varying upon the supplied image type
9379 public $DecodingFunction = false ;
9380
9381
9382 /*--------------------------------------------------------------------------------------------------------------
9383
9384 NAME
9385 Constructor - Builds an image from the supplied data.
9386
9387 PROTOTYPE
9388 $image = new PdfInlinedImage ( $image_data, $width, $height, $bits_per_component, $color_scheme ) ;
9389
9390 DESCRIPTION
9391 Builds an image from the supplied data. Checks that the image flags are supported.
9392
9393 PARAMETERS
9394 $image_data (string) -
9395 Uncompressed image data.
9396
9397 $width (integer) -
9398 Image width, in pixels.
9399
9400 $height (integer) -
9401 Image height, in pixels.
9402
9403 $bits_per_components (integer) -
9404 Number of bits per color component.
9405
9406 $color_scheme (integer) -
9407 One of the COLOR_SCHEME_* constants, specifying the initial data format.
9408
9409 NOTES
9410 Processed images are always converted to JPEG format.
9411
9412 *-------------------------------------------------------------------------------------------------------------*/
public function __construct ( $image_data, $width, $height, $bits_per_component, $color_scheme )
{
	// Remember the image characteristics before anything else : the parent constructor is called last,
	// once these properties and the decoding function are set
	$this -> ColorScheme      = $color_scheme ;
	$this -> BitsPerComponent = $bits_per_component ;
	$this -> Width            = $width ;
	$this -> Height           = $height ;

	// Select the decoding function registered for this color scheme / bits per component combination
	if ( ! isset ( self::$Decoders [ $color_scheme ] ) )
		error ( new PdfToTextDecodingException ( "Unknown color scheme $color_scheme." ) ) ;
	else if ( ! isset ( self::$Decoders [ $color_scheme ] [ $bits_per_component ] ) )
		error ( new PdfToTextDecodingException ( "No decoding function has been implemented for image objects having the " .
				self::$DecoderNames [ $color_scheme ] . " color scheme with $bits_per_component bits per color component." ) ) ;
	else
		$this -> DecodingFunction = self::$Decoders [ $color_scheme ] [ $bits_per_component ] ;

	parent::__construct ( $image_data ) ;
}
9434
9435
9436 /*--------------------------------------------------------------------------------------------------------------
9437
9438 NAME
9439 CreateInstance - Creates an appropriate instance of a PdfImage class.
9440
9441 PROTOTYPE
		$image = PdfInlinedImage::CreateInstance ( $stream_data, $object_data, $autosave ) ;
9443
9444 DESCRIPTION
9445 Creates an instance of either :
9446 - A PdfJpegImage class, if the image specifications in $object_data indicate that the compressed stream
9447 contents are only JPEG data
9448 - A PdfInlinedImage class, if the image specifications state that the compressed stream data contain
9449 only color values.
9450
9451 The class currently supports (in $stream_data) :
9452 - Pure JPEG contents
9453 - RGB values
9454 - CMYK values
9455 - Gray scale values (in the current version, the resulting image does not correctly reproduce the
9456 initial colors, if interpolation is to be used).
9457
9458 PARAMETERS
9459 $stream_data (string) -
9460 Compressed image data.
9461
9462 $object_data (string) -
9463 Object containing the stream data.
9464
9465 RETURN VALUE
9466 Returns :
9467 - A PdfJpegImage object, if the stream data contains only pure JPEG contents
9468 - A PdfInlinedImage object, in other cases.
9469 - False if the supplied image data is not currently supported.
9470
9471 *-------------------------------------------------------------------------------------------------------------*/
public static function CreateInstance ( $stream_data, $object_data, $autosave )
{
	// Returns a PdfJpegImage (/DCTDecode objects), a PdfInlinedImage (raw color values), or false when the
	// image described by $object_data is not supported. $autosave is only used for PdfJpegImage objects.

	// Remove stream data from the supplied object data, to speed up the searches below
	$index = strpos ( $object_data, 'stream' ) ;

	if ( $index !== false )
		$object_data = substr ( $object_data, 0, $index ) ;

	// Uncompress stream data
	$image_data = gzuncompress ( $stream_data ) ;

	// The /DCTDecode flag indicates JPEG contents - returns a PdfJpegImage object.
	// The result of stripos() is compared with false, since a match at offset 0 is also a match.
	if ( stripos ( $object_data, '/DCTDecode' ) !== false )
		return ( new PdfJpegImage ( $image_data, $autosave ) ) ;

	// Get the image width & height, and the number of bits per color component.
	// A missing attribute is left to 0 (and rejected below) rather than read from an unmatched capture.
	$attributes = array ( 'Width' => 0, 'Height' => 0, 'BitsPerComponent' => 0 ) ;

	foreach ( array_keys ( $attributes ) as $attribute )
	{
		if ( preg_match ( '#/' . $attribute . ' \s+ (?P<value> \d+)#ix', $object_data, $match ) )
			$attributes [ $attribute ] = ( int ) $match [ 'value' ] ;
	}

	$width              = $attributes [ 'Width' ] ;
	$height             = $attributes [ 'Height' ] ;
	$bits_per_component = $attributes [ 'BitsPerComponent' ] ;

	// Get the target color space
	// Sometimes, this refers to an object in the PDF file, which can also be embedded in a compound object
	// We don't handle such cases for now
	if ( ! preg_match ( '#/ColorSpace \s* / (?P<value> \w+)#ix', $object_data, $match ) )
		return ( false ) ;

	$color_space_name = $match [ 'value' ] ;

	// Check that we are able to handle the specified color space
	switch ( strtolower ( $color_space_name ) )
	{
		case 'devicergb' :
			$color_space = self::COLOR_SCHEME_RGB ;
			break ;

		case 'devicegray' :
			$color_space = self::COLOR_SCHEME_GRAY ;
			break ;

		case 'devicecmyk' :
			$color_space = self::COLOR_SCHEME_CMYK ;
			break ;

		default :
			if ( PdfToText::$DEBUG )
				warning ( new PdfToTextDecodingException ( "Unsupported color space \"$color_space_name\"." ) ) ;

			return ( false ) ;
	}

	// Also check that we can handle the specified number of bits per component (only 8 for now)
	if ( $bits_per_component != 8 )
	{
		if ( PdfToText::$DEBUG )
			warning ( new PdfToTextDecodingException ( "Unsupported bits per component : $bits_per_component." ) ) ;

		return ( false ) ;
	}

	// An image without usable dimensions cannot be rebuilt from raw color values
	if ( $width <= 0 || $height <= 0 )
	{
		if ( PdfToText::$DEBUG )
			warning ( new PdfToTextDecodingException ( "Missing or invalid image dimensions ({$width}x{$height})." ) ) ;

		return ( false ) ;
	}

	// All done, return a PdfInlinedImage object
	return ( new PdfInlinedImage ( $image_data, $width, $height, $bits_per_component, $color_space ) ) ;
}
9550
9551
9552 /*--------------------------------------------------------------------------------------------------------------
9553
9554 NAME
9555 CreateImageResource - Creates the image resource.
9556
9557 PROTOTYPE
9558 $resource = $image -> CreateImageResource ( $image_data ) ;
9559
9560 DESCRIPTION
9561 Creates a GD image according to the supplied image data, and the parameters supplied to the class
9562 constructor.
9563
9564 PARAMETERS
9565 $image_data (string) -
9566 Image to be decoded.
9567
9568 RETURN VALUE
9569 Returns a GD graphics resource in true color, or false if there is currently no implemented decoding
9570 function for this kind of images.
9571
9572 *-------------------------------------------------------------------------------------------------------------*/
// CreateImageResource -
//	Hands $image_data over to the method whose name is stored in the DecodingFunction property and
//	returns its result ; returns false when that property holds no usable method name.
protected function  CreateImageResource ( $image_data )
   {
	$method		=  $this -> DecodingFunction ;

	// No decoding method available for this image
	if  ( ! $method )
		return ( false ) ;

	return ( $this -> $method ( $image_data ) ) ;
   }
9582
9583
9584 /*--------------------------------------------------------------------------------------------------------------
9585
9586 Decoding functions.
9587
9588 *-------------------------------------------------------------------------------------------------------------*/
9589
9590 // __decode_rgb8 -
9591 // Decodes image data consisting of 8-bits RGB values (one byte for each color component).
// Builds a true color GD image of Width x Height pixels from $data, reading three bytes (red, green,
// blue) per pixel. Pixels are written left to right, then top to bottom ; a trailing group of fewer
// than three bytes is ignored.
private function  __decode_rgb8 ( $data )
   {
	$length		=  strlen ( $data ) ;
	$columns	=  $this -> Width ;
	$image		=  imagecreatetruecolor ( $columns, $this -> Height ) ;
	$palette	=  array ( ) ;		// Colors already allocated, indexed by their packed RGB value
	$column		=  0 ;
	$row		=  0 ;
	$offset		=  0 ;

	while  ( $offset + 3  <=  $length )
	   {
		$r	=  ord ( $data [ $offset     ] ) ;
		$g	=  ord ( $data [ $offset + 1 ] ) ;
		$b	=  ord ( $data [ $offset + 2 ] ) ;
		$key	=  ( $r  <<  16 ) | ( $g  <<  8 ) | ( $b ) ;

		// Allocate each distinct color only once
		if  ( ! isset ( $palette [ $key ] ) )
			$palette [ $key ]	=  imagecolorallocate ( $image, $r, $g, $b ) ;

		// End of the current row reached : continue at the start of the next one
		if  ( $column  >=  $columns )
		   {
			$column		=  0 ;
			$row ++ ;
		   }

		imagesetpixel ( $image, $column, $row, $palette [ $key ] ) ;

		$offset		+=  3 ;
		$column ++ ;
	   }

	return ( $image ) ;
   }
9627
9628
9629 // __decode_cmyk8 -
9630 // Decodes image data consisting of 8-bits CMYK values (one byte for each color component).
// Builds a true color GD image of Width x Height pixels from $data, reading four bytes (cyan, magenta,
// yellow, black) per pixel. Pixels are written left to right, then top to bottom ; a trailing group of
// fewer than four bytes is ignored.
private function  __decode_cmyk8 ( $data )
   {
	$data_length	=  strlen ( $data ) ;
	$colors		=  array ( ) ;		// Colors already allocated, indexed by their packed CMYK value
	$width		=  $this -> Width ;
	$height		=  $this -> Height ;
	$image		=  imagecreatetruecolor ( $width, $height ) ;

	for  ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i + 4  <=  $data_length ; $i += 4, $pixel_x ++ )
	   {
		$cyan		=  ord ( $data [$i] ) ;
		$magenta	=  ord ( $data [$i+1] ) ;
		$yellow		=  ord ( $data [$i+2] ) ;
		$black		=  ord ( $data [$i+3] ) ;

		$color		=  ( $cyan  <<  24 ) | ( $magenta  <<  16 ) | ( $yellow  <<  8 ) | ( $black ) ;

		if  ( isset ( $colors [ $color ] ) )
			$pixel_color	=  $colors [ $color ] ;
		else
		   {
			// __convert_cmyk_to_rgb() interprets any component greater than 1 as a percentage, so the
			// 0..255 sample bytes must be scaled to the 0..1 range before being converted
			$rgb		=  $this -> __convert_cmyk_to_rgb
			   (
				$cyan    / 255.0,
				$magenta / 255.0,
				$yellow  / 255.0,
				$black   / 255.0
			    ) ;

			// imagecolorallocate() only accepts integer components within 0..255
			$red		=  max ( 0, min ( 255, ( int ) round ( $rgb [0] ) ) ) ;
			$green		=  max ( 0, min ( 255, ( int ) round ( $rgb [1] ) ) ) ;
			$blue		=  max ( 0, min ( 255, ( int ) round ( $rgb [2] ) ) ) ;

			$pixel_color		=  imagecolorallocate ( $image, $red, $green, $blue ) ;
			$colors [ $color ]	=  $pixel_color ;
		   }

		// End of the current row reached : continue at the start of the next one
		if  ( $pixel_x  >=  $width )
		   {
			$pixel_x	=  0 ;
			$pixel_y ++ ;
		   }

		imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ;
	   }

	return ( $image ) ;
   }
9668
9669
9670 // __decode_gray8 -
9671 // Decodes image data consisting of 8-bits gray values.
// Builds a true color GD image of Width x Height pixels from $data, reading one byte per pixel and
// using it for the red, green and blue components alike. Pixels are written left to right, then top
// to bottom.
private function  __decode_gray8 ( $data )
   {
	$length		=  strlen ( $data ) ;
	$columns	=  $this -> Width ;
	$image		=  imagecreatetruecolor ( $columns, $this -> Height ) ;
	$shades		=  array ( ) ;		// Colors already allocated, indexed by gray level
	$column		=  0 ;
	$row		=  0 ;
	$offset		=  0 ;

	while  ( $offset  <  $length )
	   {
		$level	=  ord ( $data [ $offset ] ) ;

		// Allocate each gray level only once
		if  ( ! isset ( $shades [ $level ] ) )
			$shades [ $level ]	=  imagecolorallocate ( $image, $level, $level, $level ) ;

		// End of the current row reached : continue at the start of the next one
		if  ( $column  >=  $columns )
		   {
			$column		=  0 ;
			$row ++ ;
		   }

		imagesetpixel ( $image, $column, $row, $shades [ $level ] ) ;

		$offset ++ ;
		$column ++ ;
	   }

	return ( $image ) ;
   }
9703
9704
9705 /*--------------------------------------------------------------------------------------------------------------
9706
9707 Support functions.
9708
9709 *-------------------------------------------------------------------------------------------------------------*/
9710
9711 // __convert_cmyk_to_rgb -
9712 // Converts CMYK color value to RGB.
// Converts a CMYK color to RGB.
// Components are expected in the 0..1 range ; if any of them is greater than 1, all four are taken
// as percentages (0..100) and divided by 100 first.
// Returns an array of three integers (red, green, blue), each one within 0..255.
private function  __convert_cmyk_to_rgb ( $C, $M, $Y, $K )
   {
	if  ( $C  >  1  ||  $M  >  1  ||  $Y  >  1  ||  $K  >  1 )
	   {
		$C	/=  100.0 ;
		$M	/=  100.0 ;
		$Y	/=  100.0 ;
		$K	/=  100.0 ;
	   }

	$result		=  array ( ) ;

	foreach  ( array ( $C, $M, $Y )  as  $ink )
	   {
		// The scale factor is 255, not 256 : a component without any ink must map to the highest
		// value a color component can take
		$value		=  ( int ) round ( ( 1 - $ink * ( 1 - $K ) - $K ) * 255 ) ;

		// Out-of-range input must not produce out-of-range color components
		$result []	=  max ( 0, min ( 255, $value ) ) ;
	   }

	return ( $result ) ;
   }
9731 }
9732
9733
9734/*==============================================================================================================
9735
9736 class PdfFaxImage -
9737 Handles encoded CCITT Fax images.
9738
9739 ==============================================================================================================*/
class  PdfFaxImage  extends  PdfImage
   {
	// Constructor -
	//	Simply forwards the image data to the parent class constructor.
	public function  __construct ( $image_data )
	   {
		parent::__construct ( $image_data ) ;
	   }


	// CreateImageResource -
	//	CCITT Fax decoding is not implemented : a warning is issued and false is returned explicitly,
	//	so that callers get a definite "no image resource" value instead of an implicit null.
	protected function  CreateImageResource ( $image_data )
	   {
		warning ( new PdfToTextDecodingException ( "Decoding of CCITT Fax image format is not yet implemented." ) ) ;

		return ( false ) ;
	   }
   }
9754
9755
9756/**************************************************************************************************************
9757 **************************************************************************************************************
9758 **************************************************************************************************************
9759 ****** ******
9760 ****** ******
9761 ****** ENCRYPTION MANAGEMENT ******
9762 ****** ******
9763 ****** ******
9764 **************************************************************************************************************
9765 **************************************************************************************************************
9766 **************************************************************************************************************/
9767
9768/*==============================================================================================================
9769
    class PdfEncryptionData -
9771 Holds encryption data and allows for decryption.
9772
9773 ==============================================================================================================*/
class  PdfEncryptionData  extends  PdfObjectBase
   {
	// Encryption modes
	const	PDFMODE_UNKNOWN			=   0 ;
	const	PDFMODE_STANDARD		=   1 ;

	// Encryption algorithms
	const	PDFCRYPT_ALGORITHM_RC4		=   0 ;
	const	PDFCRYPT_ALGORITHM_AES		=   1 ;
	const	PDFCRYPT_ALGORITHM_AES256	=   2 ;

	// A 32-bytes hardcoded padding used when computing encryption keys
	const	PDF_ENCRYPTION_PADDING		=  "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A" ;

	// Permission bits for encrypted files. Comments come from the PDF specification
	const	PDFPERM_PRINT			=  0x0004 ;		// bit 3 :
									//	(Revision 2) Print the document.
									//	(Revision 3 or greater) Print the document (possibly not at the highest quality level,
									//	depending on whether bit 12 is also set).
	const	PDFPERM_MODIFY			=  0x0008 ;		// bit 4 :
									//	Modify the contents of the document by operations other than those controlled by bits 6, 9, and 11.
	const	PDFPERM_COPY			=  0x0010 ;		// bit 5 :
									//	(Revision 2) Copy or otherwise extract text and graphics from the document, including extracting text
									//	and graphics (in support of accessibility to users with disabilities or for other purposes).
									//	(Revision 3 or greater) Copy or otherwise extract text and graphics from the document by operations
									//	other than that controlled by bit 10.
	const	PDFPERM_MODIFY_EXTRA		=  0x0020 ;		// bit 6 :
									//	Add or modify text annotations, fill in interactive form fields, and, if bit 4 is also set,
									//	create or modify interactive form fields (including signature fields).
	const	PDFPERM_FILL_FORM		=  0x0100 ;		// bit 9 :
									//	(Revision 3 or greater) Fill in existing interactive form fields (including signature fields),
									//	even if bit 6 is clear.
	const	PDFPERM_EXTRACT			=  0x0200 ;		// bit 10 :
									//	(Revision 3 or greater) Extract text and graphics (in support of accessibility to users with
									//	disabilities or for other purposes).
	const	PDFPERM_ASSEMBLE		=  0x0400 ;		// bit 11 :
									//	(Revision 3 or greater) Assemble the document (insert, rotate, or delete pages and create bookmarks
									//	or thumbnail images), even if bit 4 is clear.
	const	PDFPERM_HIGH_QUALITY_PRINT	=  0x0800 ;		// bit 12 :
									//	(Revision 3 or greater) Print the document to a representation from which a faithful digital copy of
									//	the PDF content could be generated. When this bit is clear (and bit 3 is set), printing is limited to
									//	a low-level representation of the appearance, possibly of degraded quality.

	public		$FileId ;				// File ID, as specified by the /ID flag
	public		$ObjectId ;				// Object id and text contents
	private		$ObjectData ;
	public		$Mode ;					// Encryption mode - currently, only the "Standard" keyword is accepted
	public		$EncryptionAlgorithm ;			// Encryption algorithm - one of the PDFCRYPT_* constants
	public		$AlgorithmVersion,			// Encryption algorithm version & revision
			$AlgorithmRevision ;
	public		$Flags ;				// Protection flags, when an owner password has been specified - one of the PDFPERM_* constants
	public		$KeyLength ;				// Encryption key length
	public		$UserKey,				// User and owner password keys
			$OwnerKey ;
	public		$UserEncryptionString,			// Not sure yet of the real usage of these ones
			$OwnerEncryptionString ;
	public		$EncryptMetadata ;			// True if metadata is also encrypted
	public		$FileKeyLength ;			// Key length / 5

	protected	$Decrypter ;				// Decrypter object

	private		$UnsupportedEncryptionAlgorithm		=  false ;	// True if the encryption algorithm used in the PDF file is not yet supported


	/**************************************************************************************************************

	    NAME
		Constructor

	    PROTOTYPE
		obj	=  new PdfEncryptionData ( $file_id, $mode, $object_id, $object_data ) ;

	    DESCRIPTION
		Creates an instance of a PdfEncryptionData class, using the information parsed from the supplied object
		data.
		When the algorithm version/revision is not handled, or when no decrypter exists for the selected
		algorithm, the object is flagged as unsupported and Decrypt() will return false.

	    PARAMETERS
		$file_id (string) -
			File id ; stored as is in the FileId property.

		$mode (integer) -
			One of the PDFMODE_* constants.

		$object_id (integer) -
			Id of the object containing encryption parameters.

		$object_data (string) -
			Encryption parameters.

	    AUTHOR
		Christian Vigh, 03/2017.

	    HISTORY
	    [Version : 1.0]	[Date : 2017-03-14]     [Author : CV]
		Initial version.

	 **************************************************************************************************************/
	public function  __construct ( $file_id, $mode, $object_id, $object_data )
	   {
		$this -> FileId		=  $file_id ;
		$this -> ObjectId	=  $object_id ;
		$this -> ObjectData	=  $object_data ;
		$this -> Mode		=  $mode ;

		// Encryption algorithm version & revision, then encryption flags.
		// A missing entry yields 0 instead of an undefined index ; version 0 and revision 0 are
		// reported as unsupported below.
		$this -> AlgorithmVersion	=  $this -> __get_integer ( '#/V \s+ (?P<value> \d+)#ix'    , $object_data, 0 ) ;
		$this -> AlgorithmRevision	=  $this -> __get_integer ( '#/R \s+ (?P<value> \d+)#ix'    , $object_data, 0 ) ;
		$this -> Flags			=  $this -> __get_integer ( '#/P \s+ (?P<value> \-? \d+)#ix', $object_data, 0 ) ;

		// Key length (40 bits, if not specified)
		$this -> KeyLength		=  $this -> __get_integer ( '#/Length \s+ (?P<value> \d+)#ix', $object_data, 40 ) ;

		// Owner and user passwords
		$this -> UserKey		=  $this -> GetStringParameter ( '/U', $object_data ) ;
		$this -> OwnerKey		=  $this -> GetStringParameter ( '/O', $object_data ) ;

		// Owner and user encryption strings
		$this -> UserEncryptionString	=  $this -> GetStringParameter ( '/UE', $object_data ) ;
		$this -> OwnerEncryptionString	=  $this -> GetStringParameter ( '/OE', $object_data ) ;

		// EncryptMetadata flag : "true" or "1" enable it, "false" or "0" disable it.
		// The keyword and its value may be separated by whitespace.
		// NOTE(review): the flag stays false when the entry is absent, as before ; the PDF specification
		// is believed to define true as the default value - to be confirmed.
		$this -> EncryptMetadata	=  false ;

		if  ( preg_match ( '# /EncryptMetadata \s* (?P<value> (true) | (1) | (false) | (0) )#imsx', $object_data, $encryption_match ) )
		   {
			$value				=  $encryption_match [ 'value' ] ;
			$this -> EncryptMetadata	=  ( ! strcasecmp ( $value, 'true' )  ||  $value  ===  '1' ) ;
		   }

		// Now, try to determine the encryption algorithm to be used
		$user_key_length		=  strlen ( ( string ) $this -> UserKey ) ;
		$owner_key_length		=  strlen ( ( string ) $this -> OwnerKey ) ;

		$error_unhandled_version	=  false ;
		$error_unhandled_revision	=  false ;

		switch  ( $this -> AlgorithmVersion )
		   {
			case	1 :
				switch  ( $this -> AlgorithmRevision )
				   {
					case	2 :
						// NOTE(review): the message says "and/or" but the error is only raised when BOTH
						// lengths differ from 32 ; kept as is - confirm the intended condition
						if  ( $user_key_length  !=  32  &&  $owner_key_length  !=  32 )
						   {
							if  ( PdfToText::$DEBUG )
								error ( new PdfToTextDecryptionException ( "Invalid user and/or owner key length ($user_key_length/$owner_key_length)", $object_id ) ) ;
						   }

						$this -> EncryptionAlgorithm	=  self::PDFCRYPT_ALGORITHM_RC4 ;
						$this -> FileKeyLength		=  5 ;
						break ;

					default :
						$error_unhandled_revision	=  true ;
				    }
				break ;

			default :
				$error_unhandled_version	=  true ;
		    }

		// Report unsupported versions/revisions
		if  ( $error_unhandled_version  ||  $error_unhandled_revision )
		   {
			if  ( PdfToText::$DEBUG )
				error ( new PdfToTextDecryptionException ( "Unsupported encryption algorithm version {$this -> AlgorithmVersion} revision {$this -> AlgorithmRevision}.",
						$object_id ) ) ;

			// Property names are case-sensitive : this must be the declared private property,
			// otherwise Decrypt() never sees the flag
			$this -> UnsupportedEncryptionAlgorithm		=  true ;

			return ;
		   }

		// Build the object key
		$this -> Decrypter	=  PdfDecryptionAlgorithm::GetInstance ( $this ) ;

		if  ( $this -> Decrypter  ===  false )
		   {
			if  ( PdfToText::$DEBUG )
				warning ( new PdfToTextDecryptionException ( "Unsupported encryption algorithm #{$this -> EncryptionAlgorithm}, " .
						"version {$this -> AlgorithmVersion} revision {$this -> AlgorithmRevision}.",
						$object_id ) ) ;

			$this -> UnsupportedEncryptionAlgorithm		=  true ;

			return ;
		   }
	   }


	// __get_integer -
	//	Returns the 'value' named group captured by $pattern in $object_data as an integer, or $default
	//	when the pattern does not match.
	private function  __get_integer ( $pattern, $object_data, $default )
	   {
		if  ( preg_match ( $pattern, $object_data, $match ) )
			return ( ( int ) $match [ 'value' ] ) ;

		return ( $default ) ;
	   }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
		GetInstance - Creates an instance of a PdfEncryptionData object.

	    PROTOTYPE
		$obj		=  PdfEncryptionData::GetInstance ( $file_id, $object_id, $object_data ) ;

	    DESCRIPTION
		Returns an instance of encryption data, or false if the object data has no /Filter entry or if
		the filter is not "Standard".

	 *-------------------------------------------------------------------------------------------------------------*/
	public static function  GetInstance ( $file_id, $object_id, $object_data )
	   {
		// Encryption mode
		if  ( ! preg_match ( '#/Filter \s* / (?P<mode> \w+)#ix', $object_data, $object_data_match ) )
			return ( false ) ;

		$mode_name	=  $object_data_match [ 'mode' ] ;

		switch  ( strtolower ( $mode_name ) )
		   {
			case	'standard' :
				$mode	=  self::PDFMODE_STANDARD ;
				break ;

			default :
				// The debug level is read from PdfToText, as everywhere else in this class ; the mode
				// name comes from the match array ($object_data itself is a string)
				if  ( PdfToText::$DEBUG  >  1 )
					error ( new PdfToTextDecodingException ( "Unhandled encryption mode '$mode_name'", $object_id ) ) ;

				return ( false ) ;
		    }

		// Basic checks have been performed, return an instance of encryption data
		return ( new PdfEncryptionData ( $file_id, $mode, $object_id, $object_data ) ) ;
	   }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
		Decrypt - Decrypts object data.

	    PROTOTYPE
		$data		=  $this -> Decrypt ( $object_id, $object_data ) ;

	    DESCRIPTION
		Decrypts object data, when the PDF file is password-protected.

	    PARAMETERS
		$object_id (integer) -
			Pdf object number.

		$object_data (string) -
			Object data.

	    RETURN VALUE
		Returns the decrypted object data, or false if the encrypted object could not be decrypted.
		In the current version, false is always returned : the call to the decrypter object is not enabled.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  Decrypt ( $object_id, $object_data )
	   {
		if  ( $this -> UnsupportedEncryptionAlgorithm )
			return ( false ) ;

		// Decryption is not enabled yet ; the intended call is :
		//	return ( $this -> Decrypter -> Decrypt ( $object_data ) ) ;
		return ( false ) ;
	   }
   }
10044
10045
10046/*==============================================================================================================
10047
10048 class PdfDecryptionAlgorithm -
10049 Base class for algorithm decrypters.
10050
10051 ==============================================================================================================*/
abstract class  PdfDecryptionAlgorithm		//extends Object
   {
	protected	$EncryptionData ;		// Object supplied to the constructor
	protected	$ObjectKey ;			// Object key, as a binary string of ObjectKeyLength bytes
	protected	$ObjectKeyBytes ;		// Same key, as an array of byte values
	protected	$ObjectKeyLength ;		// Number of bytes in the object key


	// Constructor -
	//	Computes the object key from the supplied encryption data : the MD5 digest of the first
	//	FileKeyLength bytes of FileId, followed by the three low-order bytes of ObjectId (lowest byte
	//	first) and two null bytes standing for the object generation number.
	//	The key keeps the first (FileKeyLength + 5) bytes of the digest, limited to 16, as required
	//	for per-object keys by the PDF specification.
	//	NOTE(review): the key material is taken from FileId, and the generation number is always 0 -
	//	check this against the key derivation described in the specification before enabling decryption.
	public function  __construct ( $encryption_data )
	   {
		$this -> EncryptionData		=  $encryption_data ;

		$file_key_length		=  max ( 0, ( int ) $encryption_data -> FileKeyLength ) ;
		$object_id			=  ( int ) $encryption_data -> ObjectId ;

		// substr() silently copes with a file id shorter than the expected file key length
		$objkey		 =  ( string ) substr ( ( string ) $encryption_data -> FileId, 0, $file_key_length ) ;
		$objkey		.=  chr ( ( $object_id       ) & 0xFF ) ;
		$objkey		.=  chr ( ( $object_id >>  8 ) & 0xFF ) ;
		$objkey		.=  chr ( ( $object_id >> 16 ) & 0xFF ) ;
		$objkey		.=  chr ( 0 ) ;		// obj generation number & 0xFF
		$objkey		.=  chr ( 0 ) ;		// obj generation number >> 8 & 0xFF

		$key_length			=  min ( $file_key_length + 5, 16 ) ;

		$this -> ObjectKey		=  substr ( md5 ( $objkey, true ), 0, $key_length ) ;
		$this -> ObjectKeyLength	=  $key_length ;
		$this -> ObjectKeyBytes		=  array_values ( unpack ( 'C*', $this -> ObjectKey ) ) ;
	   }


	// GetInstance -
	//	Returns a decrypter object for the algorithm given by the EncryptionAlgorithm property of the
	//	supplied encryption data, or false if there is no decrypter for it.
	public static function  GetInstance ( $encryption_data )
	   {
		switch  ( $encryption_data -> EncryptionAlgorithm )
		   {
			case	PdfEncryptionData::PDFCRYPT_ALGORITHM_RC4 :
				return ( new PdfRC4DecryptionAlgorithm ( $encryption_data ) ) ;

			default :
				return ( false ) ;
		    }
	   }


	// To be implemented by derived classes
	abstract public function  Reset		( ) ;
	abstract public function  Decrypt	( $data ) ;

    }
10103
10104
10105/*==============================================================================================================
10106
10107 class PdfRC4DecryptionAlgorithm -
10108 A decrypter class for RC4 encoding.
10109
10110 ==============================================================================================================*/
class  PdfRC4DecryptionAlgorithm  extends  PdfDecryptionAlgorithm
   {
	// The values 0 to 255 in ascending order ; built once, by the first instance created
	private static	$InitialState		=  false ;
	// Current permutation of the 256 byte values
	protected	$State ;


	// Constructor -
	//	Lets the parent class compute the object key, then builds the shared initial state if needed.
	public function  __construct ( $encryption_data )
	   {
		parent::__construct ( $encryption_data ) ;

		if  ( self::$InitialState  ===  false )
			self::$InitialState	=  range ( 0, 255 ) ;
	   }


	// Reset -
	//	Rebuilds the State array from the initial state, shuffling it with the object key bytes.
	public function  Reset ( )
	   {
		$state		=  self::$InitialState ;
		$key		=  $this -> ObjectKeyBytes ;
		$key_length	=  $this -> ObjectKeyLength ;
		$key_index	=  0 ;
		$j		=  0 ;

		for  ( $i = 0 ; $i  <  256 ; $i ++ )
		   {
			$j	=  ( $key [ $key_index ] + $state [$i] + $j ) & 0xFF ;

			// Exchange entries $i and $j
			list ( $state [$i], $state [$j] )	=  array ( $state [$j], $state [$i] ) ;

			// Key bytes are used cyclically
			$key_index	=  ( $key_index + 1 ) % $key_length ;
		   }

		$this -> State	=  $state ;
	   }


	// Decrypt -
	//	Resets the state, then returns $data with each byte xor'ed with the next byte of the
	//	sequence generated from the state. The State property is modified along the way.
	public function  Decrypt ( $data )
	   {
		$this -> Reset ( ) ;

		$state		=  & $this -> State ;
		$count		=  strlen ( $data ) ;
		$output		=  array ( ) ;
		$x		=  0 ;
		$y		=  0 ;

		for  ( $position = 0 ; $position  <  $count ; $position ++ )
		   {
			$x	=  ( $x + 1 ) & 0xFF ;
			$y	=  ( $state [$x] + $y ) & 0xFF ;

			$tx	=  $state [$x] ;
			$ty	=  $state [$y] ;

			// Exchange entries $x and $y
			$state [$x]	=  $ty ;
			$state [$y]	=  $tx ;

			$output []	=  chr ( ord ( $data [ $position ] ) ^ $state [ ( $tx + $ty ) & 0xFF ] ) ;
		   }

		return ( implode ( '', $output ) ) ;
	   }
   }
10173
10174 /*
10175static Guchar rc4DecryptByte(Guchar *state, Guchar *x, Guchar *y, Guchar c) {
10176 Guchar x1, y1, tx, ty;
10177
10178 x1 = *x = (*x + 1) % 256;
10179 y1 = *y = (state[*x] + *y) % 256;
10180 tx = state[x1];
10181 ty = state[y1];
10182 state[x1] = ty;
10183 state[y1] = tx;
10184 return c ^ state[(tx + ty) % 256];
10185}
10186*/
10187
10188
10189/**************************************************************************************************************
10190 **************************************************************************************************************
10191 **************************************************************************************************************
10192 ****** ******
10193 ****** ******
10194 ****** FORM DATA MANAGEMENT ******
10195 ****** ******
10196 ****** ******
10197 **************************************************************************************************************
10198 **************************************************************************************************************
10199 **************************************************************************************************************/
10200
10201
10202/*==============================================================================================================
10203
10204 class PdfToTextFormDefinitions -
10205 Analyzes a template XML file that describes PDF form data and maps PDF field names to human-readable
10206 names.
10207 The GetFormData() returns an object containing the mapped properties with their respective values.
10208
10209 ==============================================================================================================*/
class  PdftoTextFormDefinitions		// extends Object
		implements	ArrayAccess, Countable, IteratorAggregate
   {
	static private	$ClassDefinitionCount	=  0 ;

	// Class name, as specified in the XML template
	protected	$ClassName ;
	// Form definitions (a template may contain several versions of the same form definition)
	protected	$Definitions		=  array ( ) ;
	// Form definitions coming from the PDF file, indexed by field name ; each entry holds the field
	// name ('name') and the number of times it was found ('occurrences')
	protected	$PdfDefinitions		=  array ( ) ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Parses the supplied XML template.
		$xml_data is the template ; when empty, a default template is generated from the fields found
		in $pdf_xml_data, the XML form data coming from the PDF file.
		Errors are reported through the error() function, with a PdfToTextFormException object.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $xml_data, $pdf_xml_data )
	   {
		// Get PDF XML form data definitions
		$this -> __get_pdf_form_definitions ( $pdf_xml_data ) ;

		// Create XML data from scratch, if none specified
		if  ( ! $xml_data )
			$xml_data	=  $this -> __create_default_xml_data ( $this -> PdfDefinitions ) ;

		// Decode XML the hard way, without XSD
		$xml		=  simplexml_load_string ( $xml_data ) ;

		// simplexml_load_string() returns false on malformed XML ; nothing can be analyzed in that case
		if  ( $xml  ===  false )
		   {
			error ( new PdfToTextFormException ( "Invalid XML form template data." ) ) ;

			return ;
		   }

		$root_entry	=  $xml -> getName ( ) ;
		$definitions	=  array ( ) ;
		$class_name	=  "PdfFormData" ;

		if  ( strcasecmp ( $root_entry, "forms" ) )
			error ( new PdfToTextFormException ( "Root entry must be <forms>, <$root_entry> was found." ) ) ;

		// Get the attribute values of the <forms> tag
		foreach  ( $xml -> attributes ( )  as  $attribute_name => $attribute_value )
		   {
			switch  ( strtolower ( $attribute_name ) )
			   {
				case	'class' :
					$class_name	=  ( string ) $attribute_value ;

					if  ( class_exists ( $class_name, false ) )
						error ( new PdfToTextFormException ( "Class \"$class_name\" specified in XML template already exists." ) ) ;

					break ;

				default :
					error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <forms> tag." ) ) ;
			    }
		    }

		// Don't know if it will be useful, but try to avoid class name collisions by appending a sequential number if necessary
		if  ( class_exists ( $class_name, false ) )
		   {
			self::$ClassDefinitionCount ++ ;
			$class_name	.=  '_' . self::$ClassDefinitionCount ;
		    }

		// Loop through each child <form> entry
		foreach  ( $xml -> children ( )  as  $child )
		   {
			$child_name	=  $child -> getName ( ) ;

			switch  ( strtolower ( $child_name ) )
			   {
				case	'form' :
					$definitions []		=  new PdfToTextFormDefinition ( $class_name, $child, $this -> PdfDefinitions ) ;
					break ;

				default :
					error ( new PdfToTextFormException ( "Invalid tag <$child_name>." ) ) ;
			    }
		    }

		// Ensure that there is at least one form definition
		if  ( ! count ( $definitions ) )
			error ( new PdfToTextFormException ( "No <form> definition found." ) ) ;

		// Save to properties
		$this -> ClassName	=  $class_name ;
		$this -> Definitions	=  $definitions ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    Internal methods.

	 *-------------------------------------------------------------------------------------------------------------*/

	// __get_pdf_form_definitions -
	//	Retrieves the form field definitions coming from the PDF file.
	//	Every <field>...</field> element found in $pdf_data adds an entry to the PdfDefinitions
	//	property, or increments its occurrence count if the field name has already been seen.
	private function  __get_pdf_form_definitions ( $pdf_data )
	   {
		preg_match_all ( '#(?P<field> <field .*? </field \s* >)#imsx', ( string ) $pdf_data, $matches ) ;

		foreach  ( $matches [ 'field' ]  as  $field )
		    {
			$xml_field	=  simplexml_load_string ( $field ) ;

			// Ignore fields that are not well-formed XML
			if  ( $xml_field  ===  false )
				continue ;

			foreach  ( $xml_field -> attributes ( )  as  $attribute_name => $attribute_value )
			   {
				switch  ( strtolower ( $attribute_name ) )
				   {
					case	'name' :
						$field_name	=  ( string ) $attribute_value ;

						if  ( isset ( $this -> PdfDefinitions [ $field_name ] ) )
							$this -> PdfDefinitions [ $field_name ] [ 'occurrences' ] ++ ;
						else
						   {
							$this -> PdfDefinitions [ $field_name ]	=  array
							   (
								'name'		=>  $field_name,
								'occurrences'	=>  1
							    ) ;
						    }

						break ;
				    }
			    }
		    }
	    }


	// __create_default_xml_data -
	//	When no XML template has been specified, creates a default one based on the form definitions located in the PDF file.
	//	Each field gets the same value for its name and form-field attributes, with dashes replaced by underscores.
	private function  __create_default_xml_data ( $pdf_definitions )
	   {
		$result		=  "<forms>" . PHP_EOL .
				   "\t<form version=\"1.0\">" . PHP_EOL ;

		foreach  ( $pdf_definitions  as  $name => $field )
		   {
			$name		 =  str_replace ( '-', '_', $name ) ;		// Just in case of

			// Field names go into attribute values : characters such as & or " must be escaped,
			// otherwise the generated template could not be parsed
			$name		 =  htmlspecialchars ( $name, ENT_QUOTES | ENT_XML1, 'UTF-8' ) ;
			$result		.=  "\t\t<field name=\"$name\" form-field=\"$name\" type=\"string\"/>" . PHP_EOL ;
		    }

		$result		.=  "\t</form>" . PHP_EOL .
				    "</forms>" . PHP_EOL ;

		return ( $result ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    Interfaces implementations to retrieve form definitions.
	    The ReturnTypeWillChange attribute silences the return type deprecation notices of recent PHP
	    versions ; older versions see the attribute line as a comment.

	 *-------------------------------------------------------------------------------------------------------------*/
	#[\ReturnTypeWillChange]
	public function  count ( )
	   { return ( count ( $this -> Definitions ) ) ; }


	#[\ReturnTypeWillChange]
	public function  getIterator ( )
	   { return ( new ArrayIterator ( $this -> Definitions ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetExists ( $offset )
	   { return ( $offset  >=  0  &&  $offset  <  count ( $this -> Definitions ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetGet ( $offset )
	   { return ( $this -> Definitions [ $offset ] ) ; }


	// Form definitions are read-only
	#[\ReturnTypeWillChange]
	public function  offsetSet ( $offset, $value )
	   { error ( new PdfToTextException ( "Unsupported operation." ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetUnset ( $offset )
	   { error ( new PdfToTextException ( "Unsupported operation." ) ) ; }
    }
10387
10388
10389/*==============================================================================================================
10390
10391 class PdfToTextFormDefinition -
10392 Holds the description of a form inside a form XML template.
10393
10394 ==============================================================================================================*/
class PdfToTextFormDefinition       // extends Object
{
    // Class of the object returned by GetFormData( )
    public $ClassName ;

    // Form version (value of the "version" attribute of the <form> tag)
    public $Version ;

    // Field definitions (one entry per <field> tag, in template order)
    public $FieldDefinitions = array ( ) ;

    // Field groups (ie, fields that are the results of the concatenation of several form fields)
    // Each entry is an associative array with the 'name', 'separator' and 'fields' keys
    public $Groups = array ( ) ;

    // Pdf field definitions ; looked up by PDF field name
    public $PdfDefinitions ;

    // Class definition in PHP, whose instance will be returned by GetFormData() ;
    // remains false until GetClassDefinition() is called for the first time
    private $ClassDefinition = false ;

    // Direct access to field definitions either through their template name or PDF name ;
    // the values are indexes in the $FieldDefinitions array
    private $FieldDefinitionsByName = array ( ) ;
    private $FieldDefinitionsByPdfName = array ( ) ;


    /*--------------------------------------------------------------------------------------------------------------

        Constructor -
            Analyze the contents of an XML template form definition.

        PARAMETERS
            $class_name (string) -
                Name of the class that GetClassDefinition() will generate.

            $form_definition (object) -
                The <form> node ; it must provide the attributes(), children() and getName() methods
                (presumably a SimpleXMLElement - to be confirmed against the caller).

            $pdf_definitions (array) -
                Field definitions coming from the PDF file, indexed by PDF field name.

        NOTES
            Every inconsistency found in the template is reported through error() with a
            PdfToTextFormException.

     *-------------------------------------------------------------------------------------------------------------*/
    public function __construct ( $class_name, $form_definition, $pdf_definitions )
    {
        $this -> ClassName = $class_name ;
        $this -> PdfDefinitions = $pdf_definitions ;
        $field_count = 0 ;

        // Get <form> tag attributes ; only "version" is allowed
        foreach ( $form_definition -> attributes ( ) as $attribute_name => $attribute_value )
        {
            switch ( strtolower ( $attribute_name ) )
            {
                case 'version' :
                    $this -> Version = ( string ) $attribute_value ;
                    break ;

                default :
                    error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <form> tag." ) ) ;
            }
        }

        // Loop through subtags
        foreach ( $form_definition -> children ( ) as $child )
        {
            $tag_name = $child -> getName ( ) ;

            switch ( strtolower ( $tag_name ) )
            {
                // <group> :
                //     A group is used to create a property that is the concatenation of several existing properties.
                case 'group' :
                    $this -> Groups [] = $this -> __parse_group ( $child ) ;
                    break ;

                // <field> :
                //     Field definition. Both lookup tables receive the index of the new definition.
                case 'field' :
                    $field_def = new PdfToTextFormFieldDefinition ( $child ) ;
                    $this -> FieldDefinitions [] = $field_def ;
                    $this -> FieldDefinitionsByName [ $field_def -> Name ] = $field_count ;
                    $this -> FieldDefinitionsByPdfName [ $field_def -> PdfName ] = $field_count ;
                    $field_count ++ ;
                    break ;

                // Don't allow other tag names
                default :
                    error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <form> definition." ) ) ;
            }
        }

        // Check that everything is ok (ie, that there is no duplicate fields)
        $this -> __paranoid_checks ( ) ;
    }


    // __parse_group -
    //     Analyzes the attributes of a <group> tag and returns the corresponding entry for the $Groups array
    //     (an associative array with the 'name', 'separator' and 'fields' keys).
    //     The "name" and "fields" attributes are mandatory ; "separator" defaults to an empty string.
    private function __parse_group ( $group_node )
    {
        $fields = array ( ) ;
        $separator = '' ;
        $name = false ;

        foreach ( $group_node -> attributes ( ) as $attribute_name => $attribute_value )
        {
            switch ( $attribute_name )
            {
                // "name" attribute :
                //     The name of the property, as it will appear in the output object.
                case 'name' :
                    $name = PdfToTextObjectBase::ValidatePhpName ( ( string ) $attribute_value ) ;
                    break ;

                // "separator" attribute :
                //     Separator to be used when concatenating the underlying properties.
                case 'separator' :
                    $separator = ( string ) $attribute_value ;
                    break ;

                // "fields" :
                //     A list of comma-separated field names, whose values will be concatenated together
                //     using the specified separator.
                //     (explode() always returns at least one item, so emptiness is only checked after the loop)
                case 'fields' :
                    foreach ( explode ( ',', ( string ) $attribute_value ) as $item )
                        $fields [] = PdfToTextObjectBase::ValidatePhpName ( $item ) ;

                    break ;

                // Other attribute names : not allowed
                default :
                    error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <group> tag." ) ) ;
            }
        }

        // Check that at least one field has been specified
        if ( ! count ( $fields ) )
            error ( new PdfToTextFormException ( "Empty \"fields\" attribute in <group> tag." ) ) ;

        // Check that the mandatory property name has been specified
        if ( ! $name )
            error ( new PdfToTextFormException ( "The \"name\" attribute is mandatory in <group> tag." ) ) ;

        return
            (
                array
                (
                    'name'      => $name,
                    'separator' => $separator,
                    'fields'    => $fields
                )
            ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            GetClassDefinition - Returns the class definition for the current form.

        PROTOTYPE
            $def = $form_def -> GetClassDefinition ( ) ;

        DESCRIPTION
            Returns a string containing the PHP class definition that will contain the properties defined in the XML
            form template : one constant per distinct <case>/<default> constant, one protected property per field
            and one protected property per group. The generated class extends PdfToTextFormData.

        RETURN VALUE
            Returns a string containing the PHP class definition for the current form. The string is built on the
            first call and cached for the subsequent ones.

     *-------------------------------------------------------------------------------------------------------------*/
    public function GetClassDefinition ( )
    {
        // Return the existing definition, if this method has been called more than once
        if ( $this -> ClassDefinition )
            return ( $this -> ClassDefinition ) ;

        $class_def = "// Class " . $this -> ClassName . " : " . $this -> Version . PHP_EOL .
                     "class {$this -> ClassName}\t\textends PdfToTextFormData" . PHP_EOL .
                     " {" . PHP_EOL ;

        // Get the maximum width of constant and field names (used to align constant definitions)
        $max_width = 0 ;

        foreach ( $this -> FieldDefinitions as $def )
        {
            $max_width = max ( $max_width, strlen ( $def -> Name ), strlen ( $def -> PdfName ) ) ;

            foreach ( $def -> Constants as $constant )
                $max_width = max ( $max_width, strlen ( $constant [ 'name' ] ) ) ;
        }

        // First, write out the constant definitions ; a constant shared by several fields is written only once
        $all_constants = array ( ) ;

        foreach ( $this -> FieldDefinitions as $def )
        {
            foreach ( $def -> Constants as $constant )
            {
                $name = $constant [ 'name' ] ;
                $value = $constant [ 'value' ] ;

                if ( isset ( $all_constants [ $name ] ) )
                {
                    if ( $all_constants [ $name ] != $value )
                        error ( new PdfToTextFormException ( "Constant \"$name\" is defined more than once with different values." ) ) ;

                    continue ;
                }

                $all_constants [ $name ] = $value ;
                $class_def .= "\tconst\t" . str_pad ( $name, $max_width, " ", STR_PAD_RIGHT ) .
                              "\t = " . self::__php_literal ( $value ) . " ; " . PHP_EOL ;
            }
        }

        $class_def .= PHP_EOL . PHP_EOL ;

        // Then write property definitions
        foreach ( $this -> FieldDefinitions as $def )
        {
            $class_def .= "\t/** @formdata */" . PHP_EOL .
                          "\tprotected\t\t\${$def -> Name} ;" . PHP_EOL ;
        }

        $class_def .= PHP_EOL . PHP_EOL ;

        // And finally, grouped properties ; closing parentheses of the separator are escaped because the
        // doc comment uses them as delimiters
        foreach ( $this -> Groups as $group )
        {
            $class_def .= "\t/**" . PHP_EOL .
                          "\t\t@formdata" . PHP_EOL .
                          "\t\t@group(" . implode ( ',', $group [ 'fields' ] ) . ')' . PHP_EOL .
                          "\t\t@separator(" . str_replace ( ')', '\)', $group [ 'separator' ] ) . ')' . PHP_EOL .
                          "\t */" . PHP_EOL .
                          "\tprotected\t\t\${$group [ 'name' ]} ;" . PHP_EOL .PHP_EOL ;
        }

        // Constructor
        $class_def .= PHP_EOL . PHP_EOL .
                      "\t// Class constructor" . PHP_EOL .
                      "\tpublic function __construct ( )" . PHP_EOL .
                      "\t {" . PHP_EOL .
                      "\t\tparent::__construct ( ) ;" . PHP_EOL .
                      "\t }" . PHP_EOL ;

        $class_def .= " }" . PHP_EOL ;

        // Save the definition, if a second call occurs
        $this -> ClassDefinition = $class_def ;

        // All done, return
        return ( $class_def ) ;
    }


    // __php_literal -
    //     Returns the PHP source representation of a constant value for the generated class.
    //     Numeric strings are written as is, except when they start with a leading zero followed by a digit
    //     ("08" would be a parse error, "010" would silently become octal 8) ; everything else goes through
    //     var_export(), which yields a single-quoted literal where neither '$' nor '"' can be interpreted
    //     when the generated code is evaluated.
    private static function __php_literal ( $value )
    {
        if ( is_numeric ( $value ) && ! preg_match ( '/^\s*[+-]?0\d/', $value ) )
            return ( $value ) ;

        return ( var_export ( ( string ) $value, true ) ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            GetFormData - Returns a form data object containing properties mapped to the form data.

        PROTOTYPE
            $object = $form_def -> GetFormData ( $fields ) ;

        DESCRIPTION
            Returns an object containing properties mapped to actual form data.

        PARAMETERS
            $fields (array) -
                An associative array whose keys are the PDF form field names, and values their values as stored
                in the PDF file. Keys that do not match a PDF field name of the template are ignored.

        RETURN VALUE
            Returns an object of the class, as defined by the template specified to PdfToTextFormDefinitions
            class constructor.

     *-------------------------------------------------------------------------------------------------------------*/
    public function GetFormData ( $fields = array ( ) )
    {
        // NOTE(review): the class is created by evaluating generated source code ; the template supplying
        // the class, field and constant names must therefore come from a trusted source
        if ( ! class_exists ( $this -> ClassName, false ) )
        {
            $class_def = $this -> GetClassDefinition ( ) ;
            eval ( $class_def ) ;
        }

        $class_name = $this -> ClassName ;
        $object = new $class_name ( ) ;

        foreach ( $fields as $name => $value )
        {
            if ( ! isset ( $this -> FieldDefinitionsByPdfName [ $name ] ) )
                continue ;

            $index = $this -> FieldDefinitionsByPdfName [ $name ] ;
            $property = $this -> FieldDefinitions [ $index ] -> Name ;
            $object -> $property = $this -> __process_field_value ( $value ) ;
        }

        return ( $object ) ;
    }


    // __process_field_value -
    //     Translates html entities and removes carriage returns (which are apparently used for multiline field) to
    //     replace them with newlines : "\r\n" and lone "\r" both become "\n".
    private function __process_field_value ( $value )
    {
        $value = html_entity_decode ( $value ) ;

        // "\r\n" must be handled first, otherwise it would yield two newlines
        return ( str_replace ( array ( "\r\n", "\r" ), "\n", $value ) ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            GetFormDataFromPdfObject - Same as GetFormData(), except that it operates on XML data.

        PROTOTYPE
            $object = $pdf -> GetFormDataFromPdfObject ( $pdf_data ) ;

        DESCRIPTION
            Behaves the same as GetFormData(), except that it takes as input the XML contents of a PDF object.

        PARAMETERS
            $pdf_data (string) -
                XML data coming from the PDF file.

        RETURN VALUE
            Returns an object of the class, as defined by the template specified to PdfToTextFormDefinitions
            class constructor.

        NOTES
            XML data that cannot be parsed is reported through error() with a PdfToTextFormException.

     *-------------------------------------------------------------------------------------------------------------*/
    protected function GetFormDataFromPdfObject ( $pdf_data )
    {
        // simplexml_ functions do not like tags that contain a colon - replace them with a dash
        // (the first colon found after each '<' is rewritten)
        $pdf_data = preg_replace ( '/(<[^:]+?)(:)/', '$1-', $pdf_data ) ;

        // Load the xml data ; simplexml_load_string() returns false when the data is not well-formed
        $xml = simplexml_load_string ( $pdf_data ) ;

        if ( $xml === false )
            error ( new PdfToTextFormException ( "Invalid XML form data." ) ) ;

        // Get the form field values
        $fields = array ( ) ;

        $this -> __get_pdfform_data ( $fields, $xml ) ;

        // Return the object
        return ( $this -> GetFormData ( $fields ) ) ;
    }


    // __get_pdfform_data -
    //     Retrieve the form field values from the specified PDF object, specified as XML.
    //     A node whose tag name is a known PDF field name supplies a value (its text contents) and is not
    //     searched any further ; any other node is searched recursively.
    private function __get_pdfform_data ( &$fields, $xml )
    {
        $tag_name = $xml -> getName ( ) ;

        if ( isset ( $this -> PdfDefinitions [ $tag_name ] ) )
        {
            $fields [ $tag_name ] = ( string ) $xml ;
            return ;
        }

        foreach ( $xml -> children ( ) as $child )
            $this -> __get_pdfform_data ( $fields, $child ) ;
    }


    // __paranoid_checks -
    //     Checks for several kinds of inconsistencies in the supplied XML template :
    //     - Fields referencing a PDF field that is not defined in the PDF file
    //     - Duplicate field names or PDF field names
    //     - Constants defined several times with different values
    //     - Duplicate group names, groups named after a field, groups referencing unknown fields
    private function __paranoid_checks ( )
    {
        // Check that field names, PDF field names and constant names are unique
        $names = array ( ) ;
        $pdf_names = array ( ) ;
        $constant_names = array ( ) ;

        foreach ( $this -> FieldDefinitions as $def )
        {
            if ( ! isset ( $this -> PdfDefinitions [ $def -> PdfName ] ) )
                error ( new PdfToTextFormException ( "Field \"{$def -> PdfName}\" is not defined in the PDF file." ) ) ;

            if ( isset ( $names [ $def -> Name ] ) )
                error ( new PdfToTextFormException ( "Field \"{$def -> Name}\" is defined more than once." ) ) ;

            $names [ $def -> Name ] = true ;

            if ( isset ( $pdf_names [ $def -> PdfName ] ) )
                error ( new PdfToTextFormException ( "PDF Field \"{$def -> PdfName}\" is referenced more than once." ) ) ;

            $pdf_names [ $def -> PdfName ] = true ;

            foreach ( $def -> Constants as $constant )
            {
                $constant_name = $constant [ 'name' ] ;

                if ( isset ( $constant_names [ $constant_name ] ) && $constant_names [ $constant_name ] != $constant [ 'value' ] )
                    error ( new PdfToTextFormException ( "Constant \"$constant_name\" is defined more than once with different values." ) ) ;

                $constant_names [ $constant_name ] = $constant [ 'value' ] ;
            }
        }

        // Check that group names are unique and that the fields they are referencing exist
        $group_names = array ( ) ;

        foreach ( $this -> Groups as $group )
        {
            $group_name = $group [ 'name' ] ;

            if ( isset ( $group_names [ $group_name ] ) )
                error ( new PdfToTextFormException ( "Group \"{$group [ 'name' ]}\" is defined more than once." ) ) ;

            // Fixed : group names were never recorded, so the duplicate check above could never trigger
            $group_names [ $group_name ] = true ;

            if ( isset ( $names [ $group_name ] ) )
                error ( new PdfToTextFormException ( "Group \"{$group [ 'name' ]}\" has the same name as an existing field." ) ) ;

            foreach ( $group [ 'fields' ] as $field_name )
            {
                if ( ! isset ( $names [ $field_name ] ) )
                    error ( new PdfToTextFormException ( "Field \"$field_name\" of group \"{$group [ 'name' ]}\" does not exist." ) ) ;
            }
        }
    }
}
10833
10834
10835/*==============================================================================================================
10836
10837 class PdfToTextFormFieldDefinition -
10838 Contains an XML template form field definition.
10839
10840 ==============================================================================================================*/
class PdfToTextFormFieldDefinition      // extends Object
{
    // Supported field types
    const TYPE_STRING = 1 ;     // String
    const TYPE_CHOICE = 2 ;     // Choice (must have <case> or <default> subtags)

    // Official name (as it will appear in the class based on the XML template)
    public $Name = false ;
    // Field name, as specified in the input PDF file
    public $PdfName = false ;
    // Field type (one of the TYPE_* constants)
    public $Type = self::TYPE_STRING ;
    // Available constant values for this field when the "type" attribute has the value "choice" ;
    // each entry is an associative array with the 'name', 'value' and 'default' keys
    public $Constants = array ( ) ;


    /*--------------------------------------------------------------------------------------------------------------

        Constructor -
            Builds the field definition object from a <field> node.

        PARAMETERS
            $field_node (object) -
                The <field> node ; it must provide the attributes(), children() and getName() methods
                (presumably a SimpleXMLElement - to be confirmed against the caller).

        NOTES
            Errors are reported through error() with a PdfToTextFormException. Attributes of the <field> tag
            other than "name", "form-field" and "type" are silently ignored.

     *-------------------------------------------------------------------------------------------------------------*/
    public function __construct ( $field_node )
    {
        // Loop through attributes
        foreach ( $field_node -> attributes ( ) as $attribute_name => $attribute_value )
        {
            switch ( strtolower ( $attribute_name ) )
            {
                // "name" attribute :
                //     Specifies the field name as it will appear in the output class. Must be a valid PHP name.
                //     Fixed : ValidatePhpName() was called on PdfToTextFormDefinition, which does not define it ;
                //     group names are validated through PdfToTextObjectBase, and so are field names now.
                case 'name' :
                    $this -> Name = PdfToTextObjectBase::ValidatePhpName ( ( string ) $attribute_value ) ;
                    break ;

                // "form-field" attribute :
                //     Corresponding field name in the input PDF form.
                case 'form-field' :
                    $this -> PdfName = ( string ) $attribute_value ;
                    break ;

                // "type" :
                //     Field type. Can be either :
                //     - "string" :
                //          The field value can be any type of string.
                //     - "choice" :
                //          The field value has one of the values defined by the <case> or <default> subtags.
                case 'type' :
                    switch ( strtolower ( ( string ) $attribute_value ) )
                    {
                        case 'string' :
                            $this -> Type = self::TYPE_STRING ;
                            break ;

                        case 'choice' :
                            $this -> Type = self::TYPE_CHOICE ;
                            break ;

                        default :
                            error ( new PdfToTextFormException ( "Invalid value \"$attribute_value\" for the \"$attribute_name\" attribute of the <field> tag." ) ) ;
                    }

                    break ;
            }
        }

        // The "name" and "form-field" attributes are mandatory
        if ( ! $this -> Name )
            error ( new PdfToTextFormException ( "The \"name\" attribute is mandatory for the <field> tag." ) ) ;

        if ( ! $this -> PdfName )
            error ( new PdfToTextFormException ( "The \"form-field\" attribute is mandatory for the <field> tag." ) ) ;

        // Only "type=choice" entries have <case> or <default> subtags to look for
        if ( $this -> Type !== self::TYPE_CHOICE )
            return ;

        foreach ( $field_node -> children ( ) as $child )
        {
            $tag_name = $child -> getName ( ) ;
            $is_default = false ;

            switch ( strtolower ( $tag_name ) )
            {
                // <default> :
                //     Default value to be used when no PDF field value matches the defined constants.
                //     Falls through on purpose : apart from the 'default' flag, it is processed like <case>.
                case 'default' :
                    $is_default = true ;

                // <case> :
                //     Maps a value to constant name that will be defined in the generated class.
                case 'case' :
                    // Fixed : the value used to be initialized to an empty string, which made the
                    // "missing value" check below unreachable
                    $constant_value = false ;
                    $constant_name = false ;

                    // Retrieve attributes
                    foreach ( $child -> attributes ( ) as $attribute_name => $attribute_value )
                    {
                        switch ( strtolower ( $attribute_name ) )
                        {
                            // "value" attribute :
                            //     PDF form field value.
                            case 'value' :
                                $constant_value = ( string ) $attribute_value ;
                                break ;

                            // "constant" attribute :
                            //     Associated constant.
                            case 'constant' :
                                $constant_name = PdfToTextObjectBase::ValidatePhpName ( ( string ) $attribute_value ) ;
                                break ;

                            // Bail out if any unrecognized attribute has been specified
                            // (the message used to complain about an invalid tag instead of the attribute)
                            default :
                                error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <$tag_name> tag." ) ) ;
                        }
                    }

                    // Each <case> entry must have a "value" attribute ; it is optional for <default>
                    if ( $constant_value === false && ! $is_default )
                        error ( new PdfToTextFormException ( "Missing constant value in <case> tag." ) ) ;

                    // A <default> tag without a value keeps the empty string it always had
                    if ( $constant_value === false )
                        $constant_value = "" ;

                    // The "constant" attribute is mandatory for both tags
                    if ( $constant_name === false )
                        error ( new PdfToTextFormException ( "Attribute \"constant\" is required for <$tag_name> tag." ) ) ;

                    // Add this to the list of existing constants
                    $this -> Constants [] = array
                       (
                        'name'      => $constant_name,
                        'value'     => $constant_value,
                        'default'   => $is_default
                       ) ;

                    break ;

                // Check for unrecognized tags
                default :
                    error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <field> definition." ) ) ;
            }
        }
    }
}
10981
10982
10983/*==============================================================================================================
10984
10985 class PdfToTextFormData -
10986 Base class for all Pdf form templates data.
10987
10988 ==============================================================================================================*/
class PdfToTextFormData         // extends Object
{
    // Doc comments provide information about form data fields (mainly to handle grouped field values)
    // The $__Properties array gives information about the form data fields themselves ; it is indexed by
    // property name, and each entry has the following keys :
    // - 'name'      : property name
    // - 'fields'    : array of grouped field names, or false for a plain field
    // - 'separator' : separator string specified by the @separator tag, or false if the tag is absent
    private $__Properties = array ( ) ;


    /*--------------------------------------------------------------------------------------------------------------

        Constructor -
            Retrieve information about the derived class properties, which are specified by the derived class
            generated on the fly.
            Only the properties whose doc comment contains the @formdata tag are taken into account. A doc
            comment may additionally contain :
            . @group(field_list) : list of fields grouped for this property
            . @separator(string) : a separator used when catenating grouped fields ; closing parentheses
                                   inside the string are expected to be escaped with a backslash

     *-------------------------------------------------------------------------------------------------------------*/
    public function __construct ( )
    {
        $reflection = new ReflectionClass ( $this ) ;

        foreach ( $reflection -> getProperties ( ) as $property )
        {
            $doc_comment = $property -> getDocComment ( ) ;

            // Ignore non-formdata properties
            if ( ! $doc_comment || strpos ( $doc_comment, '@formdata' ) === false )
                continue ;

            $propname = $property -> getName ( ) ;
            $fields = false ;
            $separator = false ;

            // @group(fields) pattern ; surrounding spaces around field names are not significant
            if ( preg_match ( '/group \s* \( \s* (?P<fields> [^)]+) \)/imsx', $doc_comment, $match ) )
                $fields = array_map ( 'trim', explode ( ',', $match [ 'fields' ] ) ) ;

            // @separator(string) pattern.
            // Fixed : the quantifier used to be applied to the named group itself, which only retained the
            // last character of the separator (", " was read back as " ") ; leading spaces were dropped too.
            // The group now captures the whole string, made of escaped characters or anything but ')'.
            if ( preg_match ( '/separator \s* \( (?P<separator> (?: \\\\ . | [^)\\\\] )* ) \)/isx', $doc_comment, $match ) )
                $separator = stripslashes ( $match [ 'separator' ] ) ;

            // Property belongs to the form - add it to the list of available properties
            $this -> __Properties [ $propname ] = array
               (
                'name'      => $propname,
                'fields'    => $fields,
                'separator' => $separator
               ) ;
        }
    }


    /*--------------------------------------------------------------------------------------------------------------

        __get -
            Returns the underlying property value for this PDF data field.
            A warning() is issued with a PdfToTextFormException when the property is not a form data field ;
            the property is read anyway.

     *-------------------------------------------------------------------------------------------------------------*/
    public function __get ( $member )
    {
        if ( ! isset ( $this -> __Properties [ $member ] ) )
            warning ( new PdfToTextFormException ( "Undefined property \"$member\"." ) ) ;

        return ( $this -> $member ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        __set -
            Sets the underlying property value for this PDF data field.
            - When the property is a plain field, every compound property that includes it is rebuilt.
            - When the property is a compound one, the value is split using the property separator and the
              individual members are set as well ; a value that does not have exactly one part per field is
              reported through error().
            - Properties that are not form data fields are simply assigned.

     *-------------------------------------------------------------------------------------------------------------*/
    public function __set ( $member, $value )
    {
        // Property does not exist : let PHP act as the default way
        if ( ! isset ( $this -> __Properties [ $member ] ) )
        {
            $this -> $member = $value ;
            return ;
        }

        $prop_entry = $this -> __Properties [ $member ] ;

        // Non-compound property
        if ( ! $prop_entry [ 'fields' ] )
        {
            $this -> $member = $value ;

            // However, we have to check that this property belongs to a compound property and change
            // the compound property value accordingly
            foreach ( $this -> __Properties as $name => $property )
            {
                if ( ! $property [ 'fields' ] || ! in_array ( $member, $property [ 'fields' ], true ) )
                    continue ;

                $values = array ( ) ;

                // (a dedicated loop variable is used here ; the $value parameter used to be overwritten)
                foreach ( $property [ 'fields' ] as $field_name )
                    $values [] = $this -> $field_name ;

                // Change compound property value accordingly, using the specified separator
                // (an absent separator is handled as an empty string)
                $this -> $name = implode ( ( string ) $property [ 'separator' ], $values ) ;
            }

            return ;
        }

        // Compound property : we will have to explode it in separate parts, using the compound property separator,
        // then set individual property values
        $values = explode ( $prop_entry [ 'separator' ], $value ) ;
        $value_count = count ( $values ) ;
        $field_count = count ( $prop_entry [ 'fields' ] ) ;

        if ( $value_count < $field_count )
            error ( new PdfToTextFormException ( "Not enough value parts specified for the \"$member\" property ($value)." ) ) ;
        else if ( $value_count > $field_count )
            error ( new PdfToTextFormException ( "Too much value parts specified for the \"$member\" property ($value)." ) ) ;

        $this -> $member = $value ;

        for ( $i = 0 ; $i < $value_count ; $i ++ )
        {
            $sub_member = $prop_entry [ 'fields' ] [$i] ;
            $this -> $sub_member = $values [$i] ;
        }
    }
}
11140
11141
11142/**************************************************************************************************************
11143 **************************************************************************************************************
11144 **************************************************************************************************************
11145 ****** ******
11146 ****** ******
11147 ****** CAPTURE DEFINITION MANAGEMENT ******
11148 ****** (none of the classes listed here are meant to be instantiated outside this file) ******
11149 ****** ******
11150 ****** ******
11151 **************************************************************************************************************
11152 **************************************************************************************************************
11153 **************************************************************************************************************/
11154
11155/*==============================================================================================================
11156
11157 class PdfToTextCaptureDefinitions -
11158 Holds text capture definitions, whose XML data has been supplied to the PdfToText::SetCapture() method.
11159
11160 ==============================================================================================================*/
11161class PdfToTextCaptureDefinitions // extends Object
11162 implements ArrayAccess, Countable, Iterator
11163 {
11164 // Shape definitions - The actual objects populating this array depend on the definitions supplied
11165 // (rectangle, etc.)
11166 protected $ShapeDefinitions = array ( ) ;
11167
11168 // Shape field names - used for iteration
11169 private $ShapeNames ;
11170
11171 // Page count
11172 private $PageCount = false ;
11173
11174
11175 /*--------------------------------------------------------------------------------------------------------------
11176
11177 CONSTRUCTOR -
11178 Analyzes the XML data defining the areas to be captured.
11179
11180 *-------------------------------------------------------------------------------------------------------------*/
11181 public function __construct ( $xml_data )
11182 {
11183 $xml = simplexml_load_string ( $xml_data ) ;
11184 $root_entry = $xml -> getName ( ) ;
11185
11186 // Root tag must be <captures>
11187 if ( strcasecmp ( $root_entry, "captures" ) )
11188 error ( new PdfToTextCaptureException ( "Root entry must be <captures>, <$root_entry> was found." ) ) ;
11189
11190 // Process the child nodes
11191 foreach ( $xml -> children ( ) as $child )
11192 {
11193 $tag_name = $child -> getName ( ) ;
11194
11195 switch ( strtolower ( $tag_name ) )
11196 {
11197 // <rectangle> :
11198 // An rectangle whose dimensions are given in the <page> subtags.
11199 case 'rectangle' :
11200 $shape_object = new PdfToTextCaptureRectangleDefinition ( $child ) ;
11201 break ;
11202
11203 // <columns> :
11204 // A definition of columns and their applicable pages.
11205 case 'lines' :
11206 $shape_object = new PdfToTextCaptureLinesDefinition ( $child ) ;
11207 break ;
11208
11209 // Complain if an unknown tag is found
11210 default :
11211 error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <captures>." ) ) ;
11212 }
11213
11214 // Shape names must be unique within the definitinos
11215 if ( isset ( $this -> ShapeDefinitions [ $shape_object -> Name ] ) )
11216 error ( new PdfToTextCaptureLinesDefinition ( "The shape named \"{$shape_object -> Name}\" has been defined more than once." ) ) ;
11217 else
11218 $this -> ShapeDefinitions [ $shape_object -> Name ] = $shape_object ;
11219 }
11220
11221 // Build an array of shape names for the iterator interface
11222 $this -> ShapeNames = array_keys ( $this -> ShapeDefinitions ) ;
11223 }
11224
11225
11226 /*--------------------------------------------------------------------------------------------------------------
11227
11228 NAME
11229 GetCapturedObject - Creates an object reflecting the captured data.
11230
11231 PROTOTYPE
11232 $captures = $capture_definitions -> GetCapturedObject ( $document_fragments ) ;
11233
11234 DESCRIPTION
 Returns an object of type PdfToTextCaptures, containing the data that has been captured, based on
 the capture definitions.
11237
11238 PARAMETERS
11239 $document_fragments (type) -
11240 Document text fragments collected during the text layout rendering process.
11241
11242 RETURN VALUE
 An object of type PdfToTextCaptures, containing the captured data.
11244
11245 *-------------------------------------------------------------------------------------------------------------*/
11246 public function GetCapturedObject ( $document_fragments )
11247 {
11248 $captures = array ( ) ;
11249
11250 foreach ( $this -> ShapeDefinitions as $shape )
11251 {
11252 $capture = $shape -> ExtractAreas ( $document_fragments ) ;
11253
11254 foreach ( $capture as $page => $items )
11255 {
11256 $captures [ $page ] [] = $items ;
11257 }
11258 }
11259
11260 $captured_object = new PdfToTextCaptures ( $captures ) ;
11261
11262 return ( $captured_object ) ;
11263 }
11264
11265
11266 /*--------------------------------------------------------------------------------------------------------------
11267
11268 NAME
11269 SetPageCount - Defines the total number of pages in the document.
11270
11271 PROTOTYPE
11272 $shape -> SetPageCount ( $count ) ;
11273
11274 DESCRIPTION
11275 At the time when XML definitions are processed, the total number of pages in the document is not yet
11276 known. Moreover, page ranges or page numbers can be expressed relative to the last page of the
11277 document (for example : 1..$-1, which means "from the first page to the last page - 1).
11278 Setting the page count once it is known allows to process the expressions specified in the "number"
11279 attribute of the <pages> tag so that the expressions are transformed into actual page numbers.
11280
11281 PARAMETERS
11282 $count (integer) -
11283 Number of pages in the document.
11284
11285 *-------------------------------------------------------------------------------------------------------------*/
public function SetPageCount($count)
{
    // Keep the page count on this object...
    $this->PageCount = $count;

    // ... and forward it to every shape definition, which has its own SetPageCount() method
    foreach ($this->ShapeDefinitions as $shape_definition) {
        $shape_definition->SetPageCount($count);
    }
}
11295
11296
11297 /*--------------------------------------------------------------------------------------------------------------
11298
11299 NAME
11300 GetNodeAttributes - Retrieves an XML node's attributes.
11301
11302 PROTOTYPE
11303 $result = PdfToTextCaptureDefinitions::GetNodeAttributes ( $node, $attributes ) ;
11304
11305 DESCRIPTION
11306 Retrieves the attributes defined for the specified XML node.
11307
11308 PARAMETERS
11309 $node (SimpleXMLElement) -
11310 Node whose attributes are to be extracted.
11311
11312 $attributes (associative array) -
11313 Associative array whose keys are the attribute names and whose values define a boolean
11314 indicating whether the attribute is mandatory or not.
11315
11316 RETURN VALUE
11317 Returns an associative array whose keys are the attribute names and whose values are the attribute values,
11318 specified as a string.
11319 For optional unspecified attributes, the value will be boolean false.
11320
11321 NOTES
11322 The method throws an exception if the node contains an unknown attribute, or if a mandatory attribute
11323 is missing.
11324
11325 *-------------------------------------------------------------------------------------------------------------*/
public static function GetNodeAttributes($node, $attributes)
{
    $tag_name = $node->getName();

    // Every declared attribute starts out as "not specified" (boolean false)
    $result = array();

    foreach ($attributes as $name => $mandatory) {
        $result[$name] = false;
    }

    // Collect the attributes present on the node ; names are compared in lowercase
    foreach ($node->attributes() as $attribute_name => $attribute_value) {
        $attribute_name = strtolower($attribute_name);

        if (isset($attributes[$attribute_name])) {
            // Known attribute : store its value as a string
            $result[$attribute_name] = (string) $attribute_value;
        } else {
            // Unknown attribute : report it through a capture exception. The original code
            // instantiated the PdfToTextCaptureLinesDefinition shape class here, whose
            // constructor expects an XML node and not a message.
            error(new PdfToTextCaptureException("Undefined attribute \"$attribute_name\" for node <$tag_name>."));
        }
    }

    // Every attribute flagged as mandatory must have received a value
    foreach ($attributes as $attribute_name => $mandatory) {
        if ($mandatory && $result[$attribute_name] === false) {
            error(new PdfToTextCaptureException("Missing mandatory attribute \"$attribute_name\" for node <$tag_name>."));
        }
    }

    return $result;
}
11359
11360
11361 /*--------------------------------------------------------------------------------------------------------------
11362
11363 NAME
11364 GetBooleanAttribute - Returns a boolean value associated to a string.
11365
11366 PROTOTYPE
11367 $bool = PdfToTextCaptureDefinitions::GetBooleanAttribute ( $value ) ;
11368
11369 DESCRIPTION
11370 Returns a boolean value corresponding to a boolean specified as a string.
11371
11372 PARAMETERS
11373 $value (string) -
11374 A boolean value represented as a string.
11375 The strings 'true', 'yes', 'on' and '1' will be interpreted as boolean true.
11376 The strings 'false', 'no', 'off' and '0' will be interpreted as boolean false.
11377
11378 RETURN VALUE
11379 The boolean value corresponding to the specified string.
11380
11381 NOTES
11382 An exception is thrown if the supplied string is incorrect.
11383
11384 *-------------------------------------------------------------------------------------------------------------*/
public static function GetBooleanAttribute($value)
{
    // Genuine booleans need no interpretation
    if (is_bool($value)) {
        return $value;
    }

    // Keywords are matched case-insensitively ; strict comparison keeps look-alike
    // numeric strings such as "1.0" or "01" from being accepted
    $lcvalue = strtolower($value);

    if (in_array($lcvalue, array('true', 'on', 'yes', '1'), true)) {
        return true;
    }

    if (in_array($lcvalue, array('false', 'off', 'no', '0'), true)) {
        return false;
    }

    // Anything else is invalid. The original code instantiated the
    // PdfToTextCaptureLinesDefinition shape class here instead of an exception.
    error(new PdfToTextCaptureException("Invalid boolean value \"$value\"."));
}
11396
11397
11398 /*--------------------------------------------------------------------------------------------------------------
11399
11400 Interfaces implementations.
11401
11402 *-------------------------------------------------------------------------------------------------------------*/
11403
11404 // Countable interface
public function count()
{
    // Number of entries in the ShapeDefinitions property
    $definition_count = count($this->ShapeDefinitions);

    return $definition_count;
}
11407
11408
11409 // ArrayAccess interface
// Tells whether a shape definition is stored under the given offset
public function offsetExists($offset)
{
    return isset($this->ShapeDefinitions[$offset]);
}


// Returns the shape definition stored under the given offset (no existence check is made)
public function offsetGet($offset)
{
    $definition = $this->ShapeDefinitions[$offset];

    return $definition;
}


// Writing through array access is not supported
public function offsetSet($offset, $value)
{
    error(new PdfToTextException("Unsupported operation"));
}


// Removing through array access is not supported either
public function offsetunset($offset)
{
    error(new PdfToTextException("Unsupported operation"));
}
11424
11425
11426 // Iterator interface -
11427 // Iteration is made through shape names, which are supplied by the $ShapeNames property
// Position of the iterator within the $ShapeNames array
private $__iterator_index = 0;

// Restart the iteration from the first shape name
public function rewind()
{
    $this->__iterator_index = 0;
}

// The position is valid as long as it designates an entry of $ShapeNames
public function valid()
{
    $position = $this->__iterator_index;

    return ($position >= 0 && $position < count($this->ShapeNames));
}

// The key of the current element is the shape name found at the current position
public function key()
{
    return $this->ShapeNames[$this->__iterator_index];
}

// Move on to the next shape name
public function next()
{
    $this->__iterator_index++;
}

// The current element is the shape definition stored under the current shape name
public function current()
{
    $shape_name = $this->ShapeNames[$this->__iterator_index];

    return $this->ShapeDefinitions[$shape_name];
}
11444 }
11445
11446
11447/*==============================================================================================================
11448
11449 class PdfToTextCaptureShapeDefinition -
11450 Base class for capturing shapes.
11451
11452 ==============================================================================================================*/
abstract class PdfToTextCaptureShapeDefinition
{
    // Identifiers of the supported shape kinds
    const SHAPE_RECTANGLE = 1;
    const SHAPE_COLUMN = 2;
    const SHAPE_LINE = 3;

    // Capture name, read from the "name" attribute by GetAttributes()
    public $Name;
    // One of the SHAPE_* constants ; handed over by the derived class through the constructor
    public $Type;
    // PdfToTextCaptureApplicablePages object telling which pages this capture applies to
    public $ApplicablePages;
    // Capture areas, indexed by page number ; filled in by derived classes
    public $Areas = array();
    // Glue inserted between text fragments when several of them are covered by the same shape
    public $Separator = " ";


    /*
     * Constructor -
     *     Records the shape type and creates an empty list of applicable pages.
     */
    public function __construct($type)
    {
        $this->ApplicablePages = new PdfToTextCaptureApplicablePages();
        $this->Type = $type;
    }


    /*
     * SetPageCount -
     *     Forwards the document page count to the applicable pages object.
     *     Derived classes may override this method when extra work is needed.
     */
    public function SetPageCount($count)
    {
        $this->ApplicablePages->SetPageCount($count);
    }


    /*
     * GetFragmentData -
     *     Unpacks a text fragment into its text and bounding coordinates, which are
     *     returned through the reference parameters.
     */
    protected function GetFragmentData($fragment, &$text, &$left, &$top, &$right, &$bottom)
    {
        $text = $fragment['text'];

        // Horizontal extent : the right edge is the left edge plus the width, minus one
        $left = (double) $fragment['x'];
        $right = $left + (double) $fragment['width'] - 1;

        // Vertical extent : the bottom is obtained by subtracting the font height from the top
        $top = (double) $fragment['y'];
        $bottom = $top - (double) $fragment['font-height'];
    }


    /*
     * GetAttributes -
     *     Retrieves the attributes of the given XML node and processes the ones common to all shapes :
     *     - "name" (mandatory), stored in the Name property
     *     - "separator" (optional), unescaped and stored in the Separator property
     *     Returns the whole attribute array so that derived classes can process their own attributes.
     */
    protected function GetAttributes($node, $attributes = array())
    {
        // The common attributes are merged last, so they take precedence over same-named caller entries
        $common_attributes = array('name' => true, 'separator' => false);
        $parsed = PdfToTextCaptureDefinitions::GetNodeAttributes(
            $node,
            array_merge($attributes, $common_attributes)
        );

        $this->Name = $parsed['name'];

        // Boolean false means that the "separator" attribute was not specified
        $separator = $parsed['separator'];

        if ($separator !== false) {
            $this->Separator = PdfToText::Unescape($separator);
        }

        return $parsed;
    }


    /*
     * ExtractAreas -
     *     Extracts text contents from the document fragments ; to be implemented by each shape.
     */
    abstract public function ExtractAreas($document_fragments);
}
11543
11544
11545/*==============================================================================================================
11546
11547 class PdfToTextCaptureRectangleDefinition -
11548 A shape for capturing text in rectangle areas.
11549
11550 ==============================================================================================================*/
class PdfToTextCaptureRectangleDefinition extends PdfToTextCaptureShapeDefinition
{
    /*
     * Constructor -
     *     Analyzes a <rectangle> XML node, whose <page> children give the applicable pages
     *     together with the rectangle coordinates.
     */
    public function __construct($node)
    {
        parent::__construct(self::SHAPE_RECTANGLE);

        $this->GetAttributes($node);

        // Attributes accepted by a <page> child ; true marks the mandatory ones
        $page_attribute_definitions = array(
            'number' => true,
            'left'   => true,
            'right'  => false,
            'top'    => true,
            'bottom' => false,
            'width'  => false,
            'height' => false
        );

        foreach ($node->children() as $child) {
            $tag_name = $child->getName();

            if (strtolower($tag_name) == 'page') {
                // <page> tag : register the page(s) it designates, keeping all of its
                // attributes as extra data
                $page_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes($child, $page_attribute_definitions);
                $this->ApplicablePages->Add($page_attributes['number'], $page_attributes);
            } else {
                // No other tag is allowed here
                error(new PdfToTextCaptureException("Invalid tag <$tag_name> found in root tag <rectangle>."));
            }
        }
    }


    /*
     * ExtractAreas -
     *     Extracts the text located within the rectangle area of each applicable page.
     *     Returns an array of PdfToTextCapturedRectangle objects keyed by page number.
     */
    public function ExtractAreas($document_fragments)
    {
        $captured = array();

        foreach ($document_fragments as $page => $page_contents) {
            $fragments = $page_contents['fragments'];

            // Skip pages that are not present in the page map
            if (!isset($this->ApplicablePages->PageMap[$page])) {
                continue;
            }

            foreach ($fragments as $fragment) {
                $this->GetFragmentData($fragment, $text, $left, $top, $right, $bottom);

                // Fragments rejected by the area's Contains() method are of no interest
                if (!$this->Areas[$page]->Contains($left, $top, $right, $bottom)) {
                    continue;
                }

                if (isset($captured[$page])) {
                    // A capture already exists for this page : extend its bounding box and
                    // append the new text after the separator
                    $merged = $captured[$page];

                    $merged->Top = max($merged->Top, $top);
                    $merged->Bottom = min($merged->Bottom, $bottom);
                    $merged->Left = min($merged->Left, $left);
                    $merged->Right = max($merged->Right, $right);
                    $merged->Text .= $this->Separator . $text;
                } else {
                    // First fragment captured on this page
                    $captured[$page] = new PdfToTextCapturedRectangle($page, $this->Name, $text, $left, $top, $right, $bottom, $this);
                }
            }
        }

        // Add an empty capture for every page yielded by the ApplicablePages iterator
        // that did not capture anything
        $filled_gaps = false;

        foreach ($this->ApplicablePages as $page => $applicable) {
            if (isset($captured[$page])) {
                continue;
            }

            $captured[$page] = new PdfToTextCapturedRectangle($page, $this->Name, '', 0, 0, 0, 0, $this);
            $filled_gaps = true;
        }

        // Entries added above were appended out of order : restore the page ordering
        if ($filled_gaps) {
            ksort($captured);
        }

        return $captured;
    }


    /*
     * SetPageCount -
     *     Creates a capture area for each page that has extra page data ; this can only be done
     *     once the page count is known and the ApplicablePages object updated accordingly.
     */
    public function SetPageCount($count)
    {
        parent::SetPageCount($count);

        foreach ($this->ApplicablePages->ExtraPageMapData as $page => $page_data) {
            $this->Areas[$page] = new PdfToTextCaptureArea($page_data);
        }
    }
}
11689
11690
11691/*==============================================================================================================
11692
11693 class PdfToTextCaptureLinesDefinition -
11694 A shape for capturing lines of text that are split into named columns.
11695
11696 ==============================================================================================================*/
class PdfToTextCaptureLinesDefinition extends PdfToTextCaptureShapeDefinition
{
    // Column definitions (the attributes of each <column> tag), keyed by column name
    public $Columns = array();
    // Per-page values of the "column-top" and "column-bottom" page data, set by SetPageCount()
    public $Tops = array();
    public $Bottoms = array();
    // Column names, in declaration order
    private $ColumnNames = array();


    /*
     * Constructor -
     *     Analyzes the contents of the shape's XML node, which contains <page> nodes giving a part of
     *     the column dimensions, and <column> nodes which specify the name of the column and the
     *     remaining coordinates, such as "left" or "width".
     */
    public function __construct($node)
    {
        parent::__construct(self::SHAPE_COLUMN);

        $shape_attributes = $this->GetAttributes($node, array('default' => false));

        // Compare against false instead of testing truthiness, so that default="0" is honoured
        $column_default = ($shape_attributes['default'] !== false) ? $shape_attributes['default'] : '';

        foreach ($node->children() as $child) {
            $tag_name = $child->getName();

            switch (strtolower($tag_name)) {
                // <page> tag
                case 'page':
                    $page_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes(
                        $child,
                        array(
                            'number' => true,
                            'top'    => true,
                            'height' => true,
                            'bottom' => false
                        )
                    );

                    // The "top" and "bottom" attributes of the <page> tag delimit the vertical zone
                    // where columns are searched. "bottom" is renamed to "column-bottom" so that it
                    // is not mistaken for the bottom of an actual column rectangle (only "height"
                    // gives the height of a line).
                    $page_attributes['column-top'] = $page_attributes['top'];
                    $page_attributes['column-bottom'] = (double) $page_attributes['bottom'];
                    unset($page_attributes['bottom']);

                    $this->ApplicablePages->Add($page_attributes['number'], $page_attributes);
                    break;

                // <column> tag
                case 'column':
                    $column_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes(
                        $child,
                        array(
                            'name'    => true,
                            'left'    => false,
                            'right'   => false,
                            'width'   => false,
                            'default' => false
                        )
                    );

                    $column_name = $column_attributes['name'];

                    // An unspecified or empty column default falls back to the shape-level default.
                    // The explicit tests keep a column default of "0", which a truthiness test discards.
                    if ($column_attributes['default'] === false || $column_attributes['default'] === '') {
                        $column_attributes['default'] = $column_default;
                    }

                    // Special constructs in the default value :
                    // - "%c" is replaced by the column name
                    // - "%n" is replaced by the column index (starting from zero)
                    $column_attributes['default'] = str_replace(
                        array('%c', '%n'),
                        array($column_name, count($this->Columns)),
                        $column_attributes['default']
                    );

                    if (isset($this->Columns[$column_name])) {
                        error(new PdfToTextCaptureException("Column \"$column_name\" is defined more than once."));
                    } else {
                        $this->Columns[$column_name] = $column_attributes;
                        $this->ColumnNames[] = $column_name;
                    }

                    break;

                // Other tag : report it. The message used to name <rectangle> ; the separator
                // comment in the original code names this shape's tag as <lines>.
                default:
                    error(new PdfToTextCaptureException("Invalid tag <$tag_name> found in root tag <lines>."));
            }
        }
    }


    /*
     * ExtractAreas -
     *     Extracts the lines of columns found on each applicable page.
     *     Returns an array of PdfToTextCapturedLines objects keyed by page number.
     */
    public function ExtractAreas($document_fragments)
    {
        $result = array();

        foreach ($document_fragments as $page => $page_contents) {
            $fragments = $page_contents['fragments'];

            // Ignore this page if it is not present in the page map
            if (!isset($this->ApplicablePages->PageMap[$page])) {
                continue;
            }

            // The definition only gives the location of the first line of each column, together
            // with its height ; derive the areas of the following lines from it. A page without
            // any area (no <column> defined) simply yields no line.
            $first_line_areas = isset($this->Areas[$page]) ? $this->Areas[$page] : array();
            $line_areas = $this->BuildLineAreas($first_line_areas);

            // Extract the columns, line per line, from the text fragments of the page
            $found_lines = $this->CollectColumns($page, $fragments, $line_areas);

            // Final pass : provide default values for missing columns and compute the
            // surrounding box of each line
            $final_lines = array();

            foreach ($found_lines as $line => $columns_line) {
                $line_left = 0;
                $line_top = 0;
                $found_first = false;

                foreach ($this->ColumnNames as $column_name) {
                    if (isset($columns_line[$column_name])) {
                        // (left,top) come from the first column, in declaration order, that was
                        // really captured. The original code always read the first declared column,
                        // which is undefined when that column is absent from the line.
                        if (!$found_first) {
                            $line_left = $columns_line[$column_name]->Left;
                            $line_top = $columns_line[$column_name]->Top;
                            $found_first = true;
                        }
                    } else {
                        $columns_line[$column_name] = new PdfToTextCapturedColumn(
                            $page, $column_name, $this->Columns[$column_name]['default'], 0, 0, 0, 0, $this
                        );
                    }
                }

                // (right,bottom) come from the last column having a non-zero Right coordinate,
                // ie which does not hold a default value
                $last = count($this->ColumnNames) - 1;
                $line_right = 0;
                $line_bottom = 0;

                while ($last >= 0 && !$columns_line[$this->ColumnNames[$last]]->Right) {
                    $last--;
                }

                // Index 0 is a valid result : the first column may be the only one captured
                if ($last >= 0) {
                    $line_right = $columns_line[$this->ColumnNames[$last]]->Right;
                    $line_bottom = $columns_line[$this->ColumnNames[$last]]->Bottom;
                }

                $final_lines[] = new PdfToTextCapturedLine(
                    $page, $this->Name, $columns_line, $line_left, $line_top, $line_right, $line_bottom, $this
                );
            }

            // The result for this page is a CapturedLines object
            $result[$page] = new PdfToTextCapturedLines($this->Name, $page, $final_lines);
        }

        return $result;
    }


    /*
     * BuildLineAreas -
     *     Starting from the areas of the first line, builds the column areas of every line that
     *     fits above y-coordinate 0. Returns an array indexed by line number, each entry being
     *     the list of column areas for that line.
     */
    private function BuildLineAreas($first_line_areas)
    {
        $line_areas = array();

        foreach ($first_line_areas as $first_line_area) {
            // Line 0 is a copy of the defined area
            $area = clone $first_line_area;
            $line_areas[0][] = $area;
            $line_height = $area->Height;
            $current_top = $area->Top - $line_height;
            $line = 0;

            // Each following line is the previous one shifted down by one line height.
            // A height that is not strictly positive would never reach the end condition.
            while ($line_height > 0 && $current_top - $line_height >= 0) {
                $line++;
                $area = clone $area;
                $area->Top -= $line_height;
                $area->Bottom -= $line_height;

                $line_areas[$line][] = $area;
                $current_top -= $line_height;
            }
        }

        return $line_areas;
    }


    /*
     * CollectColumns -
     *     Assigns each text fragment to the column areas accepting it. Returns an array indexed by
     *     line number, then by column name, of PdfToTextCapturedColumn objects.
     */
    private function CollectColumns($page, $fragments, $line_areas)
    {
        $found_lines = array();

        foreach ($fragments as $fragment) {
            $this->GetFragmentData($fragment, $text, $left, $top, $right, $bottom);

            foreach ($line_areas as $line => $areas_in_line) {
                foreach ($areas_in_line as $column_area) {
                    // Only handle fragments accepted by the area's Contains() method
                    if (!$column_area->Contains($left, $top, $right, $bottom)) {
                        continue;
                    }

                    $column_name = $column_area->Name;

                    if (!isset($found_lines[$line][$column_name])) {
                        // The normal usage is to capture one fragment per column...
                        $found_lines[$line][$column_name] = new PdfToTextCapturedColumn(
                            $page, $column_name, $text, $left, $top, $right, $bottom, $this
                        );
                    } else {
                        // ... but several fragments may fall in the same column ; they are then
                        // joined using the separator.
                        // NOTE(review): the box is extended with the column area's coordinates,
                        // whereas the rectangle shape uses the fragment's ; confirm this is intended.
                        $existing = $found_lines[$line][$column_name];

                        $existing->Top = max($existing->Top, $column_area->Top);
                        $existing->Bottom = min($existing->Bottom, $column_area->Bottom);
                        $existing->Left = min($existing->Left, $column_area->Left);
                        $existing->Right = max($existing->Right, $column_area->Right);
                        $existing->Text .= $this->Separator . $text;
                    }
                }
            }
        }

        return $found_lines;
    }


    /*
     * SetPageCount -
     *     Builds, for each applicable page, one capture area per column and records the top and
     *     bottom limits of the page. This can only be done once the page count is known.
     */
    public function SetPageCount($count)
    {
        parent::SetPageCount($count);

        // Rebuild from scratch, so that calling this method again does not duplicate the areas
        $this->Areas = array();
        $this->Tops = array();
        $this->Bottoms = array();

        foreach ($this->ApplicablePages as $page => $applicable) {
            // Without columns there is nothing to build (and no page data to read)
            if (!$applicable || !$this->Columns) {
                continue;
            }

            $page_data = $this->ApplicablePages->ExtraPageMapData[$page];

            $this->Tops[$page] = (double) $page_data['column-top'];
            $this->Bottoms[$page] = (double) $page_data['column-bottom'];

            foreach ($this->Columns as $column) {
                $this->Areas[$page][] = new PdfToTextCaptureArea($column, $page_data, $column['name']);
            }
        }
    }
}
11993
11994
11995/*==============================================================================================================
11996
11997 class PdfToTextCaptureApplicablePages -
11998 Holds a list of applicable pages given by the "number" attribute of <page> tags.
11999
12000 ==============================================================================================================*/
12001class PdfToTextCaptureApplicablePages //extends Object
12002 implements ArrayAccess, Countable, Iterator
12003 {
12004 // Ranges of pages, as given by the "number" attribute of the <page> tag. Since a page number expression
12005 // can refer to the last page ("$"), and the total number of pages in the document is not yet known at the
12006 // time of object instantiation, we have to store all the page ranges as is.
12007 protected $PageRanges = array ( ) ;
12008
12009 // Once the SetPageCount() method has been called (ie, once the total number of pages in the document is
12010 // known), then a PageMap is built ; each key is the page number, indicating whether the page applies or not.
12011 public $PageMap = array ( ) ;
12012
12013 // Extra data associated, this time, with each page in PageMap
12014 public $ExtraPageMapData = array ( ) ;
12015
12016 // Page count - set by the SetPageCount() method
12017 public $PageCount = false ;
12018
12019
12020 /*--------------------------------------------------------------------------------------------------------------
12021
12022 CONSTRUCTOR
12023 Initializes the object.
12024
12025 *-------------------------------------------------------------------------------------------------------------*/
public function __construct()
{
    // Nothing to do : every property is initialized by its declaration
}
12029
12030
12031 /*--------------------------------------------------------------------------------------------------------------
12032
12033 NAME
12034 Add - Add a page number(s) definition.
12035
12036 PROTOTYPE
12037 $applicable_pages -> Add ( $page_number, $extra_data = false ) ;
12038
12039 DESCRIPTION
12040 Add the page number(s) specified by the "number" attribute of the <pages> tag to the list of applicable
12041 pages.
12042
12043 PARAMETERS
12044 $page_number (string) -
12045 A string defining which pages are applicable. This can be a single page number :
12046
12047 <page number="1" .../>
12048
12049 or a comma-separated list of pages :
12050
12051 <page number="1, 2, 10" .../>
12052
12053 or range(s) of pages :
12054
12055 <page number="1..10, 12..20" .../>
12056
12057 The special "$" character means "last page" ; thus the following example :
12058
12059 <page number="1, $-9..$" .../>
12060
12061 means : "applicable pages are 1, plus the last ten pages of the document".
12062
12063 *-------------------------------------------------------------------------------------------------------------*/
public function Add($page_number, $extra_data = false)
{
    // All the work is delegated to __parse_page_numbers() ; nothing is returned
    $this->__parse_page_numbers(
        $page_number,
        $extra_data
    );
}
12068
12069
12070 /*--------------------------------------------------------------------------------------------------------------
12071
12072 NAME
12073 SetPageCount - Sets the total number of pages in the document.
12074
12075 PROTOTYPE
12076 $applicable_pages -> SetPageCount ( $count ) ;
12077
12078 DESCRIPTION
12079 Sets the total number of pages in the document and builds a map of which pages are applicable or not.
12080
12081 PARAMETERS
12082 $count (integer) -
12083 Total number of pages in the document.
12084
12085 *-------------------------------------------------------------------------------------------------------------*/
public function SetPageCount($count)
{
    $this->PageCount = $count;

    // Both per-page maps are rebuilt from the page ranges. ExtraPageMapData used to be left
    // untouched, so pages from an earlier call remained in it.
    $this->PageMap = array();
    $this->ExtraPageMapData = array();

    // Each range holds a low bound, a high bound and the extra data attached to it. Bounds that
    // are not integers are expressions (using "$" for example) evaluated here.
    foreach ($this->PageRanges as $range) {
        $low = $range[0];
        $high = $range[1];

        if (!is_integer($low)) {
            $low = $this->__check_expression($low, $count);
        }

        if (!is_integer($high)) {
            $high = $this->__check_expression($high, $count);
        }

        // Expressions using "$" may lead to values below 1 - adjust them
        if ($low < 1) {
            if ($high < 1) {
                $high = 1;
            }

            $low = 1;
        }

        // Check that the range is consistent
        if ($low > $high) {
            error(new PdfToTextCaptureException("Low value ($low) must be less or equal to high value ($high) " .
                "in page range specification \"{$range[0]}..{$range[1]}\"."));
        }

        // Ignore ranges that start after the last page of the document
        if ($low > $count) {
            warning(new PdfToTextCaptureException("Low value ($low) is greater than page count ($count) " .
                "in page range specification \"{$range[0]}..{$range[1]}\"."));
            continue;
        }

        // Clip the high bound to the number of pages in the document
        if ($high > $count) {
            $high = $count;
        }

        // Flag every page of the range as applicable and attach the range's extra data to it
        for ($page = $low; $page <= $high; $page++) {
            $this->PageMap[$page] = true;
            $this->ExtraPageMapData[$page] = $range[2];
        }
    }
}
12140
12141
12142 /*--------------------------------------------------------------------------------------------------------------
12143
12144 Interfaces implementations.
12145
12146 *-------------------------------------------------------------------------------------------------------------*/
12147
12148 // Countable interface
12149 public function count ( )
12150 { return ( count ( $this -> PageMap ) ) ; }
12151
12152
12153 // Array access interface
12154 public function offsetExists ( $offset )
12155 { return ( isset ( $this -> PageMap [ $offset ] ) ) ; }
12156
12157
12158 public function offsetGet ( $offset )
12159 { return ( ( isset ( $this -> PageMap [ $offset ] ) ) ? true : false ) ; }
12160
12161
12162 public function offsetSet ( $offset, $value )
12163 { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }
12164
12165
12166 public function offsetunset ( $offset )
12167 { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }
12168
12169
12170 // Iterator interface
12171 private $__iterator_value = 1 ;
12172
12173 public function rewind ( )
12174 { $this -> __iterator_value = 1 ; }
12175
12176
12177 public function valid ( )
12178 { return ( $this -> __iterator_value >= 1 && $this -> __iterator_value <= $this -> PageCount ) ; }
12179
12180
12181 public function key ( )
12182 { return ( $this -> __iterator_value ) ; }
12183
12184
12185 public function next ( )
12186 { $this -> __iterator_value ++ ; }
12187
12188
12189 public function current ( )
12190 { return ( ( isset ( $this -> PageMap [ $this -> __iterator_value ] ) ) ? true : false ) ; }
12191
12192
12193 /*--------------------------------------------------------------------------------------------------------------
12194
12195 Helper functions.
12196
12197 *-------------------------------------------------------------------------------------------------------------*/
12198
12199 // __parse_page_numbers -
12200 // Performs a first pass on the value of the "number" attribute of the <page> tag. Transforms range expressions
12201 // when possible to integers ; keep the expression string intact when either the low or high value of a range
12202 // is itself an expression, probably using the "$" (page count) character.
12203 private function __parse_page_numbers ( $text, $extra_data )
12204 {
12205 $ranges = explode ( ',', $text ) ;
12206
12207 // Loop through comma-separated ranges
12208 foreach ( $ranges as $range )
12209 {
12210 $items = explode ( '..', $range ) ;
12211
12212 // Check if current item is a range
12213 switch ( count ( $items ) )
12214 {
12215 // If not a range (ie, a single value) then make a range using that value
12216 // (low and high range values will be the same)
12217 case 1 :
12218 if ( is_numeric ( $items [0] ) )
12219 $low = $high = ( integer ) $items [0] ;
12220 else
12221 $low = $high = trim ( $items [0] ) ;
12222
12223 break ;
12224
12225 // If range, store the low and high values
12226 case 2 :
12227 $low = ( is_numeric ( $items [0] ) ) ? ( integer ) $items [0] : trim ( $items [0] ) ;
12228 $high = ( is_numeric ( $items [1] ) ) ? ( integer ) $items [1] : trim ( $items [1] ) ;
12229 break ;
12230
12231 // Other cases : throw an exception
12232 default :
12233 error ( new PdfToTextCaptureException ( "Invalid page range specification \"$range\"." ) ) ;
12234 }
12235
12236 // If the low or high range value is an expression, check at this stage that it is correct
12237 if ( is_string ( $low ) && $this -> __check_expression ( $low ) === false )
12238 error ( new PdfToTextCaptureException ( "Invalid expression \"$low\" in page range specification \"$range\"." ) ) ;
12239
12240 if ( is_string ( $high ) && $this -> __check_expression ( $high ) === false )
12241 error ( new PdfToTextCaptureException ( "Invalid expression \"$high\" in page range specification \"$range\"." ) ) ;
12242
12243 // Add the page range and the extra data
12244 $this -> PageRanges [] = array ( $low, $high, $extra_data ) ;
12245 }
12246 }
12247
12248
12249 // __check_expression -
 // Evaluates a page number expression after replacing "$" with the page count, and returns the computed value.
12251 private function __check_expression ( $str, $count = 1 )
12252 {
12253 $new_str = str_replace ( '$', $count, $str ) ;
12254 $value = @eval ( "return ( $new_str ) ;" ) ;
12255
12256 return ( $value ) ;
12257 }
12258 }
12259
12260
12261/*==============================================================================================================
12262
12263 class PdfToTextCaptureArea -
12264 A capture area describes a rectangle, either by its top, left, right and bottom coordinates, or by
12265 its top/left coordinates, and its width and height.
12266
12267 ==============================================================================================================*/
class PdfToTextCaptureArea		//extends Object
{
	// List of authorized keywords for defining the rectangle dimensions
	static private $Keys = array('left', 'top', 'right', 'bottom', 'width', 'height');

	// Rectangle edges
	private $Left   = false,
		$Top    = false,
		$Right  = false,
		$Bottom = false;

	// Area name (for internal purposes)
	public $Name;


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        Constructor

	    PROTOTYPE
	        $area = new PdfToTextCaptureArea ( $area, $default_area = null, $name = '' ) ;

	    DESCRIPTION
	        Builds a rectangle from the supplied coordinates.

	    PARAMETERS
	        $area (array) -
	                Associative array that may contain the following entries :
	                - 'left'   (double) : left x-coordinate (mandatory).
	                - 'top'    (double) : top y-coordinate (mandatory).
	                - 'right'  (double) : right x-coordinate.
	                - 'bottom' (double) : bottom y-coordinate.
	                - 'width'  (double) : width of the rectangle, starting from 'left'.
	                - 'height' (double) : height of the rectangle, starting from 'top'.

	                Either 'right' or 'width' must be supplied ; the same goes for 'bottom' and 'height'.
	                An entry that is absent or set to false is taken from $default_area when available.

	        $default_area (array) -
	                Default values for the entries missing from $area.

	        $name (string) -
	                Optional name for this area. This information is not used by the class.

	    NOTES
	        Coordinate (0,0) is located at the left bottom of the page.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $area, $default_area = null, $name = '' )
	{
		// Collect a value for each keyword : the one from $area when it is set and not false, otherwise the
		// one from $default_area, otherwise false
		$values = array();

		foreach (self::$Keys as $key) {
			$value = isset($area[$key]) ? $area[$key] : false;

			if ($value === false && isset($default_area[$key]))
				$value = $default_area[$key];

			$values[$key] = $value;
		}

		// 'left' and 'top' are mandatory
		$left = $values['left'];

		if ($left !== false)
			$left = (double) $left;
		else
			error(new PdfToTextCaptureException("Attribute \"left\" is mandatory."));

		$top = $values['top'];

		if ($top !== false)
			$top = (double) $top;
		else
			error(new PdfToTextCaptureException("Attribute \"top\" is mandatory."));

		// The right edge is either given, or derived from the width
		$right = $values['right'];

		if ($right !== false)
			$right = (double) $right;
		else if ($values['width'] !== false)
			$right = $left + (double) $values['width'] - 1;
		else
			error(new PdfToTextCaptureException("Either the \"right\" or the \"width\" attribute must be specified."));

		// The bottom edge is either given, or derived from the height (y-coordinates grow upwards)
		$bottom = $values['bottom'];

		if ($bottom !== false)
			$bottom = (double) $bottom;
		else if ($values['height'] !== false)
			$bottom = $top - (double) $values['height'] + 1;
		else
			error(new PdfToTextCaptureException("Either the \"bottom\" or the \"height\" attribute must be specified."));

		// Store the final rectangle
		$this->Left   = $left;
		$this->Top    = $top;
		$this->Right  = $right;
		$this->Bottom = $bottom;
		$this->Name   = $name;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        __get, __set - Give access to the rectangle edges, and implement the Width and Height properties.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __get ( $member )
	{
		// Edges are stored as is
		if (in_array($member, array('Left', 'Top', 'Right', 'Bottom'), true))
			return $this->$member;

		// Width and height are computed from the edges
		if ($member === 'Width')
			return $this->Right - $this->Left + 1;

		if ($member === 'Height')
			return $this->Top - $this->Bottom + 1;

		trigger_error("Undefined property \"$member\".");
	}


	public function __set ( $member, $value )
	{
		// Every coordinate is stored as a double
		$value = (double) $value;

		if (in_array($member, array('Top', 'Left', 'Right', 'Bottom'), true))
			$this->$member = $value;
		else if ($member === 'Width')
			$this->Right = $this->Left + $value - 1;		// Setting the width moves the right edge
		else if ($member === 'Height')
			$this->Bottom = $this->Top - $value + 1;		// Setting the height moves the bottom edge
		else
			trigger_error("Undefined property \"$member\".");
	}


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        Contains - Check if this area contains the specified rectangle.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function Contains ( $left, $top, $right, $bottom )
	{
		$fits_horizontally = ($left >= $this->Left && $right <= $this->Right);
		$fits_vertically   = ($top <= $this->Top && $bottom >= $this->Bottom);

		return ($fits_horizontally && $fits_vertically);
	}
}
12470
12471
12472
12473/**************************************************************************************************************
12474 **************************************************************************************************************
12475 **************************************************************************************************************
12476 ****** ******
12477 ****** ******
12478 ****** CAPTURED TEXT MANAGEMENT ******
12479 ****** (none of the classes listed here are meant to be instantiated outside this file) ******
12480 ****** ******
12481 ****** ******
12482 **************************************************************************************************************
12483 **************************************************************************************************************
12484 **************************************************************************************************************/
12485
12486 /*==============================================================================================================
12487
12488 class PdfToTextCapturedText -
12489 Base class for captured text enclosed by shapes.
12490
12491 ==============================================================================================================*/
abstract class PdfToTextCapturedText		//extends Object
{
	// Shape name (as specified by the "name" attribute of the <rectangle> or <lines> tags, for example)
	public $Name;
	// Number of the page where the text was found (starts from 1)
	public $Page;
	// Shape type (one of the PdfToTextCaptureShapeDefinition::SHAPE_* constants)
	public $Type;
	// Shape definition object (not really used, but kept just in case)
	private $ShapeDefinition;
	// Captured text
	public $Text;
	// Surrounding rectangle in the PDF file
	public $Left, $Top, $Right, $Bottom;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
	        Stores the captured text, its page, its name and its surrounding rectangle. The shape type is
	        copied from the "Type" property of the supplied definition.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition )
	{
		// Identification of the capture
		$this->Page = $page;
		$this->Name = $name;
		$this->Text = $text;

		// Surrounding rectangle
		$this->Left   = $left;
		$this->Top    = $top;
		$this->Right  = $right;
		$this->Bottom = $bottom;

		// Originating shape
		$this->ShapeDefinition = $definition;
		$this->Type            = $definition->Type;
	}
}
12531
12532
12533 /*==============================================================================================================
12534
12535 class PdfToTextCapturedRectangle -
12536 Implements a text captured by a rectangle shape.
12537
12538 ==============================================================================================================*/
class PdfToTextCapturedRectangle extends PdfToTextCapturedText
{
	// No constructor is declared here : the one inherited from PdfToTextCapturedText takes the same
	// ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) arguments and does all the work.

	// Using the object as a string gives the captured text
	public function __tostring ( )
	{
		$captured_text = $this->Text;

		return $captured_text;
	}
}
12550
12551
12552 /*==============================================================================================================
12553
12554 class PdfToTextCapturedColumn -
12555 Implements a text captured by a lines/column shape.
12556 Actually behaves like the PdfToTextCapturedRectangle class
12557
12558 ==============================================================================================================*/
class PdfToTextCapturedColumn extends PdfToTextCapturedText
{
	// No constructor is declared here : the one inherited from PdfToTextCapturedText takes the same
	// ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) arguments and does all the work.

	// Using the object as a string gives the captured text
	public function __tostring ( )
	{
		$captured_text = $this->Text;

		return $captured_text;
	}
}
12570
12571
12572 /*==============================================================================================================
12573
12574 class PdfToTextCapturedLine -
12575 Implements a text captured by a lines shape.
12576
12577 ==============================================================================================================*/
class PdfToTextCapturedLine extends PdfToTextCapturedText
	implements ArrayAccess, Countable, IteratorAggregate
{
	// Column objects
	public $Columns;
	// Maps each column name to its position, to allow access by either index or column name
	private $ColumnsByNames = array();


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
	        Builds a Line object based on the supplied columns.
	        Also builds the Text property, which contains the text of every column joined with the
	        "Separator" property of the supplied definition.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $page, $name, $columns, $left, $top, $right, $bottom, $definition )
	{
		// Although the Columns property is most likely to be used, build a text representation of the whole line
		$text  = array();
		$count = 0;

		foreach ($columns as $column) {
			$text[] = $column->Text;
			$this->ColumnsByNames[$column->Name] = $count++;
		}

		// Provide this information to the parent constructor
		parent::__construct($page, $name, implode($definition->Separator, $text), $left, $top, $right, $bottom, $definition);

		// Store the column objects
		$this->Columns = $columns;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    __get -
	        Gives access to a column by its name. An unknown name triggers an "Undefined property" error.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __get ( $member )
	{
		// The position must be looked up with $member - the requested property name
		if (isset($this->ColumnsByNames[$member]))
			return $this->Columns[$this->ColumnsByNames[$member]];

		trigger_error("Undefined property \"$member\".");
	}


	/*--------------------------------------------------------------------------------------------------------------

	    Interfaces implementations.
	        The #[\ReturnTypeWillChange] lines are attributes on PHP 8 (they silence the PHP 8.1 deprecation
	        notices about the missing return types) and plain comments on older versions.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable : number of columns in the line
	#[\ReturnTypeWillChange]
	public function count ( )
	{
		return count($this->Columns);
	}


	// IteratorAggregate : iterates over the column objects
	#[\ReturnTypeWillChange]
	public function getIterator ( )
	{
		return new ArrayIterator($this->Columns);
	}


	// ArrayAccess : a column can be addressed either by its position or by its name
	#[\ReturnTypeWillChange]
	public function offsetExists ( $offset )
	{
		if (is_numeric($offset))
			return ($offset >= 0 && $offset < count($this->Columns));

		return isset($this->ColumnsByNames[$offset]);
	}


	#[\ReturnTypeWillChange]
	public function offsetGet ( $offset )
	{
		if (is_numeric($offset))
			return $this->Columns[$offset];

		return $this->Columns[$this->ColumnsByNames[$offset]];
	}


	// Writing and removing columns is not supported
	#[\ReturnTypeWillChange]
	public function offsetSet ( $offset, $value )
	{
		error(new PdfToTextCaptureException("Unsupported operation."));
	}


	#[\ReturnTypeWillChange]
	public function offsetUnset ( $offset )
	{
		error(new PdfToTextCaptureException("Unsupported operation."));
	}
}
12668
12669
12670 /*==============================================================================================================
12671
12672 class PdfToTextCapturedLines -
12673 Implements a set of lines.
12674
12675 ==============================================================================================================*/
class PdfToTextCapturedLines		//extends Object
	implements ArrayAccess, Countable, IteratorAggregate
{
	// Capture name, as specified by the "name" attribute of the <lines> tag
	public $Name;
	// Page number of the capture
	public $Page;
	// Captured lines
	public $Lines;
	// Content type (mimics a little bit the PdfToTextCapturedText class)
	public $Type = PdfToTextCaptureShapeDefinition::SHAPE_LINE;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
	        Stores the capture name, its page number and its lines.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $name, $page, $lines )
	{
		$this->Name  = $name;
		$this->Page  = $page;
		$this->Lines = $lines;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    Interfaces implementations.
	        The #[\ReturnTypeWillChange] lines are attributes on PHP 8 (they silence the PHP 8.1 deprecation
	        notices about the missing return types) and plain comments on older versions.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable : number of captured lines
	#[\ReturnTypeWillChange]
	public function count ( )
	{
		return count($this->Lines);
	}


	// IteratorAggregate : iterates over the captured lines
	#[\ReturnTypeWillChange]
	public function getIterator ( )
	{
		return new ArrayIterator($this->Lines);
	}


	// ArrayAccess : an offset exists when offsetGet() has a line to return for it
	#[\ReturnTypeWillChange]
	public function offsetExists ( $offset )
	{
		return isset($this->Lines[$offset]);
	}


	// The lines are held by the Lines property
	#[\ReturnTypeWillChange]
	public function offsetGet ( $offset )
	{
		return $this->Lines[$offset];
	}


	// Writing and removing lines is not supported
	#[\ReturnTypeWillChange]
	public function offsetSet ( $offset, $value )
	{
		error(new PdfToTextCaptureException("Unsupported operation."));
	}


	#[\ReturnTypeWillChange]
	public function offsetUnset ( $offset )
	{
		error(new PdfToTextCaptureException("Unsupported operation."));
	}
}
12731
12732
12733/**************************************************************************************************************
12734 **************************************************************************************************************
12735 **************************************************************************************************************
12736 ****** ******
12737 ****** ******
12738 ****** CAPTURE INTERFACE FOR THE DEVELOPER ******
12739 ****** (none of the classes listed here are meant to be instantiated outside this file) ******
12740 ****** ******
12741 ****** ******
12742 **************************************************************************************************************
12743 **************************************************************************************************************
12744 **************************************************************************************************************/
12745
12746/*==============================================================================================================
12747
12748 class PdfToTextCaptures -
12749 Represents all the areas in a PDF file captured by the supplied XML definitions.
12750
12751 ==============================================================================================================*/
class PdfToTextCaptures		//extends Object
{
	// Captured objects, grouped by page - May not exactly reflect the PdfToTextCapture*Shape classes
	private $CapturedObjects;
	// Allows faster access by capture name
	private $ObjectsByName = array();
	// Capture wrappers already built by __get(), indexed by capture name. A regular array is used rather
	// than properties created on the fly, which are deprecated since PHP 8.2.
	private $CaptureInstances = array();


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
	        Instantiates a PdfToTextCaptures object from an array of captured shapes grouped by page.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $captures )
	{
		$this->CapturedObjects = $captures;

		// Build an array of objects indexed by their names
		foreach ($captures as $page => $shapes) {
			foreach ($shapes as $shape)
				$this->ObjectsByName[$shape->Name][] = $shape;
		}
	}


	/*--------------------------------------------------------------------------------------------------------------

	    ToCaptures -
	        Returns a simplified view of captured objects, with only name/value pairs :
	        - A rectangle capture gives an array of captured texts, indexed by page number.
	        - A lines capture gives an array of objects, one per line, whose properties are the column names.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function ToCaptures ( )
	{
		$result = new stdClass();

		foreach ($this->CapturedObjects as $page => $captures) {
			foreach ($captures as $capture) {
				$name = $capture->Name;

				switch ($capture->Type) {
					case PdfToTextCaptureShapeDefinition::SHAPE_RECTANGLE :
						$result->{$name}[$page] = $capture->Text;
						break;

					case PdfToTextCaptureShapeDefinition::SHAPE_LINE :
						if (! isset($result->{$name}))
							$result->{$name} = array();

						// One object per line, with one property per column
						foreach ($capture as $line) {
							$columns = new stdClass();

							foreach ($line as $column)
								$columns->{$column->Name} = $column->Text;

							$result->{$name}[] = $columns;
						}

						break;
				}
			}
		}

		return $result;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    __get -
	        Retrieves the captured objects by their name, as specified in the XML definition.
	        The wrapper object is built on first access, then reused.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __get ( $member )
	{
		if (! isset($this->CaptureInstances[$member])) {
			if (! isset($this->ObjectsByName[$member]))
				error(new PdfToTextException("Undefined property \"$member\"."));

			$this->CaptureInstances[$member] = $this->GetCaptureInstance($member);
		}

		return $this->CaptureInstances[$member];
	}


	/*--------------------------------------------------------------------------------------------------------------

	    GetCapturedObjectsByName -
	        Returns an associative array of the captured shapes, indexed by their name.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function GetCapturedObjectsByName ( )
	{
		return $this->ObjectsByName;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    GetCaptureInstance -
	        Returns an object that wraps the capture results for the specified name. The wrapper class is
	        chosen from the type of the first shape captured under that name.

	 *-------------------------------------------------------------------------------------------------------------*/
	protected function GetCaptureInstance ( $fieldname )
	{
		$shapes = $this->ObjectsByName[$fieldname];
		$type   = $shapes[0]->Type;

		switch ($type) {
			case PdfToTextCaptureShapeDefinition::SHAPE_RECTANGLE :
				return new PdfToTextRectangleCapture($shapes);

			case PdfToTextCaptureShapeDefinition::SHAPE_LINE :
				return new PdfToTextLinesCapture($shapes);

			default :
				error(new PdfToTextCaptureException("Unhandled shape type " . $type . "."));
		}
	}
}
12883
12884
12885/*==============================================================================================================
12886
12887 class PdfToTextCapture -
12888 Base class for all capture classes accessible to the caller.
12889
12890 ==============================================================================================================*/
class PdfToTextCapture		//extends Object
	implements ArrayAccess, Countable, IteratorAggregate
{
	// Captured objects wrapped by this instance
	protected $Captures;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
	        Instantiates a PdfToTextCapture object wrapping the supplied array of captured objects.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $objects )
	{
		$this->Captures = $objects;
	}


	/*--------------------------------------------------------------------------------------------------------------

	    Interfaces implementations.
	        The #[\ReturnTypeWillChange] lines are attributes on PHP 8 (they silence the PHP 8.1 deprecation
	        notices about the missing return types) and plain comments on older versions.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable : number of captured objects
	#[\ReturnTypeWillChange]
	public function count ( )
	{
		return count($this->Captures);
	}


	// IteratorAggregate : iterates over the captured objects
	#[\ReturnTypeWillChange]
	public function getIterator ( )
	{
		return new ArrayIterator($this->Captures);
	}


	// ArrayAccess : the keys of the wrapped array are not necessarily 0..count-1, so an offset exists
	// when the array has an entry for it - which is also what offsetGet() relies on
	#[\ReturnTypeWillChange]
	public function offsetExists ( $offset )
	{
		return isset($this->Captures[$offset]);
	}


	#[\ReturnTypeWillChange]
	public function offsetGet ( $offset )
	{
		return $this->Captures[$offset];
	}


	// Writing and removing captures is not supported
	#[\ReturnTypeWillChange]
	public function offsetSet ( $offset, $value )
	{
		error(new PdfToTextCaptureException("Unsupported operation."));
	}


	#[\ReturnTypeWillChange]
	public function offsetUnset ( $offset )
	{
		error(new PdfToTextCaptureException("Unsupported operation."));
	}
}
12940
12941
12942/*==============================================================================================================
12943
12944 class PdfToTextLinesCapture -
12945 Represents a lines capture, without indexation to their page number.
12946
12947 ==============================================================================================================*/
class PdfToTextLinesCapture extends PdfToTextCapture
{
	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
	        Flattens the supplied object list : every element is iterated and the lines it gives are appended
	        to a single sequential array, so that lines can be iterated whatever their page number is.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $objects )
	{
		$all_lines = array();

		foreach ($objects as $captured_lines) {
			// Each element gives its own lines when iterated
			foreach ($captured_lines as $captured_line) {
				$all_lines[] = $captured_line;
			}
		}

		parent::__construct($all_lines);
	}
}
12970
12971
12972/*==============================================================================================================
12973
12974 class PdfToTextRectangleCapture -
12975 Implements a rectangle capture, from the caller point of view.
12976
12977 ==============================================================================================================*/
class PdfToTextRectangleCapture extends PdfToTextCapture
{
	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
	        Re-indexes the supplied objects by the value of their Page property. When several objects share
	        the same page, the last one wins.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function __construct ( $objects )
	{
		$by_page = array();

		foreach ($objects as $captured) {
			$page_number           = $captured->Page;
			$by_page[$page_number] = $captured;
		}

		parent::__construct($by_page);
	}
}