<?php

/* START OF DO-NOT-EDIT AREA 1 */
/**
 * Constants required by the parser classes further down in this file.
 *
 * They are declared ahead of the script code so that they already exist when
 * the script calls into the library.
 *
 * @author S.C. Chen <me578022@gmail.com>
 */
error_reporting(-1);

// Node types
const HDOM_TYPE_ELEMENT = 1;
const HDOM_TYPE_COMMENT = 2;
const HDOM_TYPE_TEXT = 3;
const HDOM_TYPE_ENDTAG = 4;
const HDOM_TYPE_ROOT = 5;
const HDOM_TYPE_UNKNOWN = 6;

// Attribute quoting styles
const HDOM_QUOTE_DOUBLE = 0;
const HDOM_QUOTE_SINGLE = 1;
const HDOM_QUOTE_NO = 3;

// Keys of the per-node "info" array
const HDOM_INFO_BEGIN = 0;
const HDOM_INFO_END = 1;
const HDOM_INFO_QUOTE = 2;
const HDOM_INFO_SPACE = 3;
const HDOM_INFO_TEXT = 4;
const HDOM_INFO_INNER = 5;
const HDOM_INFO_OUTER = 6;
const HDOM_INFO_ENDSPACE = 7;

// Overridable defaults: only defined when the caller has not done so already.
if (!defined('DEFAULT_TARGET_CHARSET')) {
    /** The default target charset */
    define('DEFAULT_TARGET_CHARSET', 'UTF-8');
}
if (!defined('DEFAULT_BR_TEXT')) {
    /** The default <br> text used instead of <br> tags when returning text */
    define('DEFAULT_BR_TEXT', "\r\n");
}
if (!defined('DEFAULT_SPAN_TEXT')) {
    /** The default <span> text used instead of <span> tags when returning text */
    define('DEFAULT_SPAN_TEXT', ' ');
}
if (!defined('MAX_FILE_SIZE')) {
    /** The maximum file size the parser should load */
    define('MAX_FILE_SIZE', 600000);
}

/** Contents between curly braces "{" and "}" are interpreted as text */
const HDOM_SMARTY_AS_TEXT = 1;

/* END OF DO-NOT-EDIT AREA 1 */
39
40
// URL template; the literal "SERVICETAG" segment is replaced for each tag.
$destinationUrl = "https://www.dell.com/support/home/de/de/dedhs1/product-support/servicetag/SERVICETAG/warranty";
// List of service tags to look up.
$serviceTags = array("jcss0n2");
// Text to look for inside the table header cells.
$keyToSearchFor = "Versanddatum";

// Process every service tag in turn.
foreach ($serviceTags as $tag) {
    // Insert the (URL-encoded) service tag into the template.
    $tempUrl = str_replace("SERVICETAG", rawurlencode($tag), $destinationUrl);

    // Fetch and parse the page.
    $test = file_get_html($tempUrl);

    // file_get_html() returns false when nothing usable could be loaded;
    // calling find() on that value would be a fatal error.
    if ($test === false) {
        error_log('Could not load ' . $tempUrl);
        continue;
    }

    // Dump the fetched markup (the original comment refers to getting around
    // the Google captcha).
    echo $test;

    // Walk all table header cells.
    foreach ($test->find('th') as $th) {
        // Search the cell's markup for $keyToSearchFor.
        if (strpos((string) $th, $keyToSearchFor) !== false) {
            // Print the cell when it matches.
            echo "bla";
            echo $th;
        }
    }
}
67/* DO NOT TOUCH BELOW
68
69INCLUDE LIBRARY
70
71*/
72
73/**
74 * Website: http://sourceforge.net/projects/simplehtmldom/
75 * Additional projects: http://sourceforge.net/projects/debugobject/
76 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
77 * Contributions by:
78 * Yousuke Kumakura (Attribute filters)
79 * Vadim Voituk (Negative indexes supports of "find" method)
80 * Antcs (Constructor with automatically load contents either text or file/url)
81 *
82 * all affected sections have comments starting with "PaperG"
83 *
84 * Paperg - Added case insensitive testing of the value of the selector.
85 *
86 * Paperg - Added tag_start for the starting index of tags - NOTE: This works
87 * but not accurately. This tag_start gets counted AFTER \r\n have been crushed
88 * out, and after the remove_noice calls so it will not reflect the REAL
89 * position of the tag in the source, it will almost always be smaller by some
90 * amount. We use this to determine how far into the file the tag in question
91 * is. This "percentage" will never be accurate as the $dom->size is the "real"
92 * number of bytes the dom was created from. But for most purposes, it's a
93 * really good estimation.
94 *
95 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
96 * closed is great for malformed html, but it CAN lead to parsing errors.
97 *
98 * Allow the user to tell us how much they trust the html.
99 *
100 * Paperg add the text and plaintext to the selectors for the find syntax.
101 * plaintext implies text in the innertext of a node. text implies that the
102 * tag is a text node. This allows for us to find tags based on the text they
103 * contain.
104 *
105 * Create find_ancestor_tag to see if a tag is - at any level - inside of
106 * another specific tag.
107 *
 * Paperg: added parse_charset so that we know about the character set of
 * the source document. NOTE: If the user's system has a routine called
 * get_last_retrieve_url_contents_content_type available, we will assume it's
 * returning the content-type header from the last transfer or curl_exec, and
 * we will parse that and use it in preference to any other method of charset
 * detection.
114 *
115 * Found infinite loop in the case of broken html in restore_noise. Rewrote to
116 * protect from that.
117 *
118 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
119 *
120 * Licensed under The MIT License
121 * Redistributions of files must retain the above copyright notice.
122 *
123 * @author S.C. Chen <me578022@gmail.com>
124 * @author John Schlick
125 * @author Rus Carroll
126 * @version Rev. 1.8.1 (247)
127 * @package PlaceLocalInclude
128 * @subpackage simple_html_dom
129 */
130
/**
 * All of the Defines for the classes below.
 *
 * Every constant is only defined when it does not exist yet: the same names
 * are already defined near the top of this file, and defining a constant a
 * second time raises an "already defined" diagnostic.
 *
 * @author S.C. Chen <me578022@gmail.com>
 */
defined('HDOM_TYPE_ELEMENT') || define('HDOM_TYPE_ELEMENT', 1);
defined('HDOM_TYPE_COMMENT') || define('HDOM_TYPE_COMMENT', 2);
defined('HDOM_TYPE_TEXT') || define('HDOM_TYPE_TEXT', 3);
defined('HDOM_TYPE_ENDTAG') || define('HDOM_TYPE_ENDTAG', 4);
defined('HDOM_TYPE_ROOT') || define('HDOM_TYPE_ROOT', 5);
defined('HDOM_TYPE_UNKNOWN') || define('HDOM_TYPE_UNKNOWN', 6);
defined('HDOM_QUOTE_DOUBLE') || define('HDOM_QUOTE_DOUBLE', 0);
defined('HDOM_QUOTE_SINGLE') || define('HDOM_QUOTE_SINGLE', 1);
defined('HDOM_QUOTE_NO') || define('HDOM_QUOTE_NO', 3);
defined('HDOM_INFO_BEGIN') || define('HDOM_INFO_BEGIN', 0);
defined('HDOM_INFO_END') || define('HDOM_INFO_END', 1);
defined('HDOM_INFO_QUOTE') || define('HDOM_INFO_QUOTE', 2);
defined('HDOM_INFO_SPACE') || define('HDOM_INFO_SPACE', 3);
defined('HDOM_INFO_TEXT') || define('HDOM_INFO_TEXT', 4);
defined('HDOM_INFO_INNER') || define('HDOM_INFO_INNER', 5);
defined('HDOM_INFO_OUTER') || define('HDOM_INFO_OUTER', 6);
defined('HDOM_INFO_ENDSPACE') || define('HDOM_INFO_ENDSPACE', 7);

/** The default target charset */
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');

/** The default <br> text used instead of <br> tags when returning text */
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");

/** The default <span> text used instead of <span> tags when returning text */
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');

/** The maximum file size the parser should load */
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);

/** Contents between curly braces "{" and "}" are interpreted as text */
defined('HDOM_SMARTY_AS_TEXT') || define('HDOM_SMARTY_AS_TEXT', 1);
167
// helper functions
// -----------------------------------------------------------------------------
/**
 * Load a file or URL and parse it into a DOM.
 *
 * @param string $url File name or URL to read
 * @param bool $use_include_path Passed through to file_get_contents()
 * @param resource|null $context Stream context passed to file_get_contents()
 * @param int $offset Offset passed to file_get_contents()
 * @param int $maxLen Maximum number of bytes to read; values <= 0 (including
 * the default -1) are replaced by MAX_FILE_SIZE
 * @param bool $lowercase Passed to the DOM constructor and to load()
 * @param bool $forceTagsClosed Passed to the DOM constructor
 * @param string $target_charset Passed to the DOM constructor
 * @param bool $stripRN Passed to the DOM constructor and to load()
 * @param string $defaultBRText Passed to the DOM constructor
 * @param string $defaultSpanText Passed to the DOM constructor
 * @return simple_html_dom|false The loaded DOM, or false when no usable
 * content could be read
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Ensure maximum length is greater than zero
    if ($maxLen <= 0) {
        $maxLen = MAX_FILE_SIZE;
    }

    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen);

    // NOTE(review): the read above is already limited to $maxLen bytes, so
    // the length check presumably never triggers and oversized sources end up
    // truncated rather than rejected - confirm whether that is intended.
    if (empty($contents) || strlen($contents) > $maxLen) {
        // Release the unused DOM, exactly as str_get_html() does on failure.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
218
/**
 * Parse an HTML string into a DOM.
 *
 * @param string $str The markup to parse
 * @param bool $lowercase Passed to the DOM constructor and to load()
 * @param bool $forceTagsClosed Passed to the DOM constructor
 * @param string $target_charset Passed to the DOM constructor
 * @param bool $stripRN Passed to the DOM constructor and to load()
 * @param string $defaultBRText Passed to the DOM constructor
 * @param string $defaultSpanText Passed to the DOM constructor
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 * or longer than MAX_FILE_SIZE
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $document = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    $usable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;

    if ($usable) {
        $document->load($str, $lowercase, $stripRN);
        return $document;
    }

    // Nothing to parse: release the DOM again and signal failure.
    $document->clear();
    return false;
}
246
/**
 * Dump the tree below a node by delegating to the node's dump() method.
 *
 * @param object $node The node to dump
 * @param bool $show_attr Whether attributes are printed as well
 * @param int $deep Nesting depth to start at (controls the leading padding)
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // Forward the caller's options; previously the node itself was passed as
    // $show_attr and both options were ignored.
    $node->dump($show_attr, $deep);
}
252
253/**
254 * simple html dom node
255 * PaperG - added ability for "find" routine to lowercase the value of the
256 * selector.
257 *
258 * PaperG - added $tag_start to track the start position of the tag in the total
259 * byte index
260 *
261 * @package PlaceLocalInclude
262 */
263class simple_html_dom_node
264{
/**
 * Kind of node, one of the HDOM_TYPE_* constants.
 *
 * Defaults to {@see HDOM_TYPE_TEXT}.
 *
 * @var int
 */
public $nodetype = HDOM_TYPE_TEXT;

/**
 * Name of the tag; 'text' unless changed.
 *
 * @var string
 */
public $tag = 'text';

/**
 * Attributes of the node, keyed by attribute name.
 *
 * @var array
 */
public $attr = [];

/**
 * Child node objects.
 *
 * @var array
 */
public $children = [];

/**
 * Nodes rendered as this node's content (see innertext()).
 * NOTE(review): judging by the checks in seek(), this list may also hold
 * nodes (such as text nodes) that are not part of $children - confirm.
 *
 * @var array
 */
public $nodes = [];

/**
 * Parent node object, null when there is none.
 *
 * @var object|null
 */
public $parent = null;

/**
 * The "info" array; its keys are the HDOM_INFO_* constants.
 *
 * @var array
 */
public $_ = [];

/**
 * Start position of the tag in the document.
 *
 * @var int
 */
public $tag_start = 0;

/**
 * The DOM object this node belongs to.
 *
 * @var object|null
 */
private $dom = null;
321
/**
 * Create a node that belongs to the given DOM.
 *
 * The new node registers itself in the DOM's node list
 * ({@see simple_html_dom::$nodes}).
 *
 * @param object $dom The owning DOM object
 */
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
332
/**
 * Release the references held by this node when it is destroyed.
 */
function __destruct()
{
    $this->clear();
}
337
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
    $markup = $this->outertext();

    return $markup;
}
342
/**
 * Drop the references to the DOM, the parent and all child nodes.
 *
 * Works around the PHP 5 memory leak caused by circular references.
 *
 * @return void
 */
function clear()
{
    // Evaluated right to left: dom, nodes, parent, children.
    $this->children = $this->parent = $this->nodes = $this->dom = null;
}
351
/**
 * Print this node and, recursively, all nodes below it - one line per node.
 *
 * @param bool $show_attr Whether the attributes are printed as well
 * @param int $deep Nesting depth; each level adds one padding unit
 * @return void
 */
function dump($show_attr = true, $deep = 0)
{
    $line = str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $line .= '(';
        foreach (array_keys($this->attr) as $name) {
            $line .= "[$name]=>\"" . $this->$name . '", ';
        }
        $line .= ')';
    }

    echo $line . "\n";

    if (!$this->nodes) {
        return;
    }

    $nextDepth = $deep + 1;
    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $nextDepth);
    }
}
375
376
/**
 * Debugging function to dump a single dom node with a bunch of information
 * about it (tag, attributes, info array, inner info, list sizes, tag start).
 *
 * @param bool $echo True to print the description, false to return it
 * @return string|null The description when $echo is false, otherwise null
 */
function dump_node($echo = true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"" . $v2 . '", ';
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ')';
    }

    if (isset($this->text)) {
        $string .= ' text: (' . $this->text . ')';
    }

    $string .= " HDOM_INNER_INFO: '";

    // Read the info from this node; the previous code looked at an undefined
    // local variable and therefore always reported NULL.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    // The lists are null after clear(); the cast turns null into an empty
    // array so count() stays valid.
    $string .= ' children: ' . count((array) $this->children);
    $string .= ' nodes: ' . count((array) $this->nodes);
    $string .= ' tag_start: ' . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    }

    return $string;
}
430
/**
 * Return or set parent node
 *
 * When a new parent is given, the node is first removed from the node and
 * child lists of its current parent (if any) and then appended to both lists
 * of the new parent.
 *
 * NOTE(review): find()/seek() locate descendants through positions in the
 * DOM's node list, which this method does not update - confirm whether moved
 * nodes need to be found by selector afterwards.
 *
 * @param object|null $parent (optional) The parent node, `null` to return
 * the current parent node.
 * @return object|null The parent node
 */
function parent($parent = null)
{
    if ($parent !== null) {
        $previous = $this->parent;

        // Detach from the previous parent so the node is not listed twice.
        if ($previous !== null) {
            if (is_array($previous->nodes)) {
                $position = array_search($this, $previous->nodes, true);
                if ($position !== false) {
                    array_splice($previous->nodes, $position, 1);
                }
            }

            if (is_array($previous->children)) {
                $position = array_search($this, $previous->children, true);
                if ($position !== false) {
                    array_splice($previous->children, $position, 1);
                }
            }
        }

        $this->parent = $parent;
        $this->parent->nodes[] = $this;
        $this->parent->children[] = $this;
    }

    return $this->parent;
}
451
/**
 * Tell whether the child list holds anything.
 *
 * @return bool True if the node has at least one child node
 */
function has_child()
{
    return (bool) $this->children;
}
459
/**
 * Return one child node or the whole child list.
 *
 * @param int $idx Index of the wanted child node; `-1` (the default) returns
 * the complete list of child nodes.
 * @return object|array|null The child node at $idx, the list of all child
 * nodes, or null when there is no child at $idx.
 */
function children($idx = -1)
{
    if ($idx !== -1) {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }

    return $this->children;
}
480
/**
 * Get first child node
 *
 * Uses isset() rather than count(), so it also copes with the child list
 * being null (which is what clear() leaves behind).
 *
 * @return object|null The first child node or null if the current node has
 * no child nodes.
 */
function first_child()
{
    return isset($this->children[0]) ? $this->children[0] : null;
}
497
/**
 * Get last child node
 *
 * Uses empty()/end() rather than count(), so it also copes with the child
 * list being null (which is what clear() leaves behind).
 *
 * @return object|null The last child node or null if the current node has
 * no child nodes.
 */
function last_child()
{
    if (empty($this->children)) {
        return null;
    }

    // Work on a copy: end() moves the internal pointer of its argument.
    $children = $this->children;

    return end($children);
}
513
/**
 * Return the node that follows this one in the parent's child list.
 *
 * @return object|null The following sibling, or null when there is no
 * parent, this node is not among the parent's children, or it is the last
 * child.
 */
function next_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    for ($i = 0; $i < $total; ++$i) {
        if ($siblings[$i] === $this) {
            $next = $i + 1;
            return ($next < $total) ? $siblings[$next] : null;
        }
    }

    return null;
}
539
/**
 * Get previous sibling node
 *
 * @return object|null The preceding sibling, or null when there is no
 * parent, this node is not among the parent's children, or it is the first
 * child.
 */
function prev_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    for ($i = 0; $i < $total; ++$i) {
        if ($siblings[$i] === $this) {
            return ($i > 0) ? $siblings[$i - 1] : null;
        }
    }

    // Not one of the parent's children: there is no previous sibling.
    // (The previous code returned the parent's last child in this case.)
    return null;
}
561
/**
 * Walk up the tree, starting with this node itself, to the first node with
 * the given tag.
 *
 * @param string $tag Tag to look for
 * @return object|null The first matching node (possibly this node) or null
 * when the walk runs out of parents without a match.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debug_log_entry(1);
    }

    // The node itself takes part in the comparison.
    for ($candidate = $this; $candidate !== null; $candidate = $candidate->parent) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Current tag is: ' . $candidate->tag);
        }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    return null;
}
594
/**
 * Return the node's inner text (everything between the opening and the
 * closing tag).
 *
 * Stored inner info wins, then stored text (with noise restored); otherwise
 * the outer texts of all contained nodes are joined.
 *
 * @return string
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $pieces = array();
    foreach ($this->nodes as $child) {
        $pieces[] = $child->outertext();
    }

    return implode('', $pieces);
}
618
/**
 * Get node's outer text (everything including the opening and closing tags)
 *
 * For the root node this is its inner text. If the DOM has a callback set,
 * it is invoked with this node before the text is built.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;

    if (is_object($debug_object)) {
        $text = '';

        if ($this->tag === 'text' && !empty($this->text)) {
            $text = ' with text: ' . $this->text;
        }

        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // trigger callback
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // render begin tag; isset() avoids undefined-index diagnostics when the
    // node has no begin position or the position is not in the node list
    $ret = '';
    if ($this->dom
        && isset($this->_[HDOM_INFO_BEGIN])
        && isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])) {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER])) {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we
        // may or may not have added.
        if ($this->tag !== 'br') {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } elseif ($this->nodes) {
        foreach ($this->nodes as $n) {
            $ret .= $this->convert_text($n->outertext());
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $ret .= '</' . $this->tag . '>';
    }

    return $ret;
}
684
/**
 * Return the node's text content.
 *
 * Stored inner info is returned as is. Text nodes return their text with
 * noise restored; comment and unknown nodes as well as script and style tags
 * return an empty string. For every other node the texts of the contained
 * nodes are joined and trimmed.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    $isScript = strcasecmp($this->tag, 'script') === 0;
    if ($isScript || strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    // The node list can be null (observed for some span and p tags according
    // to the original author); there is nothing to collect in that case.
    if ($this->nodes === null) {
        return trim('');
    }

    $collected = '';
    foreach ($this->nodes as $child) {
        // A paragraph starts after a blank line.
        if ($child->tag === 'p') {
            $collected .= "\n\n";
        }

        $collected .= $this->convert_text($child->text());

        // Separate consecutive spans so their texts do not run together.
        if ($child->tag === 'span') {
            $collected .= $this->dom->default_span_text;
        }
    }

    return trim($collected);
}
731
/**
 * Return the node's inner text with the CDATA markers removed.
 *
 * The opening marker is matched case-insensitively.
 *
 * @return string
 */
function xmltext()
{
    $withoutOpening = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $withoutOpening);
}
744
/**
 * Build the text of the node's opening tag, including its attributes.
 *
 * Nodes with stored text (text, comment, unknown) return that text with
 * noise restored instead.
 *
 * @return string
 */
function makeup()
{
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '<' . $this->tag;
    $next = 0;

    foreach ($this->attr as $name => $value) {
        // The position counts every attribute, removed ones included.
        $pos = $next++;

        // Removed attributes are not rendered.
        if ($value === null || $value === false) {
            continue;
        }

        $markup .= $this->_[HDOM_INFO_SPACE][$pos][0];

        // Attributes without a value: nowrap, checked, selected...
        if ($value === true) {
            $markup .= $name;
            continue;
        }

        $quoteType = $this->_[HDOM_INFO_QUOTE][$pos];
        if ($quoteType == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($quoteType == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $markup .= $name;
        $markup .= $this->_[HDOM_INFO_SPACE][$pos][1];
        $markup .= '=';
        $markup .= $this->_[HDOM_INFO_SPACE][$pos][2];
        $markup .= $quote . $value . $quote;
    }

    return $this->dom->restore_noise($markup) . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
788
/**
 * Find elements below this node by CSS selector.
 *
 * @param string $selector The CSS selector
 * @param int|null $idx Index of the element to return from the list of
 * matches; negative values count from the end (default: `null` = return the
 * whole list).
 * @param bool $lowercase Passed on to seek() (default: `false`)
 * @return array|object|null The list of matching elements, or - when $idx is
 * given - the element at that index or null if there is none.
 */
function find($selector, $idx = null, $lowercase = false)
{
    $selectors = $this->parse_selector($selector);
    $total = count($selectors);
    if ($total === 0) {
        return array();
    }

    $found_keys = array();

    // Each comma-separated selector is resolved on its own.
    for ($s = 0; $s < $total; ++$s) {
        $chain = $selectors[$s];
        $depth = count($chain);

        if ($depth === 0 || !isset($this->_[HDOM_INFO_BEGIN])) {
            return array();
        }

        $current = array($this->_[HDOM_INFO_BEGIN] => 1);
        $combinator = ' ';

        // Resolve the chain level by level, without recursion.
        for ($level = 0; $level < $depth; ++$level) {
            $matches = array();

            foreach ($current as $key => $unused) {
                $start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                $start->seek($chain[$level], $matches, $combinator, $lowercase);
            }

            $current = $matches;
            $combinator = $chain[$level][4]; // combinator for the next level
        }

        // Array union keeps existing keys and appends the new ones.
        $found_keys += $current;
    }

    ksort($found_keys);

    $found = array();
    foreach (array_keys($found_keys) as $key) {
        $found[] = $this->dom->nodes[$key];
    }

    if ($idx === null) {
        return $found;
    }

    if ($idx < 0) {
        $idx += count($found);
    }

    return isset($found[$idx]) ? $found[$idx] : null;
}
852
/**
 * Seek DOM elements by selector
 *
 * **Note**
 * The selector element must be compatible to a selector from
 * {@see simple_html_dom_node::parse_selector()}
 *
 * @param array $selector A selector element
 * @param array $ret An array of matches; matching nodes are added to it,
 * keyed by their begin position (this is the function's actual result)
 * @param string $parent_cmd The combinator that connects this node to the
 * nodes being tested (' ' | '>' | '+' | '~')
 * @param bool $lowercase Lowercases class names and attribute values before
 * comparing them (default: `false`)
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // The fifth element (the next combinator) is not needed here.
    list($tag, $id, $class, $attributes) = $selector;
    $nodes = array();

    if ($parent_cmd === ' ') { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a closing
        // tag (i.e. void element)
        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end == 0) {
            $parent = $this->parent;
            while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }

            // The walk may run out of parents; only add an end position
            // that actually exists.
            if ($parent !== null) {
                $end += $parent->_[HDOM_INFO_END];
            }
        }

        // Get list of target nodes
        $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
        $nodes_count = $end - $nodes_start;
        $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
    } elseif ($parent_cmd === '>') { // Child Combinator
        $nodes = is_array($this->children) ? $this->children : array();
    } elseif (($parent_cmd === '+' || $parent_cmd === '~')
        && $this->parent
        && is_array($this->parent->children)) {
        // Identity search: a loose comparison of node objects would compare
        // their properties recursively.
        $index = array_search($this, $this->parent->children, true);

        if ($index !== false) {
            if ($parent_cmd === '+') { // Next-Sibling Combinator
                // The last child has no next sibling.
                if (isset($this->parent->children[$index + 1])) {
                    $nodes[] = $this->parent->children[$index + 1];
                }
            } else { // Subsequent Sibling Combinator
                // NOTE(review): the slice starts at this node itself, so the
                // node is tested as its own sibling - confirm whether it
                // should start at $index + 1.
                $nodes = array_slice($this->parent->children, $index);
            }
        }
    }

    // Go through each element starting at this element until the end tag
    // Note: If this element is a void tag, any previous void element is
    // skipped.
    foreach ($nodes as $node) {
        // Skip root nodes
        if (!$node->parent) { continue; }

        // Skip if node isn't a child node (i.e. text nodes)
        if (!in_array($node, $node->parent->children, true)) { continue; }

        // Skip if tag doesn't match
        if ($tag !== '' && $tag !== $node->tag && $tag !== '*') { continue; }

        // Check the ID
        if ($id !== '') {
            if (!isset($node->attr['id'])) { continue; }

            // Note: Only consider the first ID (as browsers do)
            $node_ids = explode(' ', trim($node->attr['id']));
            if ($id !== $node_ids[0]) { continue; }
        }

        // Check if all class(es) exist
        if ($class !== '' && is_array($class) && !empty($class)) {
            if (!isset($node->attr['class'])) { continue; }

            $node_classes = explode(' ', $node->attr['class']);

            if ($lowercase) {
                $node_classes = array_map('strtolower', $node_classes);
            }

            foreach ($class as $c) {
                // Strict comparison: numeric-looking class names must not be
                // compared as numbers.
                if (!in_array((string) $c, $node_classes, true)) { continue 2; }
            }
        }

        // Check attributes
        if ($attributes !== '' && is_array($attributes) && !empty($attributes)) {
            foreach ($attributes as $a) {
                list (
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]")
                /**
                 * Note: This is not supported by the CSS Standard but adds
                 * the ability to select items compatible to XPath (i.e.
                 * the 3rd element within it's parent).
                 *
                 * Note: This doesn't conflict with the CSS Standard which
                 * doesn't work on numeric attributes anyway.
                 */
                if (is_numeric($att_name)
                    && $att_expr === ''
                    && $att_val === '') {
                    $count = 0;

                    // Find index of current element in parent
                    foreach ($node->parent->children as $c) {
                        if ($c->tag === $node->tag) { ++$count; }
                        if ($c === $node) { break; }
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($count === (int)$att_name) { continue; }
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) { continue 2; }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if ($att_name !== 'plaintext'
                        && !isset($node->attr[$att_name])) {
                        continue 2;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ($att_expr === '') { continue; }

                // A "plaintext" search compares against the text of the node.
                // todo "plaintext" is not a valid CSS selector!
                if ($att_name === 'plaintext') {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'testing node: '
                        . $node->tag
                        . ' for attribute: '
                        . $att_name
                        . $att_expr
                        . $att_val
                        . ' where nodes value is: '
                        . $nodeKeyValue
                    );
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'after match: '
                        . ($check ? 'true' : 'false')
                    );
                }

                if (!$check) { continue 2; }
            }
        }

        // Found a match. Add to list
        $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
    }

    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
    }
}
1079
/**
 * Match value and pattern for a given CSS expression
 *
 * **Supported Expressions**
 *
 * | Expression | Description
 * | ---------- | -----------
 * | `=`        | $value and $pattern must be equal
 * | `!=`       | $value and $pattern must not be equal
 * | `^=`       | $value must start with $pattern
 * | `$=`       | $value must end with $pattern
 * | `*=`       | $value must contain $pattern
 * | `|=`       | $value must equal $pattern or start with $pattern . "-"
 * | `~=`       | $pattern must be one of the whitespace-separated words
 * |            | of $value
 *
 * @param string $exp The expression.
 * @param string $pattern The pattern
 * @param string $value The value
 * @param string $case_sensitivity 'i' to compare case-insensitively (both
 * sides are lowercased); any other value compares as given
 * @return bool True if $value matches $pattern; false as well for
 * unsupported expressions
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($case_sensitivity === 'i') {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
        case '*=':
            return preg_match('/' . preg_quote($pattern, '/') . '/', $value) === 1;
        case '|=':
            /**
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D).
             */
            return $value === $pattern
                || strpos($value, $pattern . '-') === 0;
        case '~=':
            /**
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             */
            if ($pattern === '') {
                return false;
            }

            return in_array($pattern, preg_split('/\s+/', trim($value)), true);
    }

    return false;
}
1142
/**
 * Break a CSS selector string into a structured list.
 *
 * @param string $selector_string CSS selector string
 * @return array List of CSS selectors. The layout is:
 *
 * ```php
 *
 * array( // list of selectors (each separated by a comma), i.e. 'img, p, div'
 *   array( // list of combinator selectors, i.e. 'img > p > div'
 *     array( // selector element
 *       [0], // (string) The element tag
 *       [1], // (string) The element id
 *       [2], // (array<string>|string) The element classes, or '' if none
 *       [3], // (array<array>|string) The list of attributes ('' if none),
 *            // each with five elements: name, expression, value,
 *            // inverted flag, case-sensitivity flag (lowercased)
 *       [4]  // (string) The selector combinator (' ' | '>' | '+' | '~'),
 *            // or '' for the last element before a comma
 *     )
 *   )
 * )
 * ```
 *
 * @link https://www.w3.org/TR/selectors/#compound Compound selector
 */
protected function parse_selector($selector_string)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    /*
     * Selector pattern, modified from mootools (https://mootools.net/).
     *
     * Paperg: the colon is allowed in attribute names so that
     * <tag attr:ibute="something"> can be found. Such attributes must be
     * read with getAttribute, since $dom->x:y is not valid PHP.
     *
     * An attribute may be written with a leading "@" that is not captured.
     *
     * Capture groups:
     *   [0] full match
     *   [1] tag name: zero or more word characters, colons, asterisks
     *       and hyphens
     *   [2] id name (optional): "#" followed by word characters and hyphens
     *   [3] class names (optional): everything after the first ".", so
     *       chained classes such as ".foo.bar.baz" arrive as "foo.bar.baz"
     *   [4] attribute list (optional): one or more "[...]" groups
     *   [5] separator: one or more of "/", ",", " ", ">", "+", "~"
     */
    // phpcs:ignore Generic.Files.LineLength
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

    // The appended blank serves as separator for the final selector element
    $subject = trim($selector_string) . ' ';
    preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER);

    if (is_object($debug_object)) {
        $debug_object->debug_log(2, 'Matches Array: ', $matches);
    }

    $selector_list = array();
    $chain = array();

    foreach ($matches as $match) {
        $full_match = trim($match[0]);

        // Nothing to do for empty matches and lone slashes
        if ($full_match === '' || $full_match === '/' || $full_match === '//') {
            continue;
        }

        $tag = $match[1];
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
        }

        $id = $match[2];

        // "foo.bar" => array('foo', 'bar'); stays '' without classes
        $classes = $match[3];
        if ($classes !== '') {
            $classes = explode('.', $classes);
        }

        /*
         * Attributes are re-scanned one by one with the capturing variant
         * of the attribute part of the pattern above:
         *
         *   [0] full match
         *   [1] attribute name (a leading "!" negates the attribute)
         *   [2] attribute expression
         *   [3] attribute value
         *   [4] case sensitivity
         */
        $attribute_list = $match[4];
        if ($attribute_list !== '') {
            preg_match_all(
                "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is",
                trim($attribute_list),
                $attribute_matches,
                PREG_SET_ORDER
            );

            $attribute_list = array();

            foreach ($attribute_matches as $attribute) {
                if (trim($attribute[0]) === '') { continue; }

                $name = $attribute[1];
                $is_inverted = (isset($name[0]) && $name[0] === '!');
                if ($is_inverted) {
                    $name = substr($name, 1);
                }

                // Trailing groups that did not take part in the match are
                // absent from the array, hence the isset() checks.
                $expression = isset($attribute[2]) ? $attribute[2] : '';
                $value = isset($attribute[3]) ? $attribute[3] : '';
                $case_flag = isset($attribute[4]) ? strtolower($attribute[4]) : '';

                $attribute_list[] = array(
                    $name,
                    $expression,
                    $value,
                    $is_inverted,
                    $case_flag,
                );
            }
        }

        // Whitespace-only separators collapse to the descendant combinator
        $separator = trim($match[5]);
        if ($match[5] !== '' && $separator === '') {
            $separator = ' ';
        }

        // A comma ends the current chain and carries no combinator
        $ends_chain = ($separator === ',');
        if ($ends_chain) {
            $separator = '';
        }

        $chain[] = array($tag, $id, $classes, $attribute_list, $separator);

        if ($ends_chain) {
            $selector_list[] = $chain;
            $chain = array();
        }
    }

    if (count($chain) > 0) {
        $selector_list[] = $chain;
    }

    return $selector_list;
}
1306
/**
 * Magic getter for attributes and the virtual text properties.
 *
 * A set (non-null) attribute wins over everything else and is returned
 * after charset conversion. Otherwise the names "outertext", "innertext",
 * "plaintext" and "xmltext" map to the corresponding methods.
 *
 * @param string $name Attribute or virtual property name
 * @return mixed The converted attribute value, the requested text, or a
 * boolean telling whether the attribute key exists at all
 */
function __get($name)
{
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') { return $this->outertext(); }
    if ($name == 'innertext') { return $this->innertext(); }
    if ($name == 'plaintext') { return $this->text(); }
    if ($name == 'xmltext') { return $this->xmltext(); }

    // Key present but value not set (null), or key missing entirely
    return array_key_exists($name, $this->attr);
}
1320
/**
 * Magic setter for attributes and the virtual text properties.
 *
 * "outertext" and "innertext" are stored in the node info array; every
 * other name is stored as an attribute.
 *
 * @param string $name Attribute or virtual property name
 * @param mixed $value Value to store
 * @return mixed The assigned value for "outertext"/"innertext", nothing
 * for regular attributes
 */
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name == 'innertext') {
        // Nodes that already carry a text entry get their text replaced,
        // all other nodes receive an inner text entry.
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
        return $this->_[$slot] = $value;
    }

    // A new attribute also needs spacing and quoting information
    if (!isset($this->attr[$name])) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}
1342
/**
 * Magic isset handler.
 *
 * @param string $name Attribute or virtual property name
 * @return bool True for "outertext", "innertext" and "plaintext", and for
 * every attribute key that exists, including attributes without a value
 * (nowrap, checked, selected, ...)
 */
function __isset($name)
{
    if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
        return true;
    }

    // An existing key is enough; the value may be null
    return array_key_exists($name, $this->attr);
}
1353
/**
 * Magic unset handler: drops the attribute if it is currently set.
 *
 * @param string $name Attribute name
 * @return void
 */
function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
1358
/**
 * Convert text from the document charset to the target charset.
 *
 * PaperG - the conversion only happens when both charsets are known and
 * differ. If the target is UTF-8 and the text already validates as UTF-8
 * the text is left untouched, because the reported source charset may have
 * been wrong. For a UTF-8 target a byte order mark at the start or end of
 * the result is removed.
 *
 * @param string $text Text to convert
 * @return mixed The converted text (whatever iconv() returned when a
 * conversion took place)
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $from = '';
    $to = '';

    if ($this->dom) {
        $from = strtoupper($this->dom->_charset);
        $to = strtoupper($this->dom->_target_charset);
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(
            3,
            'source charset: ' . $from . ' target charaset: ' . $to
        );
    }

    $output = $text;

    $charsets_differ = !empty($from)
        && !empty($to)
        && (strcasecmp($from, $to) != 0);

    if ($charsets_differ) {
        // The declared encoding may be wrong while the text is UTF-8 already
        $is_already_utf8 = (strcasecmp($to, 'UTF-8') == 0)
            && ($this->is_utf8($text));

        if (!$is_already_utf8) {
            $output = iconv($from, $to, $text);
        }
    }

    // Never hand out a byte order mark with UTF-8 text
    if ($to === 'UTF-8') {
        $bom = "\xef\xbb\xbf";

        if (substr($output, 0, 3) === $bom) {
            $output = substr($output, 3);
        }

        if (substr($output, -3) === $bom) {
            $output = substr($output, 0, -3);
        }
    }

    return $output;
}
1410
/**
 * Returns true if $str looks like valid UTF-8 and false otherwise.
 *
 * The check is structural: every byte with the high bit set must be a
 * lead byte that is followed by the right number of continuation bytes
 * (0x80-0xBF). Like the historical implementation it still tolerates the
 * obsolete 5 and 6 byte forms and does not reject overlong encodings.
 *
 * Fix: byte 0x80 used to be treated as plain ASCII because the test was
 * "> 128"; it is a continuation byte and must never start a character.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);

        // 7 bit ASCII needs no further checks
        if ($c < 128) { continue; }

        // Determine the sequence length from the lead byte
        if ($c >= 254) { return false; }
        elseif ($c >= 252) { $bits = 6; }
        elseif ($c >= 248) { $bits = 5; }
        elseif ($c >= 240) { $bits = 4; }
        elseif ($c >= 224) { $bits = 3; }
        elseif ($c >= 192) { $bits = 2; }
        else { return false; } // 0x80-0xBF: continuation byte without lead byte

        // The sequence must not run past the end of the string
        if (($i + $bits) > $len) { return false; }

        // All remaining bytes of the sequence must be continuation bytes
        while ($bits > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) { return false; }
            $bits--;
        }
    }

    return true;
}
1443
/**
 * Function to try a few tricks to determine the displayed size of an img on
 * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
 * other tag types.
 *
 * The width and height attributes of the tag are used first. A dimension
 * that is still unknown is then looked up in the inline style attribute,
 * where only integer pixel values ("120px") are accepted.
 *
 * Changes compared to the historical implementation:
 * - style values are trimmed, so "width: 100px ;" is recognized
 * - "0px" is accepted (filter_var() returns 0 for it, which is falsy)
 * - style property names are matched case-insensitively
 *
 * Possible future enhancements: evaluate classes/ids of the tag, of its
 * parents, or rules from external style sheets.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false an array containing the 'height' and 'width' of the
 * image on the page or -1 if we can't figure it out; false if this node is
 * not an img tag.
 */
function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    // Key order (height, width) is part of the returned structure
    $size = array(
        'height' => -1,
        'width' => -1
    );

    $dimensions = array('width', 'height');

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();

        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        foreach ($matches as $match) {
            // The value group also swallows whitespace before the ";"
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach ($dimensions as $dimension) {
            // Attributes of the tag take precedence over the style
            if ($size[$dimension] != -1 || !isset($declarations[$dimension])) {
                continue;
            }

            $value = $declarations[$dimension];

            // check that the last two characters are px (pixels)
            if (strtolower(substr($value, -2)) !== 'px') {
                continue;
            }

            $pixels = substr($value, 0, -2);

            // Only integers are accepted; compare against false so that
            // a value of 0 is not thrown away.
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $pixels;
            }
        }
    }

    return $size;
}
1538
1539 // camel naming conventions
/**
 * Get the complete attribute array of this node.
 *
 * @return array The raw attribute array (no charset conversion is applied)
 */
function getAllAttributes()
{
    $attributes = $this->attr;

    return $attributes;
}
1544
/**
 * Read an attribute; camelCase alias of the magic getter.
 *
 * @param string $name Attribute name
 * @return mixed Whatever __get() returns for $name
 */
function getAttribute($name)
{
    $value = $this->__get($name);

    return $value;
}
1549
/**
 * Write an attribute; camelCase alias of the magic setter.
 *
 * @param string $name Attribute name
 * @param mixed $value Attribute value
 * @return void
 */
function setAttribute($name, $value)
{
    // The return value of __set() is intentionally not passed on
    $this->__set($name, $value);
}
1554
/**
 * Check for an attribute; camelCase alias of the magic isset handler.
 *
 * @param string $name Attribute name
 * @return bool Whatever __isset() returns for $name
 */
function hasAttribute($name)
{
    $exists = $this->__isset($name);

    return $exists;
}
1559
/**
 * Remove an attribute by assigning null to it through the magic setter.
 *
 * Note that the attribute key itself stays in the attribute array.
 *
 * @param string $name Attribute name
 * @return void
 */
function removeAttribute($name)
{
    $removed_value = null;

    $this->__set($name, $removed_value);
}
1564
/**
 * Find the first element matching the id selector "#<id>".
 *
 * @param string $id Element id (without the leading "#")
 * @return mixed Result of find() for index 0
 */
function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
1569
/**
 * Find elements matching the id selector "#<id>".
 *
 * @param string $id Element id (without the leading "#")
 * @param int|null $idx Index handed to find(); null by default
 * @return mixed Result of find()
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
1574
/**
 * Find the first element for a tag name; the name is used as selector.
 *
 * @param string $name Tag name
 * @return mixed Result of find() for index 0
 */
function getElementByTagName($name)
{
    $first_match = $this->find($name, 0);

    return $first_match;
}
1579
/**
 * Find elements for a tag name; the name is used as selector.
 *
 * @param string $name Tag name
 * @param int|null $idx Index handed to find(); null by default
 * @return mixed Result of find()
 */
function getElementsByTagName($name, $idx = null)
{
    $found = $this->find($name, $idx);

    return $found;
}
1584
/**
 * camelCase alias of parent().
 *
 * @return mixed Whatever parent() returns
 */
function parentNode()
{
    $parent = $this->parent();

    return $parent;
}
1589
/**
 * camelCase alias of children().
 *
 * @param int $idx Index handed to children(); -1 by default
 * @return mixed Whatever children() returns
 */
function childNodes($idx = -1)
{
    $children = $this->children($idx);

    return $children;
}
1594
/**
 * camelCase alias of first_child().
 *
 * @return mixed Whatever first_child() returns
 */
function firstChild()
{
    $child = $this->first_child();

    return $child;
}
1599
/**
 * camelCase alias of last_child().
 *
 * @return mixed Whatever last_child() returns
 */
function lastChild()
{
    $child = $this->last_child();

    return $child;
}
1604
/**
 * camelCase alias of next_sibling().
 *
 * @return mixed Whatever next_sibling() returns
 */
function nextSibling()
{
    $sibling = $this->next_sibling();

    return $sibling;
}
1609
/**
 * camelCase alias of prev_sibling().
 *
 * @return mixed Whatever prev_sibling() returns
 */
function previousSibling()
{
    $sibling = $this->prev_sibling();

    return $sibling;
}
1614
/**
 * camelCase alias of has_child().
 *
 * @return mixed Whatever has_child() returns
 */
function hasChildNodes()
{
    $has_children = $this->has_child();

    return $has_children;
}
1619
/**
 * Get the tag name stored for this node.
 *
 * @return mixed The value of the tag property
 */
function nodeName()
{
    $name = $this->tag;

    return $name;
}
1624
/**
 * Attach a node to this node by handing this node to the child's
 * parent() method.
 *
 * @param object $node Node to attach; must provide a parent() method
 * @return object The node that was passed in
 */
function appendChild($node)
{
    // The child is told about its new parent; its return value is unused
    $node->parent($this);

    return $node;
}
1630
1631}
1632
1633/**
1634 * simple html dom parser
1635 *
1636 * Paperg - in the find routine: allow us to specify that we want case
1637 * insensitive testing of the value of the selector.
1638 *
1639 * Paperg - change $size from protected to public so we can easily access it
1640 *
1641 * Paperg - added ForceTagsClosed in the constructor which tells us whether we
1642 * trust the html or not. Default is to NOT trust it.
1643 *
1644 * @package PlaceLocalInclude
1645 */
1646class simple_html_dom
1647{
1648 /**
1649 * The root node of the document
1650 *
1651 * @var object
1652 */
1653 public $root = null;
1654
1655 /**
1656 * List of nodes in the current DOM
1657 *
1658 * @var array
1659 */
1660 public $nodes = array();
1661
1662 /**
1663 * Callback function to run for each element in the DOM.
1664 *
1665 * @var callable|null
1666 */
1667 public $callback = null;
1668
1669 /**
1670 * Indicates how tags and attributes are matched
1671 *
1672 * @var bool When set to **true** tags and attributes will be converted to
1673 * lowercase before matching.
1674 */
1675 public $lowercase = false;
1676
1677 /**
1678 * Original document size
1679 *
1680 * Holds the original document size.
1681 *
1682 * @var int
1683 */
1684 public $original_size;
1685
1686 /**
1687 * Current document size
1688 *
1689 * Holds the current document size. The document size is determined by the
1690 * string length of ({@see simple_html_dom::$doc}).
1691 *
1692 * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1693 *
1694 * @var int
1695 * */
1696 public $size;
1697
1698 /**
1699 * Current position in the document
1700 *
1701 * @var int
1702 */
1703 protected $pos;
1704
1705 /**
1706 * The document
1707 *
1708 * @var string
1709 */
1710 protected $doc;
1711
1712 /**
1713 * Current character
1714 *
1715 * Holds the current character at position {@see simple_html_dom::$pos} in
1716 * the document {@see simple_html_dom::$doc}
1717 *
1718 * _Note_: Using this variable is more efficient than calling
1719 * `substr($doc, $pos, 1)`
1720 *
1721 * @var string
1722 */
1723 protected $char;
1724
1725 protected $cursor;
1726
1727 /**
1728 * Parent node of the next node detected by the parser
1729 *
1730 * @var object
1731 */
1732 protected $parent;
1733 protected $noise = array();
1734
1735 /**
1736 * Tokens considered blank in HTML
1737 *
1738 * @var string
1739 */
1740 protected $token_blank = " \t\r\n";
1741
1742 /**
1743 * Tokens to identify the equal sign for attributes, stopping either at the
1744 * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1745 * "<html>")
1746 *
1747 * @var string
1748 */
1749 protected $token_equal = ' =/>';
1750
1751 /**
1752 * Tokens to identify the end of a tag name. A tag name either ends on the
1753 * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1754 *
1755 * @var string
1756 */
1757 protected $token_slash = " />\r\n\t";
1758
1759 /**
1760 * Tokens to identify the end of an attribute
1761 *
1762 * @var string
1763 */
1764 protected $token_attr = ' >';
1765
1766 // Note that this is referenced by a child node, and so it needs to be
1767 // public for that node to see this information.
1768 public $_charset = '';
1769 public $_target_charset = '';
1770
1771 /**
1772 * Innertext for <br> elements
1773 *
1774 * @var string
1775 */
1776 protected $default_br_text = '';
1777
1778 /**
1779 * Suffix for <span> elements
1780 *
1781 * @var string
1782 */
1783 public $default_span_text = '';
1784
1785 /**
1786 * Defines a list of self-closing tags (Void elements) according to the HTML
1787 * Specification
1788 *
1789 * _Remarks_:
1790 * - Use `isset()` instead of `in_array()` on array elements to boost
1791 * performance about 30%
1792 * - Sort elements by name for better readability!
1793 *
1794 * @link https://www.w3.org/TR/html HTML Specification
1795 * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1796 */
1797 protected $self_closing_tags = array(
1798 'area' => 1,
1799 'base' => 1,
1800 'br' => 1,
1801 'col' => 1,
1802 'embed' => 1,
1803 'hr' => 1,
1804 'img' => 1,
1805 'input' => 1,
1806 'link' => 1,
1807 'meta' => 1,
1808 'param' => 1,
1809 'source' => 1,
1810 'track' => 1,
1811 'wbr' => 1
1812 );
1813
1814 /**
1815 * Defines a list of tags which - if closed - close all optional closing
1816 * elements within if they haven't been closed yet. (So, an element where
1817 * neither opening nor closing tag is omissible consistently closes every
1818 * optional closing element within)
1819 *
1820 * _Remarks_:
1821 * - Use `isset()` instead of `in_array()` on array elements to boost
1822 * performance about 30%
1823 * - Sort elements by name for better readability!
1824 */
1825 protected $block_tags = array(
1826 'body' => 1,
1827 'div' => 1,
1828 'form' => 1,
1829 'root' => 1,
1830 'span' => 1,
1831 'table' => 1
1832 );
1833
1834 /**
1835 * Defines elements whose end tag is omissible.
1836 *
1837 * * key = Name of an element whose end tag is omissible.
1838 * * value = Names of elements whose end tag is omissible, that are closed
1839 * by the current element.
1840 *
1841 * _Remarks_:
1842 * - Use `isset()` instead of `in_array()` on array elements to boost
1843 * performance about 30%
1844 * - Sort elements by name for better readability!
1845 *
1846 * **Example**
1847 *
1848 * An `li` element’s end tag may be omitted if the `li` element is immediately
1849 * followed by another `li` element. To do that, add following element to the
1850 * array:
1851 *
1852 * ```php
1853 * 'li' => array('li'),
1854 * ```
1855 *
1856 * With this, the following two examples are considered equal. Note that the
1857 * second example is missing the closing tags on `li` elements.
1858 *
1859 * ```html
1860 * <ul><li>First Item</li><li>Second Item</li></ul>
1861 * ```
1862 *
1863 * <ul><li>First Item</li><li>Second Item</li></ul>
1864 *
1865 * ```html
1866 * <ul><li>First Item<li>Second Item</ul>
1867 * ```
1868 *
1869 * <ul><li>First Item<li>Second Item</ul>
1870 *
1871 * @var array A two-dimensional array where the key is the name of an
1872 * element whose end tag is omissible and the value is an array of elements
1873 * whose end tag is omissible, that are closed by the current element.
1874 *
1875 * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1876 *
1877 * @todo The implementation of optional closing tags doesn't work in all cases
1878 * because it only considers elements that close other optional closing
1879 * tags, not taking into account that some (non-blocking) tags should close
1880 * these optional closing tags. For example, the end tag for "p" is omissible
1881 * and can be closed by an "address" element, whose end tag is NOT omissible.
1882 * Currently a "p" element without closing tag stops at the next "p" element
1883 * or blocking tag, even if it contains other elements.
1884 *
1885 * @todo Known sourceforge issue #2977341
1886 * B tags that are not closed cause us to return everything to the end of
1887 * the document.
1888 */
1889 protected $optional_closing_tags = array(
1890 // Not optional, see
1891 // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1892 'b' => array('b' => 1),
1893 'dd' => array('dd' => 1, 'dt' => 1),
1894 // Not optional, see
1895 // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1896 'dl' => array('dd' => 1, 'dt' => 1),
1897 'dt' => array('dd' => 1, 'dt' => 1),
1898 'li' => array('li' => 1),
1899 'optgroup' => array('optgroup' => 1, 'option' => 1),
1900 'option' => array('optgroup' => 1, 'option' => 1),
1901 'p' => array('p' => 1),
1902 'rp' => array('rp' => 1, 'rt' => 1),
1903 'rt' => array('rp' => 1, 'rt' => 1),
1904 'td' => array('td' => 1, 'th' => 1),
1905 'th' => array('td' => 1, 'th' => 1),
1906 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1907 );
1908
/**
 * Create a DOM object and optionally load a document right away.
 *
 * Fixes compared to the historical implementation:
 * - $forceTagsClosed = false now empties $optional_closing_tags. It used
 *   to write to a non-existent property ($optional_closing_array) and
 *   therefore had no effect.
 * - The setting is applied before the document is loaded, because the
 *   parser reads $optional_closing_tags while parsing.
 * - Strings starting with "https://" are treated as a URL just like
 *   strings starting with "http://".
 *
 * @param string|null $str HTML string, URL or path of a file to load.
 * Nothing is loaded for an empty value.
 * @param bool $lowercase Passed to load() (string input only)
 * @param bool $forceTagsClosed False to trust the HTML: the list of
 * elements with omissible end tags is emptied.
 * @param string $target_charset Charset stored as target charset
 * @param bool $stripRN Passed to load() (string input only)
 * @param string $defaultBRText Passed to load() (string input only)
 * @param string $defaultSpanText Passed to load() (string input only)
 * @param int $options Passed to load() (string input only)
 */
function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    // Must happen before loading, the parser depends on this list.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        if (preg_match('/^https?:\/\//i', $str) || is_file($str)) {
            // URLs and files are read by load_file(), which calls load()
            // with its own defaults.
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    }

    $this->_target_charset = $target_charset;
}
1941
/**
 * Release all nodes and buffers when the object is destroyed.
 */
function __destruct()
{
    // Same cleanup as an explicit call to clear()
    $this->clear();
}
1946
/**
 * Load HTML from a string.
 *
 * The document is prepared, several kinds of content are handed to
 * remove_noise() (scripts, CDATA, comments, styles, code elements, server
 * side scripts and optionally Smarty blocks), then the document is parsed
 * and its charset is determined.
 *
 * @param string $str HTML document
 * @param bool $lowercase Passed to prepare()
 * @param bool $stripRN True to replace every "\r" and "\n" with a blank
 * @param string $defaultBRText Passed to prepare()
 * @param string $defaultSpanText Passed to prepare()
 * @param int $options Bit mask; HDOM_SMARTY_AS_TEXT enables the Smarty
 * pattern
 * @return simple_html_dom This object, which makes the call chainable
 */
function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    global $debug_object;

    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // <script> tags are handled first, before style tags.
    $script_patterns = array(
        "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is",
        "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is",
    );

    foreach ($script_patterns as $script_pattern) {
        $this->remove_noise($script_pattern);
    }

    // Line breaks become blanks when requested
    if ($stripRN) {
        $this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);

        // The content changed, so the cached length must follow
        $this->size = strlen($this->doc);
    }

    // CDATA sections
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);

    // Comments
    $this->remove_noise("'<!--(.*?)-->'is");

    // <style> tags, with and without attributes
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");

    // Preformatted <code> tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");

    // Server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    // Smarty scripts, only if requested through the options
    if ($options & HDOM_SMARTY_AS_TEXT) {
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
    }

    // Build the node tree and close the root node at the final cursor
    $this->parse();
    $this->root->_[HDOM_INFO_END] = $this->cursor;

    $this->parse_charset();

    return $this;
}
2001
/**
 * Load HTML from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents().
 *
 * @return false|null False if the content could not be read; nothing
 * (null) after a successful load
 */
function load_file()
{
    $arguments = func_get_args();
    $contents = call_user_func_array('file_get_contents', $arguments);

    if ($contents === false) {
        return false;
    }

    $this->load($contents, true);
}
2013
/**
 * Register the callback function.
 *
 * @param callable $function_name Callback function to run for each element
 * in the DOM.
 * @return void
 */
function set_callback($function_name)
{
    // Stored only; nothing is invoked here
    $this->callback = $function_name;
}
2025
/**
 * Unregister the callback function.
 *
 * @return void
 */
function remove_callback()
{
    // Resets the property to its initial value
    $this->callback = null;
}
2035
/**
 * Get the DOM as string and optionally write it to a file.
 *
 * @param string $filepath Target file; nothing is written for ''
 * @return string Inner text of the root node
 */
function save($filepath = '')
{
    $html = $this->root->innertext();

    if ($filepath === '') {
        return $html;
    }

    // Written with an exclusive lock
    file_put_contents($filepath, $html, LOCK_EX);

    return $html;
}
2043
/**
 * Find DOM nodes by CSS selector, starting at the root node.
 *
 * Paperg - $lowercase allows to request case insensitive testing of the
 * value of the selector.
 *
 * @param string $selector CSS selector
 * @param int|null $idx Index handed to the root node's find()
 * @param bool $lowercase Handed to the root node's find()
 * @return mixed Result of the root node's find()
 */
function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
2050
/**
 * Clean up memory (php5 circular references memory leak).
 *
 * Clears every known node, then the parent and root nodes, and finally
 * drops the document and noise buffers.
 *
 * @return void
 */
function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // Documented in the sourceforge repository as 2977248: a fix for
    // ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc, $this->noise);
}
2080
/**
 * Dump the node tree by delegating to the root node's dump().
 *
 * @param bool $show_attr Handed to the root node's dump()
 * @return void
 */
function dump($show_attr = true)
{
    $root = $this->root;

    $root->dump($show_attr);
}
2085
/**
 * Prepare HTML data and initialize the parser state.
 *
 * Previous state is cleared, the trimmed document and its length are
 * stored, positions and buffers are reset and a fresh root node becomes
 * the current parent.
 *
 * @param string $str HTML document
 * @param bool $lowercase Stored in $lowercase
 * @param string $defaultBRText Stored in $default_br_text
 * @param string $defaultSpanText Stored in $default_span_text
 * @return void
 */
protected function prepare(
    $str, $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $this->clear();

    $document = trim($str);
    $length = strlen($document);

    // Document and sizes; both sizes start out identical
    $this->doc = $document;
    $this->size = $length;
    $this->original_size = $length;

    // Parser position and buffers
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();

    // Settings
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // Root node; created after the node list has been reset
    $root = new simple_html_dom_node($this);
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;

    $this->root = $root;
    $this->parent = $root;

    // The current character is only available for non-empty documents
    if ($length > 0) {
        $this->char = $document[0];
    }
}
2111
/**
 * Parse HTML content
 *
 * Alternates between reading tags and creating text nodes for the text
 * found between tags until read_tag() reports that no tag was found.
 *
 * @return bool True on success
 */
protected function parse()
{
    for (;;) {
        $text = $this->copy_until_char('<');

        // No text before the next "<": the next thing to read is a tag
        if ($text === '') {
            if (!$this->read_tag()) {
                return true;
            }

            continue;
        }

        // Text between tags becomes a text node
        $text_node = new simple_html_dom_node($this);
        ++$this->cursor;
        $text_node->_[HDOM_INFO_TEXT] = $text;
        $this->link_nodes($text_node, false);
    }
}
2137
/**
 * Determine the charset of the document that has just been parsed.
 *
 * PAPERG - dkchou - added to identify the character set of the page so
 * that we know better how to spit it out later.
 *
 * The sources are tried in this order:
 * 1. A user supplied function get_last_retrieve_url_contents_content_type()
 *    returning the content type header of the last transfer (for example
 *    CURLINFO_CONTENT_TYPE of the last curl_exec). A charset found there
 *    wins over every other mechanism.
 * 2. <meta http-equiv="Content-Type" content="...">. A non-empty content
 *    value without charset results in ISO-8859-1.
 * 3. <meta charset="...">
 * 4. Detection with mb_detect_encoding() (UTF-8 or CP1252), if available.
 * 5. UTF-8 as last resort.
 *
 * ISO-8859-1 and its aliases are replaced by the superset CP1252.
 *
 * Fixes compared to the historical implementation:
 * - "charset=" is matched case-insensitively in the header too
 * - only the charset token is captured, without quotes or trailing
 *   parameters
 * - the <meta charset> declaration is taken into account
 *
 * @return string The detected charset, which is also stored in $_charset
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    // Captures the bare charset token after "charset=", optionally quoted
    $charset_pattern = '/charset\s*=\s*["\']?([^\s"\';,]+)/i';

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        $success = preg_match($charset_pattern, (string)$contentTypeHeader, $matches);
        if ($success) {
            $charset = $matches[1];
            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'header content-type found charset of: '
                    . $charset
                );
            }
        }
    }

    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'meta content-type tag found'
                    . $fullvalue
                );
            }

            if (!empty($fullvalue)) {
                $success = preg_match(
                    $charset_pattern,
                    $fullvalue,
                    $matches
                );

                if ($success) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the
                    // character set, research says that it's typically
                    // ISO-8859-1
                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                        );
                    }

                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    if (empty($charset)) {
        // HTML5 style declaration: <meta charset="utf-8">
        $el = $this->root->find('meta[charset]', 0);

        if (!empty($el)) {
            $declared = $el->charset;

            // An attribute without value does not yield a string
            if (is_string($declared) && trim($declared) !== '') {
                $charset = trim($declared);

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2, 'meta charset: ' . $charset);
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one
    // based on the text we got...
    if (empty($charset)) {
        // Use this in case mb_detect_encoding isn't installed/loaded on
        // this machine.
        $charset = false;
        if (function_exists('mb_detect_encoding')) {
            // Have php try to detect the encoding from the text given to us.
            $charset = mb_detect_encoding(
                $this->doc . 'ascii',
                array( 'UTF-8', 'CP1252' )
            );

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'mb_detect found: ' . $charset);
            }
        }

        // and if this doesn't work... then we need to just wrongheadedly
        // assume it's UTF-8 so that we can move on - cause this will
        // usually give us most of what we need...
        if ($charset === false) {
            if (is_object($debug_object)) {
                $debug_object->debug_log(
                    2,
                    'since mb_detect failed - using default of utf-8'
                );
            }

            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of it's subsets, we want
    // it instead.
    $normalized = strtolower($charset);
    if ($normalized == 'iso-8859-1'
        || $normalized == 'latin1'
        || $normalized == 'latin-1') {

        if (is_object($debug_object)) {
            $debug_object->debug_log(
                2,
                'replacing ' . $charset . ' with CP1252 as its a superset'
            );
        }

        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ' . $charset);
    }

    return $this->_charset = $charset;
}
2257
2258 /**
2259 * Parse tag from current document position.
2260 *
2261 * @return bool True if a tag was found, false otherwise
2262 */
protected function read_tag()
{
    // The parser is not positioned on "<": there are no further tags, so
    // record the end position on the root node and report "no tag found".
    if ($this->char !== '<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }

    // Remember where the tag starts so it can be re-read as plain text later
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // Skip whitespace in end tags (i.e. in "</ html>")
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // Skip attributes in end tags
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        // The end tag is supposed to close the parent tag. Handle situations
        // when it doesn't
        if ($parent_lower !== $tag_lower) {
            // Parent tag does not have to be closed necessarily (optional closing tag)
            // Current tag is a block tag, so it may close an ancestor
            if (isset($this->optional_closing_tags[$parent_lower])
                && isset($this->block_tags[$tag_lower])) {

                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Traverse ancestors to find a matching opening tag
                // Stop at root node
                while (($this->parent->parent)
                    && strtolower($this->parent->tag) !== $tag_lower
                ) {
                    $this->parent = $this->parent->parent;
                }

                // If we don't have a match add current tag as text node
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent

                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }

                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent)
                && isset($this->block_tags[$tag_lower])
            ) {
                // Grandparent exists and current tag is a block tag, so our
                // parent doesn't have an end tag
                $this->parent->_[HDOM_INFO_END] = 0; // No end tag
                $org_parent = $this->parent;

                // Traverse ancestors to find a matching opening tag
                // Stop at root node
                while (($this->parent->parent)
                    && strtolower($this->parent->tag) !== $tag_lower
                ) {
                    $this->parent = $this->parent->parent;
                }

                // If we don't have a match add current tag as text node
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent)
                && strtolower($this->parent->parent->tag) === $tag_lower
            ) { // Grandparent exists and current tag closes it
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else { // Random tag, add as text node
                return $this->as_text_node($tag);
            }
        }

        // Set end position of parent tag to current cursor position
        $this->parent->_[HDOM_INFO_END] = $this->cursor;

        // The closed element is done; continue parsing inside its parent
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // start tag
    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash); // Get tag name
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    // <!DOCTYPE html>
    // <![CDATA[ ... ]]>
    // <!-- Comment -->
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else { // Could be doctype or CDATA but we don't care
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }

        if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // The start tag cannot contain another start tag, if so add as text
    // i.e. "<<html>"
    // (The original assigned the comparison result to an unused $pos because
    // "!==" binds tighter than "="; only the containment test is needed.)
    if (strpos($tag, '<') !== false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Handle invalid tag names (i.e. "<html#doc>")
    if (!preg_match('/^\w[\w:-]*$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

        // Next char is the beginning of a new tag, don't touch it.
        if ($this->char === '<') {
            $this->link_nodes($node, false);
            return true;
        }

        // Next char closes current tag, add and be done with it.
        if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag, add new node
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        // Traverse ancestors to close all optional closing tags
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // last position seen by the attribute loop; prevents an infinite loop

    // [0] Space between tag and first attribute
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        // Everything until the first equal sign should be the attribute name
        $name = $this->copy_until($this->token_equal);

        if ($name === '' && $this->char !== null && $space[0] === '') {
            break;
        }

        if ($guard === $this->pos) { // No progress since last pass: escape infinite loop
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }

        $guard = $this->pos;

        // handle endless '<'
        // Out of bounds before the tag ended
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        // Attributes cannot start after opening tag
        if ($this->doc[$this->pos - 1] === '<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr(
                $this->doc,
                $begin_tag_pos,
                $this->pos - $begin_tag_pos - 1
            );
            // Step back so the stray "<" is parsed again as a new tag
            $this->pos -= 2;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name !== '/' && $name !== '') { // this is an attribute name
            // [1] Whitespace after attribute name
            $space[1] = $this->copy_skip($this->token_blank);

            $name = $this->restore_noise($name); // might be a noisy name

            if ($this->lowercase) { $name = strtolower($name); }

            if ($this->char === '=') { // attribute with value
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space); // get attribute value
            } else {
                // no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char !== '>') { $this->char = $this->doc[--$this->pos]; } // prev
            }

            $node->_[HDOM_INFO_SPACE][] = $space;

            // prepare for next attribute
            $space = array(
                $this->copy_skip($this->token_blank),
                '',
                ''
            );
        } else { // no more attributes
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // handle empty tags (i.e. "<div/>")
    if ($this->copy_until_char('>') === '/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // reset parent: following nodes become children of this element
        // unless it is a self-closing tag
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag === 'br') {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
2540
2541 /**
2542 * Parse attribute from current document position
2543 *
2544 * @param object $node Node for the attributes
2545 * @param string $name Name of the current attribute
2546 * @param array $space Array for spacing information
2547 * @return void
2548 */
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention
    // to the first one as opposed to the last one.
    // https://stackoverflow.com/a/26341866
    //
    // The duplicate's value must still be consumed. Returning before reading
    // it leaves the parser positioned on the value, which the caller would
    // then misread as further attribute names (and a ">" inside a quoted
    // value would end the tag prematurely).
    $is_duplicate = isset($node->attr[$name]);

    // [2] Whitespace between "=" and the value. Only recorded for the
    // attribute that is actually kept.
    $blank = $this->copy_skip($this->token_blank);
    if (!$is_duplicate) {
        $space[2] = $blank;
    }

    switch ($this->char) {
        case '"': // value is anything between double quotes
            $quote_type = HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('"');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        case '\'': // value is anything between single quotes
            $quote_type = HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('\'');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        default: // value is anything until the first space or end tag
            $quote_type = HDOM_QUOTE_NO;
            $value = $this->copy_until($this->token_attr);
    }

    // The first definition wins; the duplicate's value is discarded.
    if ($is_duplicate) {
        return;
    }

    $value = $this->restore_noise($value);

    // PaperG: Attributes should not have \r or \n in them, that counts as
    // html whitespace.
    $value = str_replace(array("\r", "\n"), '', $value);

    // PaperG: If this is a "class" selector, lets get rid of the preceding
    // and trailing space since some people leave it in the multi class case.
    if ($name === 'class') {
        $value = trim($value);
    }

    $node->_[HDOM_INFO_QUOTE][] = $quote_type;
    $node->attr[$name] = $value;
}
2589
2590 /**
2591 * Link node to parent node
2592 *
2593 * @param object $node Node to link to parent
2594 * @param bool $is_child True if the node is a child of parent
2595 * @return void
2596 */
2597 // link node's parent
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    // Every node is registered in the parent's node list ...
    $node->parent = $owner;
    $owner->nodes[] = $node;

    // ... but only real children also go into the children list.
    if (!$is_child) {
        return;
    }

    $owner->children[] = $node;
}
2606
2607 /**
2608 * Add tag as text node to current node
2609 *
2610 * @param string $tag Tag name
2611 * @return bool True on success
2612 */
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;

    // Store the closing tag verbatim as the node's text
    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // Advance to the next character, or null at end of document
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
2622
2623 /**
2624 * Seek from the current document position to the first occurrence of a
2625 * character not defined by the provided string. Update the current document
2626 * position to the new position.
2627 *
2628 * @param string $chars A string containing every allowed character.
2629 * @return void
2630 */
protected function skip($chars)
{
    // Length of the run of allowed characters starting at the current position
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    // Current character, or null once the end of the document is reached
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
2636
2637 /**
2638 * Copy substring from the current document position to the first occurrence
2639 * of a character not defined by the provided string.
2640 *
2641 * @param string $chars A string containing every allowed character.
2642 * @return string Substring from the current document position to the first
2643 * occurrence of a character not defined by the provided string.
2644 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $length = strspn($this->doc, $chars, $start);

    // Move past the run of allowed characters and refresh the current char
    $this->pos = $start + $length;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    // Nothing matched: return an empty string without calling substr()
    return ($length === 0) ? '' : substr($this->doc, $start, $length);
}
2654
2655 /**
2656 * Copy substring from the current document position to the first occurrence
2657 * of any of the provided characters.
2658 *
2659 * @param string $chars A string containing every character to stop at.
2660 * @return string Substring from the current document position to the first
2661 * occurrence of any of the provided characters.
2662 */
protected function copy_until($chars)
{
    $start = $this->pos;

    // Number of characters before the first stop character
    $length = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $length;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $length);
}
2671
2672 /**
2673 * Copy substring from the current document position to the first occurrence
2674 * of the provided string.
2675 *
2676 * @param string $char The string to stop at.
2677 * @return string Substring from the current document position to the first
2678 * occurrence of the provided string.
2679 */
protected function copy_until_char($char)
{
    // No current character: nothing to copy
    if ($this->char === null) {
        return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    // Stop string never occurs: hand back the remainder and move to the end
    if ($found === false) {
        $remainder = substr($this->doc, $start, $this->size - $start);
        $this->char = null;
        $this->pos = $this->size;
        return $remainder;
    }

    // Already positioned on the stop string: position stays untouched
    if ($found === $start) {
        return '';
    }

    $this->char = $this->doc[$found];
    $this->pos = $found;

    return substr($this->doc, $start, $found - $start);
}
2698
2699 /**
2700 * Remove noise from HTML content
2701 *
2702 * Noise is stored to {@see simple_html_dom::$noise}
2703 *
2704 * @param string $pattern The regex pattern used for finding noise
2705 * @param bool $remove_tag True to remove the entire match. Default is false
2706 * to only remove the captured data.
2707 */
protected function remove_noise($pattern, $remove_tag = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $total = preg_match_all(
        $pattern,
        $this->doc,
        $found,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // Which capture to cut out: 0 = entire match, 1 = submatch
    $group = ($remove_tag) ? 0 : 1;

    // Walk the matches back to front so earlier offsets stay valid while
    // the document is being rewritten.
    for ($i = $total - 1; $i >= 0; --$i) {
        $placeholder = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'key is: ' . $placeholder);
        }

        list($captured, $offset) = $found[$i][$group];

        // Remember the removed text and put the placeholder in its place
        $this->noise[$placeholder] = $captured;
        $this->doc = substr_replace($this->doc, $placeholder, $offset, strlen($captured));
    }

    // reset the length of content
    $this->size = strlen($this->doc);

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
2739
2740 /**
2741 * Restore noise to HTML content
2742 *
2743 * Noise is restored from {@see simple_html_dom::$noise}
2744 *
2745 * @param string $text A subset of HTML containing noise
2746 * @return string The same content with noise restored
2747 */
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $marker = '___noise___';
    $marker_len = strlen($marker); // 11
    $key_len = $marker_len + 5;    // marker followed by a 5 character number

    // Search offset. It only moves forward past markers that were written
    // back as part of an "undefined key" message. Without it, that message
    // (which contains the marker again) is matched over and over and the
    // loop never terminates while the string keeps growing.
    $offset = 0;

    while (($pos = strpos($text, $marker, $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the
        // pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos + 15) {
            $key = substr($text, $pos, $key_len);

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'located key of: ' . $key);
            }

            if (isset($this->noise[$key])) {
                // Restored content is scanned again on the next pass, so
                // placeholders nested inside it are restored as well.
                $text = substr($text, 0, $pos)
                    . $this->noise[$key]
                    . substr($text, $pos + $key_len);
            } else {
                // Unknown key: leave a visible message and continue the
                // search right after the marker embedded in that message.
                $notice = 'UNDEFINED NOISE FOR KEY: ';
                $text = substr($text, 0, $pos)
                    . $notice
                    . $key
                    . substr($text, $pos + $key_len);
                $offset = $pos + strlen($notice) + $marker_len;
            }
        } else {
            // There is no valid key being given back to us... We must get
            // rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos)
                . 'NO NUMERIC NOISE KEY'
                . substr($text, $pos + $marker_len);
        }
    }

    return $text;
}
2793
2794 // Sometimes we NEED one of the noise elements.
/**
 * Find the first stored noise element that contains the given text.
 *
 * @param string $text Text to look for inside the stored noise
 * @return string|null The first noise element containing $text, null if
 * none does
 */
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }

        return $candidate;
    }

    return null;
}
2806
/**
 * String conversion yields the inner text of the root node.
 *
 * @return string
 */
function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
2811
/**
 * Magic read access to a handful of virtual properties.
 *
 * 'outertext' and 'innertext' both return the root node's inner text,
 * 'plaintext' the root node's text, 'charset' and 'target_charset' the
 * corresponding internal fields. Any other name yields null.
 *
 * @param string $name Name of the property being read
 * @return mixed
 */
function __get($name)
{
    switch ($name) {
        case 'outertext':
        case 'innertext':
            return $this->root->innertext();
        case 'plaintext':
            return $this->root->text();
        case 'charset':
            return $this->_charset;
        case 'target_charset':
            return $this->_target_charset;
    }

    return null;
}
2827
2828 // camel naming conventions
/**
 * camelCase wrapper: forwards to childNodes() of the root node.
 *
 * @param int $idx Index passed through unchanged (default -1)
 * @return mixed Whatever the root node's childNodes() returns
 */
function childNodes($idx = -1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
2833
/**
 * camelCase wrapper: forwards to first_child() of the root node.
 *
 * @return mixed Whatever the root node's first_child() returns
 */
function firstChild()
{
    $root = $this->root;

    return $root->first_child();
}
2838
/**
 * camelCase wrapper: forwards to last_child() of the root node.
 *
 * @return mixed Whatever the root node's last_child() returns
 */
function lastChild()
{
    $root = $this->root;

    return $root->last_child();
}
2843
/**
 * Create an element by parsing "<name>value</name>" and returning the
 * first child of the resulting document.
 *
 * @param string $name Tag name of the element
 * @param string|null $value Inner content of the element
 * @return mixed The first child of the parsed fragment, or null when the
 * fragment could not be parsed into a document
 */
function createElement($name, $value = null)
{
    $dom = @str_get_html("<$name>$value</$name>");

    // NOTE(review): str_get_html() is defined outside this block and
    // presumably returns a non-object (false) on failure - confirm. Calling
    // a method on that result is a fatal Error that "@" cannot suppress.
    if (!is_object($dom)) {
        return null;
    }

    return $dom->first_child();
}
2848
/**
 * Create a text node by parsing the given value and returning the last
 * node of the resulting document.
 *
 * @param string $value Text content of the node
 * @return mixed The last node of the parsed fragment, false when the
 * fragment has no nodes, or null when it could not be parsed into a document
 */
function createTextNode($value)
{
    $dom = @str_get_html($value);

    // NOTE(review): str_get_html() is defined outside this block and
    // presumably returns a non-object (false) on failure, e.g. for an empty
    // string - confirm. Reading ->nodes from it and passing the result to
    // end() raises a TypeError on PHP 8 that "@" cannot suppress.
    if (!is_object($dom) || !is_array($dom->nodes)) {
        return null;
    }

    // Work on a copy: end() takes its argument by reference
    $nodes = $dom->nodes;

    return end($nodes);
}
2853
/**
 * camelCase wrapper: find the first match for the selector "#<id>".
 *
 * @param string $id Element id to look for
 * @return mixed Result of find() for index 0
 */
function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
2858
/**
 * camelCase wrapper: find matches for the selector "#<id>".
 *
 * @param string $id Element id to look for
 * @param int|null $idx Index passed through to find() (default null)
 * @return mixed Result of find()
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
2863
/**
 * camelCase wrapper: find the first match for the given tag name.
 *
 * @param string $name Tag name, used as the selector as-is
 * @return mixed Result of find() for index 0
 */
function getElementByTagName($name)
{
    $first = $this->find($name, 0);

    return $first;
}
2868
/**
 * camelCase wrapper: find matches for the given tag name.
 *
 * @param string $name Tag name, used as the selector as-is
 * @param int $idx Index passed through to find() (default -1)
 * @return mixed Result of find()
 */
function getElementsByTagName($name, $idx = -1)
{
    $matches = $this->find($name, $idx);

    return $matches;
}
2873
/**
 * camelCase wrapper around load_file().
 *
 * All arguments are forwarded to load_file() one by one. The original
 * passed the whole argument list as a single array argument, so
 * load_file() never received the arguments the caller supplied.
 * NOTE(review): load_file() is defined outside this block; presumably it
 * accepts the same variadic arguments - confirm.
 *
 * @return mixed Whatever load_file() returns
 */
function loadFile()
{
    $args = func_get_args();

    return call_user_func_array(array($this, 'load_file'), $args);
}
2879}
2880
2881
2882?>