· 6 years ago · Nov 05, 2019, 11:56 PM
1<?php
2/**
3 * Website: http://sourceforge.net/projects/simplehtmldom/
4 * Additional projects: http://sourceforge.net/projects/debugobject/
5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
6 * Contributions by:
7 * Yousuke Kumakura (Attribute filters)
8 * Vadim Voituk (Negative indexes supports of "find" method)
9 * Antcs (Constructor with automatically load contents either text or file/url)
10 *
11 * all affected sections have comments starting with "PaperG"
12 *
13 * Paperg - Added case insensitive testing of the value of the selector.
14 *
15 * Paperg - Added tag_start for the starting index of tags - NOTE: This works
16 * but not accurately. This tag_start gets counted AFTER \r\n have been crushed
17 * out, and after the remove_noice calls so it will not reflect the REAL
18 * position of the tag in the source, it will almost always be smaller by some
19 * amount. We use this to determine how far into the file the tag in question
20 * is. This "percentage" will never be accurate as the $dom->size is the "real"
21 * number of bytes the dom was created from. But for most purposes, it's a
22 * really good estimation.
23 *
24 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
25 * closed is great for malformed html, but it CAN lead to parsing errors.
26 *
27 * Allow the user to tell us how much they trust the html.
28 *
29 * Paperg add the text and plaintext to the selectors for the find syntax.
30 * plaintext implies text in the innertext of a node. text implies that the
31 * tag is a text node. This allows for us to find tags based on the text they
32 * contain.
33 *
34 * Create find_ancestor_tag to see if a tag is - at any level - inside of
35 * another specific tag.
36 *
37 * Paperg: added parse_charset so that we know about the character set of
38 * the source document. NOTE: If the user's system has a routine called
39 * get_last_retrieve_url_contents_content_type available, we will assume it's
40 * returning the content-type header from the last transfer or curl_exec, and
41 * we will parse that and use it in preference to any other method of charset
42 * detection.
43 *
44 * Found infinite loop in the case of broken html in restore_noise. Rewrote to
45 * protect from that.
46 *
47 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
48 *
49 * Licensed under The MIT License
50 * Redistributions of files must retain the above copyright notice.
51 *
52 * @author S.C. Chen <me578022@gmail.com>
53 * @author John Schlick
54 * @author Rus Carroll
55 * @version Rev. 1.8.1 (247)
56 * @package PlaceLocalInclude
57 * @subpackage simple_html_dom
58 */
59
/**
 * Constants shared by the DOM classes in this file.
 *
 * @author S.C. Chen <me578022@gmail.com>
 */

// Node types
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node "info" array ($node->_)
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// The following four defaults may be pre-defined by the including script;
// they are only defined here when that has not happened.

/** The default target charset */
if (!defined('DEFAULT_TARGET_CHARSET')) {
    define('DEFAULT_TARGET_CHARSET', 'UTF-8');
}

/** The default <br> text used instead of <br> tags when returning text */
if (!defined('DEFAULT_BR_TEXT')) {
    define('DEFAULT_BR_TEXT', "\r\n");
}

/** The default <span> text used instead of <span> tags when returning text */
if (!defined('DEFAULT_SPAN_TEXT')) {
    define('DEFAULT_SPAN_TEXT', ' ');
}

/** The maximum file size the parser should load */
if (!defined('MAX_FILE_SIZE')) {
    define('MAX_FILE_SIZE', 600000);
}

/** Contents between curly braces "{" and "}" are interpreted as text */
define('HDOM_SMARTY_AS_TEXT', 1);
96
97// helper functions
98// -----------------------------------------------------------------------------
/**
 * Build a DOM from the contents of a file or URL.
 *
 * $url, $use_include_path, $context and $offset are forwarded unchanged to
 * file_get_contents(). The remaining parameters are forwarded to the
 * simple_html_dom constructor and to its load() method.
 *
 * @param string $url File name or URL to read
 * @param bool $use_include_path Passed to file_get_contents()
 * @param resource|null $context Passed to file_get_contents()
 * @param int $offset Passed to file_get_contents()
 * @param int $maxLen Maximum number of bytes accepted. Values <= 0 (including
 * the default -1) select MAX_FILE_SIZE.
 * @param bool $lowercase Passed to the DOM constructor and load()
 * @param bool $forceTagsClosed Passed to the DOM constructor
 * @param string $target_charset Passed to the DOM constructor
 * @param bool $stripRN Passed to the DOM constructor and load()
 * @param string $defaultBRText Passed to the DOM constructor
 * @param string $defaultSpanText Passed to the DOM constructor
 * @return simple_html_dom|false The loaded DOM, or false if nothing could be
 * read, the contents are empty, or the contents exceed $maxLen bytes.
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Ensure maximum length is greater than zero
    if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

    /**
     * For sourceforge users: this file_get_contents() call is the default
     * retrieval mechanism. A project-specific retrieve_url_contents($url)
     * may be substituted here when the timeout must be controlled.
     *
     * One byte more than the limit is requested on purpose: reading exactly
     * $maxLen bytes would silently truncate oversized input and the size
     * check below could never trigger.
     */
    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen + 1);

    if (empty($contents) || strlen($contents) > $maxLen) { return false; }

    // The DOM is only created once the contents are known to be usable, so a
    // failed read never leaves a half-initialised DOM object behind.
    // We DO force the tags to be terminated (unless the caller opts out).
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
147
/**
 * Build a DOM from an HTML string.
 *
 * @param string $str The HTML source
 * @param bool $lowercase Passed to the DOM constructor and load()
 * @param bool $forceTagsClosed Passed to the DOM constructor
 * @param string $target_charset Passed to the DOM constructor
 * @param bool $stripRN Passed to the DOM constructor and load()
 * @param string $defaultBRText Passed to the DOM constructor
 * @param string $defaultSpanText Passed to the DOM constructor
 * @return simple_html_dom|false The loaded DOM, or false if $str is empty (as
 * defined by empty()) or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Reject unusable input before allocating a DOM, instead of creating one
    // only to clear and discard it again.
    if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
        return false;
    }

    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    $dom->load($str, $lowercase, $stripRN);
    return $dom;
}
175
/**
 * Dump the tree below a node by delegating to the node's dump() method.
 *
 * @param object $node Node whose dump() method is called
 * @param bool $show_attr Forwarded to dump() as its first argument
 * @param int $deep Forwarded to dump() as its second argument
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // Forward the caller's options; previously the node itself was passed as
    // the $show_attr argument and both options were ignored.
    $node->dump($show_attr, $deep);
}
181
182/**
183 * simple html dom node
184 * PaperG - added ability for "find" routine to lowercase the value of the
185 * selector.
186 *
187 * PaperG - added $tag_start to track the start position of the tag in the total
188 * byte index
189 *
190 * @package PlaceLocalInclude
191 */
192class simple_html_dom_node
193{
/**
 * Node type, compared against the HDOM_TYPE_* constants.
 * Defaults to {@see HDOM_TYPE_TEXT}.
 *
 * @var int
 */
public $nodetype = HDOM_TYPE_TEXT;

/**
 * Tag name; defaults to 'text'.
 *
 * @var string
 */
public $tag = 'text';

/**
 * Attributes of the node, keyed by attribute name.
 *
 * @var array
 */
public $attr = [];

/**
 * Child node objects.
 *
 * @var array
 */
public $children = [];

/**
 * Node objects attached to this node. parent() appends a node to both
 * $nodes and $children of its new parent.
 *
 * @var array
 */
public $nodes = [];

/**
 * The parent node object, or null when there is none.
 *
 * @var object|null
 */
public $parent = null;

/**
 * The "info" array - see the HDOM_INFO_* constants for what each element
 * contains.
 *
 * @var array
 */
public $_ = [];

/**
 * Start position of the tag in the document.
 *
 * @var int
 */
public $tag_start = 0;

/**
 * The DOM object this node belongs to.
 *
 * @var object|null
 */
private $dom = null;
250
/**
 * Create a node owned by the given DOM.
 *
 * The node registers itself by appending itself to the DOM's node list
 * ({@see simple_html_dom::$nodes}) and keeps a reference to the DOM.
 *
 * @param object $dom The owning DOM object
 */
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
261
/**
 * Release the node's object references on destruction (see clear()).
 */
function __destruct()
{
    $this->clear();
}
266
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
    $html = $this->outertext();
    return $html;
}
271
/**
 * Drop every object reference held by this node (DOM, parent, nodes and
 * children) to work around the PHP 5 circular-reference memory leak.
 *
 * @return void
 */
function clear()
{
    $this->children = null;
    $this->parent = null;
    $this->nodes = null;
    $this->dom = null;
}
280
/**
 * Print this node and, recursively, every entry of its node list.
 *
 * One line is echoed per node: the tag name, optionally followed by the
 * attribute list, prefixed by one lead string per nesting level.
 *
 * @param bool $show_attr Whether to print the node's attributes
 * @param int $deep Current nesting level
 * @return void
 */
function dump($show_attr = true, $deep = 0)
{
    $line = str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $line .= '(';
        foreach (array_keys($this->attr) as $name) {
            // The value is read through the property, not from $attr directly
            $line .= "[$name]=>\"" . $this->$name . '", ';
        }
        $line .= ')';
    }

    echo $line . "\n";

    if ($this->nodes) {
        foreach ($this->nodes as $child) {
            $child->dump($show_attr, $deep + 1);
        }
    }
}
304
305
/**
 * Debugging helper: describe a single node (tag, attributes, info array,
 * text, inner info, child/node counts and tag start) on one line.
 *
 * @param bool $echo Echo the description when true, return it otherwise
 * @return string|null The description when $echo is false, otherwise nothing
 */
function dump_node($echo = true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"" . $v2 . '", ';
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ')';
    }

    if (isset($this->text)) {
        $string .= ' text: (' . $this->text . ')';
    }

    $string .= " HDOM_INNER_INFO: '";

    // Read the inner info of THIS node; the previous code consulted an
    // undefined $node variable and therefore always reported NULL.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    // clear() sets both lists to null; count() must not be called on null.
    $string .= ' children: ' . (is_array($this->children) ? count($this->children) : 0);
    $string .= ' nodes: ' . (is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= ' tag_start: ' . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    } else {
        return $string;
    }
}
359
/**
 * Return or set parent node
 *
 * When a new parent is given, the node is first removed from the node and
 * child lists of its current parent (if any) and then appended to both lists
 * of the new parent.
 *
 * @param object|null $parent (optional) The parent node, `null` to return
 * the current parent node.
 * @return object|null The parent node
 */
function parent($parent = null)
{
    if ($parent !== null) {
        $previous = $this->parent;

        // Detach from the current parent first, otherwise the node would stay
        // listed there (or be listed twice when re-attached to the same one).
        if ($previous !== null) {
            if (is_array($previous->nodes)) {
                $pos = array_search($this, $previous->nodes, true);
                if ($pos !== false) {
                    unset($previous->nodes[$pos]);
                    // Keep the list densely indexed
                    $previous->nodes = array_values($previous->nodes);
                }
            }

            if (is_array($previous->children)) {
                $pos = array_search($this, $previous->children, true);
                if ($pos !== false) {
                    unset($previous->children[$pos]);
                    // children()/first_child()/last_child() index by position
                    $previous->children = array_values($previous->children);
                }
            }
        }

        $this->parent = $parent;
        $this->parent->nodes[] = $this;
        $this->parent->children[] = $this;
    }

    return $this->parent;
}
380
/**
 * Tell whether the child list is non-empty.
 *
 * @return bool True if the node has at least one child node
 */
function has_child()
{
    return $this->children ? true : false;
}
388
/**
 * Access the child list, or one entry of it.
 *
 * @param int $idx Index of the child node to return; exactly `-1` (the
 * default) returns the complete child list.
 * @return object|array|null The child node at $idx, the whole child list, or
 * null if there is no child at $idx.
 */
function children($idx = -1)
{
    if ($idx !== -1) {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }

    return $this->children;
}
409
/**
 * Get first child node
 *
 * @return object|null The first child node or null if the current node has
 * no child nodes.
 */
function first_child()
{
    // isset() also covers the state after clear(), where the child list is
    // null and count() would raise an error.
    return isset($this->children[0]) ? $this->children[0] : null;
}
426
/**
 * Get last child node
 *
 * @return object|null The last child node or null if the current node has
 * no child nodes.
 */
function last_child()
{
    // empty() also covers the state after clear(), where the child list is
    // null and count() would raise an error.
    if (empty($this->children)) {
        return null;
    }

    // end() moves the array pointer, so work on a local copy
    $children = $this->children;
    return end($children);
}
442
/**
 * Get the node that follows this one in the parent's child list.
 *
 * @return object|null The next sibling, or null if the node has no parent,
 * is not in the parent's child list, or is the last child.
 */
function next_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $position = array_search($this, $siblings, true);

    if ($position === false) {
        return null;
    }

    return isset($siblings[$position + 1]) ? $siblings[$position + 1] : null;
}
468
/**
 * Get previous sibling node
 *
 * @return object|null The node preceding this one in the parent's child
 * list, or null if the node has no parent, is not part of the parent's child
 * list, or is the first child.
 */
function prev_sibling()
{
    if ($this->parent === null) { return null; }

    $idx = array_search($this, $this->parent->children, true);

    // A node that is not in the child list has no previous sibling. The
    // former linear scan ran off the end in that case and returned the
    // parent's last child.
    if ($idx === false || $idx < 1) { return null; }

    return $this->parent->children[$idx - 1];
}
490
/**
 * Walk up the parent chain, starting with this node itself, until a node
 * with the requested tag is found.
 *
 * @param string $tag Tag to find
 * @return object|null The first node on the chain whose tag equals $tag, or
 * null when the chain ends without a match.
 *
 * @todo The walk ends because ->parent of the top-most node is null. This
 * behaviour could change at any time, rendering this function invalid.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    $debugging = is_object($debug_object);

    if ($debugging) { $debug_object->debug_log_entry(1); }

    // The node itself takes part in the comparison.
    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent) {
        if ($debugging) {
            $debug_object->debug_log(2, 'Current tag is: ' . $candidate->tag);
        }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    return null;
}
523
/**
 * Get the node's inner text (everything between opening and closing tag).
 *
 * A stored inner text wins, then a stored text with its noise restored;
 * otherwise the outer texts of all entries of the node list are joined.
 *
 * @return string
 */
function innertext()
{
    $info = &$this->_;

    if (isset($info[HDOM_INFO_INNER])) {
        return $info[HDOM_INFO_INNER];
    }

    if (isset($info[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
    }

    $html = '';

    foreach ($this->nodes as $child) {
        $html .= $child->outertext();
    }

    return $html;
}
547
/**
 * Get the node's outer text (everything including opening and closing tag).
 *
 * The root node returns its inner text. For any other node the DOM callback
 * (if one is set) runs first; then a stored outer text or stored text is
 * returned if present, otherwise the markup is assembled from the begin tag,
 * the inner content and the end tag.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;

    if (is_object($debug_object)) {
        $suffix = ($this->tag === 'text' && !empty($this->text))
            ? ' with text: ' . $this->text
            : '';

        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
    }

    if ($this->tag === 'root') { return $this->innertext(); }

    // Trigger the callback before any stored text is consulted, so that it
    // can still influence the result.
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // Begin tag
    $html = '';
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $html = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // Inner content
    if (!isset($this->_[HDOM_INFO_INNER])) {
        if ($this->nodes) {
            foreach ($this->nodes as $child) {
                $html .= $this->convert_text($child->outertext());
            }
        }
    } elseif ($this->tag !== 'br') {
        // For a br tag the stored inner info (which may or may not have been
        // added) is not part of the markup.
        $html .= $this->_[HDOM_INFO_INNER];
    }

    // End tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $html .= '</' . $this->tag . '>';
    }

    return $html;
}
613
/**
 * Get the node's plain text (everything excluding all tags).
 *
 * A stored inner text is returned as is. Text nodes return their stored text
 * with noise restored; comment and unknown nodes as well as script and style
 * tags yield an empty string. Otherwise the plain texts of all entries of the
 * node list are joined and trimmed.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;

    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    foreach (array('script', 'style') as $skipped) {
        if (strcasecmp($this->tag, $skipped) === 0) { return ''; }
    }

    // In rare cases (always HDOM_TYPE_ELEMENT - observed for some span and
    // some p tags) the node list is NULL although clear() was not called.
    // The cause is unknown; such a node simply has no text.
    if (is_null($this->nodes)) {
        return '';
    }

    $plain = '';

    foreach ($this->nodes as $child) {
        // Start paragraph after a blank line
        if ($child->tag === 'p') {
            $plain .= "\n\n";
        }

        $plain .= $this->convert_text($child->text());

        // Separate consecutive spans so that their texts don't run into
        // each other.
        if ($child->tag === 'span') {
            $plain .= $this->dom->default_span_text;
        }
    }

    return trim($plain);
}
660
/**
 * Get the node's inner text with the CDATA markers removed.
 *
 * Every '<![CDATA[' (matched case-insensitively) and every ']]>' is stripped
 * from the inner text.
 *
 * @return string
 */
function xmltext()
{
    $withoutOpening = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $withoutOpening);
}
673
/**
 * Build the node's opening tag including its attributes.
 *
 * Text, comment and unknown nodes (anything with a stored text) return that
 * text with noise restored instead.
 *
 * @return string
 */
function makeup()
{
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '<' . $this->tag;
    $position = 0;

    foreach ($this->attr as $name => $value) {
        // Spacing and quoting info is stored by attribute position, so the
        // counter advances for removed attributes as well.
        $i = $position++;

        // Skip removed attribute
        if ($value === null || $value === false) { continue; }

        $html .= $this->_[HDOM_INFO_SPACE][$i][0];

        // No-value attribute: nowrap, checked, selected...
        if ($value === true) {
            $html .= $name;
            continue;
        }

        $style = $this->_[HDOM_INFO_QUOTE][$i];

        if ($style == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($style == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $html .= $name
            . $this->_[HDOM_INFO_SPACE][$i][1]
            . '='
            . $this->_[HDOM_INFO_SPACE][$i][2]
            . $quote
            . $value
            . $quote;
    }

    return $this->dom->restore_noise($html) . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
717
/**
 * Find elements by CSS selector
 *
 * @param string $selector The CSS selector
 * @param int|null $idx Index of the element to return from the list of
 * matching elements; negative values count from the end (default: `null` =
 * return the whole list).
 * @param bool $lowercase Passed on to seek() for every selector element
 * (default: `false`)
 * @return array|object|null A list of elements matching the specified CSS
 * selector, or a single element if $idx is specified, or null if there is no
 * element at $idx.
 */
function find($selector, $idx = null, $lowercase = false)
{
    $selectors = $this->parse_selector($selector);
    if (count($selectors) === 0) { return array(); }

    $matched = array();

    // Each entry of $selectors is one comma separated selector
    foreach ($selectors as $chain) {
        // Checks every selector of the list, not just the first one
        // (sourceforge code tracker id 2788009).
        if (count($chain) === 0) { return array(); }
        if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

        $head = array($this->_[HDOM_INFO_BEGIN] => 1);
        $combinator = ' ';

        // Resolve the combinator chain iteratively, no recursion
        foreach ($chain as $step) {
            $next = array();

            foreach ($head as $key => $unused) {
                $start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                // PaperG - Pass this optional parameter on to the seek function.
                $start->seek($step, $next, $combinator, $lowercase);
            }

            $head = $next;
            $combinator = $step[4]; // Next combinator
        }

        // Merge; keys already collected are kept (all values are 1)
        $matched += $head;
    }

    // Sort keys
    ksort($matched);

    $found = array();
    foreach (array_keys($matched) as $key) {
        $found[] = $this->dom->nodes[$key];
    }

    // Return nth-element or array
    if (is_null($idx)) { return $found; }
    if ($idx < 0) { $idx += count($found); }

    return isset($found[$idx]) ? $found[$idx] : null;
}
781
/**
 * Seek DOM elements by selector
 *
 * Collects the candidate nodes selected by $parent_cmd relative to this node
 * and adds every candidate that matches $selector to $ret.
 *
 * **Note**
 * The selector element must be compatible to a selector from
 * {@see simple_html_dom_node::parse_selector()}
 *
 * @param array $selector A selector element
 * @param array $ret An array of matches, passed by reference. For every
 * match the node's HDOM_INFO_BEGIN value is added as key with value 1.
 * @param string $parent_cmd The combinator that relates this node to the
 * candidates (' ' descendants, '>' children, '+' next sibling, '~' subsequent
 * siblings)
 * @param bool $lowercase If enabled, class names of the node and attribute
 * values are compared in lowercase (default: `false`)
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // The fifth selector element (the following combinator) is handled by
    // the caller and not needed here.
    list($tag, $id, $class, $attributes) = $selector;
    $nodes = array();

    if ($parent_cmd === ' ') { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a closing
        // tag (i.e. void element)
        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end == 0) {
            $parent = $this->parent;
            // Test for null first so that no property of null is ever read
            while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            if ($parent !== null) {
                $end += $parent->_[HDOM_INFO_END];
            }
        }

        // Get list of target nodes
        $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
        $nodes_count = $end - $nodes_start;
        $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
    } elseif ($parent_cmd === '>') { // Child Combinator
        $nodes = $this->children;
    } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
        // Identity search only: a loose comparison of node objects would
        // compare their (circular) object graphs.
        $index = array_search($this, $this->parent->children, true);

        if ($index !== false) {
            if ($parent_cmd === '~') { // Subsequent Sibling Combinator
                // Siblings AFTER this element; the element itself is not
                // its own sibling.
                $nodes = array_slice($this->parent->children, $index + 1);
            } elseif (isset($this->parent->children[$index + 1])) { // Next-Sibling Combinator
                // The last child has no next sibling
                $nodes[] = $this->parent->children[$index + 1];
            }
        }
    }

    // Go through each element starting at this element until the end tag
    // Note: If this element is a void tag, any previous void element is
    // skipped.
    foreach ($nodes as $node) {
        $pass = true;

        // Skip root nodes
        if (!$node->parent) {
            $pass = false;
        }

        // Skip if node isn't a child node (i.e. text nodes)
        if ($pass && !in_array($node, $node->parent->children, true)) {
            $pass = false;
        }

        // Skip if tag doesn't match
        if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
            $pass = false;
        }

        // Skip if ID doesn't exist
        if ($pass && $id !== '' && !isset($node->attr['id'])) {
            $pass = false;
        }

        // Check if ID matches
        if ($pass && $id !== '' && isset($node->attr['id'])) {
            // Note: Only consider the first ID (as browsers do)
            $node_id = explode(' ', trim($node->attr['id']))[0];

            if ($id !== $node_id) { $pass = false; }
        }

        // Check if all class(es) exist
        if ($pass && $class !== '' && is_array($class) && !empty($class)) {
            if (isset($node->attr['class'])) {
                $node_classes = explode(' ', $node->attr['class']);

                if ($lowercase) {
                    $node_classes = array_map('strtolower', $node_classes);
                }

                foreach ($class as $c) {
                    // Strict: class names are strings and must not be
                    // compared numerically ("1e1" vs "10").
                    if (!in_array($c, $node_classes, true)) {
                        $pass = false;
                        break;
                    }
                }
            } else {
                $pass = false;
            }
        }

        // Check attributes
        if ($pass
            && $attributes !== ''
            && is_array($attributes)
            && !empty($attributes)) {
            foreach ($attributes as $a) {
                list (
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]")
                /**
                 * Note: This is not supported by the CSS Standard but adds
                 * the ability to select items compatible to XPath (i.e.
                 * the 3rd element within it's parent).
                 *
                 * Note: This doesn't conflict with the CSS Standard which
                 * doesn't work on numeric attributes anyway.
                 */
                if (is_numeric($att_name)
                    && $att_expr === ''
                    && $att_val === '') {
                    $count = 0;

                    // Find index of current element in parent
                    foreach ($node->parent->children as $c) {
                        if ($c->tag === $node->tag) ++$count;
                        if ($c === $node) break;
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($count === (int)$att_name) continue;
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if ($att_name !== 'plaintext'
                        && !isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ($att_expr === '') continue;

                // If they have told us that this is a "plaintext"
                // search then we want the plaintext of the node - right?
                // todo "plaintext" is not a valid CSS selector!
                if ($att_name === 'plaintext') {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'testing node: '
                        . $node->tag
                        . ' for attribute: '
                        . $att_name
                        . $att_expr
                        . $att_val
                        . ' where nodes value is: '
                        . $nodeKeyValue
                    );
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'after match: '
                        . ($check ? 'true' : 'false')
                    );
                }

                if (!$check) {
                    $pass = false;
                    break;
                }
            }
        }

        // Found a match. Add to list
        if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
    }

    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
    }
}
1008
/**
 * Match value and pattern for a given CSS expression
 *
 * **Supported Expressions**
 *
 * | Expression | Description
 * | ---------- | -----------
 * | `=` | $value and $pattern must be equal
 * | `!=` | $value and $pattern must not be equal
 * | `^=` | $value must start with $pattern
 * | `$=` | $value must end with $pattern
 * | `*=` | $value must contain $pattern
 * | `|=` | $value must equal $pattern or start with $pattern followed by "-"
 * | `~=` | $value is a whitespace-separated list containing $pattern
 *
 * @param string $exp The expression.
 * @param string $pattern The pattern
 * @param string $value The value
 * @param string $case_sensitivity 'i' compares $pattern and $value in
 * lowercase; any other value compares them as given.
 * @return bool True if $value matches $pattern, false otherwise (also for an
 * unsupported expression)
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($case_sensitivity === 'i') {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    // preg_match() returns an int; cast so that a bool is returned as
    // documented.
    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            return (bool)preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
        case '$=':
            return (bool)preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
        case '*=':
            return (bool)preg_match('/' . preg_quote($pattern, '/') . '/', $value);
        case '|=':
            /**
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D).
             *
             * A plain prefix test is not sufficient: "en" must match "en"
             * and "en-US", but not "english".
             */
            return $value === $pattern
                || strpos($value, $pattern . '-') === 0;
        case '~=':
            /**
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             *
             * Splitting on any whitespace run and dropping empty words
             * guarantees that an empty "val" never matches.
             */
            $words = preg_split('/\s+/', (string)$value, -1, PREG_SPLIT_NO_EMPTY);
            return in_array($pattern, $words, true);
    }
    return false;
}
1071
1072 /**
1073 * Parse CSS selector
1074 *
1075 * @param string $selector_string CSS selector string
1076 * @return array List of CSS selectors. The format depends on the type of
1077 * selector:
1078 *
1079 * ```php
1080 *
1081 * array( // list of selectors (each separated by a comma), i.e. 'img, p, div'
1082 * array( // list of combinator selectors, i.e. 'img > p > div'
1083 * array( // selector element
1084 * [0], // (string) The element tag
1085 * [1], // (string) The element id
1086 * [2], // (array<string>) The element classes
1087 * [3], // (array<array<string>>) The list of attributes, each
1088 * // with four elements: name, expression, value, inverted
1089 * [4] // (string) The selector combinator (' ' | '>' | '+' | '~')
1090 * )
1091 * )
1092 * )
1093 * ```
1094 *
1095 * @link https://www.w3.org/TR/selectors/#compound Compound selector
1096 */
1097 protected function parse_selector($selector_string)
1098 {
1099 global $debug_object;
1100 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1101
1102 /**
1103 * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
1104 *
1105 * Paperg: Add the colon to the attribute, so that it properly finds
1106 * <tag attr:ibute="something" > like google does.
1107 *
1108 * Note: if you try to look at this attribute, you MUST use getAttribute
1109 * since $dom->x:y will fail the php syntax check.
1110 *
1111 * Notice the \[ starting the attribute? and the @? following? This
1112 * implies that an attribute can begin with an @ sign that is not
1113 * captured. This implies that an html attribute specifier may start
1114 * with an @ sign that is NOT captured by the expression. Farther study
1115 * is required to determine of this should be documented or removed.
1116 *
1117 * Matches selectors in this order:
1118 *
1119 * [0] - full match
1120 *
1121 * [1] - tag name
1122 * ([\w:\*-]*)
1123 * Matches the tag name consisting of zero or more words, colons,
1124 * asterisks and hyphens.
1125 *
1126 * [2] - id name
1127 * (?:\#([\w-]+))
1128 * Optionally matches a id name, consisting of an "#" followed by
1129 * the id name (one or more words and hyphens).
1130 *
1131 * [3] - class names (including dots)
1132 * (?:\.([\w\.-]+))?
1133 * Optionally matches a list of classs, consisting of an "."
1134 * followed by the class name (one or more words and hyphens)
1135 * where multiple classes can be chained (i.e. ".foo.bar.baz")
1136 *
1137 * [4] - attributes
1138 * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
1139 * Optionally matches the attributes list
1140 *
1141 * [5] - separator
1142 * ([\/, >+~]+)
1143 * Matches the selector list separator
1144 */
1145 // phpcs:ignore Generic.Files.LineLength
1146 $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";
1147
1148 preg_match_all(
1149 $pattern,
1150 trim($selector_string) . ' ', // Add final ' ' as pseudo separator
1151 $matches,
1152 PREG_SET_ORDER
1153 );
1154
1155 if (is_object($debug_object)) {
1156 $debug_object->debug_log(2, 'Matches Array: ', $matches);
1157 }
1158
1159 $selectors = array();
1160 $result = array();
1161
1162 foreach ($matches as $m) {
1163 $m[0] = trim($m[0]);
1164
1165 // Skip NoOps
1166 if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }
1167
1168 // Convert to lowercase
1169 if ($this->dom->lowercase) {
1170 $m[1] = strtolower($m[1]);
1171 }
1172
1173 // Extract classes
1174 if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }
1175
1176 /* Extract attributes (pattern based on the pattern above!)
1177
1178 * [0] - full match
1179 * [1] - attribute name
1180 * [2] - attribute expression
1181 * [3] - attribute value
1182 * [4] - case sensitivity
1183 *
1184 * Note: Attributes can be negated with a "!" prefix to their name
1185 */
1186 if($m[4] !== '') {
1187 preg_match_all(
1188 "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is",
1189 trim($m[4]),
1190 $attributes,
1191 PREG_SET_ORDER
1192 );
1193
1194 // Replace element by array
1195 $m[4] = array();
1196
1197 foreach($attributes as $att) {
1198 // Skip empty matches
1199 if(trim($att[0]) === '') { continue; }
1200
1201 $inverted = (isset($att[1][0]) && $att[1][0] === '!');
1202 $m[4][] = array(
1203 $inverted ? substr($att[1], 1) : $att[1], // Name
1204 (isset($att[2])) ? $att[2] : '', // Expression
1205 (isset($att[3])) ? $att[3] : '', // Value
1206 $inverted, // Inverted Flag
1207 (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
1208 );
1209 }
1210 }
1211
1212 // Sanitize Separator
1213 if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
1214 $m[5] = ' ';
1215 } else { // Other Separator
1216 $m[5] = trim($m[5]);
1217 }
1218
1219 // Clear Separator if it's a Selector List
1220 if ($is_list = ($m[5] === ',')) { $m[5] = ''; }
1221
1222 // Remove full match before adding to results
1223 array_shift($m);
1224 $result[] = $m;
1225
1226 if ($is_list) { // Selector List
1227 $selectors[] = $result;
1228 $result = array();
1229 }
1230 }
1231
1232 if (count($result) > 0) { $selectors[] = $result; }
1233 return $selectors;
1234 }
1235
/**
 * Magic getter for attributes and virtual text properties.
 *
 * Lookup order:
 * 1. An attribute that is set (non-null) is returned after being passed
 *    through convert_text().
 * 2. The virtual properties "outertext", "innertext", "plaintext" and
 *    "xmltext" are answered by outertext(), innertext(), text() and
 *    xmltext() respectively.
 * 3. Anything else yields a boolean telling whether the key exists in the
 *    attribute array at all (true for a key stored with a null value).
 *
 * @param string $name Attribute or virtual property name
 * @return mixed
 */
function __get($name)
{
    if (isset($this->attr[$name])) {
        $raw_value = $this->attr[$name];

        return $this->convert_text($raw_value);
    }

    // Loose comparison is intentional: same matching rules as a switch.
    if ($name == 'outertext') {
        return $this->outertext();
    }

    if ($name == 'innertext') {
        return $this->innertext();
    }

    if ($name == 'plaintext') {
        return $this->text();
    }

    if ($name == 'xmltext') {
        return $this->xmltext();
    }

    return array_key_exists($name, $this->attr);
}
1249
/**
 * Magic setter for attributes and virtual text properties.
 *
 * - "outertext" overwrites the HDOM_INFO_OUTER slot.
 * - "innertext" overwrites the HDOM_INFO_TEXT slot when the node has one,
 *   otherwise the HDOM_INFO_INNER slot.
 * - Any other name is stored as an attribute. When that attribute is not
 *   currently set, a default spacing entry and the double-quote style are
 *   appended to the node's formatting info first.
 *
 * @param string $name Attribute or virtual property name
 * @param mixed $value Value to store
 * @return mixed The assigned value for "outertext"/"innertext", nothing
 * otherwise
 */
function __set($name, $value)
{
    global $debug_object;

    if (is_object($debug_object)) {
        $debug_object->debug_log_entry(1);
    }

    // Loose comparison is intentional: same matching rules as a switch.
    if ($name == 'outertext') {
        $this->_[HDOM_INFO_OUTER] = $value;

        return $value;
    }

    if ($name == 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
        $this->_[$slot] = $value;

        return $value;
    }

    $is_new_attribute = !isset($this->attr[$name]);

    if ($is_new_attribute) {
        // Formatting info for the attribute that is about to be stored
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}
1271
/**
 * Magic isset() handler.
 *
 * The virtual properties "outertext", "innertext" and "plaintext" always
 * count as set. For everything else the attribute array is consulted; a key
 * that merely exists is enough, so value-less attributes (nowrap, checked,
 * selected, ...) are reported as set as well.
 *
 * @param string $name Attribute or virtual property name
 * @return bool
 */
function __isset($name)
{
    // Non-strict in_array(): same loose matching as a switch statement.
    $virtual_properties = array('outertext', 'innertext', 'plaintext');

    if (in_array($name, $virtual_properties)) {
        return true;
    }

    if (array_key_exists($name, $this->attr)) {
        return true;
    }

    return isset($this->attr[$name]);
}
1282
/**
 * Magic unset() handler: drops an attribute from the attribute array.
 *
 * Only attributes that are set (non-null) are removed; anything else is
 * left untouched.
 *
 * @param string $name Attribute name
 * @return void
 */
function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
1287
/**
 * Converts text from the document charset to the target charset.
 *
 * Both charsets are read from the owning DOM object (when there is one). A
 * conversion through iconv() only takes place when both charsets are known
 * and differ. If the target is UTF-8 and the text already passes is_utf8(),
 * the reported source charset is assumed to be wrong and the text is left
 * alone. For a UTF-8 target a byte order mark at the start and/or end of the
 * result is removed.
 *
 * @param string $text Text to convert
 * @return string|false Converted text (whatever iconv() returned when a
 * conversion was attempted)
 */
function convert_text($text)
{
    global $debug_object;

    if (is_object($debug_object)) {
        $debug_object->debug_log_entry(1);
    }

    $from = '';
    $to = '';

    if ($this->dom) {
        $from = strtoupper($this->dom->_charset);
        $to = strtoupper($this->dom->_target_charset);
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(
            3,
            'source charset: ' . $from . ' target charaset: ' . $to
        );
    }

    $result = $text;

    $charsets_differ = !empty($from)
        && !empty($to)
        && (strcasecmp($from, $to) != 0);

    if ($charsets_differ) {
        // The reported encoding may be wrong: text that is already UTF-8
        // must not be converted a second time.
        $already_utf8 = (strcasecmp($to, 'UTF-8') == 0) && $this->is_utf8($text);

        if (!$already_utf8) {
            $result = iconv($from, $to, $text);
        }
    }

    // Never hand out UTF-8 text with a byte order mark attached.
    if ($to === 'UTF-8') {
        $bom = "\xef\xbb\xbf";

        if (substr($result, 0, 3) === $bom) {
            $result = substr($result, 3);
        }

        if (substr($result, -3) === $bom) {
            $result = substr($result, 0, -3);
        }
    }

    return $result;
}
1339
/**
 * Returns true if $str looks like well-formed UTF-8 and false otherwise.
 *
 * This is a structural check: every byte below 0x80 is accepted as ASCII,
 * every other byte must be a lead byte (0xC0-0xFD) announcing a sequence of
 * two to six bytes, and each following byte of that sequence must lie in
 * 0x80-0xBF. Overlong encodings and surrogate code points are not rejected.
 *
 * A continuation byte (0x80-0xBF) that does not follow a lead byte makes the
 * string invalid. This includes 0x80 itself, which must never be treated as
 * ASCII.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);

        // 0x00-0x7F: single-byte ASCII, nothing more to check
        if ($c < 128) {
            continue;
        }

        // Derive the sequence length from the lead byte
        if ($c >= 254) {
            return false; // 0xFE / 0xFF never appear in UTF-8
        } elseif ($c >= 252) {
            $bits = 6;
        } elseif ($c >= 248) {
            $bits = 5;
        } elseif ($c >= 240) {
            $bits = 4;
        } elseif ($c >= 224) {
            $bits = 3;
        } elseif ($c >= 192) {
            $bits = 2;
        } else {
            return false; // 0x80-0xBF: continuation byte without lead byte
        }

        // The announced sequence must fit into the remaining input
        if (($i + $bits) > $len) {
            return false;
        }

        // Every following byte of the sequence must be a continuation byte
        while ($bits > 1) {
            $i++;
            $b = ord($str[$i]);

            if ($b < 128 || $b > 191) {
                return false;
            }

            $bits--;
        }
    }

    return true;
}
1372
/**
 * Tries to determine the displayed size of an img element.
 *
 * Sources, in order of precedence:
 * 1. The element's "width" / "height" attributes (returned as-is).
 * 2. Integer pixel values ("123px") for "width" / "height" in the element's
 *    inline "style" attribute. Property names are matched case-insensitively
 *    and whitespace around the value is ignored.
 *
 * Style sheets (classes, ids, external CSS) are not consulted.
 *
 * @author John Schlick
 * @return array|false False when this node is not an img element, otherwise
 * an array with the keys 'height' and 'width'; a dimension that could not be
 * determined is -1.
 */
function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    // -1 marks a dimension that could not be determined
    $size = array(
        'height' => -1,
        'width' => -1
    );

    $dimensions = array('width', 'height');

    // Attributes on the tag itself win over inline styles
    foreach ($dimensions as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    if (isset($this->attr['style'])) {
        // Split the inline style into "property: value" declarations.
        // Thanks to user gnarf from stackoverflow for this regular expression.
        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        $declarations = array();

        foreach ($matches as $match) {
            // The value group is greedy and swallows trailing blanks, so trim
            // it; CSS property names are case-insensitive, so normalize them.
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach ($dimensions as $dimension) {
            if ($size[$dimension] != -1 || !isset($declarations[$dimension])) {
                continue;
            }

            $value = $declarations[$dimension];

            // Only pixel values can be reported
            if (strtolower(substr($value, -2)) !== 'px') {
                continue;
            }

            $pixels = substr($value, 0, -2);

            // Compare against false explicitly: a valid "0" validates to the
            // integer 0, which is falsy.
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $pixels;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has
    // a height or width attribute to it.

    // Far future enhancement:
    // Look at all the parent tags of this image to see if they specify a
    // class or id that has an img selector that specifies a height or width.
    // Note that in this case, the class or id will have the img subselector
    // for it to apply to the image.

    // Ridiculously far future development:
    // If the class or id is specified in a SEPARATE css file that's not on
    // the page, go get it and do what we were just doing for the ones on
    // the page.

    return $size;
}
1467
1468 // camel naming conventions
/**
 * DOM-style accessor for the node's complete attribute array.
 *
 * @return array Attributes as stored on this node
 */
function getAllAttributes()
{
    $attributes = $this->attr;

    return $attributes;
}
1473
/**
 * DOM-style alias that reads a value through the magic getter.
 *
 * @param string $name Attribute (or virtual property) name
 * @return mixed Whatever __get() yields for $name
 */
function getAttribute($name)
{
    $value = $this->__get($name);

    return $value;
}
1478
/**
 * DOM-style alias that stores a value through the magic setter.
 *
 * @param string $name Attribute (or virtual property) name
 * @param mixed $value Value to store
 * @return void
 */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}
1483
/**
 * DOM-style alias for the magic isset() handler.
 *
 * @param string $name Attribute (or virtual property) name
 * @return bool Whatever __isset() reports for $name
 */
function hasAttribute($name)
{
    $exists = $this->__isset($name);

    return $exists;
}
1488
/**
 * Assigns null to an attribute through the magic setter.
 *
 * The key itself remains in the attribute array with a null value.
 *
 * @param string $name Attribute name
 * @return void
 */
function removeAttribute($name)
{
    $cleared = null;

    $this->__set($name, $cleared);
}
1493
/**
 * Finds the first element matching the id selector "#<id>".
 *
 * @param string $id Element id (without the leading "#")
 * @return mixed Result of find() for index 0
 */
function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
1498
/**
 * Finds elements matching the id selector "#<id>".
 *
 * @param string $id Element id (without the leading "#")
 * @param int|null $idx Index forwarded to find(); null by default
 * @return mixed Result of find()
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
1503
/**
 * Finds the first element matching the given tag name selector.
 *
 * @param string $name Tag name, passed to find() as the selector
 * @return mixed Result of find() for index 0
 */
function getElementByTagName($name)
{
    $first_match = $this->find($name, 0);

    return $first_match;
}
1508
/**
 * Finds elements matching the given tag name selector.
 *
 * @param string $name Tag name, passed to find() as the selector
 * @param int|null $idx Index forwarded to find(); null by default
 * @return mixed Result of find()
 */
function getElementsByTagName($name, $idx = null)
{
    $matches = $this->find($name, $idx);

    return $matches;
}
1513
/**
 * DOM-style alias for parent().
 *
 * @return mixed Result of parent()
 */
function parentNode()
{
    $parent = $this->parent();

    return $parent;
}
1518
/**
 * DOM-style alias for children().
 *
 * @param int $idx Index forwarded to children(); -1 by default
 * @return mixed Result of children()
 */
function childNodes($idx = -1)
{
    $children = $this->children($idx);

    return $children;
}
1523
/**
 * DOM-style alias for first_child().
 *
 * @return mixed Result of first_child()
 */
function firstChild()
{
    $child = $this->first_child();

    return $child;
}
1528
/**
 * DOM-style alias for last_child().
 *
 * @return mixed Result of last_child()
 */
function lastChild()
{
    $child = $this->last_child();

    return $child;
}
1533
/**
 * DOM-style alias for next_sibling().
 *
 * @return mixed Result of next_sibling()
 */
function nextSibling()
{
    $sibling = $this->next_sibling();

    return $sibling;
}
1538
/**
 * DOM-style alias for prev_sibling().
 *
 * @return mixed Result of prev_sibling()
 */
function previousSibling()
{
    $sibling = $this->prev_sibling();

    return $sibling;
}
1543
/**
 * DOM-style alias for has_child().
 *
 * @return mixed Result of has_child()
 */
function hasChildNodes()
{
    $has_children = $this->has_child();

    return $has_children;
}
1548
/**
 * DOM-style accessor for the node's tag name.
 *
 * @return string Value of the node's tag property
 */
function nodeName()
{
    $tag_name = $this->tag;

    return $tag_name;
}
1553
/**
 * Attaches a node to this node by calling the child's parent() method with
 * this node as argument.
 *
 * @param object $node Node to attach
 * @return object The node that was passed in
 */
function appendChild($node)
{
    $new_parent = $this;

    $node->parent($new_parent);

    return $node;
}
1559
1560}
1561
1562/**
1563 * simple html dom parser
1564 *
1565 * Paperg - in the find routine: allow us to specify that we want case
1566 * insensitive testing of the value of the selector.
1567 *
1568 * Paperg - change $size from protected to public so we can easily access it
1569 *
1570 * Paperg - added ForceTagsClosed in the constructor which tells us whether we
1571 * trust the html or not. Default is to NOT trust it.
1572 *
1573 * @package PlaceLocalInclude
1574 */
1575class simple_html_dom
1576{
1577 /**
1578 * The root node of the document
1579 *
1580 * @var object
1581 */
1582 public $root = null;
1583
1584 /**
1585 * List of nodes in the current DOM
1586 *
1587 * @var array
1588 */
1589 public $nodes = array();
1590
1591 /**
1592 * Callback function to run for each element in the DOM.
1593 *
1594 * @var callable|null
1595 */
1596 public $callback = null;
1597
1598 /**
1599 * Indicates how tags and attributes are matched
1600 *
1601 * @var bool When set to **true** tags and attributes will be converted to
1602 * lowercase before matching.
1603 */
1604 public $lowercase = false;
1605
1606 /**
1607 * Original document size
1608 *
1609 * Holds the original document size.
1610 *
1611 * @var int
1612 */
1613 public $original_size;
1614
1615 /**
1616 * Current document size
1617 *
1618 * Holds the current document size. The document size is determined by the
1619 * string length of ({@see simple_html_dom::$doc}).
1620 *
1621 * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1622 *
1623 * @var int
1624 * */
1625 public $size;
1626
1627 /**
1628 * Current position in the document
1629 *
1630 * @var int
1631 */
1632 protected $pos;
1633
1634 /**
1635 * The document
1636 *
1637 * @var string
1638 */
1639 protected $doc;
1640
1641 /**
1642 * Current character
1643 *
1644 * Holds the current character at position {@see simple_html_dom::$pos} in
1645 * the document {@see simple_html_dom::$doc}
1646 *
1647 * _Note_: Using this variable is more efficient than calling
1648 * `substr($doc, $pos, 1)`
1649 *
1650 * @var string
1651 */
1652 protected $char;
1653
1654 protected $cursor;
1655
1656 /**
1657 * Parent node of the next node detected by the parser
1658 *
1659 * @var object
1660 */
1661 protected $parent;
1662 protected $noise = array();
1663
1664 /**
1665 * Tokens considered blank in HTML
1666 *
1667 * @var string
1668 */
1669 protected $token_blank = " \t\r\n";
1670
1671 /**
1672 * Tokens to identify the equal sign for attributes, stopping either at the
1673 * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1674 * "<html>")
1675 *
1676 * @var string
1677 */
1678 protected $token_equal = ' =/>';
1679
1680 /**
1681 * Tokens to identify the end of a tag name. A tag name either ends on the
1682 * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1683 *
1684 * @var string
1685 */
1686 protected $token_slash = " />\r\n\t";
1687
1688 /**
1689 * Tokens to identify the end of an attribute
1690 *
1691 * @var string
1692 */
1693 protected $token_attr = ' >';
1694
1695 // Note that this is referenced by a child node, and so it needs to be
1696 // public for that node to see this information.
1697 public $_charset = '';
1698 public $_target_charset = '';
1699
1700 /**
1701 * Innertext for <br> elements
1702 *
1703 * @var string
1704 */
1705 protected $default_br_text = '';
1706
1707 /**
1708 * Suffix for <span> elements
1709 *
1710 * @var string
1711 */
1712 public $default_span_text = '';
1713
1714 /**
1715 * Defines a list of self-closing tags (Void elements) according to the HTML
1716 * Specification
1717 *
1718 * _Remarks_:
1719 * - Use `isset()` instead of `in_array()` on array elements to boost
1720 * performance about 30%
1721 * - Sort elements by name for better readability!
1722 *
1723 * @link https://www.w3.org/TR/html HTML Specification
1724 * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1725 */
1726 protected $self_closing_tags = array(
1727 'area' => 1,
1728 'base' => 1,
1729 'br' => 1,
1730 'col' => 1,
1731 'embed' => 1,
1732 'hr' => 1,
1733 'img' => 1,
1734 'input' => 1,
1735 'link' => 1,
1736 'meta' => 1,
1737 'param' => 1,
1738 'source' => 1,
1739 'track' => 1,
1740 'wbr' => 1
1741 );
1742
1743 /**
1744 * Defines a list of tags which - if closed - close all optional closing
1745 * elements within if they haven't been closed yet. (So, an element where
1746 * neither opening nor closing tag is omissible consistently closes every
1747 * optional closing element within)
1748 *
1749 * _Remarks_:
1750 * - Use `isset()` instead of `in_array()` on array elements to boost
1751 * performance about 30%
1752 * - Sort elements by name for better readability!
1753 */
1754 protected $block_tags = array(
1755 'body' => 1,
1756 'div' => 1,
1757 'form' => 1,
1758 'root' => 1,
1759 'span' => 1,
1760 'table' => 1
1761 );
1762
1763 /**
1764 * Defines elements whose end tag is omissible.
1765 *
1766 * * key = Name of an element whose end tag is omissible.
1767 * * value = Names of elements whose end tag is omissible, that are closed
1768 * by the current element.
1769 *
1770 * _Remarks_:
1771 * - Use `isset()` instead of `in_array()` on array elements to boost
1772 * performance about 30%
1773 * - Sort elements by name for better readability!
1774 *
1775 * **Example**
1776 *
1777 * An `li` element’s end tag may be omitted if the `li` element is immediately
1778 * followed by another `li` element. To do that, add following element to the
1779 * array:
1780 *
1781 * ```php
1782 * 'li' => array('li'),
1783 * ```
1784 *
1785 * With this, the following two examples are considered equal. Note that the
1786 * second example is missing the closing tags on `li` elements.
1787 *
1788 * ```html
1789 * <ul><li>First Item</li><li>Second Item</li></ul>
1790 * ```
1791 *
1792 * <ul><li>First Item</li><li>Second Item</li></ul>
1793 *
1794 * ```html
1795 * <ul><li>First Item<li>Second Item</ul>
1796 * ```
1797 *
1798 * <ul><li>First Item<li>Second Item</ul>
1799 *
1800 * @var array A two-dimensional array where the key is the name of an
1801 * element whose end tag is omissible and the value is an array of elements
1802 * whose end tag is omissible, that are closed by the current element.
1803 *
1804 * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1805 *
1806 * @todo The implementation of optional closing tags doesn't work in all cases
1807 * because it only considers elements that close other optional closing
1808 * tags, not taking into account that some (non-blocking) tags should close
1809 * these optional closing tags. For example, the end tag for "p" is omissible
1810 * and can be closed by an "address" element, whose end tag is NOT omissible.
1811 * Currently a "p" element without closing tag stops at the next "p" element
1812 * or blocking tag, even if it contains other elements.
1813 *
1814 * @todo Known sourceforge issue #2977341
1815 * B tags that are not closed cause us to return everything to the end of
1816 * the document.
1817 */
1818 protected $optional_closing_tags = array(
1819 // Not optional, see
1820 // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1821 'b' => array('b' => 1),
1822 'dd' => array('dd' => 1, 'dt' => 1),
1823 // Not optional, see
1824 // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1825 'dl' => array('dd' => 1, 'dt' => 1),
1826 'dt' => array('dd' => 1, 'dt' => 1),
1827 'li' => array('li' => 1),
1828 'optgroup' => array('optgroup' => 1, 'option' => 1),
1829 'option' => array('optgroup' => 1, 'option' => 1),
1830 'p' => array('p' => 1),
1831 'rp' => array('rp' => 1, 'rt' => 1),
1832 'rt' => array('rp' => 1, 'rt' => 1),
1833 'td' => array('td' => 1, 'th' => 1),
1834 'th' => array('td' => 1, 'th' => 1),
1835 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1836 );
1837
/**
 * Creates a DOM object and optionally loads a document right away.
 *
 * When $str is non-empty it is either loaded through load_file() (if it
 * starts with "http://" or names an existing file) or parsed directly
 * through load(). Note that load_file() receives only $str; the remaining
 * options are forwarded to load() only.
 *
 * @param string|null $str HTML string, URL or file path; nothing is loaded
 * when empty
 * @param bool $lowercase Forwarded to load()
 * @param bool $forceTagsClosed When false, the table of optional closing
 * tags is emptied, i.e. the document is trusted to close its tags itself
 * @param string $target_charset Charset that text is converted to on access
 * @param bool $stripRN Forwarded to load()
 * @param string $defaultBRText Forwarded to load()
 * @param string $defaultSpanText Forwarded to load()
 * @param int $options Forwarded to load()
 */
function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    // This has to happen before anything is parsed and has to target the
    // property the parser actually reads ($optional_closing_tags), otherwise
    // the setting has no effect.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    }

    $this->_target_charset = $target_charset;
}
1870
/**
 * Releases everything held by this DOM object when it is destroyed.
 *
 * @return void
 */
function __destruct()
{
    $this->clear();
}
1875
/**
 * Loads HTML from a string.
 *
 * Steps: reset the parser state (prepare), move content that must not be
 * parsed as markup out of the way (remove_noise), optionally replace line
 * breaks by blanks, parse the document and finally determine its charset.
 *
 * @param string $str HTML to load
 * @param bool $lowercase Forwarded to prepare()
 * @param bool $stripRN When true, every "\r" and "\n" is replaced by a blank
 * @param string $defaultBRText Forwarded to prepare()
 * @param string $defaultSpanText Forwarded to prepare()
 * @param int $options Bit mask; HDOM_SMARTY_AS_TEXT additionally treats
 * Smarty scripts as noise
 * @return $this Allows chaining
 */
function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    global $debug_object;

    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // script tags are removed first (before style tags).
    $script_patterns = array(
        "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is",
        "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"
    );

    foreach ($script_patterns as $script_pattern) {
        $this->remove_noise($script_pattern);
    }

    if ($stripRN) {
        $this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);

        // The content changed, so refresh the cached length.
        $this->size = strlen($this->doc);
    }

    // Remaining noise rules in processing order: array(pattern, flag).
    // Rules flagged true are passed to remove_noise() with an explicit
    // second argument of true; the others use its default.
    $noise_rules = array(
        array("'<!\[CDATA\[(.*?)\]\]>'is", true), // cdata
        array("'<!--(.*?)-->'is", false), // comments
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false), // style tags
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false), // preformatted tags
        array("'(<\?)(.*?)(\?>)'s", true) // server side scripts
    );

    if ($options & HDOM_SMARTY_AS_TEXT) {
        $noise_rules[] = array("'(\{\w)(.*?)(\})'s", true); // Smarty scripts
    }

    foreach ($noise_rules as $rule) {
        if ($rule[1]) {
            $this->remove_noise($rule[0], true);
        } else {
            $this->remove_noise($rule[0]);
        }
    }

    $this->parse();

    // Mark the end of the root node, then figure out the charset
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
1930
/**
 * Loads HTML from a file or URL.
 *
 * All arguments are handed to file_get_contents() unchanged. The fetched
 * content is then loaded through load() with $lowercase set to true.
 *
 * @return false|null False when the content could not be read, nothing
 * otherwise
 */
function load_file()
{
    $arguments = func_get_args();
    $content = call_user_func_array('file_get_contents', $arguments);

    if ($content === false) {
        return false;
    }

    $this->load($content, true);
}
1942
/**
 * Registers the callback function.
 *
 * @param callable $function_name Callback function to run for each element
 * in the DOM.
 * @return void
 */
function set_callback($function_name)
{
    $callback = $function_name;

    $this->callback = $callback;
}
1954
/**
 * Unregisters the callback function by resetting it to null.
 *
 * @return void
 */
function remove_callback()
{
    $this->callback = null;
}
1964
/**
 * Returns the DOM as a string and optionally writes it to a file.
 *
 * @param string $filepath Target file; nothing is written when this is an
 * empty string. The file is written with an exclusive lock.
 * @return string The root node's inner text
 */
function save($filepath = '')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
}
1972
/**
 * Finds DOM nodes by CSS selector, starting at the root node.
 *
 * Paperg - allow us to specify that we want case insensitive testing of the
 * value of the selector.
 *
 * @param string $selector CSS selector
 * @param int|null $idx Index forwarded to the root node's find()
 * @param bool $lowercase Forwarded to the root node's find()
 * @return mixed Result of the root node's find()
 */
function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
1979
/**
 * Cleans up memory (php5 circular references memory leak).
 *
 * Calls clear() on every node of the node list, on the entries of a
 * "children" property if there is one, and on the parent and root nodes.
 * Afterwards the parent, root, document and noise properties are unset.
 *
 * @return void
 */
function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // This is documented in the sourceforge repository (2977248) as a fix
    // for ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    // Parent first, then root
    foreach (array('parent', 'root') as $property) {
        if (isset($this->$property)) {
            $this->$property->clear();
            unset($this->$property);
        }
    }

    unset($this->doc);
    unset($this->noise);
}
2009
/**
 * Dumps the DOM by delegating to the root node's dump().
 *
 * @param bool $show_attr Forwarded to the root node's dump()
 * @return void
 */
function dump($show_attr = true)
{
    $root = $this->root;

    $root->dump($show_attr);
}
2014
/**
 * Prepares the HTML data and resets the parser state.
 *
 * Clears any previous document, stores the trimmed input together with its
 * length, resets position, cursor, noise and node list, and creates a fresh
 * root node which also becomes the current parent.
 *
 * @param string $str HTML to parse
 * @param bool $lowercase Stored in the lowercase property
 * @param string $defaultBRText Stored in the default_br_text property
 * @param string $defaultSpanText Stored in the default_span_text property
 * @return void
 */
protected function prepare(
    $str, $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $this->clear();

    $document = trim($str);
    $length = strlen($document);

    // Document and sizes
    $this->doc = $document;
    $this->size = $length;
    $this->original_size = $length; // original size of the html

    // Parser state
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();

    // Settings
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // Root node; created only after the node list has been reset
    $root = new simple_html_dom_node($this);
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;

    $this->root = $root;
    $this->parent = $root;

    if ($length > 0) {
        $this->char = $document[0];
    }
}
2040
/**
 * Parses the HTML content.
 *
 * Alternates between text and tags: any text in front of the next "<"
 * becomes a text node; when there is no such text, the next tag is read.
 * Parsing ends as soon as read_tag() reports that no tag was found.
 *
 * @return bool True on success
 */
protected function parse()
{
    for (;;) {
        $text = $this->copy_until_char('<');

        if ($text !== '') {
            // Text between tags: add it as a text node
            $text_node = new simple_html_dom_node($this);
            ++$this->cursor;
            $text_node->_[HDOM_INFO_TEXT] = $text;
            $this->link_nodes($text_node, false);
            continue;
        }

        // No text before the next opening tag: read the tag itself
        if (!$this->read_tag()) {
            return true;
        }
    }
}
2066
/**
 * Determines the character set of the parsed document.
 *
 * PAPERG - dkchou - added this to try to identify the character set of the
 * page we have just parsed so we know better how to spit it out later.
 *
 * Sources, in order of precedence:
 * 1. The Content-Type header returned by the optional user supplied function
 *    get_last_retrieve_url_contents_content_type() (i.e. the
 *    CURLINFO_CONTENT_TYPE of the last curl_exec or the content-type header
 *    of the last transfer).
 * 2. A <meta http-equiv="Content-Type"> element; when its content does not
 *    name a charset, ISO-8859-1 is assumed.
 * 3. A <meta charset="..."> element.
 * 4. mb_detect_encoding() (UTF-8 or CP1252), if available.
 * 5. UTF-8 as the last resort.
 *
 * The "charset" parameter name is matched case-insensitively and the value
 * may be quoted; it ends at whitespace, a quote or a semicolon.
 * ISO-8859-1 / Latin1 / Latin-1 are replaced by their superset CP1252.
 *
 * @return string The detected charset, which is also stored in $_charset
 */
protected function parse_charset()
{
    global $debug_object;

    $charset_pattern = '/charset=["\']?([^\s"\';]+)/i';
    $charset = null;

    // 1. Content-Type header provided by the user supplied hook
    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        $success = preg_match(
            $charset_pattern,
            (string)$contentTypeHeader,
            $matches
        );

        if ($success) {
            $charset = $matches[1];

            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'header content-type found charset of: '
                    . $charset
                );
            }
        }
    }

    // 2. <meta http-equiv="Content-Type" content="...; charset=...">
    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

        if (!empty($el)) {
            $fullvalue = $el->content;

            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'meta content-type tag found'
                    . $fullvalue
                );
            }

            if (!empty($fullvalue)) {
                $success = preg_match(
                    $charset_pattern,
                    $fullvalue,
                    $matches
                );

                if ($success) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the
                    // character set, research says that it's typically
                    // ISO-8859-1
                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                        );
                    }

                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // 3. <meta charset="...">
    if (empty($charset)) {
        $el = $this->root->find('meta[charset]', 0);

        if (!empty($el)) {
            $declared = $el->charset;

            // A value-less charset attribute does not yield a string
            if (is_string($declared) && trim($declared) !== '') {
                $charset = trim($declared);

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'meta charset tag found: '
                        . $charset
                    );
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one
    // based on the text we got...
    if (empty($charset)) {
        // Use this in case mb_detect_encoding isn't installed/loaded on
        // this machine.
        $charset = false;

        if (function_exists('mb_detect_encoding')) {
            // Have php try to detect the encoding from the text given to us.
            $charset = mb_detect_encoding(
                $this->doc . 'ascii',
                array('UTF-8', 'CP1252')
            );

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'mb_detect found: ' . $charset);
            }
        }

        // and if this doesn't work... then we need to just wrongheadedly
        // assume it's UTF-8 so that we can move on - cause this will
        // usually give us most of what we need...
        if ($charset === false) {
            if (is_object($debug_object)) {
                $debug_object->debug_log(
                    2,
                    'since mb_detect failed - using default of utf-8'
                );
            }

            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want
    // it instead.
    $cp1252_subsets = array('iso-8859-1', 'latin1', 'latin-1');

    if (in_array(strtolower($charset), $cp1252_subsets, true)) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(
                2,
                'replacing ' . $charset . ' with CP1252 as its a superset'
            );
        }

        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ' . $charset);
    }

    return $this->_charset = $charset;
}
2186
2187 /**
2188 * Parse tag from current document position.
2189 *
2190 * @return bool True if a tag was found, false otherwise
2191 */
2192 protected function read_tag()
2193 {
2194 // Set end position if no further tags found
2195 if ($this->char !== '<') {
2196 $this->root->_[HDOM_INFO_END] = $this->cursor;
2197 return false;
2198 }
2199
2200 $begin_tag_pos = $this->pos;
2201 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2202
2203 // end tag
2204 if ($this->char === '/') {
2205 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2206
2207 // Skip whitespace in end tags (i.e. in "</ html>")
2208 $this->skip($this->token_blank);
2209 $tag = $this->copy_until_char('>');
2210
2211 // Skip attributes in end tags
2212 if (($pos = strpos($tag, ' ')) !== false) {
2213 $tag = substr($tag, 0, $pos);
2214 }
2215
2216 $parent_lower = strtolower($this->parent->tag);
2217 $tag_lower = strtolower($tag);
2218
2219 // The end tag is supposed to close the parent tag. Handle situations
2220 // when it doesn't
2221 if ($parent_lower !== $tag_lower) {
2222 // Parent tag does not have to be closed necessarily (optional closing tag)
2223 // Current tag is a block tag, so it may close an ancestor
2224 if (isset($this->optional_closing_tags[$parent_lower])
2225 && isset($this->block_tags[$tag_lower])) {
2226
2227 $this->parent->_[HDOM_INFO_END] = 0;
2228 $org_parent = $this->parent;
2229
2230 // Traverse ancestors to find a matching opening tag
2231 // Stop at root node
2232 while (($this->parent->parent)
2233 && strtolower($this->parent->tag) !== $tag_lower
2234 ){
2235 $this->parent = $this->parent->parent;
2236 }
2237
2238 // If we don't have a match add current tag as text node
2239 if (strtolower($this->parent->tag) !== $tag_lower) {
2240 $this->parent = $org_parent; // restore origonal parent
2241
2242 if ($this->parent->parent) {
2243 $this->parent = $this->parent->parent;
2244 }
2245
2246 $this->parent->_[HDOM_INFO_END] = $this->cursor;
2247 return $this->as_text_node($tag);
2248 }
2249 } elseif (($this->parent->parent)
2250 && isset($this->block_tags[$tag_lower])
2251 ) {
2252 // Grandparent exists and current tag is a block tag, so our
2253 // parent doesn't have an end tag
2254 $this->parent->_[HDOM_INFO_END] = 0; // No end tag
2255 $org_parent = $this->parent;
2256
2257 // Traverse ancestors to find a matching opening tag
2258 // Stop at root node
2259 while (($this->parent->parent)
2260 && strtolower($this->parent->tag) !== $tag_lower
2261 ) {
2262 $this->parent = $this->parent->parent;
2263 }
2264
2265 // If we don't have a match add current tag as text node
2266 if (strtolower($this->parent->tag) !== $tag_lower) {
2267 $this->parent = $org_parent; // restore origonal parent
2268 $this->parent->_[HDOM_INFO_END] = $this->cursor;
2269 return $this->as_text_node($tag);
2270 }
2271 } elseif (($this->parent->parent)
2272 && strtolower($this->parent->parent->tag) === $tag_lower
2273 ) { // Grandparent exists and current tag closes it
2274 $this->parent->_[HDOM_INFO_END] = 0;
2275 $this->parent = $this->parent->parent;
2276 } else { // Random tag, add as text node
2277 return $this->as_text_node($tag);
2278 }
2279 }
2280
2281 // Set end position of parent tag to current cursor position
2282 $this->parent->_[HDOM_INFO_END] = $this->cursor;
2283
2284 if ($this->parent->parent) {
2285 $this->parent = $this->parent->parent;
2286 }
2287
2288 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2289 return true;
2290 }
2291
2292 // start tag
2293 $node = new simple_html_dom_node($this);
2294 $node->_[HDOM_INFO_BEGIN] = $this->cursor;
2295 ++$this->cursor;
2296 $tag = $this->copy_until($this->token_slash); // Get tag name
2297 $node->tag_start = $begin_tag_pos;
2298
2299 // doctype, cdata & comments...
2300 // <!DOCTYPE html>
2301 // <![CDATA[ ... ]]>
2302 // <!-- Comment -->
2303 if (isset($tag[0]) && $tag[0] === '!') {
2304 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
2305
2306 if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
2307 $node->nodetype = HDOM_TYPE_COMMENT;
2308 $node->tag = 'comment';
2309 } else { // Could be doctype or CDATA but we don't care
2310 $node->nodetype = HDOM_TYPE_UNKNOWN;
2311 $node->tag = 'unknown';
2312 }
2313
2314 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2315
2316 $this->link_nodes($node, true);
2317 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2318 return true;
2319 }
2320
2321 // The start tag cannot contain another start tag, if so add as text
2322 // i.e. "<<html>"
2323 if ($pos = strpos($tag, '<') !== false) {
2324 $tag = '<' . substr($tag, 0, -1);
2325 $node->_[HDOM_INFO_TEXT] = $tag;
2326 $this->link_nodes($node, false);
2327 $this->char = $this->doc[--$this->pos]; // prev
2328 return true;
2329 }
2330
2331 // Handle invalid tag names (i.e. "<html#doc>")
2332 if (!preg_match('/^\w[\w:-]*$/', $tag)) {
2333 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
2334
2335 // Next char is the beginning of a new tag, don't touch it.
2336 if ($this->char === '<') {
2337 $this->link_nodes($node, false);
2338 return true;
2339 }
2340
2341 // Next char closes current tag, add and be done with it.
2342 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2343 $this->link_nodes($node, false);
2344 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2345 return true;
2346 }
2347
2348 // begin tag, add new node
2349 $node->nodetype = HDOM_TYPE_ELEMENT;
2350 $tag_lower = strtolower($tag);
2351 $node->tag = ($this->lowercase) ? $tag_lower : $tag;
2352
2353 // handle optional closing tags
2354 if (isset($this->optional_closing_tags[$tag_lower])) {
2355 // Traverse ancestors to close all optional closing tags
2356 while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
2357 $this->parent->_[HDOM_INFO_END] = 0;
2358 $this->parent = $this->parent->parent;
2359 }
2360 $node->parent = $this->parent;
2361 }
2362
2363 $guard = 0; // prevent infinity loop
2364
2365 // [0] Space between tag and first attribute
2366 $space = array($this->copy_skip($this->token_blank), '', '');
2367
2368 // attributes
2369 do {
2370 // Everything until the first equal sign should be the attribute name
2371 $name = $this->copy_until($this->token_equal);
2372
2373 if ($name === '' && $this->char !== null && $space[0] === '') {
2374 break;
2375 }
2376
2377 if ($guard === $this->pos) { // Escape infinite loop
2378 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2379 continue;
2380 }
2381
2382 $guard = $this->pos;
2383
2384 // handle endless '<'
2385 // Out of bounds before the tag ended
2386 if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2387 $node->nodetype = HDOM_TYPE_TEXT;
2388 $node->_[HDOM_INFO_END] = 0;
2389 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
2390 $node->tag = 'text';
2391 $this->link_nodes($node, false);
2392 return true;
2393 }
2394
2395 // handle mismatch '<'
2396 // Attributes cannot start after opening tag
2397 if ($this->doc[$this->pos - 1] == '<') {
2398 $node->nodetype = HDOM_TYPE_TEXT;
2399 $node->tag = 'text';
2400 $node->attr = array();
2401 $node->_[HDOM_INFO_END] = 0;
2402 $node->_[HDOM_INFO_TEXT] = substr(
2403 $this->doc,
2404 $begin_tag_pos,
2405 $this->pos - $begin_tag_pos - 1
2406 );
2407 $this->pos -= 2;
2408 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2409 $this->link_nodes($node, false);
2410 return true;
2411 }
2412
2413 if ($name !== '/' && $name !== '') { // this is a attribute name
2414 // [1] Whitespace after attribute name
2415 $space[1] = $this->copy_skip($this->token_blank);
2416
2417 $name = $this->restore_noise($name); // might be a noisy name
2418
2419 if ($this->lowercase) { $name = strtolower($name); }
2420
2421 if ($this->char === '=') { // attribute with value
2422 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2423 $this->parse_attr($node, $name, $space); // get attribute value
2424 } else {
2425 //no value attr: nowrap, checked selected...
2426 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2427 $node->attr[$name] = true;
2428 if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2429 }
2430
2431 $node->_[HDOM_INFO_SPACE][] = $space;
2432
2433 // prepare for next attribute
2434 $space = array(
2435 $this->copy_skip($this->token_blank),
2436 '',
2437 ''
2438 );
2439 } else { // no more attributes
2440 break;
2441 }
2442 } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2443
2444 $this->link_nodes($node, true);
2445 $node->_[HDOM_INFO_ENDSPACE] = $space[0];
2446
2447 // handle empty tags (i.e. "<div/>")
2448 if ($this->copy_until_char('>') === '/') {
2449 $node->_[HDOM_INFO_ENDSPACE] .= '/';
2450 $node->_[HDOM_INFO_END] = 0;
2451 } else {
2452 // reset parent
2453 if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2454 $this->parent = $node;
2455 }
2456 }
2457
2458 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2459
2460 // If it's a BR tag, we need to set it's text to the default text.
2461 // This way when we see it in plaintext, we can generate formatting that the user wants.
2462 // since a br tag never has sub nodes, this works well.
2463 if ($node->tag === 'br') {
2464 $node->_[HDOM_INFO_INNER] = $this->default_br_text;
2465 }
2466
2467 return true;
2468 }
2469
/**
 * Parse the value of an attribute from the current document position
 *
 * The parser is expected to be positioned right behind the "=" of the
 * attribute. The value is always consumed from the document so that parsing
 * resumes behind it. It is stored on the node only when the node does not
 * carry an attribute of that name yet (the first definition wins).
 *
 * @param object $node Node for the attributes
 * @param string $name Name of the current attribute
 * @param array $space Array for spacing information. Index 2 receives the
 * whitespace between "=" and the value (left untouched for duplicates).
 * @return void
 */
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention
    // to the first one as opposed to the last one.
    // https://stackoverflow.com/a/26341866
    $is_duplicate = isset($node->attr[$name]);

    // [2] Whitespace between "=" and the value
    $blank = $this->copy_skip($this->token_blank);

    // The value has to be read even for a duplicate. Returning before this
    // point would leave the value in the document, where the attribute loop
    // of the caller would pick it up as (bogus) attribute names.
    switch ($this->char) {
        case '"': // value is anything between double quotes
            $quote_type = HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('"');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        case '\'': // value is anything between single quotes
            $quote_type = HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('\'');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        default: // value is anything until the first space or end tag
            $quote_type = HDOM_QUOTE_NO;
            $value = $this->copy_until($this->token_attr);
    }

    if ($is_duplicate) {
        // Value consumed, nothing gets recorded on the node.
        return;
    }

    $space[2] = $blank;
    $node->_[HDOM_INFO_QUOTE][] = $quote_type;

    $value = $this->restore_noise($value);

    // PaperG: Attributes should not have \r or \n in them, that counts as
    // html whitespace.
    $value = str_replace(array("\r", "\n"), '', $value);

    // PaperG: If this is a "class" selector, lets get rid of the preceeding
    // and trailing space since some people leave it in the multi class case.
    if ($name === 'class') {
        $value = trim($value);
    }

    $node->attr[$name] = $value;
}
2518
/**
 * Attach a node to the node that is currently the parser's parent
 *
 * The node gets the current parent assigned and is appended to the parent's
 * list of nodes. It is appended to the parent's list of children as well if
 * requested.
 *
 * @param object $node Node to attach
 * @param bool $is_child True to also register the node in the parent's
 * children list
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $owner->children[] = $node;
}
2535
/**
 * Record a closing tag as plain text below the current parent
 *
 * A new node holding the text "</$tag>" is linked to the current parent
 * (nodes list only, not the children list) and the parser moves on to the
 * next character of the document.
 *
 * @param string $tag Tag name
 * @return bool Always true
 */
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;

    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // next
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
2551
/**
 * Move the document position past a run of allowed characters
 *
 * Starting at the current position, every leading character contained in
 * $chars is skipped. The current character is updated to the one at the new
 * position, or null if the new position is not below the document size.
 *
 * @param string $chars A string containing every allowed character.
 * @return void
 */
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    // next
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
2565
/**
 * Consume a run of allowed characters and return it
 *
 * Works like skip(), but hands back the characters that were passed over.
 *
 * @param string $chars A string containing every allowed character.
 * @return string The consumed characters, or an empty string if the
 * character at the current position is not an allowed one.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    $this->pos = $start + $run;

    // next
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
2583
/**
 * Consume everything up to the first stop character and return it
 *
 * The document position ends up on the first character that is contained in
 * $chars (or at the end of the document if there is none). The stop
 * character itself is not consumed.
 *
 * @param string $chars A string containing every character to stop at.
 * @return string Substring from the previous document position up to, but
 * not including, the stop character.
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $span = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $span;

    // next
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $span);
}
2600
/**
 * Consume everything up to the first occurrence of a string and return it
 *
 * Three outcomes are possible:
 * - The string is not found: the rest of the document is returned, the
 *   position moves to the document size and the current character is null.
 * - The string starts at the current position: nothing is consumed and an
 *   empty string is returned.
 * - Otherwise the position moves onto the first character of the match and
 *   the text in front of it is returned.
 *
 * Nothing happens (empty string returned) if the current character is null.
 *
 * @param string $char The string to stop at.
 * @return string Substring from the previous document position up to, but
 * not including, the first occurrence of the provided string.
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $start = $this->pos;
    $hit = strpos($this->doc, $char, $start);

    if ($hit === false) {
        $rest = substr($this->doc, $start, $this->size - $start);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $start) {
        return '';
    }

    $this->pos = $hit;
    $this->char = $this->doc[$hit];

    return substr($this->doc, $start, $hit - $start);
}
2627
/**
 * Replace noise in the HTML content with placeholder keys
 *
 * Every match of the pattern is cut out of the document and kept in
 * {@see simple_html_dom::$noise} under a key of the form "___noise___"
 * followed by a number padded to five characters (1000 plus the number of
 * noise entries stored so far). The key takes the place of the removed text.
 * Afterwards the document size and the current character are refreshed.
 *
 * @param string $pattern The regex pattern used for finding noise
 * @param bool $remove_tag True to remove the entire match. Default is false
 * to only remove the captured data.
 * @return void
 */
protected function remove_noise($pattern, $remove_tag = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $found = preg_match_all(
        $pattern,
        $this->doc,
        $matches,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // 0 = entire match, 1 = submatch
    $group = $remove_tag ? 0 : 1;

    // Walk the matches from last to first: the offsets were captured on the
    // unmodified document, replacing from the back keeps the ones in front
    // valid.
    $i = $found - 1;
    while ($i >= 0) {
        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'key is: ' . $key);
        }

        $captured = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $this->noise[$key] = $captured;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($captured));

        --$i;
    }

    // reset the length of content
    $this->size = strlen($this->doc);

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
2668
/**
 * Restore noise to HTML content
 *
 * Noise is restored from {@see simple_html_dom::$noise}. A noise key is the
 * marker "___noise___" followed by five more characters.
 *
 * - Known keys are replaced by the stored noise. The restored text is
 *   scanned again, so keys nested inside of noise are restored as well.
 * - Unknown keys are prefixed with "UNDEFINED NOISE FOR KEY: ".
 * - A marker with less than five characters behind it is replaced by
 *   "NO NUMERIC NOISE KEY".
 *
 * @param string $text A subset of HTML containing noise
 * @return string The same content with noise restored
 */
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $marker = '___noise___';
    $marker_len = 11; // length of the marker
    $key_len = 16; // marker plus five characters
    $undefined = 'UNDEFINED NOISE FOR KEY: ';

    // Position to continue searching from. Searching from the start on
    // every pass made unknown keys loop forever: their placeholder contains
    // the key (and therefore the marker) again, so the very same marker was
    // found over and over while the text kept growing.
    $offset = 0;

    while (($pos = strpos($text, $marker, $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the
        // pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos + 15) {
            $key = substr($text, $pos, $key_len);

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'located key of: ' . $key);
            }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos)
                . $this->noise[$key]
                . substr($text, $pos + $key_len);

                // Scan the restored noise too, it may contain keys itself.
                // NOTE(review): a noise entry containing its own key would
                // still recurse endlessly; not handled here.
                $offset = $pos;
            } else {
                $text = substr($text, 0, $pos)
                . $undefined
                . $key
                . substr($text, $pos + $key_len);

                // Continue one character behind the start of the marker
                // that is now part of the placeholder. This skips exactly
                // that marker without missing one that overlaps its tail.
                $offset = $pos + strlen($undefined) + 1;
            }
        } else {
            // There is no valid key being given back to us... We must get
            // rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos)
            . 'NO NUMERIC NOISE KEY'
            . substr($text, $pos + $marker_len);

            $offset = $pos;
        }
    }

    return $text;
}
2722
/**
 * Find a stored noise element by its content
 *
 * Sometimes we NEED one of the noise elements.
 *
 * @param string $text Text the noise element has to contain
 * @return string|null The first entry of {@see simple_html_dom::$noise}
 * containing the text, null if there is none
 */
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $match = null;

    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }

        $match = $candidate;
        break;
    }

    return $match;
}
2735
/**
 * String representation of the document
 *
 * @return string The inner text of the root node
 */
function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
2740
/**
 * Provide read access to the virtual properties of the document
 *
 * - outertext, innertext: inner text of the root node
 * - plaintext: text of the root node
 * - charset: value of $_charset
 * - target_charset: value of $_target_charset
 *
 * @param string $name Name of the requested property
 * @return mixed Value of the property, null for any other name
 */
function __get($name)
{
    $value = null;

    switch ($name) {
        case 'outertext': // the document has no outer text of its own
        case 'innertext':
            $value = $this->root->innertext();
            break;
        case 'plaintext':
            $value = $this->root->text();
            break;
        case 'charset':
            $value = $this->_charset;
            break;
        case 'target_charset':
            $value = $this->_target_charset;
            break;
    }

    return $value;
}
2756
2757 // camel naming conventions
/**
 * Camel-case access to the child nodes of the root node
 *
 * @param int $idx Index handed over to childNodes() of the root node
 * @return mixed Whatever childNodes() of the root node returns
 */
function childNodes($idx = -1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
2762
/**
 * Camel-case access to the first child of the root node
 *
 * @return mixed Whatever first_child() of the root node returns
 */
function firstChild()
{
    $root = $this->root;

    return $root->first_child();
}
2767
/**
 * Camel-case access to the last child of the root node
 *
 * @return mixed Whatever last_child() of the root node returns
 */
function lastChild()
{
    $root = $this->root;

    return $root->last_child();
}
2772
/**
 * Create an element by parsing generated markup
 *
 * The markup "<name>value</name>" is handed to str_get_html() and the first
 * child of the result is returned. Errors raised along the way are
 * suppressed.
 *
 * NOTE(review): if str_get_html() does not return an object the method call
 * on its result fails - confirm whether that can happen for this input.
 *
 * @param string $name Tag name of the element
 * @param string|null $value Content placed between the opening and the
 * closing tag
 * @return mixed Whatever first_child() of the parsed markup returns
 */
function createElement($name, $value = null)
{
    // The whole expression stays below the "@" operator on purpose.
    return @str_get_html('<' . $name . '>' . $value . '</' . $name . '>')->first_child();
}
2777
/**
 * Create a text node by parsing the provided text
 *
 * The text is handed to str_get_html() and the last entry of the node list
 * of the result is returned. Errors raised along the way are suppressed.
 *
 * @param string $value Content of the text node
 * @return mixed Last entry of the parsed document's node list
 */
function createTextNode($value)
{
    // Kept as a single expression: the "@" operator has to cover both the
    // parsing and the call to end().
    return @end(str_get_html($value)->nodes);
}
2782
/**
 * Find the first element matching the id selector "#$id"
 *
 * @param string $id Id to look for
 * @return mixed Result of find() for index 0
 */
function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
2787
/**
 * Find elements matching the id selector "#$id"
 *
 * @param string $id Id to look for
 * @param int|null $idx Index handed over to find()
 * @return mixed Result of find()
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
2792
/**
 * Find the first element matching a tag name
 *
 * The name is used as selector for find() without any modification.
 *
 * @param string $name Tag name to look for
 * @return mixed Result of find() for index 0
 */
function getElementByTagName($name)
{
    $first = $this->find($name, 0);

    return $first;
}
2797
/**
 * Find elements matching a tag name
 *
 * The name is used as selector for find() without any modification.
 *
 * NOTE(review): the default index is -1 here while getElementsById()
 * defaults to null - confirm how find() treats a negative index.
 *
 * @param string $name Tag name to look for
 * @param int $idx Index handed over to find()
 * @return mixed Result of find()
 */
function getElementsByTagName($name, $idx = -1)
{
    $found = $this->find($name, $idx);

    return $found;
}
2802
/**
 * Camel-case alias of load_file()
 *
 * Accepts any number of arguments and passes them on one by one. Handing
 * over the collected arguments as a single array made load_file() receive
 * an array as its first argument instead of the values the caller provided.
 *
 * NOTE(review): assumes load_file() takes the same positional arguments as
 * this alias, like the other camel-case aliases of this class - confirm.
 *
 * @return void
 */
function loadFile()
{
    $args = func_get_args();

    // Spread the arguments instead of passing the array itself.
    call_user_func_array(array($this, 'load_file'), $args);
}
2808}