QtBBWakb

· 6 years ago · Nov 05, 2019, 11:56 PM
<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 * Contributions by:
 *	 Yousuke Kumakura (Attribute filters)
 *	 Vadim Voituk (Negative indexes supports of "find" method)
 *	 Antcs (Constructor with automatically load contents either text or file/url)
 *
 * all affected sections have comments starting with "PaperG"
 *
 * Paperg - Added case insensitive testing of the value of the selector.
 *
 * Paperg - Added tag_start for the starting index of tags - NOTE: This works
 * but not accurately. This tag_start gets counted AFTER \r\n have been crushed
 * out, and after the remove_noise calls so it will not reflect the REAL
 * position of the tag in the source, it will almost always be smaller by some
 * amount. We use this to determine how far into the file the tag in question
 * is. This "percentage" will never be accurate as the $dom->size is the "real"
 * number of bytes the dom was created from. But for most purposes, it's a
 * really good estimation.
 *
 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
 * closed is great for malformed html, but it CAN lead to parsing errors.
 *
 * Allow the user to tell us how much they trust the html.
 *
 * Paperg - added the text and plaintext to the selectors for the find syntax.
 * plaintext implies text in the innertext of a node.  text implies that the
 * tag is a text node. This allows for us to find tags based on the text they
 * contain.
 *
 * Create find_ancestor_tag to see if a tag is - at any level - inside of
 * another specific tag.
 *
 * Paperg: added parse_charset so that we know about the character set of
 * the source document. NOTE: If the user's system has a routine called
 * get_last_retrieve_url_contents_content_type available, we will assume it's
 * returning the content-type header from the last transfer or curl_exec, and
 * we will parse that and use it in preference to any other method of charset
 * detection.
 *
 * Found infinite loop in the case of broken html in restore_noise. Rewrote to
 * protect from that.
 *
 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
 *
 * Licensed under The MIT License
 * Redistributions of files must retain the above copyright notice.
 *
 * @author S.C. Chen <me578022@gmail.com>
 * @author John Schlick
 * @author Rus Carroll
 * @version Rev. 1.8.1 (247)
 * @package PlaceLocalInclude
 * @subpackage simple_html_dom
 */

/**
 * All of the Defines for the classes below.
 * @author S.C. Chen <me578022@gmail.com>
 */

// Node types (stored in simple_html_dom_node::$nodetype)
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quote styles (read back by makeup() when rendering attributes)
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node "info" array simple_html_dom_node::$_
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// The constants below may be pre-defined by the including script to override
// the defaults.

/** The default target charset */
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');

/** The default <br> text used instead of <br> tags when returning text */
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");

/** The default <span> text used instead of <span> tags when returning text */
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');

/** The maximum file size the parser should load */
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);

/** Contents between curly braces "{" and "}" are interpreted as text */
define('HDOM_SMARTY_AS_TEXT', 1);

// helper functions
// -----------------------------------------------------------------------------

/**
 * Get html dom from file
 *
 * Reads $url with file_get_contents() and parses the result into a new
 * simple_html_dom object.
 *
 * @param string $url File name or URL to read
 * @param bool $use_include_path Passed through to file_get_contents()
 * @param resource|null $context Stream context passed to file_get_contents()
 * @param int $offset Offset passed to file_get_contents()
 * @param int $maxLen Maximum number of bytes to read. Values <= 0 (including
 * the default -1, i.e. PHP_STREAM_COPY_ALL) are replaced by MAX_FILE_SIZE.
 * @param bool $lowercase Forces the selectors to all be lowercase
 * @param bool $forceTagsClosed Passed to the simple_html_dom constructor
 * @param string $target_charset Passed to the simple_html_dom constructor
 * @param bool $stripRN Passed to the constructor and to load()
 * @param string $defaultBRText Passed to the simple_html_dom constructor
 * @param string $defaultSpanText Passed to the simple_html_dom constructor
 * @return object|false The loaded DOM object, or false if the contents could
 * not be read, are empty or exceed $maxLen.
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	// Ensure maximum length is greater than zero
	if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

	// We DO force the tags to be terminated.
	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText);

	$contents = file_get_contents(
		$url,
		$use_include_path,
		$context,
		$offset,
		$maxLen);

	// Paperg - use our own mechanism for getting the contents as we want to
	// control the timeout.
	// $contents = retrieve_url_contents($url);
	if (empty($contents) || strlen($contents) > $maxLen) {
		// Release the unused DOM object, same as str_get_html() does.
		$dom->clear();
		return false;
	}

	// The second parameter can force the selectors to all be lowercase.
	$dom->load($contents, $lowercase, $stripRN);
	return $dom;
}

/**
 * Get html dom from string
 *
 * @param string $str The HTML source to parse
 * @param bool $lowercase Passed to the constructor and to load()
 * @param bool $forceTagsClosed Passed to the simple_html_dom constructor
 * @param string $target_charset Passed to the simple_html_dom constructor
 * @param bool $stripRN Passed to the constructor and to load()
 * @param string $defaultBRText Passed to the simple_html_dom constructor
 * @param string $defaultSpanText Passed to the simple_html_dom constructor
 * @return object|false The loaded DOM object, or false if $str is empty or
 * longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText);

	// Reject empty or oversized input and release the unused DOM object.
	if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
		$dom->clear();
		return false;
	}

	$dom->load($str, $lowercase, $stripRN);
	return $dom;
}

/**
 * Dump html dom tree
 *
 * Thin wrapper around the node's own dump() method.
 *
 * @param object $node The node whose tree is dumped
 * @param bool $show_attr Forwarded to $node->dump()
 * @param int $deep Forwarded to $node->dump()
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// Forward the caller's options; previously the node itself was passed as
	// $show_attr and $deep was dropped.
	$node->dump($show_attr, $deep);
}

/**
 * simple html dom node
 * PaperG - added ability for "find" routine to lowercase the value of the
 * selector.
 *
 * PaperG - added $tag_start to track the start position of the tag in the total
 * byte index
 *
 * @package PlaceLocalInclude
 */
class simple_html_dom_node
{
/**
 * Node type
 *
 * Default is {@see HDOM_TYPE_TEXT}
 *
 * @var int
 */
public $nodetype = HDOM_TYPE_TEXT;

/**
 * Tag name
 *
 * Default is 'text'
 *
 * @var string
 */
public $tag = 'text';

/**
 * List of attributes
 *
 * @var array
 */
public $attr = array();

/**
 * List of child node objects
 *
 * NOTE(review): presumably element children only; seek() treats nodes that
 * are missing from this list as text nodes - confirm against the parser.
 *
 * @var array
 */
public $children = array();

/**
 * List of nodes rendered by innertext()/outertext()/text()
 *
 * @var array
 */
public $nodes = array();

/**
 * The parent node object
 *
 * @var object|null
 */
public $parent = null;

/**
 * The "info" array - see HDOM_INFO_... for what each element contains.
 *
 * @var array
 */
public $_ = array();

/**
 * Start position of the tag in the document
 *
 * @var int
 */
public $tag_start = 0;

/**
 * The DOM object
 *
 * @var object|null
 */
private $dom = null;

/**
 * Construct new node object
 *
 * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes}
 *
 * @param object $dom The DOM object this node is registered with
 */
function __construct($dom)
{
	$this->dom = $dom;
	$dom->nodes[] = $this;
}

/**
 * Drop all object references held by this node on destruction.
 */
function __destruct()
{
	$this->clear();
}

/**
 * String conversion renders the node's outer text.
 *
 * @return string
 */
function __toString()
{
	return $this->outertext();
}

/**
 * Clean up memory due to php5 circular references memory leak...
 *
 * Note: $nodes and $children are set to null (not to an empty array), so
 * code reading them after clear() must cope with null.
 *
 * @return void
 */
function clear()
{
	$this->dom = null;
	$this->nodes = null;
	$this->parent = null;
	$this->children = null;
}

/**
 * Dump node's tree
 *
 * Echoes one line per node (tag name, optionally followed by its
 * attributes), then recurses into $this->nodes with $deep increased by one.
 *
 * @param bool $show_attr Also print the attributes of each node
 * @param int $deep Current nesting depth (number of lead characters printed)
 * @return void
 */
function dump($show_attr = true, $deep = 0)
{
	$lead = str_repeat('	', $deep);

	echo $lead . $this->tag;

	if ($show_attr && count($this->attr) > 0) {
		echo '(';
		foreach ($this->attr as $k => $v) {
			// The value is read through the property, not from $v
			echo "[$k]=>\"" . $this->$k . '", ';
		}
		echo ')';
	}

	echo "\n";

	if ($this->nodes) {
		foreach ($this->nodes as $c) {
			$c->dump($show_attr, $deep + 1);
		}
	}
}

/**
 * Debugging function to dump a single dom node with a bunch of information
 * about it.
 *
 * @param bool $echo True to echo the description, false to return it
 * @return string|void The description if $echo is false
 */
function dump_node($echo = true)
{
	$string = $this->tag;

	if (count($this->attr) > 0) {
		$string .= '(';
		foreach ($this->attr as $k => $v) {
			$string .= "[$k]=>\"" . $this->$k . '", ';
		}
		$string .= ')';
	}

	if (count($this->_) > 0) {
		$string .= ' $_ (';
		foreach ($this->_ as $k => $v) {
			if (is_array($v)) {
				$string .= "[$k]=>(";
				foreach ($v as $k2 => $v2) {
					$string .= "[$k2]=>\"" . $v2 . '", ';
				}
				$string .= ')';
			} else {
				$string .= "[$k]=>\"" . $v . '", ';
			}
		}
		$string .= ')';
	}

	if (isset($this->text)) {
		$string .= ' text: (' . $this->text . ')';
	}

	$string .= " HDOM_INNER_INFO: '";

	// Read the info from this node. The undefined variable $node was used
	// here before, so the NULL branch was always taken.
	if (isset($this->_[HDOM_INFO_INNER])) {
		$string .= $this->_[HDOM_INFO_INNER] . "'";
	} else {
		$string .= ' NULL ';
	}

	$string .= ' children: ' . count($this->children);
	$string .= ' nodes: ' . count($this->nodes);
	$string .= ' tag_start: ' . $this->tag_start;
	$string .= "\n";

	if ($echo) {
		echo $string;
		return;
	} else {
		return $string;
	}
}

/**
 * Return or set parent node
 *
 * @param object|null $parent (optional) The parent node, `null` to return
 * the current parent node.
 * @return object|null The parent node
 */
function parent($parent = null)
{
	// I am SURE that this doesn't work properly.
	// It fails to unset the current node from its current parent's nodes or
	// children list first.
	if ($parent !== null) {
		$this->parent = $parent;
		$this->parent->nodes[] = $this;
		$this->parent->children[] = $this;
	}

	return $this->parent;
}

/**
 * @return bool True if the node has at least one child node
 */
function has_child()
{
	return !empty($this->children);
}

/**
 * Get child node at specified index
 *
 * @param int $idx The index of the child node to return, `-1` to return all
 * child nodes.
 * @return object|array|null The child node at the specified index, all child
 * nodes or null if the index is invalid.
 */
function children($idx = -1)
{
	if ($idx === -1) {
		return $this->children;
	}

	if (isset($this->children[$idx])) {
		return $this->children[$idx];
	}

	return null;
}

/**
 * Get first child node
 *
 * @return object|null The first child node or null if the current node has
 * no child nodes.
 */
function first_child()
{
	// empty() also covers $children === null (see clear()), where count()
	// would warn or throw.
	if (!empty($this->children)) {
		return $this->children[0];
	}
	return null;
}

/**
 * Get last child node
 *
 * @return object|null The last child node or null if the current node has
 * no child nodes.
 */
function last_child()
{
	// empty() also covers $children === null (see clear()), where count()
	// would warn or throw.
	if (!empty($this->children)) {
		return $this->children[count($this->children) - 1];
	}
	return null;
}

/**
 * Get next sibling node
 *
 * @return object|null The sibling node or null if the current node has no
 * sibling nodes.
 */
function next_sibling()
{
	if ($this->parent === null) {
		return null;
	}

	$idx = 0;
	$count = count($this->parent->children);

	// Locate this node in the parent's child list
	while ($idx < $count && $this !== $this->parent->children[$idx]) {
		++$idx;
	}

	// Also covers "not found": $idx equals $count in that case
	if (++$idx >= $count) {
		return null;
	}

	return $this->parent->children[$idx];
}

/**
 * Get previous sibling node
 *
 * @return object|null The sibling node or null if the current node has no
 * sibling nodes.
 */
function prev_sibling()
{
	if ($this->parent === null) { return null; }

	$idx = 0;
	$count = count($this->parent->children);

	// Locate this node in the parent's child list
	while ($idx < $count && $this !== $this->parent->children[$idx]) {
		++$idx;
	}

	// Not part of the parent's child list: there is no previous sibling.
	// Without this check the parent's last child was returned.
	if ($idx >= $count) { return null; }

	if (--$idx < 0) { return null; }

	return $this->parent->children[$idx];
}

/**
 * Traverse ancestors to the first matching tag.
 *
 * The search starts with the current node itself.
 *
 * @param string $tag Tag to find
 * @return object|null First matching node in the DOM tree or null if no
 * match was found.
 *
 * @todo Null is returned implicitly by calling ->parent on the root node.
 * This behaviour could change at any time, rendering this function invalid.
 */
function find_ancestor_tag($tag)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	// Start by including ourselves in the comparison.
	$returnDom = $this;

	while (!is_null($returnDom)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Current tag is: ' . $returnDom->tag);
		}

		if ($returnDom->tag == $tag) {
			break;
		}

		$returnDom = $returnDom->parent;
	}

	return $returnDom;
}

/**
 * Get node's inner text (everything inside the opening and closing tags)
 *
 * Order of precedence: explicit inner info, then the node's own text info,
 * then the concatenated outer text of all nodes in $this->nodes.
 *
 * @return string
 */
function innertext()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$ret = '';

	// $this->nodes is null after clear(); guard like outertext() does
	if ($this->nodes) {
		foreach ($this->nodes as $n) {
			$ret .= $n->outertext();
		}
	}

	return $ret;
}

/**
 * Get node's outer text (everything including the opening and closing tags)
 *
 * Invokes the DOM callback (if one is set) before rendering.
 *
 * @return string
 */
function outertext()
{
	global $debug_object;

	if (is_object($debug_object)) {
		$text = '';

		if ($this->tag === 'text') {
			if (!empty($this->text)) {
				$text = ' with text: ' . $this->text;
			}
		}

		$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
	}

	if ($this->tag === 'root') return $this->innertext();

	// trigger callback
	if ($this->dom && $this->dom->callback !== null) {
		call_user_func_array($this->dom->callback, array($this));
	}

	if (isset($this->_[HDOM_INFO_OUTER])) {
		return $this->_[HDOM_INFO_OUTER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	// render begin tag
	// isset() keeps the lookup from raising notices when the begin index or
	// the referenced node is missing; the result is '' in those cases as
	// before.
	if ($this->dom
		&& isset($this->_[HDOM_INFO_BEGIN])
		&& isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])) {
		$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
	} else {
		$ret = '';
	}

	// render inner text
	if (isset($this->_[HDOM_INFO_INNER])) {
		// If it's a br tag...  don't return the HDOM_INNER_INFO that we
		// may or may not have added.
		if ($this->tag !== 'br') {
			$ret .= $this->_[HDOM_INFO_INNER];
		}
	} else {
		if ($this->nodes) {
			foreach ($this->nodes as $n) {
				$ret .= $this->convert_text($n->outertext());
			}
		}
	}

	// render end tag
	if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
		$ret .= '</' . $this->tag . '>';
	}

	return $ret;
}

/**
 * Get node's plain text (everything excluding all tags)
 *
 * Comments, unknown nodes, script and style elements yield an empty string.
 *
 * @return string
 */
function text()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	switch ($this->nodetype) {
		case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		case HDOM_TYPE_COMMENT: return '';
		case HDOM_TYPE_UNKNOWN: return '';
	}

	if (strcasecmp($this->tag, 'script') === 0) { return ''; }
	if (strcasecmp($this->tag, 'style') === 0) { return ''; }

	$ret = '';

	// $this->nodes is null once clear() has run on this node, so it has to
	// be checked before iterating.
	if (!is_null($this->nodes)) {
		foreach ($this->nodes as $n) {
			// Start paragraph after a blank line
			if ($n->tag === 'p') {
				$ret .= "\n\n";
			}

			$ret .= $this->convert_text($n->text());

			// If this node is a span... add a space at the end of it so
			// multiple spans don't run into each other.  This is plaintext
			// after all.
			if ($n->tag === 'span') {
				$ret .= $this->dom->default_span_text;
			}
		}
	}
	return trim($ret);
}

/**
 * Get node's xml text (inner text with the CDATA markers removed)
 *
 * The opening marker is removed case-insensitively.
 *
 * @return string
 */
function xmltext()
{
	$ret = $this->innertext();
	$ret = str_ireplace('<![CDATA[', '', $ret);
	$ret = str_replace(']]>', '', $ret);
	return $ret;
}

/**
 * Build node's text with tag
 *
 * Renders the opening tag including its attributes, using the spacing and
 * quote style recorded in the info array.
 *
 * @return string
 */
function makeup()
{
	// text, comment, unknown
	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$ret = '<' . $this->tag;

	// Position of the attribute; indexes the SPACE and QUOTE info arrays.
	// It advances for skipped attributes as well.
	$i = -1;

	foreach ($this->attr as $key => $val) {
		++$i;

		// skip removed attribute
		if ($val === null || $val === false) { continue; }

		$ret .= $this->_[HDOM_INFO_SPACE][$i][0];

		//no value attr: nowrap, checked selected...
		if ($val === true) {
			$ret .= $key;
		} else {
			switch ($this->_[HDOM_INFO_QUOTE][$i])
			{
				case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
				case HDOM_QUOTE_SINGLE: $quote = '\''; break;
				default: $quote = '';
			}

			$ret .= $key
			. $this->_[HDOM_INFO_SPACE][$i][1]
			. '='
			. $this->_[HDOM_INFO_SPACE][$i][2]
			. $quote
			. $val
			. $quote;
		}
	}

	$ret = $this->dom->restore_noise($ret);
	return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}

/**
 * Find elements by CSS selector
 *
 * @param string $selector The CSS selector
 * @param int|null $idx Index of element to return form the list of matching
 * elements (default: `null` = disabled). Negative values count from the end
 * of the list.
 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
 * enabled (default: `false`)
 * @return array|object|null A list of elements matching the specified CSS
 * selector or a single element if $idx is specified or null if no element
 * was found.
 */
function find($selector, $idx = null, $lowercase = false)
{
	$selectors = $this->parse_selector($selector);
	if (($count = count($selectors)) === 0) { return array(); }
	$found_keys = array();

	// find each selector
	for ($c = 0; $c < $count; ++$c) {
		// The change on the below line was documented on the sourceforge
		// code tracker id 2788009
		// used to be: if (($levle=count($selectors[0]))===0) return array();
		if (($level = count($selectors[$c])) === 0) { return array(); }
		if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

		// Keys are indexes into $this->dom->nodes
		$head = array($this->_[HDOM_INFO_BEGIN] => 1);
		$cmd = ' '; // Combinator

		// handle descendant selectors, no recursive!
		for ($l = 0; $l < $level; ++$l) {
			$ret = array();

			foreach ($head as $k => $v) {
				$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
				//PaperG - Pass this optional parameter on to the seek function.
				$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
			}

			$head = $ret;
			$cmd = $selectors[$c][$l][4]; // Next Combinator
		}

		foreach ($head as $k => $v) {
			if (!isset($found_keys[$k])) {
				$found_keys[$k] = 1;
			}
		}
	}

	// sort keys
	ksort($found_keys);

	$found = array();
	foreach ($found_keys as $k => $v) {
		$found[] = $this->dom->nodes[$k];
	}

	// return nth-element or array
	if (is_null($idx)) { return $found; }
	elseif ($idx < 0) { $idx = count($found) + $idx; }
	return (isset($found[$idx])) ? $found[$idx] : null;
}

/**
 * Seek DOM elements by selector
 *
 * **Note**
 * The selector element must be compatible to a selector from
 * {@see simple_html_dom_node::parse_selector()}
 *
 * @param array $selector A selector element
 * @param array $ret An array of matches (passed by reference). The index of
 * every matching node within the DOM node list is added as key.
 * @param string $parent_cmd The combinator that relates this node to the
 * nodes being tested (' ' | '>' | '+' | '~')
 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
 * enabled (default: `false`)
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	list($tag, $id, $class, $attributes, $cmb) = $selector;
	$nodes = array();

	if ($parent_cmd === ' ') { // Descendant Combinator
		// Find parent closing tag if the current element doesn't have a closing
		// tag (i.e. void element)
		$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
		if ($end == 0) {
			$parent = $this->parent;
			while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
				$end -= 1;
				$parent = $parent->parent;
			}
			// No ancestor with an end index: leave $end as it is instead
			// of dereferencing null.
			if ($parent !== null) {
				$end += $parent->_[HDOM_INFO_END];
			}
		}

		// Get list of target nodes
		$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
		$nodes_count = $end - $nodes_start;
		$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
	} elseif ($parent_cmd === '>') { // Child Combinator
		$nodes = $this->children;
	} elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
		// Identity (strict) search. A loose comparison of node objects
		// recurses through their circular parent/child references.
		$index = array_search($this, $this->parent->children, true);

		if ($index !== false) {
			if ($parent_cmd === '+') { // Next-Sibling Combinator
				// The last child has no next sibling
				if (isset($this->parent->children[$index + 1])) {
					$nodes[] = $this->parent->children[$index + 1];
				}
			} else { // Subsequent Sibling Combinator
				// NOTE(review): the slice starts at this node itself, so
				// the node can match its own '~' selector - confirm
				// whether that is intended.
				$nodes = array_slice($this->parent->children, $index);
			}
		}
	}

	// Go through each element starting at this element until the end tag
	// Note: If this element is a void tag, any previous void element is
	// skipped.
	foreach($nodes as $node) {
		$pass = true;

		// Skip root nodes
		if(!$node->parent) {
			$pass = false;
		}

		// Skip if node isn't a child node (i.e. text nodes)
		if($pass && !in_array($node, $node->parent->children, true)) {
			$pass = false;
		}

		// Skip if tag doesn't match
		if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
			$pass = false;
		}

		// Skip if ID doesn't exist
		if ($pass && $id !== '' && !isset($node->attr['id'])) {
			$pass = false;
		}

		// Check if ID matches
		if ($pass && $id !== '' && isset($node->attr['id'])) {
			// Note: Only consider the first ID (as browsers do)
			$node_id = explode(' ', trim($node->attr['id']))[0];

			if($id !== $node_id) { $pass = false; }
		}

		// Check if all class(es) exist
		if ($pass && $class !== '' && is_array($class) && !empty($class)) {
			if (isset($node->attr['class'])) {
				$node_classes = explode(' ', $node->attr['class']);

				if ($lowercase) {
					$node_classes = array_map('strtolower', $node_classes);
				}

				foreach($class as $c) {
					// Strict: class names are strings and must not be
					// compared numerically (e.g. "1e1" vs. "10")
					if(!in_array($c, $node_classes, true)) {
						$pass = false;
						break;
					}
				}
			} else {
				$pass = false;
			}
		}

		// Check attributes
		if ($pass
			&& $attributes !== ''
			&& is_array($attributes)
			&& !empty($attributes)) {
				foreach($attributes as $a) {
					list (
						$att_name,
						$att_expr,
						$att_val,
						$att_inv,
						$att_case_sensitivity
					) = $a;

					// Handle indexing attributes (i.e. "[2]")
					/**
					 * Note: This is not supported by the CSS Standard but adds
					 * the ability to select items compatible to XPath (i.e.
					 * the 3rd element within it's parent).
					 *
					 * Note: This doesn't conflict with the CSS Standard which
					 * doesn't work on numeric attributes anyway.
					 */
					if (is_numeric($att_name)
						&& $att_expr === ''
						&& $att_val === '') {
							$count = 0;

							// Find index of current element in parent
							foreach ($node->parent->children as $c) {
								if ($c->tag === $node->tag) ++$count;
								if ($c === $node) break;
							}

							// If this is the correct node, continue with next
							// attribute
							if ($count === (int)$att_name) continue;
					}

					// Check attribute availability
					if ($att_inv) { // Attribute should NOT be set
						if (isset($node->attr[$att_name])) {
							$pass = false;
							break;
						}
					} else { // Attribute should be set
						// todo: "plaintext" is not a valid CSS selector!
						if ($att_name !== 'plaintext'
							&& !isset($node->attr[$att_name])) {
								$pass = false;
								break;
						}
					}

					// Continue with next attribute if expression isn't defined
					if ($att_expr === '') continue;

					// If they have told us that this is a "plaintext"
					// search then we want the plaintext of the node - right?
					// todo "plaintext" is not a valid CSS selector!
					if ($att_name === 'plaintext') {
						$nodeKeyValue = $node->text();
					} else {
						$nodeKeyValue = $node->attr[$att_name];
					}

					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'testing node: '
							. $node->tag
							. ' for attribute: '
							. $att_name
							. $att_expr
							. $att_val
							. ' where nodes value is: '
							. $nodeKeyValue
						);
					}

					// If lowercase is set, do a case insensitive test of
					// the value of the selector.
					if ($lowercase) {
						$check = $this->match(
							$att_expr,
							strtolower($att_val),
							strtolower($nodeKeyValue),
							$att_case_sensitivity
						);
					} else {
						$check = $this->match(
							$att_expr,
							$att_val,
							$nodeKeyValue,
							$att_case_sensitivity
						);
					}

					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'after match: '
							. ($check ? 'true' : 'false')
						);
					}

					if (!$check) {
						$pass = false;
						break;
					}
				}
		}

		// Found a match. Add to list and clear node
		if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
		unset($node);
	}
	// It's passed by reference so this is actually what this function returns.
	if (is_object($debug_object)) {
		$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
	}
}

/**
 * Match value and pattern for a given CSS expression
 *
 * **Supported Expressions**
 *
 * | Expression | Description
 * | ---------- | -----------
 * | `=`        | $value and $pattern must be equal
 * | `!=`       | $value and $pattern must not be equal
 * | `^=`       | $value must start with $pattern
 * | `$=`       | $value must end with $pattern
 * | `*=`       | $value must contain $pattern
 * | `|=`       | $value must equal $pattern or start with $pattern + "-"
 * | `~=`       | $value must contain $pattern as a whitespace-separated word
 *
 * @param string $exp The expression.
 * @param string $pattern The pattern
 * @param string $value The value
 * @param string $case_sensitivity 'i' compares case insensitively; any
 * other value compares as is.
 * @return bool True if $value matches $pattern
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
	global $debug_object;
	if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

	if ($case_sensitivity === 'i') {
		$pattern = strtolower($pattern);
		$value = strtolower($value);
	}

	// preg_match() returns an int (or false); compare against 1 so that
	// this method returns a bool as documented.
	switch ($exp) {
		case '=':
			return ($value === $pattern);
		case '!=':
			return ($value !== $pattern);
		case '^=':
			return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
		case '$=':
			return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
		case '*=':
			return preg_match('/' . preg_quote($pattern, '/') . '/', $value) === 1;
		case '|=':
			/**
			 * [att|=val]
			 *
			 * Represents an element with the att attribute, its value
			 * either being exactly "val" or beginning with "val"
			 * immediately followed by "-" (U+002D).
			 */
			return $value === $pattern
				|| strpos($value, $pattern . '-') === 0;
		case '~=':
			/**
			 * [att~=val]
			 *
			 * Represents an element with the att attribute whose value is a
			 * whitespace-separated list of words, one of which is exactly
			 * "val". If "val" contains whitespace, it will never represent
			 * anything (since the words are separated by spaces). Also if
			 * "val" is the empty string, it will never represent anything.
			 */
			// Split on any whitespace run and drop empty words so that an
			// empty pattern can never match.
			$words = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);
			return in_array($pattern, $words, true);
	}
	return false;
}

1072	/**
1073	 * Parse CSS selector
1074	 *
1075	 * @param string $selector_string CSS selector string
1076	 * @return array List of CSS selectors. The format depends on the type of
1077	 * selector:
1078	 *
1079	 * ```php
1080	 *
1081	 * array( // list of selectors (each separated by a comma), i.e. 'img, p, div'
1082	 *   array( // list of combinator selectors, i.e. 'img > p > div'
1083	 *     array( // selector element
1084	 *       [0], // (string) The element tag
1085	 *       [1], // (string) The element id
1086	 *       [2], // (array<string>) The element classes
1087	 *       [3], // (array<array<string>>) The list of attributes, each
1088	 *            // with four elements: name, expression, value, inverted
1089	 *       [4]  // (string) The selector combinator (' ' | '>' | '+' | '~')
1090	 *     )
1091	 *   )
1092	 * )
1093	 * ```
1094	 *
1095	 * @link https://www.w3.org/TR/selectors/#compound Compound selector
1096	 */
	protected function parse_selector($selector_string)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		/**
		 * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
		 *
		 * Paperg: Add the colon to the attribute, so that it properly finds
		 * <tag attr:ibute="something" > like google does.
		 *
		 * Note: if you try to look at this attribute, you MUST use getAttribute
		 * since $dom->x:y will fail the php syntax check.
		 *
		 * Note the "\[@?" that starts the attribute group: an attribute
		 * specifier may begin with an optional "@" sign that is NOT captured
		 * by the expression. Further study is required to determine whether
		 * this should be documented or removed.
		 *
		 * Matches selectors in this order:
		 *
		 * [0] - full match
		 *
		 * [1] - tag name
		 *     ([\w:\*-]*)
		 *     Matches the tag name consisting of zero or more word characters,
		 *     colons, asterisks and hyphens.
		 *
		 * [2] - id name
		 *     (?:\#([\w-]+))?
		 *     Optionally matches an id name, consisting of a "#" followed by
		 *     the id name (one or more word characters and hyphens).
		 *
		 * [3] - class names (dots between chained classes are kept)
		 *     (?:|\.([\w\.-]+))?
		 *     Optionally matches a list of classes, consisting of a "."
		 *     followed by the class name (one or more word characters, dots
		 *     and hyphens) where multiple classes can be chained
		 *     (i.e. ".foo.bar.baz")
		 *
		 * [4] - attributes
		 *     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
		 *     Optionally matches the attributes list
		 *
		 * [5] - separator
		 *     ([\/, >+~]+)
		 *     Matches the selector list separator. This group is mandatory,
		 *     which is why a trailing ' ' is appended to the input below.
		 */
		// phpcs:ignore Generic.Files.LineLength
		$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

		preg_match_all(
			$pattern,
			trim($selector_string) . ' ', // Add final ' ' as pseudo separator
			$matches,
			PREG_SET_ORDER
		);

		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Matches Array: ', $matches);
		}

		// $selectors collects finished comma-separated selectors, $result
		// collects the compound selectors of the one currently being built.
		$selectors = array();
		$result = array();

		foreach ($matches as $m) {
			$m[0] = trim($m[0]);

			// Skip NoOps
			if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }

			// Convert the tag name to lowercase when the DOM was loaded that way
			if ($this->dom->lowercase) {
				$m[1] = strtolower($m[1]);
			}

			// Extract classes: $m[3] becomes an array when class names were
			// matched, otherwise it stays an empty string
			if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }

			/* Extract attributes (pattern based on the pattern above!)

			 * [0] - full match
			 * [1] - attribute name
			 * [2] - attribute expression
			 * [3] - attribute value
			 * [4] - case sensitivity
			 *
			 * Note: Attributes can be negated with a "!" prefix to their name
			 */
			if($m[4] !== '') {
				preg_match_all(
					"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is",
					trim($m[4]),
					$attributes,
					PREG_SET_ORDER
				);

				// Replace element by array
				$m[4] = array();

				foreach($attributes as $att) {
					// Skip empty matches
					if(trim($att[0]) === '') { continue; }

					// A leading "!" marks a negated attribute; it is stripped
					// from the stored name and recorded in the inverted flag.
					$inverted = (isset($att[1][0]) && $att[1][0] === '!');
					$m[4][] = array(
						$inverted ? substr($att[1], 1) : $att[1], // Name
						(isset($att[2])) ? $att[2] : '', // Expression
						(isset($att[3])) ? $att[3] : '', // Value
						$inverted, // Inverted Flag
						(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
					);
				}
			}

			// Sanitize Separator: a run of pure whitespace collapses to the
			// single-space descendant combinator, anything else is trimmed.
			if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
				$m[5] = ' ';
			} else { // Other Separator
				$m[5] = trim($m[5]);
			}

			// Clear Separator if it's a Selector List
			if ($is_list = ($m[5] === ',')) { $m[5] = ''; }

			// Remove full match before adding to results; afterwards the
			// indexes are 0 = tag, 1 = id, 2 = classes, 3 = attributes,
			// 4 = combinator.
			array_shift($m);
			$result[] = $m;

			if ($is_list) { // Selector List
				$selectors[] = $result;
				$result = array();
			}
		}

		// Flush the last selector (the input does not end with a comma)
		if (count($result) > 0) { $selectors[] = $result; }
		return $selectors;
	}
1235
1236	function __get($name)
1237	{
1238		if (isset($this->attr[$name])) {
1239			return $this->convert_text($this->attr[$name]);
1240		}
1241		switch ($name) {
1242			case 'outertext': return $this->outertext();
1243			case 'innertext': return $this->innertext();
1244			case 'plaintext': return $this->text();
1245			case 'xmltext': return $this->xmltext();
1246			default: return array_key_exists($name, $this->attr);
1247		}
1248	}
1249
1250	function __set($name, $value)
1251	{
1252		global $debug_object;
1253		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1254
1255		switch ($name) {
1256			case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
1257			case 'innertext':
1258				if (isset($this->_[HDOM_INFO_TEXT])) {
1259					return $this->_[HDOM_INFO_TEXT] = $value;
1260				}
1261				return $this->_[HDOM_INFO_INNER] = $value;
1262		}
1263
1264		if (!isset($this->attr[$name])) {
1265			$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
1266			$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
1267		}
1268
1269		$this->attr[$name] = $value;
1270	}
1271
1272	function __isset($name)
1273	{
1274		switch ($name) {
1275			case 'outertext': return true;
1276			case 'innertext': return true;
1277			case 'plaintext': return true;
1278		}
1279		//no value attr: nowrap, checked selected...
1280		return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
1281	}
1282
1283	function __unset($name)
1284	{
1285		if (isset($this->attr[$name])) { unset($this->attr[$name]); }
1286	}
1287
1288	// PaperG - Function to convert the text from one character set to another
1289	// if the two sets are not the same.
1290	function convert_text($text)
1291	{
1292		global $debug_object;
1293		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1294
1295		$converted_text = $text;
1296
1297		$sourceCharset = '';
1298		$targetCharset = '';
1299
1300		if ($this->dom) {
1301			$sourceCharset = strtoupper($this->dom->_charset);
1302			$targetCharset = strtoupper($this->dom->_target_charset);
1303		}
1304
1305		if (is_object($debug_object)) {
1306			$debug_object->debug_log(3,
1307				'source charset: '
1308				. $sourceCharset
1309				. ' target charaset: '
1310				. $targetCharset
1311			);
1312		}
1313
1314		if (!empty($sourceCharset)
1315			&& !empty($targetCharset)
1316			&& (strcasecmp($sourceCharset, $targetCharset) != 0)) {
1317			// Check if the reported encoding could have been incorrect and the text is actually already UTF-8
1318			if ((strcasecmp($targetCharset, 'UTF-8') == 0)
1319				&& ($this->is_utf8($text))) {
1320				$converted_text = $text;
1321			} else {
1322				$converted_text = iconv($sourceCharset, $targetCharset, $text);
1323			}
1324		}
1325
1326		// Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
1327		if ($targetCharset === 'UTF-8') {
1328			if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
1329				$converted_text = substr($converted_text, 3);
1330			}
1331
1332			if (substr($converted_text, -3) === "\xef\xbb\xbf") {
1333				$converted_text = substr($converted_text, 0, -3);
1334			}
1335		}
1336
1337		return $converted_text;
1338	}
1339
1340	/**
1341	* Returns true if $string is valid UTF-8 and false otherwise.
1342	*
1343	* @param mixed $str String to be tested
1344	* @return boolean
1345	*/
1346	static function is_utf8($str)
1347	{
1348		$c = 0; $b = 0;
1349		$bits = 0;
1350		$len = strlen($str);
1351		for($i = 0; $i < $len; $i++) {
1352			$c = ord($str[$i]);
1353			if($c > 128) {
1354				if(($c >= 254)) { return false; }
1355				elseif($c >= 252) { $bits = 6; }
1356				elseif($c >= 248) { $bits = 5; }
1357				elseif($c >= 240) { $bits = 4; }
1358				elseif($c >= 224) { $bits = 3; }
1359				elseif($c >= 192) { $bits = 2; }
1360				else { return false; }
1361				if(($i + $bits) > $len) { return false; }
1362				while($bits > 1) {
1363					$i++;
1364					$b = ord($str[$i]);
1365					if($b < 128 || $b > 191) { return false; }
1366					$bits--;
1367				}
1368			}
1369		}
1370		return true;
1371	}
1372
1373	/**
1374	 * Function to try a few tricks to determine the displayed size of an img on
1375	 * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
1376	 * other tag types.
1377	 *
1378	 * @author John Schlick
1379	 * @version April 19 2012
1380	 * @return array an array containing the 'height' and 'width' of the image
1381	 * on the page or -1 if we can't figure it out.
1382	 */
1383	function get_display_size()
1384	{
1385		global $debug_object;
1386
1387		$width = -1;
1388		$height = -1;
1389
1390		if ($this->tag !== 'img') {
1391			return false;
1392		}
1393
1394		// See if there is aheight or width attribute in the tag itself.
1395		if (isset($this->attr['width'])) {
1396			$width = $this->attr['width'];
1397		}
1398
1399		if (isset($this->attr['height'])) {
1400			$height = $this->attr['height'];
1401		}
1402
1403		// Now look for an inline style.
1404		if (isset($this->attr['style'])) {
1405			// Thanks to user gnarf from stackoverflow for this regular expression.
1406			$attributes = array();
1407
1408			preg_match_all(
1409				'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
1410				$this->attr['style'],
1411				$matches,
1412				PREG_SET_ORDER
1413			);
1414
1415			foreach ($matches as $match) {
1416				$attributes[$match[1]] = $match[2];
1417			}
1418
1419			// If there is a width in the style attributes:
1420			if (isset($attributes['width']) && $width == -1) {
1421				// check that the last two characters are px (pixels)
1422				if (strtolower(substr($attributes['width'], -2)) === 'px') {
1423					$proposed_width = substr($attributes['width'], 0, -2);
1424					// Now make sure that it's an integer and not something stupid.
1425					if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {
1426						$width = $proposed_width;
1427					}
1428				}
1429			}
1430
1431			// If there is a width in the style attributes:
1432			if (isset($attributes['height']) && $height == -1) {
1433				// check that the last two characters are px (pixels)
1434				if (strtolower(substr($attributes['height'], -2)) == 'px') {
1435					$proposed_height = substr($attributes['height'], 0, -2);
1436					// Now make sure that it's an integer and not something stupid.
1437					if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {
1438						$height = $proposed_height;
1439					}
1440				}
1441			}
1442
1443		}
1444
1445		// Future enhancement:
1446		// Look in the tag to see if there is a class or id specified that has
1447		// a height or width attribute to it.
1448
1449		// Far future enhancement
1450		// Look at all the parent tags of this image to see if they specify a
1451		// class or id that has an img selector that specifies a height or width
1452		// Note that in this case, the class or id will have the img subselector
1453		// for it to apply to the image.
1454
1455		// ridiculously far future development
1456		// If the class or id is specified in a SEPARATE css file thats not on
1457		// the page, go get it and do what we were just doing for the ones on
1458		// the page.
1459
1460		$result = array(
1461			'height' => $height,
1462			'width' => $width
1463		);
1464
1465		return $result;
1466	}
1467
1468	// camel naming conventions
	/**
	 * Return the attribute array of this node.
	 *
	 * @return array The $attr property as stored (values are not passed
	 * through convert_text())
	 */
	function getAllAttributes()
	{
		return $this->attr;
	}
1473
	/**
	 * DOM-style alias for reading an attribute; forwards to __get().
	 *
	 * @param string $name Attribute name
	 * @return mixed Whatever __get($name) returns
	 */
	function getAttribute($name)
	{
		return $this->__get($name);
	}
1478
	/**
	 * DOM-style alias for writing an attribute; forwards to __set().
	 *
	 * @param string $name Attribute name
	 * @param mixed $value New value
	 * @return void The result of __set() is discarded
	 */
	function setAttribute($name, $value)
	{
		$this->__set($name, $value);
	}
1483
	/**
	 * DOM-style alias for testing an attribute; forwards to __isset().
	 *
	 * @param string $name Attribute name
	 * @return bool Whatever __isset($name) returns
	 */
	function hasAttribute($name)
	{
		return $this->__isset($name);
	}
1488
	/**
	 * DOM-style alias for removing an attribute.
	 *
	 * Forwards to __set() with a null value, i.e. the attribute key stays in
	 * the attribute array with value null instead of being unset.
	 *
	 * @param string $name Attribute name
	 * @return void
	 */
	function removeAttribute($name)
	{
		$this->__set($name, null);
	}
1493
	/**
	 * DOM-style alias: run find() with the selector "#<id>" and index 0.
	 *
	 * @param string $id Element id (interpolated into the selector as-is)
	 * @return mixed Whatever find() returns for index 0
	 */
	function getElementById($id)
	{
		return $this->find("#$id", 0);
	}
1498
	/**
	 * DOM-style alias: run find() with the selector "#<id>".
	 *
	 * @param string $id Element id (interpolated into the selector as-is)
	 * @param int|null $idx Index passed through to find(); null by default
	 * @return mixed Whatever find() returns
	 */
	function getElementsById($id, $idx = null)
	{
		return $this->find("#$id", $idx);
	}
1503
	/**
	 * DOM-style alias: run find() with the given name as selector and index 0.
	 *
	 * @param string $name Tag name, used unchanged as the selector
	 * @return mixed Whatever find() returns for index 0
	 */
	function getElementByTagName($name)
	{
		return $this->find($name, 0);
	}
1508
	/**
	 * DOM-style alias: run find() with the given name as selector.
	 *
	 * @param string $name Tag name, used unchanged as the selector
	 * @param int|null $idx Index passed through to find(); null by default
	 * @return mixed Whatever find() returns
	 */
	function getElementsByTagName($name, $idx = null)
	{
		return $this->find($name, $idx);
	}
1513
	/**
	 * DOM-style alias for parent() called without arguments.
	 *
	 * @return mixed Whatever parent() returns
	 */
	function parentNode()
	{
		return $this->parent();
	}
1518
	/**
	 * DOM-style alias for children().
	 *
	 * @param int $idx Index passed through to children(); -1 by default
	 * @return mixed Whatever children($idx) returns
	 */
	function childNodes($idx = -1)
	{
		return $this->children($idx);
	}
1523
	/**
	 * DOM-style alias for first_child().
	 *
	 * @return mixed Whatever first_child() returns
	 */
	function firstChild()
	{
		return $this->first_child();
	}
1528
	/**
	 * DOM-style alias for last_child().
	 *
	 * @return mixed Whatever last_child() returns
	 */
	function lastChild()
	{
		return $this->last_child();
	}
1533
	/**
	 * DOM-style alias for next_sibling().
	 *
	 * @return mixed Whatever next_sibling() returns
	 */
	function nextSibling()
	{
		return $this->next_sibling();
	}
1538
	/**
	 * DOM-style alias for prev_sibling().
	 *
	 * @return mixed Whatever prev_sibling() returns
	 */
	function previousSibling()
	{
		return $this->prev_sibling();
	}
1543
	/**
	 * DOM-style alias for has_child().
	 *
	 * @return mixed Whatever has_child() returns
	 */
	function hasChildNodes()
	{
		return $this->has_child();
	}
1548
	/**
	 * DOM-style accessor for the tag name of this node.
	 *
	 * @return string The $tag property
	 */
	function nodeName()
	{
		return $this->tag;
	}
1553
	/**
	 * DOM-style helper that hands this node to the given node's parent()
	 * method and returns the given node.
	 *
	 * NOTE(review): presumably parent() called with an argument re-parents
	 * $node below this node - confirm against parent().
	 *
	 * @param object $node Node to attach
	 * @return object The node that was passed in
	 */
	function appendChild($node)
	{
		$node->parent($this);
		return $node;
	}
1559
1560}
1561
1562/**
1563 * simple html dom parser
1564 *
1565 * Paperg - in the find routine: allow us to specify that we want case
1566 * insensitive testing of the value of the selector.
1567 *
1568 * Paperg - change $size from protected to public so we can easily access it
1569 *
1570 * Paperg - added ForceTagsClosed in the constructor which tells us whether we
1571 * trust the html or not.  Default is to NOT trust it.
1572 *
1573 * @package PlaceLocalInclude
1574 */
1575class simple_html_dom
1576{
1577	/**
1578	 * The root node of the document
1579	 *
1580	 * @var object
1581	 */
1582	public $root = null;
1583
1584	/**
1585	 * List of nodes in the current DOM
1586	 *
1587	 * @var array
1588	 */
1589	public $nodes = array();
1590
1591	/**
1592	 * Callback function to run for each element in the DOM.
1593	 *
1594	 * @var callable|null
1595	 */
1596	public $callback = null;
1597
1598	/**
1599	 * Indicates how tags and attributes are matched
1600	 *
1601	 * @var bool When set to **true** tags and attributes will be converted to
1602	 * lowercase before matching.
1603	 */
1604	public $lowercase = false;
1605
1606	/**
1607	 * Original document size
1608	 *
1609	 * Holds the original document size.
1610	 *
1611	 * @var int
1612	 */
1613	public $original_size;
1614
1615	/**
1616	 * Current document size
1617	 *
1618	 * Holds the current document size. The document size is determined by the
1619	 * string length of ({@see simple_html_dom::$doc}).
1620	 *
1621	 * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1622	 *
1623	 * @var int
1624	 * */
1625	public $size;
1626
1627	/**
1628	 * Current position in the document
1629	 *
1630	 * @var int
1631	 */
1632	protected $pos;
1633
1634	/**
1635	 * The document
1636	 *
1637	 * @var string
1638	 */
1639	protected $doc;
1640
1641	/**
1642	 * Current character
1643	 *
1644	 * Holds the current character at position {@see simple_html_dom::$pos} in
1645	 * the document {@see simple_html_dom::$doc}
1646	 *
1647	 * _Note_: Using this variable is more efficient than calling
1648	 * `substr($doc, $pos, 1)`
1649	 *
1650	 * @var string
1651	 */
1652	protected $char;
1653
	/**
	 * Running counter maintained while parsing
	 *
	 * Initialised to 1 by prepare() and incremented by parse() for every text
	 * node it creates. Its value is stored as HDOM_INFO_END on the root node
	 * once loading has finished.
	 *
	 * @var int
	 */
	protected $cursor;

	/**
	 * Parent node of the next node detected by the parser
	 *
	 * @var object
	 */
	protected $parent;

	/**
	 * Noise storage
	 *
	 * Reset to an empty array by prepare() and unset by clear().
	 * NOTE(review): presumably filled by remove_noise() with the fragments
	 * it strips from the document - confirm against that method.
	 *
	 * @var array
	 */
	protected $noise = array();
1663
1664	/**
1665	 * Tokens considered blank in HTML
1666	 *
1667	 * @var string
1668	 */
1669	protected $token_blank = " \t\r\n";
1670
1671	/**
1672	 * Tokens to identify the equal sign for attributes, stopping either at the
1673	 * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1674	 * "<html>")
1675	 *
1676	 * @var string
1677	 */
1678	protected $token_equal = ' =/>';
1679
1680	/**
1681	 * Tokens to identify the end of a tag name. A tag name either ends on the
1682	 * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1683	 *
1684	 * @var string
1685	 */
1686	protected $token_slash = " />\r\n\t";
1687
1688	/**
1689	 * Tokens to identify the end of an attribute
1690	 *
1691	 * @var string
1692	 */
1693	protected $token_attr = ' >';
1694
1695	// Note that this is referenced by a child node, and so it needs to be
1696	// public for that node to see this information.
1697	public $_charset = '';
1698	public $_target_charset = '';
1699
1700	/**
1701	 * Innertext for <br> elements
1702	 *
1703	 * @var string
1704	 */
1705	protected $default_br_text = '';
1706
1707	/**
1708	 * Suffix for <span> elements
1709	 *
1710	 * @var string
1711	 */
1712	public $default_span_text = '';
1713
1714	/**
1715	 * Defines a list of self-closing tags (Void elements) according to the HTML
1716	 * Specification
1717	 *
1718	 * _Remarks_:
1719	 * - Use `isset()` instead of `in_array()` on array elements to boost
1720	 * performance about 30%
1721	 * - Sort elements by name for better readability!
1722	 *
1723	 * @link https://www.w3.org/TR/html HTML Specification
1724	 * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1725	 */
1726	protected $self_closing_tags = array(
1727		'area' => 1,
1728		'base' => 1,
1729		'br' => 1,
1730		'col' => 1,
1731		'embed' => 1,
1732		'hr' => 1,
1733		'img' => 1,
1734		'input' => 1,
1735		'link' => 1,
1736		'meta' => 1,
1737		'param' => 1,
1738		'source' => 1,
1739		'track' => 1,
1740		'wbr' => 1
1741	);
1742
1743	/**
1744	 * Defines a list of tags which - if closed - close all optional closing
1745	 * elements within if they haven't been closed yet. (So, an element where
1746	 * neither opening nor closing tag is omissible consistently closes every
1747	 * optional closing element within)
1748	 *
1749	 * _Remarks_:
1750	 * - Use `isset()` instead of `in_array()` on array elements to boost
1751	 * performance about 30%
1752	 * - Sort elements by name for better readability!
1753	 */
1754	protected $block_tags = array(
1755		'body' => 1,
1756		'div' => 1,
1757		'form' => 1,
1758		'root' => 1,
1759		'span' => 1,
1760		'table' => 1
1761	);
1762
1763	/**
1764	 * Defines elements whose end tag is omissible.
1765	 *
1766	 * * key = Name of an element whose end tag is omissible.
1767	 * * value = Names of elements whose end tag is omissible, that are closed
1768	 * by the current element.
1769	 *
1770	 * _Remarks_:
1771	 * - Use `isset()` instead of `in_array()` on array elements to boost
1772	 * performance about 30%
1773	 * - Sort elements by name for better readability!
1774	 *
1775	 * **Example**
1776	 *
1777	 * An `li` element’s end tag may be omitted if the `li` element is immediately
1778	 * followed by another `li` element. To do that, add following element to the
1779	 * array:
1780	 *
1781	 * ```php
1782	 * 'li' => array('li'),
1783	 * ```
1784	 *
1785	 * With this, the following two examples are considered equal. Note that the
1786	 * second example is missing the closing tags on `li` elements.
1787	 *
1788	 * ```html
1789	 * <ul><li>First Item</li><li>Second Item</li></ul>
1790	 * ```
1791	 *
1792	 * <ul><li>First Item</li><li>Second Item</li></ul>
1793	 *
1794	 * ```html
1795	 * <ul><li>First Item<li>Second Item</ul>
1796	 * ```
1797	 *
1798	 * <ul><li>First Item<li>Second Item</ul>
1799	 *
1800	 * @var array A two-dimensional array where the key is the name of an
1801	 * element whose end tag is omissible and the value is an array of elements
1802	 * whose end tag is omissible, that are closed by the current element.
1803	 *
1804	 * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1805	 *
	 * @todo The implementation of optional closing tags doesn't work in all cases
	 * because it only considers elements that close other optional closing
	 * tags, not taking into account that some (non-blocking) tags should close
	 * these optional closing tags. For example, the end tag for "p" is omissible
1810	 * and can be closed by an "address" element, whose end tag is NOT omissible.
1811	 * Currently a "p" element without closing tag stops at the next "p" element
1812	 * or blocking tag, even if it contains other elements.
1813	 *
1814	 * @todo Known sourceforge issue #2977341
1815	 * B tags that are not closed cause us to return everything to the end of
1816	 * the document.
1817	 */
1818	protected $optional_closing_tags = array(
1819		// Not optional, see
1820		// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1821		'b' => array('b' => 1),
1822		'dd' => array('dd' => 1, 'dt' => 1),
1823		// Not optional, see
1824		// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1825		'dl' => array('dd' => 1, 'dt' => 1),
1826		'dt' => array('dd' => 1, 'dt' => 1),
1827		'li' => array('li' => 1),
1828		'optgroup' => array('optgroup' => 1, 'option' => 1),
1829		'option' => array('optgroup' => 1, 'option' => 1),
1830		'p' => array('p' => 1),
1831		'rp' => array('rp' => 1, 'rt' => 1),
1832		'rt' => array('rp' => 1, 'rt' => 1),
1833		'td' => array('td' => 1, 'th' => 1),
1834		'th' => array('td' => 1, 'th' => 1),
1835		'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1836	);
1837
1838	function __construct(
1839		$str = null,
1840		$lowercase = true,
1841		$forceTagsClosed = true,
1842		$target_charset = DEFAULT_TARGET_CHARSET,
1843		$stripRN = true,
1844		$defaultBRText = DEFAULT_BR_TEXT,
1845		$defaultSpanText = DEFAULT_SPAN_TEXT,
1846		$options = 0)
1847	{
1848		if ($str) {
1849			if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
1850				$this->load_file($str);
1851			} else {
1852				$this->load(
1853					$str,
1854					$lowercase,
1855					$stripRN,
1856					$defaultBRText,
1857					$defaultSpanText,
1858					$options
1859				);
1860			}
1861		}
1862		// Forcing tags to be closed implies that we don't trust the html, but
1863		// it can lead to parsing errors if we SHOULD trust the html.
1864		if (!$forceTagsClosed) {
1865			$this->optional_closing_array = array();
1866		}
1867
1868		$this->_target_charset = $target_charset;
1869	}
1870
	/**
	 * Release all node references via clear() when the DOM is destroyed.
	 */
	function __destruct()
	{
		$this->clear();
	}
1875
1876	// load html from string
1877	function load(
1878		$str,
1879		$lowercase = true,
1880		$stripRN = true,
1881		$defaultBRText = DEFAULT_BR_TEXT,
1882		$defaultSpanText = DEFAULT_SPAN_TEXT,
1883		$options = 0)
1884	{
1885		global $debug_object;
1886
1887		// prepare
1888		$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
1889
1890		// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1891		// Script tags removal now preceeds style tag removal.
1892		// strip out <script> tags
1893		$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
1894		$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
1895
1896		// strip out the \r \n's if we are told to.
1897		if ($stripRN) {
1898			$this->doc = str_replace("\r", ' ', $this->doc);
1899			$this->doc = str_replace("\n", ' ', $this->doc);
1900
1901			// set the length of content since we have changed it.
1902			$this->size = strlen($this->doc);
1903		}
1904
1905		// strip out cdata
1906		$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
1907		// strip out comments
1908		$this->remove_noise("'<!--(.*?)-->'is");
1909		// strip out <style> tags
1910		$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
1911		$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
1912		// strip out preformatted tags
1913		$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
1914		// strip out server side scripts
1915		$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
1916
1917		if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
1918			$this->remove_noise("'(\{\w)(.*?)(\})'s", true);
1919		}
1920
1921		// parsing
1922		$this->parse();
1923		// end
1924		$this->root->_[HDOM_INFO_END] = $this->cursor;
1925		$this->parse_charset();
1926
1927		// make load function chainable
1928		return $this;
1929	}
1930
1931	// load html from file
1932	function load_file()
1933	{
1934		$args = func_get_args();
1935
1936		if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
1937			$this->load($doc, true);
1938		} else {
1939			return false;
1940		}
1941	}
1942
1943	/**
1944	 * Set the callback function
1945	 *
1946	 * @param callable $function_name Callback function to run for each element
1947	 * in the DOM.
1948	 * @return void
1949	 */
	function set_callback($function_name)
	{
		// Stored on the public $callback property; remove_callback() resets
		// it to null again.
		$this->callback = $function_name;
	}
1954
1955	/**
1956	 * Remove callback function
1957	 *
1958	 * @return void
1959	 */
	function remove_callback()
	{
		// Back to the property's declared default
		$this->callback = null;
	}
1964
1965	// save dom as string
1966	function save($filepath = '')
1967	{
1968		$ret = $this->root->innertext();
1969		if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
1970		return $ret;
1971	}
1972
	/**
	 * Find DOM nodes by CSS selector; forwards to find() on the root node.
	 *
	 * Paperg - allow us to specify that we want case insensitive testing of
	 * the value of the selector.
	 *
	 * @param string $selector CSS selector
	 * @param int|null $idx Passed through to the root node's find()
	 * @param bool $lowercase Passed through to the root node's find()
	 * @return mixed Whatever the root node's find() returns
	 */
	function find($selector, $idx = null, $lowercase = false)
	{
		return $this->root->find($selector, $idx, $lowercase);
	}
1979
1980	// clean up memory due to php5 circular references memory leak...
1981	function clear()
1982	{
1983		foreach ($this->nodes as $n) {
1984			$n->clear(); $n = null;
1985		}
1986
1987		// This add next line is documented in the sourceforge repository.
1988		// 2977248 as a fix for ongoing memory leaks that occur even with the
1989		// use of clear.
1990		if (isset($this->children)) {
1991			foreach ($this->children as $n) {
1992				$n->clear(); $n = null;
1993			}
1994		}
1995
1996		if (isset($this->parent)) {
1997			$this->parent->clear();
1998			unset($this->parent);
1999		}
2000
2001		if (isset($this->root)) {
2002			$this->root->clear();
2003			unset($this->root);
2004		}
2005
2006		unset($this->doc);
2007		unset($this->noise);
2008	}
2009
	/**
	 * Dump the DOM; forwards to dump() on the root node.
	 *
	 * @param bool $show_attr Passed through to the root node's dump()
	 * @return void
	 */
	function dump($show_attr = true)
	{
		$this->root->dump($show_attr);
	}
2014
2015	// prepare HTML data and init everything
2016	protected function prepare(
2017		$str, $lowercase = true,
2018		$defaultBRText = DEFAULT_BR_TEXT,
2019		$defaultSpanText = DEFAULT_SPAN_TEXT)
2020	{
2021		$this->clear();
2022
2023		$this->doc = trim($str);
2024		$this->size = strlen($this->doc);
2025		$this->original_size = $this->size; // original size of the html
2026		$this->pos = 0;
2027		$this->cursor = 1;
2028		$this->noise = array();
2029		$this->nodes = array();
2030		$this->lowercase = $lowercase;
2031		$this->default_br_text = $defaultBRText;
2032		$this->default_span_text = $defaultSpanText;
2033		$this->root = new simple_html_dom_node($this);
2034		$this->root->tag = 'root';
2035		$this->root->_[HDOM_INFO_BEGIN] = -1;
2036		$this->root->nodetype = HDOM_TYPE_ROOT;
2037		$this->parent = $this->root;
2038		if ($this->size > 0) { $this->char = $this->doc[0]; }
2039	}
2040
2041	/**
2042	 * Parse HTML content
2043	 *
2044	 * @return bool True on success
2045	 */
2046	protected function parse()
2047	{
2048		while (true) {
2049			// Read next tag if there is no text between current position and the
2050			// next opening tag.
2051			if (($s = $this->copy_until_char('<')) === '') {
2052				if($this->read_tag()) {
2053					continue;
2054				} else {
2055					return true;
2056				}
2057			}
2058
2059			// Add a text node for text between tags
2060			$node = new simple_html_dom_node($this);
2061			++$this->cursor;
2062			$node->_[HDOM_INFO_TEXT] = $s;
2063			$this->link_nodes($node, false);
2064		}
2065	}
2066
2067	// PAPERG - dkchou - added this to try to identify the character set of the
2068	// page we have just parsed so we know better how to spit it out later.
2069	// NOTE:  IF you provide a routine called
2070	// get_last_retrieve_url_contents_content_type which returns the
2071	// CURLINFO_CONTENT_TYPE from the last curl_exec
2072	// (or the content_type header from the last transfer), we will parse THAT,
2073	// and if a charset is specified, we will use it over any other mechanism.
	protected function parse_charset()
	{
		global $debug_object;

		$charset = null;

		// Step 1: an optional user supplied function may report the
		// content-type header of the last transfer.
		// NOTE(review): unlike the meta tag check below, this match is case
		// sensitive, and (.+) captures everything after "charset=".
		if (function_exists('get_last_retrieve_url_contents_content_type')) {
			$contentTypeHeader = get_last_retrieve_url_contents_content_type();
			$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
			if ($success) {
				$charset = $matches[1];
				if (is_object($debug_object)) {
					$debug_object->debug_log(2,
						'header content-type found charset of: '
						. $charset
					);
				}
			}
		}

		// Step 2: look for <meta http-equiv="Content-Type" content="...">
		if (empty($charset)) {
			$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

			if (!empty($el)) {
				$fullvalue = $el->content;
				if (is_object($debug_object)) {
					$debug_object->debug_log(2,
						'meta content-type tag found'
						. $fullvalue
					);
				}

				if (!empty($fullvalue)) {
					$success = preg_match(
						'/charset=(.+)/i',
						$fullvalue,
						$matches
					);

					if ($success) {
						$charset = $matches[1];
					} else {
						// If there is a meta tag, and they don't specify the
						// character set, research says that it's typically
						// ISO-8859-1
						if (is_object($debug_object)) {
							$debug_object->debug_log(2,
								'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
							);
						}

						$charset = 'ISO-8859-1';
					}
				}
			}
		}

		// Step 3: if we couldn't find a charset above, then lets try to
		// detect one based on the text we got...
		if (empty($charset)) {
			// Use this in case mb_detect_encoding isn't installed/loaded on
			// this machine.
			$charset = false;
			if (function_exists('mb_detect_encoding')) {
				// Have php try to detect the encoding from the text given to us.
				// NOTE(review): the literal 'ascii' appended to the document
				// looks intentional (it guarantees non-empty input) - confirm.
				$charset = mb_detect_encoding(
					$this->doc . 'ascii',
					$encoding_list = array( 'UTF-8', 'CP1252' )
				);

				if (is_object($debug_object)) {
					$debug_object->debug_log(2, 'mb_detect found: ' . $charset);
				}
			}

			// and if this doesn't work...  then we need to just wrongheadedly
			// assume it's UTF-8 so that we can move on - cause this will
			// usually give us most of what we need...
			if ($charset === false) {
				if (is_object($debug_object)) {
					$debug_object->debug_log(
						2,
						'since mb_detect failed - using default of utf-8'
					);
				}

				$charset = 'UTF-8';
			}
		}

		// Since CP1252 is a superset, if we get one of its subsets, we want
		// it instead.
		if ((strtolower($charset) == strtolower('ISO-8859-1'))
			|| (strtolower($charset) == strtolower('Latin1'))
			|| (strtolower($charset) == strtolower('Latin-1'))) {

			if (is_object($debug_object)) {
				$debug_object->debug_log(
					2,
					'replacing ' . $charset . ' with CP1252 as its a superset'
				);
			}

			$charset = 'CP1252';
		}

		if (is_object($debug_object)) {
			$debug_object->debug_log(1, 'EXIT - ' . $charset);
		}

		// Remember the result on the DOM (read by the nodes when converting
		// text) and hand it back to the caller.
		return $this->_charset = $charset;
	}
2186
2187	/**
2188	 * Parse tag from current document position.
2189	 *
2190	 * @return bool True if a tag was found, false otherwise
2191	 */
2192	protected function read_tag()
2193	{
2194		// Set end position if no further tags found
2195		if ($this->char !== '<') {
2196			$this->root->_[HDOM_INFO_END] = $this->cursor;
2197			return false;
2198		}
2199
2200		$begin_tag_pos = $this->pos;
2201		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2202
2203		// end tag
2204		if ($this->char === '/') {
2205			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2206
2207			// Skip whitespace in end tags (i.e. in "</   html>")
2208			$this->skip($this->token_blank);
2209			$tag = $this->copy_until_char('>');
2210
2211			// Skip attributes in end tags
2212			if (($pos = strpos($tag, ' ')) !== false) {
2213				$tag = substr($tag, 0, $pos);
2214			}
2215
2216			$parent_lower = strtolower($this->parent->tag);
2217			$tag_lower = strtolower($tag);
2218
2219			// The end tag is supposed to close the parent tag. Handle situations
2220			// when it doesn't
2221			if ($parent_lower !== $tag_lower) {
2222				// Parent tag does not have to be closed necessarily (optional closing tag)
2223				// Current tag is a block tag, so it may close an ancestor
2224				if (isset($this->optional_closing_tags[$parent_lower])
2225					&& isset($this->block_tags[$tag_lower])) {
2226
2227					$this->parent->_[HDOM_INFO_END] = 0;
2228					$org_parent = $this->parent;
2229
2230					// Traverse ancestors to find a matching opening tag
2231					// Stop at root node
2232					while (($this->parent->parent)
2233						&& strtolower($this->parent->tag) !== $tag_lower
2234					){
2235						$this->parent = $this->parent->parent;
2236					}
2237
2238					// If we don't have a match add current tag as text node
2239					if (strtolower($this->parent->tag) !== $tag_lower) {
2240						$this->parent = $org_parent; // restore origonal parent
2241
2242						if ($this->parent->parent) {
2243							$this->parent = $this->parent->parent;
2244						}
2245
2246						$this->parent->_[HDOM_INFO_END] = $this->cursor;
2247						return $this->as_text_node($tag);
2248					}
2249				} elseif (($this->parent->parent)
2250					&& isset($this->block_tags[$tag_lower])
2251				) {
2252					// Grandparent exists and current tag is a block tag, so our
2253					// parent doesn't have an end tag
2254					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
2255					$org_parent = $this->parent;
2256
2257					// Traverse ancestors to find a matching opening tag
2258					// Stop at root node
2259					while (($this->parent->parent)
2260						&& strtolower($this->parent->tag) !== $tag_lower
2261					) {
2262						$this->parent = $this->parent->parent;
2263					}
2264
2265					// If we don't have a match add current tag as text node
2266					if (strtolower($this->parent->tag) !== $tag_lower) {
2267						$this->parent = $org_parent; // restore origonal parent
2268						$this->parent->_[HDOM_INFO_END] = $this->cursor;
2269						return $this->as_text_node($tag);
2270					}
2271				} elseif (($this->parent->parent)
2272					&& strtolower($this->parent->parent->tag) === $tag_lower
2273				) { // Grandparent exists and current tag closes it
2274					$this->parent->_[HDOM_INFO_END] = 0;
2275					$this->parent = $this->parent->parent;
2276				} else { // Random tag, add as text node
2277					return $this->as_text_node($tag);
2278				}
2279			}
2280
2281			// Set end position of parent tag to current cursor position
2282			$this->parent->_[HDOM_INFO_END] = $this->cursor;
2283
2284			if ($this->parent->parent) {
2285				$this->parent = $this->parent->parent;
2286			}
2287
2288			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2289			return true;
2290		}
2291
2292		// start tag
2293		$node = new simple_html_dom_node($this);
2294		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
2295		++$this->cursor;
2296		$tag = $this->copy_until($this->token_slash); // Get tag name
2297		$node->tag_start = $begin_tag_pos;
2298
2299		// doctype, cdata & comments...
2300		// <!DOCTYPE html>
2301		// <![CDATA[ ... ]]>
2302		// <!-- Comment -->
2303		if (isset($tag[0]) && $tag[0] === '!') {
2304			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
2305
2306			if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
2307				$node->nodetype = HDOM_TYPE_COMMENT;
2308				$node->tag = 'comment';
2309			} else { // Could be doctype or CDATA but we don't care
2310				$node->nodetype = HDOM_TYPE_UNKNOWN;
2311				$node->tag = 'unknown';
2312			}
2313
2314			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2315
2316			$this->link_nodes($node, true);
2317			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2318			return true;
2319		}
2320
2321		// The start tag cannot contain another start tag, if so add as text
2322		// i.e. "<<html>"
2323		if ($pos = strpos($tag, '<') !== false) {
2324			$tag = '<' . substr($tag, 0, -1);
2325			$node->_[HDOM_INFO_TEXT] = $tag;
2326			$this->link_nodes($node, false);
2327			$this->char = $this->doc[--$this->pos]; // prev
2328			return true;
2329		}
2330
2331		// Handle invalid tag names (i.e. "<html#doc>")
2332		if (!preg_match('/^\w[\w:-]*$/', $tag)) {
2333			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
2334
2335			// Next char is the beginning of a new tag, don't touch it.
2336			if ($this->char === '<') {
2337				$this->link_nodes($node, false);
2338				return true;
2339			}
2340
2341			// Next char closes current tag, add and be done with it.
2342			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2343			$this->link_nodes($node, false);
2344			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2345			return true;
2346		}
2347
2348		// begin tag, add new node
2349		$node->nodetype = HDOM_TYPE_ELEMENT;
2350		$tag_lower = strtolower($tag);
2351		$node->tag = ($this->lowercase) ? $tag_lower : $tag;
2352
2353		// handle optional closing tags
2354		if (isset($this->optional_closing_tags[$tag_lower])) {
2355			// Traverse ancestors to close all optional closing tags
2356			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
2357				$this->parent->_[HDOM_INFO_END] = 0;
2358				$this->parent = $this->parent->parent;
2359			}
2360			$node->parent = $this->parent;
2361		}
2362
2363		$guard = 0; // prevent infinity loop
2364
2365		// [0] Space between tag and first attribute
2366		$space = array($this->copy_skip($this->token_blank), '', '');
2367
2368		// attributes
2369		do {
2370			// Everything until the first equal sign should be the attribute name
2371			$name = $this->copy_until($this->token_equal);
2372
2373			if ($name === '' && $this->char !== null && $space[0] === '') {
2374				break;
2375			}
2376
2377			if ($guard === $this->pos) { // Escape infinite loop
2378				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2379				continue;
2380			}
2381
2382			$guard = $this->pos;
2383
2384			// handle endless '<'
2385			// Out of bounds before the tag ended
2386			if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2387				$node->nodetype = HDOM_TYPE_TEXT;
2388				$node->_[HDOM_INFO_END] = 0;
2389				$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
2390				$node->tag = 'text';
2391				$this->link_nodes($node, false);
2392				return true;
2393			}
2394
2395			// handle mismatch '<'
2396			// Attributes cannot start after opening tag
2397			if ($this->doc[$this->pos - 1] == '<') {
2398				$node->nodetype = HDOM_TYPE_TEXT;
2399				$node->tag = 'text';
2400				$node->attr = array();
2401				$node->_[HDOM_INFO_END] = 0;
2402				$node->_[HDOM_INFO_TEXT] = substr(
2403					$this->doc,
2404					$begin_tag_pos,
2405					$this->pos - $begin_tag_pos - 1
2406				);
2407				$this->pos -= 2;
2408				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2409				$this->link_nodes($node, false);
2410				return true;
2411			}
2412
2413			if ($name !== '/' && $name !== '') { // this is a attribute name
2414				// [1] Whitespace after attribute name
2415				$space[1] = $this->copy_skip($this->token_blank);
2416
2417				$name = $this->restore_noise($name); // might be a noisy name
2418
2419				if ($this->lowercase) { $name = strtolower($name); }
2420
2421				if ($this->char === '=') { // attribute with value
2422					$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2423					$this->parse_attr($node, $name, $space); // get attribute value
2424				} else {
2425					//no value attr: nowrap, checked selected...
2426					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2427					$node->attr[$name] = true;
2428					if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2429				}
2430
2431				$node->_[HDOM_INFO_SPACE][] = $space;
2432
2433				// prepare for next attribute
2434				$space = array(
2435					$this->copy_skip($this->token_blank),
2436					'',
2437					''
2438				);
2439			} else { // no more attributes
2440				break;
2441			}
2442		} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2443
2444		$this->link_nodes($node, true);
2445		$node->_[HDOM_INFO_ENDSPACE] = $space[0];
2446
2447		// handle empty tags (i.e. "<div/>")
2448		if ($this->copy_until_char('>') === '/') {
2449			$node->_[HDOM_INFO_ENDSPACE] .= '/';
2450			$node->_[HDOM_INFO_END] = 0;
2451		} else {
2452			// reset parent
2453			if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2454				$this->parent = $node;
2455			}
2456		}
2457
2458		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2459
2460		// If it's a BR tag, we need to set it's text to the default text.
2461		// This way when we see it in plaintext, we can generate formatting that the user wants.
2462		// since a br tag never has sub nodes, this works well.
2463		if ($node->tag === 'br') {
2464			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
2465		}
2466
2467		return true;
2468	}
2469
2470	/**
2471	 * Parse attribute from current document position
2472	 *
2473	 * @param object $node Node for the attributes
2474	 * @param string $name Name of the current attribute
2475	 * @param array $space Array for spacing information
2476	 * @return void
2477	 */
2478	protected function parse_attr($node, $name, &$space)
2479	{
2480		// Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
2481		// If the attribute is already defined inside a tag, only pay attention
2482		// to the first one as opposed to the last one.
2483		// https://stackoverflow.com/a/26341866
2484		if (isset($node->attr[$name])) {
2485			return;
2486		}
2487
2488		// [2] Whitespace between "=" and the value
2489		$space[2] = $this->copy_skip($this->token_blank);
2490
2491		switch ($this->char) {
2492			case '"': // value is anything between double quotes
2493				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
2494				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2495				$node->attr[$name] = $this->restore_noise($this->copy_until_char('"'));
2496				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2497				break;
2498			case '\'': // value is anything between single quotes
2499				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
2500				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2501				$node->attr[$name] = $this->restore_noise($this->copy_until_char('\''));
2502				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2503				break;
2504			default: // value is anything until the first space or end tag
2505				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2506				$node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
2507		}
2508		// PaperG: Attributes should not have \r or \n in them, that counts as
2509		// html whitespace.
2510		$node->attr[$name] = str_replace("\r", '', $node->attr[$name]);
2511		$node->attr[$name] = str_replace("\n", '', $node->attr[$name]);
2512		// PaperG: If this is a "class" selector, lets get rid of the preceeding
2513		// and trailing space since some people leave it in the multi class case.
2514		if ($name === 'class') {
2515			$node->attr[$name] = trim($node->attr[$name]);
2516		}
2517	}
2518
2519	/**
2520	 * Link node to parent node
2521	 *
2522	 * @param object $node Node to link to parent
2523	 * @param bool $is_child True if the node is a child of parent
2524	 * @return void
2525	 */
2526	// link node's parent
2527	protected function link_nodes(&$node, $is_child)
2528	{
2529		$node->parent = $this->parent;
2530		$this->parent->nodes[] = $node;
2531		if ($is_child) {
2532			$this->parent->children[] = $node;
2533		}
2534	}
2535
2536	/**
2537	 * Add tag as text node to current node
2538	 *
2539	 * @param string $tag Tag name
2540	 * @return bool True on success
2541	 */
2542	protected function as_text_node($tag)
2543	{
2544		$node = new simple_html_dom_node($this);
2545		++$this->cursor;
2546		$node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
2547		$this->link_nodes($node, false);
2548		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2549		return true;
2550	}
2551
2552	/**
2553	 * Seek from the current document position to the first occurrence of a
2554	 * character not defined by the provided string. Update the current document
2555	 * position to the new position.
2556	 *
2557	 * @param string $chars A string containing every allowed character.
2558	 * @return void
2559	 */
2560	protected function skip($chars)
2561	{
2562		$this->pos += strspn($this->doc, $chars, $this->pos);
2563		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2564	}
2565
2566	/**
2567	 * Copy substring from the current document position to the first occurrence
2568	 * of a character not defined by the provided string.
2569	 *
2570	 * @param string $chars A string containing every allowed character.
2571	 * @return string Substring from the current document position to the first
2572	 * occurrence of a character not defined by the provided string.
2573	 */
2574	protected function copy_skip($chars)
2575	{
2576		$pos = $this->pos;
2577		$len = strspn($this->doc, $chars, $pos);
2578		$this->pos += $len;
2579		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2580		if ($len === 0) { return ''; }
2581		return substr($this->doc, $pos, $len);
2582	}
2583
2584	/**
2585	 * Copy substring from the current document position to the first occurrence
2586	 * of any of the provided characters.
2587	 *
2588	 * @param string $chars A string containing every character to stop at.
2589	 * @return string Substring from the current document position to the first
2590	 * occurrence of any of the provided characters.
2591	 */
2592	protected function copy_until($chars)
2593	{
2594		$pos = $this->pos;
2595		$len = strcspn($this->doc, $chars, $pos);
2596		$this->pos += $len;
2597		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2598		return substr($this->doc, $pos, $len);
2599	}
2600
2601	/**
2602	 * Copy substring from the current document position to the first occurrence
2603	 * of the provided string.
2604	 *
2605	 * @param string $char The string to stop at.
2606	 * @return string Substring from the current document position to the first
2607	 * occurrence of the provided string.
2608	 */
2609	protected function copy_until_char($char)
2610	{
2611		if ($this->char === null) { return ''; }
2612
2613		if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
2614			$ret = substr($this->doc, $this->pos, $this->size - $this->pos);
2615			$this->char = null;
2616			$this->pos = $this->size;
2617			return $ret;
2618		}
2619
2620		if ($pos === $this->pos) { return ''; }
2621
2622		$pos_old = $this->pos;
2623		$this->char = $this->doc[$pos];
2624		$this->pos = $pos;
2625		return substr($this->doc, $pos_old, $pos - $pos_old);
2626	}
2627
2628	/**
2629	 * Remove noise from HTML content
2630	 *
2631	 * Noise is stored to {@see simple_html_dom::$noise}
2632	 *
2633	 * @param string $pattern The regex pattern used for finding noise
2634	 * @param bool $remove_tag True to remove the entire match. Default is false
2635	 * to only remove the captured data.
2636	 */
2637	protected function remove_noise($pattern, $remove_tag = false)
2638	{
2639		global $debug_object;
2640		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2641
2642		$count = preg_match_all(
2643			$pattern,
2644			$this->doc,
2645			$matches,
2646			PREG_SET_ORDER | PREG_OFFSET_CAPTURE
2647		);
2648
2649		for ($i = $count - 1; $i > -1; --$i) {
2650			$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
2651
2652			if (is_object($debug_object)) {
2653				$debug_object->debug_log(2, 'key is: ' . $key);
2654			}
2655
2656			$idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
2657			$this->noise[$key] = $matches[$i][$idx][0];
2658			$this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
2659		}
2660
2661		// reset the length of content
2662		$this->size = strlen($this->doc);
2663
2664		if ($this->size > 0) {
2665			$this->char = $this->doc[0];
2666		}
2667	}
2668
2669	/**
2670	 * Restore noise to HTML content
2671	 *
2672	 * Noise is restored from {@see simple_html_dom::$noise}
2673	 *
2674	 * @param string $text A subset of HTML containing noise
2675	 * @return string The same content with noise restored
2676	 */
2677	function restore_noise($text)
2678	{
2679		global $debug_object;
2680		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2681
2682		while (($pos = strpos($text, '___noise___')) !== false) {
2683			// Sometimes there is a broken piece of markup, and we don't GET the
2684			// pos+11 etc... token which indicates a problem outside of us...
2685
2686			// todo: "___noise___1000" (or any number with four or more digits)
2687			// in the DOM causes an infinite loop which could be utilized by
2688			// malicious software
2689			if (strlen($text) > $pos + 15) {
2690				$key = '___noise___'
2691				. $text[$pos + 11]
2692				. $text[$pos + 12]
2693				. $text[$pos + 13]
2694				. $text[$pos + 14]
2695				. $text[$pos + 15];
2696
2697				if (is_object($debug_object)) {
2698					$debug_object->debug_log(2, 'located key of: ' . $key);
2699				}
2700
2701				if (isset($this->noise[$key])) {
2702					$text = substr($text, 0, $pos)
2703					. $this->noise[$key]
2704					. substr($text, $pos + 16);
2705				} else {
2706					// do this to prevent an infinite loop.
2707					$text = substr($text, 0, $pos)
2708					. 'UNDEFINED NOISE FOR KEY: '
2709					. $key
2710					. substr($text, $pos + 16);
2711				}
2712			} else {
2713				// There is no valid key being given back to us... We must get
2714				// rid of the ___noise___ or we will have a problem.
2715				$text = substr($text, 0, $pos)
2716				. 'NO NUMERIC NOISE KEY'
2717				. substr($text, $pos + 11);
2718			}
2719		}
2720		return $text;
2721	}
2722
2723	// Sometimes we NEED one of the noise elements.
2724	function search_noise($text)
2725	{
2726		global $debug_object;
2727		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2728
2729		foreach($this->noise as $noiseElement) {
2730			if (strpos($noiseElement, $text) !== false) {
2731				return $noiseElement;
2732			}
2733		}
2734	}
2735
2736	function __toString()
2737	{
2738		return $this->root->innertext();
2739	}
2740
2741	function __get($name)
2742	{
2743		switch ($name) {
2744			case 'outertext':
2745				return $this->root->innertext();
2746			case 'innertext':
2747				return $this->root->innertext();
2748			case 'plaintext':
2749				return $this->root->text();
2750			case 'charset':
2751				return $this->_charset;
2752			case 'target_charset':
2753				return $this->_target_charset;
2754		}
2755	}
2756
2757	// camel naming conventions
2758	function childNodes($idx = -1)
2759	{
2760		return $this->root->childNodes($idx);
2761	}
2762
2763	function firstChild()
2764	{
2765		return $this->root->first_child();
2766	}
2767
2768	function lastChild()
2769	{
2770		return $this->root->last_child();
2771	}
2772
2773	function createElement($name, $value = null)
2774	{
2775		return @str_get_html("<$name>$value</$name>")->first_child();
2776	}
2777
2778	function createTextNode($value)
2779	{
2780		return @end(str_get_html($value)->nodes);
2781	}
2782
2783	function getElementById($id)
2784	{
2785		return $this->find("#$id", 0);
2786	}
2787
2788	function getElementsById($id, $idx = null)
2789	{
2790		return $this->find("#$id", $idx);
2791	}
2792
2793	function getElementByTagName($name)
2794	{
2795		return $this->find($name, 0);
2796	}
2797
2798	function getElementsByTagName($name, $idx = -1)
2799	{
2800		return $this->find($name, $idx);
2801	}
2802
2803	function loadFile()
2804	{
2805		$args = func_get_args();
2806		$this->load_file($args);
2807	}
2808}