YFQ08C4Z

· 6 years ago · Mar 20, 2019, 08:24 AM
1
2<?php
3
/* START: do-not-edit section 1 */
/**
 * Constants used by the simple_html_dom classes and helpers in this file.
 *
 * @author S.C. Chen <me578022@gmail.com>
 */
error_reporting(-1);

// Node types
const HDOM_TYPE_ELEMENT = 1;
const HDOM_TYPE_COMMENT = 2;
const HDOM_TYPE_TEXT = 3;
const HDOM_TYPE_ENDTAG = 4;
const HDOM_TYPE_ROOT = 5;
const HDOM_TYPE_UNKNOWN = 6;

// Attribute quote styles
const HDOM_QUOTE_DOUBLE = 0;
const HDOM_QUOTE_SINGLE = 1;
const HDOM_QUOTE_NO = 3;

// Keys of a node's info array
const HDOM_INFO_BEGIN = 0;
const HDOM_INFO_END = 1;
const HDOM_INFO_QUOTE = 2;
const HDOM_INFO_SPACE = 3;
const HDOM_INFO_TEXT = 4;
const HDOM_INFO_INNER = 5;
const HDOM_INFO_OUTER = 6;
const HDOM_INFO_ENDSPACE = 7;

/** The default target charset */
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
/** The default <br> text used instead of <br> tags when returning text */
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
/** The default <span> text used instead of <span> tags when returning text */
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
/** The maximum file size the parser should load */
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
/** Contents between curly braces "{" and "}" are interpreted as text */
const HDOM_SMARTY_AS_TEXT = 1;

/* END: do-not-edit section 1 */
39
40
// URL template; the literal "SERVICETAG" is replaced with each service tag
$destinationUrl = "https://www.dell.com/support/home/de/de/dedhs1/product-support/servicetag/SERVICETAG/warranty";
// List of service tags to look up
$serviceTags = array("jcss0n2");
// Text to search for in the table header cells
$keyToSearchFor = "Versanddatum";


// Process every service tag
foreach ($serviceTags as $tag) {
	// Insert the (URL-encoded) service tag into the URL template
	$tempUrl = str_replace("SERVICETAG", rawurlencode($tag), $destinationUrl);

	// Fetch and parse the page
	$test = file_get_html($tempUrl);

	// file_get_html() returns false when the download fails, the document is
	// empty or it exceeds the size limit; calling find() on false would be a
	// fatal error, so skip this tag instead.
	if ($test === false) {
		trigger_error('Could not load or parse ' . $tempUrl, E_USER_WARNING);
		continue;
	}

	// Dump the fetched page.
	// NOTE(review): the original comment said "bypass Google captcha"; the
	// echo itself only prints the document - confirm whether it is still needed.
	echo $test;

	// Scan all table header cells
	foreach ($test->find('th') as $th) {
		// Look for $keyToSearchFor in the cell's HTML
		if (strpos((string) $th, $keyToSearchFor) !== false) {
			// Print the cell when it matches
			echo "bla";
			echo $th;
		}
	}

	// Drop the DOM's internal references before loading the next page
	$test->clear();
}
67/* DO NOT TOUCH BELOW 
68
69INCLUDE LIBRARY
70
71*/
72
73/**
74 * Website: http://sourceforge.net/projects/simplehtmldom/
75 * Additional projects: http://sourceforge.net/projects/debugobject/
76 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
77 * Contributions by:
78 *	 Yousuke Kumakura (Attribute filters)
79 *	 Vadim Voituk (Negative indexes supports of "find" method)
80 *	 Antcs (Constructor with automatically load contents either text or file/url)
81 *
82 * all affected sections have comments starting with "PaperG"
83 *
84 * Paperg - Added case insensitive testing of the value of the selector.
85 *
86 * Paperg - Added tag_start for the starting index of tags - NOTE: This works
87 * but not accurately. This tag_start gets counted AFTER \r\n have been crushed
88 * out, and after the remove_noice calls so it will not reflect the REAL
89 * position of the tag in the source, it will almost always be smaller by some
90 * amount. We use this to determine how far into the file the tag in question
91 * is. This "percentage" will never be accurate as the $dom->size is the "real"
92 * number of bytes the dom was created from. But for most purposes, it's a
93 * really good estimation.
94 *
95 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
96 * closed is great for malformed html, but it CAN lead to parsing errors.
97 *
98 * Allow the user to tell us how much they trust the html.
99 *
100 * Paperg add the text and plaintext to the selectors for the find syntax.
101 * plaintext implies text in the innertext of a node.  text implies that the
102 * tag is a text node. This allows for us to find tags based on the text they
103 * contain.
104 *
105 * Create find_ancestor_tag to see if a tag is - at any level - inside of
106 * another specific tag.
107 *
108 * Paperg: added parse_charset so that we know about the character set of
109 * the source document. NOTE: If the user's system has a routine called
110 * get_last_retrieve_url_contents_content_type available, we will assume it's
111 * returning the content-type header from the last transfer or curl_exec, and
112 * we will parse that and use it in preference to any other method of charset
113 * detection.
114 *
115 * Found infinite loop in the case of broken html in restore_noise. Rewrote to
116 * protect from that.
117 *
118 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
119 *
120 * Licensed under The MIT License
121 * Redistributions of files must retain the above copyright notice.
122 *
123 * @author S.C. Chen <me578022@gmail.com>
124 * @author John Schlick
125 * @author Rus Carroll
126 * @version Rev. 1.8.1 (247)
127 * @package PlaceLocalInclude
128 * @subpackage simple_html_dom
129 */
130
/**
 * All of the Defines for the classes below.
 *
 * Every constant is only defined when it does not exist yet. The same names
 * are already declared in the header section of this file, and defining a
 * constant twice raises an "already defined" warning.
 *
 * @author S.C. Chen <me578022@gmail.com>
 */
defined('HDOM_TYPE_ELEMENT') || define('HDOM_TYPE_ELEMENT', 1);
defined('HDOM_TYPE_COMMENT') || define('HDOM_TYPE_COMMENT', 2);
defined('HDOM_TYPE_TEXT') || define('HDOM_TYPE_TEXT', 3);
defined('HDOM_TYPE_ENDTAG') || define('HDOM_TYPE_ENDTAG', 4);
defined('HDOM_TYPE_ROOT') || define('HDOM_TYPE_ROOT', 5);
defined('HDOM_TYPE_UNKNOWN') || define('HDOM_TYPE_UNKNOWN', 6);
defined('HDOM_QUOTE_DOUBLE') || define('HDOM_QUOTE_DOUBLE', 0);
defined('HDOM_QUOTE_SINGLE') || define('HDOM_QUOTE_SINGLE', 1);
defined('HDOM_QUOTE_NO') || define('HDOM_QUOTE_NO', 3);
defined('HDOM_INFO_BEGIN') || define('HDOM_INFO_BEGIN', 0);
defined('HDOM_INFO_END') || define('HDOM_INFO_END', 1);
defined('HDOM_INFO_QUOTE') || define('HDOM_INFO_QUOTE', 2);
defined('HDOM_INFO_SPACE') || define('HDOM_INFO_SPACE', 3);
defined('HDOM_INFO_TEXT') || define('HDOM_INFO_TEXT', 4);
defined('HDOM_INFO_INNER') || define('HDOM_INFO_INNER', 5);
defined('HDOM_INFO_OUTER') || define('HDOM_INFO_OUTER', 6);
defined('HDOM_INFO_ENDSPACE') || define('HDOM_INFO_ENDSPACE', 7);

/** The default target charset */
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');

/** The default <br> text used instead of <br> tags when returning text */
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");

/** The default <span> text used instead of <span> tags when returning text */
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');

/** The maximum file size the parser should load */
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);

/** Contents between curly braces "{" and "}" are interpreted as text */
defined('HDOM_SMARTY_AS_TEXT') || define('HDOM_SMARTY_AS_TEXT', 1);
167
// helper functions
// -----------------------------------------------------------------------------

/**
 * Load a file or URL and parse its contents into a DOM object.
 *
 * The first five parameters are forwarded to file_get_contents(); the
 * remaining ones are forwarded to the simple_html_dom constructor / load().
 *
 * @param string $url File name or URL to read
 * @param bool $use_include_path Forwarded to file_get_contents()
 * @param resource|null $context Forwarded to file_get_contents()
 * @param int $offset Forwarded to file_get_contents()
 * @param int $maxLen Maximum number of bytes to read; values <= 0 are
 * replaced with MAX_FILE_SIZE
 * @param bool $lowercase Forwarded to the DOM (can force selectors to lowercase)
 * @param bool $forceTagsClosed Forwarded to the DOM constructor
 * @param string $target_charset Forwarded to the DOM constructor
 * @param bool $stripRN Forwarded to the DOM
 * @param string $defaultBRText Forwarded to the DOM constructor
 * @param string $defaultSpanText Forwarded to the DOM constructor
 * @return simple_html_dom|false The loaded DOM, or false if the contents
 * could not be read, are empty or are longer than $maxLen
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	// Ensure maximum length is greater than zero
	if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

	// We DO force the tags to be terminated.
	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText);

	$contents = file_get_contents(
		$url,
		$use_include_path,
		$context,
		$offset,
		$maxLen);

	// file_get_contents() yields false on failure; empty() covers that case
	// as well as empty documents.
	if (empty($contents) || strlen($contents) > $maxLen) {
		// Release the unused DOM object, as str_get_html() does
		$dom->clear();
		return false;
	}

	// The second parameter can force the selectors to all be lowercase.
	$dom->load($contents, $lowercase, $stripRN);
	return $dom;
}
218
/**
 * Parse an HTML string into a DOM object.
 *
 * @param string $str The HTML source
 * @param bool $lowercase Forwarded to the DOM constructor and load()
 * @param bool $forceTagsClosed Forwarded to the DOM constructor
 * @param string $target_charset Forwarded to the DOM constructor
 * @param bool $stripRN Forwarded to the DOM constructor and load()
 * @param string $defaultBRText Forwarded to the DOM constructor
 * @param string $defaultSpanText Forwarded to the DOM constructor
 * @return simple_html_dom|false The loaded DOM, or false if $str is empty
 * or longer than MAX_FILE_SIZE
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$document = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText);

	$is_loadable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;

	if (!$is_loadable) {
		// Release the unused DOM object before reporting the failure
		$document->clear();
		return false;
	}

	$document->load($str, $lowercase, $stripRN);

	return $document;
}
246
/**
 * Dump the tree below a node by delegating to the node's dump() method.
 *
 * @param object $node Node whose tree is printed
 * @param bool $show_attr Whether attributes are printed as well
 * @param int $deep Indentation depth to start at
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// Forward the caller's options; previously $node itself was passed as
	// $show_attr and both options were ignored.
	$node->dump($show_attr, $deep);
}
252
/**
 * simple html dom node
 * PaperG - added ability for "find" routine to lowercase the value of the
 * selector.
 *
 * PaperG - added $tag_start to track the start position of the tag in the total
 * byte index
 *
 * @package PlaceLocalInclude
 */
class simple_html_dom_node
{
	/**
	 * Node type
	 *
	 * Default is {@see HDOM_TYPE_TEXT}
	 *
	 * @var int
	 */
	public $nodetype = HDOM_TYPE_TEXT;

	/**
	 * Tag name
	 *
	 * Default is 'text'
	 *
	 * @var string
	 */
	public $tag = 'text';

	/**
	 * List of attributes
	 *
	 * @var array
	 */
	public $attr = array();

	/**
	 * List of child node objects
	 *
	 * @var array
	 */
	public $children = array();

	/**
	 * List of node objects rendered as this node's content.
	 *
	 * NOTE(review): appears to also hold nodes that are not in $children
	 * (e.g. text nodes) - confirm against the parser.
	 *
	 * @var array
	 */
	public $nodes = array();

	/**
	 * The parent node object
	 *
	 * @var object|null
	 */
	public $parent = null;

	/**
	 * The "info" array, keyed by the HDOM_INFO_* constants.
	 *
	 * @var array
	 */
	public $_ = array();

	/**
	 * Start position of the tag in the document
	 *
	 * @var int
	 */
	public $tag_start = 0;

	/**
	 * The DOM object
	 *
	 * @var object|null
	 */
	private $dom = null;
321
322	/**
323	 * Construct new node object
324	 *
325	 * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes}
326	 */
327	function __construct($dom)
328	{
329		$this->dom = $dom;
330		$dom->nodes[] = $this;
331	}
332
333	function __destruct()
334	{
335		$this->clear();
336	}
337
338	function __toString()
339	{
340		return $this->outertext();
341	}
342
343	// clean up memory due to php5 circular references memory leak...
344	function clear()
345	{
346		$this->dom = null;
347		$this->nodes = null;
348		$this->parent = null;
349		$this->children = null;
350	}
351
352	// dump node's tree
353	function dump($show_attr = true, $deep = 0)
354	{
355		$lead = str_repeat('	', $deep);
356
357		echo $lead . $this->tag;
358
359		if ($show_attr && count($this->attr) > 0) {
360			echo '(';
361			foreach ($this->attr as $k => $v) {
362				echo "[$k]=>\"" . $this->$k . '", ';
363			}
364			echo ')';
365		}
366
367		echo "\n";
368
369		if ($this->nodes) {
370			foreach ($this->nodes as $c) {
371				$c->dump($show_attr, $deep + 1);
372			}
373		}
374	}
375
376
377	// Debugging function to dump a single dom node with a bunch of information about it.
378	function dump_node($echo = true)
379	{
380		$string = $this->tag;
381
382		if (count($this->attr) > 0) {
383			$string .= '(';
384			foreach ($this->attr as $k => $v) {
385				$string .= "[$k]=>\"" . $this->$k . '", ';
386			}
387			$string .= ')';
388		}
389
390		if (count($this->_) > 0) {
391			$string .= ' $_ (';
392			foreach ($this->_ as $k => $v) {
393				if (is_array($v)) {
394					$string .= "[$k]=>(";
395					foreach ($v as $k2 => $v2) {
396						$string .= "[$k2]=>\"" . $v2 . '", ';
397					}
398					$string .= ')';
399				} else {
400					$string .= "[$k]=>\"" . $v . '", ';
401				}
402			}
403			$string .= ')';
404		}
405
406		if (isset($this->text)) {
407			$string .= ' text: (' . $this->text . ')';
408		}
409
410		$string .= " HDOM_INNER_INFO: '";
411
412		if (isset($node->_[HDOM_INFO_INNER])) {
413			$string .= $node->_[HDOM_INFO_INNER] . "'";
414		} else {
415			$string .= ' NULL ';
416		}
417
418		$string .= ' children: ' . count($this->children);
419		$string .= ' nodes: ' . count($this->nodes);
420		$string .= ' tag_start: ' . $this->tag_start;
421		$string .= "\n";
422
423		if ($echo) {
424			echo $string;
425			return;
426		} else {
427			return $string;
428		}
429	}
430
431	/**
432	 * Return or set parent node
433	 *
434	 * @param object|null $parent (optional) The parent node, `null` to return
435	 * the current parent node.
436	 * @return object|null The parent node
437	 */
438	function parent($parent = null)
439	{
440		// I am SURE that this doesn't work properly.
441		// It fails to unset the current node from it's current parents nodes or
442		// children list first.
443		if ($parent !== null) {
444			$this->parent = $parent;
445			$this->parent->nodes[] = $this;
446			$this->parent->children[] = $this;
447		}
448
449		return $this->parent;
450	}
451
452	/**
453	 * @return bool True if the node has at least one child node
454	 */
455	function has_child()
456	{
457		return !empty($this->children);
458	}
459
460	/**
461	 * Get child node at specified index
462	 *
463	 * @param int $idx The index of the child node to return, `-1` to return all
464	 * child nodes.
465	 * @return object|array|null The child node at the specified index, all child
466	 * nodes or null if the index is invalid.
467	 */
468	function children($idx = -1)
469	{
470		if ($idx === -1) {
471			return $this->children;
472		}
473
474		if (isset($this->children[$idx])) {
475			return $this->children[$idx];
476		}
477
478		return null;
479	}
480
481	/**
482	 * Get first child node
483	 *
484	 * @return object|null The first child node or null if the current node has
485	 * no child nodes.
486	 *
487	 * @todo Use `empty()` instead of `count()` to improve performance on large
488	 * arrays.
489	 */
490	function first_child()
491	{
492		if (count($this->children) > 0) {
493			return $this->children[0];
494		}
495		return null;
496	}
497
498	/**
499	 * Get last child node
500	 *
501	 * @return object|null The last child node or null if the current node has
502	 * no child nodes.
503	 *
504	 * @todo Use `end()` to slightly improve performance on large arrays.
505	 */
506	function last_child()
507	{
508		if (($count = count($this->children)) > 0) {
509			return $this->children[$count - 1];
510		}
511		return null;
512	}
513
514	/**
515	 * Get next sibling node
516	 *
517	 * @return object|null The sibling node or null if the current node has no
518	 * sibling nodes.
519	 */
520	function next_sibling()
521	{
522		if ($this->parent === null) {
523			return null;
524		}
525
526		$idx = 0;
527		$count = count($this->parent->children);
528
529		while ($idx < $count && $this !== $this->parent->children[$idx]) {
530			++$idx;
531		}
532
533		if (++$idx >= $count) {
534			return null;
535		}
536
537		return $this->parent->children[$idx];
538	}
539
540	/**
541	 * Get previous sibling node
542	 *
543	 * @return object|null The sibling node or null if the current node has no
544	 * sibling nodes.
545	 */
546	function prev_sibling()
547	{
548		if ($this->parent === null) { return null; }
549
550		$idx = 0;
551		$count = count($this->parent->children);
552
553		while ($idx < $count && $this !== $this->parent->children[$idx]) {
554			++$idx;
555		}
556
557		if (--$idx < 0) { return null; }
558
559		return $this->parent->children[$idx];
560	}
561
562	/**
563	 * Traverse ancestors to the first matching tag.
564	 *
565	 * @param string $tag Tag to find
566	 * @return object|null First matching node in the DOM tree or null if no
567	 * match was found.
568	 *
569	 * @todo Null is returned implicitly by calling ->parent on the root node.
570	 * This behaviour could change at any time, rendering this function invalid.
571	 */
572	function find_ancestor_tag($tag)
573	{
574		global $debug_object;
575		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
576
577		// Start by including ourselves in the comparison.
578		$returnDom = $this;
579
580		while (!is_null($returnDom)) {
581			if (is_object($debug_object)) {
582				$debug_object->debug_log(2, 'Current tag is: ' . $returnDom->tag);
583			}
584
585			if ($returnDom->tag == $tag) {
586				break;
587			}
588
589			$returnDom = $returnDom->parent;
590		}
591
592		return $returnDom;
593	}
594
595	/**
596	 * Get node's inner text (everything inside the opening and closing tags)
597	 *
598	 * @return string
599	 */
600	function innertext()
601	{
602		if (isset($this->_[HDOM_INFO_INNER])) {
603			return $this->_[HDOM_INFO_INNER];
604		}
605
606		if (isset($this->_[HDOM_INFO_TEXT])) {
607			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
608		}
609
610		$ret = '';
611
612		foreach ($this->nodes as $n) {
613			$ret .= $n->outertext();
614		}
615
616		return $ret;
617	}
618
619	/**
620	 * Get node's outer text (everything including the opening and closing tags)
621	 *
622	 * @return string
623	 */
624	function outertext()
625	{
626		global $debug_object;
627
628		if (is_object($debug_object)) {
629			$text = '';
630
631			if ($this->tag === 'text') {
632				if (!empty($this->text)) {
633					$text = ' with text: ' . $this->text;
634				}
635			}
636
637			$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
638		}
639
640		if ($this->tag === 'root') return $this->innertext();
641
642		// trigger callback
643		if ($this->dom && $this->dom->callback !== null) {
644			call_user_func_array($this->dom->callback, array($this));
645		}
646
647		if (isset($this->_[HDOM_INFO_OUTER])) {
648			return $this->_[HDOM_INFO_OUTER];
649		}
650
651		if (isset($this->_[HDOM_INFO_TEXT])) {
652			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
653		}
654
655		// render begin tag
656		if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
657			$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
658		} else {
659			$ret = '';
660		}
661
662		// render inner text
663		if (isset($this->_[HDOM_INFO_INNER])) {
664			// If it's a br tag...  don't return the HDOM_INNER_INFO that we
665			// may or may not have added.
666			if ($this->tag !== 'br') {
667				$ret .= $this->_[HDOM_INFO_INNER];
668			}
669		} else {
670			if ($this->nodes) {
671				foreach ($this->nodes as $n) {
672					$ret .= $this->convert_text($n->outertext());
673				}
674			}
675		}
676
677		// render end tag
678		if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
679			$ret .= '</' . $this->tag . '>';
680		}
681
682		return $ret;
683	}
684
685	/**
686	 * Get node's plain text (everything excluding all tags)
687	 *
688	 * @return string
689	 */
690	function text()
691	{
692		if (isset($this->_[HDOM_INFO_INNER])) {
693			return $this->_[HDOM_INFO_INNER];
694		}
695
696		switch ($this->nodetype) {
697			case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
698			case HDOM_TYPE_COMMENT: return '';
699			case HDOM_TYPE_UNKNOWN: return '';
700		}
701
702		if (strcasecmp($this->tag, 'script') === 0) { return ''; }
703		if (strcasecmp($this->tag, 'style') === 0) { return ''; }
704
705		$ret = '';
706
707		// In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
708		// for some span tags, and some p tags) $this->nodes is set to NULL.
709		// NOTE: This indicates that there is a problem where it's set to NULL
710		// without a clear happening.
711		// WHY is this happening?
712		if (!is_null($this->nodes)) {
713			foreach ($this->nodes as $n) {
714				// Start paragraph after a blank line
715				if ($n->tag === 'p') {
716					$ret .= "\n\n";
717				}
718
719				$ret .= $this->convert_text($n->text());
720
721				// If this node is a span... add a space at the end of it so
722				// multiple spans don't run into each other.  This is plaintext
723				// after all.
724				if ($n->tag === 'span') {
725					$ret .= $this->dom->default_span_text;
726				}
727			}
728		}
729		return trim($ret);
730	}
731
732	/**
733	 * Get node's xml text (inner text as a CDATA section)
734	 *
735	 * @return string
736	 */
737	function xmltext()
738	{
739		$ret = $this->innertext();
740		$ret = str_ireplace('<![CDATA[', '', $ret);
741		$ret = str_replace(']]>', '', $ret);
742		return $ret;
743	}
744
745	// build node's text with tag
746	function makeup()
747	{
748		// text, comment, unknown
749		if (isset($this->_[HDOM_INFO_TEXT])) {
750			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
751		}
752
753		$ret = '<' . $this->tag;
754		$i = -1;
755
756		foreach ($this->attr as $key => $val) {
757			++$i;
758
759			// skip removed attribute
760			if ($val === null || $val === false) { continue; }
761
762			$ret .= $this->_[HDOM_INFO_SPACE][$i][0];
763
764			//no value attr: nowrap, checked selected...
765			if ($val === true) {
766				$ret .= $key;
767			} else {
768				switch ($this->_[HDOM_INFO_QUOTE][$i])
769				{
770					case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
771					case HDOM_QUOTE_SINGLE: $quote = '\''; break;
772					default: $quote = '';
773				}
774
775				$ret .= $key
776				. $this->_[HDOM_INFO_SPACE][$i][1]
777				. '='
778				. $this->_[HDOM_INFO_SPACE][$i][2]
779				. $quote
780				. $val
781				. $quote;
782			}
783		}
784
785		$ret = $this->dom->restore_noise($ret);
786		return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
787	}
788
789	/**
790	 * Find elements by CSS selector
791	 *
792	 * @param string $selector The CSS selector
793	 * @param int|null $idx Index of element to return form the list of matching
794	 * elements (default: `null` = disabled).
795	 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
796	 * enabled (default: `false`)
797	 * @return array|object|null A list of elements matching the specified CSS
798	 * selector or a single element if $idx is specified or null if no element
799	 * was found.
800	 */
801	function find($selector, $idx = null, $lowercase = false)
802	{
803		$selectors = $this->parse_selector($selector);
804		if (($count = count($selectors)) === 0) { return array(); }
805		$found_keys = array();
806
807		// find each selector
808		for ($c = 0; $c < $count; ++$c) {
809			// The change on the below line was documented on the sourceforge
810			// code tracker id 2788009
811			// used to be: if (($levle=count($selectors[0]))===0) return array();
812			if (($levle = count($selectors[$c])) === 0) { return array(); }
813			if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }
814
815			$head = array($this->_[HDOM_INFO_BEGIN] => 1);
816			$cmd = ' '; // Combinator
817
818			// handle descendant selectors, no recursive!
819			for ($l = 0; $l < $levle; ++$l) {
820				$ret = array();
821
822				foreach ($head as $k => $v) {
823					$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
824					//PaperG - Pass this optional parameter on to the seek function.
825					$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
826				}
827
828				$head = $ret;
829				$cmd = $selectors[$c][$l][4]; // Next Combinator
830			}
831
832			foreach ($head as $k => $v) {
833				if (!isset($found_keys[$k])) {
834					$found_keys[$k] = 1;
835				}
836			}
837		}
838
839		// sort keys
840		ksort($found_keys);
841
842		$found = array();
843		foreach ($found_keys as $k => $v) {
844			$found[] = $this->dom->nodes[$k];
845		}
846
847		// return nth-element or array
848		if (is_null($idx)) { return $found; }
849		elseif ($idx < 0) { $idx = count($found) + $idx; }
850		return (isset($found[$idx])) ? $found[$idx] : null;
851	}
852
853	/**
854	 * Seek DOM elements by selector
855	 *
856	 * **Note**
857	 * The selector element must be compatible to a selector from
858	 * {@see simple_html_dom_node::parse_selector()}
859	 *
860	 * @param array $selector A selector element
861	 * @param array $ret An array of matches
862	 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
863	 * enabled (default: `false`)
864	 * @return void
865	 */
866	protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
867	{
868		global $debug_object;
869		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
870
871		list($tag, $id, $class, $attributes, $cmb) = $selector;
872		$nodes = array();
873
874		if ($parent_cmd === ' ') { // Descendant Combinator
875			// Find parent closing tag if the current element doesn't have a closing
876			// tag (i.e. void element)
877			$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
878			if ($end == 0) {
879				$parent = $this->parent;
880				while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
881					$end -= 1;
882					$parent = $parent->parent;
883				}
884				$end += $parent->_[HDOM_INFO_END];
885			}
886
887			// Get list of target nodes
888			$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
889			$nodes_count = $end - $nodes_start;
890			$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
891		} elseif ($parent_cmd === '>') { // Child Combinator
892			$nodes = $this->children;
893		} elseif ($parent_cmd === '+'
894			&& $this->parent
895			&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator
896				$index = array_search($this, $this->parent->children, true) + 1;
897				$nodes[] = $this->parent->children[$index];
898		} elseif ($parent_cmd === '~'
899			&& $this->parent
900			&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator
901				$index = array_search($this, $this->parent->children, true);
902				$nodes = array_slice($this->parent->children, $index);
903		}
904
905		// Go throgh each element starting at this element until the end tag
906		// Note: If this element is a void tag, any previous void element is
907		// skipped.
908		foreach($nodes as $node) {
909			$pass = true;
910
911			// Skip root nodes
912			if(!$node->parent) {
913				$pass = false;
914			}
915
916			// Skip if node isn't a child node (i.e. text nodes)
917			if($pass && !in_array($node, $node->parent->children, true)) {
918				$pass = false;
919			}
920
921			// Skip if tag doesn't match
922			if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
923				$pass = false;
924			}
925
926			// Skip if ID doesn't exist
927			if ($pass && $id !== '' && !isset($node->attr['id'])) {
928				$pass = false;
929			}
930
931			// Check if ID matches
932			if ($pass && $id !== '' && isset($node->attr['id'])) {
933				// Note: Only consider the first ID (as browsers do)
934				$node_id = explode(' ', trim($node->attr['id']))[0];
935
936				if($id !== $node_id) { $pass = false; }
937			}
938
939			// Check if all class(es) exist
940			if ($pass && $class !== '' && is_array($class) && !empty($class)) {
941				if (isset($node->attr['class'])) {
942					$node_classes = explode(' ', $node->attr['class']);
943
944					if ($lowercase) {
945						$node_classes = array_map('strtolower', $node_classes);
946					}
947
948					foreach($class as $c) {
949						if(!in_array($c, $node_classes)) {
950							$pass = false;
951							break;
952						}
953					}
954				} else {
955					$pass = false;
956				}
957			}
958
959			// Check attributes
960			if ($pass
961				&& $attributes !== ''
962				&& is_array($attributes)
963				&& !empty($attributes)) {
964					foreach($attributes as $a) {
965						list (
966							$att_name,
967							$att_expr,
968							$att_val,
969							$att_inv,
970							$att_case_sensitivity
971						) = $a;
972
973						// Handle indexing attributes (i.e. "[2]")
974						/**
975						 * Note: This is not supported by the CSS Standard but adds
976						 * the ability to select items compatible to XPath (i.e.
977						 * the 3rd element within it's parent).
978						 *
979						 * Note: This doesn't conflict with the CSS Standard which
980						 * doesn't work on numeric attributes anyway.
981						 */
982						if (is_numeric($att_name)
983							&& $att_expr === ''
984							&& $att_val === '') {
985								$count = 0;
986
987								// Find index of current element in parent
988								foreach ($node->parent->children as $c) {
989									if ($c->tag === $node->tag) ++$count;
990									if ($c === $node) break;
991								}
992
993								// If this is the correct node, continue with next
994								// attribute
995								if ($count === (int)$att_name) continue;
996						}
997
998						// Check attribute availability
999						if ($att_inv) { // Attribute should NOT be set
1000							if (isset($node->attr[$att_name])) {
1001								$pass = false;
1002								break;
1003							}
1004						} else { // Attribute should be set
1005							// todo: "plaintext" is not a valid CSS selector!
1006							if ($att_name !== 'plaintext'
1007								&& !isset($node->attr[$att_name])) {
1008									$pass = false;
1009									break;
1010							}
1011						}
1012
1013						// Continue with next attribute if expression isn't defined
1014						if ($att_expr === '') continue;
1015
1016						// If they have told us that this is a "plaintext"
1017						// search then we want the plaintext of the node - right?
1018						// todo "plaintext" is not a valid CSS selector!
1019						if ($att_name === 'plaintext') {
1020							$nodeKeyValue = $node->text();
1021						} else {
1022							$nodeKeyValue = $node->attr[$att_name];
1023						}
1024
1025						if (is_object($debug_object)) {
1026							$debug_object->debug_log(2,
1027								'testing node: '
1028								. $node->tag
1029								. ' for attribute: '
1030								. $att_name
1031								. $att_expr
1032								. $att_val
1033								. ' where nodes value is: '
1034								. $nodeKeyValue
1035							);
1036						}
1037
1038						// If lowercase is set, do a case insensitive test of
1039						// the value of the selector.
1040						if ($lowercase) {
1041							$check = $this->match(
1042								$att_expr,
1043								strtolower($att_val),
1044								strtolower($nodeKeyValue),
1045								$att_case_sensitivity
1046							);
1047						} else {
1048							$check = $this->match(
1049								$att_expr,
1050								$att_val,
1051								$nodeKeyValue,
1052								$att_case_sensitivity
1053							);
1054						}
1055
1056						if (is_object($debug_object)) {
1057							$debug_object->debug_log(2,
1058								'after match: '
1059								. ($check ? 'true' : 'false')
1060							);
1061						}
1062
1063						if (!$check) {
1064							$pass = false;
1065							break;
1066						}
1067					}
1068			}
1069
1070			// Found a match. Add to list and clear node
1071			if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
1072			unset($node);
1073		}
1074		// It's passed by reference so this is actually what this function returns.
1075		if (is_object($debug_object)) {
1076			$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
1077		}
1078	}
1079
1080	/**
1081	 * Match value and pattern for a given CSS expression
1082	 *
1083	 * **Supported Expressions**
1084	 *
1085	 * | Expression | Description
1086	 * | ---------- | -----------
1087	 * | `=`        | $value and $pattern must be equal
1088	 * | `!=`       | $value and $pattern must not be equal
1089	 * | `^=`       | $value must start with $pattern
1090	 * | `$=`       | $value must end with $pattern
1091	 * | `*=`       | $value must contain $pattern
1092	 *
1093	 * @param string $exp The expression.
1094	 * @param string $pattern The pattern
1095	 * @param string $value The value
1096	 * @value bool True if $value matches $pattern
1097	 */
1098	protected function match($exp, $pattern, $value, $case_sensitivity)
1099	{
1100		global $debug_object;
1101		if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
1102
1103		if ($case_sensitivity === 'i') {
1104			$pattern = strtolower($pattern);
1105			$value = strtolower($value);
1106		}
1107
1108		switch ($exp) {
1109			case '=':
1110				return ($value === $pattern);
1111			case '!=':
1112				return ($value !== $pattern);
1113			case '^=':
1114				return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
1115			case '$=':
1116				return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
1117			case '*=':
1118				return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
1119			case '|=':
1120				/**
1121				 * [att|=val]
1122				 *
1123				 * Represents an element with the att attribute, its value
1124				 * either being exactly "val" or beginning with "val"
1125				 * immediately followed by "-" (U+002D).
1126				 */
1127				return strpos($value, $pattern) === 0;
1128			case '~=':
1129				/**
1130				 * [att~=val]
1131				 *
1132				 * Represents an element with the att attribute whose value is a
1133				 * whitespace-separated list of words, one of which is exactly
1134				 * "val". If "val" contains whitespace, it will never represent
1135				 * anything (since the words are separated by spaces). Also if
1136				 * "val" is the empty string, it will never represent anything.
1137				 */
1138				return in_array($pattern, explode(' ', trim($value)), true);
1139		}
1140		return false;
1141	}
1142
1143	/**
1144	 * Parse CSS selector
1145	 *
1146	 * @param string $selector_string CSS selector string
1147	 * @return array List of CSS selectors. The format depends on the type of
1148	 * selector:
1149	 *
1150	 * ```php
1151	 *
1152	 * array( // list of selectors (each separated by a comma), i.e. 'img, p, div'
1153	 *   array( // list of combinator selectors, i.e. 'img > p > div'
1154	 *     array( // selector element
1155	 *       [0], // (string) The element tag
1156	 *       [1], // (string) The element id
1157	 *       [2], // (array<string>) The element classes
1158	 *       [3], // (array<array<string>>) The list of attributes, each
1159	 *            // with four elements: name, expression, value, inverted
1160	 *       [4]  // (string) The selector combinator (' ' | '>' | '+' | '~')
1161	 *     )
1162	 *   )
1163	 * )
1164	 * ```
1165	 *
1166	 * @link https://www.w3.org/TR/selectors/#compound Compound selector
1167	 */
1168	protected function parse_selector($selector_string)
1169	{
1170		global $debug_object;
1171		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1172
1173		/**
1174		 * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
1175		 *
1176		 * Paperg: Add the colon to the attribute, so that it properly finds
1177		 * <tag attr:ibute="something" > like google does.
1178		 *
1179		 * Note: if you try to look at this attribute, you MUST use getAttribute
1180		 * since $dom->x:y will fail the php syntax check.
1181		 *
1182		 * Notice the \[ starting the attribute? and the @? following? This
1183		 * implies that an attribute can begin with an @ sign that is not
1184		 * captured. This implies that an html attribute specifier may start
1185		 * with an @ sign that is NOT captured by the expression. Farther study
1186		 * is required to determine of this should be documented or removed.
1187		 *
1188		 * Matches selectors in this order:
1189		 *
1190		 * [0] - full match
1191		 *
1192		 * [1] - tag name
1193		 *     ([\w:\*-]*)
1194		 *     Matches the tag name consisting of zero or more words, colons,
1195		 *     asterisks and hyphens.
1196		 *
1197		 * [2] - id name
1198		 *     (?:\#([\w-]+))
1199		 *     Optionally matches a id name, consisting of an "#" followed by
1200		 *     the id name (one or more words and hyphens).
1201		 *
1202		 * [3] - class names (including dots)
1203		 *     (?:\.([\w\.-]+))?
1204		 *     Optionally matches a list of classs, consisting of an "."
1205		 *     followed by the class name (one or more words and hyphens)
1206		 *     where multiple classes can be chained (i.e. ".foo.bar.baz")
1207		 *
1208		 * [4] - attributes
1209		 *     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
1210		 *     Optionally matches the attributes list
1211		 *
1212		 * [5] - separator
1213		 *     ([\/, >+~]+)
1214		 *     Matches the selector list separator
1215		 */
1216		// phpcs:ignore Generic.Files.LineLength
1217		$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";
1218
1219		preg_match_all(
1220			$pattern,
1221			trim($selector_string) . ' ', // Add final ' ' as pseudo separator
1222			$matches,
1223			PREG_SET_ORDER
1224		);
1225
1226		if (is_object($debug_object)) {
1227			$debug_object->debug_log(2, 'Matches Array: ', $matches);
1228		}
1229
1230		$selectors = array();
1231		$result = array();
1232
1233		foreach ($matches as $m) {
1234			$m[0] = trim($m[0]);
1235
1236			// Skip NoOps
1237			if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }
1238
1239			// Convert to lowercase
1240			if ($this->dom->lowercase) {
1241				$m[1] = strtolower($m[1]);
1242			}
1243
1244			// Extract classes
1245			if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }
1246
1247			/* Extract attributes (pattern based on the pattern above!)
1248
1249			 * [0] - full match
1250			 * [1] - attribute name
1251			 * [2] - attribute expression
1252			 * [3] - attribute value
1253			 * [4] - case sensitivity
1254			 *
1255			 * Note: Attributes can be negated with a "!" prefix to their name
1256			 */
1257			if($m[4] !== '') {
1258				preg_match_all(
1259					"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is",
1260					trim($m[4]),
1261					$attributes,
1262					PREG_SET_ORDER
1263				);
1264
1265				// Replace element by array
1266				$m[4] = array();
1267
1268				foreach($attributes as $att) {
1269					// Skip empty matches
1270					if(trim($att[0]) === '') { continue; }
1271
1272					$inverted = (isset($att[1][0]) && $att[1][0] === '!');
1273					$m[4][] = array(
1274						$inverted ? substr($att[1], 1) : $att[1], // Name
1275						(isset($att[2])) ? $att[2] : '', // Expression
1276						(isset($att[3])) ? $att[3] : '', // Value
1277						$inverted, // Inverted Flag
1278						(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
1279					);
1280				}
1281			}
1282
1283			// Sanitize Separator
1284			if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
1285				$m[5] = ' ';
1286			} else { // Other Separator
1287				$m[5] = trim($m[5]);
1288			}
1289
1290			// Clear Separator if it's a Selector List
1291			if ($is_list = ($m[5] === ',')) { $m[5] = ''; }
1292
1293			// Remove full match before adding to results
1294			array_shift($m);
1295			$result[] = $m;
1296
1297			if ($is_list) { // Selector List
1298				$selectors[] = $result;
1299				$result = array();
1300			}
1301		}
1302
1303		if (count($result) > 0) { $selectors[] = $result; }
1304		return $selectors;
1305	}
1306
1307	function __get($name)
1308	{
1309		if (isset($this->attr[$name])) {
1310			return $this->convert_text($this->attr[$name]);
1311		}
1312		switch ($name) {
1313			case 'outertext': return $this->outertext();
1314			case 'innertext': return $this->innertext();
1315			case 'plaintext': return $this->text();
1316			case 'xmltext': return $this->xmltext();
1317			default: return array_key_exists($name, $this->attr);
1318		}
1319	}
1320
1321	function __set($name, $value)
1322	{
1323		global $debug_object;
1324		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1325
1326		switch ($name) {
1327			case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
1328			case 'innertext':
1329				if (isset($this->_[HDOM_INFO_TEXT])) {
1330					return $this->_[HDOM_INFO_TEXT] = $value;
1331				}
1332				return $this->_[HDOM_INFO_INNER] = $value;
1333		}
1334
1335		if (!isset($this->attr[$name])) {
1336			$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
1337			$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
1338		}
1339
1340		$this->attr[$name] = $value;
1341	}
1342
1343	function __isset($name)
1344	{
1345		switch ($name) {
1346			case 'outertext': return true;
1347			case 'innertext': return true;
1348			case 'plaintext': return true;
1349		}
1350		//no value attr: nowrap, checked selected...
1351		return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
1352	}
1353
1354	function __unset($name)
1355	{
1356		if (isset($this->attr[$name])) { unset($this->attr[$name]); }
1357	}
1358
1359	// PaperG - Function to convert the text from one character set to another
1360	// if the two sets are not the same.
1361	function convert_text($text)
1362	{
1363		global $debug_object;
1364		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1365
1366		$converted_text = $text;
1367
1368		$sourceCharset = '';
1369		$targetCharset = '';
1370
1371		if ($this->dom) {
1372			$sourceCharset = strtoupper($this->dom->_charset);
1373			$targetCharset = strtoupper($this->dom->_target_charset);
1374		}
1375
1376		if (is_object($debug_object)) {
1377			$debug_object->debug_log(3,
1378				'source charset: '
1379				. $sourceCharset
1380				. ' target charaset: '
1381				. $targetCharset
1382			);
1383		}
1384
1385		if (!empty($sourceCharset)
1386			&& !empty($targetCharset)
1387			&& (strcasecmp($sourceCharset, $targetCharset) != 0)) {
1388			// Check if the reported encoding could have been incorrect and the text is actually already UTF-8
1389			if ((strcasecmp($targetCharset, 'UTF-8') == 0)
1390				&& ($this->is_utf8($text))) {
1391				$converted_text = $text;
1392			} else {
1393				$converted_text = iconv($sourceCharset, $targetCharset, $text);
1394			}
1395		}
1396
1397		// Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
1398		if ($targetCharset === 'UTF-8') {
1399			if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
1400				$converted_text = substr($converted_text, 3);
1401			}
1402
1403			if (substr($converted_text, -3) === "\xef\xbb\xbf") {
1404				$converted_text = substr($converted_text, 0, -3);
1405			}
1406		}
1407
1408		return $converted_text;
1409	}
1410
1411	/**
1412	* Returns true if $string is valid UTF-8 and false otherwise.
1413	*
1414	* @param mixed $str String to be tested
1415	* @return boolean
1416	*/
1417	static function is_utf8($str)
1418	{
1419		$c = 0; $b = 0;
1420		$bits = 0;
1421		$len = strlen($str);
1422		for($i = 0; $i < $len; $i++) {
1423			$c = ord($str[$i]);
1424			if($c > 128) {
1425				if(($c >= 254)) { return false; }
1426				elseif($c >= 252) { $bits = 6; }
1427				elseif($c >= 248) { $bits = 5; }
1428				elseif($c >= 240) { $bits = 4; }
1429				elseif($c >= 224) { $bits = 3; }
1430				elseif($c >= 192) { $bits = 2; }
1431				else { return false; }
1432				if(($i + $bits) > $len) { return false; }
1433				while($bits > 1) {
1434					$i++;
1435					$b = ord($str[$i]);
1436					if($b < 128 || $b > 191) { return false; }
1437					$bits--;
1438				}
1439			}
1440		}
1441		return true;
1442	}
1443
1444	/**
1445	 * Function to try a few tricks to determine the displayed size of an img on
1446	 * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
1447	 * other tag types.
1448	 *
1449	 * @author John Schlick
1450	 * @version April 19 2012
1451	 * @return array an array containing the 'height' and 'width' of the image
1452	 * on the page or -1 if we can't figure it out.
1453	 */
1454	function get_display_size()
1455	{
1456		global $debug_object;
1457
1458		$width = -1;
1459		$height = -1;
1460
1461		if ($this->tag !== 'img') {
1462			return false;
1463		}
1464
1465		// See if there is aheight or width attribute in the tag itself.
1466		if (isset($this->attr['width'])) {
1467			$width = $this->attr['width'];
1468		}
1469
1470		if (isset($this->attr['height'])) {
1471			$height = $this->attr['height'];
1472		}
1473
1474		// Now look for an inline style.
1475		if (isset($this->attr['style'])) {
1476			// Thanks to user gnarf from stackoverflow for this regular expression.
1477			$attributes = array();
1478
1479			preg_match_all(
1480				'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
1481				$this->attr['style'],
1482				$matches,
1483				PREG_SET_ORDER
1484			);
1485
1486			foreach ($matches as $match) {
1487				$attributes[$match[1]] = $match[2];
1488			}
1489
1490			// If there is a width in the style attributes:
1491			if (isset($attributes['width']) && $width == -1) {
1492				// check that the last two characters are px (pixels)
1493				if (strtolower(substr($attributes['width'], -2)) === 'px') {
1494					$proposed_width = substr($attributes['width'], 0, -2);
1495					// Now make sure that it's an integer and not something stupid.
1496					if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {
1497						$width = $proposed_width;
1498					}
1499				}
1500			}
1501
1502			// If there is a width in the style attributes:
1503			if (isset($attributes['height']) && $height == -1) {
1504				// check that the last two characters are px (pixels)
1505				if (strtolower(substr($attributes['height'], -2)) == 'px') {
1506					$proposed_height = substr($attributes['height'], 0, -2);
1507					// Now make sure that it's an integer and not something stupid.
1508					if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {
1509						$height = $proposed_height;
1510					}
1511				}
1512			}
1513
1514		}
1515
1516		// Future enhancement:
1517		// Look in the tag to see if there is a class or id specified that has
1518		// a height or width attribute to it.
1519
1520		// Far future enhancement
1521		// Look at all the parent tags of this image to see if they specify a
1522		// class or id that has an img selector that specifies a height or width
1523		// Note that in this case, the class or id will have the img subselector
1524		// for it to apply to the image.
1525
1526		// ridiculously far future development
1527		// If the class or id is specified in a SEPARATE css file thats not on
1528		// the page, go get it and do what we were just doing for the ones on
1529		// the page.
1530
1531		$result = array(
1532			'height' => $height,
1533			'width' => $width
1534		);
1535
1536		return $result;
1537	}
1538
1539	// camel naming conventions
1540	function getAllAttributes()
1541	{
1542		return $this->attr;
1543	}
1544
1545	function getAttribute($name)
1546	{
1547		return $this->__get($name);
1548	}
1549
1550	function setAttribute($name, $value)
1551	{
1552		$this->__set($name, $value);
1553	}
1554
1555	function hasAttribute($name)
1556	{
1557		return $this->__isset($name);
1558	}
1559
1560	function removeAttribute($name)
1561	{
1562		$this->__set($name, null);
1563	}
1564
1565	function getElementById($id)
1566	{
1567		return $this->find("#$id", 0);
1568	}
1569
1570	function getElementsById($id, $idx = null)
1571	{
1572		return $this->find("#$id", $idx);
1573	}
1574
1575	function getElementByTagName($name)
1576	{
1577		return $this->find($name, 0);
1578	}
1579
1580	function getElementsByTagName($name, $idx = null)
1581	{
1582		return $this->find($name, $idx);
1583	}
1584
1585	function parentNode()
1586	{
1587		return $this->parent();
1588	}
1589
1590	function childNodes($idx = -1)
1591	{
1592		return $this->children($idx);
1593	}
1594
1595	function firstChild()
1596	{
1597		return $this->first_child();
1598	}
1599
1600	function lastChild()
1601	{
1602		return $this->last_child();
1603	}
1604
1605	function nextSibling()
1606	{
1607		return $this->next_sibling();
1608	}
1609
1610	function previousSibling()
1611	{
1612		return $this->prev_sibling();
1613	}
1614
1615	function hasChildNodes()
1616	{
1617		return $this->has_child();
1618	}
1619
1620	function nodeName()
1621	{
1622		return $this->tag;
1623	}
1624
1625	function appendChild($node)
1626	{
1627		$node->parent($this);
1628		return $node;
1629	}
1630
1631}
1632
1633/**
1634 * simple html dom parser
1635 *
1636 * Paperg - in the find routine: allow us to specify that we want case
1637 * insensitive testing of the value of the selector.
1638 *
1639 * Paperg - change $size from protected to public so we can easily access it
1640 *
1641 * Paperg - added ForceTagsClosed in the constructor which tells us whether we
1642 * trust the html or not.  Default is to NOT trust it.
1643 *
1644 * @package PlaceLocalInclude
1645 */
1646class simple_html_dom
1647{
1648	/**
1649	 * The root node of the document
1650	 *
1651	 * @var object
1652	 */
1653	public $root = null;
1654
1655	/**
1656	 * List of nodes in the current DOM
1657	 *
1658	 * @var array
1659	 */
1660	public $nodes = array();
1661
1662	/**
1663	 * Callback function to run for each element in the DOM.
1664	 *
1665	 * @var callable|null
1666	 */
1667	public $callback = null;
1668
1669	/**
1670	 * Indicates how tags and attributes are matched
1671	 *
1672	 * @var bool When set to **true** tags and attributes will be converted to
1673	 * lowercase before matching.
1674	 */
1675	public $lowercase = false;
1676
1677	/**
1678	 * Original document size
1679	 *
1680	 * Holds the original document size.
1681	 *
1682	 * @var int
1683	 */
1684	public $original_size;
1685
1686	/**
1687	 * Current document size
1688	 *
1689	 * Holds the current document size. The document size is determined by the
1690	 * string length of ({@see simple_html_dom::$doc}).
1691	 *
1692	 * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1693	 *
1694	 * @var int
1695	 * */
1696	public $size;
1697
1698	/**
1699	 * Current position in the document
1700	 *
1701	 * @var int
1702	 */
1703	protected $pos;
1704
1705	/**
1706	 * The document
1707	 *
1708	 * @var string
1709	 */
1710	protected $doc;
1711
1712	/**
1713	 * Current character
1714	 *
1715	 * Holds the current character at position {@see simple_html_dom::$pos} in
1716	 * the document {@see simple_html_dom::$doc}
1717	 *
1718	 * _Note_: Using this variable is more efficient than calling
1719	 * `substr($doc, $pos, 1)`
1720	 *
1721	 * @var string
1722	 */
1723	protected $char;
1724
1725	protected $cursor;
1726
1727	/**
1728	 * Parent node of the next node detected by the parser
1729	 *
1730	 * @var object
1731	 */
1732	protected $parent;
1733	protected $noise = array();
1734
1735	/**
1736	 * Tokens considered blank in HTML
1737	 *
1738	 * @var string
1739	 */
1740	protected $token_blank = " \t\r\n";
1741
1742	/**
1743	 * Tokens to identify the equal sign for attributes, stopping either at the
1744	 * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1745	 * "<html>")
1746	 *
1747	 * @var string
1748	 */
1749	protected $token_equal = ' =/>';
1750
1751	/**
1752	 * Tokens to identify the end of a tag name. A tag name either ends on the
1753	 * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1754	 *
1755	 * @var string
1756	 */
1757	protected $token_slash = " />\r\n\t";
1758
1759	/**
1760	 * Tokens to identify the end of an attribute
1761	 *
1762	 * @var string
1763	 */
1764	protected $token_attr = ' >';
1765
1766	// Note that this is referenced by a child node, and so it needs to be
1767	// public for that node to see this information.
1768	public $_charset = '';
1769	public $_target_charset = '';
1770
1771	/**
1772	 * Innertext for <br> elements
1773	 *
1774	 * @var string
1775	 */
1776	protected $default_br_text = '';
1777
1778	/**
1779	 * Suffix for <span> elements
1780	 *
1781	 * @var string
1782	 */
1783	public $default_span_text = '';
1784
1785	/**
1786	 * Defines a list of self-closing tags (Void elements) according to the HTML
1787	 * Specification
1788	 *
1789	 * _Remarks_:
1790	 * - Use `isset()` instead of `in_array()` on array elements to boost
1791	 * performance about 30%
1792	 * - Sort elements by name for better readability!
1793	 *
1794	 * @link https://www.w3.org/TR/html HTML Specification
1795	 * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1796	 */
1797	protected $self_closing_tags = array(
1798		'area' => 1,
1799		'base' => 1,
1800		'br' => 1,
1801		'col' => 1,
1802		'embed' => 1,
1803		'hr' => 1,
1804		'img' => 1,
1805		'input' => 1,
1806		'link' => 1,
1807		'meta' => 1,
1808		'param' => 1,
1809		'source' => 1,
1810		'track' => 1,
1811		'wbr' => 1
1812	);
1813
1814	/**
1815	 * Defines a list of tags which - if closed - close all optional closing
1816	 * elements within if they haven't been closed yet. (So, an element where
1817	 * neither opening nor closing tag is omissible consistently closes every
1818	 * optional closing element within)
1819	 *
1820	 * _Remarks_:
1821	 * - Use `isset()` instead of `in_array()` on array elements to boost
1822	 * performance about 30%
1823	 * - Sort elements by name for better readability!
1824	 */
1825	protected $block_tags = array(
1826		'body' => 1,
1827		'div' => 1,
1828		'form' => 1,
1829		'root' => 1,
1830		'span' => 1,
1831		'table' => 1
1832	);
1833
1834	/**
1835	 * Defines elements whose end tag is omissible.
1836	 *
1837	 * * key = Name of an element whose end tag is omissible.
1838	 * * value = Names of elements whose end tag is omissible, that are closed
1839	 * by the current element.
1840	 *
1841	 * _Remarks_:
1842	 * - Use `isset()` instead of `in_array()` on array elements to boost
1843	 * performance about 30%
1844	 * - Sort elements by name for better readability!
1845	 *
1846	 * **Example**
1847	 *
	 * An `li` element's end tag may be omitted if the `li` element is immediately
1849	 * followed by another `li` element. To do that, add following element to the
1850	 * array:
1851	 *
1852	 * ```php
1853	 * 'li' => array('li'),
1854	 * ```
1855	 *
1856	 * With this, the following two examples are considered equal. Note that the
1857	 * second example is missing the closing tags on `li` elements.
1858	 *
1859	 * ```html
1860	 * <ul><li>First Item</li><li>Second Item</li></ul>
1861	 * ```
1862	 *
1863	 * <ul><li>First Item</li><li>Second Item</li></ul>
1864	 *
1865	 * ```html
1866	 * <ul><li>First Item<li>Second Item</ul>
1867	 * ```
1868	 *
1869	 * <ul><li>First Item<li>Second Item</ul>
1870	 *
1871	 * @var array A two-dimensional array where the key is the name of an
1872	 * element whose end tag is omissible and the value is an array of elements
1873	 * whose end tag is omissible, that are closed by the current element.
1874	 *
1875	 * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1876	 *
1877	 * @todo The implementation of optional closing tags doesn't work in all cases
	 * because it only considers elements that close other optional closing
1879	 * tags, not taking into account that some (non-blocking) tags should close
1880	 * these optional closing tags. For example, the end tag for "p" is omissible
1881	 * and can be closed by an "address" element, whose end tag is NOT omissible.
1882	 * Currently a "p" element without closing tag stops at the next "p" element
1883	 * or blocking tag, even if it contains other elements.
1884	 *
1885	 * @todo Known sourceforge issue #2977341
1886	 * B tags that are not closed cause us to return everything to the end of
1887	 * the document.
1888	 */
1889	protected $optional_closing_tags = array(
1890		// Not optional, see
1891		// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1892		'b' => array('b' => 1),
1893		'dd' => array('dd' => 1, 'dt' => 1),
1894		// Not optional, see
1895		// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1896		'dl' => array('dd' => 1, 'dt' => 1),
1897		'dt' => array('dd' => 1, 'dt' => 1),
1898		'li' => array('li' => 1),
1899		'optgroup' => array('optgroup' => 1, 'option' => 1),
1900		'option' => array('optgroup' => 1, 'option' => 1),
1901		'p' => array('p' => 1),
1902		'rp' => array('rp' => 1, 'rt' => 1),
1903		'rt' => array('rp' => 1, 'rt' => 1),
1904		'td' => array('td' => 1, 'th' => 1),
1905		'th' => array('td' => 1, 'th' => 1),
1906		'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1907	);
1908
1909	function __construct(
1910		$str = null,
1911		$lowercase = true,
1912		$forceTagsClosed = true,
1913		$target_charset = DEFAULT_TARGET_CHARSET,
1914		$stripRN = true,
1915		$defaultBRText = DEFAULT_BR_TEXT,
1916		$defaultSpanText = DEFAULT_SPAN_TEXT,
1917		$options = 0)
1918	{
1919		if ($str) {
1920			if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
1921				$this->load_file($str);
1922			} else {
1923				$this->load(
1924					$str,
1925					$lowercase,
1926					$stripRN,
1927					$defaultBRText,
1928					$defaultSpanText,
1929					$options
1930				);
1931			}
1932		}
1933		// Forcing tags to be closed implies that we don't trust the html, but
1934		// it can lead to parsing errors if we SHOULD trust the html.
1935		if (!$forceTagsClosed) {
1936			$this->optional_closing_array = array();
1937		}
1938
1939		$this->_target_charset = $target_charset;
1940	}
1941
1942	function __destruct()
1943	{
1944		$this->clear();
1945	}
1946
1947	// load html from string
1948	function load(
1949		$str,
1950		$lowercase = true,
1951		$stripRN = true,
1952		$defaultBRText = DEFAULT_BR_TEXT,
1953		$defaultSpanText = DEFAULT_SPAN_TEXT,
1954		$options = 0)
1955	{
1956		global $debug_object;
1957
1958		// prepare
1959		$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
1960
1961		// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1962		// Script tags removal now preceeds style tag removal.
1963		// strip out <script> tags
1964		$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
1965		$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
1966
1967		// strip out the \r \n's if we are told to.
1968		if ($stripRN) {
1969			$this->doc = str_replace("\r", ' ', $this->doc);
1970			$this->doc = str_replace("\n", ' ', $this->doc);
1971
1972			// set the length of content since we have changed it.
1973			$this->size = strlen($this->doc);
1974		}
1975
1976		// strip out cdata
1977		$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
1978		// strip out comments
1979		$this->remove_noise("'<!--(.*?)-->'is");
1980		// strip out <style> tags
1981		$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
1982		$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
1983		// strip out preformatted tags
1984		$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
1985		// strip out server side scripts
1986		$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
1987
1988		if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
1989			$this->remove_noise("'(\{\w)(.*?)(\})'s", true);
1990		}
1991
1992		// parsing
1993		$this->parse();
1994		// end
1995		$this->root->_[HDOM_INFO_END] = $this->cursor;
1996		$this->parse_charset();
1997
1998		// make load function chainable
1999		return $this;
2000	}
2001
2002	// load html from file
2003	function load_file()
2004	{
2005		$args = func_get_args();
2006
2007		if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
2008			$this->load($doc, true);
2009		} else {
2010			return false;
2011		}
2012	}
2013
2014	/**
2015	 * Set the callback function
2016	 *
2017	 * @param callable $function_name Callback function to run for each element
2018	 * in the DOM.
2019	 * @return void
2020	 */
2021	function set_callback($function_name)
2022	{
2023		$this->callback = $function_name;
2024	}
2025
2026	/**
2027	 * Remove callback function
2028	 *
2029	 * @return void
2030	 */
2031	function remove_callback()
2032	{
2033		$this->callback = null;
2034	}
2035
2036	// save dom as string
2037	function save($filepath = '')
2038	{
2039		$ret = $this->root->innertext();
2040		if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
2041		return $ret;
2042	}
2043
2044	// find dom node by css selector
2045	// Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
2046	function find($selector, $idx = null, $lowercase = false)
2047	{
2048		return $this->root->find($selector, $idx, $lowercase);
2049	}
2050
2051	// clean up memory due to php5 circular references memory leak...
2052	function clear()
2053	{
2054		foreach ($this->nodes as $n) {
2055			$n->clear(); $n = null;
2056		}
2057
2058		// This add next line is documented in the sourceforge repository.
2059		// 2977248 as a fix for ongoing memory leaks that occur even with the
2060		// use of clear.
2061		if (isset($this->children)) {
2062			foreach ($this->children as $n) {
2063				$n->clear(); $n = null;
2064			}
2065		}
2066
2067		if (isset($this->parent)) {
2068			$this->parent->clear();
2069			unset($this->parent);
2070		}
2071
2072		if (isset($this->root)) {
2073			$this->root->clear();
2074			unset($this->root);
2075		}
2076
2077		unset($this->doc);
2078		unset($this->noise);
2079	}
2080
2081	function dump($show_attr = true)
2082	{
2083		$this->root->dump($show_attr);
2084	}
2085
2086	// prepare HTML data and init everything
2087	protected function prepare(
2088		$str, $lowercase = true,
2089		$defaultBRText = DEFAULT_BR_TEXT,
2090		$defaultSpanText = DEFAULT_SPAN_TEXT)
2091	{
2092		$this->clear();
2093
2094		$this->doc = trim($str);
2095		$this->size = strlen($this->doc);
2096		$this->original_size = $this->size; // original size of the html
2097		$this->pos = 0;
2098		$this->cursor = 1;
2099		$this->noise = array();
2100		$this->nodes = array();
2101		$this->lowercase = $lowercase;
2102		$this->default_br_text = $defaultBRText;
2103		$this->default_span_text = $defaultSpanText;
2104		$this->root = new simple_html_dom_node($this);
2105		$this->root->tag = 'root';
2106		$this->root->_[HDOM_INFO_BEGIN] = -1;
2107		$this->root->nodetype = HDOM_TYPE_ROOT;
2108		$this->parent = $this->root;
2109		if ($this->size > 0) { $this->char = $this->doc[0]; }
2110	}
2111
2112	/**
2113	 * Parse HTML content
2114	 *
2115	 * @return bool True on success
2116	 */
2117	protected function parse()
2118	{
2119		while (true) {
2120			// Read next tag if there is no text between current position and the
2121			// next opening tag.
2122			if (($s = $this->copy_until_char('<')) === '') {
2123				if($this->read_tag()) {
2124					continue;
2125				} else {
2126					return true;
2127				}
2128			}
2129
2130			// Add a text node for text between tags
2131			$node = new simple_html_dom_node($this);
2132			++$this->cursor;
2133			$node->_[HDOM_INFO_TEXT] = $s;
2134			$this->link_nodes($node, false);
2135		}
2136	}
2137
2138	// PAPERG - dkchou - added this to try to identify the character set of the
2139	// page we have just parsed so we know better how to spit it out later.
2140	// NOTE:  IF you provide a routine called
2141	// get_last_retrieve_url_contents_content_type which returns the
2142	// CURLINFO_CONTENT_TYPE from the last curl_exec
2143	// (or the content_type header from the last transfer), we will parse THAT,
2144	// and if a charset is specified, we will use it over any other mechanism.
2145	protected function parse_charset()
2146	{
2147		global $debug_object;
2148
2149		$charset = null;
2150
2151		if (function_exists('get_last_retrieve_url_contents_content_type')) {
2152			$contentTypeHeader = get_last_retrieve_url_contents_content_type();
2153			$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
2154			if ($success) {
2155				$charset = $matches[1];
2156				if (is_object($debug_object)) {
2157					$debug_object->debug_log(2,
2158						'header content-type found charset of: '
2159						. $charset
2160					);
2161				}
2162			}
2163		}
2164
2165		if (empty($charset)) {
2166			$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
2167
2168			if (!empty($el)) {
2169				$fullvalue = $el->content;
2170				if (is_object($debug_object)) {
2171					$debug_object->debug_log(2,
2172						'meta content-type tag found'
2173						. $fullvalue
2174					);
2175				}
2176
2177				if (!empty($fullvalue)) {
2178					$success = preg_match(
2179						'/charset=(.+)/i',
2180						$fullvalue,
2181						$matches
2182					);
2183
2184					if ($success) {
2185						$charset = $matches[1];
2186					} else {
2187						// If there is a meta tag, and they don't specify the
2188						// character set, research says that it's typically
2189						// ISO-8859-1
2190						if (is_object($debug_object)) {
2191							$debug_object->debug_log(2,
2192								'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
2193							);
2194						}
2195
2196						$charset = 'ISO-8859-1';
2197					}
2198				}
2199			}
2200		}
2201
2202		// If we couldn't find a charset above, then lets try to detect one
2203		// based on the text we got...
2204		if (empty($charset)) {
2205			// Use this in case mb_detect_charset isn't installed/loaded on
2206			// this machine.
2207			$charset = false;
2208			if (function_exists('mb_detect_encoding')) {
2209				// Have php try to detect the encoding from the text given to us.
2210				$charset = mb_detect_encoding(
2211					$this->doc . 'ascii',
2212					$encoding_list = array( 'UTF-8', 'CP1252' )
2213				);
2214
2215				if (is_object($debug_object)) {
2216					$debug_object->debug_log(2, 'mb_detect found: ' . $charset);
2217				}
2218			}
2219
2220			// and if this doesn't work...  then we need to just wrongheadedly
2221			// assume it's UTF-8 so that we can move on - cause this will
2222			// usually give us most of what we need...
2223			if ($charset === false) {
2224				if (is_object($debug_object)) {
2225					$debug_object->debug_log(
2226						2,
2227						'since mb_detect failed - using default of utf-8'
2228					);
2229				}
2230
2231				$charset = 'UTF-8';
2232			}
2233		}
2234
2235		// Since CP1252 is a superset, if we get one of it's subsets, we want
2236		// it instead.
2237		if ((strtolower($charset) == strtolower('ISO-8859-1'))
2238			|| (strtolower($charset) == strtolower('Latin1'))
2239			|| (strtolower($charset) == strtolower('Latin-1'))) {
2240
2241			if (is_object($debug_object)) {
2242				$debug_object->debug_log(
2243					2,
2244					'replacing ' . $charset . ' with CP1252 as its a superset'
2245				);
2246			}
2247
2248			$charset = 'CP1252';
2249		}
2250
2251		if (is_object($debug_object)) {
2252			$debug_object->debug_log(1, 'EXIT - ' . $charset);
2253		}
2254
2255		return $this->_charset = $charset;
2256	}
2257
2258	/**
2259	 * Parse tag from current document position.
2260	 *
2261	 * @return bool True if a tag was found, false otherwise
2262	 */
2263	protected function read_tag()
2264	{
2265		// Set end position if no further tags found
2266		if ($this->char !== '<') {
2267			$this->root->_[HDOM_INFO_END] = $this->cursor;
2268			return false;
2269		}
2270
2271		$begin_tag_pos = $this->pos;
2272		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2273
2274		// end tag
2275		if ($this->char === '/') {
2276			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2277
2278			// Skip whitespace in end tags (i.e. in "</   html>")
2279			$this->skip($this->token_blank);
2280			$tag = $this->copy_until_char('>');
2281
2282			// Skip attributes in end tags
2283			if (($pos = strpos($tag, ' ')) !== false) {
2284				$tag = substr($tag, 0, $pos);
2285			}
2286
2287			$parent_lower = strtolower($this->parent->tag);
2288			$tag_lower = strtolower($tag);
2289
2290			// The end tag is supposed to close the parent tag. Handle situations
2291			// when it doesn't
2292			if ($parent_lower !== $tag_lower) {
2293				// Parent tag does not have to be closed necessarily (optional closing tag)
2294				// Current tag is a block tag, so it may close an ancestor
2295				if (isset($this->optional_closing_tags[$parent_lower])
2296					&& isset($this->block_tags[$tag_lower])) {
2297
2298					$this->parent->_[HDOM_INFO_END] = 0;
2299					$org_parent = $this->parent;
2300
2301					// Traverse ancestors to find a matching opening tag
2302					// Stop at root node
2303					while (($this->parent->parent)
2304						&& strtolower($this->parent->tag) !== $tag_lower
2305					){
2306						$this->parent = $this->parent->parent;
2307					}
2308
2309					// If we don't have a match add current tag as text node
2310					if (strtolower($this->parent->tag) !== $tag_lower) {
2311						$this->parent = $org_parent; // restore origonal parent
2312
2313						if ($this->parent->parent) {
2314							$this->parent = $this->parent->parent;
2315						}
2316
2317						$this->parent->_[HDOM_INFO_END] = $this->cursor;
2318						return $this->as_text_node($tag);
2319					}
2320				} elseif (($this->parent->parent)
2321					&& isset($this->block_tags[$tag_lower])
2322				) {
2323					// Grandparent exists and current tag is a block tag, so our
2324					// parent doesn't have an end tag
2325					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
2326					$org_parent = $this->parent;
2327
2328					// Traverse ancestors to find a matching opening tag
2329					// Stop at root node
2330					while (($this->parent->parent)
2331						&& strtolower($this->parent->tag) !== $tag_lower
2332					) {
2333						$this->parent = $this->parent->parent;
2334					}
2335
2336					// If we don't have a match add current tag as text node
2337					if (strtolower($this->parent->tag) !== $tag_lower) {
2338						$this->parent = $org_parent; // restore origonal parent
2339						$this->parent->_[HDOM_INFO_END] = $this->cursor;
2340						return $this->as_text_node($tag);
2341					}
2342				} elseif (($this->parent->parent)
2343					&& strtolower($this->parent->parent->tag) === $tag_lower
2344				) { // Grandparent exists and current tag closes it
2345					$this->parent->_[HDOM_INFO_END] = 0;
2346					$this->parent = $this->parent->parent;
2347				} else { // Random tag, add as text node
2348					return $this->as_text_node($tag);
2349				}
2350			}
2351
2352			// Set end position of parent tag to current cursor position
2353			$this->parent->_[HDOM_INFO_END] = $this->cursor;
2354
2355			if ($this->parent->parent) {
2356				$this->parent = $this->parent->parent;
2357			}
2358
2359			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2360			return true;
2361		}
2362
2363		// start tag
2364		$node = new simple_html_dom_node($this);
2365		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
2366		++$this->cursor;
2367		$tag = $this->copy_until($this->token_slash); // Get tag name
2368		$node->tag_start = $begin_tag_pos;
2369
2370		// doctype, cdata & comments...
2371		// <!DOCTYPE html>
2372		// <![CDATA[ ... ]]>
2373		// <!-- Comment -->
2374		if (isset($tag[0]) && $tag[0] === '!') {
2375			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
2376
2377			if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
2378				$node->nodetype = HDOM_TYPE_COMMENT;
2379				$node->tag = 'comment';
2380			} else { // Could be doctype or CDATA but we don't care
2381				$node->nodetype = HDOM_TYPE_UNKNOWN;
2382				$node->tag = 'unknown';
2383			}
2384
2385			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2386
2387			$this->link_nodes($node, true);
2388			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2389			return true;
2390		}
2391
2392		// The start tag cannot contain another start tag, if so add as text
2393		// i.e. "<<html>"
2394		if ($pos = strpos($tag, '<') !== false) {
2395			$tag = '<' . substr($tag, 0, -1);
2396			$node->_[HDOM_INFO_TEXT] = $tag;
2397			$this->link_nodes($node, false);
2398			$this->char = $this->doc[--$this->pos]; // prev
2399			return true;
2400		}
2401
2402		// Handle invalid tag names (i.e. "<html#doc>")
2403		if (!preg_match('/^\w[\w:-]*$/', $tag)) {
2404			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
2405
2406			// Next char is the beginning of a new tag, don't touch it.
2407			if ($this->char === '<') {
2408				$this->link_nodes($node, false);
2409				return true;
2410			}
2411
2412			// Next char closes current tag, add and be done with it.
2413			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2414			$this->link_nodes($node, false);
2415			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2416			return true;
2417		}
2418
2419		// begin tag, add new node
2420		$node->nodetype = HDOM_TYPE_ELEMENT;
2421		$tag_lower = strtolower($tag);
2422		$node->tag = ($this->lowercase) ? $tag_lower : $tag;
2423
2424		// handle optional closing tags
2425		if (isset($this->optional_closing_tags[$tag_lower])) {
2426			// Traverse ancestors to close all optional closing tags
2427			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
2428				$this->parent->_[HDOM_INFO_END] = 0;
2429				$this->parent = $this->parent->parent;
2430			}
2431			$node->parent = $this->parent;
2432		}
2433
2434		$guard = 0; // prevent infinity loop
2435
2436		// [0] Space between tag and first attribute
2437		$space = array($this->copy_skip($this->token_blank), '', '');
2438
2439		// attributes
2440		do {
2441			// Everything until the first equal sign should be the attribute name
2442			$name = $this->copy_until($this->token_equal);
2443
2444			if ($name === '' && $this->char !== null && $space[0] === '') {
2445				break;
2446			}
2447
2448			if ($guard === $this->pos) { // Escape infinite loop
2449				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2450				continue;
2451			}
2452
2453			$guard = $this->pos;
2454
2455			// handle endless '<'
2456			// Out of bounds before the tag ended
2457			if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2458				$node->nodetype = HDOM_TYPE_TEXT;
2459				$node->_[HDOM_INFO_END] = 0;
2460				$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
2461				$node->tag = 'text';
2462				$this->link_nodes($node, false);
2463				return true;
2464			}
2465
2466			// handle mismatch '<'
2467			// Attributes cannot start after opening tag
2468			if ($this->doc[$this->pos - 1] == '<') {
2469				$node->nodetype = HDOM_TYPE_TEXT;
2470				$node->tag = 'text';
2471				$node->attr = array();
2472				$node->_[HDOM_INFO_END] = 0;
2473				$node->_[HDOM_INFO_TEXT] = substr(
2474					$this->doc,
2475					$begin_tag_pos,
2476					$this->pos - $begin_tag_pos - 1
2477				);
2478				$this->pos -= 2;
2479				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2480				$this->link_nodes($node, false);
2481				return true;
2482			}
2483
2484			if ($name !== '/' && $name !== '') { // this is a attribute name
2485				// [1] Whitespace after attribute name
2486				$space[1] = $this->copy_skip($this->token_blank);
2487
2488				$name = $this->restore_noise($name); // might be a noisy name
2489
2490				if ($this->lowercase) { $name = strtolower($name); }
2491
2492				if ($this->char === '=') { // attribute with value
2493					$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2494					$this->parse_attr($node, $name, $space); // get attribute value
2495				} else {
2496					//no value attr: nowrap, checked selected...
2497					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2498					$node->attr[$name] = true;
2499					if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2500				}
2501
2502				$node->_[HDOM_INFO_SPACE][] = $space;
2503
2504				// prepare for next attribute
2505				$space = array(
2506					$this->copy_skip($this->token_blank),
2507					'',
2508					''
2509				);
2510			} else { // no more attributes
2511				break;
2512			}
2513		} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2514
2515		$this->link_nodes($node, true);
2516		$node->_[HDOM_INFO_ENDSPACE] = $space[0];
2517
2518		// handle empty tags (i.e. "<div/>")
2519		if ($this->copy_until_char('>') === '/') {
2520			$node->_[HDOM_INFO_ENDSPACE] .= '/';
2521			$node->_[HDOM_INFO_END] = 0;
2522		} else {
2523			// reset parent
2524			if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2525				$this->parent = $node;
2526			}
2527		}
2528
2529		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2530
2531		// If it's a BR tag, we need to set it's text to the default text.
2532		// This way when we see it in plaintext, we can generate formatting that the user wants.
2533		// since a br tag never has sub nodes, this works well.
2534		if ($node->tag === 'br') {
2535			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
2536		}
2537
2538		return true;
2539	}
2540
2541	/**
2542	 * Parse attribute from current document position
2543	 *
2544	 * @param object $node Node for the attributes
2545	 * @param string $name Name of the current attribute
2546	 * @param array $space Array for spacing information
2547	 * @return void
2548	 */
2549	protected function parse_attr($node, $name, &$space)
2550	{
2551		// Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
2552		// If the attribute is already defined inside a tag, only pay attention
2553		// to the first one as opposed to the last one.
2554		// https://stackoverflow.com/a/26341866
2555		if (isset($node->attr[$name])) {
2556			return;
2557		}
2558
2559		// [2] Whitespace between "=" and the value
2560		$space[2] = $this->copy_skip($this->token_blank);
2561
2562		switch ($this->char) {
2563			case '"': // value is anything between double quotes
2564				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
2565				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2566				$node->attr[$name] = $this->restore_noise($this->copy_until_char('"'));
2567				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2568				break;
2569			case '\'': // value is anything between single quotes
2570				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
2571				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2572				$node->attr[$name] = $this->restore_noise($this->copy_until_char('\''));
2573				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2574				break;
2575			default: // value is anything until the first space or end tag
2576				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2577				$node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
2578		}
2579		// PaperG: Attributes should not have \r or \n in them, that counts as
2580		// html whitespace.
2581		$node->attr[$name] = str_replace("\r", '', $node->attr[$name]);
2582		$node->attr[$name] = str_replace("\n", '', $node->attr[$name]);
2583		// PaperG: If this is a "class" selector, lets get rid of the preceeding
2584		// and trailing space since some people leave it in the multi class case.
2585		if ($name === 'class') {
2586			$node->attr[$name] = trim($node->attr[$name]);
2587		}
2588	}
2589
2590	/**
2591	 * Link node to parent node
2592	 *
2593	 * @param object $node Node to link to parent
2594	 * @param bool $is_child True if the node is a child of parent
2595	 * @return void
2596	 */
2597	// link node's parent
2598	protected function link_nodes(&$node, $is_child)
2599	{
2600		$node->parent = $this->parent;
2601		$this->parent->nodes[] = $node;
2602		if ($is_child) {
2603			$this->parent->children[] = $node;
2604		}
2605	}
2606
2607	/**
2608	 * Add tag as text node to current node
2609	 *
2610	 * @param string $tag Tag name
2611	 * @return bool True on success
2612	 */
2613	protected function as_text_node($tag)
2614	{
2615		$node = new simple_html_dom_node($this);
2616		++$this->cursor;
2617		$node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
2618		$this->link_nodes($node, false);
2619		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2620		return true;
2621	}
2622
2623	/**
2624	 * Seek from the current document position to the first occurrence of a
2625	 * character not defined by the provided string. Update the current document
2626	 * position to the new position.
2627	 *
2628	 * @param string $chars A string containing every allowed character.
2629	 * @return void
2630	 */
2631	protected function skip($chars)
2632	{
2633		$this->pos += strspn($this->doc, $chars, $this->pos);
2634		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2635	}
2636
2637	/**
2638	 * Copy substring from the current document position to the first occurrence
2639	 * of a character not defined by the provided string.
2640	 *
2641	 * @param string $chars A string containing every allowed character.
2642	 * @return string Substring from the current document position to the first
2643	 * occurrence of a character not defined by the provided string.
2644	 */
2645	protected function copy_skip($chars)
2646	{
2647		$pos = $this->pos;
2648		$len = strspn($this->doc, $chars, $pos);
2649		$this->pos += $len;
2650		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2651		if ($len === 0) { return ''; }
2652		return substr($this->doc, $pos, $len);
2653	}
2654
2655	/**
2656	 * Copy substring from the current document position to the first occurrence
2657	 * of any of the provided characters.
2658	 *
2659	 * @param string $chars A string containing every character to stop at.
2660	 * @return string Substring from the current document position to the first
2661	 * occurrence of any of the provided characters.
2662	 */
2663	protected function copy_until($chars)
2664	{
2665		$pos = $this->pos;
2666		$len = strcspn($this->doc, $chars, $pos);
2667		$this->pos += $len;
2668		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2669		return substr($this->doc, $pos, $len);
2670	}
2671
2672	/**
2673	 * Copy substring from the current document position to the first occurrence
2674	 * of the provided string.
2675	 *
2676	 * @param string $char The string to stop at.
2677	 * @return string Substring from the current document position to the first
2678	 * occurrence of the provided string.
2679	 */
2680	protected function copy_until_char($char)
2681	{
2682		if ($this->char === null) { return ''; }
2683
2684		if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
2685			$ret = substr($this->doc, $this->pos, $this->size - $this->pos);
2686			$this->char = null;
2687			$this->pos = $this->size;
2688			return $ret;
2689		}
2690
2691		if ($pos === $this->pos) { return ''; }
2692
2693		$pos_old = $this->pos;
2694		$this->char = $this->doc[$pos];
2695		$this->pos = $pos;
2696		return substr($this->doc, $pos_old, $pos - $pos_old);
2697	}
2698
2699	/**
2700	 * Remove noise from HTML content
2701	 *
2702	 * Noise is stored to {@see simple_html_dom::$noise}
2703	 *
2704	 * @param string $pattern The regex pattern used for finding noise
2705	 * @param bool $remove_tag True to remove the entire match. Default is false
2706	 * to only remove the captured data.
2707	 */
2708	protected function remove_noise($pattern, $remove_tag = false)
2709	{
2710		global $debug_object;
2711		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2712
2713		$count = preg_match_all(
2714			$pattern,
2715			$this->doc,
2716			$matches,
2717			PREG_SET_ORDER | PREG_OFFSET_CAPTURE
2718		);
2719
2720		for ($i = $count - 1; $i > -1; --$i) {
2721			$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
2722
2723			if (is_object($debug_object)) {
2724				$debug_object->debug_log(2, 'key is: ' . $key);
2725			}
2726
2727			$idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
2728			$this->noise[$key] = $matches[$i][$idx][0];
2729			$this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
2730		}
2731
2732		// reset the length of content
2733		$this->size = strlen($this->doc);
2734
2735		if ($this->size > 0) {
2736			$this->char = $this->doc[0];
2737		}
2738	}
2739
2740	/**
2741	 * Restore noise to HTML content
2742	 *
2743	 * Noise is restored from {@see simple_html_dom::$noise}
2744	 *
2745	 * @param string $text A subset of HTML containing noise
2746	 * @return string The same content with noise restored
2747	 */
2748	function restore_noise($text)
2749	{
2750		global $debug_object;
2751		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2752
2753		while (($pos = strpos($text, '___noise___')) !== false) {
2754			// Sometimes there is a broken piece of markup, and we don't GET the
2755			// pos+11 etc... token which indicates a problem outside of us...
2756
2757			// todo: "___noise___1000" (or any number with four or more digits)
2758			// in the DOM causes an infinite loop which could be utilized by
2759			// malicious software
2760			if (strlen($text) > $pos + 15) {
2761				$key = '___noise___'
2762				. $text[$pos + 11]
2763				. $text[$pos + 12]
2764				. $text[$pos + 13]
2765				. $text[$pos + 14]
2766				. $text[$pos + 15];
2767
2768				if (is_object($debug_object)) {
2769					$debug_object->debug_log(2, 'located key of: ' . $key);
2770				}
2771
2772				if (isset($this->noise[$key])) {
2773					$text = substr($text, 0, $pos)
2774					. $this->noise[$key]
2775					. substr($text, $pos + 16);
2776				} else {
2777					// do this to prevent an infinite loop.
2778					$text = substr($text, 0, $pos)
2779					. 'UNDEFINED NOISE FOR KEY: '
2780					. $key
2781					. substr($text, $pos + 16);
2782				}
2783			} else {
2784				// There is no valid key being given back to us... We must get
2785				// rid of the ___noise___ or we will have a problem.
2786				$text = substr($text, 0, $pos)
2787				. 'NO NUMERIC NOISE KEY'
2788				. substr($text, $pos + 11);
2789			}
2790		}
2791		return $text;
2792	}
2793
2794	// Sometimes we NEED one of the noise elements.
2795	function search_noise($text)
2796	{
2797		global $debug_object;
2798		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2799
2800		foreach($this->noise as $noiseElement) {
2801			if (strpos($noiseElement, $text) !== false) {
2802				return $noiseElement;
2803			}
2804		}
2805	}
2806
2807	function __toString()
2808	{
2809		return $this->root->innertext();
2810	}
2811
2812	function __get($name)
2813	{
2814		switch ($name) {
2815			case 'outertext':
2816				return $this->root->innertext();
2817			case 'innertext':
2818				return $this->root->innertext();
2819			case 'plaintext':
2820				return $this->root->text();
2821			case 'charset':
2822				return $this->_charset;
2823			case 'target_charset':
2824				return $this->_target_charset;
2825		}
2826	}
2827
2828	// camel naming conventions
2829	function childNodes($idx = -1)
2830	{
2831		return $this->root->childNodes($idx);
2832	}
2833
2834	function firstChild()
2835	{
2836		return $this->root->first_child();
2837	}
2838
2839	function lastChild()
2840	{
2841		return $this->root->last_child();
2842	}
2843
2844	function createElement($name, $value = null)
2845	{
2846		return @str_get_html("<$name>$value</$name>")->first_child();
2847	}
2848
2849	function createTextNode($value)
2850	{
2851		return @end(str_get_html($value)->nodes);
2852	}
2853
2854	function getElementById($id)
2855	{
2856		return $this->find("#$id", 0);
2857	}
2858
2859	function getElementsById($id, $idx = null)
2860	{
2861		return $this->find("#$id", $idx);
2862	}
2863
2864	function getElementByTagName($name)
2865	{
2866		return $this->find($name, 0);
2867	}
2868
2869	function getElementsByTagName($name, $idx = -1)
2870	{
2871		return $this->find($name, $idx);
2872	}
2873
2874	function loadFile()
2875	{
2876		$args = func_get_args();
2877		$this->load_file($args);
2878	}
2879}
2880
2881
2882?>