· 4 years ago · Aug 19, 2021, 07:30 PM
1//
2// nokogumbo.c defines the following:
3//
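// module Nokogumbo
//   def self.parse(utf8_string, url, max_attributes, max_errors, max_depth)
//     # returns Nokogiri::HTML5::Document
//   end
//   def self.fragment(doc_fragment, tags, ctx, max_attributes, max_errors, max_depth)
//     # fills doc_fragment with the parsed nodes and returns nil
//   end
// end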
7//
8// Processing starts by calling gumbo_parse_with_options. The resulting
9// document tree is then walked:
10//
11// * if Nokogiri and libxml2 headers are available at compile time,
12// (if NGLIB) then a parallel libxml2 tree is constructed, and the
13// final document is then wrapped using Nokogiri_wrap_xml_document.
14// This approach reduces memory and CPU requirements as Ruby objects
15// are only built when necessary.
16//
17// * if the necessary headers are not available at compile time, Nokogiri
18// methods are called instead, producing the equivalent functionality.
19//
20
21#include <assert.h>
22#include <ruby.h>
23#include <ruby/version.h>
24
25#include "gumbo.h"
26
27// class constants
28static VALUE Document;
29
30// Interned symbols
31static ID internal_subset;
32static ID parent;
33
34/* Backwards compatibility to Ruby 2.1.0 */
35#if RUBY_API_VERSION_CODE < 20200
36#define ONIG_ESCAPE_UCHAR_COLLISION 1
37#include <ruby/encoding.h>
38
39static VALUE rb_utf8_str_new(const char *str, long length) {
40 return rb_enc_str_new(str, length, rb_utf8_encoding());
41}
42
43static VALUE rb_utf8_str_new_cstr(const char *str) {
44 return rb_enc_str_new_cstr(str, rb_utf8_encoding());
45}
46
47static VALUE rb_utf8_str_new_static(const char *str, long length) {
48 return rb_enc_str_new(str, length, rb_utf8_encoding());
49}
50#endif
51
52#if NGLIB
53#include <nokogiri.h>
54#include <libxml/tree.h>
55#include <libxml/HTMLtree.h>
56
57#define NIL NULL
58#else
59#define NIL Qnil
60
61// These are defined by nokogiri.h
62static VALUE cNokogiriXmlSyntaxError;
63static VALUE cNokogiriXmlElement;
64static VALUE cNokogiriXmlText;
65static VALUE cNokogiriXmlCData;
66static VALUE cNokogiriXmlComment;
67
68// Interned symbols.
69static ID new;
70static ID node_name_;
71
72// Map libxml2 types to Ruby VALUE.
73typedef VALUE xmlNodePtr;
74typedef VALUE xmlDocPtr;
75typedef VALUE xmlNsPtr;
76typedef VALUE xmlDtdPtr;
77typedef char xmlChar;
78#define BAD_CAST
79
80// Redefine libxml2 API as Ruby function calls.
81static xmlNodePtr xmlNewDocNode(xmlDocPtr doc, xmlNsPtr ns, const xmlChar *name, const xmlChar *content) {
82 assert(ns == NIL && content == NULL);
83 return rb_funcall(cNokogiriXmlElement, new, 2, rb_utf8_str_new_cstr(name), doc);
84}
85
86static xmlNodePtr xmlNewDocText(xmlDocPtr doc, const xmlChar *content) {
87 VALUE str = rb_utf8_str_new_cstr(content);
88 return rb_funcall(cNokogiriXmlText, new, 2, str, doc);
89}
90
91static xmlNodePtr xmlNewCDataBlock(xmlDocPtr doc, const xmlChar *content, int len) {
92 VALUE str = rb_utf8_str_new(content, len);
93 // CDATA.new takes arguments in the opposite order from Text.new.
94 return rb_funcall(cNokogiriXmlCData, new, 2, doc, str);
95}
96
97static xmlNodePtr xmlNewDocComment(xmlDocPtr doc, const xmlChar *content) {
98 VALUE str = rb_utf8_str_new_cstr(content);
99 return rb_funcall(cNokogiriXmlComment, new, 2, doc, str);
100}
101
102static xmlNodePtr xmlAddChild(xmlNodePtr parent, xmlNodePtr cur) {
103 ID add_child;
104 CONST_ID(add_child, "add_child");
105 return rb_funcall(parent, add_child, 1, cur);
106}
107
108static void xmlSetNs(xmlNodePtr node, xmlNsPtr ns) {
109 ID namespace_;
110 CONST_ID(namespace_, "namespace=");
111 rb_funcall(node, namespace_, 1, ns);
112}
113
114static void xmlFreeDoc(xmlDocPtr doc) { }
115
116static VALUE Nokogiri_wrap_xml_document(VALUE klass, xmlDocPtr doc) {
117 return doc;
118}
119
120static VALUE find_dummy_key(VALUE collection) {
121 VALUE r_dummy = Qnil;
122 char dummy[5] = "a";
123 size_t len = 1;
124 ID key_;
125 CONST_ID(key_, "key?");
126 while (len < sizeof dummy) {
127 r_dummy = rb_utf8_str_new(dummy, len);
128 if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
129 return r_dummy;
130 for (size_t i = 0; ; ++i) {
131 if (dummy[i] == 0) {
132 dummy[i] = 'a';
133 ++len;
134 break;
135 }
136 if (dummy[i] == 'z')
137 dummy[i] = 'a';
138 else {
139 ++dummy[i];
140 break;
141 }
142 }
143 }
144 // This collection has 475254 elements?? Give up.
145 rb_raise(rb_eArgError, "Failed to find a dummy key.");
146}
147
148// This should return an xmlAttrPtr, but we don't need it and it's easier to
149// not get the result.
150static void xmlNewNsProp (
151 xmlNodePtr node,
152 xmlNsPtr ns,
153 const xmlChar *name,
154 const xmlChar *value
155) {
156 ID set_attribute;
157 CONST_ID(set_attribute, "set_attribute");
158
159 VALUE rvalue = rb_utf8_str_new_cstr(value);
160
161 if (RTEST(ns)) {
162 // This is an easy case, we have a namespace so it's enough to do
163 // node["#{ns.prefix}:#{name}"] = value
164 ID prefix;
165 CONST_ID(prefix, "prefix");
166 VALUE ns_prefix = rb_funcall(ns, prefix, 0);
167 VALUE qname = rb_sprintf("%" PRIsVALUE ":%s", ns_prefix, name);
168 rb_funcall(node, set_attribute, 2, qname, rvalue);
169 return;
170 }
171
172 size_t len = strlen(name);
173 VALUE rname = rb_utf8_str_new(name, len);
174 if (memchr(name, ':', len) == NULL) {
175 // This is the easiest case. There's no colon so we can do
176 // node[name] = value.
177 rb_funcall(node, set_attribute, 2, rname, rvalue);
178 return;
179 }
180
181 // Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
182 // which behaves roughly as
183 // if name is a QName prefix:local
184 // if node->doc has a namespace ns corresponding to prefix
185 // return xmlSetNsProp(node, ns, local, value)
186 // return xmlSetNsProp(node, NULL, name, value)
187 //
188 // If the prefix is "xml", then the namespace lookup will create it.
189 //
190 // By contrast, xmlNewNsProp does not do this parsing and creates an attribute
191 // with the name and value exactly as given. This is the behavior that we
192 // want.
193 //
194 // Thus, for attribute names like "xml:lang", #set_attribute will create an
195 // attribute with namespace "xml" and name "lang". This is incorrect for
196 // html elements (but correct for foreign elements).
197 //
198 // Work around this by inserting a dummy attribute and then changing the
199 // name, if needed.
200
201 // Find a dummy attribute string that doesn't already exist.
202 VALUE dummy = find_dummy_key(node);
203 // Add the dummy attribute.
204 rb_funcall(node, set_attribute, 2, dummy, rvalue);
205
206 // Remove the old attribute, if it exists.
207 ID remove_attribute;
208 CONST_ID(remove_attribute, "remove_attribute");
209 rb_funcall(node, remove_attribute, 1, rname);
210
211 // Rename the dummy
212 ID attribute;
213 CONST_ID(attribute, "attribute");
214 VALUE attr = rb_funcall(node, attribute, 1, dummy);
215 rb_funcall(attr, node_name_, 1, rname);
216}
217#endif
218
219// URI = system id
220// external id = public id
221static xmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
222{
223#if NGLIB
224 // These two libxml2 functions take the public and system ids in
225 // opposite orders.
226 htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
227 assert(doc);
228 if (dtd_name)
229 xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system);
230 return doc;
231#else
232 // remove internal subset from newly created documents
233 VALUE doc;
234 // If system and public are both NULL, Document#new is going to set default
235 // values for them so we're going to have to remove the internal subset
236 // which seems to leak memory in Nokogiri, so leak as little as possible.
237 if (system == NULL && public == NULL) {
238 ID remove;
239 CONST_ID(remove, "remove");
240 doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_utf8_str_new_static("", 0));
241 rb_funcall(rb_funcall(doc, internal_subset, 0), remove, 0);
242 if (dtd_name) {
243 // We need to create an internal subset now.
244 ID create_internal_subset;
245 CONST_ID(create_internal_subset, "create_internal_subset");
246 rb_funcall(doc, create_internal_subset, 3, rb_utf8_str_new_cstr(dtd_name), Qnil, Qnil);
247 }
248 } else {
249 assert(dtd_name);
250 // Rather than removing and creating the internal subset as we did above,
251 // just create and then rename one.
252 VALUE r_system = system ? rb_utf8_str_new_cstr(system) : Qnil;
253 VALUE r_public = public ? rb_utf8_str_new_cstr(public) : Qnil;
254 doc = rb_funcall(Document, new, 2, r_system, r_public);
255 rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_utf8_str_new_cstr(dtd_name));
256 }
257 return doc;
258#endif
259}
260
261static xmlNodePtr get_parent(xmlNodePtr node) {
262#if NGLIB
263 return node->parent;
264#else
265 if (!rb_respond_to(node, parent))
266 return Qnil;
267 return rb_funcall(node, parent, 0);
268#endif
269}
270
271static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
272 assert(RTEST(input));
273 Check_Type(input, T_STRING);
274 GumboOutput *output = gumbo_parse_with_options (
275 options,
276 RSTRING_PTR(input),
277 RSTRING_LEN(input)
278 );
279
280 const char *status_string = gumbo_status_to_string(output->status);
281 switch (output->status) {
282 case GUMBO_STATUS_OK:
283 break;
284 case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
285 case GUMBO_STATUS_TREE_TOO_DEEP:
286 gumbo_destroy_output(output);
287 rb_raise(rb_eArgError, "%s", status_string);
288 case GUMBO_STATUS_OUT_OF_MEMORY:
289 gumbo_destroy_output(output);
290 rb_raise(rb_eNoMemError, "%s", status_string);
291 }
292 return output;
293}
294
295static xmlNsPtr lookup_or_add_ns (
296 xmlDocPtr doc,
297 xmlNodePtr root,
298 const char *href,
299 const char *prefix
300) {
301#if NGLIB
302 xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix);
303 if (ns)
304 return ns;
305 return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix);
306#else
307 ID add_namespace_definition;
308 CONST_ID(add_namespace_definition, "add_namespace_definition");
309 VALUE rprefix = rb_utf8_str_new_cstr(prefix);
310 VALUE rhref = rb_utf8_str_new_cstr(href);
311 return rb_funcall(root, add_namespace_definition, 2, rprefix, rhref);
312#endif
313}
314
315static void set_line(xmlNodePtr node, size_t line) {
316#if NGLIB
317 // libxml2 uses 65535 to mean look elsewhere for the line number on some
318 // nodes.
319 if (line < 65535)
320 node->line = (unsigned short)line;
321#else
322 // XXX: If Nokogiri gets a `#line=` method, we'll use that.
323#endif
324}
325
326// Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
327// at gumbo_node.
328static void build_tree (
329 xmlDocPtr doc,
330 xmlNodePtr xml_output_node,
331 const GumboNode *gumbo_node
332) {
333 xmlNodePtr xml_root = NIL;
334 xmlNodePtr xml_node = xml_output_node;
335 size_t child_index = 0;
336
337 while (true) {
338 assert(gumbo_node != NULL);
339 const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT?
340 &gumbo_node->v.document.children : &gumbo_node->v.element.children;
341 if (child_index >= children->length) {
342 // Move up the tree and to the next child.
343 if (xml_node == xml_output_node) {
344 // We've built as much of the tree as we can.
345 return;
346 }
347 child_index = gumbo_node->index_within_parent + 1;
348 gumbo_node = gumbo_node->parent;
349 xml_node = get_parent(xml_node);
350 // Children of fragments don't share the same root, so reset it and
351 // it'll be set below. In the non-fragment case, this will only happen
352 // after the html element has been finished at which point there are no
353 // further elements.
354 if (xml_node == xml_output_node)
355 xml_root = NIL;
356 continue;
357 }
358 const GumboNode *gumbo_child = children->data[child_index++];
359 xmlNodePtr xml_child;
360
361 switch (gumbo_child->type) {
362 case GUMBO_NODE_DOCUMENT:
363 abort(); // Bug in Gumbo.
364
365 case GUMBO_NODE_TEXT:
366 case GUMBO_NODE_WHITESPACE:
367 xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text);
368 set_line(xml_child, gumbo_child->v.text.start_pos.line);
369 xmlAddChild(xml_node, xml_child);
370 break;
371
372 case GUMBO_NODE_CDATA:
373 xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
374 (int) strlen(gumbo_child->v.text.text));
375 set_line(xml_child, gumbo_child->v.text.start_pos.line);
376 xmlAddChild(xml_node, xml_child);
377 break;
378
379 case GUMBO_NODE_COMMENT:
380 xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text);
381 set_line(xml_child, gumbo_child->v.text.start_pos.line);
382 xmlAddChild(xml_node, xml_child);
383 break;
384
385 case GUMBO_NODE_TEMPLATE:
386 // XXX: Should create a template element and a new DocumentFragment
387 case GUMBO_NODE_ELEMENT:
388 {
389 xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
390 set_line(xml_child, gumbo_child->v.element.start_pos.line);
391 if (xml_root == NIL)
392 xml_root = xml_child;
393 xmlNsPtr ns = NIL;
394 switch (gumbo_child->v.element.tag_namespace) {
395 case GUMBO_NAMESPACE_HTML:
396 break;
397 case GUMBO_NAMESPACE_SVG:
398 ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg");
399 break;
400 case GUMBO_NAMESPACE_MATHML:
401 ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math");
402 break;
403 }
404 if (ns != NIL)
405 xmlSetNs(xml_child, ns);
406 xmlAddChild(xml_node, xml_child);
407
408 // Add the attributes.
409 const GumboVector* attrs = &gumbo_child->v.element.attributes;
410 for (size_t i=0; i < attrs->length; i++) {
411 const GumboAttribute *attr = attrs->data[i];
412
413 switch (attr->attr_namespace) {
414 case GUMBO_ATTR_NAMESPACE_XLINK:
415 ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink");
416 break;
417
418 case GUMBO_ATTR_NAMESPACE_XML:
419 ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml");
420 break;
421
422 case GUMBO_ATTR_NAMESPACE_XMLNS:
423 ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns");
424 break;
425
426 default:
427 ns = NIL;
428 }
429 xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value);
430 }
431
432 // Add children for this element.
433 child_index = 0;
434 gumbo_node = gumbo_child;
435 xml_node = xml_child;
436 }
437 }
438 }
439}
440
441static void add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url) {
442 const char *input_str = RSTRING_PTR(input);
443 size_t input_len = RSTRING_LEN(input);
444
445 // Add parse errors to rdoc.
446 if (output->errors.length) {
447 const GumboVector *errors = &output->errors;
448 VALUE rerrors = rb_ary_new2(errors->length);
449
450 for (size_t i=0; i < errors->length; i++) {
451 GumboError *err = errors->data[i];
452 GumboSourcePosition position = gumbo_error_position(err);
453 char *msg;
454 size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
455 VALUE err_str = rb_utf8_str_new(msg, size);
456 free(msg);
457 VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
458 const char *error_code = gumbo_error_code(err);
459 VALUE str1 = error_code? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
460 rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
461 rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
462 rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
463 rb_iv_set(syntax_error, "@file", url);
464 rb_iv_set(syntax_error, "@line", INT2NUM(position.line));
465 rb_iv_set(syntax_error, "@str1", str1);
466 rb_iv_set(syntax_error, "@str2", Qnil);
467 rb_iv_set(syntax_error, "@str3", Qnil);
468 rb_iv_set(syntax_error, "@int1", INT2NUM(0));
469 rb_iv_set(syntax_error, "@column", INT2NUM(position.column));
470 rb_ary_push(rerrors, syntax_error);
471 }
472 rb_iv_set(rdoc, "@errors", rerrors);
473 }
474}
475
476typedef struct {
477 GumboOutput *output;
478 VALUE input;
479 VALUE url_or_frag;
480 xmlDocPtr doc;
481} ParseArgs;
482
483static void parse_args_mark(void *parse_args) {
484 ParseArgs *args = parse_args;
485 rb_gc_mark_maybe(args->input);
486 rb_gc_mark_maybe(args->url_or_frag);
487}
488
489// Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the
490// wrapper.
491static VALUE wrap_parse_args(ParseArgs *args) {
492 return Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args);
493}
494
495// Returnsd the underlying ParseArgs wrapped by wrap_parse_args.
496static ParseArgs *unwrap_parse_args(VALUE obj) {
497 ParseArgs *args;
498 Data_Get_Struct(obj, ParseArgs, args);
499 return args;
500}
501
502static VALUE parse_cleanup(VALUE parse_args) {
503 ParseArgs *args = unwrap_parse_args(parse_args);
504 gumbo_destroy_output(args->output);
505 // Make sure garbage collection doesn't mark the objects as being live based
506 // on references from the ParseArgs. This may be unnecessary.
507 args->input = Qnil;
508 args->url_or_frag = Qnil;
509 if (args->doc != NIL)
510 xmlFreeDoc(args->doc);
511 return Qnil;
512}
513
514static VALUE parse_continue(VALUE parse_args);
515
516// Parse a string using gumbo_parse into a Nokogiri document
517static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
518 GumboOptions options = kGumboDefaultOptions;
519 options.max_attributes = NUM2INT(max_attributes);
520 options.max_errors = NUM2INT(max_errors);
521 options.max_tree_depth = NUM2INT(max_depth);
522
523 GumboOutput *output = perform_parse(&options, input);
524 ParseArgs args = {
525 .output = output,
526 .input = input,
527 .url_or_frag = url,
528 .doc = NIL,
529 };
530 VALUE parse_args = wrap_parse_args(&args);
531
532 return rb_ensure(parse_continue, parse_args, parse_cleanup, parse_args);
533}
534
535static VALUE parse_continue(VALUE parse_args) {
536 ParseArgs *args = unwrap_parse_args(parse_args);
537 GumboOutput *output = args->output;
538 xmlDocPtr doc;
539 if (output->document->v.document.has_doctype) {
540 const char *name = output->document->v.document.name;
541 const char *public = output->document->v.document.public_identifier;
542 const char *system = output->document->v.document.system_identifier;
543 public = public[0] ? public : NULL;
544 system = system[0] ? system : NULL;
545 doc = new_html_doc(name, system, public);
546 } else {
547 doc = new_html_doc(NULL, NULL, NULL);
548 }
549 args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
550 build_tree(doc, (xmlNodePtr)doc, output->document);
551 VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
552 args->doc = NIL; // The Ruby runtime now owns doc so don't delete it.
553 add_errors(output, rdoc, args->input, args->url_or_frag);
554 return rdoc;
555}
556
557static int lookup_namespace(VALUE node, bool require_known_ns) {
558 ID namespace, href;
559 CONST_ID(namespace, "namespace");
560 CONST_ID(href, "href");
561 VALUE ns = rb_funcall(node, namespace, 0);
562
563 if (NIL_P(ns))
564 return GUMBO_NAMESPACE_HTML;
565 ns = rb_funcall(ns, href, 0);
566 assert(RTEST(ns));
567 Check_Type(ns, T_STRING);
568
569 const char *href_ptr = RSTRING_PTR(ns);
570 size_t href_len = RSTRING_LEN(ns);
571#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
572 if (NAMESPACE_P("http://www.w3.org/1999/xhtml"))
573 return GUMBO_NAMESPACE_HTML;
574 if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML"))
575 return GUMBO_NAMESPACE_MATHML;
576 if (NAMESPACE_P("http://www.w3.org/2000/svg"))
577 return GUMBO_NAMESPACE_SVG;
578#undef NAMESPACE_P
579 if (require_known_ns)
580 rb_raise(rb_eArgError, "Unexpected namespace URI \"%*s\"", (int)href_len, href_ptr);
581 return -1;
582}
583
584static xmlNodePtr extract_xml_node(VALUE node) {
585#if NGLIB
586 xmlNodePtr xml_node;
587 Data_Get_Struct(node, xmlNode, xml_node);
588 return xml_node;
589#else
590 return node;
591#endif
592}
593
594static VALUE fragment_continue(VALUE parse_args);
595
596static VALUE fragment (
597 VALUE self,
598 VALUE doc_fragment,
599 VALUE tags,
600 VALUE ctx,
601 VALUE max_attributes,
602 VALUE max_errors,
603 VALUE max_depth
604) {
605 ID name = rb_intern_const("name");
606 const char *ctx_tag;
607 GumboNamespaceEnum ctx_ns;
608 GumboQuirksModeEnum quirks_mode;
609 bool form = false;
610 const char *encoding = NULL;
611
612 if (NIL_P(ctx)) {
613 ctx_tag = "body";
614 ctx_ns = GUMBO_NAMESPACE_HTML;
615 } else if (TYPE(ctx) == T_STRING) {
616 ctx_tag = StringValueCStr(ctx);
617 ctx_ns = GUMBO_NAMESPACE_HTML;
618 size_t len = RSTRING_LEN(ctx);
619 const char *colon = memchr(ctx_tag, ':', len);
620 if (colon) {
621 switch (colon - ctx_tag) {
622 case 3:
623 if (st_strncasecmp(ctx_tag, "svg", 3) != 0)
624 goto error;
625 ctx_ns = GUMBO_NAMESPACE_SVG;
626 break;
627 case 4:
628 if (st_strncasecmp(ctx_tag, "html", 4) == 0)
629 ctx_ns = GUMBO_NAMESPACE_HTML;
630 else if (st_strncasecmp(ctx_tag, "math", 4) == 0)
631 ctx_ns = GUMBO_NAMESPACE_MATHML;
632 else
633 goto error;
634 break;
635 default:
636 error:
637 rb_raise(rb_eArgError, "Invalid context namespace '%*s'", (int)(colon - ctx_tag), ctx_tag);
638 }
639 ctx_tag = colon+1;
640 } else {
641 // For convenience, put 'svg' and 'math' in their namespaces.
642 if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0)
643 ctx_ns = GUMBO_NAMESPACE_SVG;
644 else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0)
645 ctx_ns = GUMBO_NAMESPACE_MATHML;
646 }
647
648 // Check if it's a form.
649 form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
650 } else {
651 ID element_ = rb_intern_const("element?");
652
653 // Context fragment name.
654 VALUE tag_name = rb_funcall(ctx, name, 0);
655 assert(RTEST(tag_name));
656 Check_Type(tag_name, T_STRING);
657 ctx_tag = StringValueCStr(tag_name);
658
659 // Context fragment namespace.
660 ctx_ns = lookup_namespace(ctx, true);
661
662 // Check for a form ancestor, including self.
663 for (VALUE node = ctx;
664 !NIL_P(node);
665 node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
666 if (!RTEST(rb_funcall(node, element_, 0)))
667 continue;
668 VALUE element_name = rb_funcall(node, name, 0);
669 if (RSTRING_LEN(element_name) == 4
670 && !st_strcasecmp(RSTRING_PTR(element_name), "form")
671 && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
672 form = true;
673 break;
674 }
675 }
676
677 // Encoding.
678 if (RSTRING_LEN(tag_name) == 14
679 && !st_strcasecmp(ctx_tag, "annotation-xml")) {
680 VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
681 rb_utf8_str_new_static("encoding", 8));
682 if (RTEST(enc)) {
683 Check_Type(enc, T_STRING);
684 encoding = StringValueCStr(enc);
685 }
686 }
687 }
688
689 // Quirks mode.
690 VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
691 VALUE dtd = rb_funcall(doc, internal_subset, 0);
692 if (NIL_P(dtd)) {
693 quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
694 } else {
695 VALUE dtd_name = rb_funcall(dtd, name, 0);
696 VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
697 VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
698 quirks_mode = gumbo_compute_quirks_mode (
699 NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name),
700 NIL_P(pubid)? NULL:StringValueCStr(pubid),
701 NIL_P(sysid)? NULL:StringValueCStr(sysid)
702 );
703 }
704
705 // Perform a fragment parse.
706 int depth = NUM2INT(max_depth);
707 GumboOptions options = kGumboDefaultOptions;
708 options.max_attributes = NUM2INT(max_attributes);
709 options.max_errors = NUM2INT(max_errors);
710 // Add one to account for the HTML element.
711 options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
712 options.fragment_context = ctx_tag;
713 options.fragment_namespace = ctx_ns;
714 options.fragment_encoding = encoding;
715 options.quirks_mode = quirks_mode;
716 options.fragment_context_has_form_ancestor = form;
717
718 GumboOutput *output = perform_parse(&options, tags);
719 ParseArgs args = {
720 .output = output,
721 .input = tags,
722 .url_or_frag = doc_fragment,
723 .doc = (xmlDocPtr)extract_xml_node(doc),
724 };
725 VALUE parse_args = wrap_parse_args(&args);
726 rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
727 return Qnil;
728}
729
730static VALUE fragment_continue(VALUE parse_args) {
731 ParseArgs *args = unwrap_parse_args(parse_args);
732 GumboOutput *output = args->output;
733 VALUE doc_fragment = args->url_or_frag;
734 xmlDocPtr xml_doc = args->doc;
735
736 args->doc = NIL; // The Ruby runtime owns doc so make sure we don't delete it.
737 xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
738 build_tree(xml_doc, xml_frag, output->root);
739 add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
740 return Qnil;
741}
742
743// Initialize the Nokogumbo class and fetch constants we will use later.
744void Init_nokogumbo() {
745 rb_funcall(rb_mKernel, rb_intern_const("gem"), 1, rb_utf8_str_new_static("nokogiri", 8));
746 rb_require("nokogiri");
747
748 VALUE line_supported = Qtrue;
749
750#if !NGLIB
751 // Class constants.
752 VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
753 VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
754 cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
755 rb_gc_register_mark_object(cNokogiriXmlSyntaxError);
756 cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
757 rb_gc_register_mark_object(cNokogiriXmlElement);
758 cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
759 rb_gc_register_mark_object(cNokogiriXmlText);
760 cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
761 rb_gc_register_mark_object(cNokogiriXmlCData);
762 cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
763 rb_gc_register_mark_object(cNokogiriXmlComment);
764
765 // Interned symbols.
766 new = rb_intern_const("new");
767 node_name_ = rb_intern_const("node_name=");
768
769 // #line is not supported (returns 0)
770 line_supported = Qfalse;
771#endif
772
773 // Class constants.
774 VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
775 Document = rb_const_get(HTML5, rb_intern_const("Document"));
776 rb_gc_register_mark_object(Document);
777
778 // Interned symbols.
779 internal_subset = rb_intern_const("internal_subset");
780 parent = rb_intern_const("parent");
781
782 // Define Nokogumbo module with parse and fragment methods.
783 VALUE Gumbo = rb_define_module("Nokogumbo");
784 rb_define_singleton_method(Gumbo, "parse", parse, 5);
785 rb_define_singleton_method(Gumbo, "fragment", fragment, 6);
786
787 // Add private constant for testing.
788 rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported);
789 rb_funcall(Gumbo, rb_intern_const("private_constant"), 1,
790 rb_utf8_str_new_cstr("LINE_SUPPORTED"));
791}
792
793// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
794