· 6 years ago · Jun 14, 2019, 05:50 AM
1-- luaexpat-utils
2-- lxp.doc implements some useful things on LOM documents, such as returned by lxp.lom.parse.
3-- In particular, it can convert LOM back into XML text, with optional pretty-printing control.
4-- It's based on stanza.lua from Prosody (http://hg.prosody.im/trunk/file/4621c92d2368/util/stanza.lua)
5--
6-- Can be used as a lightweight one-stop-shop for simple XML processing; a simple XML parser is included
7-- but the default is to use lxp.lom if it can be found.
8--
9-- Prosody IM
10-- Copyright (C) 2008-2010 Matthew Wild
11-- Copyright (C) 2008-2010 Waqas Hussain
12--
13-- classic Lua XML parser by Roberto Ierusalimschy.
14-- modified to output LOM format.
15-- http://lua-users.org/wiki/LuaXml
16--
17-- This project is MIT/X11 licensed. Please see the
18-- COPYING file in the source package for more information.
19--
20
21local t_insert = table.insert;
22local t_concat = table.concat;
23local t_remove = table.remove;
24local s_format = string.format;
25local s_match = string.match;
26local tostring = tostring;
27local setmetatable = setmetatable;
28local getmetatable = getmetatable;
29local pairs = pairs;
30local ipairs = ipairs;
31local type = type;
32local next = next;
33local print = print;
34local unpack = unpack or table.unpack;
35local s_gsub = string.gsub;
36local s_char = string.char;
37local s_find = string.find;
38local os = os;
39local pcall,require,io = pcall,require,io
40local split
41--module (...)
42local _M = {}
43local Doc = { __type = "doc" };
44Doc.__index = Doc;
45
--- Construct an empty element node.
-- @param tag the element's tag name
-- @param attr optional attributes (table of name-value pairs); a fresh table is used when omitted
-- @return a new node carrying the Doc metatable, with an empty `last_add` insertion stack
function _M.new(tag, attr)
    local node = setmetatable({}, Doc)
    node.tag = tag
    node.attr = attr or {}
    node.last_add = {}
    return node
end
53
--- Adopt an already-parsed LOM table as a document.
-- The Doc metatable is set, in place, on the root and on every element
-- visited by _M.walk.
-- @param tdoc table containing the parsed document
-- @return tdoc itself
function _M.build(tdoc)
    local function adopt(_, node)
        setmetatable(node, Doc)
    end
    _M.walk(tdoc, false, adopt)
    return tdoc
end
62
--- parse an XML document. By default, this uses lxp.lom.parse, but
-- falls back to basic_parse if lxp.lom cannot be loaded, or if use_basic is true.
-- Errors raised by the parser (basic_parse reports malformed input with error())
-- are caught and returned as `nil, err`, matching the documented contract.
-- @param text_or_file file or string representation
-- @param is_file whether text_or_file is a file name or not
-- @param use_basic do a basic parse
-- @return a parsed LOM document with the document metatables set
-- @return nil, error the error can either be a file error or a parse error
function _M.parse(text_or_file, is_file, use_basic)
    local parser, using_lom = _M.basic_parse, false
    if not use_basic then
        -- on failure the second pcall result is an error message, not a module,
        -- so track success explicitly instead of testing that value
        local loaded, lom = pcall(require, 'lxp.lom')
        if loaded then
            parser, using_lom = lom.parse, true
        end
    end
    if is_file then
        local f, err = io.open(text_or_file)
        if not f then return nil, err end
        text_or_file = f:read '*a'
        f:close()
    end
    local ok, doc, err = pcall(parser, text_or_file)
    if not ok then return nil, doc end -- `doc` holds the raised error here
    if not doc then return nil, err end
    if using_lom then
        -- lxp.lom returns plain tables; basic_parse already sets the metatable itself
        _M.walk(doc, false, function(_, d)
            setmetatable(d, Doc)
        end)
    end
    return doc
end
92
--- convenient function to add a document node. This updates the last inserted position,
-- so later addtag/text/add_child calls go inside the new element until up() is called.
-- @param tag a tag name
-- @param attrs optional set of attributes (name-string pairs)
-- @return self, to allow chaining
function Doc:addtag(tag, attrs)
    local s = _M.new(tag, attrs);
    -- Only nodes made by _M.new carry a `last_add` stack; parsed or built
    -- documents do not, so create it lazily instead of indexing nil.
    local last_add = self.last_add;
    if not last_add then
        last_add = {};
        self.last_add = last_add;
    end
    local target = last_add[#last_add] or self;
    target:add_direct_child(s);
    t_insert(last_add, s);
    return self;
end
102
--- convenient function to add a text node at the last inserted position.
-- The insertion position itself is not changed.
-- @param text a string
-- @return self, to allow chaining
function Doc:text(text)
    -- nodes not created by _M.new (e.g. parsed documents) have no `last_add`;
    -- fall back to appending directly to self in that case
    local last_add = self.last_add;
    local target = (last_add and last_add[#last_add]) or self;
    target:add_direct_child(text);
    return self;
end
109
--- go up one level in a document: drop the most recent insertion point.
-- Does nothing when the node has no insertion stack (e.g. a parsed document).
-- @return self, to allow chaining
function Doc:up()
    local last_add = self.last_add;
    if last_add then
        t_remove(last_add);
    end
    return self;
end
115
--- forget all insertion points, so that subsequent additions go directly under this node.
-- The existing `last_add` table is emptied in place. Nodes without one
-- (e.g. parsed documents) are left untouched.
-- @return self, to allow chaining
function Doc:reset()
    local last_add = self.last_add;
    if last_add then
        for i = #last_add, 1, -1 do
            last_add[i] = nil;
        end
    end
    return self;
end
123
--- Append a child at the end of this node's own child list.
-- @param child a child node (either text or a document)
function Doc:add_direct_child(child)
    self[#self + 1] = child
end
129
--- append a child to a document at the last element added
-- @param child a child node (either text or a document)
-- @return self, to allow chaining
function Doc:add_child(child)
    -- nodes not created by _M.new (e.g. parsed documents) have no `last_add`;
    -- fall back to appending directly to self in that case
    local last_add = self.last_add;
    local target = (last_add and last_add[#last_add]) or self;
    target:add_direct_child(child);
    return self;
end
136
137
--- Build an element with the given tag name and contents.
-- @param tag a tag name
-- @param items a string (becomes the single text child), an element (becomes the
-- single child), or a table whose string keys are attributes and whose other
-- keys are copied onto the element as children.
-- @return the new element
function _M.elem(tag, items)
    local node = _M.new(tag)
    if type(items) == 'string' then
        items = { items }
    end
    if _M.is_tag(items) then
        t_insert(node, items)
        return node
    end
    if type(items) ~= 'table' then
        return node
    end
    local attr = node.attr
    for key, value in pairs(items) do
        if type(key) == 'string' then
            -- attributes are stored by name, and the name is also appended
            -- to the list part of the attribute table
            attr[key] = value
            t_insert(attr, key)
        else
            node[key] = value
        end
    end
    return node
end
158
--- Given a list of names, return one element constructor per name.
-- @param list a list of names, or a comma-separated string.
-- @return the constructors as multiple return values, in list order
-- @usage local parent,child = doc.tags 'parent,child' <br>
-- doc = parent {child 'one', child 'two'}
function _M.tags(list)
    if type(list) == 'string' then
        list = split(list, ',')
    end
    local ctors = {}
    for _, tag in ipairs(list) do
        local function ctor(items)
            return _M.elem(tag, items)
        end
        ctors[#ctors + 1] = ctor
    end
    return unpack(ctors)
end
173
174local templ_cache = {}
175
-- True when `data` is a single set of values rather than a list of such sets:
-- its list part is empty, or its first list entry is not a table.
local function is_data(data)
    if #data == 0 then
        return true
    end
    return type(data[1]) ~= 'table'
end
179
-- Mirror every list entry under its index written as a string ("1", "2", ...),
-- so that a $1-style capture, which arrives as the string "1", finds data[1].
-- The table is modified in place.
local function prepare_data(data)
    for index, value in ipairs(data) do
        local key = tostring(index)
        data[key] = value
    end
end
187
--- Create a substituted copy of a document.
-- Every string in the template is passed through gsub with the pattern
-- `$name`, looking names up in the data table.
-- @param templ may be a document or a string representation which will be parsed and cached
-- @param data a table of name-value pairs or a list of such tables
-- @return an XML document for a single data table; for a list, a list of
-- documents, wrapped in an element named data.tag when that field is set
-- @return nil, error when data is not a non-empty table or the template fails to parse
function Doc.subst(templ, data)
    if type(data) ~= 'table' or not next(data) then
        return nil, "data must be a non-empty table"
    end
    -- prepare_data only adds string keys, so this classification stays valid afterwards
    local single = is_data(data)
    if single then
        prepare_data(data)
    end
    if type(templ) == 'string' then
        local source = templ
        local parsed = templ_cache[source]
        if not parsed then
            local err
            parsed, err = _M.parse(source)
            if not parsed then return nil, err end
            templ_cache[source] = parsed
        end
        templ = parsed
    end
    local function instantiate(values)
        local function fill(s)
            return s:gsub('%$(%w+)', values)
        end
        return _M.clone(templ, fill)
    end
    if single then
        return instantiate(data)
    end
    local list = {}
    for _, item in ipairs(data) do
        prepare_data(item)
        t_insert(list, instantiate(item))
    end
    if data.tag then
        list = _M.elem(data.tag, list)
    end
    return list
end
223
224
--- Find the first direct child whose tag equals the given name.
-- @param tag the tag name
-- @return the matching child, or nothing when there is none
function Doc:child_with_name(tag)
    for _, node in ipairs(self) do
        if node.tag == tag then
            return node
        end
    end
end
232
-- Collect into `list` every table child of `self` whose tag equals `tag`.
-- When `recurse` is true, each table child is searched as well.
local function _children_with_name(self, tag, list, recurse)
    for _, node in ipairs(self) do
        if type(node) == 'table' then
            if node.tag == tag then
                list[#list + 1] = node
            end
            if recurse then
                _children_with_name(node, tag, list, recurse)
            end
        end
    end
end
240
--- Collect the elements below this node that have a given tag.
-- @param tag a tag name
-- @param dont_recurse optionally only return the immediate children with this tag name
-- @return a list of elements
function Doc:get_elements_with_name(tag, dont_recurse)
    local found = {}
    local recurse = not dont_recurse
    _children_with_name(self, tag, found, recurse)
    return found
end
250
--- Iterate over all children of a document node, including text nodes.
-- @return an iterator function, the node, and the initial position (0),
-- suitable for a generic `for`
function Doc:children()
    local pos = 0
    local function step(node)
        pos = pos + 1
        return node[pos]
    end
    return step, self, pos
end
259
--- Return the first child of a node that is a table (an element), if any.
-- @return the first table child, or nothing when there is none
function Doc:first_childtag()
    if #self > 0 then
        for _, node in ipairs(self) do
            if type(node) == 'table' then
                return node
            end
        end
    end
end
267
--- iterate over the child elements that match a tag name and/or namespace.
-- Text children are skipped. The child count is taken when the iterator is created.
-- @param tag optional tag name; when nil, any tag matches
-- @param xmlns optional namespace; defaults to this node's own `xmlns` attribute.
-- When no namespace is known, any namespace matches.
-- @return an iterator function for use in a generic `for`
function Doc:matching_tags(tag, xmlns)
    xmlns = xmlns or self.attr.xmlns;
    local tags = self;
    local start_i, max_i = 1, #tags;
    return function ()
        for i = start_i, max_i do
            -- `v` must be local: an unqualified assignment here would leak a global
            local v = tags[i];
            -- only element nodes have `tag`/`attr`; indexing `attr` on a text
            -- child would raise an error
            if type(v) == 'table'
            and (not tag or v.tag == tag)
            and (not xmlns or (v.attr ~= nil and xmlns == v.attr.xmlns)) then
                start_i = i + 1; -- resume after this match on the next call
                return v;
            end
        end
    end, tags, nil;
end
283
--- Iterate over the child elements of a document node, skipping text children.
-- Iteration stops at the first missing (nil or false) child.
-- @return an iterator function for use in a generic `for`
function Doc:childtags()
    local pos = 0
    local function next_tag()
        while true do
            pos = pos + 1
            local node = self[pos]
            if not node then
                return
            end
            if type(node) == 'table' then
                return node
            end
        end
    end
    return next_tag, self[1], pos
end
296
--- visit each child element of a node and call a function, possibly modifying the document.
-- Text children are left untouched and are not passed to the callback.
-- @param callback a function passed each child element. If it returns nil, that node will be removed.
-- If it returns a value, that will replace the current node.
-- @return self
function Doc:maptags(callback)
    local is_tag = _M.is_tag
    local i = 1;
    while i <= #self do
        local node = self[i];
        if is_tag(node) then
            local ret = callback(node);
            if ret == nil then
                -- removal shifts the next child into slot i, so do not advance
                t_remove(self, i);
            else
                self[i] = ret;
                i = i + 1;
            end
        else
            -- step over non-element children; without this the loop never ends
            i = i + 1;
        end
    end
    return self;
end
316
--- escape the five characters that have predefined XML entities (' " < > &).
-- Also exported as _M.xml_escape.
-- @param str the text to escape
-- @return the escaped text
local xml_escape
do
    -- each character must map to its entity reference; mapping a character to
    -- itself would emit markup characters unescaped
    local escape_table = {
        ["'"] = "&apos;",
        ["\""] = "&quot;",
        ["<"] = "&lt;",
        [">"] = "&gt;",
        ["&"] = "&amp;",
    };
    -- the extra parentheses drop gsub's second result (the substitution count)
    function xml_escape(str) return (s_gsub(str, "['&<>\"]", escape_table)); end
    _M.xml_escape = xml_escape;
end
323
-- pretty printing
-- if indent, then put each new tag on its own line
-- if attr_indent, put each new attribute on its own line
-- @param t the element to serialize
-- @param buf list of string pieces that output is appended to
-- @param self the function to recurse with (this function itself)
-- @param xml_escape function used to escape attribute values and text
-- @param parentns the parent's xmlns value; an identical xmlns on this element is not repeated
-- @param idn the current indent string (a missing value is treated as empty)
-- @param indent the string added to idn for each nesting level, or nil for no pretty-printing
-- @param attr_indent extra indent for attributes, or nil to keep them on the tag line
local function _dostring(t, buf, self, xml_escape, parentns, idn, indent, attr_indent)
    -- A missing indent prefix is treated as empty, so `indent` or `attr_indent`
    -- can be given without `idn` instead of failing on a nil concatenation.
    idn = idn or ''
    local nsid = 0;
    local tag = t.tag
    local attr = t.attr or {}
    local lf,alf = ""," "
    if indent then lf = '\n'..idn end
    if attr_indent then alf = '\n'..idn..attr_indent end
    t_insert(buf, lf.."<"..tag);
    for k, v in pairs(attr) do
        if type(k) ~= 'number' then -- LOM attr table has list-like part
            if s_find(k, "\1", 1, true) then
                -- a "\1" in the key separates a namespace from the attribute name
                local ns, attrk = s_match(k, "^([^\1]*)\1?(.*)$");
                nsid = nsid + 1;
                t_insert(buf, " xmlns:ns"..nsid.."='"..xml_escape(ns).."' ".."ns"..nsid..":"..attrk.."='"..xml_escape(v).."'");
            elseif not(k == "xmlns" and v == parentns) then
                t_insert(buf, alf..k.."='"..xml_escape(v).."'");
            end
        end
    end
    local len,has_children = #t;
    if len == 0 then
        local out = "/>"
        if attr_indent then out = '\n'..idn..out end
        t_insert(buf, out);
    else
        t_insert(buf, ">");
        -- children are indented one level deeper only when pretty-printing
        local child_idn = indent and idn..indent or idn
        for n=1,len do
            local child = t[n];
            if child.tag then
                self(child, buf, self, xml_escape, attr.xmlns, child_idn, indent, attr_indent);
                has_children = true
            else -- text element
                t_insert(buf, xml_escape(child));
            end
        end
        -- the closing tag goes on its own line only if there were child elements
        t_insert(buf, (has_children and lf or '').."</"..tag..">");
    end
end
364
--- Serialize an XML document to a string, optionally pretty-printed.
-- Also installed as the `__tostring` metamethod of documents.
-- @param t the document to serialize
-- @param idn an initial indent (indents are all strings)
-- @param indent an indent for each level
-- @param attr_indent if given, indent each attribute pair and put on a separate line
-- @return a string representation
function _M.tostring(t, idn, indent, attr_indent)
    local pieces = {}
    _dostring(t, pieces, _dostring, xml_escape, nil, idn, indent, attr_indent)
    return t_concat(pieces)
end

Doc.__tostring = _M.tostring
377
--- Get the text of an element: its direct string children, concatenated in order.
-- @return the concatenated text (an empty string when there are no text children)
function Doc:get_text()
    local parts = {}
    for _, node in ipairs(self) do
        if type(node) == 'string' then
            parts[#parts + 1] = node
        end
    end
    return t_concat(parts)
end
386
--- Make a deep copy of a document.
-- Tables are copied recursively (keys as well as values) and keep their
-- metatable; a table reached more than once is copied only once.
-- @param doc the original document
-- @param strsubst an optional function applied to every string encountered; its result is used in the copy
-- @return the copy
function _M.clone(doc, strsubst)
    local seen = {}
    local copy
    function copy(value)
        local kind = type(value)
        if kind == 'string' and strsubst then
            return strsubst(value)
        end
        if kind ~= 'table' then
            return value
        end
        local dup = seen[value]
        if dup then
            return dup
        end
        dup = {}
        seen[value] = dup -- registered before descending so cycles resolve to the copy
        for k, v in pairs(value) do
            dup[copy(k)] = copy(v)
        end
        return setmetatable(dup, getmetatable(value))
    end
    return copy(doc)
end
410
--- compare two documents.
-- Text nodes must be equal strings. Elements must agree on tag, number of
-- children, named attributes and, recursively, every child. The list part of
-- an attribute table is ignored.
-- @param t1 any value
-- @param t2 any value
-- @return true when the two values are equivalent
-- @return false, message describing the first difference found
function _M.compare(t1,t2)
    local ty1 = type(t1)
    local ty2 = type(t2)
    if ty1 ~= ty2 then return false, 'type mismatch' end
    if ty1 == 'string' then
        -- a mismatch must yield false first; returning only the message would be truthy
        if t1 == t2 then return true end
        return false, 'text '..t1..' ~= text '..t2
    end
    if ty1 ~= 'table' then return false, 'not a document' end
    if t1.tag ~= t2.tag then
        return false, 'tag '..tostring(t1.tag)..' ~= tag '..tostring(t2.tag)
    end
    if #t1 ~= #t2 then
        return false, 'size '..#t1..' ~= size '..#t2..' for tag '..tostring(t1.tag)
    end
    -- compare attributes by name only: the numeric list part exists on some
    -- attribute tables and not others, so it must not affect equality
    local a1, a2 = t1.attr or {}, t2.attr or {}
    for k,v in pairs(a1) do
        if type(k) ~= 'number' and a2[k] ~= v then return false, 'mismatch attrib' end
    end
    for k,v in pairs(a2) do
        if type(k) ~= 'number' and a1[k] ~= v then return false, 'mismatch attrib' end
    end
    -- compare children
    for i = 1,#t1 do
        local yes,err = _M.compare(t1[i],t2[i])
        if not yes then return false, err end
    end
    return true
end
438
--- Is this value a document element, i.e. a table with a string `tag` field?
-- @param d any value
-- @return true or false
function _M.is_tag(d)
    if type(d) ~= 'table' then
        return false
    end
    return type(d.tag) == 'string'
end
444
--- Call a function on a document node and, recursively, on its child elements.
-- @param doc the node to start from
-- @param depth_first visit child nodes first, then the current node
-- @param operation a function which will receive the current tag name and current node.
function _M.walk(doc, depth_first, operation)
    local visit_first = not depth_first
    if visit_first then
        operation(doc.tag, doc)
    end
    for _, child in ipairs(doc) do
        if _M.is_tag(child) then
            _M.walk(child, depth_first, operation)
        end
    end
    if not visit_first then
        operation(doc.tag, doc)
    end
end
457
-- entity name -> character, for the named entities handled by unescape
local escapes = {
    amp = "&",
    lt = "<",
    gt = ">",
    apos = "'",
    quot = "\"",
}

-- Replace each `&name;` whose name is in `escapes` with its character;
-- references with other names are left as they are.
local function unescape(str)
    local decoded = str:gsub("&(%a+);", escapes)
    return decoded
end
460
-- Parse the attribute portion of a tag into a name -> value table.
-- Values may be quoted with either ' or " and are unescaped.
-- @param s the text between the tag name and the closing '>'
-- @return a table of attribute values keyed by attribute name
local function parseargs(s)
    local arg = {}
    -- Names accept the characters basic_parse allows in tag names, plus '.'.
    -- With only [%w:], `data-id="1"` would be stored under the key "id".
    s:gsub("([%w:%-_%.]+)%s*=%s*([\"'])(.-)%2", function (w, _, a)
        arg[w] = unescape(a)
    end)
    return arg
end
468
--- Parse a simple XML document using a pure Lua parser based on Roberto Ierusalimschy's original version.
-- Raises an error for mismatched, unexpected or unclosed tags.
-- @param s the XML document to be parsed.
-- @param all_text if true, preserves all whitespace. Otherwise only text containing non-whitespace is included.
-- @return the first element of the document, with the Doc metatable set on every element
function _M.basic_parse(s, all_text)
    local insert, remove = table.insert, table.remove
    local find, sub = string.find, string.sub

    -- append a run of character data to `node`, unless it is whitespace-only
    -- and whitespace is not being preserved
    local function add_text(node, text)
        if all_text or not find(text, "^%s*$") then
            insert(node, unescape(text))
        end
    end

    local root = {} -- anonymous holder for the top-level nodes
    local stack = { root }
    local current = root
    local pos = 1
    -- we're not interested in <?xml version="1.0"?>
    local _, decl_end = find(s, '^%s*<%?[^%?]+%?>%s*')
    if decl_end then pos = decl_end + 1 end

    while true do
        local first, last, closing, label, xarg, selfclosing =
            find(s, "<(%/?)([%w:%-_]+)(.-)(%/?)>", pos)
        if not first then break end
        add_text(current, sub(s, pos, first - 1))
        if selfclosing == "/" then -- empty element tag
            insert(current, setmetatable({tag=label, attr=parseargs(xarg), empty=1}, Doc))
        elseif closing == "" then -- start tag: open a new level
            current = setmetatable({tag=label, attr=parseargs(xarg)}, Doc)
            insert(stack, current)
        else -- end tag: pop the open element and attach it to its parent
            local finished = remove(stack)
            current = stack[#stack]
            if #stack < 1 then
                error("nothing to close with "..label)
            end
            if finished.tag ~= label then
                error("trying to close "..finished.tag.." with "..label)
            end
            insert(current, finished)
        end
        pos = last + 1
    end

    add_text(stack[#stack], sub(s, pos))
    if #stack > 1 then
        error("unclosed "..stack[#stack].tag)
    end
    local res = stack[1]
    -- skip a leading text node, if any, to get at the first element
    return type(res[1])=='string' and res[2] or res[1]
end
518
-- True when `attr` is nil/false or a table with no entries.
-- Declared local so that loading this module does not define a global named `empty`.
local function empty(attr)
    return not attr or not next(attr)
end
522
-- True when `s` is a string (a text node).
-- Declared local so that loading this module does not define a global named `is_text`.
local function is_text(s) return type(s) == 'string' end
-- True when `d` is a table with a non-nil `tag` field (an element node).
-- Declared local so that loading this module does not define a global named `is_element`.
local function is_element(d) return type(d) == 'table' and d.tag ~= nil end
525
-- returns the key,value pair from a table if it has exactly one entry.
-- Returns false when the table has more than one entry, and nil for an empty table.
-- Declared local so that loading this module does not define a global named `has_one_element`.
local function has_one_element(t)
    local key, value = next(t)
    -- a second entry after the first means there is more than one
    if next(t, key) ~= nil then return false end
    return key, value
end
532
-- Convert a value with tostring and keep at most the first 60 characters.
function tostringn(d)
    local text = tostring(d)
    return text:sub(1, 60)
end
536
-- Store a table of captures `tbl` in the result table `res`.
-- Empty capture tables are ignored. If `tbl._` is set it is removed and used
-- as the key under which the captures are stored; otherwise they are appended.
-- A capture table whose only entry is {[0]=value} is reduced to that value.
local function append_capture(res, tbl)
    if empty(tbl) then
        return -- no point in capturing empty tables
    end
    local key = tbl._
    if key then
        tbl._ = nil
        if empty(tbl) then return end
    end
    local captured = tbl
    local only_key, only_value = has_one_element(tbl)
    if only_key == 0 then
        captured = only_value
    end
    if key then
        res[key] = captured
    else
        t_insert(res, captured)
    end
end
555
-- Record `value` in `res` under the name given by a capture pattern.
-- The first character of `pat` (the '$') is dropped; an all-digit name such as
-- "1" is turned into a number so that it addresses an array slot.
-- Always returns true.
local function capture_attrib(res, pat, value)
    local name = pat:sub(2)
    local slot = name
    if name:find '^%d+$' then
        slot = tonumber(name)
    end
    res[slot] = value
    return true
end
564
-- Match a data node `d` against a pattern node `pat`, storing captures in `res`.
-- Strings (attribute values and text) match by equality, except that a pattern
-- starting with '$' always matches and captures the value. A nil `d` is treated
-- as the empty string. Elements match when the tags are equal, every pattern
-- attribute matches, and the pattern's children match the data's children in
-- order, allowing unmatched data children in between.
-- When `keep_going` is true and this node does not match, its child elements
-- are searched for a match as well.
local match
function match(d, pat, res, keep_going)
    if d == nil then d = '' end

    if type(d) == 'string' then
        if type(pat) ~= 'string' then return false end
        if pat:find '^%$' then
            return capture_attrib(res, pat, d)
        end
        return d == pat
    end

    -- from here on, d is an element node
    local ok = true
    if d.tag ~= pat.tag then
        ok = false
    else
        -- the attributes named in the pattern must match as well
        local wanted = pat.attr
        if not empty(wanted) then
            if empty(d.attr) then
                ok = false
            else
                for name, expected in pairs(wanted) do
                    if not match(d.attr[name], expected, res) then
                        ok = false
                        break
                    end
                end
            end
        end
        -- the pattern may have child nodes. We match partially, so that
        -- {P1,P2} shall match {X,P1,X,X,P2,..}
        if ok and #pat > 0 then
            local pi, di = 1, 1 -- positions in the pattern / data child lists
            local function advance()
                di = di + 1 -- next child of data
                if is_text(d[di]) then di = di + 1 end
                return di <= #d
            end
            repeat
                local p = pat[pi]
                -- repeated {{<...>}} patterns are tried against each remaining
                -- data child, collecting one capture table per match
                if is_element(p) and p.repeated then
                    -- NOTE(review): `found` is only ever assigned false, so the
                    -- second exit condition below never holds and the loop runs
                    -- until the data children are exhausted.
                    local found
                    repeat
                        local captured = {}
                        ok = match(d[di], p, captured, false)
                        if ok then
                            found = false
                            append_capture(res, captured)
                        end
                    until not advance() or (found and not ok)
                    pi = pi + 1
                else
                    ok = match(d[di], p, res, false)
                    if ok then pi = pi + 1 end
                end
            until not advance() or pi > #pat -- run out of elements or patterns to match
            -- if every element in our pattern matched ok, then it's been a successful match
            if pi > #pat then return true end
        end
        if ok then return true end
    end

    -- keep going anyway - look at the children!
    if keep_going then
        for child in d:childtags() do
            ok = match(child, pat, res, keep_going)
            if ok then break end
        end
    end
    return ok
end
636
--- Match this document against a pattern and collect the captures.
-- A child element that sits between a text node containing "{{" and a text
-- node containing "}}" is marked as repeated, and the two text nodes are
-- removed from the pattern.
-- @param pat a pattern document, or a string which is parsed with the basic parser
-- @return a table of captured values
-- @return whether the match succeeded
function Doc:match(pat)
    if is_text(pat) then
        pat = _M.parse(pat, false, true)
    end

    local function mark_repeated(_, node)
        local before, inner, after = node[1], node[2], node[3]
        if is_text(before) and is_element(inner) and is_text(after)
        and before:find '%s*{{' and after:find '}}%s*' then
            t_remove(node, 1) -- drop the "{{" text; the element moves to slot 1
            t_remove(node, 2) -- drop the "}}" text, now at slot 2
            node[1].repeated = true
        end
    end
    _M.walk(pat, false, mark_repeated)

    local res = {}
    local ret = match(self, pat, res, true)
    return res, ret
end
654
655
--- Split a string into a list of strings separated by a delimiter.
-- Empty fields are not returned.
-- @param s the string to split
-- @param re the delimiter, inserted as-is into a Lua character class
-- @return a list of the non-empty pieces
function split(s, re)
    local pieces = {}
    local pattern = '[^' .. re .. ']+'
    for piece in s:gmatch(pattern) do
        pieces[#pieces + 1] = piece
    end
    return pieces
end
663
-- Publish the module under its dotted name in the global table, creating the
-- intermediate tables as needed, unless the global __PRIVATE_REQUIRE is set.
-- @param name the module name (the first value of the chunk's `...`)
-- @param mod the module table
-- @return mod
local function export(name,mod)
    -- `name` is nil when the chunk is run without arguments; there is nothing
    -- to publish under in that case, so just hand back the module.
    if type(name) ~= 'string' then return mod end
    local rawget,rawset = _G.rawget,_G.rawset
    if not rawget(_G,'__PRIVATE_REQUIRE') then
        local path = split(name,'%.')
        -- a name with no usable components would make the final rawset use a nil key
        if #path == 0 then return mod end
        local T = _G
        for i = 1,#path-1 do
            local p = rawget(T,path[i])
            if not p then
                p = {}
                rawset(T,path[i],p)
            end
            T = p
        end
        rawset(T,path[#path],mod)
    end
    return mod
end

return export(...,_M)