dom-parser.js 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. var conventions = require("./conventions");
  2. var dom = require('./dom')
  3. var entities = require('./entities');
  4. var sax = require('./sax');
  5. var DOMImplementation = dom.DOMImplementation;
  6. var NAMESPACE = conventions.NAMESPACE;
  7. var ParseError = sax.ParseError;
  8. var XMLReader = sax.XMLReader;
  9. /**
  10. * Normalizes line ending according to https://www.w3.org/TR/xml11/#sec-line-ends:
  11. *
  12. * > XML parsed entities are often stored in computer files which,
  13. * > for editing convenience, are organized into lines.
  14. * > These lines are typically separated by some combination
  15. * > of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
  16. * >
  17. * > To simplify the tasks of applications, the XML processor must behave
  18. * > as if it normalized all line breaks in external parsed entities (including the document entity)
  19. * > on input, before parsing, by translating all of the following to a single #xA character:
  20. * >
  21. * > 1. the two-character sequence #xD #xA
  22. * > 2. the two-character sequence #xD #x85
  23. * > 3. the single character #x85
  24. * > 4. the single character #x2028
  25. * > 5. any #xD character that is not immediately followed by #xA or #x85.
  26. *
  27. * @param {string} input
  28. * @returns {string}
  29. */
  30. function normalizeLineEndings(input) {
  31. return input
  32. .replace(/\r[\n\u0085]/g, '\n')
  33. .replace(/[\r\u0085\u2028]/g, '\n')
  34. }
  35. /**
  36. * @typedef Locator
  37. * @property {number} [columnNumber]
  38. * @property {number} [lineNumber]
  39. */
  40. /**
  41. * @typedef DOMParserOptions
  42. * @property {DOMHandler} [domBuilder]
  43. * @property {Function} [errorHandler]
  44. * @property {(string) => string} [normalizeLineEndings] used to replace line endings before parsing
  45. * defaults to `normalizeLineEndings`
  46. * @property {Locator} [locator]
  47. * @property {Record<string, string>} [xmlns]
  48. *
  49. * @see normalizeLineEndings
  50. */
  51. /**
  52. * The DOMParser interface provides the ability to parse XML or HTML source code
  53. * from a string into a DOM `Document`.
  54. *
  55. * _xmldom is different from the spec in that it allows an `options` parameter,
  56. * to override the default behavior._
  57. *
  58. * @param {DOMParserOptions} [options]
  59. * @constructor
  60. *
  61. * @see https://developer.mozilla.org/en-US/docs/Web/API/DOMParser
  62. * @see https://html.spec.whatwg.org/multipage/dynamic-markup-insertion.html#dom-parsing-and-serialization
  63. */
  64. function DOMParser(options){
  65. this.options = options ||{locator:{}};
  66. }
  67. DOMParser.prototype.parseFromString = function(source,mimeType){
  68. var options = this.options;
  69. var sax = new XMLReader();
  70. var domBuilder = options.domBuilder || new DOMHandler();//contentHandler and LexicalHandler
  71. var errorHandler = options.errorHandler;
  72. var locator = options.locator;
  73. var defaultNSMap = options.xmlns||{};
  74. var isHTML = /\/x?html?$/.test(mimeType);//mimeType.toLowerCase().indexOf('html') > -1;
  75. var entityMap = isHTML ? entities.HTML_ENTITIES : entities.XML_ENTITIES;
  76. if(locator){
  77. domBuilder.setDocumentLocator(locator)
  78. }
  79. sax.errorHandler = buildErrorHandler(errorHandler,domBuilder,locator);
  80. sax.domBuilder = options.domBuilder || domBuilder;
  81. if(isHTML){
  82. defaultNSMap[''] = NAMESPACE.HTML;
  83. }
  84. defaultNSMap.xml = defaultNSMap.xml || NAMESPACE.XML;
  85. var normalize = options.normalizeLineEndings || normalizeLineEndings;
  86. if (source && typeof source === 'string') {
  87. sax.parse(
  88. normalize(source),
  89. defaultNSMap,
  90. entityMap
  91. )
  92. } else {
  93. sax.errorHandler.error('invalid doc source')
  94. }
  95. return domBuilder.doc;
  96. }
  97. function buildErrorHandler(errorImpl,domBuilder,locator){
  98. if(!errorImpl){
  99. if(domBuilder instanceof DOMHandler){
  100. return domBuilder;
  101. }
  102. errorImpl = domBuilder ;
  103. }
  104. var errorHandler = {}
  105. var isCallback = errorImpl instanceof Function;
  106. locator = locator||{}
  107. function build(key){
  108. var fn = errorImpl[key];
  109. if(!fn && isCallback){
  110. fn = errorImpl.length == 2?function(msg){errorImpl(key,msg)}:errorImpl;
  111. }
  112. errorHandler[key] = fn && function(msg){
  113. fn('[xmldom '+key+']\t'+msg+_locator(locator));
  114. }||function(){};
  115. }
  116. build('warning');
  117. build('error');
  118. build('fatalError');
  119. return errorHandler;
  120. }
  121. //console.log('#\n\n\n\n\n\n\n####')
  122. /**
  123. * +ContentHandler+ErrorHandler
  124. * +LexicalHandler+EntityResolver2
  125. * -DeclHandler-DTDHandler
  126. *
  127. * DefaultHandler:EntityResolver, DTDHandler, ContentHandler, ErrorHandler
  128. * DefaultHandler2:DefaultHandler,LexicalHandler, DeclHandler, EntityResolver2
  129. * @link http://www.saxproject.org/apidoc/org/xml/sax/helpers/DefaultHandler.html
  130. */
  131. function DOMHandler() {
  132. this.cdata = false;
  133. }
  134. function position(locator,node){
  135. node.lineNumber = locator.lineNumber;
  136. node.columnNumber = locator.columnNumber;
  137. }
  138. /**
  139. * @see org.xml.sax.ContentHandler#startDocument
  140. * @link http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
  141. */
  142. DOMHandler.prototype = {
  143. startDocument : function() {
  144. this.doc = new DOMImplementation().createDocument(null, null, null);
  145. if (this.locator) {
  146. this.doc.documentURI = this.locator.systemId;
  147. }
  148. },
  149. startElement:function(namespaceURI, localName, qName, attrs) {
  150. var doc = this.doc;
  151. var el = doc.createElementNS(namespaceURI, qName||localName);
  152. var len = attrs.length;
  153. appendElement(this, el);
  154. this.currentElement = el;
  155. this.locator && position(this.locator,el)
  156. for (var i = 0 ; i < len; i++) {
  157. var namespaceURI = attrs.getURI(i);
  158. var value = attrs.getValue(i);
  159. var qName = attrs.getQName(i);
  160. var attr = doc.createAttributeNS(namespaceURI, qName);
  161. this.locator &&position(attrs.getLocator(i),attr);
  162. attr.value = attr.nodeValue = value;
  163. el.setAttributeNode(attr)
  164. }
  165. },
  166. endElement:function(namespaceURI, localName, qName) {
  167. var current = this.currentElement
  168. var tagName = current.tagName;
  169. this.currentElement = current.parentNode;
  170. },
  171. startPrefixMapping:function(prefix, uri) {
  172. },
  173. endPrefixMapping:function(prefix) {
  174. },
  175. processingInstruction:function(target, data) {
  176. var ins = this.doc.createProcessingInstruction(target, data);
  177. this.locator && position(this.locator,ins)
  178. appendElement(this, ins);
  179. },
  180. ignorableWhitespace:function(ch, start, length) {
  181. },
  182. characters:function(chars, start, length) {
  183. chars = _toString.apply(this,arguments)
  184. //console.log(chars)
  185. if(chars){
  186. if (this.cdata) {
  187. var charNode = this.doc.createCDATASection(chars);
  188. } else {
  189. var charNode = this.doc.createTextNode(chars);
  190. }
  191. if(this.currentElement){
  192. this.currentElement.appendChild(charNode);
  193. }else if(/^\s*$/.test(chars)){
  194. this.doc.appendChild(charNode);
  195. //process xml
  196. }
  197. this.locator && position(this.locator,charNode)
  198. }
  199. },
  200. skippedEntity:function(name) {
  201. },
  202. endDocument:function() {
  203. this.doc.normalize();
  204. },
  205. setDocumentLocator:function (locator) {
  206. if(this.locator = locator){// && !('lineNumber' in locator)){
  207. locator.lineNumber = 0;
  208. }
  209. },
  210. //LexicalHandler
  211. comment:function(chars, start, length) {
  212. chars = _toString.apply(this,arguments)
  213. var comm = this.doc.createComment(chars);
  214. this.locator && position(this.locator,comm)
  215. appendElement(this, comm);
  216. },
  217. startCDATA:function() {
  218. //used in characters() methods
  219. this.cdata = true;
  220. },
  221. endCDATA:function() {
  222. this.cdata = false;
  223. },
  224. startDTD:function(name, publicId, systemId) {
  225. var impl = this.doc.implementation;
  226. if (impl && impl.createDocumentType) {
  227. var dt = impl.createDocumentType(name, publicId, systemId);
  228. this.locator && position(this.locator,dt)
  229. appendElement(this, dt);
  230. this.doc.doctype = dt;
  231. }
  232. },
  233. /**
  234. * @see org.xml.sax.ErrorHandler
  235. * @link http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
  236. */
  237. warning:function(error) {
  238. console.warn('[xmldom warning]\t'+error,_locator(this.locator));
  239. },
  240. error:function(error) {
  241. console.error('[xmldom error]\t'+error,_locator(this.locator));
  242. },
  243. fatalError:function(error) {
  244. throw new ParseError(error, this.locator);
  245. }
  246. }
  247. function _locator(l){
  248. if(l){
  249. return '\n@'+(l.systemId ||'')+'#[line:'+l.lineNumber+',col:'+l.columnNumber+']'
  250. }
  251. }
  252. function _toString(chars,start,length){
  253. if(typeof chars == 'string'){
  254. return chars.substr(start,length)
  255. }else{//java sax connect width xmldom on rhino(what about: "? && !(chars instanceof String)")
  256. if(chars.length >= start+length || start){
  257. return new java.lang.String(chars,start,length)+'';
  258. }
  259. return chars;
  260. }
  261. }
  262. /*
  263. * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
  264. * used method of org.xml.sax.ext.LexicalHandler:
  265. * #comment(chars, start, length)
  266. * #startCDATA()
  267. * #endCDATA()
  268. * #startDTD(name, publicId, systemId)
  269. *
  270. *
  271. * IGNORED method of org.xml.sax.ext.LexicalHandler:
  272. * #endDTD()
  273. * #startEntity(name)
  274. * #endEntity(name)
  275. *
  276. *
  277. * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
  278. * IGNORED method of org.xml.sax.ext.DeclHandler
  279. * #attributeDecl(eName, aName, type, mode, value)
  280. * #elementDecl(name, model)
  281. * #externalEntityDecl(name, publicId, systemId)
  282. * #internalEntityDecl(name, value)
  283. * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
  284. * IGNORED method of org.xml.sax.EntityResolver2
  285. * #resolveEntity(String name,String publicId,String baseURI,String systemId)
  286. * #resolveEntity(publicId, systemId)
  287. * #getExternalSubset(name, baseURI)
  288. * @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
  289. * IGNORED method of org.xml.sax.DTDHandler
  290. * #notationDecl(name, publicId, systemId) {};
  291. * #unparsedEntityDecl(name, publicId, systemId, notationName) {};
  292. */
  293. "endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl".replace(/\w+/g,function(key){
  294. DOMHandler.prototype[key] = function(){return null}
  295. })
  296. /* Private static helpers treated below as private instance methods, so don't need to add these to the public API; we might use a Relator to also get rid of non-standard public properties */
  297. function appendElement (hander,node) {
  298. if (!hander.currentElement) {
  299. hander.doc.appendChild(node);
  300. } else {
  301. hander.currentElement.appendChild(node);
  302. }
  303. }//appendChild and setAttributeNS are preformance key
  304. exports.__DOMHandler = DOMHandler;
  305. exports.normalizeLineEndings = normalizeLineEndings;
  306. exports.DOMParser = DOMParser;