parser.js 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. /**
  2. * Simple HTML Parser
  3. *
  4. * @author Zongmin Lei<leizongmin@gmail.com>
  5. */
  6. var _ = require("./util");
  /**
   * Extract the lower-cased tag name from the source text of a single tag.
   *
   * A leading "/" (closing tag) and a trailing "/" (self-closing tag) are
   * removed from the result.
   *
   * @param {String} html e.g. '<a href="#">'
   * @return {String}
   */
  function getTagName(html) {
    var firstSpace = _.spaceIndex(html);
    // When `_.spaceIndex` reports -1 the name is everything between the first
    // and the last character; otherwise it stops at the reported index.
    var end = firstSpace === -1 ? -1 : firstSpace + 1;
    var name = _.trim(html.slice(1, end)).toLowerCase();
    if (name.charAt(0) === "/") {
      name = name.slice(1);
    }
    if (name.charAt(name.length - 1) === "/") {
      name = name.slice(0, -1);
    }
    return name;
  }
  /**
   * Tell whether the source text of a tag is a closing tag.
   *
   * @param {String} html e.g. '</a>'
   * @return {Boolean} true when the text begins with "</"
   */
  function isClosing(html) {
    return html.charAt(0) === "<" && html.charAt(1) === "/";
  }
  /**
   * Scan `html` character by character, pass every tag to `onTag` and every
   * stretch of non-tag text to `escapeHtml`, and return the concatenated results.
   *
   * A tag that is still open when the last character of the input is reached is
   * handed to `onTag` as well, even though it has no closing ">".
   *
   * @param {String} html
   * @param {Function} onTag e.g. function (sourcePosition, position, tag, html, isClosing);
   *   its return value is appended to the output
   * @param {Function} escapeHtml called with each piece of text outside of tags
   * @return {String}
   */
  function parseTag(html, onTag, escapeHtml) {
    "use strict";
    var output = "";
    var textStart = 0; // start of the input that has not been emitted yet
    var tagOpenAt = false; // index of the pending "<", or false outside a tag
    var activeQuote = false; // quote character of the open attribute value, or false
    var total = html.length;
    var pos;

    scan: for (pos = 0; pos < total; pos++) {
      var ch = html.charAt(pos);

      // Outside of a tag only "<" is interesting.
      if (tagOpenAt === false) {
        if (ch === "<") {
          tagOpenAt = pos;
        }
        continue;
      }

      // Inside a quoted attribute value only the matching quote is interesting.
      if (activeQuote !== false) {
        if (ch === activeQuote) {
          activeQuote = false;
        }
        continue;
      }

      // A second "<" before the tag closed: everything so far is plain text
      // and a new candidate tag starts here.
      if (ch === "<") {
        output += escapeHtml(html.slice(textStart, pos));
        tagOpenAt = pos;
        textStart = pos;
        continue;
      }

      // End of the tag, or end of the input while a tag is still open.
      if (ch === ">" || pos === total - 1) {
        output += escapeHtml(html.slice(textStart, tagOpenAt));
        var tagHtml = html.slice(tagOpenAt, pos + 1);
        var tagName = getTagName(tagHtml);
        output += onTag(
          tagOpenAt,
          output.length,
          tagName,
          tagHtml,
          isClosing(tagHtml)
        );
        textStart = pos + 1;
        tagOpenAt = false;
        continue;
      }

      // A quote opens an attribute value only when, skipping whitespace
      // backwards, the previous character is "=".
      if (ch === '"' || ch === "'") {
        var back = 1;
        var prev = html.charAt(pos - back);
        while (prev.trim() === "" || prev === "=") {
          if (prev === "=") {
            activeQuote = ch;
            continue scan;
          }
          prev = html.charAt(pos - ++back);
        }
      }
    }

    // Whatever was never consumed by a tag is plain text.
    if (textStart < total) {
      output += escapeHtml(html.slice(textStart));
    }
    return output;
  }
  // Every character matched by this pattern is removed from attribute names.
  var REGEXP_ILLEGAL_ATTR_NAME = /[^a-zA-Z0-9\\_:.-]/gim;

  /**
   * Parse the attribute part of a tag and return the processed attributes.
   *
   * Each attribute found is passed to `onAttr(name, value)`; the name is trimmed,
   * stripped of illegal characters and lower-cased, and the value defaults to "".
   * Truthy return values of `onAttr` are joined with a single space.
   *
   * @param {String} html e.g. `href="#" target="_blank"`
   * @param {Function} onAttr e.g. `function (name, value)`
   * @return {String}
   */
  function parseAttr(html, onAttr) {
    "use strict";
    var lastPos = 0; // start of the attribute name or value being collected
    var lastMarkPos = 0; // index of the quote that opens the current value
    var retAttrs = [];
    var tmpName = false; // pending attribute name once "=" was seen, else false
    var len = html.length;
    // Replacing whitespace with spaces is idempotent and keeps the length, so
    // it only has to happen once instead of on every whitespace character.
    var whitespaceNormalized = false;

    function addAttr(name, value) {
      name = _.trim(name);
      name = name.replace(REGEXP_ILLEGAL_ATTR_NAME, "").toLowerCase();
      if (name.length < 1) return;
      var ret = onAttr(name, value || "");
      if (ret) retAttrs.push(ret);
    }

    // Analyse the input one character at a time.
    for (var i = 0; i < len; i++) {
      var c = html.charAt(i);
      var v, j;

      // "=" ends the attribute name and starts the value.
      if (tmpName === false && c === "=") {
        tmpName = html.slice(lastPos, i);
        lastPos = i + 1;
        lastMarkPos =
          html.charAt(lastPos) === '"' || html.charAt(lastPos) === "'"
            ? lastPos
            : findNextQuotationMark(html, i + 1);
        continue;
      }

      // At the opening quote: take everything up to the matching quote.
      if (tmpName !== false) {
        if (i === lastMarkPos) {
          j = html.indexOf(c, i + 1);
          if (j === -1) {
            // Unterminated quote: the remainder is handled after the loop.
            break;
          } else {
            v = _.trim(html.slice(lastMarkPos + 1, j));
            addAttr(tmpName, v);
            tmpName = false;
            i = j;
            lastPos = i + 1;
            continue;
          }
        }
      }

      if (/\s/.test(c)) {
        if (!whitespaceNormalized) {
          html = html.replace(/\s/g, " ");
          whitespaceNormalized = true;
        }
        if (tmpName === false) {
          j = findNextEqual(html, i);
          if (j === -1) {
            // No "=" follows: this is an attribute without a value.
            v = _.trim(html.slice(lastPos, i));
            addAttr(v);
            tmpName = false;
            lastPos = i + 1;
            continue;
          } else {
            // Jump to just before the "=" so the next iteration handles it.
            i = j - 1;
            continue;
          }
        } else {
          j = findBeforeEqual(html, i - 1);
          if (j === -1) {
            // Whitespace ends an unquoted value.
            v = _.trim(html.slice(lastPos, i));
            v = stripQuoteWrap(v);
            addAttr(tmpName, v);
            tmpName = false;
            lastPos = i + 1;
            continue;
          } else {
            // Whitespace directly after "=": keep looking for the value.
            continue;
          }
        }
      }
    }

    // Flush whatever is left after the last separator.
    if (lastPos < html.length) {
      if (tmpName === false) {
        addAttr(html.slice(lastPos));
      } else {
        addAttr(tmpName, stripQuoteWrap(_.trim(html.slice(lastPos))));
      }
    }
    return _.trim(retAttrs.join(" "));
  }
  /**
   * Starting at index `i`, skip spaces and check whether the next character is "=".
   *
   * @param {String} str
   * @param {Number} i index to start scanning from
   * @return {Number} index of the "=", or -1 when a different character or the
   *   end of the string comes first
   */
  function findNextEqual(str, i) {
    for (; i < str.length; i++) {
      var c = str[i];
      if (c === " ") continue;
      if (c === "=") return i;
      return -1;
    }
    // Only spaces (or nothing) were left: report "not found" explicitly rather
    // than falling through with `undefined`.
    return -1;
  }
  /**
   * Starting at index `i`, skip spaces and check whether the next character is a
   * single or double quotation mark.
   *
   * @param {String} str
   * @param {Number} i index to start scanning from
   * @return {Number} index of the quotation mark, or -1 when a different
   *   character or the end of the string comes first
   */
  function findNextQuotationMark(str, i) {
    for (; i < str.length; i++) {
      var c = str[i];
      if (c === " ") continue;
      if (c === "'" || c === '"') return i;
      return -1;
    }
    // Only spaces (or nothing) were left: report "not found" explicitly rather
    // than falling through with `undefined`.
    return -1;
  }
  /**
   * Starting at index `i` and moving backwards, skip spaces and check whether the
   * previous character is "=".
   *
   * @param {String} str
   * @param {Number} i index to start scanning backwards from
   * @return {Number} index of the "=", or -1 when a different character or the
   *   start of the string comes first
   */
  function findBeforeEqual(str, i) {
    // Index 0 is part of the string and has to be inspected as well.
    for (; i >= 0; i--) {
      var c = str[i];
      if (c === " ") continue;
      if (c === "=") return i;
      return -1;
    }
    // Only spaces (or nothing) were found: report "not found" explicitly rather
    // than falling through with `undefined`.
    return -1;
  }
  /**
   * Tell whether `text` starts and ends with the same kind of quotation mark
   * (both double or both single).
   *
   * @param {String} text
   * @return {Boolean}
   */
  function isQuoteWrapString(text) {
    var first = text[0];
    var last = text[text.length - 1];
    var doubleQuoted = first === '"' && last === '"';
    var singleQuoted = first === "'" && last === "'";
    return doubleQuoted || singleQuoted;
  }
  /**
   * Remove the first and last character of `text` when `isQuoteWrapString`
   * reports it as quote-wrapped; otherwise return `text` unchanged.
   *
   * @param {String} text
   * @return {String}
   */
  function stripQuoteWrap(text) {
    return isQuoteWrapString(text) ? text.slice(1, -1) : text;
  }
  // Public API of this module.
  exports.parseAttr = parseAttr;
  exports.parseTag = parseTag;