123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257 |
- /**
- * Simple HTML Parser
- *
- * @author Zongmin Lei<leizongmin@gmail.com>
- */
- var _ = require("./util");
/**
 * Extract the lower-cased tag name from the source text of a single tag.
 *
 * The name is whatever sits between the leading "<" and either the first
 * whitespace character or the end of the tag. A leading "/" (closing tag)
 * and a trailing "/" (self-closing tag) are removed from the result.
 *
 * @param {String} html source of one tag, e.g. '<a href="#">', '</a>' or
 *   '<br/>'; the final ">" may be missing when the input ended inside the tag
 * @return {String} the tag name in lower case, e.g. 'a'
 */
function getTagName(html) {
  // `_.spaceIndex` is treated as "index of the first whitespace, or -1 if
  // there is none" (project helper; its definition is not in this file).
  var i = _.spaceIndex(html);
  var end;
  if (i !== -1) {
    // Cut just after the first whitespace; the trim below removes it again.
    end = i + 1;
  } else if (html.slice(-1) === ">") {
    // No attributes: drop the closing ">".
    end = -1;
  } else {
    // Unterminated tag such as "<div": there is no ">" to drop, so keep the
    // last character instead of truncating the name to "di".
    end = html.length;
  }
  var tagName = _.trim(html.slice(1, end)).toLowerCase();
  if (tagName.slice(0, 1) === "/") tagName = tagName.slice(1);
  if (tagName.slice(-1) === "/") tagName = tagName.slice(0, -1);
  return tagName;
}
/**
 * Tell whether the source text of a tag is a closing tag, i.e. whether it
 * begins with "</".
 *
 * @param {String} html source of one tag, e.g. '</a>'
 * @return {Boolean} true when the text starts with "</", false otherwise
 */
function isClosing(html) {
  return html.charAt(0) === "<" && html.charAt(1) === "/";
}
/**
 * Walk through `html` one character at a time, handing every tag to `onTag`
 * and every stretch of non-tag text to `escapeHtml`, and concatenate what
 * the two callbacks return.
 *
 * A tag runs from a "<" to the next ">" that is not inside a quoted
 * attribute value, or to the end of the input if no such ">" exists. A
 * quote character only opens a quoted value when the nearest preceding
 * non-whitespace character is "=". A second "<" met inside a tag (outside
 * quotes) restarts the tag there; everything before it is treated as text.
 *
 * @param {String} html input markup
 * @param {Function} onTag called as
 *   `onTag(sourcePosition, position, tag, html, isClosing)` where
 *   `sourcePosition` is the index of the tag's "<" in the input, `position`
 *   is the length of the output produced so far, `tag` is the tag name,
 *   `html` is the tag's source text and `isClosing` tells whether it starts
 *   with "</"; its return value is appended to the output
 * @param {Function} escapeHtml called with each piece of non-tag text; its
 *   return value is appended to the output
 * @return {String} the processed markup
 */
function parseTag(html, onTag, escapeHtml) {
  "use strict";

  var output = "";
  var total = html.length;
  var flushedTo = 0; // everything before this index is already in `output`
  var tagOpenAt = false; // index of the pending "<", or false outside a tag
  var openQuote = false; // quote character of the open value, or false

  for (var pos = 0; pos < total; pos++) {
    var ch = html.charAt(pos);

    // Plain text: only a "<" is of interest.
    if (tagOpenAt === false) {
      if (ch === "<") tagOpenAt = pos;
      continue;
    }

    // Inside a quoted attribute value: wait for the matching quote.
    if (openQuote !== false) {
      if (ch === openQuote) openQuote = false;
      continue;
    }

    // A new "<" before the tag was closed: what came before is just text.
    if (ch === "<") {
      output += escapeHtml(html.slice(flushedTo, pos));
      tagOpenAt = pos;
      flushedTo = pos;
      continue;
    }

    // End of the tag, either a real ">" or the end of the input.
    if (ch === ">" || pos === total - 1) {
      output += escapeHtml(html.slice(flushedTo, tagOpenAt));
      var tagSource = html.slice(tagOpenAt, pos + 1);
      var tagName = getTagName(tagSource);
      output += onTag(
        tagOpenAt,
        output.length,
        tagName,
        tagSource,
        isClosing(tagSource)
      );
      flushedTo = pos + 1;
      tagOpenAt = false;
      continue;
    }

    if (ch === '"' || ch === "'") {
      // Step back over whitespace; the tag's own "<" lies before `pos`, so
      // the walk always stops on a real character.
      var back = pos - 1;
      while (html.charAt(back).trim() === "") back--;
      if (html.charAt(back) === "=") openQuote = ch;
    }
  }

  // Whatever is left (trailing text or an unfinished tag) is escaped as text.
  if (flushedTo < total) {
    output += escapeHtml(html.slice(flushedTo));
  }
  return output;
}
// Matches every character that is not allowed in an attribute name: anything
// other than ASCII letters, digits, backslash, "_", ":", "." and "-".
var REGEXP_ILLEGAL_ATTR_NAME = /[^a-zA-Z0-9\\_:.-]/gim;

/**
 * Split an attribute string into name/value pairs, pass each pair to
 * `onAttr`, and join the truthy results with single spaces.
 *
 * Values may be double-quoted, single-quoted or unquoted, and whitespace is
 * allowed around "=". Attributes without a value are reported with an empty
 * string as value. Names are trimmed, stripped of characters matched by
 * REGEXP_ILLEGAL_ATTR_NAME and lower-cased; names that end up empty are
 * skipped.
 *
 * Note: the first whitespace character met outside a quoted value causes
 * every whitespace character in the remaining working copy of the input
 * (including those inside later quoted values) to be turned into a plain
 * space.
 *
 * @param {String} html e.g. `href="#" target="_blank"`
 * @param {Function} onAttr called as `onAttr(name, value)`; a truthy return
 *   value is kept as that attribute's output, a falsy one drops it
 * @return {String} the kept results joined by " " and trimmed
 */
function parseAttr(html, onAttr) {
  "use strict";

  var WHITESPACE = /\s/;
  var lastPos = 0; // start of the name or unquoted value being scanned
  var lastMarkPos = 0; // index of the opening quote of the pending value
  var retAttrs = [];
  var tmpName = false; // raw name once its "=" was seen, false otherwise
  var len = html.length;
  var normalized = false; // whether whitespace was already turned into " "

  function addAttr(name, value) {
    name = _.trim(name);
    name = name.replace(REGEXP_ILLEGAL_ATTR_NAME, "").toLowerCase();
    if (name.length < 1) return;
    var ret = onAttr(name, value || "");
    if (ret) retAttrs.push(ret);
  }

  // Examine the input character by character.
  for (var i = 0; i < len; i++) {
    var c = html.charAt(i);
    var v, j;

    // "=" ends a name; remember where a quoted value would start.
    if (tmpName === false && c === "=") {
      tmpName = html.slice(lastPos, i);
      lastPos = i + 1;
      lastMarkPos =
        html.charAt(lastPos) === '"' || html.charAt(lastPos) === "'"
          ? lastPos
          : findNextQuotationMark(html, i + 1);
      continue;
    }

    // Opening quote of the pending value: jump to the matching quote.
    if (tmpName !== false && i === lastMarkPos) {
      j = html.indexOf(c, i + 1);
      if (j === -1) {
        // Unclosed quote: the rest is handled after the loop.
        break;
      }
      v = _.trim(html.slice(lastMarkPos + 1, j));
      addAttr(tmpName, v);
      tmpName = false;
      i = j;
      lastPos = i + 1;
      continue;
    }

    if (WHITESPACE.test(c)) {
      // The helpers below only recognise " ", so turn every whitespace
      // character into a space. One pass is enough: the result contains no
      // other whitespace, and repeating the replace for each whitespace
      // character made the scan quadratic.
      if (!normalized) {
        html = html.replace(/\s/g, " ");
        normalized = true;
      }

      if (tmpName === false) {
        j = findNextEqual(html, i);
        if (j === -1) {
          // Whitespace ends a name that has no value.
          v = _.trim(html.slice(lastPos, i));
          addAttr(v);
          lastPos = i + 1;
        } else {
          // Whitespace before "=": resume at the "=" itself.
          i = j - 1;
        }
      } else {
        j = findBeforeEqual(html, i - 1);
        if (j === -1) {
          // Whitespace ends an unquoted value.
          v = _.trim(html.slice(lastPos, i));
          v = stripQuoteWrap(v);
          addAttr(tmpName, v);
          tmpName = false;
          lastPos = i + 1;
        }
        // Otherwise the whitespace sits between "=" and the value: skip it.
      }
    }
  }

  // Flush whatever is still pending at the end of the input.
  if (lastPos < html.length) {
    if (tmpName === false) {
      addAttr(html.slice(lastPos));
    } else {
      addAttr(tmpName, stripQuoteWrap(_.trim(html.slice(lastPos))));
    }
  }

  return _.trim(retAttrs.join(" "));
}
/**
 * Look for an "=" at or after index `i`, skipping over spaces only.
 *
 * @param {String} str
 * @param {Number} i index at which to start scanning (inclusive)
 * @return {Number} index of the "=", or -1 when the first non-space
 *   character is something else or the string ends first
 */
function findNextEqual(str, i) {
  for (; i < str.length; i++) {
    var c = str[i];
    if (c === " ") continue;
    if (c === "=") return i;
    return -1;
  }
  // Only spaces (or nothing) were left: report "not found" explicitly rather
  // than falling through with `undefined`, which callers would turn into NaN.
  return -1;
}
/**
 * Look for a single or double quote at or after index `i`, skipping over
 * spaces only.
 *
 * @param {String} str
 * @param {Number} i index at which to start scanning (inclusive)
 * @return {Number} index of the quote, or -1 when the first non-space
 *   character is something else or the string ends first
 */
function findNextQuotationMark(str, i) {
  for (; i < str.length; i++) {
    var c = str[i];
    if (c === " ") continue;
    if (c === "'" || c === '"') return i;
    return -1;
  }
  // Only spaces (or nothing) were left: return -1 instead of `undefined`.
  return -1;
}
/**
 * Look backwards for an "=" at or before index `i`, skipping over spaces
 * only.
 *
 * @param {String} str
 * @param {Number} i index at which to start scanning (inclusive)
 * @return {Number} index of the "=", or -1 when the first non-space
 *   character met is something else or the start of the string is passed
 */
function findBeforeEqual(str, i) {
  // Index 0 is part of the string and must be inspected as well.
  for (; i >= 0; i--) {
    var c = str[i];
    if (c === " ") continue;
    if (c === "=") return i;
    return -1;
  }
  // Only spaces (or nothing) were found: return -1 instead of `undefined`.
  return -1;
}
/**
 * Tell whether `text` begins and ends with the same quote character,
 * either `"` or `'`.
 *
 * @param {String} text
 * @return {Boolean} true when the first and last characters are the same
 *   kind of quote, false otherwise
 */
function isQuoteWrapString(text) {
  var first = text[0];
  var last = text[text.length - 1];
  return (first === '"' || first === "'") && first === last;
}
/**
 * Remove one pair of surrounding quotes from `text` when
 * `isQuoteWrapString` reports that it is quote-wrapped; otherwise return
 * the text unchanged.
 *
 * @param {String} text
 * @return {String} the text without its first and last character, or the
 *   original text
 */
function stripQuoteWrap(text) {
  return isQuoteWrapString(text) ? text.slice(1, -1) : text;
}
// Public API of this module.
exports.parseTag = parseTag;
exports.parseAttr = parseAttr;
|