"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
var utils_1 = require("../utils");
var comment_node_1 = require("./comment-node");
var element_node_1 = require("./element-node");
var entity_node_1 = require("./entity-node");
var text_node_1 = require("./text-node");
/**
 * @private
 * @property {RegExp} htmlRegex
 *
 * The regular expression used to pull HTML tags out of a string. Handles
 * namespaced HTML tags and attribute names, as specified by
 * http://www.w3.org/TR/html-markup/syntax.html.
 *
 * The expression is global and case-insensitive (flags `gi`).
 *
 * Capturing groups:
 *
 * 1. The "!DOCTYPE" tag name, if the tag is a <!DOCTYPE> tag.
 * 2. Helper group: an attribute name inside a <!DOCTYPE> tag (used to emulate
 *    an atomic group via a lookahead plus backreference).
 * 3. The '/' if the tag is an end tag; otherwise unset.
 * 4. The comment text (i.e. the text between `<!--` and `-->`), if the tag
 *    is a comment tag.
 * 5. The tag name for a tag without attributes (other than <!DOCTYPE>).
 * 6. The tag name for a tag with attributes (other than <!DOCTYPE>).
 * 7. Helper group: an attribute name inside a regular tag (same atomic-group
 *    emulation as group 2).
 */
var htmlRegex = (function () {
    // Building blocks. They are written as RegExp literals for readability,
    // and only their `.source` text is spliced into the final pattern.
    var commentPattern = /!--([\s\S]+?)--/.source;
    var tagNamePattern = /[0-9a-zA-Z][0-9a-zA-Z:]*/.source;
    // The \x00-\x1F range excludes control chars, \x7F excludes the delete char
    var attrNamePattern = /[^\s"'>\/=\x00-\x1F\x7F]+/.source;
    // Double quoted, single quoted, or unquoted attribute values
    var attrValuePattern = /(?:"[^"]*?"|'[^']*?'|[^'"=<>`\s]+)/.source;
    // Optional '=[value]' part following an attribute name
    var optionalEqualsValue = '(?:\\s*?=\\s*?' + attrValuePattern + ')?';

    // Builds the `name[=value]` pattern. The attribute name is captured inside
    // a lookahead and then consumed through a backreference to capturing group
    // number `groupNum`, so the name is matched as a pseudo-atomic group.
    function attribute(groupNum) {
        return '(?=(' + attrNamePattern + '))\\' + groupNum + optionalEqualsValue;
    }

    // <!DOCTYPE> tag (group 1), followed by zero or more of either:
    //   A. attr="value" (attribute name is helper group 2), or
    //   B. "value" alone, to cover a doctype such as:
    //      <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    var doctypeTag = '(?:<(!DOCTYPE)(?:\\s+(?:' + attribute(2) + '|' + attrValuePattern + '))*>)';

    // A tag without attributes (tag name is group 5). This is matched
    // separately from a tag with attributes to fix a regex time complexity
    // issue seen with the example in
    // https://github.com/gregjacobs/Autolinker.js/issues/172
    var tagWithoutAttrs = '(?:(' + tagNamePattern + ')\\s*/?)';

    // A tag with attributes (tag name is group 6, attribute name is helper
    // group 7). Kept apart from the attribute-less form for the same reason.
    var tagWithAttrs = '(?:(' + tagNamePattern + ')\\s+(?:(?:\\s+|\\b)' + attribute(7) + ')*\\s*/?)';

    // All tags that are not <!DOCTYPE>. Group 3 holds the '/' of an end tag;
    // the comment text (group 4) comes from `commentPattern`.
    var otherTag = '(?:<(/)?(?:' + [commentPattern, tagWithoutAttrs, tagWithAttrs].join('|') + ')>)';

    return new RegExp(doctypeTag + '|' + otherTag, 'gi');
})();
/**
 * @private
 * @property {RegExp} htmlCharacterEntitiesRegex
 *
 * The regular expression that matches common HTML character entities, in
 * both their named and numeric forms:
 *
 * - `&nbsp;` / `&#160;`
 * - `&lt;`   / `&#60;`
 * - `&gt;`   / `&#62;`
 * - `&quot;` / `&#34;`
 * - `&#39;`
 *
 * Matching is global and case-insensitive, and the whole entity is captured
 * in group 1.
 *
 * `&amp;` is deliberately not matched here, as a bare ampersand could be part
 * of a query string.
 */
// The alternatives must be the escaped entity text itself. Literal characters
// (space, <, >, ", ') would match ordinary text instead of entities.
var htmlCharacterEntitiesRegex = /(&nbsp;|&#160;|&lt;|&#60;|&gt;|&#62;|&quot;|&#34;|&#39;)/gi;
/**
 * @class Autolinker.htmlParser.HtmlParser
 * @extends Object
 *
 * An HTML parser implementation which simply walks an HTML string and returns an array of
 * {@link Autolinker.htmlParser.HtmlNode HtmlNodes} that represent the basic HTML structure of the input string.
 *
 * Autolinker uses this to only link URLs/emails/mentions within text nodes, effectively ignoring / "walking
 * around" HTML tags.
 */
var HtmlParser = (function () {
    /**
     * Appends every element of `items` to `target`, one element at a time.
     *
     * This is used instead of `target.push.apply( target, items )`, which was
     * causing a "Maximum Call Stack Size Exceeded" error on inputs with a
     * large number of html entities.
     *
     * @param {Array} target The array to append to (modified in place).
     * @param {Array} items The elements to append.
     */
    function appendAll(target, items) {
        for (var i = 0, len = items.length; i < len; i++) {
            target.push(items[i]);
        }
    }
    function HtmlParser() {
    }
    /**
     * Parses an HTML string and returns a simple array of {@link Autolinker.htmlParser.HtmlNode HtmlNodes}
     * to represent the HTML structure of the input string.
     *
     * @param {String} html The HTML to parse.
     * @return {Autolinker.htmlParser.HtmlNode[]}
     */
    HtmlParser.prototype.parse = function (html) {
        var currentResult, lastIndex = 0, nodes = []; // `nodes` will be the result of the method
        // `htmlRegex` is a shared global (`g`) regex, so `exec()` resumes from
        // its `lastIndex`. Reset it so that a previous parse which was aborted
        // part-way (e.g. by an exception) cannot make this one skip input.
        htmlRegex.lastIndex = 0;
        while ((currentResult = htmlRegex.exec(html)) !== null) {
            var tagText = currentResult[0], commentText = currentResult[4], // if we've matched a comment
            tagName = currentResult[1] || currentResult[5] || currentResult[6], // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a" or "img")
            isClosingTag = !!currentResult[3], offset = currentResult.index, inBetweenTagsText = html.substring(lastIndex, offset);
            // Push TextNodes and EntityNodes for any text found between tags
            if (inBetweenTagsText) {
                appendAll(nodes, this.parseTextAndEntityNodes(lastIndex, inBetweenTagsText));
            }
            // Push the CommentNode or ElementNode
            if (commentText) {
                nodes.push(this.createCommentNode(offset, tagText, commentText));
            }
            else {
                nodes.push(this.createElementNode(offset, tagText, tagName, isClosingTag));
            }
            lastIndex = offset + tagText.length;
        }
        // Process any remaining text after the last HTML element. Will process all of the text if there were no HTML elements.
        if (lastIndex < html.length) {
            appendAll(nodes, this.parseTextAndEntityNodes(lastIndex, html.substring(lastIndex)));
        }
        return nodes;
    };
    /**
     * Parses text and HTML entity nodes from a given string. The input string
     * should not have any HTML tags (elements) within it.
     *
     * @private
     * @param {Number} offset The offset of the text node match within the
     *   original HTML string.
     * @param {String} text The string of text to parse. This is from an HTML
     *   text node.
     * @return {Autolinker.htmlParser.HtmlNode[]} An array of HtmlNodes to
     *   represent the {@link Autolinker.htmlParser.TextNode TextNodes} and
     *   {@link Autolinker.htmlParser.EntityNode EntityNodes} found.
     */
    HtmlParser.prototype.parseTextAndEntityNodes = function (offset, text) {
        var nodes = [], position = offset, // running offset of the current token within the original HTML string
        textAndEntityTokens = utils_1.splitAndCapture(text, htmlCharacterEntitiesRegex); // split at HTML entities, but include the HTML entities in the results array
        // Every even numbered token is a TextNode, and every odd numbered token is an EntityNode.
        // For example, the tokens for a text node holding the word `this` wrapped in two
        // entities would be: [ 'Test ', <entity>, 'this', <entity>, ' today' ]
        for (var i = 0, len = textAndEntityTokens.length; i < len; i += 2) {
            var textToken = textAndEntityTokens[i], entityToken = textAndEntityTokens[i + 1];
            // Empty / missing tokens produce no node and do not advance the offset
            if (textToken) {
                nodes.push(this.createTextNode(position, textToken));
                position += textToken.length;
            }
            if (entityToken) {
                nodes.push(this.createEntityNode(position, entityToken));
                position += entityToken.length;
            }
        }
        return nodes;
    };
    /**
     * Factory method to create an {@link Autolinker.htmlParser.CommentNode CommentNode}.
     *
     * @private
     * @param {Number} offset The offset of the match within the original HTML
     *   string.
     * @param {String} tagText The full text of the tag (comment) that was
     *   matched, including its <!-- and -->.
     * @param {String} commentText The full text of the comment that was matched.
     *   It is stored on the node with surrounding whitespace trimmed.
     * @return {Autolinker.htmlParser.CommentNode}
     */
    HtmlParser.prototype.createCommentNode = function (offset, tagText, commentText) {
        return new comment_node_1.CommentNode({
            offset: offset,
            text: tagText,
            comment: commentText.trim()
        });
    };
    /**
     * Factory method to create an {@link Autolinker.htmlParser.ElementNode ElementNode}.
     *
     * @private
     * @param {Number} offset The offset of the match within the original HTML
     *   string.
     * @param {String} tagText The full text of the tag (element) that was
     *   matched, including its attributes.
     * @param {String} tagName The name of the tag. Ex: An <img> tag would
     *   be passed to this method as "img". It is stored on the node in lower
     *   case.
     * @param {Boolean} isClosingTag `true` if it's a closing tag, false
     *   otherwise.
     * @return {Autolinker.htmlParser.ElementNode}
     */
    HtmlParser.prototype.createElementNode = function (offset, tagText, tagName, isClosingTag) {
        return new element_node_1.ElementNode({
            offset: offset,
            text: tagText,
            tagName: tagName.toLowerCase(),
            closing: isClosingTag
        });
    };
    /**
     * Factory method to create a {@link Autolinker.htmlParser.EntityNode EntityNode}.
     *
     * @private
     * @param {Number} offset The offset of the match within the original HTML
     *   string.
     * @param {String} text The text that was matched for the HTML entity (such
     *   as '&nbsp;').
     * @return {Autolinker.htmlParser.EntityNode}
     */
    HtmlParser.prototype.createEntityNode = function (offset, text) {
        return new entity_node_1.EntityNode({ offset: offset, text: text });
    };
    /**
     * Factory method to create a {@link Autolinker.htmlParser.TextNode TextNode}.
     *
     * @private
     * @param {Number} offset The offset of the match within the original HTML
     *   string.
     * @param {String} text The text that was matched.
     * @return {Autolinker.htmlParser.TextNode}
     */
    HtmlParser.prototype.createTextNode = function (offset, text) {
        return new text_node_1.TextNode({ offset: offset, text: text });
    };
    return HtmlParser;
}());
exports.HtmlParser = HtmlParser;
//# sourceMappingURL=html-parser.js.map