Refactor the HTML element parser

The purpose is to allow attributes to be specified as macro invocations. For example `<div myattr=<<mymacro param1 param3>>>`. The parser needed sprucing up in order to copy with the nesting of angle brackets. The refactoring has been done with an eye on using the same technique in the filter expression parser (which is pretty messy at the moment -- it throws exceptions for syntax errors, which is bad). Later I'm hoping to extend the technique to create a more flexible table parser.
2026-04-29 20:46:52 +00:00 · 2013-06-15 15:12:05 +01:00
parent aaed4faf7e
commit 18f8b7266e
2 changed files with 558 additions and 60 deletions
--- a/core/modules/parsers/wikiparser/rules/html.js
+++ b/core/modules/parsers/wikiparser/rules/html.js
@@ -28,72 +28,366 @@ exports.types = {inline: true, block: true};

 exports.init = function(parser) {
 	this.parser = parser;
-	// Regexp to match
-	if(this.is.block) {
-		this.matchRegExp = /<([A-Za-z\$]+)(\s*[^>]*?)(\/)?>(\r?\n)/mg;
-	} else {
-		this.matchRegExp = /<([A-Za-z\$]+)(\s*[^>]*?)(?:(\/>)|(?:>(\r?\n)?))/mg;
-	}
+};
+
+exports.findNextMatch = function(startPos) {
+	// Find the next tag
+	this.nextTag = this.findNextTag(this.parser.source,startPos,{
+		requireLineBreak: this.is.block
+	});
+	return this.nextTag ? this.nextTag.start : undefined;
 };

 /*
 Parse the most recent match
 */
 exports.parse = function() {
-	// Get all the details of the match in case this parser is called recursively
-	var tagName = this.match[1],
-		attributeString = this.match[2],
-		isSelfClosing = !!this.match[3],
-		hasLineBreak = !!this.match[4];
-	// Move past the tag name and parameters
-	this.parser.pos = this.matchRegExp.lastIndex;
-	var reAttr = /\s*([A-Za-z\-_]+)(?:\s*=\s*(?:("[^"]*")|('[^']*')|(\{\{[^\}]*\}\})|([^"'\s]+)))?/mg;
-	// Process the attributes
-	var attrMatch = reAttr.exec(attributeString),
-		attributes = {};
-	while(attrMatch) {
-		var name = attrMatch[1],
-			value;
-		if(attrMatch[2]) { // Double quoted
-			value = {type: "string", value: attrMatch[2].substring(1,attrMatch[2].length-1)};
-		} else if(attrMatch[3]) { // Single quoted
-			value = {type: "string", value: attrMatch[3].substring(1,attrMatch[3].length-1)};
-		} else if(attrMatch[4]) { // Double curly brace quoted
-			value = {type: "indirect", textReference: attrMatch[4].substr(2,attrMatch[4].length-4)};
-		} else if(attrMatch[5]) { // Unquoted
-			value = {type: "string", value: attrMatch[5]};
-		} else { // Valueless
-			value = {type: "string", value: "true"}; // TODO: We should have a way of indicating we want an attribute without a value
-		}
-		attributes[name] = value;
-		attrMatch = reAttr.exec(attributeString);
-	}
-	// Process the end tag
-	if(!isSelfClosing && $tw.config.htmlVoidElements.indexOf(tagName) === -1) {
-		var reEndString = "</" + $tw.utils.escapeRegExp(tagName) + ">",
-			reEnd = new RegExp("(" + reEndString + ")","mg"),
-			content;
+	// Retrieve the most recent match so that recursive calls don't overwrite it
+	var tag = this.nextTag;
+	this.nextTag = null;
+	// Advance the parser position to past the tag
+	this.parser.pos = tag.end;
+	// Check for a following linebreak
+	var hasLineBreak = !tag.isSelfClosing && !!this.parseTokenRegExp(this.parser.source,this.parser.pos,/(\r?\n)/g);
+	// Set whether we're in block mode
+	tag.isBlock = this.is.block || hasLineBreak;
+	// Parse the body if we need to
+	if(!tag.isSelfClosing && $tw.config.htmlVoidElements.indexOf(tag.tag) === -1) {
+			var reEndString = "</" + $tw.utils.escapeRegExp(tag.tag) + ">",
+				reEnd = new RegExp("(" + reEndString + ")","mg");
 		if(hasLineBreak) {
-			content = this.parser.parseBlocks(reEndString);
+			tag.children = this.parser.parseBlocks(reEndString);
 		} else {
-			content = this.parser.parseInlineRun(reEnd);
+			tag.children = this.parser.parseInlineRun(reEnd);
 		}
 		reEnd.lastIndex = this.parser.pos;
 		var endMatch = reEnd.exec(this.parser.source);
 		if(endMatch && endMatch.index === this.parser.pos) {
 			this.parser.pos = endMatch.index + endMatch[0].length;
 		}
-	} else {
-		content = [];
 	}
-	var element = {
-		type: "element",
-		tag: tagName,
-		isBlock: this.is.block || hasLineBreak,
-		attributes: attributes,
-		children: content
+	// Return the tag
+	return [tag];
+};
+
+/*
+Look for a whitespace token. Returns null if not found, otherwise returns {type: "whitespace", start:, end:,}
+*/
+exports.parseWhiteSpace = function(source,pos) {
+	var node = {
+		type: "whitespace",
+		start: pos
 	};
-	return [element];
+	var re = /(\s)+/g;
+	re.lastIndex = pos;
+	var match = re.exec(source);
+	if(match && match.index === pos) {
+		node.end = pos + match[0].length;
+		return node;
+	}
+	return null;
+};
+
+/*
+Convenience wrapper for parseWhiteSpace
+*/
+exports.skipWhiteSpace = function(source,pos) {
+	var whitespace = this.parseWhiteSpace(source,pos);
+	if(whitespace) {
+		return whitespace.end;
+	}
+	return pos;
+};
+
+/*
+Look for a given string token. Returns null if not found, otherwise returns {type: "token", value:, start:, end:,}
+*/
+exports.parseTokenString = function(source,pos,token) {
+	var match = source.indexOf(token,pos) === pos;
+	if(match) {
+		return {
+			type: "token",
+			value: token,
+			start: pos,
+			end: pos + token.length
+		};
+	}
+	return null;
+};
+
+/*
+Look for a token matching a regex. Returns null if not found, otherwise returns {type: "regexp", match:, start:, end:,}
+*/
+exports.parseTokenRegExp = function(source,pos,reToken) {
+	var node = {
+		type: "regexp",
+		start: pos
+	};
+	reToken.lastIndex = pos;
+	node.match = reToken.exec(source);
+	if(node.match && node.match.index === pos) {
+		node.end = pos + node.match[0].length;
+		return node;
+	} else {
+		return null;
+	}
+};
+
+/*
+Look for a string literal. Returns null if not found, otherwise returns {type: "string", value:, start:, end:,}
+*/
+exports.parseStringLiteral = function(source,pos) {
+	var node = {
+		type: "string",
+		start: pos
+	};
+	var reString = /(?:"([^"]*)")|(?:'([^']*)')/g;
+	reString.lastIndex = pos;
+	var match = reString.exec(source);
+	if(match && match.index === pos) {
+		node.value = match[1] === undefined ? match[2] : match[1];
+		node.end = pos + match[0].length;
+		return node;
+	} else {
+		return null;
+	}
+};
+
+/*
+Look for a macro invocation parameter. Returns null if not found, or {type: "macro-parameter", name:, value:, start:, end:}
+*/
+exports.parseMacroParameter = function(source,pos) {
+	var node = {
+		type: "macro-parameter",
+		start: pos
+	}
+	// Define our regexp
+	var reMacroParameter = /(?:([A-Za-z0-9\-_]+)\s*:)?(?:\s*(?:"([^"]*)"|'([^']*)'|\[\[([^\]]*)\]\]|([^\s>"'=]+)))/g;
+	// Skip whitespace
+	pos = this.skipWhiteSpace(source,pos);
+	// Look for the parameter
+	var token = this.parseTokenRegExp(source,pos,reMacroParameter);
+	if(!token) {
+		return null;
+	}
+	pos = token.end;
+	// Get the parameter details
+	node.value = token.match[2] !== undefined ? token.match[2] : (
+					token.match[3] !== undefined ? token.match[3] : (
+						token.match[4] !== undefined ? token.match[4] : (
+							token.match[5] !== undefined ? token.match[5] : (
+								""
+							)
+						)
+					)
+				);
+	if(token.match[1]) {
+		node.name = token.match[1];
+	}
+	// Update the end position
+	node.end = pos;
+	return node;
+};
+
+/*
+Look for a macro invocation. Returns null if not found, or {type: "macro-invocation", name:, parameters:, start:, end:}
+*/
+exports.parseMacroInvocation = function(source,pos) {
+	var node = {
+		type: "macro-invocation",
+		start: pos,
+		parameters: []
+	}
+	// Define our regexps
+	var reMacroName = /([^\s>"'=]+)/g;
+	// Skip whitespace
+	pos = this.skipWhiteSpace(source,pos);
+	// Look for a double less than sign
+	var token = this.parseTokenString(source,pos,"<<");
+	if(!token) {
+		return null;
+	}
+	pos = token.end;
+	// Get the macro name
+	var name = this.parseTokenRegExp(source,pos,reMacroName);
+	if(!name) {
+		return null;
+	}
+	node.name = name.match[1];
+	pos = name.end;
+	// Process parameters
+	var parameter = this.parseMacroParameter(source,pos);
+	while(parameter) {
+		node.parameters.push(parameter);
+		pos = parameter.end;
+		// Get the next parameter
+		parameter = this.parseMacroParameter(source,pos);
+	}
+	// Skip whitespace
+	pos = this.skipWhiteSpace(source,pos);
+	// Look for a double greater than sign
+	token = this.parseTokenString(source,pos,">>");
+	if(!token) {
+		return null;
+	}
+	pos = token.end;
+	// Update the end position
+	node.end = pos;
+	return node;
+};
+
+/*
+Look for an HTML attribute definition. Returns null if not found, otherwise returns {type: "attribute", name:, valueType: "string|indirect|macro", value:, start:, end:,}
+*/
+exports.parseAttribute = function(source,pos) {
+	var node = {
+		start: pos
+	};
+	// Define our regexps
+	var reAttributeName = /([^\/\s>"'=]+)/g,
+		reUnquotedAttribute = /([^\/\s<>"'=]+)/g,
+		reIndirectValue = /\{\{([^\}]+)\}\}/g;
+	// Skip whitespace
+	pos = this.skipWhiteSpace(source,pos);
+	// Get the attribute name
+	var name = this.parseTokenRegExp(source,pos,reAttributeName);
+	if(!name) {
+		return null;
+	}
+	node.name = name.match[1];
+	pos = name.end;
+	// Skip whitespace
+	pos = this.skipWhiteSpace(source,pos);
+	// Look for an equals sign
+	var token = this.parseTokenString(source,pos,"=");
+	if(token) {
+		pos = token.end;
+		// Skip whitespace
+		pos = this.skipWhiteSpace(source,pos);
+		// Look for a string literal
+		var stringLiteral = this.parseStringLiteral(source,pos);
+		if(stringLiteral) {
+			pos = stringLiteral.end;
+			node.type = "string";
+			node.value = stringLiteral.value;
+		} else {
+			// Look for an indirect value
+			var indirectValue = this.parseTokenRegExp(source,pos,reIndirectValue);
+			if(indirectValue) {
+				pos = indirectValue.end;
+				node.type = "indirect";
+				node.textReference = indirectValue.match[1];
+			} else {
+				// Look for a unquoted value
+				var unquotedValue = this.parseTokenRegExp(source,pos,reUnquotedAttribute);
+				if(unquotedValue) {
+					pos = unquotedValue.end;
+					node.type = "string";
+					node.value = unquotedValue.match[1];
+				} else {
+					// Look for a macro invocation value
+					var macroInvocation = this.parseMacroInvocation(source,pos);
+					if(macroInvocation) {
+						pos = macroInvocation.end;
+						node.type = "macro";
+						node.value = macroInvocation;
+					} else {
+						node.type = "string";
+						node.value = "true";
+					}
+				}
+			}
+		}
+	} else {
+		node.type = "string";
+		node.value = "true";
+	}
+	// Update the end position
+	node.end = pos;
+	return node;
+};
+
+/*
+Look for an HTML tag. Returns null if not found, otherwise returns {type: "tag", name:, attributes: [], isSelfClosing:, start:, end:,}
+*/
+exports.parseTag = function(source,pos,options) {
+	options = options || {};
+	var token,
+		node = {
+			type: "element",
+			start: pos,
+			attributes: {}
+		};
+	// Define our regexps
+	var reTagName = /([a-zA-Z\-\$]+)/g;
+	// Skip whitespace
+	pos = this.skipWhiteSpace(source,pos);
+	// Look for a less than sign
+	token = this.parseTokenString(source,pos,"<");
+	if(!token) {
+		return null;
+	}
+	pos = token.end;
+	// Get the tag name
+	token = this.parseTokenRegExp(source,pos,reTagName);
+	if(!token) {
+		return null;
+	}
+	node.tag = token.match[1];
+	pos = token.end;
+	// Process attributes
+	var attribute = this.parseAttribute(source,pos);
+	while(attribute) {
+		node.attributes[attribute.name] = attribute;
+		pos = attribute.end;
+		// Get the next attribute
+		attribute = this.parseAttribute(source,pos);
+	}
+	// Skip whitespace
+	pos = this.skipWhiteSpace(source,pos);
+	// Look for a closing slash
+	token = this.parseTokenString(source,pos,"/");
+	if(token) {
+		pos = token.end;
+		node.isSelfClosing = true;
+	}
+	// Look for a greater than sign
+	token = this.parseTokenString(source,pos,">");
+	if(!token) {
+		return null;
+	}
+	pos = token.end;
+	// Check for a required line break
+	if(options.requireLineBreak) {
+		token = this.parseTokenRegExp(source,pos,/(\r?\n)/g);
+		if(!token) {
+			return null;
+		}
+	}
+	// Update the end position
+	node.end = pos;
+	return node;
+};
+
+exports.findNextTag = function(source,pos,options) {
+	// A regexp for finding candidate HTML tags
+	var reLookahead = /<([a-zA-Z\-\$]+)/g;
+	// Find the next candidate
+	reLookahead.lastIndex = pos;
+	var match = reLookahead.exec(source);
+	while(match) {
+		// Try to parse the candidate as a tag
+		var tag = this.parseTag(source,match.index,options);
+		// Return success
+		if(tag) {
+			return tag;
+		}
+		// Look for the next match
+		reLookahead.lastIndex = match.index + 1;
+		match = reLookahead.exec(source);
+	}
+	// Failed
+	return null;
 };

 })();