491 lines
17 KiB
JavaScript
491 lines
17 KiB
JavaScript
/**
|
|
* xmlpure.js
|
|
*
|
|
* Building upon and improving the CodeMirror 2 XML parser
|
|
* @author: Dror BG (deebug.dev@gmail.com)
|
|
* @date: August, 2011
|
|
*/
|
|
|
|
CodeMirror.defineMode("xmlpure", function(config, parserConfig) {
|
|
// constants
|
|
var STYLE_ERROR = "error";
|
|
var STYLE_INSTRUCTION = "comment";
|
|
var STYLE_COMMENT = "comment";
|
|
var STYLE_ELEMENT_NAME = "tag";
|
|
var STYLE_ATTRIBUTE = "attribute";
|
|
var STYLE_WORD = "string";
|
|
var STYLE_TEXT = "atom";
|
|
var STYLE_ENTITIES = "string";
|
|
|
|
var TAG_INSTRUCTION = "!instruction";
|
|
var TAG_CDATA = "!cdata";
|
|
var TAG_COMMENT = "!comment";
|
|
var TAG_TEXT = "!text";
|
|
|
|
var doNotIndent = {
|
|
"!cdata": true,
|
|
"!comment": true,
|
|
"!text": true,
|
|
"!instruction": true
|
|
};
|
|
|
|
// options
|
|
var indentUnit = config.indentUnit;
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// helper functions
|
|
|
|
// chain a parser to another parser
|
|
function chain(stream, state, parser) {
|
|
state.tokenize = parser;
|
|
return parser(stream, state);
|
|
}
|
|
|
|
// parse a block (comment, CDATA or text)
|
|
function inBlock(style, terminator, nextTokenize) {
|
|
return function(stream, state) {
|
|
while (!stream.eol()) {
|
|
if (stream.match(terminator)) {
|
|
popContext(state);
|
|
state.tokenize = nextTokenize;
|
|
break;
|
|
}
|
|
stream.next();
|
|
}
|
|
return style;
|
|
};
|
|
}
|
|
|
|
// go down a level in the document
|
|
// (hint: look at who calls this function to know what the contexts are)
|
|
function pushContext(state, tagName) {
|
|
var noIndent = doNotIndent.hasOwnProperty(tagName) || (state.context && state.context.doIndent);
|
|
var newContext = {
|
|
tagName: tagName,
|
|
prev: state.context,
|
|
indent: state.context ? state.context.indent + indentUnit : 0,
|
|
lineNumber: state.lineNumber,
|
|
indented: state.indented,
|
|
noIndent: noIndent
|
|
};
|
|
state.context = newContext;
|
|
}
|
|
|
|
// go up a level in the document
|
|
function popContext(state) {
|
|
if (state.context) {
|
|
var oldContext = state.context;
|
|
state.context = oldContext.prev;
|
|
return oldContext;
|
|
}
|
|
|
|
// we shouldn't be here - it means we didn't have a context to pop
|
|
return null;
|
|
}
|
|
|
|
// return true if the current token is seperated from the tokens before it
|
|
// which means either this is the start of the line, or there is at least
|
|
// one space or tab character behind the token
|
|
// otherwise returns false
|
|
function isTokenSeparated(stream) {
|
|
return stream.sol() ||
|
|
stream.string.charAt(stream.start - 1) == " " ||
|
|
stream.string.charAt(stream.start - 1) == "\t";
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// context: document
|
|
//
|
|
// an XML document can contain:
|
|
// - a single declaration (if defined, it must be the very first line)
|
|
// - exactly one root element
|
|
// @todo try to actually limit the number of root elements to 1
|
|
// - zero or more comments
|
|
function parseDocument(stream, state) {
|
|
if(stream.eat("<")) {
|
|
if(stream.eat("?")) {
|
|
// processing instruction
|
|
pushContext(state, TAG_INSTRUCTION);
|
|
state.tokenize = parseProcessingInstructionStartTag;
|
|
return STYLE_INSTRUCTION;
|
|
} else if(stream.match("!--")) {
|
|
// new context: comment
|
|
pushContext(state, TAG_COMMENT);
|
|
return chain(stream, state, inBlock(STYLE_COMMENT, "-->", parseDocument));
|
|
} else if(stream.eatSpace() || stream.eol() ) {
|
|
stream.skipToEnd();
|
|
return STYLE_ERROR;
|
|
} else {
|
|
// element
|
|
state.tokenize = parseElementTagName;
|
|
return STYLE_ELEMENT_NAME;
|
|
}
|
|
}
|
|
|
|
// error on line
|
|
stream.skipToEnd();
|
|
return STYLE_ERROR;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// context: XML element start-tag or end-tag
|
|
//
|
|
// - element start-tag can contain attributes
|
|
// - element start-tag may self-close (or start an element block if it doesn't)
|
|
// - element end-tag can contain only the tag name
|
|
function parseElementTagName(stream, state) {
|
|
// get the name of the tag
|
|
var startPos = stream.pos;
|
|
if(stream.match(/^[a-zA-Z_:][-a-zA-Z0-9_:.]*/)) {
|
|
// element start-tag
|
|
var tagName = stream.string.substring(startPos, stream.pos);
|
|
pushContext(state, tagName);
|
|
state.tokenize = parseElement;
|
|
return STYLE_ELEMENT_NAME;
|
|
} else if(stream.match(/^\/[a-zA-Z_:][-a-zA-Z0-9_:.]*( )*>/)) {
|
|
// element end-tag
|
|
var endTagName = stream.string.substring(startPos + 1, stream.pos - 1).trim();
|
|
var oldContext = popContext(state);
|
|
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
|
|
if(oldContext == null || endTagName != oldContext.tagName) {
|
|
// the start and end tag names should match - error
|
|
return STYLE_ERROR;
|
|
}
|
|
return STYLE_ELEMENT_NAME;
|
|
} else {
|
|
// no tag name - error
|
|
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
|
|
stream.eatWhile(/[^>]/);
|
|
stream.eat(">");
|
|
return STYLE_ERROR;
|
|
}
|
|
|
|
stream.skipToEnd();
|
|
return null;
|
|
}
|
|
|
|
function parseElement(stream, state) {
|
|
if(stream.match(/^\/>/)) {
|
|
// self-closing tag
|
|
popContext(state);
|
|
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
|
|
return STYLE_ELEMENT_NAME;
|
|
} else if(stream.eat(/^>/)) {
|
|
state.tokenize = parseElementBlock;
|
|
return STYLE_ELEMENT_NAME;
|
|
} else if(isTokenSeparated(stream) && stream.match(/^[a-zA-Z_:][-a-zA-Z0-9_:.]*( )*=/)) {
|
|
// attribute
|
|
state.tokenize = parseAttribute;
|
|
return STYLE_ATTRIBUTE;
|
|
}
|
|
|
|
// no other options - this is an error
|
|
state.tokenize = state.context == null ? parseDocument : parseDocument;
|
|
stream.eatWhile(/[^>]/);
|
|
stream.eat(">");
|
|
return STYLE_ERROR;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// context: attribute
|
|
//
|
|
// attribute values may contain everything, except:
|
|
// - the ending quote (with ' or ") - this marks the end of the value
|
|
// - the character "<" - should never appear
|
|
// - ampersand ("&") - unless it starts a reference: a string that ends with a semi-colon (";")
|
|
// ---> note: this parser is lax in what may be put into a reference string,
|
|
// ---> consult http://www.w3.org/TR/REC-xml/#NT-Reference if you want to make it tighter
|
|
function parseAttribute(stream, state) {
|
|
var quote = stream.next();
|
|
if(quote != "\"" && quote != "'") {
|
|
// attribute must be quoted
|
|
stream.skipToEnd();
|
|
state.tokenize = parseElement;
|
|
return STYLE_ERROR;
|
|
}
|
|
|
|
state.tokParams.quote = quote;
|
|
state.tokenize = parseAttributeValue;
|
|
return STYLE_WORD;
|
|
}
|
|
|
|
// @todo: find out whether this attribute value spans multiple lines,
|
|
// and if so, push a context for it in order not to indent it
|
|
// (or something of the sort..)
|
|
function parseAttributeValue(stream, state) {
|
|
var ch = "";
|
|
while(!stream.eol()) {
|
|
ch = stream.next();
|
|
if(ch == state.tokParams.quote) {
|
|
// end quote found
|
|
state.tokenize = parseElement;
|
|
return STYLE_WORD;
|
|
} else if(ch == "<") {
|
|
// can't have less-than signs in an attribute value, ever
|
|
stream.skipToEnd()
|
|
state.tokenize = parseElement;
|
|
return STYLE_ERROR;
|
|
} else if(ch == "&") {
|
|
// reference - look for a semi-colon, or return error if none found
|
|
ch = stream.next();
|
|
|
|
// make sure that semi-colon isn't right after the ampersand
|
|
if(ch == ';') {
|
|
stream.skipToEnd()
|
|
state.tokenize = parseElement;
|
|
return STYLE_ERROR;
|
|
}
|
|
|
|
// make sure no less-than characters slipped in
|
|
while(!stream.eol() && ch != ";") {
|
|
if(ch == "<") {
|
|
// can't have less-than signs in an attribute value, ever
|
|
stream.skipToEnd()
|
|
state.tokenize = parseElement;
|
|
return STYLE_ERROR;
|
|
}
|
|
ch = stream.next();
|
|
}
|
|
if(stream.eol() && ch != ";") {
|
|
// no ampersand found - error
|
|
stream.skipToEnd();
|
|
state.tokenize = parseElement;
|
|
return STYLE_ERROR;
|
|
}
|
|
}
|
|
}
|
|
|
|
// attribute value continues to next line
|
|
return STYLE_WORD;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// context: element block
|
|
//
|
|
// a block can contain:
|
|
// - elements
|
|
// - text
|
|
// - CDATA sections
|
|
// - comments
|
|
function parseElementBlock(stream, state) {
|
|
if(stream.eat("<")) {
|
|
if(stream.match("?")) {
|
|
pushContext(state, TAG_INSTRUCTION);
|
|
state.tokenize = parseProcessingInstructionStartTag;
|
|
return STYLE_INSTRUCTION;
|
|
} else if(stream.match("!--")) {
|
|
// new context: comment
|
|
pushContext(state, TAG_COMMENT);
|
|
return chain(stream, state, inBlock(STYLE_COMMENT, "-->",
|
|
state.context == null ? parseDocument : parseElementBlock));
|
|
} else if(stream.match("![CDATA[")) {
|
|
// new context: CDATA section
|
|
pushContext(state, TAG_CDATA);
|
|
return chain(stream, state, inBlock(STYLE_TEXT, "]]>",
|
|
state.context == null ? parseDocument : parseElementBlock));
|
|
} else if(stream.eatSpace() || stream.eol() ) {
|
|
stream.skipToEnd();
|
|
return STYLE_ERROR;
|
|
} else {
|
|
// element
|
|
state.tokenize = parseElementTagName;
|
|
return STYLE_ELEMENT_NAME;
|
|
}
|
|
} else if(stream.eat("&")) {
|
|
stream.eatWhile(/[^;]/);
|
|
stream.eat(";");
|
|
return STYLE_ENTITIES;
|
|
} else {
|
|
// new context: text
|
|
pushContext(state, TAG_TEXT);
|
|
state.tokenize = parseText;
|
|
return null;
|
|
}
|
|
|
|
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
|
|
stream.skipToEnd();
|
|
return null;
|
|
}
|
|
|
|
function parseText(stream, state) {
|
|
stream.eatWhile(/[^<]/);
|
|
if(!stream.eol()) {
|
|
// we cannot possibly be in the document context,
|
|
// just inside an element block
|
|
popContext(state);
|
|
state.tokenize = parseElementBlock;
|
|
}
|
|
return STYLE_TEXT;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// context: XML processing instructions
|
|
//
|
|
// XML processing instructions (PIs) allow documents to contain instructions for applications.
|
|
// PI format: <?name data?>
|
|
// - 'name' can be anything other than 'xml' (case-insensitive)
|
|
// - 'data' can be anything which doesn't contain '?>'
|
|
// XML declaration is a special PI (see XML declaration context below)
|
|
function parseProcessingInstructionStartTag(stream, state) {
|
|
if(stream.match("xml", true, true)) {
|
|
// xml declaration
|
|
if(state.lineNumber > 1 || stream.pos > 5) {
|
|
state.tokenize = parseDocument;
|
|
stream.skipToEnd();
|
|
return STYLE_ERROR;
|
|
} else {
|
|
state.tokenize = parseDeclarationVersion;
|
|
return STYLE_INSTRUCTION;
|
|
}
|
|
}
|
|
|
|
// regular processing instruction
|
|
if(isTokenSeparated(stream) || stream.match("?>")) {
|
|
// we have a space after the start-tag, or nothing but the end-tag
|
|
// either way - error!
|
|
state.tokenize = parseDocument;
|
|
stream.skipToEnd();
|
|
return STYLE_ERROR;
|
|
}
|
|
|
|
state.tokenize = parseProcessingInstructionBody;
|
|
return STYLE_INSTRUCTION;
|
|
}
|
|
|
|
function parseProcessingInstructionBody(stream, state) {
|
|
stream.eatWhile(/[^?]/);
|
|
if(stream.eat("?")) {
|
|
if(stream.eat(">")) {
|
|
popContext(state);
|
|
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
|
|
}
|
|
}
|
|
return STYLE_INSTRUCTION;
|
|
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// context: XML declaration
|
|
//
|
|
// XML declaration is of the following format:
|
|
// <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
|
|
// - must start at the first character of the first line
|
|
// - may span multiple lines
|
|
// - must include 'version'
|
|
// - may include 'encoding' and 'standalone' (in that order after 'version')
|
|
// - attribute names must be lowercase
|
|
// - cannot contain anything else on the line
|
|
function parseDeclarationVersion(stream, state) {
|
|
state.tokenize = parseDeclarationEncoding;
|
|
|
|
if(isTokenSeparated(stream) && stream.match(/^version( )*=( )*"([a-zA-Z0-9_.:]|\-)+"/)) {
|
|
return STYLE_INSTRUCTION;
|
|
}
|
|
stream.skipToEnd();
|
|
return STYLE_ERROR;
|
|
}
|
|
|
|
function parseDeclarationEncoding(stream, state) {
|
|
state.tokenize = parseDeclarationStandalone;
|
|
|
|
if(isTokenSeparated(stream) && stream.match(/^encoding( )*=( )*"[A-Za-z]([A-Za-z0-9._]|\-)*"/)) {
|
|
return STYLE_INSTRUCTION;
|
|
}
|
|
return null;
|
|
}
|
|
|
|
function parseDeclarationStandalone(stream, state) {
|
|
state.tokenize = parseDeclarationEndTag;
|
|
|
|
if(isTokenSeparated(stream) && stream.match(/^standalone( )*=( )*"(yes|no)"/)) {
|
|
return STYLE_INSTRUCTION;
|
|
}
|
|
return null;
|
|
}
|
|
|
|
function parseDeclarationEndTag(stream, state) {
|
|
state.tokenize = parseDocument;
|
|
|
|
if(stream.match("?>") && stream.eol()) {
|
|
popContext(state);
|
|
return STYLE_INSTRUCTION;
|
|
}
|
|
stream.skipToEnd();
|
|
return STYLE_ERROR;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// returned object
|
|
return {
|
|
electricChars: "/[",
|
|
|
|
startState: function() {
|
|
return {
|
|
tokenize: parseDocument,
|
|
tokParams: {},
|
|
lineNumber: 0,
|
|
lineError: false,
|
|
context: null,
|
|
indented: 0
|
|
};
|
|
},
|
|
|
|
token: function(stream, state) {
|
|
if(stream.sol()) {
|
|
// initialize a new line
|
|
state.lineNumber++;
|
|
state.lineError = false;
|
|
state.indented = stream.indentation();
|
|
}
|
|
|
|
// eat all (the spaces) you can
|
|
if(stream.eatSpace()) return null;
|
|
|
|
// run the current tokenize function, according to the state
|
|
var style = state.tokenize(stream, state);
|
|
|
|
// is there an error somewhere in the line?
|
|
state.lineError = (state.lineError || style == "error");
|
|
|
|
return style;
|
|
},
|
|
|
|
blankLine: function(state) {
|
|
// blank lines are lines too!
|
|
state.lineNumber++;
|
|
state.lineError = false;
|
|
},
|
|
|
|
indent: function(state, textAfter) {
|
|
if(state.context) {
|
|
if(state.context.noIndent == true) {
|
|
// do not indent - no return value at all
|
|
return;
|
|
}
|
|
if(textAfter.match(/^<\/.*/)) {
|
|
// end-tag - indent back to last context
|
|
return state.context.indent;
|
|
}
|
|
if(textAfter.match(/^<!\[CDATA\[/)) {
|
|
// a stand-alone CDATA start-tag - indent back to column 0
|
|
return 0;
|
|
}
|
|
// indent to last context + regular indent unit
|
|
return state.context.indent + indentUnit;
|
|
}
|
|
return 0;
|
|
},
|
|
|
|
compareStates: function(a, b) {
|
|
if (a.indented != b.indented) return false;
|
|
for (var ca = a.context, cb = b.context; ; ca = ca.prev, cb = cb.prev) {
|
|
if (!ca || !cb) return ca == cb;
|
|
if (ca.tagName != cb.tagName) return false;
|
|
}
|
|
}
|
|
};
|
|
});
|
|
|
|
CodeMirror.defineMIME("application/xml", "purexml");
|
|
CodeMirror.defineMIME("text/xml", "purexml");
|