Mercurial > caja-test
diff js/ext/html-sanitizer.js @ 0:633c9cb05555
Origination.
author | Atul Varma <varmaa@toolness.com> |
---|---|
date | Sun, 07 Jun 2009 19:29:10 -0700 |
parents | |
children |
line wrap: on
line diff
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview
 * An HTML sanitizer that can satisfy a variety of security policies.
 *
 * <p>
 * The HTML sanitizer is built around a SAX parser and HTML element and
 * attributes schemas.
 *
 * @author mikesamuel@gmail.com
 * @requires html4
 * @provides html, html_sanitize
 */

/**
 * @namespace
 */
var html = (function () {
  var lcase;
  // The below may not be true on browsers in the Turkish locale.
  if ('script' === 'SCRIPT'.toLowerCase()) {
    lcase = function (s) { return s.toLowerCase(); };
  } else {
    /**
     * Locale-independent ASCII lower-casing.
     *
     * {@updoc
     * $ lcase('SCRIPT')
     * # 'script'
     * $ lcase('script')
     * # 'script'
     * }
     */
    lcase = function (s) {
      return s.replace(
          /[A-Z]/g,
          function (ch) {
            // Setting bit 5 maps 'A'..'Z' onto 'a'..'z'.
            return String.fromCharCode(ch.charCodeAt(0) | 32);
          });
    };
  }

  /** The named entities this decoder understands, keyed by lower-case name. */
  var ENTITIES = {
    lt   : '<',
    gt   : '>',
    amp  : '&',
    // Written as a unicode escape: octal escapes such as '\240' are a
    // SyntaxError in strict mode code.
    nbsp : '\u00A0',
    quot : '"',
    apos : '\''
  };

  var decimalEscapeRe = /^#(\d+)$/;
  var hexEscapeRe = /^#x([0-9A-Fa-f]+)$/;

  /**
   * Converts a numeric code point to a string.
   * String.fromCharCode truncates its argument to 16 bits, so code points
   * above U+FFFF are emitted as a surrogate pair, and values beyond the
   * unicode range (including Infinity from an overlong digit run) yield ''.
   *
   * @param {number} codePoint a non-negative number.
   * @return {string} the decoded character(s), or '' if out of range.
   */
  function codePointToString(codePoint) {
    if (codePoint <= 0xFFFF) { return String.fromCharCode(codePoint); }
    if (codePoint > 0x10FFFF) { return ''; }
    var offset = codePoint - 0x10000;
    return String.fromCharCode(
        0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
  }

  /**
   * Decodes an HTML entity.
   *
   * {@updoc
   * $ lookupEntity('lt')
   * # '<'
   * $ lookupEntity('GT')
   * # '>'
   * $ lookupEntity('amp')
   * # '&'
   * $ lookupEntity('nbsp')
   * # '\xA0'
   * $ lookupEntity('apos')
   * # "'"
   * $ lookupEntity('quot')
   * # '"'
   * $ lookupEntity('#xa')
   * # '\n'
   * $ lookupEntity('#10')
   * # '\n'
   * $ lookupEntity('#x0a')
   * # '\n'
   * $ lookupEntity('#010')
   * # '\n'
   * $ lookupEntity('#x00A')
   * # '\n'
   * $ lookupEntity('#x1F600')
   * # '\uD83D\uDE00'
   * $ lookupEntity('Pi')      // Known failure
   * # '\u03A0'
   * $ lookupEntity('pi')      // Known failure
   * # '\u03C0'
   * }
   *
   * @param name the content between the '&' and the ';'.
   * @return a single unicode code-point as a string, or '' when the entity
   *     is not recognized.
   */
  function lookupEntity(name) {
    name = lcase(name);  // TODO: &pi; is different from &Pi;
    if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
    var m = name.match(decimalEscapeRe);
    if (m) {
      return codePointToString(parseInt(m[1], 10));
    } else if (!!(m = name.match(hexEscapeRe))) {
      return codePointToString(parseInt(m[1], 16));
    }
    return '';
  }

  /** Adapter so lookupEntity can be used as a String.replace callback. */
  function decodeOneEntity(_, name) {
    return lookupEntity(name);
  }

  var nulRe = /\0/g;
  /** Removes all NUL characters from s. */
  function stripNULs(s) {
    return s.replace(nulRe, '');
  }

  var entityRe = /&(#\d+|#x[0-9A-Fa-f]+|\w+);/g;
  /**
   * The plain text of a chunk of HTML CDATA which possibly contains
   * entities.
   *
   * {@updoc
   * $ unescapeEntities('')
   * # ''
   * $ unescapeEntities('hello World!')
   * # 'hello World!'
   * $ unescapeEntities('1 &lt; 2 &amp;&amp; 4 &gt; 3&#10;')
   * # '1 < 2 && 4 > 3\n'
   * $ unescapeEntities('&lt;&lt; <- unfinished entity&gt;')
   * # '<< <- unfinished entity>'
   * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
   * # '/foo?bar=baz&copy=true'
   * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
   * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
   * }
   *
   * @param s a chunk of HTML CDATA.  It must not start or end inside an HTML
   *   entity.
   */
  function unescapeEntities(s) {
    return s.replace(entityRe, decodeOneEntity);
  }

  var ampRe = /&/g;
  // Matches an ampersand that does not start something that looks like an
  // entity.  A lookahead is used so that the character after the ampersand
  // is not consumed; otherwise the second '&' in '&& ' would be skipped.
  var looseAmpRe = /&(?=[^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
  var ltRe = /</g;
  var gtRe = />/g;
  var quotRe = /\"/g;
  var eqRe = /\=/g;  // Backslash required on JScript.net

  /**
   * Escapes HTML special characters in attribute values as HTML entities.
   *
   * {@updoc
   * $ escapeAttrib('')
   * # ''
   * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
   * # '&quot;&lt;&lt;&amp;&#61;&#61;&amp;&gt;&gt;&quot;'
   * $ escapeAttrib('Hello <World>!')
   * # 'Hello &lt;World&gt;!'
   * }
   *
   * @param {string} s plain text.
   * @return {string} text safe to embed in a double quoted attribute value.
   */
  function escapeAttrib(s) {
    // '&' must be escaped first so the ampersands introduced by the later
    // replacements are not escaped again.
    // Escaping '=' defangs many UTF-7 and SGML short-tag attacks.
    return s.replace(ampRe, '&amp;').replace(ltRe, '&lt;').replace(gtRe, '&gt;')
        .replace(quotRe, '&quot;').replace(eqRe, '&#61;');
  }

  /**
   * Escape entities in RCDATA that can be escaped without changing the meaning.
   * {@updoc
   * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
   * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
   * $ normalizeRCData('a && b')
   * # 'a &amp;&amp; b'
   * }
   *
   * @param {string} rcdata the raw content of an RCDATA element.
   * @return {string} the content with '<', '>' and loose '&' escaped.
   */
  function normalizeRCData(rcdata) {
    return rcdata
        .replace(looseAmpRe, '&amp;')
        .replace(ltRe, '&lt;')
        .replace(gtRe, '&gt;');
  }


  // TODO(mikesamuel): validate sanitizer regexs against the HTML5 grammar at
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html

  /** token definitions. */
  var INSIDE_TAG_TOKEN = new RegExp(
      // Don't capture space.
      '^\\s*(?:'
      // Capture an attribute name in group 1, and value in group 3.
      // We capture the fact that there was an attribute in group 2, since
      // interpreters are inconsistent in whether a group that matches nothing
      // is null, undefined, or the empty string.
      + ('(?:'
         + '([a-z][a-z-]*)'  // attribute name
         + ('('  // optionally followed
            + '\\s*=\\s*'
            + ('('
               // A double quoted string.
               + '"[^"]*"'
               // A single quoted string.
               + '|\'[^\']*\''
               // The positive lookahead is used to make sure that in
               // <foo bar= baz=boo>, the value for bar is blank, not "baz=boo".
               + '|(?=[a-z][a-z-]*\\s*=)'
               // An unquoted value that is not an attribute name.
               // We know it is not an attribute name because the previous
               // zero-width match would've eliminated that possibility.
               + '|[^>"\'\\s]*'
               + ')'
               )
            + ')'
            ) + '?'
         + ')'
         )
      // End of tag captured in group 4.
      + '|(/?>)'
      // Don't capture cruft.  [\s\S] rather than '.' so that a trailing line
      // terminator inside an unterminated tag still produces a token;
      // otherwise the match is null and the parser throws.
      + '|[\\s\\S][^\\w\\s>]*)',
      'i');

  var OUTSIDE_TAG_TOKEN = new RegExp(
      '^(?:'
      // Entity captured in group 1.
      + '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);'
      // Comment, doctypes, and processing instructions not captured.
      + '|<!--[\\s\\S]*?-->|<!\\w[^>]*>|<\\?[^>*]*>'
      // '/' captured in group 2 for close tags, and name captured in group 3.
      + '|<(/)?([a-z][a-z0-9]*)'
      // Text captured in group 4.
      + '|([^<&>]+)'
      // Cruft captured in group 5.
      + '|([<&>]))',
      'i');

  /**
   * Given a SAX-like event handler, produce a function that feeds those
   * events and a parameter to the event handler.
   *
   * The event handler has the form:{@code
   * {
   *   // Name is a lower-case HTML tag name.  Attribs is an array of
   *   // alternating lower-case attribute names, and attribute values.  The
   *   // attribs array is reused by the parser.  Param is the value passed to
   *   // the saxParser.
   *   startTag: function (name, attribs, param) { ... },
   *   endTag:   function (name, param) { ... },
   *   pcdata:   function (text, param) { ... },
   *   rcdata:   function (text, param) { ... },
   *   cdata:    function (text, param) { ... },
   *   startDoc: function (param) { ... },
   *   endDoc:   function (param) { ... }
   * }}
   *
   * Every handler method is optional.  startTag and endTag are only invoked
   * for tag names present in html4.ELEMENTS.
   *
   * @param {Object} handler a record containing event handlers.
   * @return {Function} that takes a chunk of html and a parameter.
   *   The parameter is passed on to the handler methods.
   */
  function makeSaxParser(handler) {
    return function parse(htmlText, param) {
      htmlText = String(htmlText);
      // Lazily computed lower-case copy of the unparsed input, used to find
      // the close tag of CDATA/RCDATA elements case-insensitively.
      var htmlLower = null;

      var inTag = false;  // True iff we're currently processing a tag.
      var attribs = [];  // Accumulates attribute names and values.
      var tagName = void 0;  // The name of the tag currently being processed.
      var eflags = void 0;  // The element flags for the current tag.
      var openTag = void 0;  // True if the current tag is an open tag.

      if (handler.startDoc) { handler.startDoc(param); }

      while (htmlText) {
        // Both token patterns match at least one character of any non-empty
        // input, so m is never null and the loop always makes progress.
        var m = htmlText.match(inTag ? INSIDE_TAG_TOKEN : OUTSIDE_TAG_TOKEN);
        htmlText = htmlText.substring(m[0].length);

        if (inTag) {
          if (m[1]) { // attribute
            // setAttribute with uppercase names doesn't work on IE6.
            var attribName = lcase(m[1]);
            var decodedValue;
            if (m[2]) {
              var encodedValue = m[3];
              switch (encodedValue.charCodeAt(0)) {  // Strip quotes
                case 34: case 39:
                  encodedValue = encodedValue.substring(
                      1, encodedValue.length - 1);
                  break;
              }
              decodedValue = unescapeEntities(stripNULs(encodedValue));
            } else {
              // Use name as value for valueless attribs, so
              //   <input type=checkbox checked>
              // gets attributes ['type', 'checkbox', 'checked', 'checked']
              decodedValue = attribName;
            }
            attribs.push(attribName, decodedValue);
          } else if (m[4]) {
            if (eflags !== void 0) {  // False if not in whitelist.
              if (openTag) {
                if (handler.startTag) {
                  handler.startTag(tagName, attribs, param);
                }
              } else {
                if (handler.endTag) {
                  handler.endTag(tagName, param);
                }
              }
            }

            if (openTag
                && (eflags & (html4.eflags.CDATA | html4.eflags.RCDATA))) {
              // The body of a CDATA/RCDATA element runs up to the next
              // close tag with the same name and is not tokenized.
              if (htmlLower === null) {
                htmlLower = lcase(htmlText);
              } else {
                // Keep only the suffix that is still unparsed.
                htmlLower = htmlLower.substring(
                    htmlLower.length - htmlText.length);
              }
              var dataEnd = htmlLower.indexOf('</' + tagName);
              if (dataEnd < 0) { dataEnd = htmlText.length; }
              if (eflags & html4.eflags.CDATA) {
                if (handler.cdata) {
                  handler.cdata(htmlText.substring(0, dataEnd), param);
                }
              } else if (handler.rcdata) {
                handler.rcdata(
                    normalizeRCData(htmlText.substring(0, dataEnd)), param);
              }
              htmlText = htmlText.substring(dataEnd);
            }

            tagName = eflags = openTag = void 0;
            attribs.length = 0;
            inTag = false;
          }
          // Anything else inside a tag is cruft and is dropped.
        } else {
          if (m[1]) {  // Entity
            // Well formed entities are passed through still encoded.
            if (handler.pcdata) { handler.pcdata(m[0], param); }
          } else if (m[3]) {  // Tag
            openTag = !m[2];
            inTag = true;
            tagName = lcase(m[3]);
            eflags = html4.ELEMENTS.hasOwnProperty(tagName)
                ? html4.ELEMENTS[tagName] : void 0;
          } else if (m[4]) {  // Text
            if (handler.pcdata) { handler.pcdata(m[4], param); }
          } else if (m[5]) {  // Cruft
            // A stray markup character is emitted as the equivalent entity
            // so that it cannot be interpreted as markup downstream.
            if (handler.pcdata) {
              switch (m[5]) {
                case '<': handler.pcdata('&lt;', param); break;
                case '>': handler.pcdata('&gt;', param); break;
                default: handler.pcdata('&amp;', param); break;
              }
            }
          }
        }
      }

      if (handler.endDoc) { handler.endDoc(param); }
    };
  }

  return {
    normalizeRCData: normalizeRCData,
    escapeAttrib: escapeAttrib,
    unescapeEntities: unescapeEntities,
    makeSaxParser: makeSaxParser
  };
})();

/**
 * Returns a function that strips unsafe tags and attributes from html.
 * @param {Function} sanitizeAttributes
 *     maps from (tagName, attribs[]) to null or a sanitized attribute array.
 *     The attribs array can be arbitrarily modified, but the same array
 *     instance is reused, so should not be held.
 * @return {Function} from (html, out) where out is an array onto which
 *     chunks of sanitized html are pushed.
 */
html.makeHtmlSanitizer = function (sanitizeAttributes) {
  // Names of the elements that have been opened but not yet closed.
  var stack = [];
  // True while inside an unsafe, non-empty element whose content is dropped.
  var ignoring = false;
  return html.makeSaxParser({
        startDoc: function (_) {
          stack = [];
          ignoring = false;
        },
        startTag: function (tagName, attribs, out) {
          if (ignoring) { return; }
          if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
          var eflags = html4.ELEMENTS[tagName];
          if (eflags & html4.eflags.FOLDABLE) {
            return;
          } else if (eflags & html4.eflags.UNSAFE) {
            // An empty element has no content to skip.
            ignoring = !(eflags & html4.eflags.EMPTY);
            return;
          }
          attribs = sanitizeAttributes(tagName, attribs);
          // TODO(mikesamuel): relying on sanitizeAttributes not to
          // insert unsafe attribute names.
          if (attribs) {
            if (!(eflags & html4.eflags.EMPTY)) {
              stack.push(tagName);
            }

            out.push('<', tagName);
            for (var i = 0, n = attribs.length; i < n; i += 2) {
              var attribName = attribs[i],
                  value = attribs[i + 1];
              // A null or undefined value means the attribute was rejected.
              if (value !== null && value !== void 0) {
                out.push(' ', attribName, '="', html.escapeAttrib(value), '"');
              }
            }
            out.push('>');
          }
        },
        endTag: function (tagName, out) {
          if (ignoring) {
            ignoring = false;
            return;
          }
          if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
          var eflags = html4.ELEMENTS[tagName];
          if (!(eflags & (html4.eflags.UNSAFE | html4.eflags.EMPTY
                          | html4.eflags.FOLDABLE))) {
            var index, stackEl;
            if (eflags & html4.eflags.OPTIONAL_ENDTAG) {
              for (index = stack.length; --index >= 0;) {
                stackEl = stack[index];
                if (stackEl === tagName) { break; }
                if (!(html4.ELEMENTS[stackEl] & html4.eflags.OPTIONAL_ENDTAG)) {
                  // Don't pop non optional end tags looking for a match.
                  return;
                }
              }
            } else {
              for (index = stack.length; --index >= 0;) {
                if (stack[index] === tagName) { break; }
              }
            }
            if (index < 0) { return; }  // Not opened.
            // Close every element opened after the matching one, innermost
            // first, skipping those whose end tag is optional.
            for (var i = stack.length; --i > index;) {
              stackEl = stack[i];
              if (!(html4.ELEMENTS[stackEl] & html4.eflags.OPTIONAL_ENDTAG)) {
                out.push('</', stackEl, '>');
              }
            }
            stack.length = index;
            out.push('</', tagName, '>');
          }
        },
        pcdata: function (text, out) {
          if (!ignoring) { out.push(text); }
        },
        rcdata: function (text, out) {
          if (!ignoring) { out.push(text); }
        },
        cdata: function (text, out) {
          if (!ignoring) { out.push(text); }
        },
        endDoc: function (out) {
          // Close whatever is still open so the output is balanced.
          for (var i = stack.length; --i >= 0;) {
            out.push('</', stack[i], '>');
          }
          stack.length = 0;
        }
      });
};


/**
 * Strips unsafe tags and attributes from html.
 * @param {string} htmlText to sanitize
 * @param {Function} opt_urlPolicy -- a transform to apply to url attribute
 *     values.  When omitted, url attributes are dropped.
 * @param {Function} opt_nmTokenPolicy : string -> string? -- a transform to
 *     apply to names, ids, and classes.  When omitted, those values are
 *     passed through unchanged.
 * @return {string} html
 */
function html_sanitize(htmlText, opt_urlPolicy, opt_nmTokenPolicy) {
  var out = [];
  html.makeHtmlSanitizer(
      function sanitizeAttribs(tagName, attribs) {
        for (var i = 0; i < attribs.length; i += 2) {
          var attribName = attribs[i];
          var value = attribs[i + 1];
          var atype = null;
          // An element specific schema entry takes precedence over the
          // wildcard entry for the same attribute name.
          var attribKey = tagName + ':' + attribName;
          if (!html4.ATTRIBS.hasOwnProperty(attribKey)) {
            attribKey = '*:' + attribName;
          }
          if (html4.ATTRIBS.hasOwnProperty(attribKey)) {
            atype = html4.ATTRIBS[attribKey];
          }
          if (atype !== null) {
            switch (atype) {
              case html4.atype.SCRIPT:
              case html4.atype.STYLE:
                value = null;
                break;
              case html4.atype.IDREF:
              case html4.atype.IDREFS:
              case html4.atype.GLOBAL_NAME:
              case html4.atype.LOCAL_NAME:
              case html4.atype.CLASSES:
                value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
                break;
              case html4.atype.URI:
                value = opt_urlPolicy && opt_urlPolicy(value);
                break;
            }
          } else {
            // Attributes that are not in the schema are dropped.
            value = null;
          }
          attribs[i + 1] = value;
        }
        return attribs;
      })(htmlText, out);
  return out.join('');
}