Mercurial > caja-test
view js/ext/html-sanitizer.js @ 4:cf673c093b61 default tip
paver runserver now shows errors if cajoling failed.
author | Atul Varma <varmaa@toolness.com> |
---|---|
date | Sun, 07 Jun 2009 20:44:44 -0700 |
parents | 633c9cb05555 |
children |
line wrap: on
line source
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview
 * An HTML sanitizer that can satisfy a variety of security policies.
 *
 * <p>
 * The HTML sanitizer is built around a SAX parser and HTML element and
 * attributes schemas.
 *
 * @author mikesamuel@gmail.com
 * @requires html4
 * @provides html, html_sanitize
 */

/**
 * @namespace
 */
var html = (function () {
  var lcase;
  // The below may not be true on browsers in the Turkish locale.
  if ('script' === 'SCRIPT'.toLowerCase()) {
    lcase = function (s) { return s.toLowerCase(); };
  } else {
    /**
     * Locale-independent lower-casing of the ASCII letters only.
     *
     * {@updoc
     * $ lcase('SCRIPT')
     * # 'script'
     * $ lcase('script')
     * # 'script'
     * }
     */
    lcase = function (s) {
      return s.replace(
          /[A-Z]/g,
          function (ch) {
            // Setting bit 5 maps an ASCII upper-case letter to lower-case.
            return String.fromCharCode(ch.charCodeAt(0) | 32);
          });
    };
  }

  // The only named entities this sanitizer knows how to decode.
  // '\xA0' (no-break space) is spelled as a hex escape because the octal
  // form '\240' is a syntax error in strict mode code.
  var ENTITIES = {
    lt   : '<',
    gt   : '>',
    amp  : '&',
    nbsp : '\xA0',
    quot : '"',
    apos : '\''
  };

  var decimalEscapeRe = /^#(\d+)$/;
  var hexEscapeRe = /^#x([0-9A-Fa-f]+)$/;

  /**
   * Decodes an HTML entity.
   *
   * {@updoc
   * $ lookupEntity('lt')
   * # '<'
   * $ lookupEntity('GT')
   * # '>'
   * $ lookupEntity('amp')
   * # '&'
   * $ lookupEntity('nbsp')
   * # '\xA0'
   * $ lookupEntity('apos')
   * # "'"
   * $ lookupEntity('quot')
   * # '"'
   * $ lookupEntity('#xa')
   * # '\n'
   * $ lookupEntity('#10')
   * # '\n'
   * $ lookupEntity('#x0a')
   * # '\n'
   * $ lookupEntity('#010')
   * # '\n'
   * $ lookupEntity('#x00A')
   * # '\n'
   * $ lookupEntity('Pi')      // Known failure
   * # '\u03A0'
   * $ lookupEntity('pi')      // Known failure
   * # '\u03C0'
   * }
   *
   * @param name the content between the '&' and the ';'.
   * @return a single unicode code-point as a string, or the empty string
   *     when the name is neither a known named entity nor a numeric escape.
   */
  function lookupEntity(name) {
    name = lcase(name);  // TODO: &pi; is different from &Pi;
    if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
    var m = name.match(decimalEscapeRe);
    if (m) {
      return String.fromCharCode(parseInt(m[1], 10));
    } else if (!!(m = name.match(hexEscapeRe))) {
      return String.fromCharCode(parseInt(m[1], 16));
    }
    return '';
  }

  /** String.replace callback: decodes the entity name captured in group 1. */
  function decodeOneEntity(_, name) {
    return lookupEntity(name);
  }

  var nulRe = /\0/g;
  /** Removes every NUL character from s. */
  function stripNULs(s) {
    return s.replace(nulRe, '');
  }

  var entityRe = /&(#\d+|#x[0-9A-Fa-f]+|\w+);/g;
  /**
   * Returns the plain text of a chunk of HTML CDATA, decoding any complete
   * entities it contains.  An ampersand that does not start a complete
   * entity (no terminating ';') is left untouched.
   *
   * {@updoc
   * $ unescapeEntities('')
   * # ''
   * $ unescapeEntities('hello World!')
   * # 'hello World!'
   * $ unescapeEntities('1 &lt; 2 &amp;&amp; 4 &gt; 3&#10;')
   * # '1 < 2 && 4 > 3\n'
   * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
   * # '<&lt <- unfinished entity>'
   * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
   * # '/foo?bar=baz&copy=true'
   * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
   * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
   * }
   *
   * @param s a chunk of HTML CDATA.  It must not start or end inside an HTML
   *   entity.
   */
  function unescapeEntities(s) {
    return s.replace(entityRe, decodeOneEntity);
  }

  var ampRe = /&/g;
  // An ampersand that cannot be the start of an entity.
  var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
  var ltRe = /</g;
  var gtRe = />/g;
  var quotRe = /\"/g;
  var eqRe = /\=/g;  // Backslash required on JScript.net

  /**
   * Escapes HTML special characters in attribute values as HTML entities.
   *
   * {@updoc
   * $ escapeAttrib('')
   * # ''
   * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
   * # '&#34;&lt;&lt;&amp;&#61;&#61;&amp;&gt;&gt;&#34;'
   * $ escapeAttrib('Hello <World>!')
   * # 'Hello &lt;World&gt;!'
   * }
   *
   * @param {string} s the raw (already decoded) attribute value.
   * @return {string} s with '&', '<', '>', '"' and '=' replaced by entities.
   */
  function escapeAttrib(s) {
    // Ampersands must be escaped first so that the entities introduced by the
    // later replacements are not themselves re-escaped.
    // Escaping '=' defangs many UTF-7 and SGML short-tag attacks.
    return s.replace(ampRe, '&amp;').replace(ltRe, '&lt;').replace(gtRe, '&gt;')
        .replace(quotRe, '&#34;').replace(eqRe, '&#61;');
  }

  /**
   * Escape entities in RCDATA that can be escaped without changing the meaning.
   * Well-formed entities are left alone; only loose ampersands and angle
   * brackets are rewritten.
   *
   * {@updoc
   * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
   * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
   * }
   *
   * @param {string} rcdata the RCDATA text.
   * @return {string} the normalized text.
   */
  function normalizeRCData(rcdata) {
    return rcdata
        .replace(looseAmpRe, '&amp;$1')
        .replace(ltRe, '&lt;')
        .replace(gtRe, '&gt;');
  }


  // TODO(mikesamuel): validate sanitizer regexs against the HTML5 grammar at
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html

  /** token definitions. */
  var INSIDE_TAG_TOKEN = new RegExp(
      // Don't capture space.
      '^\\s*(?:'
      // Capture an attribute name in group 1, and value in group 3.
      // We capture the fact that there was an attribute in group 2, since
      // interpreters are inconsistent in whether a group that matches nothing
      // is null, undefined, or the empty string.
      + ('(?:'
         + '([a-z][a-z-]*)'                    // attribute name
         + ('('                                // optionally followed
            + '\\s*=\\s*'
            + ('('
               // A double quoted string.
               + '\"[^\"]*\"'
               // A single quoted string.
               + '|\'[^\']*\''
               // The positive lookahead is used to make sure that in
               // <foo bar= baz=boo>, the value for bar is blank, not "baz=boo".
               + '|(?=[a-z][a-z-]*\\s*=)'
               // An unquoted value that is not an attribute name.
               // We know it is not an attribute name because the previous
               // zero-width match would've eliminated that possibility.
               + '|[^>\"\'\\s]*'
               + ')'
               )
            + ')'
            ) + '?'
         + ')'
         )
      // End of tag captured in group 4.
      + '|(/?>)'
      // Don't capture cruft.  [\s\S] rather than '.' is used so that a line
      // terminator is also consumed; otherwise an unterminated tag followed
      // only by a newline would match nothing and the parser would throw.
      + '|[\\s\\S][^\\w\\s>]*)',
      'i');

  var OUTSIDE_TAG_TOKEN = new RegExp(
      '^(?:'
      // Entity captured in group 1.
      + '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);'
      // Comment, doctypes, and processing instructions not captured.
      + '|<\!--[\\s\\S]*?--\>|<!\\w[^>]*>|<\\?[^>*]*>'
      // '/' captured in group 2 for close tags, and name captured in group 3.
      + '|<(/)?([a-z][a-z0-9]*)'
      // Text captured in group 4.
      + '|([^<&>]+)'
      // Cruft captured in group 5.
      + '|([<&>]))',
      'i');

  /**
   * Given a SAX-like event handler, produce a function that feeds those
   * events and a parameter to the event handler.
   *
   * The event handler has the form:{@code
   * {
   *   // Name is a lower-case HTML tag name.  Attribs is an array of
   *   // alternating lower-case attribute names, and attribute values.  The
   *   // attribs array is reused by the parser.  Param is the value passed to
   *   // the saxParser.
   *   startTag: function (name, attribs, param) { ... },
   *   endTag:   function (name, param) { ... },
   *   pcdata:   function (text, param) { ... },
   *   rcdata:   function (text, param) { ... },
   *   cdata:    function (text, param) { ... },
   *   startDoc: function (param) { ... },
   *   endDoc:   function (param) { ... }
   * }}
   *
   * Tag events are only fired for element names present in
   * {@code html4.ELEMENTS}; every handler method is optional.
   *
   * @param {Object} handler a record containing event handlers.
   * @return {Function} that takes a chunk of html and a parameter.
   *   The parameter is passed on to the handler methods.
   */
  function makeSaxParser(handler) {
    return function parse(htmlText, param) {
      htmlText = String(htmlText);
      // Lazily computed lower-case copy of the remaining input, used to find
      // the end of CDATA/RCDATA sections case-insensitively.
      var htmlLower = null;

      var inTag = false;  // True iff we're currently processing a tag.
      var attribs = [];  // Accumulates attribute names and values.
      var tagName = void 0;  // The name of the tag currently being processed.
      var eflags = void 0;  // The element flags for the current tag.
      var openTag = void 0;  // True if the current tag is an open tag.

      if (handler.startDoc) { handler.startDoc(param); }

      while (htmlText) {
        // Both token patterns consume at least one character of any
        // non-empty input, so the loop always makes progress.
        var m = htmlText.match(inTag ? INSIDE_TAG_TOKEN : OUTSIDE_TAG_TOKEN);
        htmlText = htmlText.substring(m[0].length);

        if (inTag) {
          if (m[1]) { // attribute
            // setAttribute with uppercase names doesn't work on IE6.
            var attribName = lcase(m[1]);
            var decodedValue;
            if (m[2]) {
              var encodedValue = m[3];
              switch (encodedValue.charCodeAt(0)) {  // Strip quotes
                case 34: case 39:
                  encodedValue = encodedValue.substring(
                      1, encodedValue.length - 1);
                  break;
              }
              decodedValue = unescapeEntities(stripNULs(encodedValue));
            } else {
              // Use name as value for valueless attribs, so
              //   <input type=checkbox checked>
              // gets attributes ['type', 'checkbox', 'checked', 'checked']
              decodedValue = attribName;
            }
            attribs.push(attribName, decodedValue);
          } else if (m[4]) {  // end of tag
            if (eflags !== void 0) {  // False if not in whitelist.
              if (openTag) {
                if (handler.startTag) {
                  handler.startTag(tagName, attribs, param);
                }
              } else {
                if (handler.endTag) {
                  handler.endTag(tagName, param);
                }
              }
            }

            if (openTag
                && (eflags & (html4.eflags.CDATA | html4.eflags.RCDATA))) {
              // The element body is raw text that runs up to the matching
              // close tag, or to the end of input if there is none.
              if (htmlLower === null) {
                htmlLower = lcase(htmlText);
              } else {
                htmlLower = htmlLower.substring(
                    htmlLower.length - htmlText.length);
              }
              var dataEnd = htmlLower.indexOf('</' + tagName);
              if (dataEnd < 0) { dataEnd = htmlText.length; }
              if (eflags & html4.eflags.CDATA) {
                if (handler.cdata) {
                  handler.cdata(htmlText.substring(0, dataEnd), param);
                }
              } else if (handler.rcdata) {
                handler.rcdata(
                    normalizeRCData(htmlText.substring(0, dataEnd)), param);
              }
              htmlText = htmlText.substring(dataEnd);
            }

            tagName = eflags = openTag = void 0;
            // The same array is handed to every startTag call, so it is
            // truncated in place rather than replaced.
            attribs.length = 0;
            inTag = false;
          }
        } else {
          if (m[1]) {  // Entity
            if (handler.pcdata) { handler.pcdata(m[0], param); }
          } else if (m[3]) {  // Tag
            openTag = !m[2];
            inTag = true;
            tagName = lcase(m[3]);
            eflags = html4.ELEMENTS.hasOwnProperty(tagName)
                ? html4.ELEMENTS[tagName] : void 0;
          } else if (m[4]) {  // Text
            if (handler.pcdata) { handler.pcdata(m[4], param); }
          } else if (m[5]) {  // Cruft
            // A stray markup character is re-encoded so that it is emitted
            // as text rather than markup.
            if (handler.pcdata) {
              switch (m[5]) {
                case '<': handler.pcdata('&lt;', param); break;
                case '>': handler.pcdata('&gt;', param); break;
                default: handler.pcdata('&amp;', param); break;
              }
            }
          }
        }
      }

      if (handler.endDoc) { handler.endDoc(param); }
    };
  }

  return {
    normalizeRCData: normalizeRCData,
    escapeAttrib: escapeAttrib,
    unescapeEntities: unescapeEntities,
    makeSaxParser: makeSaxParser
  };
})();

/**
 * Returns a function that strips unsafe tags and attributes from html.
 * @param {Function} sanitizeAttributes
 *     maps from (tagName, attribs[]) to null or a sanitized attribute array.
 *     The attribs array can be arbitrarily modified, but the same array
 *     instance is reused, so should not be held.
 * @return {Function} from html to sanitized html.  The returned function
 *     takes the html text and an output array onto which chunks of
 *     sanitized html are pushed.
 */
html.makeHtmlSanitizer = function (sanitizeAttributes) {
  var stack = [];        // Names of the emitted, still open, elements.
  var ignoring = false;  // True while inside the body of an unsafe element.
  return html.makeSaxParser({
        startDoc: function (_) {
          stack = [];
          ignoring = false;
        },
        startTag: function (tagName, attribs, out) {
          if (ignoring) { return; }
          if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
          var eflags = html4.ELEMENTS[tagName];
          if (eflags & html4.eflags.FOLDABLE) {
            // The tag is dropped but its content is kept.
            return;
          } else if (eflags & html4.eflags.UNSAFE) {
            // Drop the tag and, unless it is an empty element, everything up
            // to the next end tag.
            ignoring = !(eflags & html4.eflags.EMPTY);
            return;
          }
          attribs = sanitizeAttributes(tagName, attribs);
          // TODO(mikesamuel): relying on sanitizeAttributes not to
          // insert unsafe attribute names.
          if (attribs) {
            if (!(eflags & html4.eflags.EMPTY)) {
              stack.push(tagName);
            }

            out.push('<', tagName);
            for (var i = 0, n = attribs.length; i < n; i += 2) {
              var attribName = attribs[i],
                  value = attribs[i + 1];
              // A null or undefined value means the attribute was rejected.
              if (value !== null && value !== void 0) {
                out.push(' ', attribName, '="', html.escapeAttrib(value), '"');
              }
            }
            out.push('>');
          }
        },
        endTag: function (tagName, out) {
          if (ignoring) {
            ignoring = false;
            return;
          }
          if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
          var eflags = html4.ELEMENTS[tagName];
          if (!(eflags & (html4.eflags.UNSAFE | html4.eflags.EMPTY
                          | html4.eflags.FOLDABLE))) {
            var index;
            var stackEl;
            if (eflags & html4.eflags.OPTIONAL_ENDTAG) {
              for (index = stack.length; --index >= 0;) {
                stackEl = stack[index];
                if (stackEl === tagName) { break; }
                if (!(html4.ELEMENTS[stackEl]
                      & html4.eflags.OPTIONAL_ENDTAG)) {
                  // Don't pop non optional end tags looking for a match.
                  return;
                }
              }
            } else {
              for (index = stack.length; --index >= 0;) {
                if (stack[index] === tagName) { break; }
              }
            }
            if (index < 0) { return; }  // Not opened.
            // Close the elements opened after the matching one, innermost
            // first, skipping those whose end tag is optional.
            for (var i = stack.length; --i > index;) {
              stackEl = stack[i];
              if (!(html4.ELEMENTS[stackEl]
                    & html4.eflags.OPTIONAL_ENDTAG)) {
                out.push('</', stackEl, '>');
              }
            }
            stack.length = index;
            out.push('</', tagName, '>');
          }
        },
        pcdata: function (text, out) {
          if (!ignoring) { out.push(text); }
        },
        rcdata: function (text, out) {
          if (!ignoring) { out.push(text); }
        },
        cdata: function (text, out) {
          if (!ignoring) { out.push(text); }
        },
        endDoc: function (out) {
          // Close whatever is still open so that the output is balanced.
          for (var i = stack.length; --i >= 0;) {
            out.push('</', stack[i], '>');
          }
          stack.length = 0;
        }
      });
};


/**
 * Strips unsafe tags and attributes from html.
 * @param {string} htmlText to sanitize
 * @param {Function} opt_urlPolicy -- a transform to apply to url attribute
 *     values.  When it is omitted, url attributes are dropped.
 * @param {Function} opt_nmTokenPolicy : string -> string? -- a transform to
 *     apply to names, ids, and classes.  When it is omitted, those values
 *     are passed through unchanged.
 * @return {string} html
 */
function html_sanitize(htmlText, opt_urlPolicy, opt_nmTokenPolicy) {
  var out = [];
  html.makeHtmlSanitizer(
      function sanitizeAttribs(tagName, attribs) {
        for (var i = 0; i < attribs.length; i += 2) {
          var attribName = attribs[i];
          var value = attribs[i + 1];
          var atype = null;
          // An element specific schema entry wins over the '*' wildcard entry.
          var attribKey = tagName + ':' + attribName;
          if (!html4.ATTRIBS.hasOwnProperty(attribKey)) {
            attribKey = '*:' + attribName;
          }
          if (html4.ATTRIBS.hasOwnProperty(attribKey)) {
            atype = html4.ATTRIBS[attribKey];
          }
          if (atype !== null) {
            switch (atype) {
              case html4.atype.SCRIPT:
              case html4.atype.STYLE:
                value = null;
                break;
              case html4.atype.IDREF:
              case html4.atype.IDREFS:
              case html4.atype.GLOBAL_NAME:
              case html4.atype.LOCAL_NAME:
              case html4.atype.CLASSES:
                value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
                break;
              case html4.atype.URI:
                value = opt_urlPolicy && opt_urlPolicy(value);
                break;
            }
          } else {
            // Attributes that are not in the schema are dropped.
            value = null;
          }
          attribs[i + 1] = value;
        }
        return attribs;
      })(htmlText, out);
  return out.join('');
}