caja-test: js/ext/html-sanitizer.js comparison

comparison js/ext/html-sanitizer.js @ 0:633c9cb05555

Origination.

author	Atul Varma <varmaa@toolness.com>
date	Sun, 07 Jun 2009 19:29:10 -0700
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:633c9cb05555
+// Copyright (C) 2006 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**
+* @fileoverview
+* An HTML sanitizer that can satisfy a variety of security policies.
+*
+* <p>
+* The HTML sanitizer is built around a SAX parser and HTML element and
+* attributes schemas.
+*
+* @author mikesamuel@gmail.com
+* @requires html4
+* @provides html, html_sanitize
+*/
+/**
+* @namespace
+*/
+var html = (function () {
+var lcase;
+// The below may not be true on browsers in the Turkish locale.
+if ('script' === 'SCRIPT'.toLowerCase()) {
+lcase = function (s) { return s.toLowerCase(); };
+} else {
+/**
+* {@updoc
+* $ lcase('SCRIPT')
+* # 'script'
+* $ lcase('script')
+* # 'script'
+* }
+*/
+lcase = function (s) {
+return s.replace(
+/[A-Z]/g,
+function (ch) {
+return String.fromCharCode(ch.charCodeAt(0) | 32);
+});
+};
+}
+var ENTITIES = {
+lt   : '<',
+gt   : '>',
+amp  : '&',
+nbsp : '\240',
+quot : '"',
+apos : '\''
+};
+var decimalEscapeRe = /^#(\d+)$/;
+var hexEscapeRe = /^#x([0-9A-Fa-f]+)$/;
+/**
+* Decodes an HTML entity.
+*
+* {@updoc
+* $ lookupEntity('lt')
+* # '<'
+* $ lookupEntity('GT')
+* # '>'
+* $ lookupEntity('amp')
+* # '&'
+* $ lookupEntity('nbsp')
+* # '\xA0'
+* $ lookupEntity('apos')
+* # "'"
+* $ lookupEntity('quot')
+* # '"'
+* $ lookupEntity('#xa')
+* # '\n'
+* $ lookupEntity('#10')
+* # '\n'
+* $ lookupEntity('#x0a')
+* # '\n'
+* $ lookupEntity('#010')
+* # '\n'
+* $ lookupEntity('#x00A')
+* # '\n'
+* $ lookupEntity('Pi')      // Known failure
+* # '\u03A0'
+* $ lookupEntity('pi')      // Known failure
+* # '\u03C0'
+* }
+*
+* @param name the content between the '&' and the ';'.
+* @return a single unicode code-point as a string.
+*/
+function lookupEntity(name) {
+name = lcase(name);  // TODO: &pi; is different from &Pi;
+if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
+var m = name.match(decimalEscapeRe);
+if (m) {
+return String.fromCharCode(parseInt(m[1], 10));
+} else if (!!(m = name.match(hexEscapeRe))) {
+return String.fromCharCode(parseInt(m[1], 16));
+}
+return '';
+}
+function decodeOneEntity(_, name) {
+return lookupEntity(name);
+}
+var nulRe = /\0/g;
+function stripNULs(s) {
+return s.replace(nulRe, '');
+}
+var entityRe = /&(#\d+|#x[0-9A-Fa-f]+|\w+);/g;
+/**
+* The plain text of a chunk of HTML CDATA which possibly containing.
+*
+* {@updoc
+* $ unescapeEntities('')
+* # ''
+* $ unescapeEntities('hello World!')
+* # 'hello World!'
+* $ unescapeEntities('1 &lt; 2 &amp;&AMP; 4 &gt; 3&#10;')
+* # '1 < 2 && 4 > 3\n'
+* $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
+* # '<&lt <- unfinished entity>'
+* $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
+* # '/foo?bar=baz&copy=true'
+* $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
+* # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
+* }
+*
+* @param s a chunk of HTML CDATA.  It must not start or end inside an HTML
+*   entity.
+*/
+function unescapeEntities(s) {
+return s.replace(entityRe, decodeOneEntity);
+}
+var ampRe = /&/g;
+var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
+var ltRe = /</g;
+var gtRe = />/g;
+var quotRe = /\"/g;
+var eqRe = /\=/g;  // Backslash required on JScript.net
+/**
+* Escapes HTML special characters in attribute values as HTML entities.
+*
+* {@updoc
+* $ escapeAttrib('')
+* # ''
+* $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
+* # '&quot;&lt;&lt;&amp;&#61;&#61;&amp;&gt;&gt;&quot;'
+* $ escapeAttrib('Hello <World>!')
+* # 'Hello &lt;World&gt;!'
+* }
+*/
+function escapeAttrib(s) {
+// Escaping '=' defangs many UTF-7 and SGML short-tag attacks.
+return s.replace(ampRe, '&amp;').replace(ltRe, '&lt;').replace(gtRe, '&gt;')
+.replace(quotRe, '&quot;').replace(eqRe, '&#61;');
+}
+/**
+* Escape entities in RCDATA that can be escaped without changing the meaning.
+* {@updoc
+* $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
+* # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
+* }
+*/
+function normalizeRCData(rcdata) {
+return rcdata
+.replace(looseAmpRe, '&amp;$1')
+.replace(ltRe, '&lt;')
+.replace(gtRe, '&gt;');
+}
+// TODO(mikesamuel): validate sanitizer regexs against the HTML5 grammar at
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html
+/** token definitions. */
+var INSIDE_TAG_TOKEN = new RegExp(
+// Don't capture space.
+'^\\s*(?:'
+// Capture an attribute name in group 1, and value in group 3.
+// We capture the fact that there was an attribute in group 2, since
+// interpreters are inconsistent in whether a group that matches nothing
+// is null, undefined, or the empty string.
++ ('(?:'
++ '([a-z][a-z-]*)'                    // attribute name
++ ('('                                // optionally followed
++ '\\s*=\\s*'
++ ('('
+// A double quoted string.
++ '\"[^\"]*\"'
+// A single quoted string.
++ '|\'[^\']*\''
+// The positive lookahead is used to make sure that in
+// <foo bar= baz=boo>, the value for bar is blank, not "baz=boo".
++ '|(?=[a-z][a-z-]*\\s*=)'
+// An unquoted value that is not an attribute name.
+// We know it is not an attribute name because the previous
+// zero-width match would've eliminated that possibility.
++ '|[^>\"\'\\s]*'
++ ')'
+)
++ ')'
+) + '?'
++ ')'
+)
+// End of tag captured in group 3.
++ '|(/?>)'
+// Don't capture cruft
++ '|.[^\\w\\s>]*)',
+'i');
+var OUTSIDE_TAG_TOKEN = new RegExp(
+'^(?:'
+// Entity captured in group 1.
++ '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);'
+// Comment, doctypes, and processing instructions not captured.
++ '|<\!--[\\s\\S]*?--\>|<!\\w[^>]*>|<\\?[^>*]*>'
+// '/' captured in group 2 for close tags, and name captured in group 3.
++ '|<(/)?([a-z][a-z0-9]*)'
+// Text captured in group 4.
++ '|([^<&>]+)'
+// Cruft captured in group 5.
++ '|([<&>]))',
+'i');
+/**
+* Given a SAX-like event handler, produce a function that feeds those
+* events and a parameter to the event handler.
+*
+* The event handler has the form:{@code
+* {
+*   // Name is an upper-case HTML tag name.  Attribs is an array of
+*   // alternating upper-case attribute names, and attribute values.  The
+*   // attribs array is reused by the parser.  Param is the value passed to
+*   // the saxParser.
+*   startTag: function (name, attribs, param) { ... },
+*   endTag:   function (name, param) { ... },
+*   pcdata:   function (text, param) { ... },
+*   rcdata:   function (text, param) { ... },
+*   cdata:    function (text, param) { ... },
+*   startDoc: function (param) { ... },
+*   endDoc:   function (param) { ... }
+* }}
+*
+* @param {Object} handler a record containing event handlers.
+* @return {Function} that takes a chunk of html and a parameter.
+*   The parameter is passed on to the handler methods.
+*/
+function makeSaxParser(handler) {
+return function parse(htmlText, param) {
+htmlText = String(htmlText);
+var htmlLower = null;
+var inTag = false;  // True iff we're currently processing a tag.
+var attribs = [];  // Accumulates attribute names and values.
+var tagName = void 0;  // The name of the tag currently being processed.
+var eflags = void 0;  // The element flags for the current tag.
+var openTag = void 0;  // True if the current tag is an open tag.
+if (handler.startDoc) { handler.startDoc(param); }
+while (htmlText) {
+var m = htmlText.match(inTag ? INSIDE_TAG_TOKEN : OUTSIDE_TAG_TOKEN);
+htmlText = htmlText.substring(m[0].length);
+if (inTag) {
+if (m[1]) { // attribute
+// setAttribute with uppercase names doesn't work on IE6.
+var attribName = lcase(m[1]);
+var decodedValue;
+if (m[2]) {
+var encodedValue = m[3];
+switch (encodedValue.charCodeAt(0)) {  // Strip quotes
+case 34: case 39:
+encodedValue = encodedValue.substring(
+1, encodedValue.length - 1);
+break;
+}
+decodedValue = unescapeEntities(stripNULs(encodedValue));
+} else {
+// Use name as value for valueless attribs, so
+//   <input type=checkbox checked>
+// gets attributes ['type', 'checkbox', 'checked', 'checked']
+decodedValue = attribName;
+}
+attribs.push(attribName, decodedValue);
+} else if (m[4]) {
+if (eflags !== void 0) {  // False if not in whitelist.
+if (openTag) {
+if (handler.startTag) {
+handler.startTag(tagName, attribs, param);
+}
+} else {
+if (handler.endTag) {
+handler.endTag(tagName, param);
+}
+}
+}
+if (openTag
+&& (eflags & (html4.eflags.CDATA | html4.eflags.RCDATA))) {
+if (htmlLower === null) {
+htmlLower = lcase(htmlText);
+} else {
+htmlLower = htmlLower.substring(
+htmlLower.length - htmlText.length);
+}
+var dataEnd = htmlLower.indexOf('</' + tagName);
+if (dataEnd < 0) { dataEnd = htmlText.length; }
+if (eflags & html4.eflags.CDATA) {
+if (handler.cdata) {
+handler.cdata(htmlText.substring(0, dataEnd), param);
+}
+} else if (handler.rcdata) {
+handler.rcdata(
+normalizeRCData(htmlText.substring(0, dataEnd)), param);
+}
+htmlText = htmlText.substring(dataEnd);
+}
+tagName = eflags = openTag = void 0;
+attribs.length = 0;
+inTag = false;
+}
+} else {
+if (m[1]) {  // Entity
+if (handler.pcdata) { handler.pcdata(m[0], param); }
+} else if (m[3]) {  // Tag
+openTag = !m[2];
+inTag = true;
+tagName = lcase(m[3]);
+eflags = html4.ELEMENTS.hasOwnProperty(tagName)
+? html4.ELEMENTS[tagName] : void 0;
+} else if (m[4]) {  // Text
+if (handler.pcdata) { handler.pcdata(m[4], param); }
+} else if (m[5]) {  // Cruft
+if (handler.pcdata) {
+switch (m[5]) {
+case '<': handler.pcdata('&lt;', param); break;
+case '>': handler.pcdata('&gt;', param); break;
+default: handler.pcdata('&amp;', param); break;
+}
+}
+}
+}
+}
+if (handler.endDoc) { handler.endDoc(param); }
+};
+}
+return {  // Public members of the html namespace.
+normalizeRCData: normalizeRCData,  // Escapes loose '&', '<' and '>' in RCDATA.
+escapeAttrib: escapeAttrib,  // Escapes a string for use as an attribute value.
+unescapeEntities: unescapeEntities,  // Decodes HTML entities to plain text.
+makeSaxParser: makeSaxParser  // Builds a parser that drives a SAX-like handler.
+};
+})();
+/**
+* Returns a function that strips unsafe tags and attributes from html.
+*
+* The returned function is a SAX parser built with html.makeSaxParser; it is
+* called with (htmlText, out), where out is an array onto which the pieces of
+* the sanitized html are pushed.  The parser keeps two pieces of state that
+* are reset at the start of every document: a stack of the open elements that
+* have been written to out, and an `ignoring` flag that suppresses output
+* after the start tag of an unsafe, non-empty element until the next end-tag
+* event.
+*
+* Element handling, driven by the flags in html4.ELEMENTS:
+* - names that are not own properties of html4.ELEMENTS are dropped;
+* - FOLDABLE elements have their tags dropped;
+* - UNSAFE elements have their tags dropped and, unless EMPTY, switch on
+*   `ignoring`;
+* - every other start tag is emitted only if sanitizeAttributes returns a
+*   truthy attribute array; attributes whose value is null or undefined are
+*   omitted and the rest are escaped with html.escapeAttrib.
+* At the end of the document all elements left on the stack are closed.
+*
+* @param {Function} sanitizeAttributes
+*     maps from (tagName, attribs[]) to null or a sanitized attribute array.
+*     The attribs array can be arbitrarily modified, but the same array
+*     instance is reused, so should not be held.
+* @return {Function} from html to sanitized html
+*/
+html.makeHtmlSanitizer = function (sanitizeAttributes) {
+var stack = [];  // Names of emitted, still-open, non-EMPTY elements.
+var ignoring = false;  // True while output is being suppressed.
+return html.makeSaxParser({
+startDoc: function (_) {
+stack = [];
+ignoring = false;
+},
+startTag: function (tagName, attribs, out) {
+if (ignoring) { return; }
+if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
+var eflags = html4.ELEMENTS[tagName];
+if (eflags & html4.eflags.FOLDABLE) {
+return;
+} else if (eflags & html4.eflags.UNSAFE) {
+ignoring = !(eflags & html4.eflags.EMPTY);  // EMPTY elements have no content to skip.
+return;
+}
+attribs = sanitizeAttributes(tagName, attribs);
+// TODO(mikesamuel): relying on sanitizeAttributes not to
+// insert unsafe attribute names.
+if (attribs) {
+if (!(eflags & html4.eflags.EMPTY)) {
+stack.push(tagName);
+}
+out.push('<', tagName);
+for (var i = 0, n = attribs.length; i < n; i += 2) {  // attribs alternates name, value.
+var attribName = attribs[i],
+value = attribs[i + 1];
+if (value !== null && value !== void 0) {
+out.push(' ', attribName, '="', html.escapeAttrib(value), '"');
+}
+}
+out.push('>');
+}
+},
+endTag: function (tagName, out) {
+if (ignoring) {  // Any end-tag event ends the ignored region.
+ignoring = false;
+return;
+}
+if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
+var eflags = html4.ELEMENTS[tagName];
+if (!(eflags & (html4.eflags.UNSAFE | html4.eflags.EMPTY
+| html4.eflags.FOLDABLE))) {
+var index;  // Stack position of the element being closed, or -1.
+if (eflags & html4.eflags.OPTIONAL_ENDTAG) {
+for (index = stack.length; --index >= 0;) {
+var stackEl = stack[index];
+if (stackEl === tagName) { break; }
+if (!(html4.ELEMENTS[stackEl] & html4.eflags.OPTIONAL_ENDTAG)) {
+// Don't pop non optional end tags looking for a match.
+return;
+}
+}
+} else {
+for (index = stack.length; --index >= 0;) {
+if (stack[index] === tagName) { break; }
+}
+}
+if (index < 0) { return; }  // Not opened.
+for (var i = stack.length; --i > index;) {  // Close elements opened inside the match.
+var stackEl = stack[i];
+if (!(html4.ELEMENTS[stackEl] & html4.eflags.OPTIONAL_ENDTAG)) {
+out.push('</', stackEl, '>');
+}
+}
+stack.length = index;  // Pops the matched element and everything above it.
+out.push('</', tagName, '>');
+}
+},
+pcdata: function (text, out) {
+if (!ignoring) { out.push(text); }
+},
+rcdata: function (text, out) {
+if (!ignoring) { out.push(text); }
+},
+cdata: function (text, out) {
+if (!ignoring) { out.push(text); }
+},
+endDoc: function (out) {
+for (var i = stack.length; --i >= 0;) {  // Close whatever is still open, innermost first.
+out.push('</', stack[i], '>');
+}
+stack.length = 0;
+}
+});
+};
+/**
+* Strips unsafe tags and attributes from html.
+* @param {string} htmlText to sanitize
+* @param {Function} opt_urlPolicy -- a transform to apply to url attribute
+*     values.
+* @param {Function} opt_nmTokenPolicy : string -> string? -- a transform to
+*     apply to names, ids, and classes.
+* @return {string} html
+*/
+function html_sanitize(htmlText, opt_urlPolicy, opt_nmTokenPolicy) {
+var out = [];
+html.makeHtmlSanitizer(
+function sanitizeAttribs(tagName, attribs) {
+for (var i = 0; i < attribs.length; i += 2) {
+var attribName = attribs[i];
+var value = attribs[i + 1];
+var atype = null, attribKey;
+if ((attribKey = tagName + ':' + attribName,
+html4.ATTRIBS.hasOwnProperty(attribKey))
+|| (attribKey = '*:' + attribName,
+html4.ATTRIBS.hasOwnProperty(attribKey))) {
+atype = html4.ATTRIBS[attribKey];
+}
+if (atype !== null) {
+switch (atype) {
+case html4.atype.SCRIPT:
+case html4.atype.STYLE:
+value = null;
+break;
+case html4.atype.IDREF:
+case html4.atype.IDREFS:
+case html4.atype.GLOBAL_NAME:
+case html4.atype.LOCAL_NAME:
+case html4.atype.CLASSES:
+value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
+break;
+case html4.atype.URI:
+value = opt_urlPolicy && opt_urlPolicy(value);
+break;
+}
+} else {
+value = null;
+}
+attribs[i + 1] = value;
+}
+return attribs;
+})(htmlText, out);
+return out.join('');
+}