Mercurial > caja-test
comparison js/ext/html-sanitizer.js @ 0:633c9cb05555
Origination.
author | Atul Varma <varmaa@toolness.com> |
---|---|
date | Sun, 07 Jun 2009 19:29:10 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:633c9cb05555 |
---|---|
1 // Copyright (C) 2006 Google Inc. | |
2 // | |
3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
4 // you may not use this file except in compliance with the License. | |
5 // You may obtain a copy of the License at | |
6 // | |
7 // http://www.apache.org/licenses/LICENSE-2.0 | |
8 // | |
9 // Unless required by applicable law or agreed to in writing, software | |
10 // distributed under the License is distributed on an "AS IS" BASIS, | |
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 // See the License for the specific language governing permissions and | |
13 // limitations under the License. | |
14 | |
15 /** | |
16 * @fileoverview | |
17 * An HTML sanitizer that can satisfy a variety of security policies. | |
18 * | |
19 * <p> | |
20 * The HTML sanitizer is built around a SAX parser and HTML element and | |
21 * attributes schemas. | |
22 * | |
23 * @author mikesamuel@gmail.com | |
24 * @requires html4 | |
25 * @provides html, html_sanitize | |
26 */ | |
27 | |
28 /** | |
29 * @namespace | |
30 */ | |
var html = (function () {
  /**
   * Lower-cases the ASCII letters A-Z and leaves every other code unit
   * untouched, so the result always has exactly the same length as the input.
   *
   * {@updoc
   * $ lcaseAscii('SCRIPT')
   * # 'script'
   * $ lcaseAscii('script')
   * # 'script'
   * }
   *
   * @param {string} s
   * @return {string}
   */
  function lcaseAscii(s) {
    return s.replace(
        /[A-Z]/g,
        function (ch) {
          return String.fromCharCode(ch.charCodeAt(0) | 32);
        });
  }

  // Used for the short ASCII names produced by the tokenizer (tag names,
  // attribute names, entity names).
  var lcase;
  // The below may not be true on browsers in the Turkish locale.
  if ('script' === 'SCRIPT'.toLowerCase()) {
    lcase = function (s) { return s.toLowerCase(); };
  } else {
    lcase = lcaseAscii;
  }

  // Named entities understood by lookupEntity.  Keys are lower-case because
  // lookupEntity lower-cases the name before the lookup.
  var ENTITIES = {
    lt   : '<',
    gt   : '>',
    amp  : '&',
    // Written as a hex escape: the legacy octal form '\240' is a syntax error
    // in strict-mode and module code.
    nbsp : '\xA0',
    quot : '"',
    apos : '\''
  };

  /**
   * Converts a numeric character reference value to a string.
   *
   * Zero, surrogate code points and values above U+10FFFF become U+FFFD,
   * which is what the HTML parsing algorithm prescribes.  Astral code points
   * are emitted as a surrogate pair because String.fromCharCode keeps only the
   * low 16 bits of its argument (so 0x1003C would otherwise decode to '<').
   *
   * @param {number} codePoint
   * @return {string}
   */
  function codePointToString(codePoint) {
    if (!(codePoint > 0 && codePoint <= 0x10FFFF)
        || (codePoint >= 0xD800 && codePoint <= 0xDFFF)) {
      return '\uFFFD';
    }
    if (codePoint <= 0xFFFF) { return String.fromCharCode(codePoint); }
    var offset = codePoint - 0x10000;
    return String.fromCharCode(
        0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
  }

  var decimalEscapeRe = /^#(\d+)$/;
  var hexEscapeRe = /^#x([0-9A-Fa-f]+)$/;
  /**
   * Decodes an HTML entity.
   *
   * {@updoc
   * $ lookupEntity('lt')
   * # '<'
   * $ lookupEntity('GT')
   * # '>'
   * $ lookupEntity('amp')
   * # '&'
   * $ lookupEntity('nbsp')
   * # '\xA0'
   * $ lookupEntity('apos')
   * # "'"
   * $ lookupEntity('quot')
   * # '"'
   * $ lookupEntity('#xa')
   * # '\n'
   * $ lookupEntity('#10')
   * # '\n'
   * $ lookupEntity('#X0A')
   * # '\n'
   * $ lookupEntity('#x1F600')
   * # '\uD83D\uDE00'
   * $ lookupEntity('#0')
   * # '\uFFFD'
   * $ lookupEntity('Pi')  // Known failure: only the names in ENTITIES decode.
   * # '\u03A0'
   * $ lookupEntity('pi')  // Known failure
   * # '\u03C0'
   * }
   *
   * @param name the content between the '&' and the ';'.
   * @return the decoded text, or the empty string when the name is neither a
   *     numeric reference nor one of the names in ENTITIES.
   */
  function lookupEntity(name) {
    name = lcase(name);  // TODO: &pi; is different from &Pi;
    if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
    var m = name.match(decimalEscapeRe);
    if (m) {
      return codePointToString(parseInt(m[1], 10));
    } else if (!!(m = name.match(hexEscapeRe))) {
      return codePointToString(parseInt(m[1], 16));
    }
    return '';
  }

  // Adapter with the (wholeMatch, group1) signature String.replace expects.
  function decodeOneEntity(_, name) {
    return lookupEntity(name);
  }

  var nulRe = /\0/g;
  /** Removes every NUL character from s. */
  function stripNULs(s) {
    return s.replace(nulRe, '');
  }

  // The 'x' of a hex reference may be upper-case, matching the
  // case-insensitive tokenizer below.
  var entityRe = /&(#\d+|#[xX][0-9A-Fa-f]+|\w+);/g;
  /**
   * The plain text of a chunk of HTML CDATA which possibly contains entities.
   *
   * {@updoc
   * $ unescapeEntities('')
   * # ''
   * $ unescapeEntities('hello World!')
   * # 'hello World!'
   * $ unescapeEntities('1 &lt; 2 &amp;&amp; 4 &gt; 3&#10;')
   * # '1 < 2 && 4 > 3\n'
   * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
   * # '<&lt <- unfinished entity>'
   * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
   * # '/foo?bar=baz&copy=true'
   * }
   *
   * @param s a chunk of HTML CDATA.  It must not start or end inside an HTML
   *     entity.
   * @return {string} s with every ';'-terminated entity replaced by the
   *     result of lookupEntity.
   */
  function unescapeEntities(s) {
    return s.replace(entityRe, decodeOneEntity);
  }

  var ampRe = /&/g;
  // An '&' that does not start something shaped like an entity.  The following
  // character is only looked at, never consumed, so that in '&&' the second
  // ampersand is examined as well.
  var looseAmpRe = /&(?=[^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
  var ltRe = /</g;
  var gtRe = />/g;
  var quotRe = /\"/g;
  var eqRe = /\=/g;  // Backslash required on JScript.net

  /**
   * Escapes HTML special characters in attribute values as HTML entities.
   *
   * {@updoc
   * $ escapeAttrib('')
   * # ''
   * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
   * # '&#34;&lt;&lt;&amp;&#61;&#61;&amp;&gt;&gt;&#34;'
   * $ escapeAttrib('Hello <World>!')
   * # 'Hello &lt;World&gt;!'
   * }
   *
   * @param {string} s plain text.
   * @return {string} text that is safe inside a double-quoted attribute value.
   */
  function escapeAttrib(s) {
    // '&' must be replaced first so the entities added below are not
    // themselves re-escaped.
    // Escaping '=' defangs many UTF-7 and SGML short-tag attacks.
    return s.replace(ampRe, '&amp;').replace(ltRe, '&lt;')
        .replace(gtRe, '&gt;').replace(quotRe, '&#34;')
        .replace(eqRe, '&#61;');
  }

  /**
   * Escape entities in RCDATA that can be escaped without changing the meaning.
   * {@updoc
   * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
   * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
   * $ normalizeRCData('a && b')
   * # 'a &amp;&amp; b'
   * }
   *
   * @param {string} rcdata
   * @return {string} rcdata with '<', '>' and loose '&' escaped; text that
   *     already looks like an entity is left alone.
   */
  function normalizeRCData(rcdata) {
    return rcdata
        .replace(looseAmpRe, '&amp;')
        .replace(ltRe, '&lt;')
        .replace(gtRe, '&gt;');
  }


  // TODO(mikesamuel): validate sanitizer regexs against the HTML5 grammar at
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html

  /** token definitions. */
  var INSIDE_TAG_TOKEN = new RegExp(
      // Don't capture space.
      '^\\s*(?:'
      // Capture an attribute name in group 1, and value in group 3.
      // We capture the fact that there was a value in group 2, since
      // interpreters are inconsistent in whether a group that matches nothing
      // is null, undefined, or the empty string.
      + ('(?:'
         + '([a-z][a-z-]*)'                    // attribute name
         + ('('                                // optionally followed
            + '\\s*=\\s*'
            + ('('
               // A double quoted string.
               + '\"[^\"]*\"'
               // A single quoted string.
               + '|\'[^\']*\''
               // The positive lookahead is used to make sure that in
               // <foo bar= baz=boo>, the value for bar is blank, not "baz=boo".
               + '|(?=[a-z][a-z-]*\\s*=)'
               // An unquoted value that is not an attribute name.
               // We know it is not an attribute name because the previous
               // zero-width match would've eliminated that possibility.
               + '|[^>\"\'\\s]*'
               + ')'
               )
            + ')'
            ) + '?'
         + ')'
         )
      // End of tag captured in group 4.
      + '|(/?>)'
      // Don't capture cruft.  [\s\S] rather than '.', which cannot match a
      // line terminator: the pattern has to consume at least one character of
      // any non-empty input or the parse loop would dereference a null match.
      + '|[\\s\\S][^\\w\\s>]*)',
      'i');

  var OUTSIDE_TAG_TOKEN = new RegExp(
      '^(?:'
      // Entity captured in group 1.
      + '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);'
      // Comment, doctypes, and processing instructions not captured.
      + '|<\!--[\\s\\S]*?--\>|<!\\w[^>]*>|<\\?[^>*]*>'
      // '/' captured in group 2 for close tags, and name captured in group 3.
      + '|<(/)?([a-z][a-z0-9]*)'
      // Text captured in group 4.
      + '|([^<&>]+)'
      // Cruft captured in group 5.
      + '|([<&>]))',
      'i');

  /**
   * Given a SAX-like event handler, produce a function that feeds those
   * events and a parameter to the event handler.
   *
   * The event handler has the form:{@code
   * {
   *   // Name is a lower-case HTML tag name.  Attribs is an array of
   *   // alternating lower-case attribute names, and attribute values.  The
   *   // attribs array is reused by the parser.  Param is the value passed to
   *   // the saxParser.
   *   startTag: function (name, attribs, param) { ... },
   *   endTag:   function (name, param) { ... },
   *   pcdata:   function (text, param) { ... },
   *   rcdata:   function (text, param) { ... },
   *   cdata:    function (text, param) { ... },
   *   startDoc: function (param) { ... },
   *   endDoc:   function (param) { ... }
   * }}
   *
   * Every handler method is optional.  startTag and endTag are only invoked
   * for tag names that are own properties of html4.ELEMENTS.
   *
   * @param {Object} handler a record containing event handlers.
   * @return {Function} that takes a chunk of html and a parameter.
   *   The parameter is passed on to the handler methods.
   */
  function makeSaxParser(handler) {
    return function parse(htmlText, param) {
      htmlText = String(htmlText);
      // ASCII-lower-cased copy of the unparsed input, built lazily the first
      // time a CDATA/RCDATA element is seen.
      var htmlLower = null;

      var inTag = false;  // True iff we're currently processing a tag.
      var attribs = [];  // Accumulates attribute names and values.
      var tagName = void 0;  // The name of the tag currently being processed.
      var eflags = void 0;  // The element flags for the current tag.
      var openTag = void 0;  // True if the current tag is an open tag.

      if (handler.startDoc) { handler.startDoc(param); }

      while (htmlText) {
        // Both token patterns match at least one character of any non-empty
        // input, so m is never null and the loop always makes progress.
        var m = htmlText.match(inTag ? INSIDE_TAG_TOKEN : OUTSIDE_TAG_TOKEN);
        htmlText = htmlText.substring(m[0].length);

        if (inTag) {
          if (m[1]) { // attribute
            // setAttribute with uppercase names doesn't work on IE6.
            var attribName = lcase(m[1]);
            var decodedValue;
            if (m[2]) {
              var encodedValue = m[3];
              switch (encodedValue.charCodeAt(0)) {  // Strip quotes
                case 34: case 39:
                  encodedValue = encodedValue.substring(
                      1, encodedValue.length - 1);
                  break;
              }
              decodedValue = unescapeEntities(stripNULs(encodedValue));
            } else {
              // Use name as value for valueless attribs, so
              //   <input type=checkbox checked>
              // gets attributes ['type', 'checkbox', 'checked', 'checked']
              decodedValue = attribName;
            }
            attribs.push(attribName, decodedValue);
          } else if (m[4]) {
            if (eflags !== void 0) {  // False if not in whitelist.
              if (openTag) {
                if (handler.startTag) {
                  handler.startTag(tagName, attribs, param);
                }
              } else {
                if (handler.endTag) {
                  handler.endTag(tagName, param);
                }
              }
            }

            if (openTag
                && (eflags & (html4.eflags.CDATA | html4.eflags.RCDATA))) {
              // The body of a CDATA/RCDATA element runs up to the next
              // case-insensitive '</tagName'.  The offset found in htmlLower
              // is applied to htmlText, so the lower-casing must preserve
              // length; String.prototype.toLowerCase does not (U+0130 becomes
              // two code units), hence lcaseAscii.
              if (htmlLower === null) {
                htmlLower = lcaseAscii(htmlText);
              } else {
                // Keep only the suffix that lines up with the unparsed input.
                htmlLower = htmlLower.substring(
                    htmlLower.length - htmlText.length);
              }
              var dataEnd = htmlLower.indexOf('</' + tagName);
              if (dataEnd < 0) { dataEnd = htmlText.length; }
              if (eflags & html4.eflags.CDATA) {
                if (handler.cdata) {
                  handler.cdata(htmlText.substring(0, dataEnd), param);
                }
              } else if (handler.rcdata) {
                handler.rcdata(
                    normalizeRCData(htmlText.substring(0, dataEnd)), param);
              }
              htmlText = htmlText.substring(dataEnd);
            }

            tagName = eflags = openTag = void 0;
            attribs.length = 0;
            inTag = false;
          }
          // Anything else inside a tag is cruft and is dropped.
        } else {
          if (m[1]) {  // Entity
            if (handler.pcdata) { handler.pcdata(m[0], param); }
          } else if (m[3]) {  // Tag
            openTag = !m[2];
            inTag = true;
            tagName = lcase(m[3]);
            eflags = html4.ELEMENTS.hasOwnProperty(tagName)
                ? html4.ELEMENTS[tagName] : void 0;
          } else if (m[4]) {  // Text
            if (handler.pcdata) { handler.pcdata(m[4], param); }
          } else if (m[5]) {  // Cruft
            // A stray markup character is reported as the equivalent entity
            // so that handlers only ever see well-formed PCDATA.
            if (handler.pcdata) {
              switch (m[5]) {
                case '<': handler.pcdata('&lt;', param); break;
                case '>': handler.pcdata('&gt;', param); break;
                default: handler.pcdata('&amp;', param); break;
              }
            }
          }
        }
      }

      if (handler.endDoc) { handler.endDoc(param); }
    };
  }

  return {
    normalizeRCData: normalizeRCData,
    escapeAttrib: escapeAttrib,
    unescapeEntities: unescapeEntities,
    makeSaxParser: makeSaxParser
  };
})();
376 | |
377 /** | |
378 * Returns a function that strips unsafe tags and attributes from html. | |
379 * @param {Function} sanitizeAttributes | |
380 * maps from (tagName, attribs[]) to null or a sanitized attribute array. | |
381 * The attribs array can be arbitrarily modified, but the same array | |
382 * instance is reused, so should not be held. | |
383 * @return {Function} from html to sanitized html | |
384 */ | |
html.makeHtmlSanitizer = function (sanitizeAttributes) {
  // The returned function is the SAX parser built below: it is called as
  // (htmlText, out), where out is an array that receives the sanitized output
  // as string chunks via out.push.
  //
  // Both variables are shared by every call of the returned function and are
  // reset in startDoc.
  // stack: names of the elements that have been emitted and still need a
  // close tag, innermost last.
  var stack = [];
  // ignoring: true while inside an unsafe, non-EMPTY element whose content is
  // being dropped.
  var ignoring = false;
  return html.makeSaxParser({
      startDoc: function (_) {
        stack = [];
        ignoring = false;
      },
      startTag: function (tagName, attribs, out) {
        if (ignoring) { return; }
        if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
        var eflags = html4.ELEMENTS[tagName];
        if (eflags & html4.eflags.FOLDABLE) {
          // The tag itself is dropped; its content is still processed.
          return;
        }
        if (eflags & html4.eflags.UNSAFE) {
          // An EMPTY element has no content to skip.
          ignoring = !(eflags & html4.eflags.EMPTY);
          return;
        }
        attribs = sanitizeAttributes(tagName, attribs);
        // TODO(mikesamuel): relying on sanitizeAttributes not to
        // insert unsafe attribute names.
        if (!attribs) { return; }  // The policy rejected the whole element.

        if (!(eflags & html4.eflags.EMPTY)) {
          stack.push(tagName);
        }

        out.push('<', tagName);
        for (var i = 0, n = attribs.length; i < n; i += 2) {
          var attribName = attribs[i],
              value = attribs[i + 1];
          // null / undefined mean "drop this attribute".
          if (value !== null && value !== void 0) {
            // Coerce so that a policy returning a non-string (number,
            // boolean, ...) cannot make escapeAttrib throw part-way through
            // the document.
            out.push(' ', attribName, '="',
                     html.escapeAttrib(String(value)), '"');
          }
        }
        out.push('>');
      },
      endTag: function (tagName, out) {
        if (ignoring) {
          // Any end tag seen while ignoring ends the ignored region.
          ignoring = false;
          return;
        }
        if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
        var eflags = html4.ELEMENTS[tagName];
        if (eflags & (html4.eflags.UNSAFE | html4.eflags.EMPTY
                      | html4.eflags.FOLDABLE)) {
          return;  // No start tag was ever emitted for these.
        }

        var optionalEnd = html4.eflags.OPTIONAL_ENDTAG;
        var index, stackEl;
        // Find the innermost open element with this name.
        for (index = stack.length; --index >= 0;) {
          stackEl = stack[index];
          if (stackEl === tagName) { break; }
          if ((eflags & optionalEnd)
              && !(html4.ELEMENTS[stackEl] & optionalEnd)) {
            // Don't pop non optional end tags looking for a match.
            return;
          }
        }
        if (index < 0) { return; }  // Not opened.

        // Close everything opened after the match, innermost first; elements
        // whose end tag is optional are closed implicitly.
        for (var i = stack.length; --i > index;) {
          stackEl = stack[i];
          if (!(html4.ELEMENTS[stackEl] & optionalEnd)) {
            out.push('</', stackEl, '>');
          }
        }
        stack.length = index;
        out.push('</', tagName, '>');
      },
      pcdata: function (text, out) {
        if (!ignoring) { out.push(text); }
      },
      rcdata: function (text, out) {
        if (!ignoring) { out.push(text); }
      },
      cdata: function (text, out) {
        if (!ignoring) { out.push(text); }
      },
      endDoc: function (out) {
        // Close whatever is still open so the output is balanced.
        for (var i = stack.length; --i >= 0;) {
          out.push('</', stack[i], '>');
        }
        stack.length = 0;
      }
    });
};
474 | |
475 | |
476 /** | |
477 * Strips unsafe tags and attributes from html. | |
478 * @param {string} htmlText to sanitize | |
479 * @param {Function} opt_urlPolicy -- a transform to apply to url attribute | |
480 * values. | |
481 * @param {Function} opt_nmTokenPolicy : string -> string? -- a transform to | |
482 * apply to names, ids, and classes. | |
483 * @return {string} html | |
484 */ | |
function html_sanitize(htmlText, opt_urlPolicy, opt_nmTokenPolicy) {
  // Output chunks pushed by the sanitizer; joined into the result below.
  var out = [];

  /**
   * Attribute policy handed to html.makeHtmlSanitizer.  Rewrites the values
   * of attribs (alternating names and values) in place and returns the same
   * array; a null value marks the attribute as rejected.
   */
  function sanitizeAttribs(tagName, attribs) {
    for (var i = 0; i < attribs.length; i += 2) {
      var attribName = attribs[i];
      var value = attribs[i + 1];

      // An element-specific schema entry wins over the '*' wildcard entry.
      var attribKey = tagName + ':' + attribName;
      if (!html4.ATTRIBS.hasOwnProperty(attribKey)) {
        attribKey = '*:' + attribName;
      }
      var atype = html4.ATTRIBS.hasOwnProperty(attribKey)
          ? html4.ATTRIBS[attribKey] : null;

      if (atype !== null) {
        switch (atype) {
          case html4.atype.SCRIPT:
          case html4.atype.STYLE:
            // Event handlers and inline styles are always rejected.
            value = null;
            break;
          case html4.atype.IDREF:
          case html4.atype.IDREFS:
          case html4.atype.GLOBAL_NAME:
          case html4.atype.LOCAL_NAME:
          case html4.atype.CLASSES:
            // Names pass through unchanged when no policy is supplied.
            value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
            break;
          case html4.atype.URI:
            // URLs are rejected unless a policy is supplied to vet them.
            value = opt_urlPolicy ? opt_urlPolicy(value) : null;
            break;
          // Every other attribute type keeps its value as is.
        }
      } else {
        // Attributes missing from the schema are rejected.
        value = null;
      }
      attribs[i + 1] = value;
    }
    return attribs;
  }

  html.makeHtmlSanitizer(sanitizeAttribs)(htmlText, out);
  return out.join('');
}