// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
14
|
|
/**
 * @fileoverview
 * An HTML sanitizer that can satisfy a variety of security policies.
 *
 * <p>
 * The HTML sanitizer is built around a SAX parser and HTML element and
 * attributes schemas.
 *
 * @author mikesamuel@gmail.com
 * @requires html4
 * @provides html, html_sanitize
 */
|
|
27
|
|
/**
 * Namespace for low-level HTML utilities: entity decoding, attribute and
 * RCDATA escaping, and a SAX-style tokenizer.  The element schema is read from
 * the external {@code html4} object.
 *
 * @namespace
 */
var html = (function () {
  /**
   * Lower-cases a string.  Bound below to either the native
   * {@code toLowerCase} or to an ASCII-only fallback.
   * @param {string} s
   * @return {string}
   */
  var lcase;
  // The below may not be true on browsers in the Turkish locale.
  if ('script' === 'SCRIPT'.toLowerCase()) {
    lcase = function (s) { return s.toLowerCase(); };
  } else {
    /**
     * {@updoc
     * $ lcase('SCRIPT')
     * # 'script'
     * $ lcase('script')
     * # 'script'
     * }
     */
    lcase = function (s) {
      return s.replace(
          /[A-Z]/g,
          function (ch) {
            // Setting bit 5 maps 'A'-'Z' onto 'a'-'z'.
            return String.fromCharCode(ch.charCodeAt(0) | 32);
          });
    };
  }

  /** Named entities understood by lookupEntity, keyed by lower-case name. */
  var ENTITIES = {
    lt   : '<',
    gt   : '>',
    amp  : '&',
    // Written as a unicode escape: the legacy octal form '\240' is a syntax
    // error in strict mode and in ES modules.
    nbsp : '\u00A0',
    quot : '"',
    apos : '\''
  };

  var decimalEscapeRe = /^#(\d+)$/;
  var hexEscapeRe = /^#x([0-9A-Fa-f]+)$/;

  /**
   * Converts a numeric code point to a string, producing a UTF-16 surrogate
   * pair for supplementary-plane code points instead of letting
   * String.fromCharCode silently truncate them to 16 bits.
   *
   * @param {number} codePoint a non-negative number.
   * @return {string} the character, or '' when the value is above U+10FFFF.
   */
  function codePointToString(codePoint) {
    if (codePoint <= 0xFFFF) { return String.fromCharCode(codePoint); }
    if (codePoint > 0x10FFFF) { return ''; }
    var offset = codePoint - 0x10000;
    return String.fromCharCode(
        0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
  }

  /**
   * Decodes an HTML entity.
   *
   * {@updoc
   * $ lookupEntity('lt')
   * # '<'
   * $ lookupEntity('GT')
   * # '>'
   * $ lookupEntity('amp')
   * # '&'
   * $ lookupEntity('nbsp')
   * # '\xA0'
   * $ lookupEntity('apos')
   * # "'"
   * $ lookupEntity('quot')
   * # '"'
   * $ lookupEntity('#xa')
   * # '\n'
   * $ lookupEntity('#10')
   * # '\n'
   * $ lookupEntity('#x0a')
   * # '\n'
   * $ lookupEntity('#010')
   * # '\n'
   * $ lookupEntity('#x00A')
   * # '\n'
   * $ lookupEntity('Pi')      // Known failure
   * # '\u03A0'
   * $ lookupEntity('pi')      // Known failure
   * # '\u03C0'
   * }
   *
   * @param name the content between the '&' and the ';'.
   * @return a single unicode code-point as a string, or the empty string when
   *     the name is not a known entity or a valid numeric reference.
   */
  function lookupEntity(name) {
    name = lcase(name);  // TODO: &pi; is different from &Pi;
    if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
    var m = name.match(decimalEscapeRe);
    if (m) {
      return codePointToString(parseInt(m[1], 10));
    } else if (!!(m = name.match(hexEscapeRe))) {
      return codePointToString(parseInt(m[1], 16));
    }
    return '';
  }

  /** Adapter with the (match, group1) signature String.replace expects. */
  function decodeOneEntity(_, name) {
    return lookupEntity(name);
  }

  var nulRe = /\0/g;
  /** Removes every NUL character from s. */
  function stripNULs(s) {
    return s.replace(nulRe, '');
  }

  var entityRe = /&(#\d+|#x[0-9A-Fa-f]+|\w+);/g;
  /**
   * Returns the plain text of a chunk of HTML CDATA, decoding every complete
   * entity it contains.  Text that merely looks like the start of an entity
   * (no terminating ';') is left untouched.
   *
   * {@updoc
   * $ unescapeEntities('')
   * # ''
   * $ unescapeEntities('hello World!')
   * # 'hello World!'
   * $ unescapeEntities('1 &lt; 2 &amp;&amp; 4 &gt; 3&#10;')
   * # '1 < 2 && 4 > 3\n'
   * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
   * # '<&lt <- unfinished entity>'
   * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
   * # '/foo?bar=baz&copy=true'
   * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
   * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
   * }
   *
   * @param s a chunk of HTML CDATA.  It must not start or end inside an HTML
   *   entity.
   * @return {string} the decoded text.
   */
  function unescapeEntities(s) {
    return s.replace(entityRe, decodeOneEntity);
  }

  var ampRe = /&/g;
  // Matches an '&' that cannot be the start of an entity.  The context is
  // checked with a lookahead so that it is not consumed; otherwise the second
  // ampersand of '&&' would never be examined.
  var looseAmpRe = /&(?=[^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
  var ltRe = /</g;
  var gtRe = />/g;
  var quotRe = /\"/g;
  var eqRe = /\=/g;  // Backslash required on JScript.net

  /**
   * Escapes HTML special characters in attribute values as HTML entities.
   *
   * {@updoc
   * $ escapeAttrib('')
   * # ''
   * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
   * # '&#34;&lt;&lt;&amp;&#61;&#61;&amp;&gt;&gt;&#34;'
   * $ escapeAttrib('Hello <World>!')
   * # 'Hello &lt;World&gt;!'
   * }
   *
   * @param {string} s the raw attribute value.
   * @return {string} the value, safe to place between double quotes.
   */
  function escapeAttrib(s) {
    // '&' must be replaced first so the entities added afterwards are not
    // escaped a second time.
    // Escaping '=' defangs many UTF-7 and SGML short-tag attacks.
    return s.replace(ampRe, '&amp;').replace(ltRe, '&lt;').replace(gtRe, '&gt;')
        .replace(quotRe, '&#34;').replace(eqRe, '&#61;');
  }

  /**
   * Escape entities in RCDATA that can be escaped without changing the meaning.
   * Ampersands that already start an entity are left alone.
   * {@updoc
   * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
   * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
   * }
   *
   * @param {string} rcdata
   * @return {string}
   */
  function normalizeRCData(rcdata) {
    return rcdata
        .replace(looseAmpRe, '&amp;')
        .replace(ltRe, '&lt;')
        .replace(gtRe, '&gt;');
  }


  // TODO(mikesamuel): validate sanitizer regexs against the HTML5 grammar at
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html

  /** token definitions. */
  var INSIDE_TAG_TOKEN = new RegExp(
      // Don't capture space.
      '^\\s*(?:'
      // Capture an attribute name in group 1, and value in group 3.
      // We capture the fact that there was an attribute in group 2, since
      // interpreters are inconsistent in whether a group that matches nothing
      // is null, undefined, or the empty string.
      + ('(?:'
         + '([a-z][a-z-]*)'                    // attribute name
         + ('('                                // optionally followed
            + '\\s*=\\s*'
            + ('('
               // A double quoted string.
               + '"[^"]*"'
               // A single quoted string.
               + '|\'[^\']*\''
               // The positive lookahead is used to make sure that in
               // <foo bar= baz=boo>, the value for bar is blank, not "baz=boo".
               + '|(?=[a-z][a-z-]*\\s*=)'
               // An unquoted value that is not an attribute name.
               // We know it is not an attribute name because the previous
               // zero-width match would've eliminated that possibility.
               + '|[^>"\'\\s]*'
               + ')'
               )
            + ')'
            ) + '?'
         + ')'
         )
      // End of tag captured in group 4.
      + '|(/?>)'
      // Don't capture cruft.  [\s\S] rather than '.' so that trailing line
      // terminators are consumed too; with '.' the whole pattern fails on
      // input such as "<a\n" and the parser dereferences a null match.
      + '|[\\s\\S][^\\w\\s>]*)',
      'i');

  var OUTSIDE_TAG_TOKEN = new RegExp(
      '^(?:'
      // Entity captured in group 1.
      + '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);'
      // Comment, doctypes, and processing instructions not captured.
      + '|<!--[\\s\\S]*?-->|<!\\w[^>]*>|<\\?[^>*]*>'
      // '/' captured in group 2 for close tags, and name captured in group 3.
      + '|<(/)?([a-z][a-z0-9]*)'
      // Text captured in group 4.
      + '|([^<&>]+)'
      // Cruft captured in group 5.
      + '|([<&>]))',
      'i');

  /**
   * Given a SAX-like event handler, produce a function that feeds those
   * events and a parameter to the event handler.
   *
   * The event handler has the form:{@code
   * {
   *   // Name is a lower-case HTML tag name.  Attribs is an array of
   *   // alternating lower-case attribute names, and attribute values.  The
   *   // attribs array is reused by the parser.  Param is the value passed to
   *   // the saxParser.
   *   startTag: function (name, attribs, param) { ... },
   *   endTag:   function (name, param) { ... },
   *   pcdata:   function (text, param) { ... },
   *   rcdata:   function (text, param) { ... },
   *   cdata:    function (text, param) { ... },
   *   startDoc: function (param) { ... },
   *   endDoc:   function (param) { ... }
   * }}
   *
   * Every handler method is optional.  startTag and endTag are only invoked
   * for tag names present in {@code html4.ELEMENTS}.
   *
   * @param {Object} handler a record containing event handlers.
   * @return {Function} that takes a chunk of html and a parameter.
   *   The parameter is passed on to the handler methods.
   */
  function makeSaxParser(handler) {
    return function parse(htmlText, param) {
      htmlText = String(htmlText);
      // Lazily computed lower-case copy of the remaining input, used to find
      // the end of CDATA/RCDATA sections case-insensitively.
      var htmlLower = null;

      var inTag = false;  // True iff we're currently processing a tag.
      var attribs = [];  // Accumulates attribute names and values.
      var tagName = void 0;  // The name of the tag currently being processed.
      var eflags = void 0;  // The element flags for the current tag.
      var openTag = void 0;  // True if the current tag is an open tag.

      if (handler.startDoc) { handler.startDoc(param); }

      while (htmlText) {
        // Both token patterns match at least one character of any non-empty
        // input, so every iteration makes progress.
        var m = htmlText.match(inTag ? INSIDE_TAG_TOKEN : OUTSIDE_TAG_TOKEN);
        htmlText = htmlText.substring(m[0].length);

        if (inTag) {
          if (m[1]) { // attribute
            // setAttribute with uppercase names doesn't work on IE6.
            var attribName = lcase(m[1]);
            var decodedValue;
            if (m[2]) {
              var encodedValue = m[3];
              switch (encodedValue.charCodeAt(0)) {  // Strip quotes
                case 34: case 39:
                  encodedValue = encodedValue.substring(
                      1, encodedValue.length - 1);
                  break;
              }
              decodedValue = unescapeEntities(stripNULs(encodedValue));
            } else {
              // Use name as value for valueless attribs, so
              //   <input type=checkbox checked>
              // gets attributes ['type', 'checkbox', 'checked', 'checked']
              decodedValue = attribName;
            }
            attribs.push(attribName, decodedValue);
          } else if (m[4]) {  // End of tag.
            if (eflags !== void 0) {  // False if not in whitelist.
              if (openTag) {
                if (handler.startTag) {
                  handler.startTag(tagName, attribs, param);
                }
              } else {
                if (handler.endTag) {
                  handler.endTag(tagName, param);
                }
              }
            }

            if (openTag
                && (eflags & (html4.eflags.CDATA | html4.eflags.RCDATA))) {
              // The element's content is not markup: hand everything up to
              // the matching close tag to the handler in one piece.
              if (htmlLower === null) {
                htmlLower = lcase(htmlText);
              } else {
                // Re-align the cached lower-case copy with what is left.
                htmlLower = htmlLower.substring(
                    htmlLower.length - htmlText.length);
              }
              var dataEnd = htmlLower.indexOf('</' + tagName);
              if (dataEnd < 0) { dataEnd = htmlText.length; }
              if (eflags & html4.eflags.CDATA) {
                if (handler.cdata) {
                  handler.cdata(htmlText.substring(0, dataEnd), param);
                }
              } else if (handler.rcdata) {
                handler.rcdata(
                    normalizeRCData(htmlText.substring(0, dataEnd)), param);
              }
              htmlText = htmlText.substring(dataEnd);
            }

            tagName = eflags = openTag = void 0;
            attribs.length = 0;
            inTag = false;
          }
          // Anything else inside a tag is cruft and is dropped.
        } else {
          if (m[1]) {  // Entity
            if (handler.pcdata) { handler.pcdata(m[0], param); }
          } else if (m[3]) {  // Tag
            openTag = !m[2];
            inTag = true;
            tagName = lcase(m[3]);
            eflags = html4.ELEMENTS.hasOwnProperty(tagName)
                ? html4.ELEMENTS[tagName] : void 0;
          } else if (m[4]) {  // Text
            if (handler.pcdata) { handler.pcdata(m[4], param); }
          } else if (m[5]) {  // Cruft
            // A stray markup character is reported as its entity so that the
            // text handed to pcdata never contains a raw '<', '>' or '&'.
            if (handler.pcdata) {
              switch (m[5]) {
                case '<': handler.pcdata('&lt;', param); break;
                case '>': handler.pcdata('&gt;', param); break;
                default: handler.pcdata('&amp;', param); break;
              }
            }
          }
          // Comments, doctypes and processing instructions fall through here
          // and are discarded.
        }
      }

      if (handler.endDoc) { handler.endDoc(param); }
    };
  }

  return {
    normalizeRCData: normalizeRCData,
    escapeAttrib: escapeAttrib,
    unescapeEntities: unescapeEntities,
    makeSaxParser: makeSaxParser
  };
})();
|
|
376
|
|
/**
 * Returns a function that strips unsafe tags and attributes from html.
 *
 * The returned function is the SAX parser produced by
 * {@code html.makeSaxParser}; call it as {@code sanitizer(htmlText, out)}
 * where {@code out} is an array that receives the sanitized output chunks.
 * The open-element stack and the "ignoring" flag live in this closure and are
 * reset at the start of every document.
 *
 * @param {Function} sanitizeAttributes
 *     maps from (tagName, attribs[]) to null or a sanitized attribute array.
 *     The attribs array can be arbitrarily modified, but the same array
 *     instance is reused, so should not be held.
 * @return {Function} from html to sanitized html
 */
html.makeHtmlSanitizer = function (sanitizeAttributes) {
  // Names of the elements that were emitted and still await a close tag.
  var stack = [];
  // True while inside a non-empty UNSAFE element whose content is discarded.
  var ignoring = false;
  return html.makeSaxParser({
    startDoc: function (_) {
      stack = [];
      ignoring = false;
    },
    startTag: function (tagName, attribs, out) {
      if (ignoring) { return; }
      if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
      var eflags = html4.ELEMENTS[tagName];
      if (eflags & html4.eflags.FOLDABLE) {
        // The tag itself is dropped; its content is still processed.
        return;
      } else if (eflags & html4.eflags.UNSAFE) {
        // Drop the tag and, unless the element is empty, what follows it.
        ignoring = !(eflags & html4.eflags.EMPTY);
        return;
      }
      attribs = sanitizeAttributes(tagName, attribs);
      // TODO(mikesamuel): relying on sanitizeAttributes not to
      // insert unsafe attribute names.
      if (attribs) {
        if (!(eflags & html4.eflags.EMPTY)) {
          stack.push(tagName);
        }

        out.push('<', tagName);
        for (var i = 0, n = attribs.length; i < n; i += 2) {
          var attribName = attribs[i],
              value = attribs[i + 1];
          // A null or undefined value means the attribute was rejected.
          if (value !== null && value !== void 0) {
            out.push(' ', attribName, '="', html.escapeAttrib(value), '"');
          }
        }
        out.push('>');
      }
    },
    endTag: function (tagName, out) {
      if (ignoring) {
        // NOTE: any end tag, not only the one matching the unsafe element,
        // ends the ignored region.
        ignoring = false;
        return;
      }
      if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
      var eflags = html4.ELEMENTS[tagName];
      if (!(eflags & (html4.eflags.UNSAFE | html4.eflags.EMPTY
                      | html4.eflags.FOLDABLE))) {
        var index;
        var stackEl;
        // Find the innermost open element with this name.
        if (eflags & html4.eflags.OPTIONAL_ENDTAG) {
          for (index = stack.length; --index >= 0;) {
            stackEl = stack[index];
            if (stackEl === tagName) { break; }
            if (!(html4.ELEMENTS[stackEl] & html4.eflags.OPTIONAL_ENDTAG)) {
              // Don't pop non optional end tags looking for a match.
              return;
            }
          }
        } else {
          for (index = stack.length; --index >= 0;) {
            if (stack[index] === tagName) { break; }
          }
        }
        if (index < 0) { return; }  // Not opened.
        // Close everything opened inside the element, innermost first.
        // Elements whose end tag is optional are closed implicitly.
        for (var i = stack.length; --i > index;) {
          stackEl = stack[i];
          if (!(html4.ELEMENTS[stackEl] & html4.eflags.OPTIONAL_ENDTAG)) {
            out.push('</', stackEl, '>');
          }
        }
        stack.length = index;
        out.push('</', tagName, '>');
      }
    },
    pcdata: function (text, out) {
      if (!ignoring) { out.push(text); }
    },
    rcdata: function (text, out) {
      if (!ignoring) { out.push(text); }
    },
    cdata: function (text, out) {
      if (!ignoring) { out.push(text); }
    },
    endDoc: function (out) {
      // Balance the output by closing whatever is still open.
      for (var i = stack.length; --i >= 0;) {
        out.push('</', stack[i], '>');
      }
      stack.length = 0;
    }
  });
};
|
|
474
|
|
475
|
|
/**
 * Strips unsafe tags and attributes from html.
 *
 * Attributes are looked up in {@code html4.ATTRIBS}, first under
 * "tagName:attribName" and then under "*:attribName".  Unknown attributes and
 * script/style attributes are removed.  URI attributes are removed unless a
 * URL policy is supplied.  A policy can reject a value by returning null.
 *
 * @param {string} htmlText to sanitize
 * @param {Function} opt_urlPolicy -- a transform to apply to url attribute
 *     values.
 * @param {Function} opt_nmTokenPolicy : string -> string? -- a transform to
 *     apply to names, ids, and classes.
 * @return {string} html
 */
function html_sanitize(htmlText, opt_urlPolicy, opt_nmTokenPolicy) {
  var out = [];
  html.makeHtmlSanitizer(
      function sanitizeAttribs(tagName, attribs) {
        // attribs alternates names and values; values are rewritten in place
        // and a null value marks the attribute for removal.
        for (var i = 0; i < attribs.length; i += 2) {
          var attribName = attribs[i];
          var value = attribs[i + 1];
          var atype = null;
          // An element-specific entry takes precedence over the wildcard one.
          var attribKey = tagName + ':' + attribName;
          if (!html4.ATTRIBS.hasOwnProperty(attribKey)) {
            attribKey = '*:' + attribName;
          }
          if (html4.ATTRIBS.hasOwnProperty(attribKey)) {
            atype = html4.ATTRIBS[attribKey];
          }
          if (atype !== null) {
            switch (atype) {
              case html4.atype.SCRIPT:
              case html4.atype.STYLE:
                value = null;
                break;
              case html4.atype.IDREF:
              case html4.atype.IDREFS:
              case html4.atype.GLOBAL_NAME:
              case html4.atype.LOCAL_NAME:
              case html4.atype.CLASSES:
                value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
                break;
              case html4.atype.URI:
                // Without a policy there is no way to vet the URL: drop it.
                value = opt_urlPolicy ? opt_urlPolicy(value) : null;
                break;
              // Every other attribute type keeps its value unchanged.
            }
          } else {
            // Not in the schema.
            value = null;
          }
          attribs[i + 1] = value;
        }
        return attribs;
      })(htmlText, out);
  return out.join('');
}
|