annotate js/ext/html-sanitizer.js @ 0:633c9cb05555

Origination.
author Atul Varma <varmaa@toolness.com>
date Sun, 07 Jun 2009 19:29:10 -0700
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
1 // Copyright (C) 2006 Google Inc.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
2 //
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
3 // Licensed under the Apache License, Version 2.0 (the "License");
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
4 // you may not use this file except in compliance with the License.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
5 // You may obtain a copy of the License at
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
6 //
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
7 // http://www.apache.org/licenses/LICENSE-2.0
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
8 //
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
9 // Unless required by applicable law or agreed to in writing, software
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
10 // distributed under the License is distributed on an "AS IS" BASIS,
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
12 // See the License for the specific language governing permissions and
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
13 // limitations under the License.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
14
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
15 /**
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
16 * @fileoverview
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
17 * An HTML sanitizer that can satisfy a variety of security policies.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
18 *
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
19 * <p>
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
20 * The HTML sanitizer is built around a SAX parser and HTML element and
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
21 * attributes schemas.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
22 *
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
23 * @author mikesamuel@gmail.com
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
24 * @requires html4
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
25 * @provides html, html_sanitize
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
26 */
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
27
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
28 /**
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
29 * @namespace
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
30 */
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
31 var html = (function () {
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
/**
 * lcase(s) lower-cases a string.  Which implementation is used is decided
 * once, when this code first runs.
 */
var lcase;
// The comparison below may not be true on browsers in the Turkish locale, so
// probe the built-in once and fall back to an ASCII-only implementation when
// the probe fails.
if ('script' === 'SCRIPT'.toLowerCase()) {
  lcase = function (s) { return s.toLowerCase(); };
} else {
  /**
   * Lower-cases only the ASCII letters A-Z of s; every other character is
   * left untouched.
   *
   * {@updoc
   * $ lcase('SCRIPT')
   * # 'script'
   * $ lcase('script')
   * # 'script'
   * }
   */
  lcase = function (s) {
    return s.replace(
        /[A-Z]/g,
        function (ch) {
          // Setting bit 5 (value 32) turns an ASCII upper-case letter into
          // its lower-case counterpart.
          return String.fromCharCode(ch.charCodeAt(0) | 32);
        });
  };
}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
53
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
/**
 * Maps lower-case entity names (the text between '&' and ';') to the
 * character each one stands for.
 */
var ENTITIES = {
  lt   : '<',
  gt   : '>',
  amp  : '&',
  // U+00A0 NO-BREAK SPACE.  Written as a hex escape because legacy octal
  // escapes are a syntax error in strict-mode code and in ES modules.
  nbsp : '\xA0',
  quot : '"',
  apos : '\''
};
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
62
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
var decimalEscapeRe = /^#(\d+)$/;
var hexEscapeRe = /^#x([0-9A-Fa-f]+)$/;
/**
 * Decodes an HTML entity.
 *
 * Named entities are looked up case-insensitively in ENTITIES.  Numeric
 * references are decoded as full Unicode code points: values above U+FFFF
 * are returned as a UTF-16 surrogate pair rather than being truncated to 16
 * bits, and values above U+10FFFF decode to U+FFFD.
 *
 * {@updoc
 * $ lookupEntity('lt')
 * # '<'
 * $ lookupEntity('GT')
 * # '>'
 * $ lookupEntity('amp')
 * # '&'
 * $ lookupEntity('nbsp')
 * # '\xA0'
 * $ lookupEntity('apos')
 * # "'"
 * $ lookupEntity('quot')
 * # '"'
 * $ lookupEntity('#xa')
 * # '\n'
 * $ lookupEntity('#10')
 * # '\n'
 * $ lookupEntity('#x0a')
 * # '\n'
 * $ lookupEntity('#010')
 * # '\n'
 * $ lookupEntity('#x00A')
 * # '\n'
 * $ lookupEntity('#x1F600')
 * # '\uD83D\uDE00'
 * $ lookupEntity('#x110000')
 * # '\uFFFD'
 * $ lookupEntity('Pi')      // Known failure
 * # '\u03A0'
 * $ lookupEntity('pi')      // Known failure
 * # '\u03C0'
 * }
 *
 * @param name the content between the '&' and the ';'.
 * @return a single unicode code-point as a string, or the empty string when
 *     the name is neither a known entity nor a numeric reference.
 */
function lookupEntity(name) {
  name = lcase(name);  // TODO: &pi; is different from &Pi;
  if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }

  var codePoint;
  var m = name.match(decimalEscapeRe);
  if (m) {
    codePoint = parseInt(m[1], 10);
  } else if (!!(m = name.match(hexEscapeRe))) {
    codePoint = parseInt(m[1], 16);
  } else {
    return '';
  }

  // String.fromCharCode keeps only the low 16 bits of its argument, so
  // anything outside the BMP has to be handled explicitly.  Otherwise
  // '&#65596;' would silently decode to '<'.
  if (codePoint > 0x10FFFF) { return '\uFFFD'; }
  if (codePoint > 0xFFFF) {
    codePoint -= 0x10000;
    return String.fromCharCode(
        0xD800 | (codePoint >> 10),
        0xDC00 | (codePoint & 0x3FF));
  }
  return String.fromCharCode(codePoint);
}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
111
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
/**
 * Adapter with the argument order of a String.replace callback: it discards
 * the full match and decodes the captured entity name.
 *
 * @param _ the whole matched text; ignored.
 * @param name the entity name captured between '&' and ';'.
 * @return whatever lookupEntity returns for name.
 */
function decodeOneEntity(_, name) {
  return lookupEntity(name);
}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
115
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
var nulRe = /\0/g;
/**
 * Removes every NUL (U+0000) character from a string.
 *
 * @param s the string to clean.
 * @return s with all NUL characters deleted.
 */
function stripNULs(s) {
  return s.replace(nulRe, '');
}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
120
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
var entityRe = /&(#\d+|#x[0-9A-Fa-f]+|\w+);/g;
/**
 * Returns the plain text of a chunk of HTML CDATA which possibly contains
 * entities.  Only complete entities (terminated by ';') are decoded; any
 * other '&' is left as is.
 *
 * {@updoc
 * $ unescapeEntities('')
 * # ''
 * $ unescapeEntities('hello World!')
 * # 'hello World!'
 * $ unescapeEntities('1 &lt; 2 &amp;&AMP; 4 &gt; 3&#10;')
 * # '1 < 2 && 4 > 3\n'
 * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
 * # '<&lt <- unfinished entity>'
 * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
 * # '/foo?bar=baz&copy=true'
 * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
 * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
 * }
 *
 * @param s a chunk of HTML CDATA.  It must not start or end inside an HTML
 *     entity.
 * @return s with each complete entity replaced by its decoded text.
 */
function unescapeEntities(s) {
  return s.replace(entityRe, decodeOneEntity);
}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
146
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
// Patterns shared by the escaping helpers.
var ampRe = /&/g;
// An '&' that cannot be the start of an entity: it is followed by something
// other than a letter or '#', by a '#' that does not begin a decimal or hex
// number, or by the end of input.  Group 1 holds the text after the '&'.
var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
var ltRe = /</g;
var gtRe = />/g;
var quotRe = /\"/g;
var eqRe = /\=/g;  // Backslash required on JScript.net
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
153
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
/**
 * Escapes HTML special characters in attribute values as HTML entities.
 * The characters '&', '<', '>', '"' and '=' are replaced; single quotes are
 * not.
 *
 * {@updoc
 * $ escapeAttrib('')
 * # ''
 * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
 * # '&quot;&lt;&lt;&amp;&#61;&#61;&amp;&gt;&gt;&quot;'
 * $ escapeAttrib('Hello <World>!')
 * # 'Hello &lt;World&gt;!'
 * }
 *
 * @param s the raw attribute value.
 * @return the escaped attribute value.
 */
function escapeAttrib(s) {
  // '&' has to go first so that the ampersands introduced by the later
  // replacements are not escaped a second time.
  // Escaping '=' defangs many UTF-7 and SGML short-tag attacks.
  return s.replace(ampRe, '&amp;').replace(ltRe, '&lt;').replace(gtRe, '&gt;')
      .replace(quotRe, '&quot;').replace(eqRe, '&#61;');
}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
171
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
// Matches an '&' that cannot begin an entity.  The following text is checked
// with a lookahead so that it is not consumed; this lets the second '&' of
// input such as 'a && b' be matched too, instead of being swallowed as the
// context of the first one.
var unconsumedLooseAmpRe =
    /&(?=[^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;

/**
 * Escape entities in RCDATA that can be escaped without changing the meaning.
 * Every '&' that cannot start an entity becomes '&amp;', and every '<' and
 * '>' is escaped.  Text that could be an entity is left untouched.
 *
 * {@updoc
 * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
 * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
 * $ normalizeRCData('a && b')
 * # 'a &amp;&amp; b'
 * }
 *
 * @param rcdata a chunk of RCDATA text.
 * @return the normalized text.
 */
function normalizeRCData(rcdata) {
  return rcdata
      .replace(unconsumedLooseAmpRe, '&amp;')
      .replace(ltRe, '&lt;')
      .replace(gtRe, '&gt;');
}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
185
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
186
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
187 // TODO(mikesamuel): validate sanitizer regexs against the HTML5 grammar at
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
188 // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
189 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
190 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
191 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
192
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
/**
 * Token definition used while the parser is inside a tag.  It matches, at
 * the start of the input and after optional whitespace, one of: an attribute
 * (name with optional value), the end of the tag, or a run of cruft.
 *
 * Capture groups:
 *   1 - the attribute name;
 *   2 - the '=' and value, present only when the attribute has a value;
 *   3 - the raw attribute value, including any surrounding quotes;
 *   4 - the end of the tag, '>' or '/>'.
 */
var INSIDE_TAG_TOKEN = new RegExp(
    // Don't capture space.
    '^\\s*(?:'
    // Capture an attribute name in group 1, and value in group 3.
    // We capture the fact that there was an attribute in group 2, since
    // interpreters are inconsistent in whether a group that matches nothing
    // is null, undefined, or the empty string.
    + ('(?:'
       + '([a-z][a-z-]*)'                    // attribute name
       + ('('                                // optionally followed
          + '\\s*=\\s*'
          + ('('
             // A double quoted string.
             + '"[^"]*"'
             // A single quoted string.
             + '|\'[^\']*\''
             // The positive lookahead is used to make sure that in
             // <foo bar= baz=boo>, the value for bar is blank, not "baz=boo".
             + '|(?=[a-z][a-z-]*\\s*=)'
             // An unquoted value that is not an attribute name.
             // We know it is not an attribute name because the previous
             // zero-width match would've eliminated that possibility.
             + '|[^>"\'\\s]*'
             + ')'
             )
          + ')'
          ) + '?'
       + ')'
       )
    // End of tag captured in group 4.
    + '|(/?>)'
    // Don't capture cruft
    + '|.[^\\w\\s>]*)',
    'i');
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
228
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
/**
 * Token definition used while the parser is outside a tag.  It matches, at
 * the start of the input, one of: an entity, a comment / doctype / processing
 * instruction, the opening of a start or end tag, a run of text, or a single
 * stray '<', '&' or '>'.
 *
 * Capture groups:
 *   1 - the entity name (between '&' and ';');
 *   2 - '/' when the tag is a close tag;
 *   3 - the tag name;
 *   4 - text;
 *   5 - cruft.
 */
var OUTSIDE_TAG_TOKEN = new RegExp(
    '^(?:'
    // Entity captured in group 1.
    + '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);'
    // Comment, doctypes, and processing instructions not captured.
    // NOTE(review): the processing-instruction class [^>*] also excludes
    // '*'; possibly [^>] was intended -- confirm before changing.
    + '|<!--[\\s\\S]*?-->|<!\\w[^>]*>|<\\?[^>*]*>'
    // '/' captured in group 2 for close tags, and name captured in group 3.
    + '|<(/)?([a-z][a-z0-9]*)'
    // Text captured in group 4.
    + '|([^<&>]+)'
    // Cruft captured in group 5.
    + '|([<&>]))',
    'i');
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
242
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
243 /**
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
244 * Given a SAX-like event handler, produce a function that feeds those
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
245 * events and a parameter to the event handler.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
246 *
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
247 * The event handler has the form:{@code
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
248 * {
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
249 * // Name is an upper-case HTML tag name. Attribs is an array of
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
250 * // alternating upper-case attribute names, and attribute values. The
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
251 * // attribs array is reused by the parser. Param is the value passed to
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
252 * // the saxParser.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
253 * startTag: function (name, attribs, param) { ... },
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
254 * endTag: function (name, param) { ... },
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
255 * pcdata: function (text, param) { ... },
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
256 * rcdata: function (text, param) { ... },
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
257 * cdata: function (text, param) { ... },
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
258 * startDoc: function (param) { ... },
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
259 * endDoc: function (param) { ... }
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
260 * }}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
261 *
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
262 * @param {Object} handler a record containing event handlers.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
263 * @return {Function} that takes a chunk of html and a parameter.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
264 * The parameter is passed on to the handler methods.
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
265 */
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
function makeSaxParser(handler) {
  // Every handler method (startDoc, endDoc, startTag, endTag, pcdata, rcdata,
  // cdata) is optional; each is checked for presence before being called and
  // receives `param` as its last argument.
  return function parse(htmlText, param) {
    htmlText = String(htmlText);
    // Lower-cased copy of the not-yet-consumed input.  Built lazily the first
    // time a CDATA/RCDATA element is opened and re-aligned on later uses.
    var htmlLower = null;

    var inTag = false;  // True iff we're currently processing a tag.
    var attribs = [];  // Accumulates attribute names and values.
    var tagName = void 0;  // The name of the tag currently being processed.
    var eflags = void 0;  // The element flags for the current tag.
    var openTag = void 0;  // True if the current tag is an open tag.

    if (handler.startDoc) { handler.startDoc(param); }

    while (htmlText) {
      var m = htmlText.match(inTag ? INSIDE_TAG_TOKEN : OUTSIDE_TAG_TOKEN);
      if (!m || !m[0]) {
        // String.prototype.match returns null when the pattern does not match,
        // and an empty match would never shrink htmlText.  The token patterns
        // are defined elsewhere in this file, so do not assume they match at
        // every position: skip one character to guarantee forward progress
        // instead of throwing a TypeError or looping forever.
        htmlText = htmlText.substring(1);
        continue;
      }
      htmlText = htmlText.substring(m[0].length);

      if (inTag) {
        if (m[1]) {  // attribute
          // setAttribute with uppercase names doesn't work on IE6.
          var attribName = lcase(m[1]);
          var decodedValue;
          if (m[2]) {
            var encodedValue = m[3];
            switch (encodedValue.charCodeAt(0)) {  // Strip quotes
              case 34: case 39:  // '"' and "'"
                encodedValue = encodedValue.substring(
                    1, encodedValue.length - 1);
                break;
            }
            decodedValue = unescapeEntities(stripNULs(encodedValue));
          } else {
            // Use name as value for valueless attribs, so
            //   <input type=checkbox checked>
            // gets attributes ['type', 'checkbox', 'checked', 'checked']
            decodedValue = attribName;
          }
          attribs.push(attribName, decodedValue);
        } else if (m[4]) {  // end of the tag
          if (eflags !== void 0) {  // False if not in whitelist.
            if (openTag) {
              if (handler.startTag) {
                handler.startTag(tagName, attribs, param);
              }
            } else {
              if (handler.endTag) {
                handler.endTag(tagName, param);
              }
            }
          }

          if (openTag
              && (eflags & (html4.eflags.CDATA | html4.eflags.RCDATA))) {
            // The element body is raw text: hand everything up to the next
            // '</' + tagName (compared in lower case) to cdata/rcdata in one
            // piece instead of tokenizing it.
            if (htmlLower === null) {
              htmlLower = lcase(htmlText);
            } else {
              // Drop the prefix consumed since htmlLower was last aligned.
              htmlLower = htmlLower.substring(
                  htmlLower.length - htmlText.length);
            }
            var dataEnd = htmlLower.indexOf('</' + tagName);
            // No closing tag: the body runs to the end of the input.
            if (dataEnd < 0) { dataEnd = htmlText.length; }
            if (eflags & html4.eflags.CDATA) {
              if (handler.cdata) {
                handler.cdata(htmlText.substring(0, dataEnd), param);
              }
            } else if (handler.rcdata) {
              handler.rcdata(
                  normalizeRCData(htmlText.substring(0, dataEnd)), param);
            }
            htmlText = htmlText.substring(dataEnd);
          }

          // Reset per-tag state.  The attribs array is emptied in place, so
          // the instance passed to startTag is reused for the next tag.
          tagName = eflags = openTag = void 0;
          attribs.length = 0;
          inTag = false;
        }
      } else {
        if (m[1]) {  // Entity
          if (handler.pcdata) { handler.pcdata(m[0], param); }
        } else if (m[3]) {  // Tag
          openTag = !m[2];
          inTag = true;
          tagName = lcase(m[3]);
          eflags = html4.ELEMENTS.hasOwnProperty(tagName)
              ? html4.ELEMENTS[tagName] : void 0;
        } else if (m[4]) {  // Text
          if (handler.pcdata) { handler.pcdata(m[4], param); }
        } else if (m[5]) {  // Cruft
          // A stray markup character is passed on as its entity.
          if (handler.pcdata) {
            switch (m[5]) {
              case '<': handler.pcdata('&lt;', param); break;
              case '>': handler.pcdata('&gt;', param); break;
              default: handler.pcdata('&amp;', param); break;
            }
          }
        }
      }
    }

    if (handler.endDoc) { handler.endDoc(param); }
  };
}
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
368
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
369 return {
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
370 normalizeRCData: normalizeRCData,
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
371 escapeAttrib: escapeAttrib,
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
372 unescapeEntities: unescapeEntities,
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
373 makeSaxParser: makeSaxParser
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
374 };
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
375 })();
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
376
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
/**
 * Builds a sanitizer: a parser that drops unsafe tags and attributes and
 * appends the markup it keeps, piece by piece, to an output array.
 * @param {Function} sanitizeAttributes
 *     called as (tagName, attribs[]); returns the attribute array to emit, or
 *     a falsy value to drop the start tag altogether.  attribs is a flat
 *     [name, value, name, value, ...] list; pairs whose value is null or
 *     undefined are not written out.  The array may be modified freely, but
 *     the same instance is reused between tags, so it must not be retained.
 * @return {Function} the parser produced by html.makeSaxParser; html_sanitize
 *     in this file calls it as (htmlText, out), where out collects the
 *     output strings.
 */
html.makeHtmlSanitizer = function (sanitizeAttributes) {
  // Names of the elements that have been opened in the output and not yet
  // closed, innermost last.
  var stack = [];
  // True while the content of an unsafe, non-empty element is being dropped.
  var ignoring = false;

  // Shared by pcdata, rcdata and cdata: text is copied through unless it
  // belongs to an element that is being dropped.
  function emitText(text, out) {
    if (!ignoring) { out.push(text); }
  }

  return html.makeSaxParser({
    startDoc: function (_) {
      stack = [];
      ignoring = false;
    },

    startTag: function (tagName, attribs, out) {
      if (ignoring || !html4.ELEMENTS.hasOwnProperty(tagName)) { return; }

      var flags = html4.ELEMENTS[tagName];
      if (flags & html4.eflags.FOLDABLE) { return; }
      if (flags & html4.eflags.UNSAFE) {
        // Unsafe elements that can have content swallow it as well.
        ignoring = !(flags & html4.eflags.EMPTY);
        return;
      }

      attribs = sanitizeAttributes(tagName, attribs);
      // TODO(mikesamuel): relying on sanitizeAttributes not to
      // insert unsafe attribute names.
      if (!attribs) { return; }

      if (!(flags & html4.eflags.EMPTY)) {
        stack.push(tagName);
      }

      out.push('<', tagName);
      var count = attribs.length;
      for (var pos = 0; pos < count; pos += 2) {
        var attribValue = attribs[pos + 1];
        if (attribValue === null || attribValue === void 0) { continue; }
        out.push(
            ' ', attribs[pos], '="', html.escapeAttrib(attribValue), '"');
      }
      out.push('>');
    },

    endTag: function (tagName, out) {
      if (ignoring) {
        ignoring = false;
        return;
      }
      if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }

      var flags = html4.ELEMENTS[tagName];
      var neverClosed = html4.eflags.UNSAFE | html4.eflags.EMPTY
          | html4.eflags.FOLDABLE;
      if (flags & neverClosed) { return; }

      // Walk down from the innermost open element looking for tagName.
      var optionalEnd = flags & html4.eflags.OPTIONAL_ENDTAG;
      var match = stack.length - 1;
      while (match >= 0 && stack[match] !== tagName) {
        if (optionalEnd
            && !(html4.ELEMENTS[stack[match]]
                 & html4.eflags.OPTIONAL_ENDTAG)) {
          // Don't pop non optional end tags looking for a match.
          return;
        }
        match -= 1;
      }
      if (match < 0) { return; }  // Not opened.

      // Close everything opened inside the matched element, innermost first,
      // leaving out elements whose end tag is optional.
      for (var k = stack.length - 1; k > match; k -= 1) {
        var inner = stack[k];
        if (!(html4.ELEMENTS[inner] & html4.eflags.OPTIONAL_ENDTAG)) {
          out.push('</', inner, '>');
        }
      }
      stack.length = match;
      out.push('</', tagName, '>');
    },

    pcdata: emitText,
    rcdata: emitText,
    cdata: emitText,

    endDoc: function (out) {
      // Close whatever is still open, innermost first.
      var depth = stack.length;
      while (depth-- > 0) {
        out.push('</', stack[depth], '>');
      }
      stack.length = 0;
    }
  });
};
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
474
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
475
633c9cb05555 Origination.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
/**
 * Sanitizes a chunk of html, removing unsafe tags and attributes.
 * @param {string} htmlText the html to sanitize
 * @param {Function} opt_urlPolicy -- a transform applied to the value of
 *     every URI attribute.  When it is omitted, URI attributes are dropped.
 * @param {Function} opt_nmTokenPolicy : string -> string? -- a transform
 *     applied to names, ids, and classes.  When it is omitted, those values
 *     are kept unchanged.
 * @return {string} the sanitized html
 */
function html_sanitize(htmlText, opt_urlPolicy, opt_nmTokenPolicy) {
  // Rewrites the values of the flat [name, value, ...] list in place; a value
  // left null or undefined tells the sanitizer to omit that attribute.
  var sanitizeAttribs = function sanitizeAttribs(tagName, attribs) {
    for (var pos = 0; pos < attribs.length; pos += 2) {
      var attribName = attribs[pos];
      var value = attribs[pos + 1];

      // An entry for this specific element wins over the '*' wildcard entry.
      var atype = null;
      var attribKey = tagName + ':' + attribName;
      if (html4.ATTRIBS.hasOwnProperty(attribKey)) {
        atype = html4.ATTRIBS[attribKey];
      } else {
        attribKey = '*:' + attribName;
        if (html4.ATTRIBS.hasOwnProperty(attribKey)) {
          atype = html4.ATTRIBS[attribKey];
        }
      }

      if (atype === null) {
        // Attributes that are not listed are removed.
        value = null;
      } else if (atype === html4.atype.SCRIPT
                 || atype === html4.atype.STYLE) {
        value = null;
      } else if (atype === html4.atype.IDREF
                 || atype === html4.atype.IDREFS
                 || atype === html4.atype.GLOBAL_NAME
                 || atype === html4.atype.LOCAL_NAME
                 || atype === html4.atype.CLASSES) {
        if (opt_nmTokenPolicy) { value = opt_nmTokenPolicy(value); }
      } else if (atype === html4.atype.URI) {
        value = opt_urlPolicy && opt_urlPolicy(value);
      }
      // Any other attribute type keeps its value as is.

      attribs[pos + 1] = value;
    }
    return attribs;
  };

  var out = [];
  var sanitize = html.makeHtmlSanitizer(sanitizeAttribs);
  sanitize(htmlText, out);
  return out.join('');
}