-
Notifications
You must be signed in to change notification settings - Fork 143
/
Copy pathclean_element.rb
251 lines (214 loc) · 9.58 KB
/
clean_element.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# frozen_string_literal: true
require "cgi"
require "set"
class Sanitize
  module Transformers
    # Transformer that applies a config's element, attribute, and protocol
    # allowlists to a single element node: disallowed elements are unlinked
    # (optionally keeping their children and/or leaving whitespace behind),
    # disallowed attributes are removed, configured attributes are added, and
    # a few element-specific safety fixes are applied.
    class CleanElement
      # Matches a valid HTML5 data attribute name. The unicode ranges included
      # here are a conservative subset of the full range of characters that are
      # technically allowed, with the intent of matching the most common
      # characters used in data attribute names while excluding uncommon or
      # potentially misleading characters, or characters with the potential to
      # be normalized into unsafe or confusing forms.
      #
      # If you need data attr names with characters that aren't included here
      # (such as combining marks, full-width characters, or CJK), please
      # consider creating a custom transformer to validate attributes according
      # to your needs.
      #
      # https://html.spec.whatwg.org/multipage/dom.html#embedding-custom-non-visible-data-with-the-data-*-attributes
      REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u

      # Elements whose content is treated as unescaped text by HTML parsers.
      UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
        iframe
        noembed
        noframes
        noscript
        plaintext
        script
        style
        xmp
      ])

      # Attributes that need additional escaping on `<a>` elements due to unsafe
      # libxml2 behavior.
      UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
        name
      ])

      # Attributes that need additional escaping on all elements due to unsafe
      # libxml2 behavior.
      UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
        action
        href
        src
      ])

      # Mapping of original characters to escape sequences for characters that
      # should be escaped in attributes affected by unsafe libxml2 behavior.
      UNSAFE_LIBXML_ESCAPE_CHARS = {
        " " => "%20",
        '"' => "%22"
      }

      # Regex that matches any single character that needs to be escaped in
      # attributes affected by unsafe libxml2 behavior.
      UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/

      # Builds a transformer from a config hash. The following keys are read;
      # any of them may be omitted, in which case an empty collection (or
      # `false` for `:remove_contents`) is assumed:
      #
      # - `:elements`: collection of allowed element names.
      # - `:attributes`: hash mapping an element name (or `:all`) to the
      #   attribute names allowed on it. Including `:data` in a list allows
      #   attributes whose names match REGEX_DATA_ATTR.
      # - `:protocols`: hash of element name => attribute name => collection
      #   of allowed protocol names; `:relative` allows values in which no
      #   protocol is found.
      # - `:add_attributes`: hash of element name => attributes to set.
      # - `:whitespace_elements`: hash of element name =>
      #   `{before: ..., after: ...}` text that replaces a removed element,
      #   or (for backcompat) a Set of element names.
      # - `:remove_contents`: an Enumerable of element names whose children
      #   are discarded when the element is removed, or a truthy value to
      #   discard the children of every removed element.
      def initialize(config)
        @add_attributes = config[:add_attributes] || {}
        # Shallow copy: per-element lists are replaced below, and the caller's
        # hash must not be modified.
        @attributes = (config[:attributes] || {}).dup
        @elements = config[:elements] || []
        @protocols = config[:protocols] || {}
        @remove_all_contents = false
        @remove_element_contents = Set.new

        # Attributes allowed on :all elements are folded into every
        # per-element list so that `call` only needs a single lookup.
        global_attrs = @attributes[:all] || []

        @attributes.each do |element_name, attrs|
          next if element_name == :all

          @attributes[element_name] = Set.new(attrs).merge(global_attrs)
        end

        # Backcompat: if :whitespace_elements is a Set, convert it to a hash.
        whitespace_elements = config[:whitespace_elements]

        @whitespace_elements =
          if whitespace_elements.is_a?(Set)
            whitespace_elements.each_with_object({}) do |element, result|
              result[element] = {before: " ", after: " "}
            end
          else
            whitespace_elements || {}
          end

        remove_contents = config[:remove_contents]

        if remove_contents.is_a?(Enumerable)
          @remove_element_contents.merge(remove_contents.map(&:to_s))
        else
          @remove_all_contents = !!remove_contents
        end
      end

      # Cleans the node described by `env` in place.
      #
      # `env` is a hash from which `:node` (the node to clean), `:node_name`
      # (the element name used for every config lookup), and `:is_allowlisted`
      # are read. Nodes that aren't elements, and nodes flagged as allowlisted,
      # are left untouched. The return value carries no meaning.
      def call(env)
        node = env[:node]
        return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]

        name = env[:node_name]

        # Delete any element that isn't in the config allowlist, unless the node
        # has already been deleted from the document.
        #
        # It's important that we not try to reparent the children of a node that
        # has already been deleted, since that seems to trigger a memory leak in
        # Nokogiri.
        unless @elements.include?(name) || node.parent.nil?
          remove_element(node, name)
          return
        end

        clean_attributes(node, name)

        # Add required attributes.
        if @add_attributes.include?(name)
          @add_attributes[name].each { |key, val| node[key] = val }
        end

        apply_element_special_cases(node, name)
      end

      private

      # Unlinks a disallowed element, leaving behind its configured whitespace
      # and, unless their removal was requested, its children.
      def remove_element(node, name)
        # Elements like br, div, p, etc. need to be replaced with whitespace
        # in order to preserve readability.
        if @whitespace_elements.include?(name)
          whitespace = @whitespace_elements[name]

          node.add_previous_sibling(Nokogiri::XML::Text.new(whitespace[:before].to_s, node.document))

          unless node.children.empty?
            node.add_next_sibling(Nokogiri::XML::Text.new(whitespace[:after].to_s, node.document))
          end
        end

        discard_contents = @remove_all_contents || @remove_element_contents.include?(name)

        if !node.children.empty? && !discard_contents
          node.add_previous_sibling(node.children)
        end

        node.unlink
      end

      # Removes every attribute of `node` that its allowlist doesn't permit and
      # normalizes the values of the ones that remain.
      def clean_attributes(node, name)
        attr_allowlist = @attributes[name] || @attributes[:all]

        if attr_allowlist.nil?
          # Delete all attributes from elements with no allowlisted attributes.
          node.attribute_nodes.each(&:unlink)
          return
        end

        allow_data_attributes = attr_allowlist.include?(:data)

        node.attribute_nodes.each do |attr|
          clean_attribute(attr, name, attr_allowlist, allow_data_attributes)
        end
      end

      # Unlinks `attr` if it isn't allowed on element `name` or uses an
      # unacceptable protocol; otherwise strips and escapes its value as needed.
      def clean_attribute(attr, name, attr_allowlist, allow_data_attributes)
        attr_name = attr.name.downcase

        # An attribute that isn't in the allowlist may still be allowed if it's
        # a data attribute and arbitrary data attributes are permitted.
        allowed = attr_allowlist.include?(attr_name) ||
          (allow_data_attributes && attr_name.start_with?("data-") && attr_name =~ REGEX_DATA_ATTR)

        unless allowed
          attr.unlink
          return
        end

        # The attribute is allowed.

        # Remove any attributes that use unacceptable protocols.
        if @protocols.include?(name) && @protocols[name].include?(attr_name)
          unless allowed_protocol?(attr.value, @protocols[name][attr_name])
            attr.unlink
            return
          end

          # Leading and trailing whitespace around URLs is ignored at parse
          # time. Stripping it here prevents it from being escaped by the
          # libxml2 workaround below.
          attr.value = attr.value.strip
        end

        # libxml2 >= 2.9.2 doesn't escape comments within some attributes,
        # in an attempt to preserve server-side includes. This can result in
        # XSS since an unescaped double quote can allow an attacker to
        # inject a non-allowlisted attribute.
        #
        # Sanitize works around this by implementing its own escaping for
        # affected attributes, some of which can exist on any element and
        # some of which can only exist on `<a>` elements.
        #
        # This fix is technically no longer necessary with Nokogumbo >= 2.0
        # since it no longer uses libxml2's serializer, but it's retained to
        # avoid breaking use cases where people might be sanitizing
        # individual Nokogiri nodes and then serializing them manually
        # without Nokogumbo.
        #
        # The relevant libxml2 code is here:
        # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
        if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
            (name == "a" && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
          attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
        end
      end

      # Returns whether `value` uses one of the `allowed` protocols. A value in
      # which REGEX_PROTOCOL finds no protocol is only acceptable when
      # `:relative` is allowed.
      def allowed_protocol?(value, allowed)
        if value =~ REGEX_PROTOCOL
          allowed.include?(Regexp.last_match(1).downcase)
        else
          allowed.include?(:relative)
        end
      end

      # Applies the safety fixes that are specific to individual element names.
      def apply_element_special_cases(node, name)
        case name
        # If this is an allowlisted iframe that has children, remove all its
        # children. The HTML standard says iframes shouldn't have content, but
        # when they do, this content is parsed as text and is serialized
        # verbatim without being escaped, which is unsafe because legacy
        # browsers may still render it and execute `<script>` content. So the
        # safe and correct thing to do is to always remove iframe content.
        when "iframe"
          node.children.each(&:unlink)

        # Prevent the use of `<meta>` elements that set a charset other than
        # UTF-8, since Sanitize's output is always UTF-8.
        when "meta"
          force_utf8_meta(node)

        # A `<noscript>` element's content is parsed differently in browsers
        # depending on whether or not scripting is enabled. Since Nokogiri
        # doesn't support scripting, it always parses `<noscript>` elements as
        # if scripting is disabled. This results in edge cases where it's not
        # possible to reliably sanitize the contents of a `<noscript>` element
        # because Nokogiri can't fully replicate the parsing behavior of a
        # scripting-enabled browser. The safest thing to do is to simply remove
        # all `<noscript>` elements.
        when "noscript"
          node.unlink
        end
      end

      # Rewrites any non-UTF-8 charset declared by a `<meta>` element, whether
      # in its `charset` attribute or in a `content-type` pragma, to UTF-8.
      def force_utf8_meta(node)
        if node.has_attribute?("charset") && node["charset"].downcase != "utf-8"
          node["charset"] = "utf-8"
        end

        if node.has_attribute?("http-equiv") &&
            node.has_attribute?("content") &&
            node["http-equiv"].downcase == "content-type" &&
            node["content"].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
          # The check above runs on a downcased copy and `\s` spans newlines,
          # so the replacement has to be case-insensitive and multiline too.
          # Otherwise a value such as "text/html; CHARSET=latin1" or one that
          # is followed by a newline would be detected but left in place.
          node["content"] = node["content"].gsub(/;\s*charset\s*=.*\z/im, ";charset=utf-8")
        end
      end
    end
  end
end