Skip to content

Commit 93feeb3

Browse files
committed
Various minor efficiency tweaks to improve performance on large documents.
In my tests parsing the 7-megabyte single-page WHATWG HTML spec, these tweaks resulted in a 1.3x performance improvement.
1 parent e4def09 commit 93feeb3

File tree

4 files changed

+50
-43
lines changed

4 files changed

+50
-43
lines changed

lib/sanitize/transformers/clean_cdata.rb

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,9 @@
33
class Sanitize; module Transformers
44

55
CleanCDATA = lambda do |env|
6-
return if env[:is_whitelisted]
7-
86
node = env[:node]
97

10-
if node.cdata?
8+
if node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
119
node.replace(Nokogiri::XML::Text.new(node.text, node.document))
1210
end
1311
end

lib/sanitize/transformers/clean_comment.rb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
class Sanitize; module Transformers
44

55
CleanComment = lambda do |env|
6-
return if env[:is_whitelisted]
7-
env[:node].unlink if env[:node].comment?
6+
node = env[:node]
7+
8+
if node.type == Nokogiri::XML::Node::COMMENT_NODE
9+
node.unlink unless env[:is_whitelisted]
10+
end
811
end
912

1013
end; end

lib/sanitize/transformers/clean_doctype.rb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
class Sanitize; module Transformers
44

55
CleanDoctype = lambda do |env|
6-
return if env[:is_whitelisted]
7-
env[:node].unlink if env[:node].type == Nokogiri::XML::Node::DTD_NODE
6+
node = env[:node]
7+
8+
if node.type == Nokogiri::XML::Node::DTD_NODE
9+
node.unlink unless env[:is_whitelisted]
10+
end
811
end
912

1013
end; end

lib/sanitize/transformers/clean_element.rb

Lines changed: 39 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# encoding: utf-8
22

3+
require 'set'
4+
35
class Sanitize; module Transformers; class CleanElement
46

57
# Matches a valid HTML5 data attribute name. The unicode ranges included here
@@ -24,21 +26,28 @@ class Sanitize; module Transformers; class CleanElement
2426
REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
2527

2628
def initialize(config)
27-
@config = config
28-
29-
# For faster lookups.
3029
@add_attributes = config[:add_attributes]
31-
@allowed_elements = Set.new(config[:elements])
32-
@attributes = config[:attributes]
30+
@attributes = config[:attributes].dup
31+
@elements = Set.new(config[:elements])
3332
@protocols = config[:protocols]
3433
@remove_all_contents = false
3534
@remove_element_contents = Set.new
36-
@whitespace_elements = Hash.new
35+
@whitespace_elements = {}
36+
37+
if @attributes.include?(:all)
38+
@attributes[:all] = Set.new(@attributes[:all])
39+
end
40+
41+
@attributes.each do |element_name, attrs|
42+
unless element_name == :all
43+
@attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
44+
end
45+
end
3746

38-
# Converting :whitespace_element into a Hash for backwards compatibility.
47+
# Backcompat: if :whitespace_elements is an array, convert it to a hash.
3948
if config[:whitespace_elements].is_a?(Array)
4049
config[:whitespace_elements].each do |element|
41-
@whitespace_elements[element] = { :before => ' ', :after => ' ' }
50+
@whitespace_elements[element] = {:before => ' ', :after => ' '}
4251
end
4352
else
4453
@whitespace_elements = config[:whitespace_elements]
@@ -55,10 +64,10 @@ def call(env)
5564
name = env[:node_name]
5665
node = env[:node]
5766

58-
return if env[:is_whitelisted] || !node.element?
67+
return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
5968

6069
# Delete any element that isn't in the config whitelist.
61-
unless @allowed_elements.include?(name)
70+
unless @elements.include?(name)
6271
# Elements like br, div, p, etc. need to be replaced with whitespace in
6372
# order to preserve readability.
6473
if @whitespace_elements.include?(name)
@@ -77,21 +86,33 @@ def call(env)
7786
return
7887
end
7988

80-
attr_whitelist = Set.new((@attributes[name] || []) +
81-
(@attributes[:all] || []))
82-
83-
allow_data_attributes = attr_whitelist.include?(:data)
89+
attr_whitelist = @attributes[name] || @attributes[:all]
8490

85-
if attr_whitelist.empty?
91+
if attr_whitelist.nil?
8692
# Delete all attributes from elements with no whitelisted attributes.
8793
node.attribute_nodes.each {|attr| attr.unlink }
8894
else
95+
allow_data_attributes = attr_whitelist.include?(:data)
96+
8997
# Delete any attribute that isn't allowed on this element.
9098
node.attribute_nodes.each do |attr|
9199
attr_name = attr.name.downcase
92100

93-
unless attr_whitelist.include?(attr_name)
94-
# The attribute isn't explicitly whitelisted.
101+
if attr_whitelist.include?(attr_name)
102+
# The attribute is whitelisted.
103+
104+
# Remove any attributes that use unacceptable protocols.
105+
if @protocols.include?(name) && @protocols[name].include?(attr_name)
106+
attr_protocols = @protocols[name][attr_name]
107+
108+
if attr.value.to_s.downcase =~ REGEX_PROTOCOL
109+
attr.unlink unless attr_protocols.include?($1.downcase)
110+
else
111+
attr.unlink unless attr_protocols.include?(:relative)
112+
end
113+
end
114+
else
115+
# The attribute isn't whitelisted.
95116

96117
if allow_data_attributes && attr_name.start_with?('data-')
97118
# Arbitrary data attributes are allowed. Verify that the attribute
@@ -104,28 +125,10 @@ def call(env)
104125
end
105126
end
106127
end
107-
108-
# Delete remaining attributes that use unacceptable protocols.
109-
if @protocols.has_key?(name)
110-
protocol = @protocols[name]
111-
112-
node.attribute_nodes.each do |attr|
113-
attr_name = attr.name.downcase
114-
next false unless protocol.has_key?(attr_name)
115-
116-
del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
117-
!protocol[attr_name].include?($1.downcase)
118-
else
119-
!protocol[attr_name].include?(:relative)
120-
end
121-
122-
attr.unlink if del
123-
end
124-
end
125128
end
126129

127130
# Add required attributes.
128-
if @add_attributes.has_key?(name)
131+
if @add_attributes.include?(name)
129132
@add_attributes[name].each {|key, val| node[key] = val }
130133
end
131134
end

0 commit comments

Comments
 (0)