Skip to content

Commit 17ac8a2

Browse files
straight-shoota, RX14
authored and committed Oct 4, 2017
Improve decoding of HTML entities (#5064)
* Improve method doc
* fix invalid entity with trailing semicolon
* fix entities with numerical characters
* Improve decoding of numerical character references according to HTML5 spec
* Use case instead of if branches
* group all disallowed codepoints outside case statement
* fix typo
* Simplify branches
* replacement character as Char
1 parent 25392e1 commit 17ac8a2

File tree

2 files changed

+129
-20
lines changed

2 files changed

+129
-20
lines changed
 

Diff for: ‎spec/std/html_spec.cr

+53-7
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ describe "HTML" do
2323
str.should eq("safe_string")
2424
end
2525

26-
it "unescapes dangerous characters from a string" do
26+
it "unescapes html special characters" do
2727
str = HTML.unescape("< & >")
2828

2929
str.should eq("< & >")
@@ -42,9 +42,9 @@ describe "HTML" do
4242
end
4343

4444
it "unescapes with invalid entities" do
45-
str = HTML.unescape("&&lt;&amp&gt;&quot&abcdefghijklmn")
45+
str = HTML.unescape("&&lt;&amp&gt;&quot&abcdefghijklmn &ThisIsNotAnEntitiy;")
4646

47-
str.should eq("&<&>\"&abcdefghijklmn")
47+
str.should eq("&<&>\"&abcdefghijklmn &ThisIsNotAnEntitiy;")
4848
end
4949

5050
it "unescapes hex encoded chars" do
@@ -53,18 +53,33 @@ describe "HTML" do
5353
str.should eq("3 + 2 = 5")
5454
end
5555

56+
it "unescapes decimal encoded chars" do
57+
str = HTML.unescape("3 &#00043; 2 &#00061 5")
58+
59+
str.should eq("3 + 2 = 5")
60+
end
61+
5662
it "unescapes &nbsp;" do
5763
str = HTML.unescape("nbsp&nbsp;space ")
5864

5965
str.should eq("nbsp\u{0000A0}space ")
6066
end
6167

62-
it "unescapes Char::MAX_CODEPOINT" do
68+
it "does not unescape Char::MAX_CODEPOINT" do
69+
# Char::MAX_CODEPOINT is actually a noncharacter and is not replaced
6370
str = HTML.unescape("limit &#x10FFFF;")
64-
str.should eq("limit 􏿿")
71+
str.should eq("limit &#x10FFFF;")
6572

6673
str = HTML.unescape("limit &#1114111;")
67-
str.should eq("limit 􏿿")
74+
str.should eq("limit &#1114111;")
75+
end
76+
77+
it "does not unescape characters above Char::MAX_CODEPOINT" do
78+
str = HTML.unescape("limit &#x110000;")
79+
str.should eq("limit \uFFFD")
80+
81+
str = HTML.unescape("limit &#1114112;")
82+
str.should eq("limit \uFFFD")
6883
end
6984

7085
it "unescapes &NotSquareSuperset;" do
@@ -73,9 +88,40 @@ describe "HTML" do
7388
str.should eq(" ⊐̸ ")
7489
end
7590

76-
it "unescapes &ampd" do
91+
it "unescapes entities without trailing semicolon" do
7792
str = HTML.unescape("&amphello")
7893
str.should eq("&hello")
7994
end
95+
96+
it "unescapes named character reference with numerical characters" do
97+
str = HTML.unescape("&frac34;")
98+
str.should eq("\u00BE")
99+
end
100+
101+
it "does not escape unicode control characters except space characters" do
102+
string = "&#x0001;-&#x001F; &#x000D; &#x007F;"
103+
HTML.unescape(string).should eq(string)
104+
105+
string = HTML.unescape("&#x0080;-&#x009F;")
106+
string.should eq("\u20AC-\u0178")
107+
108+
HTML.unescape("&#x000;").should eq("\uFFFD")
109+
end
110+
111+
it "escapes space characters" do
112+
string = HTML.unescape("&#x0020;&#32;&#x0009;&#x000A;&#x000C;")
113+
string.should eq(" \t\n\f")
114+
end
115+
116+
it "does not escape noncharacter codepoints" do
117+
# noncharacters http://www.unicode.org/faq/private_use.html
118+
string = "&#xFDD0;-&#xFDEF; &#xFFFE; &#FFFF; &#x1FFFE; &#x1FFFF; &#x2FFFE; &#x10FFFF;"
119+
HTML.unescape(string).should eq(string)
120+
end
121+
122+
it "does not escape unicode surrogate characters" do
123+
string = "&#xD800;-&#xDFFF;"
124+
HTML.unescape(string).should eq("\uFFFD-\uFFFD")
125+
end
80126
end
81127
end

Diff for: ‎src/html.cr

+76-13
Original file line numberDiff line numberDiff line change
@@ -36,21 +36,61 @@ module HTML
3636
end
3737
end
3838

39+
# These replacements permit compatibility with old numeric entities that
40+
# assumed Windows-1252 encoding.
41+
# http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
42+
private CHARACTER_REPLACEMENTS = {
43+
'\u20AC', # First entry is what 0x80 should be replaced with.
44+
'\u0081',
45+
'\u201A',
46+
'\u0192',
47+
'\u201E',
48+
'\u2026',
49+
'\u2020',
50+
'\u2021',
51+
'\u02C6',
52+
'\u2030',
53+
'\u0160',
54+
'\u2039',
55+
'\u0152',
56+
'\u008D',
57+
'\u017D',
58+
'\u008F',
59+
'\u0090',
60+
'\u2018',
61+
'\u2019',
62+
'\u201C',
63+
'\u201D',
64+
'\u2022',
65+
'\u2013',
66+
'\u2014',
67+
'\u02DC',
68+
'\u2122',
69+
'\u0161',
70+
'\u203A',
71+
'\u0153',
72+
'\u009D',
73+
'\u017E',
74+
'\u0178', # Last entry is 0x9F.
75+
# 0x00->'\uFFFD' is handled programmatically.
76+
# 0x0D->'\u000D' is a no-op.
77+
}
78+
3979
# Returns a string where named and numeric character references
4080
# (e.g. &gt;, &#62;, &#x3e;) in *string* are replaced with the corresponding
41-
# unicode characters.
81+
# unicode characters. This method decodes all HTML5 entities including those
82+
# without a trailing semicolon (such as `&copy`).
4283
#
4384
# ```
4485
# HTML.unescape("Crystal &amp; You") # => "Crystal & You"
4586
# ```
4687
def self.unescape(string : String) : String
47-
string.gsub(/&(?:([a-zA-Z]{2,32};?)|\#([0-9]+);?|\#[xX]([0-9A-Fa-f]+);?)/) do |string, match|
88+
string.gsub(/&(?:([a-zA-Z0-9]{2,32};?)|\#([0-9]+);?|\#[xX]([0-9A-Fa-f]+);?)/) do |string, match|
4889
if code = match[1]?
4990
# Try to find the code
5091
value = named_entity(code)
51-
if value
52-
value
53-
elsif !code.ends_with?(';')
92+
93+
unless value || code.ends_with?(';')
5494
# If we can't find it and it doesn't end with ';',
5595
# we need to find each prefix of it.
5696
# We start from the largest prefix.
@@ -67,19 +107,17 @@ module HTML
67107
break
68108
end
69109
end
70-
71-
# We either found the code or not,
72-
# in which case we need to return the original string
73-
value || string
74110
end
111+
112+
# We either found the code or not,
113+
# in which case we need to return the original string
114+
value || string
75115
elsif code = match[2]?
76116
# Find by decimal code
77-
n = code.to_i
78-
n <= Char::MAX_CODEPOINT ? n.unsafe_chr : string
117+
decode_codepoint(code.to_i) || string
79118
elsif code = match[3]?
80119
# Find by hexadecimal code
81-
n = code.to_i(16)
82-
n <= Char::MAX_CODEPOINT ? n.unsafe_chr : string
120+
decode_codepoint(code.to_i(16)) || string
83121
else
84122
string
85123
end
@@ -89,4 +127,29 @@ module HTML
89127
private def self.named_entity(code)
90128
HTML::SINGLE_CHAR_ENTITIES[code]? || HTML::DOUBLE_CHAR_ENTITIES[code]?
91129
end
130+
131+
# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
132+
private def self.decode_codepoint(codepoint)
133+
case codepoint
134+
when 0x80..0x9F
135+
# Replace characters from Windows-1252 with their Unicode equivalents.
136+
CHARACTER_REPLACEMENTS[codepoint - 0x80].to_s
137+
when 0,
138+
.>(Char::MAX_CODEPOINT),
139+
0xD800..0xDFFF # unicode surrogate characters
140+
# Replace invalid characters with replacement character.
141+
'\uFFFD'
142+
else
143+
# don't replace disallowed codepoints
144+
unless codepoint == 0x007F ||
145+
# unicode noncharacters
146+
(0xFDD0..0xFDEF).includes?(codepoint) ||
147+
# last two of each plane (nonchars) disallowed
148+
codepoint & 0xFFFF >= 0xFFFE ||
149+
# unicode control characters except space characters (tab, line feed, form feed)
150+
(codepoint < 0x0020 && !{0x0009, 0x000A, 0x000C}.includes?(codepoint))
151+
codepoint.unsafe_chr
152+
end
153+
end
154+
end
92155
end

0 commit comments

Comments
 (0)
Please sign in to comment.