@@ -36,21 +36,61 @@ module HTML
36
36
end
37
37
end
38
38
39
+ # Replacements for the numeric character references 0x80..0x9F, which old
40
+ # documents wrote assuming Windows-1252 encoding; the entry at index (codepoint - 0x80) is the replacement.
41
+ # http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
42
+ private CHARACTER_REPLACEMENTS = {
43
+ '\u20AC' , # First entry (index 0) is what 0x80 should be replaced with.
44
+ '\u0081' ,
45
+ '\u201A' ,
46
+ '\u0192' ,
47
+ '\u201E' ,
48
+ '\u2026' ,
49
+ '\u2020' ,
50
+ '\u2021' ,
51
+ '\u02C6' ,
52
+ '\u2030' ,
53
+ '\u0160' ,
54
+ '\u2039' ,
55
+ '\u0152' ,
56
+ '\u008D' ,
57
+ '\u017D' ,
58
+ '\u008F' ,
59
+ '\u0090' ,
60
+ '\u2018' ,
61
+ '\u2019' ,
62
+ '\u201C' ,
63
+ '\u201D' ,
64
+ '\u2022' ,
65
+ '\u2013' ,
66
+ '\u2014' ,
67
+ '\u02DC' ,
68
+ '\u2122' ,
69
+ '\u0161' ,
70
+ '\u203A' ,
71
+ '\u0153' ,
72
+ '\u009D' ,
73
+ '\u017E' ,
74
+ '\u0178' , # Last entry (index 31) is the replacement for 0x9F.
75
+ # 0x00->'\uFFFD' is handled programmatically (see decode_codepoint).
76
+ # 0x0D->'\u000D' is a no-op.
77
+ }
78
+
39
79
# Returns a string where named and numeric character references
40
80
# (e.g. &gt;, &#62;, &#x3e;) in *string* are replaced with the corresponding
41
- # unicode characters.
81
+ # unicode characters. This method decodes all HTML5 entities including those
82
+ # without a trailing semicolon (such as `&copy`).
42
83
#
43
84
# ```
44
85
# HTML.unescape("Crystal &amp; You") # => "Crystal & You"
45
86
# ```
46
87
def self.unescape (string : String ) : String
47
- string.gsub(/&(?:([a-zA-Z ] {2,32} ;?) |\# ([0-9] +) ;?|\# [xX] ([0-9A-Fa-f] +) ;?) / ) do |string , match |
88
+ string.gsub(/&(?:([a-zA-Z0-9 ] {2,32} ;?) |\# ([0-9] +) ;?|\# [xX] ([0-9A-Fa-f] +) ;?) / ) do |string , match |
48
89
if code = match[1 ]?
49
90
# Try to find the code
50
91
value = named_entity(code)
51
- if value
52
- value
53
- elsif ! code.ends_with?(';' )
92
+
93
+ unless value || code.ends_with?(';' )
54
94
# If we can't find it and it doesn't end with ';',
55
95
# we need to find each prefix of it.
56
96
# We start from the largest prefix.
@@ -67,19 +107,17 @@ module HTML
67
107
break
68
108
end
69
109
end
70
-
71
- # We either found the code or not,
72
- # in which case we need to return the original string
73
- value || string
74
110
end
111
+
112
+ # We either found the code or not,
113
+ # in which case we need to return the original string
114
+ value || string
75
115
elsif code = match[2 ]?
76
116
# Find by decimal code
77
- n = code.to_i
78
- n <= Char ::MAX_CODEPOINT ? n.unsafe_chr : string
117
+ decode_codepoint(code.to_i) || string
79
118
elsif code = match[3 ]?
80
119
# Find by hexadecimal code
81
- n = code.to_i(16 )
82
- n <= Char ::MAX_CODEPOINT ? n.unsafe_chr : string
120
+ decode_codepoint(code.to_i(16 )) || string
83
121
else
84
122
string
85
123
end
@@ -89,4 +127,29 @@ module HTML
89
127
# Looks up *code* in HTML::SINGLE_CHAR_ENTITIES, falling back to HTML::DOUBLE_CHAR_ENTITIES.
private def self.named_entity (code )
90
128
HTML ::SINGLE_CHAR_ENTITIES [code]? || HTML ::DOUBLE_CHAR_ENTITIES [code]?
91
129
end
130
+
131
+ # Maps the codepoint of a numeric character reference to its decoded character; yields nil for disallowed codepoints so callers can keep the original text. See https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
132
+ private def self.decode_codepoint (codepoint )
133
+ case codepoint
134
+ when 0x80 ..0x9F
135
+ # Replace Windows-1252 values with their Unicode equivalents (table is indexed from 0x80).
136
+ CHARACTER_REPLACEMENTS [codepoint - 0x80 ].to_s
137
+ when 0 ,
138
+ .> (Char ::MAX_CODEPOINT ),
139
+ 0xD800 ..0xDFFF # Unicode surrogate range
140
+ # Replace invalid codepoints (zero, beyond Char::MAX_CODEPOINT, surrogates) with U+FFFD, the replacement character.
141
+ '\uFFFD'
142
+ else
143
+ # Disallowed codepoints are not decoded: the `unless` below yields nil for them.
144
+ unless codepoint == 0x007F ||
145
+ # Unicode noncharacters U+FDD0..U+FDEF
146
+ (0xFDD0 ..0xFDEF ).includes?(codepoint) ||
147
+ # last two codepoints of each plane (xxFFFE, xxFFFF) are noncharacters
148
+ codepoint & 0xFFFF >= 0xFFFE ||
149
+ # control characters below 0x20, except tab (0x09), line feed (0x0A) and form feed (0x0C)
150
+ (codepoint < 0x0020 && ! {0x0009 , 0x000A , 0x000C }.includes?(codepoint))
151
+ codepoint.unsafe_chr
152
+ end
153
+ end
154
+ end
92
155
end
0 commit comments