Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Improve decoding of HTML entities (#5064)
* Improve method doc

* fix invalid entity with trailing semicolon

* fix entities with numerical characters

* Improve decoding of numerical character references according to HTML5 spec

* Use case instead of if branches

* group all disallowed codepoints outside case statement

* fix typo

* Simplify branches

* replacement character as Char
  • Loading branch information
straight-shoota authored and RX14 committed Oct 4, 2017
1 parent 25392e1 commit 17ac8a2
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 20 deletions.
60 changes: 53 additions & 7 deletions spec/std/html_spec.cr
Expand Up @@ -23,7 +23,7 @@ describe "HTML" do
str.should eq("safe_string")
end

it "unescapes dangerous characters from a string" do
it "unescapes html special characters" do
str = HTML.unescape("< & >")

str.should eq("< & >")
Expand All @@ -42,9 +42,9 @@ describe "HTML" do
end

it "unescapes with invalid entities" do
str = HTML.unescape("&&lt;&amp&gt;&quot&abcdefghijklmn")
str = HTML.unescape("&&lt;&amp&gt;&quot&abcdefghijklmn &ThisIsNotAnEntitiy;")

str.should eq("&<&>\"&abcdefghijklmn")
str.should eq("&<&>\"&abcdefghijklmn &ThisIsNotAnEntitiy;")
end

it "unescapes hex encoded chars" do
Expand All @@ -53,18 +53,33 @@ describe "HTML" do
str.should eq("3 + 2 = 5")
end

# Decimal references are decoded both with (`&#00043;`) and without
# (`&#00061`) a trailing semicolon; leading zeros are accepted.
it "unescapes decimal encoded chars" do
  HTML.unescape("3 &#00043; 2 &#00061 5").should eq("3 + 2 = 5")
end

# `&nbsp;` becomes U+00A0 while the ordinary trailing space is kept as is.
it "unescapes &nbsp;" do
  expected = "nbsp\u{0000A0}space "
  HTML.unescape("nbsp&nbsp;space ").should eq(expected)
end

it "unescapes Char::MAX_CODEPOINT" do
it "does not unescape Char::MAX_CODEPOINT" do
# Char::MAX_CODEPOINT is actually a noncharacter and is not replaced
str = HTML.unescape("limit &#x10FFFF;")
str.should eq("limit 􏿿")
str.should eq("limit &#x10FFFF;")

str = HTML.unescape("limit &#1114111;")
str.should eq("limit 􏿿")
str.should eq("limit &#1114111;")
end

# The same out-of-range codepoint written in hexadecimal and in decimal must
# both decode to the replacement character U+FFFD.
it "does not unescape characters above Char::MAX_CODEPOINT" do
  hex_result = HTML.unescape("limit &#x110000;")
  hex_result.should eq("limit \uFFFD")

  decimal_result = HTML.unescape("limit &#1114112;")
  decimal_result.should eq("limit \uFFFD")
end

it "unescapes &NotSquareSuperset;" do
Expand All @@ -73,9 +88,40 @@ describe "HTML" do
str.should eq(" ⊐̸ ")
end

it "unescapes &ampd" do
it "unescapes entities without trailing semicolon" do
str = HTML.unescape("&amphello")
str.should eq("&hello")
end

# Entity names may contain digits, e.g. `&frac34;`.
it "unescapes named character reference with numerical characters" do
  HTML.unescape("&frac34;").should eq("\u00BE")
end

it "does not escape unicode control characters except space characters" do
  # Control characters below 0x20 and 0x7F are left exactly as written.
  untouched = "&#x0001;-&#x001F; &#x000D; &#x007F;"
  HTML.unescape(untouched).should eq(untouched)

  # 0x80 and 0x9F are mapped to their replacement characters.
  remapped = HTML.unescape("&#x0080;-&#x009F;")
  remapped.should eq("\u20AC-\u0178")

  # Codepoint zero decodes to the replacement character.
  nul_result = HTML.unescape("&#x000;")
  nul_result.should eq("\uFFFD")
end

# Space (given once in hexadecimal and once in decimal), tab, line feed and
# form feed are all decoded, so the result starts with TWO spaces.
it "escapes space characters" do
  string = HTML.unescape("&#x0020;&#32;&#x0009;&#x000A;&#x000C;")
  string.should eq("  \t\n\f")
end

it "does not escape noncharacter codepoints" do
  # noncharacters http://www.unicode.org/faq/private_use.html
  # Every reference uses the hexadecimal `&#x...;` form so that each one is
  # really parsed as a numeric character reference; without the `x`,
  # `&#FFFF;` is not a numeric reference at all and would pass trivially.
  string = "&#xFDD0;-&#xFDEF; &#xFFFE; &#xFFFF; &#x1FFFE; &#x1FFFF; &#x2FFFE; &#x10FFFF;"
  HTML.unescape(string).should eq(string)
end

# Both ends of the surrogate range decode to the replacement character.
it "does not escape unicode surrogate characters" do
  unescaped = HTML.unescape("&#xD800;-&#xDFFF;")
  unescaped.should eq("\uFFFD-\uFFFD")
end
end
end
89 changes: 76 additions & 13 deletions src/html.cr
Expand Up @@ -36,21 +36,61 @@ module HTML
end
end

# Replacement characters for the numeric character references 0x80 through
# 0x9F. The table is positional: `decode_codepoint` reads the entry at index
# `codepoint - 0x80`, so the order of the entries must not change.
#
# These replacements permit compatibility with old numeric entities that
# assumed Windows-1252 encoding.
# http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
private CHARACTER_REPLACEMENTS = {
'\u20AC', # First entry is what 0x80 should be replaced with.
'\u0081', # 0x81 maps to itself.
'\u201A',
'\u0192',
'\u201E',
'\u2026',
'\u2020',
'\u2021',
'\u02C6',
'\u2030',
'\u0160',
'\u2039',
'\u0152',
'\u008D', # 0x8D maps to itself.
'\u017D',
'\u008F', # 0x8F maps to itself.
'\u0090', # 0x90 maps to itself.
'\u2018',
'\u2019',
'\u201C',
'\u201D',
'\u2022',
'\u2013',
'\u2014',
'\u02DC',
'\u2122',
'\u0161',
'\u203A',
'\u0153',
'\u009D', # 0x9D maps to itself.
'\u017E',
'\u0178', # Last entry is 0x9F.
# 0x00->'\uFFFD' is handled programmatically (see `decode_codepoint`).
# 0x0D lies outside the 0x80..0x9F range and is not remapped by this table.
}

# Returns a string where named and numeric character references
# (e.g. &gt;, &#62;, &#x3e;) in *string* are replaced with the corresponding
# unicode characters.
# unicode characters. This method decodes all HTML5 entities including those
# without a trailing semicolon (such as `&copy`).
#
# ```
# HTML.unescape("Crystal &amp; You") # => "Crystal & You"
# ```
def self.unescape(string : String) : String
string.gsub(/&(?:([a-zA-Z]{2,32};?)|\#([0-9]+);?|\#[xX]([0-9A-Fa-f]+);?)/) do |string, match|
string.gsub(/&(?:([a-zA-Z0-9]{2,32};?)|\#([0-9]+);?|\#[xX]([0-9A-Fa-f]+);?)/) do |string, match|
if code = match[1]?
# Try to find the code
value = named_entity(code)
if value
value
elsif !code.ends_with?(';')

unless value || code.ends_with?(';')
# If we can't find it and it doesn't end with ';',
# we need to find each prefix of it.
# We start from the largest prefix.
Expand All @@ -67,19 +107,17 @@ module HTML
break
end
end

# We either found the code or not,
# in which case we need to return the original string
value || string
end

# We either found the code or not,
# in which case we need to return the original string
value || string
elsif code = match[2]?
# Find by decimal code
n = code.to_i
n <= Char::MAX_CODEPOINT ? n.unsafe_chr : string
decode_codepoint(code.to_i) || string
elsif code = match[3]?
# Find by hexadecimal code
n = code.to_i(16)
n <= Char::MAX_CODEPOINT ? n.unsafe_chr : string
decode_codepoint(code.to_i(16)) || string
else
string
end
Expand All @@ -89,4 +127,29 @@ module HTML
# Looks *code* up in `HTML::SINGLE_CHAR_ENTITIES` first and only consults
# `HTML::DOUBLE_CHAR_ENTITIES` when the first lookup yields nothing.
# Returns `nil` when neither lookup finds an entry.
private def self.named_entity(code)
  if single = HTML::SINGLE_CHAR_ENTITIES[code]?
    single
  else
    HTML::DOUBLE_CHAR_ENTITIES[code]?
  end
end

# Resolves the *codepoint* of a numeric character reference to the value that
# replaces the reference, or returns `nil` when the codepoint is disallowed
# and nothing should be substituted.
#
# * `0x80..0x9F` is looked up in `CHARACTER_REPLACEMENTS` and returned as a
#   `String`.
# * `0`, anything greater than `Char::MAX_CODEPOINT` and the surrogate range
#   `0xD800..0xDFFF` yield the replacement character U+FFFD.
# * `0x7F`, the noncharacters and the control characters below `0x20` other
#   than tab, line feed and form feed yield `nil`.
# * Every other codepoint yields the corresponding `Char`.
#
# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
private def self.decode_codepoint(codepoint)
  # Replace characters from Windows-1252 with UTF-8 equivalents.
  if (0x80..0x9F).includes?(codepoint)
    return CHARACTER_REPLACEMENTS[codepoint - 0x80].to_s
  end

  # Invalid codepoints (zero, out of range, surrogates) become U+FFFD.
  out_of_range = codepoint > Char::MAX_CODEPOINT
  surrogate = (0xD800..0xDFFF).includes?(codepoint)
  return '\uFFFD' if codepoint == 0 || out_of_range || surrogate

  # Disallowed codepoints are not replaced at all.
  return nil if codepoint == 0x007F
  # unicode noncharacters
  return nil if (0xFDD0..0xFDEF).includes?(codepoint)
  # the last two codepoints of each plane are noncharacters as well
  return nil if codepoint & 0xFFFF >= 0xFFFE
  # unicode control characters, except tab, line feed and form feed
  whitespace_control = {0x0009, 0x000A, 0x000C}.includes?(codepoint)
  return nil if codepoint < 0x0020 && !whitespace_control

  codepoint.unsafe_chr
end
end

0 comments on commit 17ac8a2

Please sign in to comment.