Improve decoding of HTML entities (#5064)

straight-shoota · RX14 · commit 17ac8a28b599 · 2017-10-04T09:35:35.000+01:00
* Improve method doc

* fix invalid entity with trailing semicolon

* fix entities with numerical characters

* Improve decoding of numerical character references according to HTML5 spec

* Use case instead of if branches

* group all disallowed codepoints outside case statement

* fix typo

* Simplify branches

* replacement character as Char
diff --git a/spec/std/html_spec.cr b/spec/std/html_spec.cr
@@ -23,7 +23,7 @@ describe "HTML" do
       str.should eq("safe_string")
     end
 
-    it "unescapes dangerous characters from a string" do
+    it "unescapes html special characters" do
       str = HTML.unescape("&lt; &amp; &gt;")
 
       str.should eq("< & >")
@@ -42,9 +42,9 @@ describe "HTML" do
     end
 
     it "unescapes with invalid entities" do
-      str = HTML.unescape("&&lt;&amp&gt;&quot&abcdefghijklmn")
+      str = HTML.unescape("&&lt;&amp&gt;&quot&abcdefghijklmn &ThisIsNotAnEntitiy;")
 
-      str.should eq("&<&>\"&abcdefghijklmn")
+      str.should eq("&<&>\"&abcdefghijklmn &ThisIsNotAnEntitiy;")
     end
 
     it "unescapes hex encoded chars" do
@@ -53,18 +53,33 @@ describe "HTML" do
       str.should eq("3 + 2 = 5")
     end
 
+    it "unescapes decimal encoded chars" do
+      str = HTML.unescape("3 &#00043; 2 &#00061 5")
+
+      str.should eq("3 + 2 = 5")
+    end
+
     it "unescapes &nbsp;" do
       str = HTML.unescape("nbsp&nbsp;space ")
 
       str.should eq("nbsp\u{0000A0}space ")
     end
 
-    it "unescapes Char::MAX_CODEPOINT" do
+    it "does not unescape Char::MAX_CODEPOINT" do
+      # Char::MAX_CODEPOINT is actually a noncharacter and is not replaced
       str = HTML.unescape("limit &#x10FFFF;")
-      str.should eq("limit 􏿿")
+      str.should eq("limit &#x10FFFF;")
 
       str = HTML.unescape("limit &#1114111;")
-      str.should eq("limit 􏿿")
+      str.should eq("limit &#1114111;")
+    end
+
+    it "does not unescape characters above Char::MAX_CODEPOINT" do
+      str = HTML.unescape("limit &#x110000;")
+      str.should eq("limit \uFFFD")
+
+      str = HTML.unescape("limit &#1114112;")
+      str.should eq("limit \uFFFD")
     end
 
     it "unescapes &NotSquareSuperset;" do
@@ -73,9 +88,40 @@ describe "HTML" do
       str.should eq(" ⊐̸ ")
     end
 
-    it "unescapes &ampd" do
+    it "unescapes entities without trailing semicolon" do
       str = HTML.unescape("&amphello")
       str.should eq("&hello")
     end
+
+    it "unescapes named character reference with numerical characters" do
+      str = HTML.unescape("&frac34;")
+      str.should eq("\u00BE")
+    end
+
+    it "does not escape unicode control characters except space characters" do
+      string = "&#x0001;-&#x001F; &#x000D; &#x007F;"
+      HTML.unescape(string).should eq(string)
+
+      string = HTML.unescape("&#x0080;-&#x009F;")
+      string.should eq("\u20AC-\u0178")
+
+      HTML.unescape("&#x000;").should eq("\uFFFD")
+    end
+
+    it "escapes space characters" do
+      string = HTML.unescape("&#x0020;&#32;&#x0009;&#x000A;&#x000C;")
+      string.should eq("  \t\n\f")
+    end
+
+    it "does not escape noncharacter codepoints" do
+      # noncharacters (http://www.unicode.org/faq/private_use.html) must be left undecoded
+      string = "&#xFDD0;-&#xFDEF; &#xFFFE; &#xFFFF; &#x1FFFE; &#x1FFFF; &#x2FFFE; &#x10FFFF;"
+      HTML.unescape(string).should eq(string)
+    end
+
+    it "does not escape unicode surrogate characters" do
+      string = "&#xD800;-&#xDFFF;"
+      HTML.unescape(string).should eq("\uFFFD-\uFFFD")
+    end
   end
 end
diff --git a/src/html.cr b/src/html.cr
@@ -36,21 +36,61 @@ module HTML
     end
   end
 
+  # Replacements for numeric references 0x80..0x9F, kept for compatibility with
+  # old documents that assumed Windows-1252 encoding; index is codepoint - 0x80.
+  # https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+  private CHARACTER_REPLACEMENTS = {
+    '\u20AC', # First entry is what 0x80 should be replaced with.
+    '\u0081',
+    '\u201A',
+    '\u0192',
+    '\u201E',
+    '\u2026',
+    '\u2020',
+    '\u2021',
+    '\u02C6',
+    '\u2030',
+    '\u0160',
+    '\u2039',
+    '\u0152',
+    '\u008D',
+    '\u017D',
+    '\u008F',
+    '\u0090',
+    '\u2018',
+    '\u2019',
+    '\u201C',
+    '\u201D',
+    '\u2022',
+    '\u2013',
+    '\u2014',
+    '\u02DC',
+    '\u2122',
+    '\u0161',
+    '\u203A',
+    '\u0153',
+    '\u009D',
+    '\u017E',
+    '\u0178', # Last entry is 0x9F.
+    # 0x00->'\uFFFD' is not in this table; decode_codepoint handles it.
+    # 0x0D is not replaced: decode_codepoint leaves that reference undecoded.
+  }
+
   # Returns a string where named and numeric character references
   # (e.g. &gt;, &#62;, &x3e;) in *string* are replaced with the corresponding
-  # unicode characters.
+  # unicode characters. This method decodes all HTML5 entities including those
+  # without a trailing semicolon (such as `&copy`).
   #
   # ```
   # HTML.unescape("Crystal &amp; You") # => "Crystal & You"
   # ```
   def self.unescape(string : String) : String
-    string.gsub(/&(?:([a-zA-Z]{2,32};?)|\#([0-9]+);?|\#[xX]([0-9A-Fa-f]+);?)/) do |string, match|
+    string.gsub(/&(?:([a-zA-Z0-9]{2,32};?)|\#([0-9]+);?|\#[xX]([0-9A-Fa-f]+);?)/) do |string, match|
       if code = match[1]?
         # Try to find the code
         value = named_entity(code)
-        if value
-          value
-        elsif !code.ends_with?(';')
+
+        unless value || code.ends_with?(';')
           # If we can't find it and it doesn't end with ';',
           # we need to find each prefix of it.
           # We start from the largest prefix.
@@ -67,19 +107,17 @@ module HTML
               break
             end
           end
-
-          # We either found the code or not,
-          # in which case we need to return the original string
-          value || string
         end
+
+        # We either found the code or not,
+        # in which case we need to return the original string
+        value || string
       elsif code = match[2]?
         # Find by decimal code
-        n = code.to_i
-        n <= Char::MAX_CODEPOINT ? n.unsafe_chr : string
+        decode_codepoint(code.to_i) || string
       elsif code = match[3]?
         # Find by hexadecimal code
-        n = code.to_i(16)
-        n <= Char::MAX_CODEPOINT ? n.unsafe_chr : string
+        decode_codepoint(code.to_i(16)) || string
       else
         string
       end
@@ -89,4 +127,29 @@ module HTML
   private def self.named_entity(code)
     HTML::SINGLE_CHAR_ENTITIES[code]? || HTML::DOUBLE_CHAR_ENTITIES[code]?
   end
+
+  # Returns the replacement for *codepoint*, or nil when it must not be decoded. See https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+  private def self.decode_codepoint(codepoint)
+    case codepoint
+    when 0x80..0x9F
+      # Map legacy Windows-1252 codes to their Unicode equivalents.
+      CHARACTER_REPLACEMENTS[codepoint - 0x80].to_s
+    when 0,
+         .>(Char::MAX_CODEPOINT),
+         0xD800..0xDFFF # Unicode surrogate range
+      # Null, out-of-range and surrogate codepoints become U+FFFD (replacement character).
+      '\uFFFD'
+    else
+      # Yield nil (no replacement) for disallowed codepoints:
+      unless codepoint == 0x007F ||
+             # Unicode noncharacters U+FDD0..U+FDEF
+             (0xFDD0..0xFDEF).includes?(codepoint) ||
+             # last two codepoints of each plane are noncharacters
+             codepoint & 0xFFFF >= 0xFFFE ||
+             # Unicode control characters except tab, line feed and form feed
+             (codepoint < 0x0020 && !{0x0009, 0x000A, 0x000C}.includes?(codepoint))
+        codepoint.unsafe_chr
+      end
+    end
+  end
 end