Make String#[] not read out-of-bounds if string ends in unicode char (#…

…5257) * Char::Reader: Fail if multi-byte UTF-8 sequence is cut off prematurely Before this change, Char::Reader#decode_char_at() wouldn't detect that an UTF-8 sequence was cut off prematurely. Instead, it continued on reading, leading to an out-of-bounds read. This fix should be regarded as band-aid for Char::Reader. * String#[]: Detect out-of-bounds when ending in a UTF-8 character As Char::Reader signals the end of a string by returning a NUL-char, we need to check in String#[] if the read could go out of bounds. This only triggers if the first byte of the character would be OOB already, which is fine. If a later byte in the character sequence is missing, Char::Reader would notice it.
crystal-lang · Nov 22, 2017 · f6a1fa1 · f6a1fa1
1 parent a2e8300
commit f6a1fa1
Showing 4 changed files with 39 additions and 10 deletions.
diff --git a/spec/std/char/reader_spec.cr b/spec/std/char/reader_spec.cr
@@ -165,4 +165,16 @@ describe "Char::Reader" do
   it "errors if first_byte >= 0xF5" do
     assert_invalid_byte_sequence Bytes[0xf5, 0x8F, 0xA0, 0xA0], 4
   end
+
+  it "errors if second_byte is out of bounds" do
+    assert_invalid_byte_sequence Bytes[0xf4], 1
+  end
+
+  it "errors if third_byte is out of bounds" do
+    assert_invalid_byte_sequence Bytes[0xf4, 0x8f], 2
+  end
+
+  it "errors if fourth_byte is out of bounds" do
+    assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0], 3
+  end
 end
diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr
@@ -62,6 +62,12 @@ describe "String" do
       "há日本語"[5, 10].should eq("")
     end
 
+    it "raises IndexError if pointing after last char which is non-ASCII" do
+      expect_raises(IndexError) do
+        "ß"[1]
+      end
+    end
+
     it "raises index out of bound on index out of range with range" do
       expect_raises(IndexError) do
         "foo"[4..1]
@@ -151,6 +157,10 @@ describe "String" do
       "hello"[-1]?.should eq('o')
       "hello"[-6]?.should be_nil
     end
+
+    it "returns nil with []? if pointing after last char which is non-ASCII" do
+      "ß"[1]?.should be_nil
+    end
   end
 
   describe "byte_slice" do

diff --git a/src/char/reader.cr b/src/char/reader.cr
@@ -49,7 +49,8 @@ struct Char
 
     # If there was an error decoding the current char because
     # of an invalid UTF-8 byte sequence, returns the byte
-    # that produced the invalid encoding. Otherwise returns `nil`.
+    # that produced the invalid encoding. Returns `0` if the char would've been
+    # out of bounds. Otherwise returns `nil`.
     getter error : UInt8?
 
     # Creates a reader with the specified *string* positioned at
@@ -184,7 +185,7 @@ struct Char
     end
 
     private def decode_char_at(pos)
-      first = byte_at(pos)
+      first = byte_at?(pos) || 0u32
       if first < 0x80
         return yield first, 1, nil
       end
@@ -193,17 +194,17 @@ struct Char
         invalid_byte_sequence 1
       end
 
-      second = byte_at(pos + 1)
-      if (second & 0xc0) != 0x80
+      second = byte_at?(pos + 1)
+      if second.nil? || (second & 0xc0) != 0x80
         invalid_byte_sequence 1
       end
 
       if first < 0xe0
         return yield (first << 6) + (second - 0x3080), 2, nil
       end
 
-      third = byte_at(pos + 2)
-      if (third & 0xc0) != 0x80
+      third = byte_at?(pos + 2)
+      if third.nil? || (third & 0xc0) != 0x80
         invalid_byte_sequence 2
       end
 
@@ -227,8 +228,10 @@ struct Char
         invalid_byte_sequence 3
       end
 
-      fourth = byte_at(pos + 3)
-      if (fourth & 0xc0) != 0x80
+      fourth = byte_at?(pos + 3)
+      if fourth.nil?
+        invalid_byte_sequence 3
+      elsif (fourth & 0xc0) != 0x80
         invalid_byte_sequence 4
       end
 
@@ -271,7 +274,11 @@ struct Char
     end
 
     private def byte_at(i)
-      @string.unsafe_byte_at(i).to_u32
+      @string.byte_at(i).to_u32
+    end
+
+    private def byte_at?(i)
+      @string.byte_at?(i).try(&.to_u32)
     end
   end
 end
diff --git a/src/string.cr b/src/string.cr
@@ -837,7 +837,7 @@ class String
     index += size if index < 0
 
     byte_index = char_index_to_byte_index(index)
-    if byte_index
+    if byte_index && byte_index < @bytesize
       reader = Char::Reader.new(self, pos: byte_index)
       return reader.current_char
     else