Skip to content

Commit

Permalink
Make String#[] not read out-of-bounds if string ends in unicode char (#…
Browse files Browse the repository at this point in the history
…5257)

* Char::Reader: Fail if multi-byte UTF-8 sequence is cut off prematurely

Before this change, Char::Reader#decode_char_at() wouldn't detect that
an UTF-8 sequence was cut off prematurely.  Instead, it continued on
reading, leading to an out-of-bounds read.

This fix should be regarded as band-aid for Char::Reader.

* String#[]: Detect out-of-bounds when ending in a UTF-8 character

As Char::Reader signals the end of a string by returning a NUL-char,
we need to check in String#[] if the read could go out of bounds.  This
only triggers if the first byte of the character would be OOB already,
which is fine.  If a later byte in the character sequence is missing,
Char::Reader would notice it.
  • Loading branch information
Papierkorb authored and RX14 committed Nov 22, 2017
1 parent a2e8300 commit f6a1fa1
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 10 deletions.
12 changes: 12 additions & 0 deletions spec/std/char/reader_spec.cr
Expand Up @@ -165,4 +165,16 @@ describe "Char::Reader" do
it "errors if first_byte >= 0xF5" do
assert_invalid_byte_sequence Bytes[0xf5, 0x8F, 0xA0, 0xA0], 4
end

it "errors if second_byte is out of bounds" do
assert_invalid_byte_sequence Bytes[0xf4], 1
end

it "errors if third_byte is out of bounds" do
assert_invalid_byte_sequence Bytes[0xf4, 0x8f], 2
end

it "errors if fourth_byte is out of bounds" do
assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0], 3
end
end
10 changes: 10 additions & 0 deletions spec/std/string_spec.cr
Expand Up @@ -62,6 +62,12 @@ describe "String" do
"há日本語"[5, 10].should eq("")
end

it "raises IndexError if pointing after last char which is non-ASCII" do
expect_raises(IndexError) do
"ß"[1]
end
end

it "raises index out of bound on index out of range with range" do
expect_raises(IndexError) do
"foo"[4..1]
Expand Down Expand Up @@ -151,6 +157,10 @@ describe "String" do
"hello"[-1]?.should eq('o')
"hello"[-6]?.should be_nil
end

it "returns nil with []? if pointing after last char which is non-ASCII" do
"ß"[1]?.should be_nil
end
end

describe "byte_slice" do
Expand Down
25 changes: 16 additions & 9 deletions src/char/reader.cr
Expand Up @@ -49,7 +49,8 @@ struct Char

# If there was an error decoding the current char because
# of an invalid UTF-8 byte sequence, returns the byte
# that produced the invalid encoding. Otherwise returns `nil`.
# that produced the invalid encoding. Returns `0` if the char would've been
# out of bounds. Otherwise returns `nil`.
getter error : UInt8?

# Creates a reader with the specified *string* positioned at
Expand Down Expand Up @@ -184,7 +185,7 @@ struct Char
end

private def decode_char_at(pos)
first = byte_at(pos)
first = byte_at?(pos) || 0u32
if first < 0x80
return yield first, 1, nil
end
Expand All @@ -193,17 +194,17 @@ struct Char
invalid_byte_sequence 1
end

second = byte_at(pos + 1)
if (second & 0xc0) != 0x80
second = byte_at?(pos + 1)
if second.nil? || (second & 0xc0) != 0x80
invalid_byte_sequence 1
end

if first < 0xe0
return yield (first << 6) + (second - 0x3080), 2, nil
end

third = byte_at(pos + 2)
if (third & 0xc0) != 0x80
third = byte_at?(pos + 2)
if third.nil? || (third & 0xc0) != 0x80
invalid_byte_sequence 2
end

Expand All @@ -227,8 +228,10 @@ struct Char
invalid_byte_sequence 3
end

fourth = byte_at(pos + 3)
if (fourth & 0xc0) != 0x80
fourth = byte_at?(pos + 3)
if fourth.nil?
invalid_byte_sequence 3
elsif (fourth & 0xc0) != 0x80
invalid_byte_sequence 4
end

Expand Down Expand Up @@ -271,7 +274,11 @@ struct Char
end

private def byte_at(i)
@string.unsafe_byte_at(i).to_u32
@string.byte_at(i).to_u32
end

private def byte_at?(i)
@string.byte_at?(i).try(&.to_u32)
end
end
end
2 changes: 1 addition & 1 deletion src/string.cr
Expand Up @@ -837,7 +837,7 @@ class String
index += size if index < 0

byte_index = char_index_to_byte_index(index)
if byte_index
if byte_index && byte_index < @bytesize
reader = Char::Reader.new(self, pos: byte_index)
return reader.current_char
else
Expand Down

0 comments on commit f6a1fa1

Please sign in to comment.