Skip to content

Commit

Permalink
Fix String#scan behavior same as Ruby (#3877)
Browse files Browse the repository at this point in the history
* Fix String#scan behavior same as Ruby

For example, Ruby's String#scan returns:

    "hello world".scan(/\w+|(?= )/) # => ["hello", "", "world"]

But Crystal's currently returns:

    "hello world".scan(/\w+|(?= )/).map &.[0] # => ["hello", ""]

This commit fixes it by continuing the scan past an empty match instead of stopping there.

* Fix String#split behavior same as Ruby
  • Loading branch information
makenowjust authored and bcardiff committed Jan 16, 2017
1 parent ff314f9 commit ce1b6d6
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 37 deletions.
3 changes: 3 additions & 0 deletions spec/std/string_spec.cr
Expand Up @@ -810,7 +810,9 @@ describe "String" do
assert { "foo,bar,baz,qux".split(/,/, 30).should eq(["foo", "bar", "baz", "qux"]) }
assert { "a b c".split(Regex.new(" "), 2).should eq(["a", "b c"]) }
assert { "日本ん語日本ん語".split(/ん/).should eq(["日本", "語日本", ""]) }
assert { "九十九十九".split(/(?=十)/).should eq(["", "十九", "十九"]) }
assert { "hello world".split(/\b/).should eq(["hello", " ", "world", ""]) }
assert { "hello world".split(/\w+|(?= )/).should eq(["", " ", ""]) }
assert { "abc".split(//).should eq(["a", "b", "c"]) }
assert { "hello".split(/\w+/).should eq(["", ""]) }
assert { "foo".split(/o/).should eq(["f", "", ""]) }
Expand Down Expand Up @@ -1674,6 +1676,7 @@ describe "String" do
# Regression spec: String#scan must keep scanning after a zero-length match.
it "works when match is empty" do
r = %r([\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*))
# The pattern can match the empty string, so an empty match follows "hello".
"hello".scan(r).map(&.[0]).should eq(["hello", ""])
# The lookahead `(?= )` matches emptily before the space; the scan must
# carry on afterwards and still find "world".
"hello world".scan(/\w+|(?= )/).map(&.[0]).should eq(["hello", "", "world"])
end

it "works with strings with block" do
Expand Down
66 changes: 29 additions & 37 deletions src/string.cr
Expand Up @@ -2852,41 +2852,32 @@ class String
end

count = 0
match_offset = 0
slice_offset = 0
last_slice_offset = 0
match_offset = slice_offset = 0

while match = separator.match_at_byte_index(self, match_offset)
index = match.byte_begin(0)
slice_size = index - slice_offset
match_bytesize = match[0].bytesize
next_offset = index + match_bytesize

if slice_offset == 0 && slice_size == 0 && match_bytesize == 0
# Skip
elsif slice_offset == bytesize && slice_size == 0
yield byte_slice(last_slice_offset)
if next_offset == slice_offset
match_offset = next_offset + char_bytesize_at(next_offset)
else
slice_size = index - slice_offset

yield byte_slice(slice_offset, slice_size)
end
count += 1
count += 1

1.upto(match.size) do |i|
if group = match[i]?
yield group
1.upto(match.size) do |i|
if group = match[i]?
yield group
end
end
end

last_slice_offset = slice_offset

if match_bytesize == 0
match_offset = index + 1
slice_offset = index
else
match_offset = index + match_bytesize
slice_offset = match_offset
slice_offset = match_offset = next_offset
end

break if limit && count + 1 == limit
break if slice_offset > bytesize
break if match_offset >= bytesize
end

yield byte_slice(slice_offset)
Expand Down Expand Up @@ -3208,7 +3199,7 @@ class String
$~ = match
yield match
match_bytesize = match[0].bytesize
break if match_bytesize == 0
match_bytesize += 1 if match_bytesize == 0
byte_offset = index + match_bytesize
end

Expand Down Expand Up @@ -3574,6 +3565,19 @@ class String
@bytesize == size
end

# Returns how many bytes the character starting at *byte_index* occupies,
# judged solely from the byte found at that index (1, 2, 3 or 4).
#
# The thresholds follow the UTF-8 leading-byte ranges. No validation is
# done: any byte in 0x80...0xe0 (continuation bytes included) yields 2.
protected def char_bytesize_at(byte_index)
  leading_byte = unsafe_byte_at(byte_index)

  if leading_byte < 0x80
    # Single-byte (ASCII) range.
    1
  elsif leading_byte < 0xe0
    2
  elsif leading_byte < 0xf0
    3
  else
    4
  end
end

# Tells whether the stored `@length` can be used as-is: true when the string
# has no bytes at all, or when `@length` is already positive.
# NOTE(review): presumably `@length` is a lazily computed character count
# where 0 means "not computed yet" — confirm against the rest of the class.
protected def size_known?
  return true if @bytesize == 0

  @length > 0
end
Expand All @@ -3584,19 +3588,7 @@ class String

while byte_index < bytesize
yield byte_index, char_index

c = to_unsafe[byte_index]

if c < 0x80
byte_index += 1
elsif c < 0xe0
byte_index += 2
elsif c < 0xf0
byte_index += 3
else
byte_index += 4
end

byte_index += char_bytesize_at(byte_index)
char_index += 1
end

Expand Down

0 comments on commit ce1b6d6

Please sign in to comment.