Skip to content

Commit

Permalink
Fix #3285 for a second time
Browse files Browse the repository at this point in the history
Ary Borenszweig committed Sep 14, 2016

Verified

This commit was signed with the committer’s verified signature.
wyattjoh Wyatt Johnson
1 parent c12acf4 commit 6bd28f4
Showing 3 changed files with 156 additions and 37 deletions.
96 changes: 96 additions & 0 deletions spec/std/data/io_data_incomplete_multibyte_sequence.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
77u/PGJvZHk+Cgo8ZGl2IGlkPSJ3cmFwcGVyIj4KCgk8ZGl2IGlkPSJiYW5u
ZXIiPiA8L2Rpdj4KCgk8ZGl2IGlkPSJtZW51Ij4KCTx1bD4KCQk8bGk+PGEg
aHJlZj0iaHR0cDovL21lbnNoYWtvdi5ydS8iPtCSINC90LDRh9Cw0LvQvjwv
YT48L2xpPgoJCQk8L3VsPgoJPGJyIGNsYXNzPSJjbGVhciIgLz4KCTwvZGl2
PgoJPGRpdiBpZD0iY29udGVudCI+CgkKCQkKCQk8ZGl2IGNsYXNzPSJlbnRy
eSI+CgkJCTxoMT48YSBocmVmPSJodHRwOi8vbWVuc2hha292LnJ1LzIwMDkv
MDEvMjMvJWQwJWJjJWQxJTgzJWQwJWI0JWQxJTgwJWQwJWI1JWQxJTg2LSVk
MCViOC0lZDAlYmMlZDAlYmUlZDAlYmQlZDAlYjUlZDElODIlZDAlYjAvIiBy
ZWw9ImJvb2ttYXJrIiB0aXRsZT0i0JzRg9C00YDQtdGGINC4INC80L7QvdC1
0YLQsCI+0JzRg9C00YDQtdGGINC4INC80L7QvdC10YLQsDwvYT48L2gxPgoJ
CQk8ZGl2IGNsYXNzPSJkYXRlIj5KYW51YXJ5IDIzcmQsIDIwMDk8L2Rpdj4K
CQkJPHA+PCEtLVtlbmRpZl0tLT48L3A+CjxwIGNsYXNzPSJNc29Ob3JtYWwi
PtCe0LTQvdCw0LbQtNGLINCn0LXQu9C+0LLQtdC6INC/0YDQuNGI0LXQuyDQ
uiDRgdGC0LDRgNC+0LzRgyDQvNGD0LTRgNC10YbRgyDQt9CwINGB0L7QstC1
0YLQvtC8OjwvcD4KPHAgY2xhc3M9Ik1zb05vcm1hbCI+4oCTINCn0YLQviDQ
vNC90LUg0L3Rg9C20L3QviDRgdC00LXQu9Cw0YLRjCwg0YfRgtC+0LHRiyDR
gdGC0LDRgtGMINGF0L7Qt9GP0LjQvdC+0Lwg0YHQstC+0LXQuSDQttC40LfQ
vdC4LCDRh9GC0L7QsdGLINGPINC80L7QsyDRgNC10LDQu9C40LfQvtCy0LDR
gtGMINGB0LLQvtC4INC20LXQu9Cw0L3QuNGPPyDQryDRgtGA0LDRh9GDINC+
0YfQtdC90Ywg0LzQvdC+0LPQviDQstGA0LXQvNC10L3QuCwg0YfRgtC+0LHR
iyDQtNC+0YHRgtC40LPQvdGD0YLRjCDRgtC+0LPQviwg0YfQtdCz0L4g0Y8g
0YXQvtGH0YMsINC90L4g0LzQtdC90Y8g0LLRgdC10LPQtNCwINC/0L7RgdGC
0LjQs9Cw0Y7RgiDQvdC10YPQtNCw0YfQuCDQuCDQstGA0LXQvNGPINGD0YXQ
vtC00LjRgiDQvdCwINC/0L7RgdGC0L7RgNC+0L3QvdC40LUsINC90L4g0YLQ
vtC20LUg0L3QtdC+0LHRhdC+0LTQuNC80YvQtSDQvNC90LUsINCy0LXRidC4
LiA8YSBocmVmPSJodHRwOi8vbWVuc2hha292LnJ1LzIwMDkvMDEvMjMvJWQw
JWJjJWQxJTgzJWQwJWI0JWQxJTgwJWQwJWI1JWQxJTg2LSVkMCViOC0lZDAl
YmMlZDAlYmUlZDAlYmQlZDAlYjUlZDElODIlZDAlYjAvI21vcmUtMzQiIGNs
YXNzPSJtb3JlLWxpbmsiPtCU0LDQu9C10LU8L2E+PC9wPgoJCQk8ZGl2IGNs
YXNzPSJtZXRhIj4KCQkJCQkJCQkg0JTQvtCx0LDQstC70LXQvdC+INCyIDxh
IGhyZWY9Imh0dHA6Ly9tZW5zaGFrb3YucnUvY2F0ZWdvcnkvcGFyYWJsZXMv
IiB0aXRsZT0iVmlldyBhbGwgcG9zdHMgaW4g0J/RgNC40YLRh9C4IiByZWw9
ImNhdGVnb3J5IHRhZyI+0J/RgNC40YLRh9C4PC9hPiB8IDxhIGhyZWY9Imh0
dHA6Ly9tZW5zaGFrb3YucnUvMjAwOS8wMS8yMy8lZDAlYmMlZDElODMlZDAl
YjQlZDElODAlZDAlYjUlZDElODYtJWQwJWI4LSVkMCViYyVkMCViZSVkMCVi
ZCVkMCViNSVkMSU4MiVkMCViMC8jY29tbWVudHMiIHRpdGxlPSJDb21tZW50
IG9uINCc0YPQtNGA0LXRhiDQuCDQvNC+0L3QtdGC0LAiPjIg0LrQvtC80LzQ
tdC90YLQsNGA0LjQtdCyICYjMTg3OzwvYT4JCQk8L2Rpdj4KCQk8L2Rpdj4K
CgkJCgkJCgkJPGRpdiBjbGFzcz0iZW50cnkiPgoJCQk8aDE+PGEgaHJlZj0i
aHR0cDovL21lbnNoYWtvdi5ydS8yMDA4LzEyLzEzL2ljZS8iIHJlbD0iYm9v
a21hcmsiIHRpdGxlPSLQktC+0LTQsCDQuCDQu9C10LQiPtCS0L7QtNCwINC4
INC70LXQtDwvYT48L2gxPgoJCQk8ZGl2IGNsYXNzPSJkYXRlIj5EZWNlbWJl
ciAxM3RoLCAyMDA4PC9kaXY+CgkJCTxwPtCe0LTQvdCw0LbQtNGLINC30LjQ
vNC+0Lkg0KPRh9C40YLQtdC70Ywg0Lgg0YPRh9C10L3QuNC6INC/0YDQvtCz
0YPQu9C40LLQsNC70LjRgdGMINC/0L4g0LHQtdGA0LXQs9GDINGA0LXQutC4
LjxiciAvPgrigJQg0KPRh9C40YLQtdC70YwhINCb0Y7QtNC4INC90LUg0L/Q
vtC90LjQvNCw0Y7RgiDQtNGA0YPQsyDQtNGA0YPQs9CwLiDQm9GO0LTQuCDR
gdGC0LDRgNCw0Y7RgtGB0Y8g0L7QsdGJ0LDRgtGM0YHRjywg0YfQuNGC0LDR
jtGCINC60L3QuNCz0LgsINC60L7RgtC+0YDRi9C1INC/0L7QstC10YHRgtCy
0YPRjtGCINC+INCy0LfQsNC40LzQvtC/0L7QvdC40LzQsNC90LjQuCDigJQg
0Lgg0L3QsNGC0YvQutCw0Y7RgtGB0Y8g0L3QsCDQvdC10LLQuNC00LjQvNGD
0Y4g0YHRgtC10L3Rgy4g0J/QvtGH0LXQvNGDINGC0LDQuj8g0J3QtdGD0LbQ
tdC70Lgg0L3QtdC70YzQt9GPINGN0YLQvtC80YMg0L3QsNGD0YfQuNGC0Yw/
PGJyIC8+CuKAlCDQn9C+0LnQtNC10Lwg0YHQviDQvNC90L7QuSDigJQg0KPR
h9C40YLQtdC70Ywg0L/QvtGI0LXQuyDQv9C+INC70YzQtNGDINC90LAg0YHQ
tdGA0LXQtNC40L3RgyDRgNC10LrQuC4g4oCUINCf0L7RgdC80L7RgtGA0Lgg
0LLQvdC40LcuINCi0Ysg0LLQuNC00LjRiNGMINC30LTQtdGB0Ywg0YfRgtC+
LdGC0L4/PGJyIC8+CuKAlCDQndC10YIuINCa0LDQuiDRjyDQvNC+0LPRgyDR
g9Cy0LjQtNC10YLRjCDRh9GC0L4t0YLQviDRgdC60LLQvtC30Ywg0LvQtdC0
PzxiciAvPgrigJQg0KLQsNC8LCDQv9C+0LTQviDQu9GM0LTQvtC8IOKAlCDR
htC10LvRi9C5INC80LjRgCwg0L3QtdCy0LXQtNC+0LzRi9C5INGC0LXQsdC1
LiDQoNCw0YHRgtC+0L/QuCDQu9C10LQg4oCUINC4INC+0L0g0YHRgtCw0L3Q
tdGCINCy0L7QtNC+0LksINC00LDRjtGJ0LXQuSDQttC40LfQvdGMINGG0LXQ
u9C+0LzRgyDQvNC40YDRgy4g0J3QviDQv9GA0L7Qu9C10Lkg0YHRjtC00LAg
0LPQvtGC0L7QstGD0Y4g0LLQvtC00YMg4oCUINC+0L3QsCDQt9Cw0LzQtdGA
0LfQvdC10YIg0Lgg0LvQuNGI0Ywg0YPQutGA0LXQv9C40YIg0YbQsNGA0YHR
gtCy0L4g0LvRjNC00LAuPC9wPgoJCQk8ZGl2IGNsYXNzPSJtZXRhIj4KCQkJ
CQkJCQkg0JTQvtCx0LDQstC70LXQvdC+INCyIDxhIGhyZWY9Imh0dHA6Ly9t
ZW5zaGFrb3YucnUvY2F0ZWdvcnkvcGFyYWJsZXMvIiB0aXRsZT0iVmlldyBh
bGwgcG9zdHMgaW4g0J/RgNC40YLRh9C4IiByZWw9ImNhdGVnb3J5IHRhZyI+
0J/RgNC40YLRh9C4PC9hPiB8IDxhIGhyZWY9Imh0dHA6Ly9tZW5zaGFrb3Yu
cnUvMjAwOC8xMi8xMy9pY2UvI2NvbW1lbnRzIiB0aXRsZT0iQ29tbWVudCBv
biDQktC+0LTQsCDQuCDQu9C10LQiPjEg0LrQvtC80LzQtdC90YLQsNGA0LjQ
uSAmIzE4Nzs8L2E+CQkJPC9kaXY+CgkJPC9kaXY+CgoJCQoJCQoJCTxkaXYg
Y2xhc3M9ImVudHJ5Ij4KCQkJPGgxPjxhIGhyZWY9Imh0dHA6Ly9tZW5zaGFr
b3YucnUvMjAwOC8xMS8wNi9mcmFnaWxlLyIgcmVsPSJib29rbWFyayIgdGl0
bGU9ItCl0YDRg9C/0LrQuNC1INCy0LXRidC4Ij7QpdGA0YPQv9C60LjQtSDQ
stC10YnQuDwvYT48L2gxPgoJCQk8ZGl2IGNsYXNzPSJkYXRlIj5Ob3ZlbWJl
ciA2dGgsIDIwMDg8L2Rpdj4KCQkJPHA+0JrQvtCz0LTQsC3RgtC+INCyINC+
0LTQvdC+0Lwg0YHQtdC70LXQvdC40Lgg0LbQuNC7INGB0YLQsNGA0YvQuSDQ
vNGD0LTRgNGL0Lkg0YfQtdC70L7QstC10LouINCe0L0g0LvRjtCx0LjQuyDQ
tNC10YLQtdC5INC4INC/0YDQvtCy0L7QtNC40Lsg0YEg0L3QuNC80Lgg0LzQ
vdC+0LPQviDQstGA0LXQvNC10L3QuC4g0JXRidGRINC+0L0g0LvRjtCx0LjQ
uyDQtNC10LvQsNGC0Ywg0LjQvCDQv9C+0LTQsNGA0LrQuCwg0L/RgNCw0LLQ
tNCwINC00LDRgNC40Lsg0L/RgNC10LrRgNCw0YHQvdGL0LUs0L3QviDQvtGH
0LXQvdGMINGF0YDRg9C/0LrQuNC1INCy0LXRidC4LiDQmtCw0Log0L3QuCDR
gdGC0LDRgNCw0LvQuNGB0Ywg0LTQtdGC0Lgg0LHRi9GC0Ywg0LDQutC60YPR
gNCw0YLQvdGL0LzQuCwg0LjRhSDQvdC+0LLRi9C1INC40LPRgNGD0YjQutC4
INGH0LDRgdGC0L4g0LvQvtC80LDQu9C40YHRjC4g0JTQtdGC0Lgg0YDQsNGB
0YHRgtGA0LDQuNCy0LDQu9C40YHRjCDQuCDQs9C+0YDRjNC60L4g0L/Qu9Cw
0LrQsNC70LguINCQINC80YPQtNGA0LXRhiDRgdC90L7QstCwINC00LDRgNC4
0Lsg0LjQvCDQuNCz0YDRg9GI0LrQuCwg0L3QviDQtdGJ0ZEg0LHQvtC70LXQ
tSDRhdGA0YPQv9C60LjQtSDQuCDQvdC10LbQvdGL0LUmIzgyMzA7PGJyIC8+
CtCe0LTQvdCw0LbQtNGLINGA0L7QtNC40YLQtdC70Lgg0L3QtSDQstGL0LTQ
tdGA0LbQsNC70Lgg0Lgg0L/RgNC40YjQu9C4INC6INC90LXQvNGDOjxiciAv
Pgo=
7 changes: 7 additions & 0 deletions spec/std/io/io_spec.cr
Original file line number Diff line number Diff line change
@@ -575,6 +575,13 @@ describe IO do
m.set_encoding("UTF-8", invalid: :skip)
m.gets_to_end.should eq(" 0.0126 \n")
end

# Regression spec for #3285: decodes a base64-encoded fixture through an IO
# configured as UTF-8 with `invalid: :skip`, and checks the size of the result.
it "decodes incomplete multibyte sequence with skip (2) (#3285)" do
# The fixture is stored as base64 text, so it is decoded before being wrapped in a MemoryIO
str = File.read("#{__DIR__}/../data/io_data_incomplete_multibyte_sequence.txt")
m = MemoryIO.new(Base64.decode_string str)
m.set_encoding("UTF-8", invalid: :skip)
# Only the byte size of the decoded output is asserted, not its content
m.gets_to_end.bytesize.should eq(4277)
end
end

describe "encode" do
90 changes: 53 additions & 37 deletions src/io/encoding.cr
Original file line number Diff line number Diff line change
@@ -64,56 +64,72 @@ module IO
@in_buffer_left = LibC::SizeT.new(0)
@out_buffer = Slice(UInt8).new((GC.malloc_atomic(OUT_BUFFER_SIZE).as(UInt8*)), OUT_BUFFER_SIZE)
@out_slice = Slice(UInt8).new(Pointer(UInt8).null, 0)
@last_errno = 0
@closed = false
end

def read(io)
return unless @out_slice.empty?
loop do
return unless @out_slice.empty?

if @in_buffer_left == 0
@in_buffer = @buffer.to_unsafe
@in_buffer_left = LibC::SizeT.new(io.read(@buffer))
elsif @last_errno == Errno::EINVAL
# EINVAL means "An incomplete multibyte sequence has been encountered in the input."

# If we have just a few bytes remaining to fill, move the ones we have to the beginning
# and read into the rest, so we avoid just decoding a small amount of bytes
buffer_remaining = BUFFER_SIZE - @in_buffer_left - (@in_buffer - @buffer.to_unsafe)
if buffer_remaining < 64
if @in_buffer_left == 0
@in_buffer = @buffer.to_unsafe
@in_buffer_left = LibC::SizeT.new(io.read(@buffer))
end

# If we just have a few bytes to decode, read more, just in case these don't produce a character
if @in_buffer_left < 16
buffer_remaining = BUFFER_SIZE - @in_buffer_left - (@in_buffer - @buffer.to_unsafe)
@buffer.copy_from(@in_buffer, @in_buffer_left)
@in_buffer = @buffer.to_unsafe
buffer_remaining = BUFFER_SIZE - @in_buffer_left
@in_buffer_left += LibC::SizeT.new(io.read(Slice.new(@in_buffer + @in_buffer_left, buffer_remaining)))
end
@in_buffer_left += LibC::SizeT.new(io.read(Slice.new(@in_buffer + @in_buffer_left, buffer_remaining)))
end

# If we just have a few bytes to decode, read more, just in case these don't produce a character
if @in_buffer_left < 16
buffer_remaining = BUFFER_SIZE - @in_buffer_left - (@in_buffer - @buffer.to_unsafe)
@buffer.copy_from(@in_buffer, @in_buffer_left)
@in_buffer = @buffer.to_unsafe
@in_buffer_left += LibC::SizeT.new(io.read(Slice.new(@in_buffer + @in_buffer_left, buffer_remaining)))
end
# If, after refilling the buffer, we couldn't read new bytes
# it means we reached the end
break if @in_buffer_left == 0

# Convert bytes using iconv
out_buffer = @out_buffer.to_unsafe
out_buffer_left = LibC::SizeT.new(OUT_BUFFER_SIZE)
old_in_buffer_left = @in_buffer_left
result = @iconv.convert(pointerof(@in_buffer), pointerof(@in_buffer_left), pointerof(out_buffer), pointerof(out_buffer_left))
@out_slice = @out_buffer[0, OUT_BUFFER_SIZE - out_buffer_left]

# Check for errors
if result == -1
case Errno.value
when Errno::EILSEQ
# For an illegal sequence we just skip one byte and we'll continue next
@iconv.handle_invalid(pointerof(@in_buffer), pointerof(@in_buffer_left))
when Errno::EINVAL
# EINVAL means "An incomplete multibyte sequence has been encountered in the input."

# On incomplete multibyte sequence we try to read more bytes
# to see if they complete the sequence
refill_in_buffer(io)

# If we couldn't read anything new, we raise or skip
if old_in_buffer_left == @in_buffer_left
@iconv.handle_invalid(pointerof(@in_buffer), pointerof(@in_buffer_left))
end
end

out_buffer = @out_buffer.to_unsafe
out_buffer_left = LibC::SizeT.new(OUT_BUFFER_SIZE)
old_in_buffer_left = @in_buffer_left
result = @iconv.convert(pointerof(@in_buffer), pointerof(@in_buffer_left), pointerof(out_buffer), pointerof(out_buffer_left))
@out_slice = @out_buffer[0, OUT_BUFFER_SIZE - out_buffer_left]
if result == -1
case Errno.value
when Errno::EILSEQ, Errno::EINVAL
@iconv.handle_invalid(pointerof(@in_buffer), pointerof(@in_buffer_left))
# Continue decoding after an error
next
end

if old_in_buffer_left == @in_buffer_left
raise ArgumentError.new "incomplete multibyte sequence"
end
@last_errno = Errno.value
else
@last_errno = 0
break
end
end

# Reads more bytes from `io` into the input buffer, appending them after the
# bytes that are still pending (`@in_buffer_left` bytes starting at `@in_buffer`).
#
# `@in_buffer_left` grows by the number of bytes `io.read` returns; if that is
# zero, `@in_buffer_left` is left unchanged.
private def refill_in_buffer(io)
# Free space after the pending bytes: total size, minus the pending bytes,
# minus the already-consumed prefix (offset of @in_buffer from the buffer start)
buffer_remaining = BUFFER_SIZE - @in_buffer_left - (@in_buffer - @buffer.to_unsafe)
if buffer_remaining < 64
# Little room left at the tail: move the pending bytes to the start of the
# buffer so everything after them becomes available for reading.
# NOTE(review): source and destination ranges can overlap here when the pending
# region is longer than its offset — confirm copy_from is overlap-safe.
@buffer.copy_from(@in_buffer, @in_buffer_left)
@in_buffer = @buffer.to_unsafe
buffer_remaining = BUFFER_SIZE - @in_buffer_left
end
# Read into the free space right after the pending bytes and account for what was read
@in_buffer_left += LibC::SizeT.new(io.read(Slice.new(@in_buffer + @in_buffer_left, buffer_remaining)))
end

def read_byte(io)

0 comments on commit 6bd28f4

Please sign in to comment.