Skip to content

Commit

Permalink
refactor IO::EachReader, dry it up, and improve code docs
Browse files Browse the repository at this point in the history
  • Loading branch information
chuckremes committed Oct 3, 2015
1 parent b9e0907 commit 9dd628a
Showing 1 changed file with 94 additions and 116 deletions.
210 changes: 94 additions & 116 deletions kernel/common/io.rb
Expand Up @@ -1851,7 +1851,7 @@ def each(&block)
if @separator
if @separator.empty?
@separator = "\n\n"
@skip = 10
@skip = "\n"
end

if @limit
Expand All @@ -1868,21 +1868,43 @@ def each(&block)
end
end

def do_skip(buffer)
return 0 unless @skip
def read_and_yield_count_chars(str, buffer, byte_count, &block)
str << buffer.slice!(0, byte_count)

skip_count = 0
skip_count += 1 while buffer[skip_count].ord == @skip
if skip_count > 0
slice = buffer.slice!(0, skip_count)
slice.bytesize
if @limit
# Always read to char boundary because the +limit+ may have cut a multi-byte
# character in the middle. Returning such a string would have an invalid encoding.
buffer += (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
str, bytes_read = read_to_char_boundary(@io, str, buffer)
else
0
# We are confident that our +str+ ends on a char boundary
str = IO.read_encode(@io, str)
end

str.taint
$. = @io.increment_lineno
skip_contiguous_chars(buffer)

# Unused bytes/chars should be saved for the next read. Since the block that we yield to
# may +return+ we don't want to drop the bytes that are stored in +buffer+. To save,
# unget them so the next read will fetch them again. This might be expensive and could
# potentially use a little tuning. Maybe use an +unread(bytes)+ method which just moves
# a pointer around. Think about this for the mmap stuff.
@io.ungetc(buffer)
buffer.clear

yield str
end

# Finishes off one complete record and hands it to the caller's block.
#
# +str+ is passed through IO.read_encode together with @io (helper defined
# elsewhere; presumably applies the IO's encoding settings -- confirm there),
# the result is tainted, the IO's line counter is bumped and mirrored into
# the global +$.+, and finally the string is yielded.
#
# Returns whatever the block returns.
def read_and_yield_entire_string(str, &block)
  encoded = IO.read_encode(@io, str)
  encoded.taint

  # Keep the interpreter-wide "last line number" in sync with this IO.
  $. = @io.increment_lineno

  yield encoded
end

# method A, D
def read_to_separator
def read_to_separator(&block)
str = "".force_encoding(Encoding::ASCII_8BIT)
buffer = "".force_encoding(Encoding::ASCII_8BIT)
separator_size = @separator.bytesize
Expand All @@ -1899,25 +1921,7 @@ def read_to_separator
# the pattern/separator which may be >1. therefore, add the separator size.
count += separator_size

substring = buffer.slice!(0, count)
str << substring

str = IO.read_encode(@io, str)
str.taint

$. = @io.increment_lineno

do_skip(buffer)

# Unused bytes/chars should be saved for the next read. Since the block that we yield to
# may +return+ we don't want to drop the bytes that are stored in +buffer+. To save,
# unget them so the next read will fetch them again. This might be expensive and could
# potentially use a little tuning. Maybe use an +unread(bytes)+ method which just moves
# a pointer around. Think about this for the mmap stuff.
@io.ungetc(buffer)
buffer.clear
yield str

read_and_yield_count_chars(str, buffer, count, &block)
str = "".force_encoding(Encoding::ASCII_8BIT)
else
str << buffer
Expand All @@ -1928,44 +1932,13 @@ def read_to_separator
str << buffer

unless str.empty?
str = IO.read_encode(@io, str)
str.taint
$. = @io.increment_lineno
yield str
read_and_yield_entire_string(str, &block)
end
end

# method B, E

# Re-tags +str+ in place with the external encoding of +io+, falling back to
# Encoding.default_external when the IO has none, and then returns the result
# of IO.read_encode(io, str) (helper defined elsewhere).
def try_to_force_encoding(io, str)
  target_encoding = io.external_encoding || Encoding.default_external
  str.force_encoding(target_encoding)
  IO.read_encode(io, str)
end

PEEK_AHEAD_LIMIT = 16

# Extends +str+ with bytes taken from the front of +buffer+ until +str+ is a
# valid string in the IO's external encoding (or Encoding.default_external
# when the IO has none).
#
# At most PEEK_AHEAD_LIMIT bytes are moved, and the scan also stops when
# +buffer+ runs dry. Both +str+ and +buffer+ are modified in place.
#
# Returns a two-element array: the result of IO.read_encode(io, str) and the
# number of slices moved out of +buffer+. If no valid boundary was found the
# string is still returned, possibly with an invalid encoding.
def read_to_char_boundary(io, str, buffer)
  target_encoding = io.external_encoding || Encoding.default_external
  str.force_encoding(target_encoding)
  return [IO.read_encode(io, str), 0] if str.valid_encoding?

  consumed = 0
  until buffer.empty? || consumed >= PEEK_AHEAD_LIMIT
    # Switch to binary before appending -- presumably so the append is a raw
    # byte concatenation rather than an encoding-aware one.
    str.force_encoding(Encoding::ASCII_8BIT)
    str << buffer.slice!(0, 1)
    consumed += 1

    # Re-tag and stop as soon as the trailing character is complete.
    str.force_encoding(target_encoding)
    break if str.valid_encoding?
  end

  [IO.read_encode(io, str), consumed]
end

def read_to_separator_with_limit
def read_to_separator_with_limit(&block)
str = "".force_encoding(Encoding::ASCII_8BIT)
buffer = "".force_encoding(Encoding::ASCII_8BIT)
separator_size = @separator.bytesize
Expand All @@ -1984,39 +1957,13 @@ def read_to_separator_with_limit
# #index returns a 0-based location but we want a length (so +1) and it should include
# the pattern/separator which may be >1. therefore, add the separator size.
count += separator_size
bytes = count < wanted ? count : wanted
str << buffer.slice!(0, bytes)

# Always read to char boundary because the +limit+ may have cut a multi-byte
# character in the middle. Returning such a string would have an invalid encoding.
buffer += (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
str, bytes_read = read_to_char_boundary(@io, str, buffer)
str.taint

$. = @io.increment_lineno
do_skip(buffer)
@io.ungetc(buffer)
buffer.clear

yield str
count = count < wanted ? count : wanted
read_and_yield_count_chars(str, buffer, count, &block)

str = "".force_encoding(Encoding::ASCII_8BIT)
else
if wanted < buffer.size
str << buffer.slice!(0, wanted)

# replenish the buffer if we don't have enough bytes to satisfy the peek ahead
buffer += (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
str, bytes_read = read_to_char_boundary(@io, str, buffer)
str.taint

$. = @io.increment_lineno
do_skip(buffer)
@io.ungetc(buffer)
buffer.clear

yield str

read_and_yield_count_chars(str, buffer, wanted, &block)
str = "".force_encoding(Encoding::ASCII_8BIT)
else
str << buffer
Expand All @@ -2027,53 +1974,84 @@ def read_to_separator_with_limit
end until buffer.size == 0 && @io.eof?

unless str.empty?
str = IO.read_encode(@io, str)
str.taint
$. = @io.increment_lineno
yield str
read_and_yield_entire_string(str, &block)
end
end

# Method G
def read_all
str = ""
def read_all(&block)
str = "".force_encoding(Encoding::ASCII_8BIT)

begin
str << @io.read
end until @io.eof?

unless str.empty?
str = IO.read_encode(@io, str)
str.taint
$. = @io.increment_lineno
yield str
read_and_yield_entire_string(str, &block)
end
end

# Method H
def read_to_limit
str = ""
def read_to_limit(&block)
str = "".force_encoding(Encoding::ASCII_8BIT)
wanted = limit = @limit.abs

begin
str << @io.read(wanted)
read_and_yield_count_chars(str, '', str.bytesize, &block)
str = "".force_encoding(Encoding::ASCII_8BIT)
end until @io.eof?

buffer = (@io.read(PEEK_AHEAD_LIMIT) || '')
str, bytes_read = read_to_char_boundary(@io, str, buffer)
str.taint
@io.ungetc(buffer)
unless str.empty?
read_and_yield_entire_string(str, &block)
end
end

$. = @io.increment_lineno
yield str
# Utility methods

str = ""
end until @io.eof?
def try_to_force_encoding(io, str)
str.force_encoding(io.external_encoding || Encoding.default_external)

unless str.empty?
str = IO.read_encode(@io, str)
str.taint
$. = @io.increment_lineno
yield str
IO.read_encode io, str
end

PEEK_AHEAD_LIMIT = 16

# Extends +str+ with bytes taken from the front of +buffer+ until +str+ is a
# valid string in the IO's external encoding (or Encoding.default_external
# when the IO has none).
#
# At most PEEK_AHEAD_LIMIT bytes are moved, and the scan also stops when
# +buffer+ runs dry. Both +str+ and +buffer+ are modified in place.
#
# Returns a two-element array: the result of IO.read_encode(io, str) and the
# number of slices moved out of +buffer+. If no valid boundary was found the
# string is still returned, possibly with an invalid encoding.
def read_to_char_boundary(io, str, buffer)
  target_encoding = io.external_encoding || Encoding.default_external
  str.force_encoding(target_encoding)
  return [IO.read_encode(io, str), 0] if str.valid_encoding?

  consumed = 0
  until buffer.empty? || consumed >= PEEK_AHEAD_LIMIT
    # Switch to binary before appending -- presumably so the append is a raw
    # byte concatenation rather than an encoding-aware one.
    str.force_encoding(Encoding::ASCII_8BIT)
    str << buffer.slice!(0, 1)
    consumed += 1

    # Re-tag and stop as soon as the trailing character is complete.
    str.force_encoding(target_encoding)
    break if str.valid_encoding?
  end

  [IO.read_encode(io, str), consumed]
end

# Drops the leading run of characters equal to @skip from +buffer+.
#
# The buffer is modified in place: with @skip set to "\n" and a buffer of
# "\n\n\nAbc...", everything before the 'A' is removed.
#
# Returns the number of bytes discarded, or 0 when @skip is unset or the
# buffer does not start with the skip character.
def skip_contiguous_chars(buffer)
  return 0 unless @skip

  # Indexing past the end yields nil, which never equals @skip, so the scan
  # stops by itself at the end of the buffer.
  run_length = 0
  run_length += 1 while buffer[run_length] == @skip
  return 0 if run_length.zero?

  buffer.slice!(0, run_length).bytesize
end
end
Expand Down

0 comments on commit 9dd628a

Please sign in to comment.