Skip to content

Commit 9dd628a

Browse files
committed Oct 3, 2015
refactor IO::EachReader, dry it up, and improve code docs
1 parent b9e0907 commit 9dd628a

File tree

1 file changed

+94
-116
lines changed

1 file changed

+94
-116
lines changed
 

‎kernel/common/io.rb

+94-116
Original file line numberDiff line numberDiff line change
@@ -1851,7 +1851,7 @@ def each(&block)
18511851
if @separator
18521852
if @separator.empty?
18531853
@separator = "\n\n"
1854-
@skip = 10
1854+
@skip = "\n"
18551855
end
18561856

18571857
if @limit
@@ -1868,21 +1868,43 @@ def each(&block)
18681868
end
18691869
end
18701870

1871-
def do_skip(buffer)
1872-
return 0 unless @skip
1871+
def read_and_yield_count_chars(str, buffer, byte_count, &block)
1872+
str << buffer.slice!(0, byte_count)
18731873

1874-
skip_count = 0
1875-
skip_count += 1 while buffer[skip_count].ord == @skip
1876-
if skip_count > 0
1877-
slice = buffer.slice!(0, skip_count)
1878-
slice.bytesize
1874+
if @limit
1875+
# Always read to char boundary because the +limit+ may have cut a multi-byte
1876+
# character in the middle. Returning such a string would have an invalid encoding.
1877+
buffer += (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
1878+
str, bytes_read = read_to_char_boundary(@io, str, buffer)
18791879
else
1880-
0
1880+
# We are confident that our +str+ ends on a char boundary
1881+
str = IO.read_encode(@io, str)
18811882
end
1883+
1884+
str.taint
1885+
$. = @io.increment_lineno
1886+
skip_contiguous_chars(buffer)
1887+
1888+
# Unused bytes/chars should be saved for the next read. Since the block that we yield to
1889+
# may +return+ we don't want to drop the bytes that are stored in +buffer+. To save,
1890+
# unget them so the next read will fetch them again. This might be expensive and could
1891+
# potentially use a little tuning. Maybe use an +unread(bytes)+ method which just moves
1892+
# a pointer around. Think about this for the mmap stuff.
1893+
@io.ungetc(buffer)
1894+
buffer.clear
1895+
1896+
yield str
1897+
end
1898+
1899+
def read_and_yield_entire_string(str, &block)
1900+
str = IO.read_encode(@io, str)
1901+
str.taint
1902+
$. = @io.increment_lineno
1903+
yield str
18821904
end
18831905

18841906
# method A, D
1885-
def read_to_separator
1907+
def read_to_separator(&block)
18861908
str = "".force_encoding(Encoding::ASCII_8BIT)
18871909
buffer = "".force_encoding(Encoding::ASCII_8BIT)
18881910
separator_size = @separator.bytesize
@@ -1899,25 +1921,7 @@ def read_to_separator
18991921
# the pattern/separator which may be >1. therefore, add the separator size.
19001922
count += separator_size
19011923

1902-
substring = buffer.slice!(0, count)
1903-
str << substring
1904-
1905-
str = IO.read_encode(@io, str)
1906-
str.taint
1907-
1908-
$. = @io.increment_lineno
1909-
1910-
do_skip(buffer)
1911-
1912-
# Unused bytes/chars should be saved for the next read. Since the block that we yield to
1913-
# may +return+ we don't want to drop the bytes that are stored in +buffer+. To save,
1914-
# unget them so the next read will fetch them again. This might be expensive and could
1915-
# potentially use a little tuning. Maybe use an +unread(bytes)+ method which just moves
1916-
# a pointer around. Think about this for the mmap stuff.
1917-
@io.ungetc(buffer)
1918-
buffer.clear
1919-
yield str
1920-
1924+
read_and_yield_count_chars(str, buffer, count, &block)
19211925
str = "".force_encoding(Encoding::ASCII_8BIT)
19221926
else
19231927
str << buffer
@@ -1928,44 +1932,13 @@ def read_to_separator
19281932
str << buffer
19291933

19301934
unless str.empty?
1931-
str = IO.read_encode(@io, str)
1932-
str.taint
1933-
$. = @io.increment_lineno
1934-
yield str
1935+
read_and_yield_entire_string(str, &block)
19351936
end
19361937
end
19371938

19381939
# method B, E
19391940

1940-
def try_to_force_encoding(io, str)
1941-
str.force_encoding(io.external_encoding || Encoding.default_external)
1942-
1943-
IO.read_encode io, str
1944-
end
1945-
1946-
PEEK_AHEAD_LIMIT = 16
1947-
1948-
def read_to_char_boundary(io, str, buffer)
1949-
str.force_encoding(io.external_encoding || Encoding.default_external)
1950-
return [IO.read_encode(io, str), 0] if str.valid_encoding?
1951-
1952-
peek_ahead = 0
1953-
while buffer.size > 0 and peek_ahead < PEEK_AHEAD_LIMIT
1954-
str.force_encoding Encoding::ASCII_8BIT
1955-
substring = buffer.slice!(0, 1)
1956-
str << substring
1957-
peek_ahead += 1
1958-
1959-
str.force_encoding(io.external_encoding || Encoding.default_external)
1960-
if str.valid_encoding?
1961-
return [IO.read_encode(io, str), peek_ahead]
1962-
end
1963-
end
1964-
1965-
[IO.read_encode(io, str), peek_ahead]
1966-
end
1967-
1968-
def read_to_separator_with_limit
1941+
def read_to_separator_with_limit(&block)
19691942
str = "".force_encoding(Encoding::ASCII_8BIT)
19701943
buffer = "".force_encoding(Encoding::ASCII_8BIT)
19711944
separator_size = @separator.bytesize
@@ -1984,39 +1957,13 @@ def read_to_separator_with_limit
19841957
# #index returns a 0-based location but we want a length (so +1) and it should include
19851958
# the pattern/separator which may be >1. therefore, add the separator size.
19861959
count += separator_size
1987-
bytes = count < wanted ? count : wanted
1988-
str << buffer.slice!(0, bytes)
1989-
1990-
# Always read to char boundary because the +limit+ may have cut a multi-byte
1991-
# character in the middle. Returning such a string would have an invalid encoding.
1992-
buffer += (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
1993-
str, bytes_read = read_to_char_boundary(@io, str, buffer)
1994-
str.taint
1995-
1996-
$. = @io.increment_lineno
1997-
do_skip(buffer)
1998-
@io.ungetc(buffer)
1999-
buffer.clear
2000-
2001-
yield str
1960+
count = count < wanted ? count : wanted
1961+
read_and_yield_count_chars(str, buffer, count, &block)
20021962

20031963
str = "".force_encoding(Encoding::ASCII_8BIT)
20041964
else
20051965
if wanted < buffer.size
2006-
str << buffer.slice!(0, wanted)
2007-
2008-
# replenish the buffer if we don't have enough bytes to satisfy the peek ahead
2009-
buffer += (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
2010-
str, bytes_read = read_to_char_boundary(@io, str, buffer)
2011-
str.taint
2012-
2013-
$. = @io.increment_lineno
2014-
do_skip(buffer)
2015-
@io.ungetc(buffer)
2016-
buffer.clear
2017-
2018-
yield str
2019-
1966+
read_and_yield_count_chars(str, buffer, wanted, &block)
20201967
str = "".force_encoding(Encoding::ASCII_8BIT)
20211968
else
20221969
str << buffer
@@ -2027,53 +1974,84 @@ def read_to_separator_with_limit
20271974
end until buffer.size == 0 && @io.eof?
20281975

20291976
unless str.empty?
2030-
str = IO.read_encode(@io, str)
2031-
str.taint
2032-
$. = @io.increment_lineno
2033-
yield str
1977+
read_and_yield_entire_string(str, &block)
20341978
end
20351979
end
20361980

20371981
# Method G
2038-
def read_all
2039-
str = ""
1982+
def read_all(&block)
1983+
str = "".force_encoding(Encoding::ASCII_8BIT)
20401984

20411985
begin
20421986
str << @io.read
20431987
end until @io.eof?
20441988

20451989
unless str.empty?
2046-
str = IO.read_encode(@io, str)
2047-
str.taint
2048-
$. = @io.increment_lineno
2049-
yield str
1990+
read_and_yield_entire_string(str, &block)
20501991
end
20511992
end
20521993

20531994
# Method H
2054-
def read_to_limit
2055-
str = ""
1995+
def read_to_limit(&block)
1996+
str = "".force_encoding(Encoding::ASCII_8BIT)
20561997
wanted = limit = @limit.abs
20571998

20581999
begin
20592000
str << @io.read(wanted)
2001+
read_and_yield_count_chars(str, '', str.bytesize, &block)
2002+
str = "".force_encoding(Encoding::ASCII_8BIT)
2003+
end until @io.eof?
20602004

2061-
buffer = (@io.read(PEEK_AHEAD_LIMIT) || '')
2062-
str, bytes_read = read_to_char_boundary(@io, str, buffer)
2063-
str.taint
2064-
@io.ungetc(buffer)
2005+
unless str.empty?
2006+
read_and_yield_entire_string(str, &block)
2007+
end
2008+
end
20652009

2066-
$. = @io.increment_lineno
2067-
yield str
2010+
# Utility methods
20682011

2069-
str = ""
2070-
end until @io.eof?
2012+
def try_to_force_encoding(io, str)
2013+
str.force_encoding(io.external_encoding || Encoding.default_external)
20712014

2072-
unless str.empty?
2073-
str = IO.read_encode(@io, str)
2074-
str.taint
2075-
$. = @io.increment_lineno
2076-
yield str
2015+
IO.read_encode io, str
2016+
end
2017+
2018+
PEEK_AHEAD_LIMIT = 16
2019+
2020+
def read_to_char_boundary(io, str, buffer)
2021+
str.force_encoding(io.external_encoding || Encoding.default_external)
2022+
return [IO.read_encode(io, str), 0] if str.valid_encoding?
2023+
2024+
peek_ahead = 0
2025+
while buffer.size > 0 and peek_ahead < PEEK_AHEAD_LIMIT
2026+
str.force_encoding Encoding::ASCII_8BIT
2027+
substring = buffer.slice!(0, 1)
2028+
str << substring
2029+
peek_ahead += 1
2030+
2031+
str.force_encoding(io.external_encoding || Encoding.default_external)
2032+
if str.valid_encoding?
2033+
return [IO.read_encode(io, str), peek_ahead]
2034+
end
2035+
end
2036+
2037+
[IO.read_encode(io, str), peek_ahead]
2038+
end
2039+
2040+
# Advances the buffer index past any number of contiguous
2041+
# characters == +skip+ and throws away that data. For
2042+
# example, if +skip+ is ?\n and the buffer contents are
2043+
# "\n\n\nAbc...", the buffer will discard all chars
2044+
# up to 'A'.
2045+
def skip_contiguous_chars(buffer)
2046+
return 0 unless @skip
2047+
2048+
skip_count = 0
2049+
skip_count += 1 while buffer[skip_count] == @skip
2050+
if skip_count > 0
2051+
slice = buffer.slice!(0, skip_count)
2052+
slice.bytesize
2053+
else
2054+
0
20772055
end
20782056
end
20792057
end

0 commit comments

Comments
 (0)
Please sign in to comment.