refactor IO::EachReader, dry it up, and improve code docs

chuckremes · chuckremes · commit 9dd628a31d4d · 2015-10-03T11:54:04.000-05:00
diff --git a/kernel/common/io.rb b/kernel/common/io.rb
@@ -1851,7 +1851,7 @@ def each(&block)
       if @separator
         if @separator.empty?
           @separator = "\n\n"
-          @skip = 10
+          @skip = "\n"
         end
 
         if @limit
@@ -1868,21 +1868,43 @@ def each(&block)
       end
     end
 
-    def do_skip(buffer)
-      return 0 unless @skip
+    # Moves the first +byte_count+ bytes of +buffer+ onto +str+, encodes the result,
+    # updates the line number and yields the finished string to the block.
+    #
+    # str        - String that accumulates the bytes of the current record.
+    # buffer     - String of bytes already read from @io. It is mutated in place:
+    #              consumed bytes are sliced off, the remainder is pushed back onto
+    #              @io with +ungetc+ and the buffer is emptied before yielding.
+    # byte_count - number of leading bytes of +buffer+ that belong to this record.
+    #
+    # Returns the value of the block.
+    def read_and_yield_count_chars(str, buffer, byte_count, &block)
+      str << buffer.slice!(0, byte_count)
 
-      skip_count = 0
-      skip_count += 1 while buffer[skip_count].ord == @skip
-      if skip_count > 0
-        slice = buffer.slice!(0, skip_count)
-        slice.bytesize
+      if @limit
+        # Always read to char boundary because the +limit+ may have cut a multi-byte
+        # character in the middle. Returning such a string would have an invalid encoding.
+        #
+        # Replenish with << (in place) rather than +=. The latter rebinds the local to a
+        # new String, so the +ungetc+/+clear+ below would leave the caller's buffer holding
+        # bytes that have also been pushed back onto the io.
+        buffer << (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
+        str = read_to_char_boundary(@io, str, buffer).first
       else
-        0
+        # We are confident that our +str+ ends on a char boundary
+        str = IO.read_encode(@io, str)
       end
+
+      str.taint
+      $. = @io.increment_lineno
+      skip_contiguous_chars(buffer)
+
+      # Unused bytes/chars should be saved for the next read. Since the block that we yield to
+      # may +return+ we don't want to drop the bytes that are stored in +buffer+. To save,
+      # unget them so the next read will fetch them again. This might be expensive and could
+      # potentially use a little tuning. Maybe use an +unread(bytes)+ method which just moves
+      # a pointer around. Think about this for the mmap stuff.
+      @io.ungetc(buffer)
+      buffer.clear
+
+      yield str
+    end
+
+    # Final-record helper: runs +str+ through IO.read_encode, taints the result,
+    # stores @io.increment_lineno in $. and then yields the string to the block.
+    def read_and_yield_entire_string(str, &block)
+      encoded = IO.read_encode(@io, str).taint
+      $. = @io.increment_lineno
+
+      yield encoded
     end
 
     # method A, D
-    def read_to_separator
+    def read_to_separator(&block)
       str = "".force_encoding(Encoding::ASCII_8BIT)
       buffer = "".force_encoding(Encoding::ASCII_8BIT)
       separator_size = @separator.bytesize
@@ -1899,25 +1921,7 @@ def read_to_separator
           # the pattern/separator which may be >1. therefore, add the separator size.
           count += separator_size
 
-          substring = buffer.slice!(0, count)
-          str << substring
-
-          str = IO.read_encode(@io, str)
-          str.taint
-
-          $. = @io.increment_lineno
-
-          do_skip(buffer)
-
-          # Unused bytes/chars should be saved for the next read. Since the block that we yield to
-          # may +return+ we don't want to drop the bytes that are stored in +buffer+. To save, 
-          # unget them so the next read will fetch them again. This might be expensive and could
-          # potentially use a little tuning. Maybe use an +unread(bytes)+ method which just moves
-          # a pointer around. Think about this for the mmap stuff.
-          @io.ungetc(buffer)
-          buffer.clear
-          yield str
-
+          read_and_yield_count_chars(str, buffer, count, &block)
           str = "".force_encoding(Encoding::ASCII_8BIT)
         else
           str << buffer
@@ -1928,44 +1932,13 @@ def read_to_separator
       str << buffer
 
       unless str.empty?
-        str = IO.read_encode(@io, str)
-        str.taint
-        $. = @io.increment_lineno
-        yield str
+        read_and_yield_entire_string(str, &block)
       end
     end
 
     # method B, E
 
-    def try_to_force_encoding(io, str)
-      str.force_encoding(io.external_encoding || Encoding.default_external)
-
-      IO.read_encode io, str
-    end
-
-    PEEK_AHEAD_LIMIT = 16
-
-    def read_to_char_boundary(io, str, buffer)
-      str.force_encoding(io.external_encoding || Encoding.default_external)
-      return [IO.read_encode(io, str), 0] if str.valid_encoding?
-
-      peek_ahead = 0
-      while buffer.size > 0 and peek_ahead < PEEK_AHEAD_LIMIT
-        str.force_encoding Encoding::ASCII_8BIT
-        substring = buffer.slice!(0, 1)
-        str << substring
-        peek_ahead += 1
-
-        str.force_encoding(io.external_encoding || Encoding.default_external)
-        if str.valid_encoding?
-          return [IO.read_encode(io, str), peek_ahead]
-        end
-      end
-
-      [IO.read_encode(io, str), peek_ahead]
-    end
-
-    def read_to_separator_with_limit
+    def read_to_separator_with_limit(&block)
       str = "".force_encoding(Encoding::ASCII_8BIT)
       buffer = "".force_encoding(Encoding::ASCII_8BIT)
       separator_size = @separator.bytesize
@@ -1984,39 +1957,13 @@ def read_to_separator_with_limit
           # #index returns a 0-based location but we want a length (so +1) and it should include
           # the pattern/separator which may be >1. therefore, add the separator size.
           count += separator_size
-          bytes = count < wanted ? count : wanted
-          str << buffer.slice!(0, bytes)
-
-          # Always read to char boundary because the +limit+ may have cut a multi-byte
-          # character in the middle. Returning such a string would have an invalid encoding.
-          buffer += (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
-          str, bytes_read = read_to_char_boundary(@io, str, buffer)
-          str.taint
-
-          $. = @io.increment_lineno
-          do_skip(buffer)
-          @io.ungetc(buffer)
-          buffer.clear
-
-          yield str
+          count = count < wanted ? count : wanted
+          read_and_yield_count_chars(str, buffer, count, &block)
 
           str = "".force_encoding(Encoding::ASCII_8BIT)
         else
           if wanted < buffer.size
-            str << buffer.slice!(0, wanted)
-
-            # replenish the buffer if we don't have enough bytes to satisfy the peek ahead
-            buffer += (@io.read(PEEK_AHEAD_LIMIT) || '') if buffer.size < PEEK_AHEAD_LIMIT
-            str, bytes_read = read_to_char_boundary(@io, str, buffer)
-            str.taint
-
-            $. = @io.increment_lineno
-            do_skip(buffer)
-            @io.ungetc(buffer)
-            buffer.clear
-
-            yield str
-
+            read_and_yield_count_chars(str, buffer, wanted, &block)
             str = "".force_encoding(Encoding::ASCII_8BIT)
           else
             str << buffer
@@ -2027,53 +1974,84 @@ def read_to_separator_with_limit
       end until buffer.size == 0 && @io.eof?
 
       unless str.empty?
-        str = IO.read_encode(@io, str)
-        str.taint
-        $. = @io.increment_lineno
-        yield str
+        read_and_yield_entire_string(str, &block)
       end
     end
 
     # Method G
-    def read_all
-      str = ""
+    # Reads from @io until it reports eof and yields everything that was read as
+    # one string. Nothing is yielded when no data was read.
+    def read_all(&block)
+      # Accumulate as binary; read_and_yield_entire_string applies IO.read_encode.
+      str = "".force_encoding(Encoding::ASCII_8BIT)
 
       begin
         str << @io.read
       end until @io.eof?
 
       unless str.empty?
-        str = IO.read_encode(@io, str)
-        str.taint
-        $. = @io.increment_lineno
-        yield str
+        read_and_yield_entire_string(str, &block)
       end
     end
 
     # Method H
-    def read_to_limit
-      str = ""
+    # Yields the contents of @io in pieces of (at least) @limit.abs bytes until
+    # @io reports eof. No separator is involved in this path.
+    def read_to_limit(&block)
+      str = "".force_encoding(Encoding::ASCII_8BIT)
+      # NOTE(review): the +limit+ local is never read afterwards; only +wanted+ is used.
       wanted = limit = @limit.abs
 
       begin
+        # NOTE(review): if @io.read(wanted) can return nil here (e.g. already at EOF),
+        # this << would raise. Confirm the caller guarantees data is available.
         str << @io.read(wanted)
+        # The bytes are already on +str+, so an empty buffer is passed and nothing more
+        # is sliced onto it. The helper still handles encoding, $. and yielding.
+        read_and_yield_count_chars(str, '', str.bytesize, &block)
+        str = "".force_encoding(Encoding::ASCII_8BIT)
+      end until @io.eof?
 
-        buffer = (@io.read(PEEK_AHEAD_LIMIT) || '')
-        str, bytes_read = read_to_char_boundary(@io, str, buffer)
-        str.taint
-        @io.ungetc(buffer)
+      # NOTE(review): +str+ is reset to an empty string at the end of every loop
+      # iteration, so this branch looks unreachable. Confirm before removing.
+      unless str.empty?
+        read_and_yield_entire_string(str, &block)
+      end
+    end
 
-        $. = @io.increment_lineno
-        yield str
+    # Utility methods
 
-        str = ""
-      end until @io.eof?
+    # Tags +str+ (in place) with the io's external encoding, falling back to
+    # Encoding.default_external when the io has none, then returns the result
+    # of IO.read_encode for that io and string.
+    def try_to_force_encoding(io, str)
+      str.force_encoding(io.external_encoding || Encoding.default_external)
 
-      unless str.empty?
-        str = IO.read_encode(@io, str)
-        str.taint
-        $. = @io.increment_lineno
-        yield str
+      IO.read_encode io, str
+    end
+
+    # Upper bound on how many extra bytes read_to_char_boundary will take from the
+    # buffer while looking for a valid character boundary.
+    PEEK_AHEAD_LIMIT = 16
+
+    # Extends +str+ one byte at a time from the front of +buffer+ until it is valid
+    # in the io's external encoding (or Encoding.default_external when the io has
+    # none), the buffer runs dry, or PEEK_AHEAD_LIMIT bytes have been taken.
+    #
+    # Both +str+ and +buffer+ are modified in place.
+    #
+    # Returns a two-element Array: the result of IO.read_encode for the string, and
+    # the number of bytes that were taken from +buffer+.
+    def read_to_char_boundary(io, str, buffer)
+      target = io.external_encoding || Encoding.default_external
+      str.force_encoding(target)
+      return [IO.read_encode(io, str), 0] if str.valid_encoding?
+
+      taken = 0
+      until buffer.size == 0 || taken >= PEEK_AHEAD_LIMIT
+        # Append the next byte while the string is binary, then re-tag and re-check.
+        str.force_encoding(Encoding::ASCII_8BIT)
+        str << buffer.slice!(0, 1)
+        taken += 1
+
+        str.force_encoding(target)
+        break if str.valid_encoding?
+      end
+
+      [IO.read_encode(io, str), taken]
+    end
+
+    # Removes from the front of +buffer+ every leading character that equals
+    # @skip and throws that data away. The buffer is modified in place. For
+    # example, if @skip is "\n" and the buffer contents are "\n\n\nAbc...",
+    # the buffer will discard all chars up to 'A'.
+    #
+    # Returns the bytesize of the discarded prefix, or 0 when @skip is not set
+    # or the buffer does not start with it.
+    def skip_contiguous_chars(buffer)
+      return 0 unless @skip
+
+      skip_count = 0
+      # Indexing past the end of the buffer yields nil, which ends the scan.
+      skip_count += 1 while buffer[skip_count] == @skip
+      if skip_count > 0
+        slice = buffer.slice!(0, skip_count)
+        slice.bytesize
+      else
+        0
       end
     end
   end