jruby · Mar 11, 2015 · Mar 11, 2015
diff --git a/spec/truffle/tags/core/string/split_tags.txt b/spec/truffle/tags/core/string/split_tags.txt
@@ -1,23 +1,7 @@
 fails:String#split with String throws an ArgumentError if the pattern is not a valid string
 fails:String#split with String splits on multibyte characters
-fails:String#split with String suppresses trailing empty fields when limit isn't given or 0
-fails:String#split with String returns an array with one entry if limit is 1: the original string
-fails:String#split with String returns at most limit fields when limit > 1
-fails:String#split with String doesn't suppress or limit fields when limit is negative
-fails:String#split with String defaults to $; when string isn't given or nil
-fails:String#split with String ignores leading and continuous whitespace when string is a single space
-fails:String#split with String splits between characters when its argument is an empty string
-fails:String#split with String tries converting its pattern argument to a string via to_str
-fails:String#split with String tries converting limit to an integer via to_int
 fails:String#split with String returns subclass instances based on self
 fails:String#split with String taints the resulting strings if self is tainted
-fails:String#split with Regexp defaults to $; when regexp isn't given or nil
-fails:String#split with Regexp includes all captures in the result array
-fails:String#split with Regexp does not include non-matching captures in the result array
-fails:String#split with Regexp tries converting limit to an integer via to_int
 fails:String#split with Regexp returns subclass instances based on self
 fails:String#split with Regexp taints the resulting strings if self is tainted
-fails:String#split with Regexp taints an empty string if self is tainted
-fails:String#split with Regexp retains the encoding of the source string
-fails:String#split with Regexp splits a string on each character for a multibyte encoding and empty split
 fails:String#split with Regexp returns an ArgumentError if an invalid UTF-8 string is supplied
diff --git a/truffle/src/main/java/org/jruby/truffle/nodes/core/KernelNodes.java b/truffle/src/main/java/org/jruby/truffle/nodes/core/KernelNodes.java
@@ -114,7 +114,8 @@ public RubyString backtick(RubyString command) {
                 throw new RuntimeException(e);
             }
 
-            return context.makeString(resultBuilder.toString());
+            // TODO (nirvdrum 10-Mar-15) This should be using the default external encoding, rather than hard-coded to UTF-8.
+            return context.makeString(resultBuilder.toString(), RubyEncoding.getEncoding("UTF-8").getEncoding());
         }
 
     }

diff --git a/truffle/src/main/java/org/jruby/truffle/nodes/core/StringNodes.java b/truffle/src/main/java/org/jruby/truffle/nodes/core/StringNodes.java
@@ -1601,58 +1601,6 @@ public int size(RubyString string) {
         }
     }
 
-    @CoreMethod(names = "split", optional = 2, lowerFixnumParameters = 2, taintFromSelf = true)
-    public abstract static class SplitNode extends CoreMethodNode {
-
-        public SplitNode(RubyContext context, SourceSection sourceSection) {
-            super(context, sourceSection);
-        }
-
-        public SplitNode(SplitNode prev) {
-            super(prev);
-        }
-
-        @Specialization
-        public RubyArray split(RubyString string, RubyString sep, @SuppressWarnings("unused") UndefinedPlaceholder limit) {
-            notDesignedForCompilation();
-
-            return splitHelper(string, sep.toString());
-        }
-
-        @Specialization
-        public RubyArray split(RubyString string, RubyRegexp sep, @SuppressWarnings("unused") UndefinedPlaceholder limit) {
-            notDesignedForCompilation();
-
-            return RubyArray.fromObjects(getContext().getCoreLibrary().getArrayClass(), (Object[]) sep.split(string, false, 0));
-        }
-
-        @Specialization
-        public RubyArray split(RubyString string, RubyRegexp sep, int limit) {
-            notDesignedForCompilation();
-
-            return RubyArray.fromObjects(getContext().getCoreLibrary().getArrayClass(), (Object[]) sep.split(string, limit > 0, limit));
-        }
-
-        @Specialization
-        public RubyArray split(RubyString string, @SuppressWarnings("unused") UndefinedPlaceholder sep, @SuppressWarnings("unused") UndefinedPlaceholder limit) {
-            notDesignedForCompilation();
-
-            return splitHelper(string, " ");
-        }
-
-        private RubyArray splitHelper(RubyString string, String sep) {
-            final String[] components = string.toString().split(Pattern.quote(sep));
-
-            final Object[] objects = new Object[components.length];
-
-            for (int n = 0; n < objects.length; n++) {
-                objects[n] = getContext().makeString(string.getLogicalClass(), components[n]);
-            }
-
-            return RubyArray.fromObjects(getContext().getCoreLibrary().getArrayClass(), objects);
-        }
-    }
-
     @CoreMethod(names = "succ", taintFromSelf = true)
     public abstract static class SuccNode extends CoreMethodNode {
 

diff --git a/truffle/src/main/java/org/jruby/truffle/nodes/rubinius/StringPrimitiveNodes.java b/truffle/src/main/java/org/jruby/truffle/nodes/rubinius/StringPrimitiveNodes.java
@@ -60,13 +60,99 @@
 import org.jruby.util.ConvertBytes;
 import org.jruby.util.StringSupport;
 
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 
 /**
  * Rubinius primitives associated with the Ruby {@code String} class.
  */
 public abstract class StringPrimitiveNodes {
 
+    @RubiniusPrimitive(name = "string_awk_split") // Invoked from Rubinius::Splitter.split when the pattern is a single space.
+    public static abstract class StringAwkSplitPrimitiveNode extends RubiniusPrimitiveNode {
+
+        @Child private TaintResultNode taintResultNode; // Applied by makeString to every piece cut from the source string.
+
+        public StringAwkSplitPrimitiveNode(RubyContext context, SourceSection sourceSection) {
+            super(context, sourceSection);
+            taintResultNode = new TaintResultNode(context, sourceSection, true, new int[]{}); // NOTE(review): presumably "taint from self, no argument indices" - confirm.
+        }
+
+        public StringAwkSplitPrimitiveNode(StringAwkSplitPrimitiveNode prev) {
+            super(prev);
+            taintResultNode = prev.taintResultNode;
+        }
+
+        @Specialization
+        public RubyArray stringAwkSplit(RubyString string, int lim) { // Splits on whitespace runs; lim > 0 caps the piece count, lim < 0 keeps an empty trailing piece.
+            notDesignedForCompilation();
+
+            final List<RubyString> ret = new ArrayList<>();
+            final ByteList value = string.getBytes();
+            final boolean limit = lim > 0; // Only a positive lim limits the number of pieces.
+            int i = lim > 0 ? 1 : 0; // Piece counter compared against lim; only advanced when limited.
+
+            byte[]bytes = value.getUnsafeBytes();
+            int p = value.getBegin(); // Scan cursor into bytes.
+            int ptr = p; // Fixed start position; b and e below are offsets relative to it.
+            int len = value.getRealSize();
+            int end = p + len;
+            Encoding enc = value.getEncoding();
+            boolean skip = true; // True while between fields, i.e. consuming whitespace.
+
+            int e = 0, b = 0; // b: offset where the current field begins; e: offset just past its last non-space character.
+            final boolean singlebyte = StringSupport.isSingleByteOptimizable(string, enc);
+            while (p < end) {
+                final int c;
+                if (singlebyte) {
+                    c = bytes[p++] & 0xff; // Mask so the byte is read as an unsigned value.
+                } else {
+                    try {
+                        c = StringSupport.codePoint(getContext().getRuntime(), enc, bytes, p, end);
+                    } catch (org.jruby.exceptions.RaiseException ex) {
+                        throw new RaiseException(getContext().toTruffle(ex.getException(), this)); // Re-raise the JRuby exception as a Truffle RaiseException.
+                    }
+
+                    p += StringSupport.length(enc, bytes, p, end);
+                }
+
+                if (skip) {
+                    if (enc.isSpace(c)) {
+                        b = p - ptr; // Still in leading whitespace: push the field start forward.
+                    } else {
+                        e = p - ptr;
+                        skip = false;
+                        if (limit && lim <= i) break; // Limit reached: stop scanning; the rest from b is appended after the loop.
+                    }
+                } else {
+                    if (enc.isSpace(c)) {
+                        ret.add(makeString(string, b, e - b)); // Whitespace ends the field [b, e).
+                        skip = true;
+                        b = p - ptr;
+                        if (limit) i++;
+                    } else {
+                        e = p - ptr;
+                    }
+                }
+            }
+
+            if (len > 0 && (limit || len > b || lim < 0)) ret.add(makeString(string, b, len - b)); // Remainder from b: added when limited, when bytes remain, or when lim < 0.
+
+            return RubyArray.fromObjects(getContext().getCoreLibrary().getArrayClass(), ret.toArray());
+        }
+
+        private RubyString makeString(RubyString source, int index, int length) { // Builds a piece of source's logical class from length bytes at index.
+            final ByteList bytes = new ByteList(source.getBytes(), index, length);
+            bytes.setEncoding(source.getBytes().getEncoding()); // Piece carries the same encoding as the source.
+
+            final RubyString ret = getContext().makeString(source.getLogicalClass(), bytes);
+            taintResultNode.maybeTaint(source, ret);
+
+            return ret;
+        }
+    }
+
     @RubiniusPrimitive(name = "string_byte_substring")
     public static abstract class StringByteSubstringPrimitiveNode extends RubiniusPrimitiveNode {
 

diff --git a/truffle/src/main/ruby/core.rb b/truffle/src/main/ruby/core.rb
@@ -67,6 +67,7 @@
 require_relative 'core/rubinius/common/symbol'
 require_relative 'core/rubinius/common/regexp'
 require_relative 'core/rubinius/common/signal'
+require_relative 'core/rubinius/common/splitter'
 require_relative 'core/rubinius/common/mutex'
 require_relative 'core/rubinius/common/throw_catch'
 require_relative 'core/rubinius/common/time'

diff --git a/truffle/src/main/ruby/core/rubinius/common/splitter.rb b/truffle/src/main/ruby/core/rubinius/common/splitter.rb
@@ -0,0 +1,180 @@
+# Copyright (c) 2007-2014, Evan Phoenix and contributors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of Rubinius nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+module Rubinius
+  class Splitter # Backs String#split: whitespace (awk-style), literal String and Regexp splitting.
+    def self.split_characters(string, pattern, limit, tail_empty) # Per-character split for empty patterns; pattern itself is unused.
+      if limit
+        string.chars.take(limit - 1) << (string[(limit - 1)..-1] || string.byteslice(0, 0)) # slice is nil past the end; append "" rather than nil
+      else
+        ret = string.chars.to_a
+        # Use #byteslice because it returns the right class and taints
+        # automatically. This is just appending a "", which is this
+        # strange protocol if a negative limit is passed in
+        ret << string.byteslice(0,0) if tail_empty
+        ret
+      end
+    end
+
+    def self.valid_encoding?(string) # Guard, not a predicate: raises ArgumentError on invalid bytes, otherwise returns nil.
+      raise ArgumentError, "invalid byte sequence in #{string.encoding.name}" unless string.valid_encoding?
+    end
+
+    def self.split(string, pattern, limit) # Entry point; limit may be the undefined sentinel, pattern may be nil, a String or a Regexp.
+      # Odd edge case: an empty receiver always splits to [].
+      return [] if string.empty?
+
+      tail_empty = false
+
+      if undefined.equal?(limit)
+        limited = false
+      else
+        limit = Rubinius::Type.coerce_to limit, Fixnum, :to_int
+
+        if limit > 0
+          return [string.dup] if limit == 1
+          limited = true
+        else
+          tail_empty = limit < 0 # only a negative limit keeps trailing empty fields; 0 behaves like no limit
+          limited = false
+        end
+      end
+
+      pattern ||= ($; || " ") # nil pattern falls back to $;, then to a single space
+
+      if pattern == ' '
+        if limited
+          lim = limit
+        elsif tail_empty
+          lim = -1 # negative: the primitive keeps an empty trailing piece
+        else
+          lim = 0 # no limit
+        end
+
+        return Rubinius.invoke_primitive :string_awk_split, string, lim
+      elsif pattern.kind_of?(Regexp) # Regexp patterns need no preparation; they go to the matching loop below
+      else
+        pattern = StringValue(pattern) unless pattern.kind_of?(String)
+
+        valid_encoding?(string)
+        valid_encoding?(pattern)
+
+        trim_end = !tail_empty || limit == 0
+
+        unless limited
+          if pattern.empty?
+            if trim_end
+              return string.chars.to_a
+            end
+          else
+            return split_on_string(string, pattern, trim_end)
+          end
+        end
+
+        pattern = Regexp.new(Regexp.quote(pattern)) # remaining String cases reuse the Regexp path
+      end
+
+      # Handle // (or an empty String pattern converted above) as a special case.
+      if pattern.source.empty?
+        return split_characters(string, pattern, limited && limit, tail_empty)
+      end
+
+      start = 0
+      ret = []
+
+      last_match = nil
+      last_match_end = 0
+
+      while match = pattern.match_from(string, start)
+        break if limited && limit - ret.size <= 1 # leave one slot for the final remainder
+
+        collapsed = match.collapsing? # NOTE(review): presumably true for a zero-width match - confirm
+
+        unless collapsed && (match.full.at(0) == last_match_end)
+          ret << match.pre_match_from(last_match_end)
+
+          # length > 1 means there are captures; nil (non-matching) captures are dropped
+          if match.length > 1
+            ret.concat(match.captures.compact)
+          end
+        end
+
+        start = match.full.at(1)
+        if collapsed
+          start += 1 # step past a collapsed match so the search advances
+        end
+
+        last_match = match
+        last_match_end = last_match.full.at(1)
+      end
+
+      if last_match
+        ret << last_match.post_match
+      elsif ret.empty?
+        ret << string.dup
+      end
+
+      # Trim trailing empty strings when no limit was given or the limit is 0
+      if undefined.equal?(limit) || limit == 0
+        while s = ret.at(-1) and s.empty?
+          ret.pop
+        end
+      end
+
+      ret
+    end
+
+    def self.split_on_string(string, pattern, trim_end) # Splits on a literal String separator using byte offsets; split only calls it with a non-empty pattern.
+      pos = 0
+
+      ret = []
+
+      pat_size = pattern.bytesize
+      str_size = string.bytesize
+
+      while pos < str_size
+        nxt = string.find_string(pattern, pos) # NOTE(review): presumably the byte offset of the next occurrence at or after pos, or nil - confirm
+        break unless nxt
+
+        match_size = nxt - pos
+        ret << string.byteslice(pos, match_size)
+
+        pos = nxt + pat_size
+      end
+
+      # No more separators, but we need to grab the last part still.
+      ret << string.byteslice(pos, str_size - pos)
+
+      if trim_end
+        while s = ret.at(-1) and s.empty?
+          ret.pop
+        end
+      end
+
+      ret
+    end
+  end
+end
diff --git a/truffle/src/main/ruby/core/rubinius/common/string.rb b/truffle/src/main/ruby/core/rubinius/common/string.rb
@@ -48,6 +48,10 @@ def hex
     to_inum(16, false)
   end
 
+  def split(pattern=nil, limit=undefined) # Delegates to Rubinius::Splitter.split; limit defaults to the undefined sentinel so "not given" is distinguishable.
+    Rubinius::Splitter.split(self, pattern, limit)
+  end
+
   def chars
     if block_given?
       each_char do |char|