jruby · Mar 13, 2015 · Mar 13, 2015 · Mar 13, 2015 · Mar 13, 2015
Showing with 106 additions and 90 deletions.

+12 −5 core/src/main/java/org/jruby/RubyInteger.java

+48 −69 core/src/main/java/org/jruby/RubyString.java

+45 −4 core/src/main/java/org/jruby/util/io/EncodingUtils.java

+1 −12 test/mri/excludes/TestM17N.rb
diff --git a/core/src/main/java/org/jruby/RubyInteger.java b/core/src/main/java/org/jruby/RubyInteger.java
@@ -348,26 +348,33 @@ public RubyString chr19(ThreadContext context, IRubyObject arg) {
         if (enc == ASCIIEncoding.INSTANCE && value >= 0x80) {
             return chr19(context);
         }
-        return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, (int)value), enc, 0);
+        return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, value), enc, 0);
     }
 
-    private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, int value) {
+    /**
+     * Encodes the code point {@code value} as a single character of {@code enc}.
+     *
+     * Raises a Ruby RangeError ("... out of char range") when the value is negative, does not
+     * fit in an unsigned 32-bit integer, or the encoding reports no byte length for it, and a
+     * RangeError ("invalid codepoint ...") when the bytes written do not form exactly one
+     * character of the reported length.
+     *
+     * @param runtime runtime used to build the RangeError raised on failure
+     * @param enc     target encoding
+     * @param value   code point to encode; only 0..0xFFFFFFFF is accepted
+     * @return a ByteList whose real size is the encoded length of the character
+     */
+    private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, long value) {
         int n;
         try {
-            n = value < 0 ? 0 : enc.codeToMbcLength(value);
+            // Reject anything outside the unsigned 32-bit range before narrowing: the (int) cast
+            // would otherwise wrap e.g. 0x100000041 to 0x41 and silently encode the wrong character.
+            n = (value < 0 || value > 0xFFFFFFFFL) ? 0 : enc.codeToMbcLength((int)value);
         } catch (EncodingException ee) {
             n = 0;
         }
 
         if (n <= 0) throw runtime.newRangeError(this.toString() + " out of char range");
 
         ByteList bytes = new ByteList(n);
-
+
+        // Stays false if encoding throws or the written bytes do not re-scan as one n-byte character.
+        boolean ok = false;
         try {
-            enc.codeToMbc(value, bytes.getUnsafeBytes(), 0);
+            enc.codeToMbc((int)value, bytes.getUnsafeBytes(), 0);
+            ok = StringSupport.preciseLength(enc, bytes.unsafeBytes(), 0, n) == n;
         } catch (EncodingException e) {
+            // ok = false, fall through
+        }
+
+        if (!ok) {
             throw runtime.newRangeError("invalid codepoint " + String.format("0x%x in ", value) + enc.getCharsetName());
         }
+
         bytes.setRealSize(n);
         return bytes;
     }

diff --git a/core/src/main/java/org/jruby/RubyString.java b/core/src/main/java/org/jruby/RubyString.java
@@ -40,7 +40,6 @@
 
 import jnr.posix.POSIX;
 import org.jcodings.Encoding;
-import org.jcodings.EncodingDB;
 import org.jcodings.ascii.AsciiTables;
 import org.jcodings.exception.EncodingException;
 import org.jcodings.specific.ASCIIEncoding;
@@ -50,7 +49,7 @@
 import org.jcodings.specific.UTF32BEEncoding;
 import org.jcodings.specific.UTF32LEEncoding;
 import org.jcodings.specific.UTF8Encoding;
-import org.jcodings.util.CaseInsensitiveBytesHash;
+import org.jcodings.unicode.UnicodeEncoding;
 import org.jcodings.util.IntHash;
 import org.joni.Matcher;
 import org.joni.Option;
@@ -257,14 +256,14 @@ private void copyCodeRange(RubyString from) {
     public final int scanForCodeRange() {
         int cr = getCodeRange();
         if (cr == CR_UNKNOWN) {
-            cr = codeRangeScan(value.getEncoding(), value);
+            // Only scan when no code range is recorded yet. The scan uses the encoding returned by
+            // EncodingUtils.getActualEncoding (dummy UTF-16/UTF-32 resolved from the leading BOM)
+            // and the result is stored back with setCodeRange.
+            cr = codeRangeScan(EncodingUtils.getActualEncoding(getEncoding(), value), value);
             setCodeRange(cr);
         }
         return cr;
     }
 
+    /**
+     * Asks StringSupport.isSingleByteOptimizable about this string, passing the encoding
+     * returned by EncodingUtils.STR_ENC_GET rather than the ByteList's nominal encoding.
+     */
     final boolean singleByteOptimizable() {
-        return StringSupport.isSingleByteOptimizable(this, value.getEncoding());
+        return StringSupport.isSingleByteOptimizable(this, EncodingUtils.STR_ENC_GET(this));
     }
 
     final boolean singleByteOptimizable(Encoding enc) {
@@ -2076,54 +2075,30 @@ public IRubyObject inspect19() {
     }
 
     public static IRubyObject inspect19(Ruby runtime, ByteList byteList) {
+        ThreadContext context = runtime.getCurrentContext();
+
+        Encoding enc = byteList.getEncoding();
         byte bytes[] = byteList.getUnsafeBytes();
         int p = byteList.getBegin();
         int end = p + byteList.getRealSize();
         RubyString result = new RubyString(runtime, runtime.getString(), new ByteList(end - p));
-        Encoding enc = byteList.getEncoding();
-
         Encoding resultEnc = runtime.getDefaultInternalEncoding();
-        if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
-        if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;
-        result.associateEncoding(resultEnc);
-
         boolean isUnicode = StringSupport.isUnicode(enc);
         boolean asciiCompat = enc.isAsciiCompatible();
 
-        EncodingDB.Entry e = null;
-        CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = runtime.getEncodingService().getEncodings();
-        if (enc == encodings.get("UTF-16".getBytes()).getEncoding() && end - p > 1) {
-            int c0 = bytes[p] & 0xff;
-            int c1 = bytes[p + 1] & 0xff;
-
-            if (c0 == 0xFE && c1 == 0xFF) {
-                e = encodings.get("UTF-16BE".getBytes());
-            } else if (c0 == 0xFF && c1 == 0xFE) {
-                e = encodings.get("UTF-16LE".getBytes());
-            } else {
-                e = encodings.get("ASCII-8BIT".getBytes());
-                isUnicode = false;
-            }
-        } else if (enc == encodings.get("UTF-32".getBytes()).getEncoding() && end - p > 3) {
-            int c0 = bytes[p] & 0xff;
-            int c1 = bytes[p + 1] & 0xff;
-            int c2 = bytes[p + 2] & 0xff;
-            int c3 = bytes[p + 3] & 0xff;
-
-            if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
-                e = encodings.get("UTF-32BE".getBytes());
-            } else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
-                e = encodings.get("UTF-32LE".getBytes());
-            } else {
-                e = encodings.get("ASCII-8BIT".getBytes());
-                isUnicode = false;
-            }
-        }
-
-        if (e != null) enc = e.getEncoding();
 
+        if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
+        if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;
+        result.associateEncoding(resultEnc);
         result.cat('"');
+
         int prev = p;
+        Encoding actEnc = EncodingUtils.getActualEncoding(enc, byteList);
+        if (actEnc != enc) {
+            enc = actEnc;
+            if (isUnicode) isUnicode = enc instanceof UnicodeEncoding;
+        }
+
         while (p < end) {
             int cc = 0;
 
@@ -3737,7 +3712,7 @@ private IRubyObject populateCapturesForScan(Ruby runtime, Matcher matcher, int r
     @JRubyMethod(name = "scan", reads = BACKREF, writes = BACKREF)
     public IRubyObject scan19(ThreadContext context, IRubyObject arg, Block block) {
         Ruby runtime = context.runtime;
-        Encoding enc = value.getEncoding();
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
         final Regex pattern, prepared;
         final RubyRegexp regexp;
         final int tuFlags;
@@ -3910,11 +3885,11 @@ private boolean end_with_pCommon(IRubyObject arg) {
 
     private static final ByteList SPACE_BYTELIST = new ByteList(ByteList.plain(" "));
 
-    private IRubyObject justify19(IRubyObject arg0, int jflag) {
-        Ruby runtime = getRuntime();
+    /**
+     * Shared implementation of the one-argument ljust/rjust/center: delegates to justifyCommon
+     * with the single-space pad string SPACE_BYTELIST and this string's encoding as returned by
+     * EncodingUtils.STR_ENC_GET.
+     *
+     * @param context current thread context; only its runtime is used here
+     * @param arg0    target width, converted with RubyFixnum.num2int
+     * @param jflag   justification selector; callers in this file pass 'l', 'r' or 'c'
+     * @return the string produced by justifyCommon, carrying this string's code range unless
+     *         that code range is CR_BROKEN
+     */
+    private IRubyObject justify19(ThreadContext context, IRubyObject arg0, int jflag) {
+        Ruby runtime = context.runtime;
         RubyString result = justifyCommon(runtime, SPACE_BYTELIST,
-                                                   1,
-                                                   true, value.getEncoding(), RubyFixnum.num2int(arg0), jflag);
+                1,
+                true, EncodingUtils.STR_ENC_GET(this), RubyFixnum.num2int(arg0), jflag);
         if (getCodeRange() != CR_BROKEN) result.setCodeRange(getCodeRange());
         return result;
     }
@@ -4014,7 +3989,7 @@ public IRubyObject ljust(IRubyObject arg0, IRubyObject arg1) {
 
+    /** One-argument {@code ljust}: calls justify19 with the 'l' flag and the runtime's current thread context. */
     @JRubyMethod(name = "ljust")
     public IRubyObject ljust19(IRubyObject arg0) {
-        return justify19(arg0, 'l');
+        return justify19(getRuntime().getCurrentContext(), arg0, 'l');
     }
 
     @JRubyMethod(name = "ljust")
@@ -4035,7 +4010,7 @@ public IRubyObject rjust(IRubyObject arg0, IRubyObject arg1) {
 
+    /** One-argument {@code rjust}: calls justify19 with the 'r' flag and the runtime's current thread context. */
     @JRubyMethod(name = "rjust")
     public IRubyObject rjust19(IRubyObject arg0) {
-        return justify19(arg0, 'r');
+        return justify19(getRuntime().getCurrentContext(), arg0, 'r');
     }
 
     @JRubyMethod(name = "rjust")
@@ -4056,7 +4031,7 @@ public IRubyObject center(IRubyObject arg0, IRubyObject arg1) {
 
+    /** One-argument {@code center}: calls justify19 with the 'c' flag and the runtime's current thread context. */
     @JRubyMethod(name = "center")
     public IRubyObject center19(IRubyObject arg0) {
-        return justify19(arg0, 'c');
+        return justify19(getRuntime().getCurrentContext(), arg0, 'c');
     }
 
     @JRubyMethod(name = "center")
@@ -4148,26 +4123,26 @@ public IRubyObject chop_bang(ThreadContext context) {
     public IRubyObject chop19(ThreadContext context) {
         Ruby runtime = context.runtime;
         if (value.getRealSize() == 0) return newEmptyString(runtime, getMetaClass(), value.getEncoding()).infectBy(this);
-        return makeShared19(runtime, 0, choppedLength19(runtime));
+        return makeShared19(runtime, 0, choppedLength19(runtime, context));
     }
 
+    /**
+     * In-place chop. Runs modifyCheck() first, returns nil when the string is empty, and
+     * otherwise narrows this string with view(0, choppedLength19(...)) and returns it.
+     * The cached code range is cleared unless it is CR_7BIT.
+     */
     @JRubyMethod(name = "chop!")
     public IRubyObject chop_bang19(ThreadContext context) {
         modifyCheck();
         Ruby runtime = context.runtime;
         if (value.getRealSize() == 0) return runtime.getNil();
-        view(0, choppedLength19(runtime));
+        view(0, choppedLength19(runtime, context));
         if (getCodeRange() != CR_7BIT) clearCodeRange();
         return this;
     }
 
-    private int choppedLength19(Ruby runtime) {
+    private int choppedLength19(Ruby runtime, ThreadContext context) {
         int p = value.getBegin();
         int end = p + value.getRealSize();
 
         if (p > end) return 0;
         byte bytes[] = value.getUnsafeBytes();
-        Encoding enc = value.getEncoding();
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
 
         int s = enc.prevCharHead(bytes, p, end, end);
         if (s == -1) return 0;
@@ -4345,7 +4320,7 @@ public IRubyObject lstrip_bang19(ThreadContext context) {
             return runtime.getNil();
         }
 
-        Encoding enc = value.getEncoding();
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
         int s = value.getBegin();
         int end = s + value.getRealSize();
         byte[]bytes = value.getUnsafeBytes();
@@ -4413,8 +4388,9 @@ public IRubyObject rstrip_bang19(ThreadContext context) {
             return runtime.getNil();
         }
 
-        IRubyObject result = singleByteOptimizable(value.getEncoding()) ?
-            singleByteRStrip19(runtime) : multiByteRStrip19(runtime);
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
+        IRubyObject result = singleByteOptimizable(enc) ?
+            singleByteRStrip19(runtime) : multiByteRStrip19(runtime, context);
 
         keepCodeRange();
         return result;
@@ -4438,11 +4414,11 @@ private IRubyObject singleByteRStrip19(Ruby runtime) {
     }
 
     // In 1.9 we strip any combination of \0 and \s
-    private IRubyObject multiByteRStrip19(Ruby runtime) {
+    private IRubyObject multiByteRStrip19(Ruby runtime, ThreadContext context) {
         byte[] bytes = value.getUnsafeBytes();
         int start = value.getBegin();
         int end = start + value.getRealSize();
-        Encoding enc = value.getEncoding();
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
         int endp = end;
         int prev;
         while ((prev = enc.prevCharHead(bytes, start, endp, end)) != -1) {
@@ -4512,17 +4488,20 @@ public IRubyObject count19(ThreadContext context, IRubyObject arg) {
         if (value.getRealSize() == 0) return RubyFixnum.zero(runtime);
 
         RubyString otherStr = arg.convertToString();
+        ByteList otherBL = otherStr.getByteList();
         Encoding enc = checkEncoding(otherStr);
 
-        int c;
-        if (otherStr.value.length() == 1 && enc.isAsciiCompatible() &&
-                ((c = otherStr.value.unsafeBytes()[otherStr.value.getBegin()] & 0xff)) < 0x80 && scanForCodeRange() != CR_BROKEN) {
+        if (otherBL.length() == 1 && enc.isAsciiCompatible() &&
+                enc.isReverseMatchAllowed(otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize()) &&
+                scanForCodeRange() != CR_BROKEN) {
+            int n = 0;
+            int[] len_p = {0};
+            int c = EncodingUtils.encCodepointLength(runtime, otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize(), len_p, enc);
 
             if (value.length() ==0) return RubyFixnum.zero(runtime);
             byte[]bytes = value.unsafeBytes();
             int p = value.getBegin();
             int end = p + value.length();
-            int n = 0;
             while (p < end) {
                 if ((bytes[p++] & 0xff) == c) n++;
             }
@@ -5371,7 +5350,7 @@ private IRubyObject enumerateCodepoints(ThreadContext context, String name, Bloc
         ptrBytes = strByteList.unsafeBytes();
         ptr = strByteList.begin();
         end = ptr + strByteList.getRealSize();
-        enc = str.getEncoding();
+        enc = EncodingUtils.STR_ENC_GET(str);
 
         if (block.isGiven()) {
             if (wantarray) {
@@ -5492,8 +5471,8 @@ public RubySymbol intern19() {
     @JRubyMethod
     public IRubyObject ord(ThreadContext context) {
         Ruby runtime = context.runtime;
-        return RubyFixnum.newFixnum(runtime, codePoint(runtime, value.getEncoding(), value.getUnsafeBytes(), value.getBegin(),
-                                                                value.getBegin() + value.getRealSize()));
+        // Decode over the string's whole byte range using the encoding from STR_ENC_GET, so a
+        // dummy UTF-16/UTF-32 string is read with the byte order its leading BOM indicates.
+        return RubyFixnum.newFixnum(runtime, codePoint(runtime, EncodingUtils.STR_ENC_GET(this), value.getUnsafeBytes(), value.getBegin(),
+                value.getBegin() + value.getRealSize()));
     }
 
     @JRubyMethod
@@ -5680,9 +5659,9 @@ public IRubyObject ascii_only_p(ThreadContext context) {
     @JRubyMethod
     public IRubyObject b(ThreadContext context) {
         Encoding encoding = ASCIIEncoding.INSTANCE;
-        RubyString dup = (RubyString)dup();
-        dup.associateEncoding(encoding);
-        dup.clearCodeRange();
+        // Work on a copy made with strDup and retag that copy with ASCIIEncoding.
+        // NOTE(review): modify19() is presumably what unshares the copy's buffer and resets its
+        // code range before the encoding changes - confirm against modify19/setEncoding.
+        RubyString dup = strDup(context.runtime);
+        dup.modify19();
+        dup.setEncoding(encoding);
         return dup;
     }
 
@@ -5803,7 +5782,7 @@ public IRubyObject strScrub(ThreadContext context, IRubyObject repl, Block block
         if (cr == CR_7BIT || cr == CR_VALID)
             return context.nil;
 
-        enc = getEncoding();
+        enc = EncodingUtils.STR_ENC_GET(this);
         if (!repl.isNil()) {
             repl = EncodingUtils.strCompatAndValid(context, repl, enc);
         }

diff --git a/core/src/main/java/org/jruby/util/io/EncodingUtils.java b/core/src/main/java/org/jruby/util/io/EncodingUtils.java
@@ -4,7 +4,6 @@
 import org.jcodings.EncodingDB;
 import org.jcodings.Ptr;
 import org.jcodings.specific.ASCIIEncoding;
-import org.jcodings.specific.USASCIIEncoding;
 import org.jcodings.specific.UTF16BEEncoding;
 import org.jcodings.specific.UTF16LEEncoding;
 import org.jcodings.specific.UTF32BEEncoding;
@@ -16,6 +15,7 @@
 import org.jcodings.transcode.Transcoder;
 import org.jcodings.transcode.TranscoderDB;
 import org.jcodings.transcode.Transcoding;
+import org.jcodings.util.CaseInsensitiveBytesHash;
 import org.jruby.Ruby;
 import org.jruby.RubyArray;
 import org.jruby.RubyBasicObject;
@@ -24,7 +24,6 @@
 import org.jruby.RubyFixnum;
 import org.jruby.RubyHash;
 import org.jruby.RubyIO;
-import org.jruby.RubyInteger;
 import org.jruby.RubyMethod;
 import org.jruby.RubyNumeric;
 import org.jruby.RubyProc;
@@ -1810,17 +1809,59 @@ public static IRubyObject strCompatAndValid(ThreadContext context, IRubyObject _
             throw context.runtime.newArgumentError("replacement must be valid byte sequence '" + str + "'");
         }
         else if (cr == StringSupport.CR_7BIT) {
-            Encoding e = str.getEncoding();
+            Encoding e = STR_ENC_GET(str);
             if (!enc.isAsciiCompatible()) {
                 throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
             }
         }
         else { /* ENC_CODERANGE_VALID */
-            Encoding e = str.getEncoding();
+            Encoding e = STR_ENC_GET(str);
             if (enc != e) {
                 throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
             }
         }
         return str;
     }
+
+    // MRI: get_encoding
+    /**
+     * Returns the encoding to use when interpreting the bytes of {@code str}: its own
+     * encoding passed through {@link #getActualEncoding(Encoding, ByteList)}.
+     *
+     * @param str the bytes whose effective encoding is wanted
+     * @return the result of getActualEncoding for str's encoding and content
+     */
+    public static Encoding getEncoding(ByteList str) {
+        return getActualEncoding(str.getEncoding(), str);
+    }
+
+    // MRI: get_actual_encoding
+    /**
+     * Resolves the "UTF-16" and "UTF-32" table encodings to a concrete encoding by looking at
+     * the byte order mark at the start of {@code byteList}.
+     *
+     * <ul>
+     * <li>UTF-16 with at least 2 bytes: FE FF gives UTF-16BE, FF FE gives UTF-16LE, anything
+     *     else gives ASCIIEncoding.</li>
+     * <li>UTF-32 with at least 4 bytes: 00 00 FE FF gives UTF-32BE, FF FE 00 00 gives UTF-32LE,
+     *     anything else gives ASCIIEncoding.</li>
+     * <li>In every other case {@code enc} is returned unchanged.</li>
+     * </ul>
+     *
+     * @param enc      the nominal encoding of the bytes
+     * @param byteList the bytes to inspect; only the first two or four are read
+     * @return the concrete encoding as described above
+     */
+    public static Encoding getActualEncoding(Encoding enc, ByteList byteList) {
+        byte[] bytes = byteList.unsafeBytes();
+        int p = byteList.begin();
+        int end = p + byteList.getRealSize();
+
+        // Both BOM checks need at least two bytes, so short input skips the table lookups entirely.
+        if (end - p < 2) return enc;
+
+        // ByteList.plain narrows each char to a byte, so the lookup keys are the same on every
+        // platform; String.getBytes() would encode them with the platform default charset.
+        CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = EncodingDB.getEncodings();
+        if (enc == encodings.get(ByteList.plain("UTF-16")).getEncoding()) {
+            int c0 = bytes[p] & 0xff;
+            int c1 = bytes[p + 1] & 0xff;
+
+            if (c0 == 0xFE && c1 == 0xFF) {
+                return UTF16BEEncoding.INSTANCE;
+            } else if (c0 == 0xFF && c1 == 0xFE) {
+                return UTF16LEEncoding.INSTANCE;
+            }
+            return ASCIIEncoding.INSTANCE;
+        } else if (enc == encodings.get(ByteList.plain("UTF-32")).getEncoding() && end - p >= 4) {
+            int c0 = bytes[p] & 0xff;
+            int c1 = bytes[p + 1] & 0xff;
+            int c2 = bytes[p + 2] & 0xff;
+            int c3 = bytes[p + 3] & 0xff;
+
+            if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
+                return UTF32BEEncoding.INSTANCE;
+            } else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
+                return UTF32LEEncoding.INSTANCE;
+            }
+            return ASCIIEncoding.INSTANCE;
+        }
+        return enc;
+    }
+
+    /**
+     * Convenience form of {@link #getEncoding(ByteList)} for a RubyString: applies it to the
+     * string's ByteList.
+     *
+     * @param str the string whose effective encoding is wanted
+     * @return the result of getEncoding for str's ByteList
+     */
+    public static Encoding STR_ENC_GET(RubyString str) {
+        return getEncoding(str.getByteList());
+    }
 }
diff --git a/test/mri/excludes/TestM17N.rb b/test/mri/excludes/TestM17N.rb
@@ -1,4 +1,3 @@
-exclude :test_chr, "needs investigation"
 exclude :test_delete, "needs investigation"
 exclude :test_dynamic_eucjp_regexp, "needs investigation"
 exclude :test_dynamic_sjis_regexp, "needs investigation"
@@ -15,21 +14,11 @@
 exclude :test_regexp_unicode, "needs investigation"
 exclude :test_regexp_usascii, "needs investigation"
 exclude :test_scrub, ""
-exclude :test_scrub_bang, ""
-exclude :test_count_sjis_trailing_byte, "needs investigation"
 exclude :test_sprintf_c, "format string encoding should be used to decode incoming fixnums"
 exclude :test_str_concat, "needs investigation"
 exclude :test_string_inspect_encoding, "needs investigation"
 exclude :test_string_mixed_unicode, "needs investigation"
 exclude :test_symbol, "needs investigation"
-exclude :test_symbol_op, "random operators setting encoding earlier"
+exclude :test_symbol_op, "some symbols are created early and do not have UTF-8 encoding"
 exclude :test_union_1_nonascii_string, "needs investigation"
-exclude :'test_utf_16_32_codepoints(UTF-16BE)', "needs investigation"
-exclude :'test_utf_16_32_codepoints(UTF-16LE)', "needs investigation"
-exclude :'test_utf_16_32_codepoints(UTF-32BE)', "needs investigation"
-exclude :'test_utf_16_32_codepoints(UTF-32LE)', "needs investigation"
-exclude :'test_utf_16_32_ord(UTF-16BE)', "needs investigation"
-exclude :'test_utf_16_32_ord(UTF-16LE)', "needs investigation"
-exclude :'test_utf_16_32_ord(UTF-32BE)', "needs investigation"
-exclude :'test_utf_16_32_ord(UTF-32LE)', "needs investigation"
 exclude :test_valid_encoding, "needs investigation"