Merge branch 'test-encoding-fixes' into string-quagmire

Conflicts: core/src/main/java/org/jruby/RubyString.java
jruby · Mar 19, 2015 · 436dd30 · 436dd30
2 parents fe821de + 3ffcfa9
commit 436dd30
Showing 5 changed files with 106 additions and 91 deletions.
diff --git a/core/src/main/java/org/jruby/RubyInteger.java b/core/src/main/java/org/jruby/RubyInteger.java
@@ -348,26 +348,33 @@ public RubyString chr19(ThreadContext context, IRubyObject arg) {
         if (enc == ASCIIEncoding.INSTANCE && value >= 0x80) {
             return chr19(context);
         }
-        return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, (int)value), enc, 0);
+        return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, value), enc, 0);
     }
 
-    private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, int value) {
+    private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, long value) {
         int n;
         try {
-            n = value < 0 ? 0 : enc.codeToMbcLength(value);
+            n = value < 0 ? 0 : enc.codeToMbcLength((int)value);
         } catch (EncodingException ee) {
             n = 0;
         }
 
         if (n <= 0) throw runtime.newRangeError(this.toString() + " out of char range");
 
         ByteList bytes = new ByteList(n);
-
+
+        boolean ok = false;
         try {
-            enc.codeToMbc(value, bytes.getUnsafeBytes(), 0);
+            enc.codeToMbc((int)value, bytes.getUnsafeBytes(), 0);
+            ok = StringSupport.preciseLength(enc, bytes.unsafeBytes(), 0, n) == n;
         } catch (EncodingException e) {
+            // ok = false, fall through
+        }
+
+        if (!ok) {
             throw runtime.newRangeError("invalid codepoint " + String.format("0x%x in ", value) + enc.getCharsetName());
         }
+
         bytes.setRealSize(n);
         return bytes;
     }

diff --git a/core/src/main/java/org/jruby/RubyString.java b/core/src/main/java/org/jruby/RubyString.java
@@ -40,7 +40,6 @@
 
 import jnr.posix.POSIX;
 import org.jcodings.Encoding;
-import org.jcodings.EncodingDB;
 import org.jcodings.ascii.AsciiTables;
 import org.jcodings.exception.EncodingException;
 import org.jcodings.specific.ASCIIEncoding;
@@ -50,7 +49,7 @@
 import org.jcodings.specific.UTF32BEEncoding;
 import org.jcodings.specific.UTF32LEEncoding;
 import org.jcodings.specific.UTF8Encoding;
-import org.jcodings.util.CaseInsensitiveBytesHash;
+import org.jcodings.unicode.UnicodeEncoding;
 import org.jcodings.util.IntHash;
 import org.joni.Matcher;
 import org.joni.Option;
@@ -259,14 +258,14 @@ private void copyCodeRange(RubyString from) {
     public final int scanForCodeRange() {
         int cr = getCodeRange();
         if (cr == CR_UNKNOWN) {
-            cr = codeRangeScan(value.getEncoding(), value);
+            cr = codeRangeScan(EncodingUtils.getActualEncoding(getEncoding(), value), value);
             setCodeRange(cr);
         }
         return cr;
     }
 
     final boolean singleByteOptimizable() {
-        return StringSupport.isSingleByteOptimizable(this, value.getEncoding());
+        return StringSupport.isSingleByteOptimizable(this, EncodingUtils.STR_ENC_GET(this));
     }
 
     final boolean singleByteOptimizable(Encoding enc) {
@@ -2036,54 +2035,30 @@ public IRubyObject inspect19() {
     }
 
     public static IRubyObject inspect19(Ruby runtime, ByteList byteList) {
+        ThreadContext context = runtime.getCurrentContext();
+
+        Encoding enc = byteList.getEncoding();
         byte bytes[] = byteList.getUnsafeBytes();
         int p = byteList.getBegin();
         int end = p + byteList.getRealSize();
         RubyString result = new RubyString(runtime, runtime.getString(), new ByteList(end - p));
-        Encoding enc = byteList.getEncoding();
-
         Encoding resultEnc = runtime.getDefaultInternalEncoding();
-        if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
-        if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;
-        result.associateEncoding(resultEnc);
-
         boolean isUnicode = StringSupport.isUnicode(enc);
         boolean asciiCompat = enc.isAsciiCompatible();
 
-        EncodingDB.Entry e = null;
-        CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = runtime.getEncodingService().getEncodings();
-        if (enc == encodings.get("UTF-16".getBytes()).getEncoding() && end - p > 1) {
-            int c0 = bytes[p] & 0xff;
-            int c1 = bytes[p + 1] & 0xff;
-
-            if (c0 == 0xFE && c1 == 0xFF) {
-                e = encodings.get("UTF-16BE".getBytes());
-            } else if (c0 == 0xFF && c1 == 0xFE) {
-                e = encodings.get("UTF-16LE".getBytes());
-            } else {
-                e = encodings.get("ASCII-8BIT".getBytes());
-                isUnicode = false;
-            }
-        } else if (enc == encodings.get("UTF-32".getBytes()).getEncoding() && end - p > 3) {
-            int c0 = bytes[p] & 0xff;
-            int c1 = bytes[p + 1] & 0xff;
-            int c2 = bytes[p + 2] & 0xff;
-            int c3 = bytes[p + 3] & 0xff;
-
-            if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
-                e = encodings.get("UTF-32BE".getBytes());
-            } else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
-                e = encodings.get("UTF-32LE".getBytes());
-            } else {
-                e = encodings.get("ASCII-8BIT".getBytes());
-                isUnicode = false;
-            }
-        }
-
-        if (e != null) enc = e.getEncoding();
 
+        if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
+        if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;
+        result.associateEncoding(resultEnc);
         result.cat('"');
+
         int prev = p;
+        Encoding actEnc = EncodingUtils.getActualEncoding(enc, byteList);
+        if (actEnc != enc) {
+            enc = actEnc;
+            if (isUnicode) isUnicode = enc instanceof UnicodeEncoding;
+        }
+
         while (p < end) {
             int cc = 0;
 
@@ -3708,7 +3683,7 @@ private IRubyObject populateCapturesForScan(Ruby runtime, Matcher matcher, int r
     @JRubyMethod(name = "scan", reads = BACKREF, writes = BACKREF)
     public IRubyObject scan19(ThreadContext context, IRubyObject arg, Block block) {
         Ruby runtime = context.runtime;
-        Encoding enc = value.getEncoding();
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
         final Regex pattern, prepared;
         final RubyRegexp regexp;
         final int tuFlags;
@@ -3881,11 +3856,11 @@ private boolean end_with_pCommon(IRubyObject arg) {
 
     private static final ByteList SPACE_BYTELIST = new ByteList(ByteList.plain(" "));
 
-    private IRubyObject justify19(IRubyObject arg0, int jflag) {
-        Ruby runtime = getRuntime();
+    private IRubyObject justify19(ThreadContext context, IRubyObject arg0, int jflag) {
+        Ruby runtime = context.runtime;
         RubyString result = justifyCommon(runtime, SPACE_BYTELIST,
-                                                   1,
-                                                   true, value.getEncoding(), RubyFixnum.num2int(arg0), jflag);
+                1,
+                true, EncodingUtils.STR_ENC_GET(this), RubyFixnum.num2int(arg0), jflag);
         if (getCodeRange() != CR_BROKEN) result.setCodeRange(getCodeRange());
         return result;
     }
@@ -3985,7 +3960,7 @@ public IRubyObject ljust(IRubyObject arg0, IRubyObject arg1) {
 
     @JRubyMethod(name = "ljust")
     public IRubyObject ljust19(IRubyObject arg0) {
-        return justify19(arg0, 'l');
+        return justify19(getRuntime().getCurrentContext(), arg0, 'l');
     }
 
     @JRubyMethod(name = "ljust")
@@ -4006,7 +3981,7 @@ public IRubyObject rjust(IRubyObject arg0, IRubyObject arg1) {
 
     @JRubyMethod(name = "rjust")
     public IRubyObject rjust19(IRubyObject arg0) {
-        return justify19(arg0, 'r');
+        return justify19(getRuntime().getCurrentContext(), arg0, 'r');
     }
 
     @JRubyMethod(name = "rjust")
@@ -4027,7 +4002,7 @@ public IRubyObject center(IRubyObject arg0, IRubyObject arg1) {
 
     @JRubyMethod(name = "center")
     public IRubyObject center19(IRubyObject arg0) {
-        return justify19(arg0, 'c');
+        return justify19(getRuntime().getCurrentContext(), arg0, 'c');
     }
 
     @JRubyMethod(name = "center")
@@ -4296,7 +4271,7 @@ public IRubyObject lstrip_bang19(ThreadContext context) {
             return runtime.getNil();
         }
 
-        Encoding enc = value.getEncoding();
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
         int s = value.getBegin();
         int end = s + value.getRealSize();
         byte[]bytes = value.getUnsafeBytes();
@@ -4364,8 +4339,9 @@ public IRubyObject rstrip_bang19(ThreadContext context) {
             return runtime.getNil();
         }
 
-        IRubyObject result = singleByteOptimizable(value.getEncoding()) ?
-            singleByteRStrip19(runtime) : multiByteRStrip19(runtime);
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
+        IRubyObject result = singleByteOptimizable(enc) ?
+            singleByteRStrip19(runtime) : multiByteRStrip19(runtime, context);
 
         keepCodeRange();
         return result;
@@ -4389,11 +4365,11 @@ private IRubyObject singleByteRStrip19(Ruby runtime) {
     }
 
     // In 1.9 we strip any combination of \0 and \s
-    private IRubyObject multiByteRStrip19(Ruby runtime) {
+    private IRubyObject multiByteRStrip19(Ruby runtime, ThreadContext context) {
         byte[] bytes = value.getUnsafeBytes();
         int start = value.getBegin();
         int end = start + value.getRealSize();
-        Encoding enc = value.getEncoding();
+        Encoding enc = EncodingUtils.STR_ENC_GET(this);
         int endp = end;
         int prev;
         while ((prev = enc.prevCharHead(bytes, start, endp, end)) != -1) {
@@ -4463,17 +4439,20 @@ public IRubyObject count19(ThreadContext context, IRubyObject arg) {
         if (value.getRealSize() == 0) return RubyFixnum.zero(runtime);
 
         RubyString otherStr = arg.convertToString();
+        ByteList otherBL = otherStr.getByteList();
         Encoding enc = checkEncoding(otherStr);
 
-        int c;
-        if (otherStr.value.length() == 1 && enc.isAsciiCompatible() &&
-                ((c = otherStr.value.unsafeBytes()[otherStr.value.getBegin()] & 0xff)) < 0x80 && scanForCodeRange() != CR_BROKEN) {
+        if (otherBL.length() == 1 && enc.isAsciiCompatible() &&
+                enc.isReverseMatchAllowed(otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize()) &&
+                scanForCodeRange() != CR_BROKEN) {
+            int n = 0;
+            int[] len_p = {0};
+            int c = EncodingUtils.encCodepointLength(runtime, otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize(), len_p, enc);
 
             if (value.length() ==0) return RubyFixnum.zero(runtime);
             byte[]bytes = value.unsafeBytes();
             int p = value.getBegin();
             int end = p + value.length();
-            int n = 0;
             while (p < end) {
                 if ((bytes[p++] & 0xff) == c) n++;
             }
@@ -5357,7 +5336,7 @@ private IRubyObject enumerateCodepoints(ThreadContext context, String name, Bloc
         ptrBytes = strByteList.unsafeBytes();
         ptr = strByteList.begin();
         end = ptr + strByteList.getRealSize();
-        enc = str.getEncoding();
+        enc = EncodingUtils.STR_ENC_GET(str);
 
         if (block.isGiven()) {
             if (wantarray) {
@@ -5478,8 +5457,8 @@ public RubySymbol intern19() {
     @JRubyMethod
     public IRubyObject ord(ThreadContext context) {
         Ruby runtime = context.runtime;
-        return RubyFixnum.newFixnum(runtime, codePoint(runtime, value.getEncoding(), value.getUnsafeBytes(), value.getBegin(),
-                                                                value.getBegin() + value.getRealSize()));
+        return RubyFixnum.newFixnum(runtime, codePoint(runtime, EncodingUtils.STR_ENC_GET(this), value.getUnsafeBytes(), value.getBegin(),
+                value.getBegin() + value.getRealSize()));
     }
 
     @JRubyMethod
@@ -5789,7 +5768,7 @@ public IRubyObject strScrub(ThreadContext context, IRubyObject repl, Block block
         if (cr == CR_7BIT || cr == CR_VALID)
             return context.nil;
 
-        enc = getEncoding();
+        enc = EncodingUtils.STR_ENC_GET(this);
         if (!repl.isNil()) {
             repl = EncodingUtils.strCompatAndValid(context, repl, enc);
         }

diff --git a/core/src/main/java/org/jruby/parser/ParserSupport.java b/core/src/main/java/org/jruby/parser/ParserSupport.java
@@ -46,6 +46,7 @@
 import org.jruby.ast.types.INameNode;
 import org.jruby.common.IRubyWarnings;
 import org.jruby.common.IRubyWarnings.ID;
+import org.jruby.exceptions.RaiseException;
 import org.jruby.lexer.yacc.ISourcePosition;
 import org.jruby.lexer.yacc.ISourcePositionHolder;
 import org.jruby.lexer.yacc.RubyLexer;
@@ -1222,8 +1223,13 @@ public Node arg_append(Node node1, Node node2) {
     // MRI: reg_fragment_check
     public void regexpFragmentCheck(RegexpNode end, ByteList value) {
         setRegexpEncoding(end, value);
-        RubyRegexp.preprocessCheck(configuration.getRuntime(), value);
+        try {
+            RubyRegexp.preprocessCheck(configuration.getRuntime(), value);
+        } catch (RaiseException re) {
+            compile_error(re.getMessage());
+        }
     }        // 1.9 mode overrides to do extra checking...
+
     private List<Integer> allocateNamedLocals(RegexpNode regexpNode) {
         RubyRegexp pattern = RubyRegexp.newRegexp(configuration.getRuntime(), regexpNode.getValue(), regexpNode.getOptions());
         pattern.setLiteral();

diff --git a/core/src/main/java/org/jruby/util/io/EncodingUtils.java b/core/src/main/java/org/jruby/util/io/EncodingUtils.java
@@ -4,7 +4,6 @@
 import org.jcodings.EncodingDB;
 import org.jcodings.Ptr;
 import org.jcodings.specific.ASCIIEncoding;
-import org.jcodings.specific.USASCIIEncoding;
 import org.jcodings.specific.UTF16BEEncoding;
 import org.jcodings.specific.UTF16LEEncoding;
 import org.jcodings.specific.UTF32BEEncoding;
@@ -16,6 +15,7 @@
 import org.jcodings.transcode.Transcoder;
 import org.jcodings.transcode.TranscoderDB;
 import org.jcodings.transcode.Transcoding;
+import org.jcodings.util.CaseInsensitiveBytesHash;
 import org.jruby.Ruby;
 import org.jruby.RubyArray;
 import org.jruby.RubyBasicObject;
@@ -24,7 +24,6 @@
 import org.jruby.RubyFixnum;
 import org.jruby.RubyHash;
 import org.jruby.RubyIO;
-import org.jruby.RubyInteger;
 import org.jruby.RubyMethod;
 import org.jruby.RubyNumeric;
 import org.jruby.RubyProc;
@@ -1810,17 +1809,59 @@ public static IRubyObject strCompatAndValid(ThreadContext context, IRubyObject _
             throw context.runtime.newArgumentError("replacement must be valid byte sequence '" + str + "'");
         }
         else if (cr == StringSupport.CR_7BIT) {
-            Encoding e = str.getEncoding();
+            Encoding e = STR_ENC_GET(str);
             if (!enc.isAsciiCompatible()) {
                 throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
             }
         }
         else { /* ENC_CODERANGE_VALID */
-            Encoding e = str.getEncoding();
+            Encoding e = STR_ENC_GET(str);
             if (enc != e) {
                 throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
             }
         }
         return str;
     }
+
+    /**
+     * Looks up the effective encoding for a byte list. MRI: get_encoding
+     *
+     * @param str the byte list whose encoding is wanted
+     * @return whatever {@link #getActualEncoding(Encoding, ByteList)} yields for the
+     *         byte list's own declared encoding
+     */
+    public static Encoding getEncoding(ByteList str) {
+        final Encoding declared = str.getEncoding();
+        return getActualEncoding(declared, str);
+    }
+
+    /**
+     * Resolves the generic (byte-order-mark dependent) UTF-16 or UTF-32 encoding to the
+     * concrete encoding indicated by the leading bytes of the content.
+     * MRI: get_actual_encoding
+     *
+     * @param enc the declared encoding of the content
+     * @param byteList the content to inspect; at most its first four bytes are read
+     * @return the big- or little-endian UTF-16/UTF-32 encoding matching a leading BOM;
+     *         ASCII-8BIT when the declared encoding is generic UTF-16 (with at least
+     *         two bytes) or generic UTF-32 (with at least four bytes) and no BOM is
+     *         present; otherwise {@code enc} itself
+     */
+    public static Encoding getActualEncoding(Encoding enc, ByteList byteList) {
+        // Only the generic "UTF-16"/"UTF-32" encodings are ever remapped below, and
+        // jcodings registers both of them as dummy encodings. Returning early for
+        // everything else keeps this hot path free of the two byte[] allocations and
+        // two hash lookups that the checks below perform.
+        if (enc == null || !enc.isDummy()) return enc;
+
+        byte[] bytes = byteList.unsafeBytes();
+        int p = byteList.begin();
+        int end = p + byteList.getRealSize();
+
+        CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = EncodingDB.getEncodings();
+        if (enc == encodings.get("UTF-16".getBytes()).getEncoding() && end - p >= 2) {
+            int c0 = bytes[p] & 0xff;
+            int c1 = bytes[p + 1] & 0xff;
+
+            // FE FF is the big-endian BOM, FF FE the little-endian one
+            if (c0 == 0xFE && c1 == 0xFF) {
+                return UTF16BEEncoding.INSTANCE;
+            } else if (c0 == 0xFF && c1 == 0xFE) {
+                return UTF16LEEncoding.INSTANCE;
+            }
+            // no BOM: the content cannot be decoded as UTF-16, treat it as raw bytes
+            return ASCIIEncoding.INSTANCE;
+        } else if (enc == encodings.get("UTF-32".getBytes()).getEncoding() && end - p >= 4) {
+            int c0 = bytes[p] & 0xff;
+            int c1 = bytes[p + 1] & 0xff;
+            int c2 = bytes[p + 2] & 0xff;
+            int c3 = bytes[p + 3] & 0xff;
+
+            // 00 00 FE FF is the big-endian BOM, FF FE 00 00 the little-endian one
+            if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
+                return UTF32BEEncoding.INSTANCE;
+            } else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
+                return UTF32LEEncoding.INSTANCE;
+            }
+            // no BOM: the content cannot be decoded as UTF-32, treat it as raw bytes
+            return ASCIIEncoding.INSTANCE;
+        }
+        // too short to carry a BOM, or some other dummy encoding: leave untouched
+        return enc;
+    }
+
+    /**
+     * Convenience form of {@link #getEncoding(ByteList)} for a string object.
+     *
+     * @param str the string whose effective encoding is wanted
+     * @return the result of {@link #getEncoding(ByteList)} for the string's byte list
+     */
+    public static Encoding STR_ENC_GET(RubyString str) {
+        final ByteList bytes = str.getByteList();
+        return getEncoding(bytes);
+    }
 }
diff --git a/test/mri/excludes/TestM17N.rb b/test/mri/excludes/TestM17N.rb
@@ -1,35 +1,17 @@
-exclude :test_chr, "needs investigation"
 exclude :test_delete, "needs investigation"
-exclude :test_dynamic_eucjp_regexp, "needs investigation"
-exclude :test_dynamic_sjis_regexp, "needs investigation"
-exclude :test_dynamic_utf8_regexp, "needs investigation"
 exclude :test_end_with, "needs investigation"
 exclude :test_euc_tw, "needs investigation"
 exclude :test_force_encoding, "needs investigation"
 exclude :test_nonascii_method_name, "needs investigation"
 exclude :test_object_inspect_external, "needs investigation"
 exclude :test_object_utf16_32_inspect, "needs investigation"
 exclude :test_regexp_ascii, "needs investigation"
-exclude :test_regexp_mixed_unicode, "needs investigation"
-exclude :test_regexp_too_short_multibyte_character, "needs investigation"
-exclude :test_regexp_unicode, "needs investigation"
-exclude :test_regexp_usascii, "needs investigation"
 exclude :test_scrub, ""
-exclude :test_scrub_bang, ""
-exclude :test_count_sjis_trailing_byte, "needs investigation"
 exclude :test_sprintf_c, "format string encoding should be used to decode incoming fixnums"
 exclude :test_str_concat, "needs investigation"
 exclude :test_string_inspect_encoding, "needs investigation"
 exclude :test_string_mixed_unicode, "needs investigation"
 exclude :test_symbol, "needs investigation"
-exclude :test_symbol_op, "random operators setting encoding earlier"
+exclude :test_symbol_op, "some symbols are created early and do not have UTF-8 encoding"
 exclude :test_union_1_nonascii_string, "needs investigation"
-exclude :'test_utf_16_32_codepoints(UTF-16BE)', "needs investigation"
-exclude :'test_utf_16_32_codepoints(UTF-16LE)', "needs investigation"
-exclude :'test_utf_16_32_codepoints(UTF-32BE)', "needs investigation"
-exclude :'test_utf_16_32_codepoints(UTF-32LE)', "needs investigation"
-exclude :'test_utf_16_32_ord(UTF-16BE)', "needs investigation"
-exclude :'test_utf_16_32_ord(UTF-16LE)', "needs investigation"
-exclude :'test_utf_16_32_ord(UTF-32BE)', "needs investigation"
-exclude :'test_utf_16_32_ord(UTF-32LE)', "needs investigation"
 exclude :test_valid_encoding, "needs investigation"