Introduced a RegexpSupport to house shared regexp utility methods.

jruby · Jun 2, 2015 · 2f7aebd · 2f7aebd
1 parent 0430387
commit 2f7aebd
Showing 3 changed files with 510 additions and 471 deletions.
diff --git a/core/src/main/java/org/jruby/RubyRegexp.java b/core/src/main/java/org/jruby/RubyRegexp.java
@@ -68,6 +68,7 @@
 import org.jruby.util.KCode;
 import org.jruby.util.Pack;
 import org.jruby.util.RegexpOptions;
+import org.jruby.util.RegexpSupport;
 import org.jruby.util.Sprintf;
 import org.jruby.util.StringSupport;
 import org.jruby.util.TypeConverter;
@@ -149,7 +150,7 @@ private static Regex makeRegexp(Ruby runtime, ByteList bytes, RegexpOptions opti
             int p = bytes.getBegin();
             return new Regex(bytes.getUnsafeBytes(), p, p + bytes.getRealSize(), options.toJoniOptions(), enc, Syntax.DEFAULT, runtime.getWarnings());
         } catch (Exception e) {
-            raiseRegexpError19(runtime, bytes, enc, options, e.getMessage());
+            RegexpSupport.raiseRegexpError19(runtime, bytes, enc, options, e.getMessage());
             return null; // not reached
         }
     }
@@ -184,10 +185,10 @@ static Regex getQuotedRegexpFromCache19(Ruby runtime, ByteList bytes, RegexpOpti
         return regex;
     }
 
-    private static Regex getPreprocessedRegexpFromCache(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, ErrorMode mode) {
+    private static Regex getPreprocessedRegexpFromCache(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, RegexpSupport.ErrorMode mode) {
         Regex regex = preprocessedPatternCache.get(bytes);
         if (regex != null && regex.getEncoding() == enc && regex.getOptions() == options.toJoniOptions()) return regex;
-        ByteList preprocessed = preprocess(runtime, bytes, enc, new Encoding[]{null}, ErrorMode.RAISE);
+        ByteList preprocessed = RegexpSupport.preprocess(runtime, bytes, enc, new Encoding[]{null}, RegexpSupport.ErrorMode.RAISE);
         regex = makeRegexp(runtime, preprocessed, options, enc);
         regex.setUserObject(preprocessed);
         preprocessedPatternCache.put(bytes, regex);
@@ -443,359 +444,9 @@ public final Regex preparePattern(RubyString str) {
         check();
         Encoding enc = checkEncoding(str, true);
         if (enc == pattern.getEncoding()) return pattern;
-        return getPreprocessedRegexpFromCache(getRuntime(), this.str, enc, options, ErrorMode.PREPROCESS);
+        return getPreprocessedRegexpFromCache(getRuntime(), this.str, enc, options, RegexpSupport.ErrorMode.PREPROCESS);
     }
 
-    private enum ErrorMode {RAISE, PREPROCESS, DESC}
-
-    private static int raisePreprocessError(Ruby runtime, ByteList str, String err, ErrorMode mode) {
-        switch (mode) {
-        case RAISE:
-            raiseRegexpError19(runtime, str, str.getEncoding(), RegexpOptions.NULL_OPTIONS, err);
-        case PREPROCESS:
-            throw runtime.newArgumentError("regexp preprocess failed: " + err);
-        case DESC:
-            // silent ?
-        }
-        return 0;
-    }
-
-    private static int readEscapedByte(Ruby runtime, byte[]to, int toP, byte[]bytes, int p, int end, ByteList str, ErrorMode mode) {
-        if (p == end || bytes[p++] != (byte)'\\') raisePreprocessError(runtime, str, "too short escaped multibyte character", mode);
-
-        boolean metaPrefix = false, ctrlPrefix = false;
-        int code = 0;
-        while (true) {
-            if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
-
-            switch (bytes[p++]) {
-            case '\\': code = '\\'; break;
-            case 'n': code = '\n'; break;
-            case 't': code = '\t'; break;
-            case 'r': code = '\r'; break;
-            case 'f': code = '\f'; break;
-            case 'v': code = '\013'; break;
-            case 'a': code = '\007'; break;
-            case 'e': code = '\033'; break;
-
-            /* \OOO */
-            case '0': case '1': case '2': case '3':
-            case '4': case '5': case '6': case '7':
-                p--;
-                int olen = end < p + 3 ? end - p : 3;
-                code = StringSupport.scanOct(bytes, p, olen);
-                p += StringSupport.octLength(bytes, p, olen);
-                break;
-
-            case 'x': /* \xHH */
-                int hlen = end < p + 2 ? end - p : 2;
-                code = StringSupport.scanHex(bytes, p, hlen);
-                int len = StringSupport.hexLength(bytes, p, hlen);
-                if (len < 1) raisePreprocessError(runtime, str, "invalid hex escape", mode);
-                p += len;
-                break;
-
-            case 'M': /* \M-X, \M-\C-X, \M-\cX */
-                if (metaPrefix) raisePreprocessError(runtime, str, "duplicate meta escape", mode);
-                metaPrefix = true;
-                if (p + 1 < end && bytes[p++] == (byte)'-' && (bytes[p] & 0x80) == 0) {
-                    if (bytes[p] == (byte)'\\') {
-                        p++;
-                        continue;
-                    } else {
-                        code = bytes[p++] & 0xff;
-                        break;
-                    }
-                }
-                raisePreprocessError(runtime, str, "too short meta escape", mode);
-
-            case 'C': /* \C-X, \C-\M-X */
-                if (p == end || bytes[p++] != (byte)'-') raisePreprocessError(runtime, str, "too short control escape", mode);
-
-            case 'c': /* \cX, \c\M-X */
-                if (ctrlPrefix) raisePreprocessError(runtime, str, "duplicate control escape", mode);
-                ctrlPrefix = true;
-                if (p < end && (bytes[p] & 0x80) == 0) {
-                    if (bytes[p] == (byte)'\\') {
-                        p++;
-                        continue;
-                    } else {
-                        code = bytes[p++] & 0xff;
-                        break;
-                    }
-                }
-                raisePreprocessError(runtime, str, "too short control escape", mode);
-            default:
-                raisePreprocessError(runtime, str, "unexpected escape sequence", mode);
-            } // switch
-
-            if (code < 0 || code > 0xff) raisePreprocessError(runtime, str, "invalid escape code", mode);
-
-            if (ctrlPrefix) code &= 0x1f;
-            if (metaPrefix) code |= 0x80;
-
-            to[toP] = (byte)code;
-            return p;
-        } // while
-    }
-
-    /**
-     * Unescape escaped non-ascii character at start position, appending all
-     * to the given bytelist if provided.
-     * 
-     * @param runtime current runtime
-     * @param to output bytelist; if null, no appending will be done
-     * @param bytes incoming bytes
-     * @param p start position
-     * @param end end position
-     * @param enc bytes' encoding
-     * @param encp out param for fixed encoding
-     * @param str original bytes wrapper
-     * @param mode error mode
-     * @return new position after performing unescaping
-     */
-    // MRI: unescape_escaped_nonascii
-    private static int unescapeEscapedNonAscii(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding enc, Encoding[]encp, ByteList str, ErrorMode mode) {
-        byte[]chBuf = new byte[enc.maxLength()];
-        int chLen = 0;
-
-        p = readEscapedByte(runtime, chBuf, chLen++, bytes, p, end, str, mode);
-        while (chLen < enc.maxLength() && StringSupport.MBCLEN_NEEDMORE_P(StringSupport.preciseLength(enc, chBuf, 0, chLen))) {
-            p = readEscapedByte(runtime, chBuf, chLen++, bytes, p, end, str, mode);
-        }
-
-        int cl = StringSupport.preciseLength(enc, chBuf, 0, chLen);
-        if (cl == -1) {
-            raisePreprocessError(runtime, str, "invalid multibyte escape", mode); // MBCLEN_INVALID_P
-        }
-
-        if (chLen > 1 || (chBuf[0] & 0x80) != 0) {
-            if (to != null) to.append(chBuf, 0, chLen);
-
-            if (encp[0] == null) {
-                encp[0] = enc;
-            } else if (encp[0] != enc) {
-                raisePreprocessError(runtime, str, "escaped non ASCII character in UTF-8 regexp", mode);
-            }
-        } else {
-            if (to != null) Sprintf.sprintf(runtime, to, "\\x%02X", chBuf[0] & 0xff);
-        }
-        return p;
-    }
-
-    private static void checkUnicodeRange(Ruby runtime, int code, ByteList str, ErrorMode mode) {
-        // Unicode code points are at most 21 bits long, so an int is enough
-        if ((0xd800 <= code && code <= 0xdfff) /* Surrogates */ || 0x10ffff < code) {
-            raisePreprocessError(runtime, str, "invalid Unicode range", mode);
-        }
-    }
-
-    /**
-     * Append the given utf8 characters to the buffer, if given, checking for
-     * errors along the way.
-     * 
-     * @param runtime current runtime
-     * @param to output buffer; if null, no appending will be done
-     * @param code utf8 character code
-     * @param enc output param for new encoding
-     * @param str original wrapper of source bytes
-     * @param mode error mode
-     */
-    private static void appendUtf8(Ruby runtime, ByteList to, int code, Encoding[] enc, ByteList str, ErrorMode mode) {
-        checkUnicodeRange(runtime, code, str, mode);
-
-        if (code < 0x80) {
-            if (to != null) Sprintf.sprintf(runtime, to, "\\x%02X", code);
-        } else {
-            if (to != null) {
-                to.ensure(to.getRealSize() + 6);
-                to.setRealSize(to.getRealSize() + Pack.utf8Decode(runtime, to.getUnsafeBytes(), to.getBegin() + to.getRealSize(), code));
-            }
-            if (enc[0] == null) {
-                enc[0] = UTF8Encoding.INSTANCE;
-            } else if (!(enc[0] instanceof UTF8Encoding)) { // do not load the class if not used
-                raisePreprocessError(runtime, str, "UTF-8 character in non UTF-8 regexp", mode);
-            }
-        }
-    }
-
-    /**
-     * Unescape unicode characters at given offset, appending to the given
-     * out buffer if provided.
-     * 
-     * @param runtime current runtime
-     * @param to output buffer; if null, no appending will be done
-     * @param bytes input bytes
-     * @param p start position
-     * @param end end position
-     * @param encp out param for fixed encoding
-     * @param str original bytes wrapper
-     * @param mode error mode
-     * @return new position after unescaping
-     */
-    private static int unescapeUnicodeList(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding[]encp, ByteList str, ErrorMode mode) {
-        while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;
-
-        boolean hasUnicode = false; 
-        while (true) {
-            int code = StringSupport.scanHex(bytes, p, end - p);
-            int len = StringSupport.hexLength(bytes, p, end - p);
-            if (len == 0) break;
-            if (len > 6) raisePreprocessError(runtime, str, "invalid Unicode range", mode);
-            p += len;
-            if (to != null) appendUtf8(runtime, to, code, encp, str, mode);
-            hasUnicode = true;
-            while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;
-        }
-
-        if (!hasUnicode) raisePreprocessError(runtime, str, "invalid Unicode list", mode); 
-        return p;
-    }
-
-    /**
-     * Unescape unicode BMP char at given offset, appending to the specified
-     * buffer if non-null.
-     * 
-     * @param runtime current runtime
-     * @param to output buffer; if null, no appending will be done
-     * @param bytes input bytes
-     * @param p start position
-     * @param end end position
-     * @param encp out param for fixed encoding
-     * @param str original bytes wrapper
-     * @param mode error mode
-     * @return new position after unescaping
-     */
-    private static int unescapeUnicodeBmp(Ruby runtime, ByteList to, byte[] bytes, int p, int end, Encoding[] encp, ByteList str, ErrorMode mode) {
-        if (p + 4 > end) raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
-        int code = StringSupport.scanHex(bytes, p, 4);
-        int len = StringSupport.hexLength(bytes, p, 4);
-        if (len != 4) raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
-        appendUtf8(runtime, to, code, encp, str, mode);
-        return p + 4;
-    }
-
-    /**
-     * Unescape non-ascii elements in the given string, appending the results
-     * to the given bytelist if provided.
-     * 
-     * @param runtime current runtime
-     * @param to output bytelist; if null, no appending will be done
-     * @param bytes the bytes to unescape
-     * @param p starting position
-     * @param end ending position
-     * @param enc bytes' encoding
-     * @param encp out param for fixed encoding
-     * @param str original wrapper for the bytes
-     * @param mode error mode
-     * @return whether any property elements were encountered while walking
-     */
-    private static boolean unescapeNonAscii(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding enc, Encoding[]encp, ByteList str, ErrorMode mode) {
-        boolean hasProperty = false;
-        byte[] buf = null;
-
-        while (p < end) {
-            int cl = StringSupport.preciseLength(enc, bytes, p, end);
-            if (cl <= 0) raisePreprocessError(runtime, str, "invalid multibyte character", mode);
-            if (cl > 1 || (bytes[p] & 0x80) != 0) {
-                if (to != null) to.append(bytes, p, cl);
-                p += cl;
-                if (encp[0] == null) {
-                    encp[0] = enc;
-                } else if (encp[0] != enc) {
-                    raisePreprocessError(runtime, str, "non ASCII character in UTF-8 regexp", mode);
-                }
-                continue;
-            }
-            int c;
-            switch (c = bytes[p++] & 0xff) {
-            case '\\':
-                if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
-
-                switch (c = bytes[p++] & 0xff) {
-                case '1': case '2': case '3':
-                case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
-                    if (StringSupport.scanOct(bytes, p - 1, end - (p - 1)) <= 0177) {
-                        if (to != null) to.append('\\').append(c);
-                        break;
-                    }
-
-                case '0': /* \0, \0O, \0OO */
-                case 'x': /* \xHH */
-                case 'c': /* \cX, \c\M-X */
-                case 'C': /* \C-X, \C-\M-X */
-                case 'M': /* \M-X, \M-\C-X, \M-\cX */
-                    p -= 2;
-                    if (enc == USASCIIEncoding.INSTANCE) {
-                        if (buf == null) buf = new byte[1];
-                        p = readEscapedByte(runtime, buf, 0, bytes, p, end, str, mode);
-                        c = buf[0];
-                        if (c == (char)-1) return false;
-                        if (to != null) to.append(c);
-                    }
-                    else {
-                       p = unescapeEscapedNonAscii(runtime, to, bytes, p, end, enc, encp, str, mode);
-                    }
-                    break;
-
-                case 'u':
-                    if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
-                    if (bytes[p] == (byte)'{') { /* \\u{H HH HHH HHHH HHHHH HHHHHH ...} */
-                        p++;
-                        p = unescapeUnicodeList(runtime, to, bytes, p, end, encp, str, mode);
-                        if (p == end || bytes[p++] != (byte)'}') raisePreprocessError(runtime, str, "invalid Unicode list", mode);
-                    } else { /* \\uHHHH */
-                        p = unescapeUnicodeBmp(runtime, to, bytes, p, end, encp, str, mode);
-                    }
-                    break;
-                case 'p': /* \p{Hiragana} */
-                    if (encp[0] == null) hasProperty = true;
-                    if (to != null) to.append('\\').append(c);
-                    break;
-
-                default:
-                    if (to != null) to.append('\\').append(c);
-                    break;
-                } // inner switch
-                break;
-
-            default:
-                if (to != null) to.append(c);
-            } // switch
-        } // while
-        return hasProperty;
-    }
-
-
-    /**
-     * Preprocess the given string for use in regexp, raising errors for encoding
-     * incompatibilities that arise.
-     * 
-     * This version produces a new unescaped version of the string based on
-     * fixes performed while walking.
-     * 
-     * @param runtime current runtime
-     * @param str string to preprocess
-     * @param enc string's encoding
-     * @param fixedEnc new encoding after fixing
-     * @param mode mode of errors
-     * @return a new unescaped string
-     */
-    private static ByteList preprocess(Ruby runtime, ByteList str, Encoding enc, Encoding[]fixedEnc, ErrorMode mode) {
-        ByteList to = new ByteList(str.getRealSize());
-
-        if (enc.isAsciiCompatible()) {
-            fixedEnc[0] = null;
-        } else {
-            fixedEnc[0] = enc;
-            to.setEncoding(enc);
-        }
-
-        boolean hasProperty = unescapeNonAscii(runtime, to, str.getUnsafeBytes(), str.getBegin(), str.getBegin() + str.getRealSize(), enc, fixedEnc, str, mode);
-        if (hasProperty && fixedEnc[0] == null) fixedEnc[0] = enc;
-        if (fixedEnc[0] != null) to.setEncoding(fixedEnc[0]);
-        return to;
-    }
 
     /**
      * Preprocess the given string for use in regexp, raising errors for encoding
@@ -810,19 +461,19 @@ private static ByteList preprocess(Ruby runtime, ByteList str, Encoding enc, Enc
      * @param fixedEnc new encoding after fixing
      * @param mode mode of errors
      */
-    private static void preprocessLight(Ruby runtime, ByteList str, Encoding enc, Encoding[]fixedEnc, ErrorMode mode) {
+    private static void preprocessLight(Ruby runtime, ByteList str, Encoding enc, Encoding[]fixedEnc, RegexpSupport.ErrorMode mode) {
         if (enc.isAsciiCompatible()) {
             fixedEnc[0] = null;
         } else {
             fixedEnc[0] = enc;
         }
 
-        boolean hasProperty = unescapeNonAscii(runtime, null, str.getUnsafeBytes(), str.getBegin(), str.getBegin() + str.getRealSize(), enc, fixedEnc, str, mode);
+        boolean hasProperty = RegexpSupport.unescapeNonAscii(runtime, null, str.getUnsafeBytes(), str.getBegin(), str.getBegin() + str.getRealSize(), enc, fixedEnc, str, mode);
         if (hasProperty && fixedEnc[0] == null) fixedEnc[0] = enc;
     }
 
     public static void preprocessCheck(Ruby runtime, ByteList bytes) {
-        preprocess(runtime, bytes, bytes.getEncoding(), new Encoding[]{null}, ErrorMode.RAISE);
+        RegexpSupport.preprocess(runtime, bytes, bytes.getEncoding(), new Encoding[]{null}, RegexpSupport.ErrorMode.RAISE);
     }
 
     public static RubyString preprocessDRegexp(Ruby runtime, RubyString[] strings, int embeddedOptions) {
@@ -849,7 +500,7 @@ public static RubyString preprocessDRegexp(Ruby runtime, IRubyObject[] strings,
             // This used to call preprocess, but the resulting bytelist was not
             // used. Since the preprocessing error-checking can be done without
             // creating a new bytelist, I added a "light" path.
-            RubyRegexp.preprocessLight(runtime, str.getByteList(), strEnc, fixedEnc, RubyRegexp.ErrorMode.PREPROCESS);
+            RubyRegexp.preprocessLight(runtime, str.getByteList(), strEnc, fixedEnc, RegexpSupport.ErrorMode.PREPROCESS);
 
             if (fixedEnc[0] != null) {
                 if (regexpEnc != null && regexpEnc != fixedEnc[0]) {
@@ -1179,30 +830,6 @@ public static IRubyObject union19(ThreadContext context, IRubyObject recv, IRuby
         }
     }
 
-    // rb_enc_reg_raise
-    private static void raiseRegexpError19(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, String err) {
-        // TODO: we lose encoding information here, fix it
-        throw runtime.newRegexpError(err + ": " + regexpDescription19(runtime, bytes, options, enc));
-    }
-
-    // rb_enc_reg_error_desc
-    static ByteList regexpDescription19(Ruby runtime, ByteList bytes, RegexpOptions options, Encoding enc) {
-        return regexpDescription19(runtime, bytes.getUnsafeBytes(), bytes.getBegin(), bytes.getRealSize(), options, enc);
-    }
-    private static ByteList regexpDescription19(Ruby runtime, byte[] s, int start, int len, RegexpOptions options, Encoding enc) {
-        ByteList description = new ByteList();
-        description.setEncoding(enc);
-        description.append((byte)'/');
-        Encoding resultEnc = runtime.getDefaultInternalEncoding();
-        if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
-
-        appendRegexpString19(runtime, description, s, start, len, enc, resultEnc);
-        description.append((byte)'/');
-        appendOptions(description, options);
-        if (options.isEncodingNone()) description.append((byte) 'n');
-        return description; 
-    }
-
     /** rb_reg_init_copy
      */
     @JRubyMethod(required = 1, visibility = Visibility.PRIVATE)
@@ -1297,7 +924,7 @@ private RubyRegexp initializeCommon19(RubyString str, RegexpOptions options) {
         if (options.isEncodingNone()) {
             if (enc != ASCIIEncoding.INSTANCE) {
                 if (str.scanForCodeRange() != StringSupport.CR_7BIT) {
-                    raiseRegexpError19(getRuntime(), bytes, enc, options, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
+                    RegexpSupport.raiseRegexpError19(getRuntime(), bytes, enc, options, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
                 }
                 enc = ASCIIEncoding.INSTANCE;
             }
@@ -1314,14 +941,14 @@ private RubyRegexp initializeCommon19(ByteList bytes, Encoding enc, RegexpOption
         // FIXME: Something unsets this bit, but we aren't...be more permissive until we figure this out
         //if (isLiteral()) throw runtime.newSecurityError("can't modify literal regexp");
         if (pattern != null) throw runtime.newTypeError("already initialized regexp");
-        if (enc.isDummy()) raiseRegexpError19(runtime, bytes, enc, options, "can't make regexp with dummy encoding");
+        if (enc.isDummy()) RegexpSupport.raiseRegexpError19(runtime, bytes, enc, options, "can't make regexp with dummy encoding");
 
         Encoding[]fixedEnc = new Encoding[]{null};
-        ByteList unescaped = preprocess(runtime, bytes, enc, fixedEnc, ErrorMode.RAISE);
+        ByteList unescaped = RegexpSupport.preprocess(runtime, bytes, enc, fixedEnc, RegexpSupport.ErrorMode.RAISE);
         if (fixedEnc[0] != null) {
             if ((fixedEnc[0] != enc && options.isFixed()) ||
                (fixedEnc[0] != ASCIIEncoding.INSTANCE && options.isEncodingNone())) {
-                   raiseRegexpError19(runtime, bytes, enc, options, "incompatible character encoding");
+                   RegexpSupport.raiseRegexpError19(runtime, bytes, enc, options, "incompatible character encoding");
             }
             if (fixedEnc[0] != ASCIIEncoding.INSTANCE) {
                 options.setFixed(true);
@@ -1688,7 +1315,7 @@ public IRubyObject inspect() {
     @JRubyMethod(name = "inspect")
     public IRubyObject inspect19() {
         if (pattern == null) return anyToString();
-        return RubyString.newString(getRuntime(), regexpDescription19(getRuntime(), str, options, str.getEncoding()));
+        return RubyString.newString(getRuntime(), RegexpSupport.regexpDescription19(getRuntime(), str, options, str.getEncoding()));
     }
 
     private final static int EMBEDDABLE = RE_OPTION_MULTILINE|RE_OPTION_IGNORECASE|RE_OPTION_EXTENDED;
@@ -1764,7 +1391,7 @@ public IRubyObject to_s() {
                 }
             }
 
-            appendOptions(result, newOptions);
+            RegexpSupport.appendOptions(result, newOptions);
 
             if (!newOptions.isEmbeddable()) {
                 result.append((byte)'-');
@@ -1773,7 +1400,7 @@ public IRubyObject to_s() {
                 if (!newOptions.isExtended()) result.append((byte)'x');
             }
             result.append((byte)':');
-            appendRegexpString19(runtime, result, bytes, p, len, str.getEncoding(), null);
+            RegexpSupport.appendRegexpString19(runtime, result, bytes, p, len, str.getEncoding(), null);
 
             result.append((byte)')');
             return RubyString.newString(getRuntime(), result, getEncoding()).infectBy(this);
@@ -1828,87 +1455,6 @@ private static void appendRegexpString(Ruby runtime, ByteList to, byte[]bytes, i
         }
     }
 
-    private static void appendRegexpString19(Ruby runtime, ByteList to, byte[]bytes, int start, int len, Encoding enc, Encoding resEnc) {
-        int p = start;
-        int end = p + len;
-        boolean needEscape = false;
-        while (p < end) {
-            final int c;
-            final int cl;
-            if (enc.isAsciiCompatible()) {
-                cl = 1;
-                c = bytes[p] & 0xff;
-            } else {
-                cl = StringSupport.preciseLength(enc, bytes, p, end);
-                c = enc.mbcToCode(bytes, p, end);
-            }
-
-            if (!Encoding.isAscii(c)) {
-                p += StringSupport.length(enc, bytes, p, end);
-            } else if (c != '/' && enc.isPrint(c)) {
-                p += cl;
-            } else {
-                needEscape = true;
-                break;
-            }
-        }
-        if (!needEscape) {
-            to.append(bytes, start, len);
-        } else {
-            boolean isUnicode = StringSupport.isUnicode(enc);
-            p = start; 
-            while (p < end) {
-                final int c;
-                final int cl;
-                if (enc.isAsciiCompatible()) {
-                    cl = 1;
-                    c = bytes[p] & 0xff;
-                } else {
-                    cl = StringSupport.preciseLength(enc, bytes, p, end);
-                    c = enc.mbcToCode(bytes, p, end);
-                }
-
-                if (c == '\\' && p + cl < end) {
-                    int n = cl + StringSupport.length(enc, bytes, p + cl, end);
-                    to.append(bytes, p, n);
-                    p += n;
-                    continue;
-                } else if (c == '/') {
-                    to.append((byte) '\\');
-                    to.append(bytes, p, cl);
-                } else if (!Encoding.isAscii(c)) {
-                    int l = StringSupport.preciseLength(enc, bytes, p, end);
-                    if (l <= 0) {
-                        l = 1;
-                        Sprintf.sprintf(runtime, to, "\\x%02X", c);
-                    } else if (resEnc != null) {
-                        int code = enc.mbcToCode(bytes, p, end);
-                        Sprintf.sprintf(runtime, to , StringSupport.escapedCharFormat(code, isUnicode), code);
-                    } else {
-                        to.append(bytes, p, l);
-                    }
-                    p += l;
-
-                    continue;    
-                } else if (enc.isPrint(c)) {
-                    to.append(bytes, p, cl);
-                } else if (!enc.isSpace(c)) {
-                    Sprintf.sprintf(runtime, to, "\\x%02X", c);
-                } else {
-                    to.append(bytes, p, cl);
-                }
-                p += cl;
-            }
-        }
-    }
-
-    // option_to_str
-    private static void appendOptions(ByteList to, RegexpOptions options) {
-        if (options.isMultiline()) to.append((byte)'m');
-        if (options.isIgnorecase()) to.append((byte)'i');
-        if (options.isExtended()) to.append((byte)'x');
-    }
-
     private static String[] NO_NAMES = new String[] {}; //TODO: Perhaps we have another empty string arr
     public String[] getNames() {
         int nameLength = pattern.numberOfNames();

diff --git a/core/src/main/java/org/jruby/RubyString.java b/core/src/main/java/org/jruby/RubyString.java
@@ -3616,7 +3616,7 @@ private Regex getStringPattern19(Ruby runtime, IRubyObject obj) {
         RubyString str = getStringForPattern(obj);
         if (str.scanForCodeRange() == CR_BROKEN) {
             throw runtime.newRegexpError("invalid multybyte character: " +
-                    RubyRegexp.regexpDescription19(runtime, str.value, new RegexpOptions(), str.value.getEncoding()).toString());
+                    RegexpSupport.regexpDescription19(runtime, str.value, new RegexpOptions(), str.value.getEncoding()).toString());
         }
         if (str.value.getEncoding().isDummy()) {
             throw runtime.newArgumentError("can't make regexp with dummy encoding");

diff --git a/core/src/main/java/org/jruby/util/RegexpSupport.java b/core/src/main/java/org/jruby/util/RegexpSupport.java
@@ -0,0 +1,493 @@
+/***** BEGIN LICENSE BLOCK *****
+ * Version: EPL 1.0/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Eclipse Public
+ * License Version 1.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.eclipse.org/legal/epl-v10.html
+ *
+ * Software distributed under the License is distributed on an "AS
+ * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * rights and limitations under the License.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the EPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the EPL, the GPL or the LGPL.
+ ***** END LICENSE BLOCK *****/
+
+package org.jruby.util;
+
+import org.jcodings.Encoding;
+import org.jcodings.specific.ASCIIEncoding;
+import org.jcodings.specific.USASCIIEncoding;
+import org.jcodings.specific.UTF8Encoding;
+import org.jruby.Ruby;
+import org.jruby.RubyRegexp;
+
+public class RegexpSupport {
+
+    public enum ErrorMode {RAISE, PREPROCESS, DESC} // used by raisePreprocessError: RAISE -> regexp error, PREPROCESS -> argument error, DESC -> no exception
+
+    /**
+     * Build the unescaped form of a regexp source, reporting encoding conflicts
+     * found along the way according to the given error mode.
+     *
+     * The result is a fresh ByteList; its encoding is set to whatever ends up
+     * in fixedEnc[0] once the walk is complete, if anything was recorded there.
+     *
+     * @param runtime current runtime
+     * @param str regexp source to preprocess
+     * @param enc encoding of str
+     * @param fixedEnc one-element out param receiving the fixed encoding, or null if none
+     * @param mode how problems found while walking are reported
+     * @return a new ByteList holding the unescaped source
+     */
+    public static ByteList preprocess(Ruby runtime, ByteList str, Encoding enc, Encoding[] fixedEnc, ErrorMode mode) {
+        final ByteList unescaped = new ByteList(str.getRealSize());
+
+        // A non-ASCII-compatible encoding is fixed up front; otherwise it is decided by the walk.
+        final boolean asciiCompatible = enc.isAsciiCompatible();
+        fixedEnc[0] = asciiCompatible ? null : enc;
+        if (!asciiCompatible) unescaped.setEncoding(enc);
+
+        final int begin = str.getBegin();
+        final boolean sawProperty = unescapeNonAscii(runtime, unescaped, str.getUnsafeBytes(), begin, begin + str.getRealSize(), enc, fixedEnc, str, mode);
+        // A \p property met before any encoding was fixed pins the result to the source encoding.
+        if (sawProperty && fixedEnc[0] == null) fixedEnc[0] = enc;
+        if (fixedEnc[0] != null) unescaped.setEncoding(fixedEnc[0]);
+        return unescaped;
+    }
+
+    /**
+     * Unescape non-ascii elements in the given string, appending the results
+     * to the given bytelist if provided.
+     *
+     * @param runtime current runtime
+     * @param to output bytelist; if null, no appending will be done
+     * @param bytes the bytes to unescape
+     * @param p starting position
+     * @param end ending position
+     * @param enc bytes' encoding
+     * @param encp out param for fixed encoding
+     * @param str original wrapper for the bytes
+     * @param mode error mode
+     * @return whether any \p property escape was encountered while no fixed encoding was set
+     */
+    public static boolean unescapeNonAscii(Ruby runtime, ByteList to, byte[] bytes, int p, int end, Encoding enc, Encoding[] encp, ByteList str, ErrorMode mode) {
+        boolean hasProperty = false;
+        byte[] buf = null; // one-byte scratch buffer, allocated lazily for the US-ASCII escape path
+
+        while (p < end) {
+            int cl = StringSupport.preciseLength(enc, bytes, p, end);
+            if (cl <= 0) raisePreprocessError(runtime, str, "invalid multibyte character", mode);
+            if (cl > 1 || (bytes[p] & 0x80) != 0) { // multibyte character or high-bit byte: copy it through as-is
+                if (to != null) to.append(bytes, p, cl);
+                p += cl;
+                if (encp[0] == null) { // first literal non-ASCII content fixes the encoding to enc
+                    encp[0] = enc;
+                } else if (encp[0] != enc) {
+                    raisePreprocessError(runtime, str, "non ASCII character in UTF-8 regexp", mode);
+                }
+                continue;
+            }
+            int c;
+            switch (c = bytes[p++] & 0xff) {
+                case '\\':
+                    if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
+
+                    switch (c = bytes[p++] & 0xff) {
+                        case '1': case '2': case '3':
+                        case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
+                            if (StringSupport.scanOct(bytes, p - 1, end - (p - 1)) <= 0177) { // value <= 0177: emit the backslash and this digit unchanged
+                                if (to != null) to.append('\\').append(c);
+                                break;
+                            }
+                            // larger octal value: fall through and handle it as an escaped byte
+                        case '0': /* \0, \0O, \0OO */
+                        case 'x': /* \xHH */
+                        case 'c': /* \cX, \c\M-X */
+                        case 'C': /* \C-X, \C-\M-X */
+                        case 'M': /* \M-X, \M-\C-X, \M-\cX */
+                            p -= 2; // step back to the backslash; the helpers below re-read the whole escape
+                            if (enc == USASCIIEncoding.INSTANCE) {
+                                if (buf == null) buf = new byte[1];
+                                p = readEscapedByte(runtime, buf, 0, bytes, p, end, str, mode);
+                                c = buf[0];
+                                if (c == (char)-1) return false; // NOTE(review): c is a sign-extended byte, so it cannot equal (char)-1 (0xFFFF) — confirm intent
+                                if (to != null) to.append(c);
+                            }
+                            else {
+                                p = unescapeEscapedNonAscii(runtime, to, bytes, p, end, enc, encp, str, mode);
+                            }
+                            break;
+
+                        case 'u':
+                            if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
+                            if (bytes[p] == (byte)'{') { /* \\u{H HH HHH HHHH HHHHH HHHHHH ...} */
+                                p++;
+                                p = unescapeUnicodeList(runtime, to, bytes, p, end, encp, str, mode);
+                                if (p == end || bytes[p++] != (byte)'}') raisePreprocessError(runtime, str, "invalid Unicode list", mode);
+                            } else { /* \\uHHHH */
+                                p = unescapeUnicodeBmp(runtime, to, bytes, p, end, encp, str, mode);
+                            }
+                            break;
+                        case 'p': /* \p{Hiragana} */
+                            if (encp[0] == null) hasProperty = true; // only recorded while no encoding has been fixed yet
+                            if (to != null) to.append('\\').append(c);
+                            break;
+
+                        default: // any other escape is copied through unchanged
+                            if (to != null) to.append('\\').append(c);
+                            break;
+                    } // inner switch
+                    break;
+
+                default: // plain single-byte character
+                    if (to != null) to.append(c);
+            } // switch
+        } // while
+        return hasProperty;
+    }
+
+    public static int raisePreprocessError(Ruby runtime, ByteList str, String err, ErrorMode mode) {
+        switch (mode) {
+            case DESC:
+                return 0; // description mode stays silent and lets the caller carry on
+            case RAISE:
+                raiseRegexpError19(runtime, str, str.getEncoding(), RegexpOptions.NULL_OPTIONS, err);
+                break; // only reached if the call above returns normally
+            default: // PREPROCESS
+        }
+        throw runtime.newArgumentError("regexp preprocess failed: " + err);
+    }
+
+    // rb_enc_reg_raise: always throws a regexp error built from err and the pattern's description
+    public static void raiseRegexpError19(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, String err) {
+        final ByteList described = regexpDescription19(runtime, bytes, options, enc);
+        throw runtime.newRegexpError(err + ": " + described); // TODO: we lose encoding information here, fix it
+    }
+
+    // rb_enc_reg_error_desc: describes the whole ByteList as /source/ followed by option letters
+    public static ByteList regexpDescription19(Ruby runtime, ByteList bytes, RegexpOptions options, Encoding enc) {
+        return regexpDescription19(runtime, bytes.getUnsafeBytes(), bytes.getBegin(), bytes.getRealSize(), options, enc);
+    }
+
+    private static ByteList regexpDescription19(Ruby runtime, byte[] s, int start, int len, RegexpOptions options, Encoding enc) {
+        // Result encoding: the default internal one, else the default external one.
+        Encoding targetEnc = runtime.getDefaultInternalEncoding();
+        if (targetEnc == null) targetEnc = runtime.getDefaultExternalEncoding();
+        final ByteList out = new ByteList();
+        out.setEncoding(enc);
+        out.append((byte)'/');
+        appendRegexpString19(runtime, out, s, start, len, enc, targetEnc);
+        out.append((byte)'/');
+        appendOptions(out, options);
+        if (options.isEncodingNone()) out.append((byte) 'n');
+        return out;
+    }
+
+    public static void appendRegexpString19(Ruby runtime, ByteList to, byte[] bytes, int start, int len, Encoding enc, Encoding resEnc) { // appends bytes[start, start+len) to 'to', escaping where the scan below finds it necessary
+        int p = start;
+        int end = p + len;
+        boolean needEscape = false;
+        while (p < end) { // pass 1: look for anything that would need escaping
+            final int c;
+            final int cl;
+            if (enc.isAsciiCompatible()) {
+                cl = 1;
+                c = bytes[p] & 0xff;
+            } else {
+                cl = StringSupport.preciseLength(enc, bytes, p, end);
+                c = enc.mbcToCode(bytes, p, end);
+            }
+
+            if (!Encoding.isAscii(c)) {
+                p += StringSupport.length(enc, bytes, p, end);
+            } else if (c != '/' && enc.isPrint(c)) {
+                p += cl;
+            } else { // a '/' or a non-printable ASCII character
+                needEscape = true;
+                break;
+            }
+        }
+        if (!needEscape) {
+            to.append(bytes, start, len); // nothing to escape: copy the source verbatim
+        } else {
+            boolean isUnicode = StringSupport.isUnicode(enc);
+            p = start;
+            while (p < end) { // pass 2: copy character by character, escaping as needed
+                final int c;
+                final int cl;
+                if (enc.isAsciiCompatible()) {
+                    cl = 1;
+                    c = bytes[p] & 0xff;
+                } else {
+                    cl = StringSupport.preciseLength(enc, bytes, p, end);
+                    c = enc.mbcToCode(bytes, p, end);
+                }
+
+                if (c == '\\' && p + cl < end) { // existing escape: copy the backslash and the character after it untouched
+                    int n = cl + StringSupport.length(enc, bytes, p + cl, end);
+                    to.append(bytes, p, n);
+                    p += n;
+                    continue;
+                } else if (c == '/') { // bare delimiter gets a backslash in front
+                    to.append((byte) '\\');
+                    to.append(bytes, p, cl);
+                } else if (!Encoding.isAscii(c)) {
+                    int l = StringSupport.preciseLength(enc, bytes, p, end);
+                    if (l <= 0) { // no valid character length here: hex-escape c and advance one byte
+                        l = 1;
+                        Sprintf.sprintf(runtime, to, "\\x%02X", c);
+                    } else if (resEnc != null) { // a result encoding is given: write the code point as a formatted escape
+                        int code = enc.mbcToCode(bytes, p, end);
+                        Sprintf.sprintf(runtime, to , StringSupport.escapedCharFormat(code, isUnicode), code);
+                    } else { // no result encoding: copy the character's bytes as they are
+                        to.append(bytes, p, l);
+                    }
+                    p += l;
+
+                    continue;
+                } else if (enc.isPrint(c)) {
+                    to.append(bytes, p, cl);
+                } else if (!enc.isSpace(c)) { // non-printable, non-space ASCII becomes \xHH
+                    Sprintf.sprintf(runtime, to, "\\x%02X", c);
+                } else { // whitespace is copied unchanged
+                    to.append(bytes, p, cl);
+                }
+                p += cl;
+            }
+        }
+    }
+
+    // option_to_str: appends one letter per enabled option, always in m, i, x order
+    public static void appendOptions(ByteList to, RegexpOptions options) {
+        if (options.isMultiline()) { to.append((byte)'m'); }
+        if (options.isIgnorecase()) { to.append((byte)'i'); }
+        if (options.isExtended()) { to.append((byte)'x'); }
+    }
+
+    public static int readEscapedByte(Ruby runtime, byte[] to, int toP, byte[] bytes, int p, int end, ByteList str, ErrorMode mode) { // decodes one backslash escape starting at p into to[toP]; returns the position after it
+        if (p == end || bytes[p++] != (byte)'\\') raisePreprocessError(runtime, str, "too short escaped multibyte character", mode);
+
+        boolean metaPrefix = false, ctrlPrefix = false; // set once a \M- or \C-/\c prefix has been consumed
+        int code = 0;
+        while (true) { // loops only when a prefix is followed by another backslash escape (via continue)
+            if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
+
+            switch (bytes[p++]) {
+                case '\\': code = '\\'; break;
+                case 'n': code = '\n'; break;
+                case 't': code = '\t'; break;
+                case 'r': code = '\r'; break;
+                case 'f': code = '\f'; break;
+                case 'v': code = '\013'; break;
+                case 'a': code = '\007'; break;
+                case 'e': code = '\033'; break;
+
+            /* \OOO */
+                case '0': case '1': case '2': case '3':
+                case '4': case '5': case '6': case '7':
+                    p--; // put the first digit back so the scan below sees it
+                    int olen = end < p + 3 ? end - p : 3; // at most three octal digits, clipped to the input end
+                    code = StringSupport.scanOct(bytes, p, olen);
+                    p += StringSupport.octLength(bytes, p, olen);
+                    break;
+
+                case 'x': /* \xHH */
+                    int hlen = end < p + 2 ? end - p : 2; // at most two hex digits, clipped to the input end
+                    code = StringSupport.scanHex(bytes, p, hlen);
+                    int len = StringSupport.hexLength(bytes, p, hlen);
+                    if (len < 1) raisePreprocessError(runtime, str, "invalid hex escape", mode);
+                    p += len;
+                    break;
+
+                case 'M': /* \M-X, \M-\C-X, \M-\cX */
+                    if (metaPrefix) raisePreprocessError(runtime, str, "duplicate meta escape", mode);
+                    metaPrefix = true;
+                    if (p + 1 < end && bytes[p++] == (byte)'-' && (bytes[p] & 0x80) == 0) {
+                        if (bytes[p] == (byte)'\\') {
+                            p++;
+                            continue; // nested escape after the prefix: decode it on the next pass
+                        } else {
+                            code = bytes[p++] & 0xff;
+                            break;
+                        }
+                    }
+                    raisePreprocessError(runtime, str, "too short meta escape", mode); // NOTE(review): if this returns (DESC mode) control falls into case 'C'
+
+                case 'C': /* \C-X, \C-\M-X */
+                    if (p == end || bytes[p++] != (byte)'-') raisePreprocessError(runtime, str, "too short control escape", mode);
+                    // falls through: \C- shares the control handling of \c
+                case 'c': /* \cX, \c\M-X */
+                    if (ctrlPrefix) raisePreprocessError(runtime, str, "duplicate control escape", mode);
+                    ctrlPrefix = true;
+                    if (p < end && (bytes[p] & 0x80) == 0) {
+                        if (bytes[p] == (byte)'\\') {
+                            p++;
+                            continue; // nested escape after the prefix: decode it on the next pass
+                        } else {
+                            code = bytes[p++] & 0xff;
+                            break;
+                        }
+                    }
+                    raisePreprocessError(runtime, str, "too short control escape", mode); // NOTE(review): if this returns (DESC mode) control falls into default
+                default:
+                    raisePreprocessError(runtime, str, "unexpected escape sequence", mode);
+            } // switch
+
+            if (code < 0 || code > 0xff) raisePreprocessError(runtime, str, "invalid escape code", mode);
+
+            if (ctrlPrefix) code &= 0x1f; // control prefix keeps only the low five bits
+            if (metaPrefix) code |= 0x80; // meta prefix sets the high bit
+
+            to[toP] = (byte)code;
+            return p;
+        } // while
+    }
+
+    /**
+     * Decode the run of escaped bytes starting at the given position into one
+     * character of the given encoding, appending it to the bytelist if provided.
+     *
+     * @param runtime current runtime
+     * @param to output bytelist; if null, no appending will be done
+     * @param bytes incoming bytes
+     * @param p start position (at the backslash of the first escape)
+     * @param end end position
+     * @param enc bytes' encoding
+     * @param encp out param for fixed encoding
+     * @param str original bytes wrapper
+     * @param mode error mode
+     * @return new position after performing unescaping
+     */
+    // MRI: unescape_escaped_nonascii
+    private static int unescapeEscapedNonAscii(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding enc, Encoding[]encp, ByteList str, ErrorMode mode) {
+        final int maxLen = enc.maxLength();
+        final byte[] unit = new byte[maxLen];
+        int filled = 0;
+        // Keep decoding escaped bytes until they form a whole character or the buffer is full.
+        do {
+            p = readEscapedByte(runtime, unit, filled++, bytes, p, end, str, mode);
+        } while (filled < maxLen && StringSupport.MBCLEN_NEEDMORE_P(StringSupport.preciseLength(enc, unit, 0, filled)));
+
+        if (StringSupport.preciseLength(enc, unit, 0, filled) == -1) { // MBCLEN_INVALID_P
+            raisePreprocessError(runtime, str, "invalid multibyte escape", mode);
+        }
+
+        final boolean plainAscii = filled <= 1 && (unit[0] & 0x80) == 0;
+        if (plainAscii) {
+            // A single 7-bit byte is written back out as a hex escape and fixes no encoding.
+            if (to != null) Sprintf.sprintf(runtime, to, "\\x%02X", unit[0] & 0xff);
+            return p;
+        }
+        if (to != null) to.append(unit, 0, filled);
+        if (encp[0] == null) {
+            encp[0] = enc;
+        } else if (encp[0] != enc) {
+            raisePreprocessError(runtime, str, "escaped non ASCII character in UTF-8 regexp", mode);
+        }
+        return p;
+    }
+
+    /**
+     * Unescape the code points inside a \\u{...} list starting at the given
+     * offset, appending them to the given out buffer if provided.
+     *
+     * @param runtime current runtime
+     * @param to output buffer; if null, nothing is appended but all checks still run
+     * @param bytes input bytes
+     * @param p start position (just after the opening brace)
+     * @param end end position
+     * @param encp out param for fixed encoding
+     * @param str original bytes wrapper
+     * @param mode error mode
+     * @return new position after unescaping
+     */
+    private static int unescapeUnicodeList(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding[]encp, ByteList str, ErrorMode mode) {
+        while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;
+
+        boolean hasUnicode = false;
+        while (true) {
+            int code = StringSupport.scanHex(bytes, p, end - p);
+            int len = StringSupport.hexLength(bytes, p, end - p);
+            if (len == 0) break; // no more hex digits: end of the list
+            if (len > 6) raisePreprocessError(runtime, str, "invalid Unicode range", mode);
+            p += len;
+            appendUtf8(runtime, to, code, encp, str, mode); // null-safe; must also run without a buffer so the range and encoding checks apply
+            hasUnicode = true;
+            while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;
+        }
+
+        if (!hasUnicode) raisePreprocessError(runtime, str, "invalid Unicode list", mode);
+        return p;
+    }
+
+    /**
+     * Unescape a four-digit \\uHHHH escape at the given offset, appending the
+     * character to the specified buffer if non-null.
+     *
+     * @param runtime current runtime
+     * @param to output buffer; if null, no appending will be done
+     * @param bytes input bytes
+     * @param p start position (at the first hex digit)
+     * @param end end position
+     * @param encp out param for fixed encoding
+     * @param str original bytes wrapper
+     * @param mode error mode
+     * @return new position after unescaping, always p + 4
+     */
+    private static int unescapeUnicodeBmp(Ruby runtime, ByteList to, byte[] bytes, int p, int end, Encoding[] encp, ByteList str, ErrorMode mode) {
+        final int digits = 4; // this escape form is exactly four hex digits
+        if (end < p + digits) raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
+        final int code = StringSupport.scanHex(bytes, p, digits);
+        if (StringSupport.hexLength(bytes, p, digits) != digits) raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
+        appendUtf8(runtime, to, code, encp, str, mode);
+        return p + digits;
+    }
+
+    /**
+     * Validate a Unicode code point and append it to the buffer, if one is
+     * given, recording or checking the fixed encoding for non-ASCII values.
+     *
+     * @param runtime current runtime
+     * @param to output buffer; if null, no appending will be done
+     * @param code Unicode code point to append
+     * @param enc output param for new encoding
+     * @param str original wrapper of source bytes
+     * @param mode error mode
+     */
+    private static void appendUtf8(Ruby runtime, ByteList to, int code, Encoding[] enc, ByteList str, ErrorMode mode) {
+        checkUnicodeRange(runtime, code, str, mode);
+        if (code < 0x80) { // 7-bit values are emitted as a hex escape and fix no encoding
+            if (to != null) Sprintf.sprintf(runtime, to, "\\x%02X", code);
+            return;
+        }
+        if (to != null) {
+            to.ensure(to.getRealSize() + 6); // reserve room for up to six more bytes
+            final int size = to.getRealSize();
+            to.setRealSize(size + Pack.utf8Decode(runtime, to.getUnsafeBytes(), to.getBegin() + size, code));
+        }
+        if (enc[0] == null) {
+            enc[0] = UTF8Encoding.INSTANCE;
+        } else if (!(enc[0] instanceof UTF8Encoding)) { // do not load the class if not used
+            raisePreprocessError(runtime, str, "UTF-8 character in non UTF-8 regexp", mode);
+        }
+    }
+
+    private static void checkUnicodeRange(Ruby runtime, int code, ByteList str, ErrorMode mode) {
+        // Unicode code points need at most 21 bits, so an int is wide enough
+        final boolean surrogate = 0xd800 <= code && code <= 0xdfff;
+        final boolean beyondLast = 0x10ffff < code;
+        if (surrogate || beyondLast) raisePreprocessError(runtime, str, "invalid Unicode range", mode);
+    }
+}