Skip to content

Commit

Permalink
Showing 3 changed files with 510 additions and 471 deletions.
486 changes: 16 additions & 470 deletions core/src/main/java/org/jruby/RubyRegexp.java
Original file line number Diff line number Diff line change
@@ -68,6 +68,7 @@
import org.jruby.util.KCode;
import org.jruby.util.Pack;
import org.jruby.util.RegexpOptions;
import org.jruby.util.RegexpSupport;
import org.jruby.util.Sprintf;
import org.jruby.util.StringSupport;
import org.jruby.util.TypeConverter;
@@ -149,7 +150,7 @@ private static Regex makeRegexp(Ruby runtime, ByteList bytes, RegexpOptions opti
int p = bytes.getBegin();
return new Regex(bytes.getUnsafeBytes(), p, p + bytes.getRealSize(), options.toJoniOptions(), enc, Syntax.DEFAULT, runtime.getWarnings());
} catch (Exception e) {
raiseRegexpError19(runtime, bytes, enc, options, e.getMessage());
RegexpSupport.raiseRegexpError19(runtime, bytes, enc, options, e.getMessage());
return null; // not reached
}
}
@@ -184,10 +185,10 @@ static Regex getQuotedRegexpFromCache19(Ruby runtime, ByteList bytes, RegexpOpti
return regex;
}

private static Regex getPreprocessedRegexpFromCache(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, ErrorMode mode) {
private static Regex getPreprocessedRegexpFromCache(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, RegexpSupport.ErrorMode mode) {
Regex regex = preprocessedPatternCache.get(bytes);
if (regex != null && regex.getEncoding() == enc && regex.getOptions() == options.toJoniOptions()) return regex;
ByteList preprocessed = preprocess(runtime, bytes, enc, new Encoding[]{null}, ErrorMode.RAISE);
ByteList preprocessed = RegexpSupport.preprocess(runtime, bytes, enc, new Encoding[]{null}, RegexpSupport.ErrorMode.RAISE);
regex = makeRegexp(runtime, preprocessed, options, enc);
regex.setUserObject(preprocessed);
preprocessedPatternCache.put(bytes, regex);
@@ -443,359 +444,9 @@ public final Regex preparePattern(RubyString str) {
check();
Encoding enc = checkEncoding(str, true);
if (enc == pattern.getEncoding()) return pattern;
return getPreprocessedRegexpFromCache(getRuntime(), this.str, enc, options, ErrorMode.PREPROCESS);
return getPreprocessedRegexpFromCache(getRuntime(), this.str, enc, options, RegexpSupport.ErrorMode.PREPROCESS);
}

// Selects how raisePreprocessError reports a preprocessing failure:
// RAISE      - raise a regexp error via raiseRegexpError19
// PREPROCESS - throw an argument error prefixed "regexp preprocess failed: "
// DESC       - throw nothing; raisePreprocessError simply returns 0
private enum ErrorMode {RAISE, PREPROCESS, DESC}

/**
 * Report a preprocessing failure in the way selected by the error mode.
 *
 * @param runtime current runtime, used to build the error that is thrown
 * @param str pattern bytes being preprocessed; its encoding is passed along in RAISE mode
 * @param err message describing the failure
 * @param mode how the failure should be reported
 * @return 0, reached only in DESC mode; the RAISE and PREPROCESS branches throw
 */
private static int raisePreprocessError(Ruby runtime, ByteList str, String err, ErrorMode mode) {
switch (mode) {
case RAISE:
// No break here: raiseRegexpError19 consists of a single throw, so control
// never actually falls through into the PREPROCESS case.
raiseRegexpError19(runtime, str, str.getEncoding(), RegexpOptions.NULL_OPTIONS, err);
case PREPROCESS:
throw runtime.newArgumentError("regexp preprocess failed: " + err);
case DESC:
// silent ?
}
return 0;
}

/**
 * Decode one backslash escape from the pattern into a single byte.
 *
 * Recognizes the simple escapes (\\, \n, \t, \r, \f, \v, \a, \e), octal
 * (\OOO), hex (\xHH) and the meta/control prefixes (\M-, \C-, \c). A prefix
 * may be followed by a plain 7-bit character or by another escape, in which
 * case the loop runs again with the prefix flag remembered.
 *
 * @param runtime current runtime
 * @param to destination array; the decoded byte is stored at to[toP]
 * @param toP index in {@code to} at which the decoded byte is written
 * @param bytes incoming bytes
 * @param p position of the leading backslash
 * @param end end position (exclusive)
 * @param str original bytes wrapper, passed to error reporting
 * @param mode error mode
 * @return position just past the consumed escape
 */
private static int readEscapedByte(Ruby runtime, byte[]to, int toP, byte[]bytes, int p, int end, ByteList str, ErrorMode mode) {
// The escape must begin with a backslash at p.
if (p == end || bytes[p++] != (byte)'\\') raisePreprocessError(runtime, str, "too short escaped multibyte character", mode);

boolean metaPrefix = false, ctrlPrefix = false;
int code = 0;
// Each iteration consumes one escape letter; 'continue' is used after a
// meta/control prefix that is itself followed by another backslash escape.
while (true) {
if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);

switch (bytes[p++]) {
case '\\': code = '\\'; break;
case 'n': code = '\n'; break;
case 't': code = '\t'; break;
case 'r': code = '\r'; break;
case 'f': code = '\f'; break;
case 'v': code = '\013'; break;
case 'a': code = '\007'; break;
case 'e': code = '\033'; break;

/* \OOO */
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
// step back onto the first digit, then scan at most three octal digits
p--;
int olen = end < p + 3 ? end - p : 3;
code = StringSupport.scanOct(bytes, p, olen);
p += StringSupport.octLength(bytes, p, olen);
break;

case 'x': /* \xHH */
// at most two hex digits, and at least one is required
int hlen = end < p + 2 ? end - p : 2;
code = StringSupport.scanHex(bytes, p, hlen);
int len = StringSupport.hexLength(bytes, p, hlen);
if (len < 1) raisePreprocessError(runtime, str, "invalid hex escape", mode);
p += len;
break;

case 'M': /* \M-X, \M-\C-X, \M-\cX */
if (metaPrefix) raisePreprocessError(runtime, str, "duplicate meta escape", mode);
metaPrefix = true;
// requires '-' followed by a 7-bit byte
if (p + 1 < end && bytes[p++] == (byte)'-' && (bytes[p] & 0x80) == 0) {
if (bytes[p] == (byte)'\\') {
// nested escape: skip its backslash and decode it on the next pass
p++;
continue;
} else {
code = bytes[p++] & 0xff;
break;
}
}
// NOTE(review): no break after this call; in the throwing modes that is
// unreachable, but in DESC mode control would fall into case 'C'.
raisePreprocessError(runtime, str, "too short meta escape", mode);

case 'C': /* \C-X, \C-\M-X */
// consumes the '-' and then deliberately falls through to the shared \c handling
if (p == end || bytes[p++] != (byte)'-') raisePreprocessError(runtime, str, "too short control escape", mode);

case 'c': /* \cX, \c\M-X */
if (ctrlPrefix) raisePreprocessError(runtime, str, "duplicate control escape", mode);
ctrlPrefix = true;
if (p < end && (bytes[p] & 0x80) == 0) {
if (bytes[p] == (byte)'\\') {
// nested escape: skip its backslash and decode it on the next pass
p++;
continue;
} else {
code = bytes[p++] & 0xff;
break;
}
}
raisePreprocessError(runtime, str, "too short control escape", mode);
default:
raisePreprocessError(runtime, str, "unexpected escape sequence", mode);
} // switch

// octal escapes can exceed one byte (up to 0777), which is rejected here
if (code < 0 || code > 0xff) raisePreprocessError(runtime, str, "invalid escape code", mode);

// apply the remembered prefixes: control keeps the low five bits, meta sets the high bit
if (ctrlPrefix) code &= 0x1f;
if (metaPrefix) code |= 0x80;

to[toP] = (byte)code;
return p;
} // while
}

/**
 * Decode a run of escaped bytes that together form one character in the
 * given encoding, and emit the result to the output bytelist when one is
 * supplied.
 *
 * Escaped bytes are read one at a time until the encoding no longer asks for
 * more bytes or its maximum character length is reached. A multi-byte or
 * high-bit result is emitted raw and pins the fixed encoding; a single 7-bit
 * result is re-emitted as a two-digit hex escape.
 *
 * @param runtime current runtime
 * @param to output bytelist; if null, nothing is appended
 * @param bytes incoming bytes
 * @param p start position (at the backslash of the first escape)
 * @param end end position
 * @param enc bytes' encoding
 * @param encp out param for fixed encoding
 * @param str original bytes wrapper
 * @param mode error mode
 * @return new position after performing unescaping
 */
// MRI: unescape_escaped_nonascii
private static int unescapeEscapedNonAscii(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding enc, Encoding[]encp, ByteList str, ErrorMode mode) {
    final int maxLen = enc.maxLength();
    final byte[] charBytes = new byte[maxLen];
    int count = 0;

    // Always read one escaped byte, then keep reading while the encoding wants more.
    do {
        p = readEscapedByte(runtime, charBytes, count, bytes, p, end, str, mode);
        count++;
    } while (count < maxLen
            && StringSupport.MBCLEN_NEEDMORE_P(StringSupport.preciseLength(enc, charBytes, 0, count)));

    if (StringSupport.preciseLength(enc, charBytes, 0, count) == -1) { // MBCLEN_INVALID_P
        raisePreprocessError(runtime, str, "invalid multibyte escape", mode);
    }

    final boolean plainAscii = count <= 1 && (charBytes[0] & 0x80) == 0;
    if (plainAscii) {
        if (to != null) {
            Sprintf.sprintf(runtime, to, "\\x%02X", charBytes[0] & 0xff);
        }
        return p;
    }

    if (to != null) {
        to.append(charBytes, 0, count);
    }

    final Encoding fixed = encp[0];
    if (fixed == null) {
        encp[0] = enc;
    } else if (fixed != enc) {
        raisePreprocessError(runtime, str, "escaped non ASCII character in UTF-8 regexp", mode);
    }
    return p;
}

/**
 * Reject code points that are surrogates or lie above 0x10ffff.
 *
 * @param runtime current runtime
 * @param code code point to validate
 * @param str original bytes wrapper, passed to error reporting
 * @param mode error mode
 */
private static void checkUnicodeRange(Ruby runtime, int code, ByteList str, ErrorMode mode) {
    // Unicode code points need at most 21 bits, so an int is wide enough.
    final boolean surrogate = code >= 0xd800 && code <= 0xdfff;
    final boolean aboveMax = code > 0x10ffff;
    if (surrogate || aboveMax) {
        raisePreprocessError(runtime, str, "invalid Unicode range", mode);
    }
}

/**
 * Validate a Unicode code point and, when a buffer is supplied, emit it:
 * code points below 0x80 as a two-digit hex escape, everything else as its
 * UTF-8 byte sequence. A code point of 0x80 or above also fixes the
 * encoding to UTF-8, or reports an error if a different encoding has
 * already been fixed.
 *
 * @param runtime current runtime
 * @param to output buffer; if null, nothing is appended
 * @param code Unicode code point
 * @param enc in/out param holding the fixed encoding
 * @param str original wrapper of source bytes
 * @param mode error mode
 */
private static void appendUtf8(Ruby runtime, ByteList to, int code, Encoding[] enc, ByteList str, ErrorMode mode) {
    checkUnicodeRange(runtime, code, str, mode);

    final boolean emit = to != null;

    if (code < 0x80) {
        if (emit) {
            Sprintf.sprintf(runtime, to, "\\x%02X", code);
        }
        return;
    }

    if (emit) {
        // reserve room for the longest sequence the encoder may write
        to.ensure(to.getRealSize() + 6);
        final int size = to.getRealSize();
        final int written = Pack.utf8Decode(runtime, to.getUnsafeBytes(), to.getBegin() + size, code);
        to.setRealSize(size + written);
    }

    final Encoding fixed = enc[0];
    if (fixed == null) {
        enc[0] = UTF8Encoding.INSTANCE;
    } else if (!(fixed instanceof UTF8Encoding)) { // do not load the class if not used
        raisePreprocessError(runtime, str, "UTF-8 character in non UTF-8 regexp", mode);
    }
}

/**
 * Unescape the whitespace-separated hex code points of a braced Unicode
 * list (the part after the opening brace), appending each one to the given
 * out buffer if provided.
 *
 * Every code point is validated and takes part in the fixed-encoding
 * bookkeeping whether or not an output buffer is supplied, so the
 * check-only path (null buffer) reports the same errors and fixes the same
 * encoding as the buffer-producing path.
 *
 * @param runtime current runtime
 * @param to output buffer; if null, no appending will be done
 * @param bytes input bytes
 * @param p start position (just past the opening brace)
 * @param end end position
 * @param encp out param for fixed encoding
 * @param str original bytes wrapper
 * @param mode error mode
 * @return new position after unescaping (at the first byte that is neither hex digit nor space)
 */
private static int unescapeUnicodeList(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding[]encp, ByteList str, ErrorMode mode) {
    // skip leading whitespace inside the braces
    while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;

    boolean hasUnicode = false;
    while (true) {
        int code = StringSupport.scanHex(bytes, p, end - p);
        int len = StringSupport.hexLength(bytes, p, end - p);
        if (len == 0) break;
        // more than six hex digits cannot be a valid code point
        if (len > 6) raisePreprocessError(runtime, str, "invalid Unicode range", mode);
        p += len;
        // appendUtf8 guards its own writes against a null buffer, so call it
        // unconditionally: skipping it for a null buffer would also skip the
        // range check and the UTF-8 encoding fix-up on the check-only path.
        appendUtf8(runtime, to, code, encp, str, mode);
        hasUnicode = true;
        while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;
    }

    // an empty list is an error
    if (!hasUnicode) raisePreprocessError(runtime, str, "invalid Unicode list", mode);
    return p;
}

/**
 * Unescape a four-hex-digit Unicode escape at the given offset, emitting
 * the code point to the specified buffer if it is non-null.
 *
 * @param runtime current runtime
 * @param to output buffer; if null, nothing is appended
 * @param bytes input bytes
 * @param p position of the first hex digit
 * @param end end position
 * @param encp out param for fixed encoding
 * @param str original bytes wrapper
 * @param mode error mode
 * @return position just past the four hex digits
 */
private static int unescapeUnicodeBmp(Ruby runtime, ByteList to, byte[] bytes, int p, int end, Encoding[] encp, ByteList str, ErrorMode mode) {
    final int digits = 4;

    // exactly four hex digits must be available
    if (p + digits > end) {
        raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
    }

    final int codePoint = StringSupport.scanHex(bytes, p, digits);
    final int scanned = StringSupport.hexLength(bytes, p, digits);
    if (scanned != digits) {
        raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
    }

    appendUtf8(runtime, to, codePoint, encp, str, mode);
    return p + digits;
}

/**
 * Unescape non-ascii elements in the given string, appending the results
 * to the given bytelist if provided.
 *
 * Walks the bytes character by character. Raw non-ASCII characters are
 * copied through and fix the encoding to {@code enc}; backslash escapes are
 * either re-emitted as written or handed to the escape-decoding helpers;
 * all other bytes are copied unchanged.
 *
 * @param runtime current runtime
 * @param to output bytelist; if null, no appending will be done
 * @param bytes the bytes to unescape
 * @param p starting position
 * @param end ending position
 * @param enc bytes' encoding
 * @param encp out param for fixed encoding
 * @param str original wrapper for the bytes
 * @param mode error mode
 * @return whether any property elements were encountered while no encoding was fixed yet
 */
private static boolean unescapeNonAscii(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding enc, Encoding[]encp, ByteList str, ErrorMode mode) {
boolean hasProperty = false;
// one-byte scratch buffer for readEscapedByte, allocated on first use
byte[] buf = null;

while (p < end) {
int cl = StringSupport.preciseLength(enc, bytes, p, end);
if (cl <= 0) raisePreprocessError(runtime, str, "invalid multibyte character", mode);
// raw multi-byte or high-bit character: copy it through and fix the encoding
if (cl > 1 || (bytes[p] & 0x80) != 0) {
if (to != null) to.append(bytes, p, cl);
p += cl;
if (encp[0] == null) {
encp[0] = enc;
} else if (encp[0] != enc) {
raisePreprocessError(runtime, str, "non ASCII character in UTF-8 regexp", mode);
}
continue;
}
int c;
switch (c = bytes[p++] & 0xff) {
case '\\':
if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);

switch (c = bytes[p++] & 0xff) {
case '1': case '2': case '3':
case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
// values up to 0177 are re-emitted as written; larger values fall
// through and are decoded as an escaped byte
if (StringSupport.scanOct(bytes, p - 1, end - (p - 1)) <= 0177) {
if (to != null) to.append('\\').append(c);
break;
}

case '0': /* \0, \0O, \0OO */
case 'x': /* \xHH */
case 'c': /* \cX, \c\M-X */
case 'C': /* \C-X, \C-\M-X */
case 'M': /* \M-X, \M-\C-X, \M-\cX */
// rewind to the backslash so the helper sees the whole escape
p -= 2;
if (enc == USASCIIEncoding.INSTANCE) {
if (buf == null) buf = new byte[1];
p = readEscapedByte(runtime, buf, 0, bytes, p, end, str, mode);
c = buf[0];
// NOTE(review): c is widened from a byte (-128..127) while (char)-1 is
// 65535, so this comparison can never be true.
if (c == (char)-1) return false;
if (to != null) to.append(c);
}
else {
p = unescapeEscapedNonAscii(runtime, to, bytes, p, end, enc, encp, str, mode);
}
break;

case 'u':
if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
if (bytes[p] == (byte)'{') { /* \\u{H HH HHH HHHH HHHHH HHHHHH ...} */
p++;
p = unescapeUnicodeList(runtime, to, bytes, p, end, encp, str, mode);
// the list must be terminated by a closing brace
if (p == end || bytes[p++] != (byte)'}') raisePreprocessError(runtime, str, "invalid Unicode list", mode);
} else { /* \\uHHHH */
p = unescapeUnicodeBmp(runtime, to, bytes, p, end, encp, str, mode);
}
break;
case 'p': /* \p{Hiragana} */
// only recorded while no encoding has been fixed so far
if (encp[0] == null) hasProperty = true;
if (to != null) to.append('\\').append(c);
break;

default:
// any other escape is passed through untouched
if (to != null) to.append('\\').append(c);
break;
} // inner switch
break;

default:
if (to != null) to.append(c);
} // switch
} // while
return hasProperty;
}


/**
 * Preprocess a regexp source string, reporting encoding incompatibilities
 * through the given error mode.
 *
 * Builds and returns a new bytelist holding the unescaped form of the
 * source. The encoding that the content forces, if any, is reported through
 * {@code fixedEnc[0]} and is also set on the returned bytelist.
 *
 * @param runtime current runtime
 * @param str string to preprocess
 * @param enc string's encoding
 * @param fixedEnc out param: the encoding fixed by the content, or null
 * @param mode mode of errors
 * @return a new unescaped string
 */
private static ByteList preprocess(Ruby runtime, ByteList str, Encoding enc, Encoding[]fixedEnc, ErrorMode mode) {
    final ByteList unescaped = new ByteList(str.getRealSize());

    // A non-ASCII-compatible encoding is fixed from the start.
    final boolean asciiCompatible = enc.isAsciiCompatible();
    fixedEnc[0] = asciiCompatible ? null : enc;
    if (!asciiCompatible) {
        unescaped.setEncoding(enc);
    }

    final int begin = str.getBegin();
    final boolean sawProperty = unescapeNonAscii(
            runtime, unescaped, str.getUnsafeBytes(), begin, begin + str.getRealSize(), enc, fixedEnc, str, mode);

    // A property escape with nothing else fixing the encoding pins it to enc.
    if (fixedEnc[0] == null && sawProperty) {
        fixedEnc[0] = enc;
    }

    final Encoding resolved = fixedEnc[0];
    if (resolved != null) {
        unescaped.setEncoding(resolved);
    }
    return unescaped;
}

/**
* Preprocess the given string for use in regexp, raising errors for encoding
@@ -810,19 +461,19 @@ private static ByteList preprocess(Ruby runtime, ByteList str, Encoding enc, Enc
* @param fixedEnc new encoding after fixing
* @param mode mode of errors
*/
private static void preprocessLight(Ruby runtime, ByteList str, Encoding enc, Encoding[]fixedEnc, ErrorMode mode) {
private static void preprocessLight(Ruby runtime, ByteList str, Encoding enc, Encoding[]fixedEnc, RegexpSupport.ErrorMode mode) {
if (enc.isAsciiCompatible()) {
fixedEnc[0] = null;
} else {
fixedEnc[0] = enc;
}

boolean hasProperty = unescapeNonAscii(runtime, null, str.getUnsafeBytes(), str.getBegin(), str.getBegin() + str.getRealSize(), enc, fixedEnc, str, mode);
boolean hasProperty = RegexpSupport.unescapeNonAscii(runtime, null, str.getUnsafeBytes(), str.getBegin(), str.getBegin() + str.getRealSize(), enc, fixedEnc, str, mode);
if (hasProperty && fixedEnc[0] == null) fixedEnc[0] = enc;
}

public static void preprocessCheck(Ruby runtime, ByteList bytes) {
preprocess(runtime, bytes, bytes.getEncoding(), new Encoding[]{null}, ErrorMode.RAISE);
RegexpSupport.preprocess(runtime, bytes, bytes.getEncoding(), new Encoding[]{null}, RegexpSupport.ErrorMode.RAISE);
}

public static RubyString preprocessDRegexp(Ruby runtime, RubyString[] strings, int embeddedOptions) {
@@ -849,7 +500,7 @@ public static RubyString preprocessDRegexp(Ruby runtime, IRubyObject[] strings,
// This used to call preprocess, but the resulting bytelist was not
// used. Since the preprocessing error-checking can be done without
// creating a new bytelist, I added a "light" path.
RubyRegexp.preprocessLight(runtime, str.getByteList(), strEnc, fixedEnc, RubyRegexp.ErrorMode.PREPROCESS);
RubyRegexp.preprocessLight(runtime, str.getByteList(), strEnc, fixedEnc, RegexpSupport.ErrorMode.PREPROCESS);

if (fixedEnc[0] != null) {
if (regexpEnc != null && regexpEnc != fixedEnc[0]) {
@@ -1179,30 +830,6 @@ public static IRubyObject union19(ThreadContext context, IRubyObject recv, IRuby
}
}

// rb_enc_reg_raise
/**
 * Throw a regexp error whose message is the given text followed by a
 * printable description of the offending pattern.
 */
private static void raiseRegexpError19(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, String err) {
    // TODO: we lose encoding information here, fix it
    final ByteList description = regexpDescription19(runtime, bytes, options, enc);
    throw runtime.newRegexpError(err + ": " + description);
}

// rb_enc_reg_error_desc
/**
 * Build the slash-delimited description of a pattern, including its option
 * letters, from a bytelist.
 */
static ByteList regexpDescription19(Ruby runtime, ByteList bytes, RegexpOptions options, Encoding enc) {
    final byte[] raw = bytes.getUnsafeBytes();
    final int begin = bytes.getBegin();
    final int length = bytes.getRealSize();
    return regexpDescription19(runtime, raw, begin, length, options, enc);
}

/**
 * Build the slash-delimited description of a pattern from a raw byte range.
 * The result carries the pattern's encoding; the pattern text is escaped
 * against the default internal encoding, or the default external encoding
 * when no internal one is set.
 */
private static ByteList regexpDescription19(Ruby runtime, byte[] s, int start, int len, RegexpOptions options, Encoding enc) {
    final Encoding internal = runtime.getDefaultInternalEncoding();
    final Encoding resultEnc = internal != null ? internal : runtime.getDefaultExternalEncoding();

    final ByteList out = new ByteList();
    out.setEncoding(enc);

    out.append((byte) '/');
    appendRegexpString19(runtime, out, s, start, len, enc, resultEnc);
    out.append((byte) '/');

    appendOptions(out, options);
    if (options.isEncodingNone()) {
        out.append((byte) 'n');
    }
    return out;
}

/** rb_reg_init_copy
*/
@JRubyMethod(required = 1, visibility = Visibility.PRIVATE)
@@ -1297,7 +924,7 @@ private RubyRegexp initializeCommon19(RubyString str, RegexpOptions options) {
if (options.isEncodingNone()) {
if (enc != ASCIIEncoding.INSTANCE) {
if (str.scanForCodeRange() != StringSupport.CR_7BIT) {
raiseRegexpError19(getRuntime(), bytes, enc, options, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
RegexpSupport.raiseRegexpError19(getRuntime(), bytes, enc, options, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
}
enc = ASCIIEncoding.INSTANCE;
}
@@ -1314,14 +941,14 @@ private RubyRegexp initializeCommon19(ByteList bytes, Encoding enc, RegexpOption
// FIXME: Something unsets this bit, but we aren't...be more permissive until we figure this out
//if (isLiteral()) throw runtime.newSecurityError("can't modify literal regexp");
if (pattern != null) throw runtime.newTypeError("already initialized regexp");
if (enc.isDummy()) raiseRegexpError19(runtime, bytes, enc, options, "can't make regexp with dummy encoding");
if (enc.isDummy()) RegexpSupport.raiseRegexpError19(runtime, bytes, enc, options, "can't make regexp with dummy encoding");

Encoding[]fixedEnc = new Encoding[]{null};
ByteList unescaped = preprocess(runtime, bytes, enc, fixedEnc, ErrorMode.RAISE);
ByteList unescaped = RegexpSupport.preprocess(runtime, bytes, enc, fixedEnc, RegexpSupport.ErrorMode.RAISE);
if (fixedEnc[0] != null) {
if ((fixedEnc[0] != enc && options.isFixed()) ||
(fixedEnc[0] != ASCIIEncoding.INSTANCE && options.isEncodingNone())) {
raiseRegexpError19(runtime, bytes, enc, options, "incompatible character encoding");
RegexpSupport.raiseRegexpError19(runtime, bytes, enc, options, "incompatible character encoding");
}
if (fixedEnc[0] != ASCIIEncoding.INSTANCE) {
options.setFixed(true);
@@ -1688,7 +1315,7 @@ public IRubyObject inspect() {
@JRubyMethod(name = "inspect")
public IRubyObject inspect19() {
if (pattern == null) return anyToString();
return RubyString.newString(getRuntime(), regexpDescription19(getRuntime(), str, options, str.getEncoding()));
return RubyString.newString(getRuntime(), RegexpSupport.regexpDescription19(getRuntime(), str, options, str.getEncoding()));
}

private final static int EMBEDDABLE = RE_OPTION_MULTILINE|RE_OPTION_IGNORECASE|RE_OPTION_EXTENDED;
@@ -1764,7 +1391,7 @@ public IRubyObject to_s() {
}
}

appendOptions(result, newOptions);
RegexpSupport.appendOptions(result, newOptions);

if (!newOptions.isEmbeddable()) {
result.append((byte)'-');
@@ -1773,7 +1400,7 @@ public IRubyObject to_s() {
if (!newOptions.isExtended()) result.append((byte)'x');
}
result.append((byte)':');
appendRegexpString19(runtime, result, bytes, p, len, str.getEncoding(), null);
RegexpSupport.appendRegexpString19(runtime, result, bytes, p, len, str.getEncoding(), null);

result.append((byte)')');
return RubyString.newString(getRuntime(), result, getEncoding()).infectBy(this);
@@ -1828,87 +1455,6 @@ private static void appendRegexpString(Ruby runtime, ByteList to, byte[]bytes, i
}
}

/**
 * Append the source text of a pattern to the given bytelist, escaping what
 * is needed so it can be shown between slashes.
 *
 * A first pass looks for anything that needs escaping (a slash, or an ASCII
 * character the encoding does not consider printable). If nothing is found
 * the bytes are copied verbatim; otherwise a second pass copies them
 * piecewise with escapes applied.
 *
 * @param runtime current runtime
 * @param to bytelist to append to
 * @param bytes pattern bytes
 * @param start offset of the first pattern byte
 * @param len number of pattern bytes
 * @param enc encoding of the pattern bytes
 * @param resEnc if non-null, valid non-ASCII characters are written as formatted
 *               escapes instead of raw bytes
 */
private static void appendRegexpString19(Ruby runtime, ByteList to, byte[]bytes, int start, int len, Encoding enc, Encoding resEnc) {
int p = start;
int end = p + len;
boolean needEscape = false;
// pass 1: decide whether any escaping is required
while (p < end) {
final int c;
final int cl;
if (enc.isAsciiCompatible()) {
cl = 1;
c = bytes[p] & 0xff;
} else {
cl = StringSupport.preciseLength(enc, bytes, p, end);
c = enc.mbcToCode(bytes, p, end);
}

if (!Encoding.isAscii(c)) {
// non-ASCII characters do not force the escaping pass; skip over them
p += StringSupport.length(enc, bytes, p, end);
} else if (c != '/' && enc.isPrint(c)) {
p += cl;
} else {
needEscape = true;
break;
}
}
if (!needEscape) {
to.append(bytes, start, len);
} else {
// pass 2: copy with escapes applied
boolean isUnicode = StringSupport.isUnicode(enc);
p = start;
while (p < end) {
final int c;
final int cl;
if (enc.isAsciiCompatible()) {
cl = 1;
c = bytes[p] & 0xff;
} else {
cl = StringSupport.preciseLength(enc, bytes, p, end);
c = enc.mbcToCode(bytes, p, end);
}

if (c == '\\' && p + cl < end) {
// already-escaped pair: copy the backslash and the following character untouched
int n = cl + StringSupport.length(enc, bytes, p + cl, end);
to.append(bytes, p, n);
p += n;
continue;
} else if (c == '/') {
// a bare slash would end the literal, so prefix it with a backslash
to.append((byte) '\\');
to.append(bytes, p, cl);
} else if (!Encoding.isAscii(c)) {
int l = StringSupport.preciseLength(enc, bytes, p, end);
if (l <= 0) {
// invalid byte sequence: emit this single byte as a hex escape
l = 1;
Sprintf.sprintf(runtime, to, "\\x%02X", c);
} else if (resEnc != null) {
int code = enc.mbcToCode(bytes, p, end);
Sprintf.sprintf(runtime, to , StringSupport.escapedCharFormat(code, isUnicode), code);
} else {
to.append(bytes, p, l);
}
p += l;

continue;
} else if (enc.isPrint(c)) {
to.append(bytes, p, cl);
} else if (!enc.isSpace(c)) {
// non-printable, non-space ASCII: hex escape
Sprintf.sprintf(runtime, to, "\\x%02X", c);
} else {
// whitespace is kept as is
to.append(bytes, p, cl);
}
p += cl;
}
}
}

// option_to_str
/**
 * Append the option letters for the enabled flags, always in the order
 * multiline (m), ignorecase (i), extended (x).
 */
private static void appendOptions(ByteList to, RegexpOptions options) {
    if (options.isMultiline()) {
        to.append((byte) 'm');
    }
    if (options.isIgnorecase()) {
        to.append((byte) 'i');
    }
    if (options.isExtended()) {
        to.append((byte) 'x');
    }
}

private static String[] NO_NAMES = new String[] {}; //TODO: Perhaps we have another empty string arr
public String[] getNames() {
int nameLength = pattern.numberOfNames();
2 changes: 1 addition & 1 deletion core/src/main/java/org/jruby/RubyString.java
Original file line number Diff line number Diff line change
@@ -3616,7 +3616,7 @@ private Regex getStringPattern19(Ruby runtime, IRubyObject obj) {
RubyString str = getStringForPattern(obj);
if (str.scanForCodeRange() == CR_BROKEN) {
throw runtime.newRegexpError("invalid multybyte character: " +
RubyRegexp.regexpDescription19(runtime, str.value, new RegexpOptions(), str.value.getEncoding()).toString());
RegexpSupport.regexpDescription19(runtime, str.value, new RegexpOptions(), str.value.getEncoding()).toString());
}
if (str.value.getEncoding().isDummy()) {
throw runtime.newArgumentError("can't make regexp with dummy encoding");
493 changes: 493 additions & 0 deletions core/src/main/java/org/jruby/util/RegexpSupport.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,493 @@
/***** BEGIN LICENSE BLOCK *****
* Version: EPL 1.0/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Eclipse Public
* License Version 1.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.eclipse.org/legal/epl-v10.html
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* Alternatively, the contents of this file may be used under the terms of
* either of the GNU General Public License Version 2 or later (the "GPL"),
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the EPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the EPL, the GPL or the LGPL.
***** END LICENSE BLOCK *****/

package org.jruby.util;

import org.jcodings.Encoding;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.Ruby;
import org.jruby.RubyRegexp;

public class RegexpSupport {

// Selects how raisePreprocessError reports a preprocessing failure:
// RAISE      - raise a regexp error via raiseRegexpError19
// PREPROCESS - throw an argument error prefixed "regexp preprocess failed: "
// DESC       - throw nothing; raisePreprocessError simply returns 0
public enum ErrorMode {RAISE, PREPROCESS, DESC}

/**
 * Preprocess a regexp source string, reporting encoding incompatibilities
 * through the given error mode.
 *
 * Builds and returns a new bytelist holding the unescaped form of the
 * source. The encoding that the content forces, if any, is reported through
 * {@code fixedEnc[0]} and is also set on the returned bytelist.
 *
 * @param runtime current runtime
 * @param str string to preprocess
 * @param enc string's encoding
 * @param fixedEnc out param: the encoding fixed by the content, or null
 * @param mode mode of errors
 * @return a new unescaped string
 */
public static ByteList preprocess(Ruby runtime, ByteList str, Encoding enc, Encoding[] fixedEnc, ErrorMode mode) {
    final ByteList unescaped = new ByteList(str.getRealSize());

    // A non-ASCII-compatible encoding is fixed from the start.
    final boolean asciiCompatible = enc.isAsciiCompatible();
    fixedEnc[0] = asciiCompatible ? null : enc;
    if (!asciiCompatible) {
        unescaped.setEncoding(enc);
    }

    final int begin = str.getBegin();
    final boolean sawProperty = unescapeNonAscii(
            runtime, unescaped, str.getUnsafeBytes(), begin, begin + str.getRealSize(), enc, fixedEnc, str, mode);

    // A property escape with nothing else fixing the encoding pins it to enc.
    if (fixedEnc[0] == null && sawProperty) {
        fixedEnc[0] = enc;
    }

    final Encoding resolved = fixedEnc[0];
    if (resolved != null) {
        unescaped.setEncoding(resolved);
    }
    return unescaped;
}

/**
 * Unescape non-ascii elements in the given string, appending the results
 * to the given bytelist if provided.
 *
 * Walks the bytes character by character. Raw non-ASCII characters are
 * copied through and fix the encoding to {@code enc}; backslash escapes are
 * either re-emitted as written or handed to the escape-decoding helpers;
 * all other bytes are copied unchanged.
 *
 * @param runtime current runtime
 * @param to output bytelist; if null, no appending will be done
 * @param bytes the bytes to unescape
 * @param p starting position
 * @param end ending position
 * @param enc bytes' encoding
 * @param encp out param for fixed encoding
 * @param str original wrapper for the bytes
 * @param mode error mode
 * @return whether any property elements were encountered while no encoding was fixed yet
 */
public static boolean unescapeNonAscii(Ruby runtime, ByteList to, byte[] bytes, int p, int end, Encoding enc, Encoding[] encp, ByteList str, ErrorMode mode) {
boolean hasProperty = false;
// one-byte scratch buffer for readEscapedByte, allocated on first use
byte[] buf = null;

while (p < end) {
int cl = StringSupport.preciseLength(enc, bytes, p, end);
if (cl <= 0) raisePreprocessError(runtime, str, "invalid multibyte character", mode);
// raw multi-byte or high-bit character: copy it through and fix the encoding
if (cl > 1 || (bytes[p] & 0x80) != 0) {
if (to != null) to.append(bytes, p, cl);
p += cl;
if (encp[0] == null) {
encp[0] = enc;
} else if (encp[0] != enc) {
raisePreprocessError(runtime, str, "non ASCII character in UTF-8 regexp", mode);
}
continue;
}
int c;
switch (c = bytes[p++] & 0xff) {
case '\\':
if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);

switch (c = bytes[p++] & 0xff) {
case '1': case '2': case '3':
case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
// values up to 0177 are re-emitted as written; larger values fall
// through and are decoded as an escaped byte
if (StringSupport.scanOct(bytes, p - 1, end - (p - 1)) <= 0177) {
if (to != null) to.append('\\').append(c);
break;
}

case '0': /* \0, \0O, \0OO */
case 'x': /* \xHH */
case 'c': /* \cX, \c\M-X */
case 'C': /* \C-X, \C-\M-X */
case 'M': /* \M-X, \M-\C-X, \M-\cX */
// rewind to the backslash so the helper sees the whole escape
p -= 2;
if (enc == USASCIIEncoding.INSTANCE) {
if (buf == null) buf = new byte[1];
p = readEscapedByte(runtime, buf, 0, bytes, p, end, str, mode);
c = buf[0];
// NOTE(review): c is widened from a byte (-128..127) while (char)-1 is
// 65535, so this comparison can never be true.
if (c == (char)-1) return false;
if (to != null) to.append(c);
}
else {
p = unescapeEscapedNonAscii(runtime, to, bytes, p, end, enc, encp, str, mode);
}
break;

case 'u':
if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
if (bytes[p] == (byte)'{') { /* \\u{H HH HHH HHHH HHHHH HHHHHH ...} */
p++;
p = unescapeUnicodeList(runtime, to, bytes, p, end, encp, str, mode);
// the list must be terminated by a closing brace
if (p == end || bytes[p++] != (byte)'}') raisePreprocessError(runtime, str, "invalid Unicode list", mode);
} else { /* \\uHHHH */
p = unescapeUnicodeBmp(runtime, to, bytes, p, end, encp, str, mode);
}
break;
case 'p': /* \p{Hiragana} */
// only recorded while no encoding has been fixed so far
if (encp[0] == null) hasProperty = true;
if (to != null) to.append('\\').append(c);
break;

default:
// any other escape is passed through untouched
if (to != null) to.append('\\').append(c);
break;
} // inner switch
break;

default:
if (to != null) to.append(c);
} // switch
} // while
return hasProperty;
}

/**
 * Report a preprocessing failure in the way selected by the error mode.
 *
 * @param runtime current runtime, used to build the error that is thrown
 * @param str pattern bytes being preprocessed; its encoding is passed along in RAISE mode
 * @param err message describing the failure
 * @param mode how the failure should be reported
 * @return 0, reached only in DESC mode; the RAISE and PREPROCESS branches throw
 */
public static int raisePreprocessError(Ruby runtime, ByteList str, String err, ErrorMode mode) {
switch (mode) {
case RAISE:
// No break here: raiseRegexpError19 consists of a single throw, so control
// never actually falls through into the PREPROCESS case.
raiseRegexpError19(runtime, str, str.getEncoding(), RegexpOptions.NULL_OPTIONS, err);
case PREPROCESS:
throw runtime.newArgumentError("regexp preprocess failed: " + err);
case DESC:
// silent ?
}
return 0;
}

// rb_enc_reg_raise
/**
 * Throw a regexp error whose message is the given text followed by a
 * printable description of the offending pattern.
 */
public static void raiseRegexpError19(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, String err) {
    // TODO: we lose encoding information here, fix it
    final ByteList description = regexpDescription19(runtime, bytes, options, enc);
    throw runtime.newRegexpError(err + ": " + description);
}

// rb_enc_reg_error_desc
/**
 * Build the slash-delimited description of a pattern, including its option
 * letters, from a bytelist.
 */
public static ByteList regexpDescription19(Ruby runtime, ByteList bytes, RegexpOptions options, Encoding enc) {
    final byte[] raw = bytes.getUnsafeBytes();
    final int begin = bytes.getBegin();
    final int length = bytes.getRealSize();
    return regexpDescription19(runtime, raw, begin, length, options, enc);
}

/**
 * Build the slash-delimited description of a pattern from a raw byte range.
 * The result carries the pattern's encoding; the pattern text is escaped
 * against the default internal encoding, or the default external encoding
 * when no internal one is set.
 */
private static ByteList regexpDescription19(Ruby runtime, byte[] s, int start, int len, RegexpOptions options, Encoding enc) {
    final Encoding internal = runtime.getDefaultInternalEncoding();
    final Encoding resultEnc = internal != null ? internal : runtime.getDefaultExternalEncoding();

    final ByteList out = new ByteList();
    out.setEncoding(enc);

    out.append((byte) '/');
    appendRegexpString19(runtime, out, s, start, len, enc, resultEnc);
    out.append((byte) '/');

    appendOptions(out, options);
    if (options.isEncodingNone()) {
        out.append((byte) 'n');
    }
    return out;
}

/**
 * Append the source text of a pattern to the given bytelist, escaping what
 * is needed so it can be shown between slashes.
 *
 * A first pass looks for anything that needs escaping (a slash, or an ASCII
 * character the encoding does not consider printable). If nothing is found
 * the bytes are copied verbatim; otherwise a second pass copies them
 * piecewise with escapes applied.
 *
 * @param runtime current runtime
 * @param to bytelist to append to
 * @param bytes pattern bytes
 * @param start offset of the first pattern byte
 * @param len number of pattern bytes
 * @param enc encoding of the pattern bytes
 * @param resEnc if non-null, valid non-ASCII characters are written as formatted
 *               escapes instead of raw bytes
 */
public static void appendRegexpString19(Ruby runtime, ByteList to, byte[] bytes, int start, int len, Encoding enc, Encoding resEnc) {
int p = start;
int end = p + len;
boolean needEscape = false;
// pass 1: decide whether any escaping is required
while (p < end) {
final int c;
final int cl;
if (enc.isAsciiCompatible()) {
cl = 1;
c = bytes[p] & 0xff;
} else {
cl = StringSupport.preciseLength(enc, bytes, p, end);
c = enc.mbcToCode(bytes, p, end);
}

if (!Encoding.isAscii(c)) {
// non-ASCII characters do not force the escaping pass; skip over them
p += StringSupport.length(enc, bytes, p, end);
} else if (c != '/' && enc.isPrint(c)) {
p += cl;
} else {
needEscape = true;
break;
}
}
if (!needEscape) {
to.append(bytes, start, len);
} else {
// pass 2: copy with escapes applied
boolean isUnicode = StringSupport.isUnicode(enc);
p = start;
while (p < end) {
final int c;
final int cl;
if (enc.isAsciiCompatible()) {
cl = 1;
c = bytes[p] & 0xff;
} else {
cl = StringSupport.preciseLength(enc, bytes, p, end);
c = enc.mbcToCode(bytes, p, end);
}

if (c == '\\' && p + cl < end) {
// already-escaped pair: copy the backslash and the following character untouched
int n = cl + StringSupport.length(enc, bytes, p + cl, end);
to.append(bytes, p, n);
p += n;
continue;
} else if (c == '/') {
// a bare slash would end the literal, so prefix it with a backslash
to.append((byte) '\\');
to.append(bytes, p, cl);
} else if (!Encoding.isAscii(c)) {
int l = StringSupport.preciseLength(enc, bytes, p, end);
if (l <= 0) {
// invalid byte sequence: emit this single byte as a hex escape
l = 1;
Sprintf.sprintf(runtime, to, "\\x%02X", c);
} else if (resEnc != null) {
int code = enc.mbcToCode(bytes, p, end);
Sprintf.sprintf(runtime, to , StringSupport.escapedCharFormat(code, isUnicode), code);
} else {
to.append(bytes, p, l);
}
p += l;

continue;
} else if (enc.isPrint(c)) {
to.append(bytes, p, cl);
} else if (!enc.isSpace(c)) {
// non-printable, non-space ASCII: hex escape
Sprintf.sprintf(runtime, to, "\\x%02X", c);
} else {
// whitespace is kept as is
to.append(bytes, p, cl);
}
p += cl;
}
}
}

// option_to_str
/**
 * Append the option letters for the enabled flags, always in the order
 * multiline (m), ignorecase (i), extended (x).
 */
public static void appendOptions(ByteList to, RegexpOptions options) {
    if (options.isMultiline()) {
        to.append((byte) 'm');
    }
    if (options.isIgnorecase()) {
        to.append((byte) 'i');
    }
    if (options.isExtended()) {
        to.append((byte) 'x');
    }
}

/**
 * Reads one backslash escape sequence from {@code bytes}, starting at
 * {@code p}, and stores the single byte value it denotes in {@code to[toP]}.
 *
 * Recognized forms: the simple escapes (backslash, n, t, r, f, v, a, e),
 * an octal escape of up to three digits, a hex escape ({@code \xHH}) of one
 * or two digits, and the meta ({@code \M-}) and control ({@code \C-},
 * {@code \c}) prefixes, which may wrap a plain ASCII byte or a further escape.
 *
 * All problems are reported through {@code raisePreprocessError} with the
 * given {@code mode}.
 * NOTE(review): that helper is defined outside this section; if it can return
 * normally for some mode, execution continues past the failed check (and, in
 * the switch, falls through into the next case) -- confirm it always throws.
 *
 * @param runtime current runtime, forwarded to the error helper
 * @param to destination array for the decoded byte
 * @param toP index in {@code to} at which the decoded byte is stored
 * @param bytes input bytes
 * @param p position of the leading backslash
 * @param end end position (exclusive) of the input
 * @param str original bytes wrapper, forwarded to the error helper
 * @param mode error mode, forwarded to the error helper
 * @return position just past the consumed escape sequence
 */
public static int readEscapedByte(Ruby runtime, byte[] to, int toP, byte[] bytes, int p, int end, ByteList str, ErrorMode mode) {
// The sequence must start with a backslash; the backslash itself is consumed here.
if (p == end || bytes[p++] != (byte)'\\') raisePreprocessError(runtime, str, "too short escaped multibyte character", mode);

// Each prefix may appear at most once; they are applied to the final code after the switch.
boolean metaPrefix = false, ctrlPrefix = false;
int code = 0;
// The loop only repeats (via continue) when a meta/control prefix is followed by
// another backslash escape; otherwise the first pass ends in the return below.
while (true) {
if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);

switch (bytes[p++]) {
case '\\': code = '\\'; break;
case 'n': code = '\n'; break;
case 't': code = '\t'; break;
case 'r': code = '\r'; break;
case 'f': code = '\f'; break;
case 'v': code = '\013'; break;
case 'a': code = '\007'; break;
case 'e': code = '\033'; break;

/* \OOO */
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
// Step back so the digit consumed by the switch is part of the octal scan.
p--;
// Scan at most three digits, never reading past end.
int olen = end < p + 3 ? end - p : 3;
code = StringSupport.scanOct(bytes, p, olen);
p += StringSupport.octLength(bytes, p, olen);
break;

case 'x': /* \xHH */
// Scan at most two hex digits, never reading past end; at least one is required.
int hlen = end < p + 2 ? end - p : 2;
code = StringSupport.scanHex(bytes, p, hlen);
int len = StringSupport.hexLength(bytes, p, hlen);
if (len < 1) raisePreprocessError(runtime, str, "invalid hex escape", mode);
p += len;
break;

case 'M': /* \M-X, \M-\C-X, \M-\cX */
if (metaPrefix) raisePreprocessError(runtime, str, "duplicate meta escape", mode);
metaPrefix = true;
// Needs a '-' followed by a byte with the high bit clear (p++ consumes the '-').
if (p + 1 < end && bytes[p++] == (byte)'-' && (bytes[p] & 0x80) == 0) {
if (bytes[p] == (byte)'\\') {
// Nested escape: consume the backslash and decode what follows on the next pass.
p++;
continue;
} else {
code = bytes[p++] & 0xff;
break;
}
}
raisePreprocessError(runtime, str, "too short meta escape", mode);

case 'C': /* \C-X, \C-\M-X */
// Consume the mandatory '-' and then share the handling of the short form below.
if (p == end || bytes[p++] != (byte)'-') raisePreprocessError(runtime, str, "too short control escape", mode);

case 'c': /* \cX, \c\M-X */
if (ctrlPrefix) raisePreprocessError(runtime, str, "duplicate control escape", mode);
ctrlPrefix = true;
// The controlled byte must have the high bit clear.
if (p < end && (bytes[p] & 0x80) == 0) {
if (bytes[p] == (byte)'\\') {
// Nested escape: consume the backslash and decode what follows on the next pass.
p++;
continue;
} else {
code = bytes[p++] & 0xff;
break;
}
}
raisePreprocessError(runtime, str, "too short control escape", mode);
default:
raisePreprocessError(runtime, str, "unexpected escape sequence", mode);
} // switch

// The result has to fit in a single byte.
if (code < 0 || code > 0xff) raisePreprocessError(runtime, str, "invalid escape code", mode);

// Control keeps only the low five bits; meta sets the high bit.
if (ctrlPrefix) code &= 0x1f;
if (metaPrefix) code |= 0x80;

to[toP] = (byte)code;
return p;
} // while
}

/**
* Unescape escaped non-ascii character at start position, appending all
* to the given bytelist if provided.
*
* @param runtime current runtime
* @param to output bytelist; if null, no appending will be done
* @param bytes incoming bytes
* @param p start position
* @param end end position
* @param enc bytes' encoding
* @param encp out param for fixed encoding
* @param str original bytes wrapper
* @param mode error mode
* @return new position after performing unescaping
*/
// MRI: unescape_escaped_nonascii
private static int unescapeEscapedNonAscii(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding enc, Encoding[]encp, ByteList str, ErrorMode mode) {
    final byte[] decoded = new byte[enc.maxLength()];
    int count = 0;

    // Decode escaped bytes one at a time until they form a complete character
    // in enc or the buffer (sized to enc's maximum character length) is full.
    do {
        p = readEscapedByte(runtime, decoded, count++, bytes, p, end, str, mode);
    } while (count < enc.maxLength()
            && StringSupport.MBCLEN_NEEDMORE_P(StringSupport.preciseLength(enc, decoded, 0, count)));

    final int charLength = StringSupport.preciseLength(enc, decoded, 0, count);
    if (charLength == -1) { // MBCLEN_INVALID_P
        raisePreprocessError(runtime, str, "invalid multibyte escape", mode);
    }

    // A lone byte with the high bit clear is plain ASCII: keep it as a hex
    // escape and leave the fixed encoding untouched.
    final boolean singleAsciiByte = count <= 1 && (decoded[0] & 0x80) == 0;
    if (singleAsciiByte) {
        if (to != null) {
            Sprintf.sprintf(runtime, to, "\\x%02X", decoded[0] & 0xff);
        }
        return p;
    }

    // Non-ASCII content: emit the raw bytes and pin the fixed encoding to enc.
    if (to != null) {
        to.append(decoded, 0, count);
    }
    if (encp[0] == null) {
        encp[0] = enc;
    } else if (encp[0] != enc) {
        raisePreprocessError(runtime, str, "escaped non ASCII character in UTF-8 regexp", mode);
    }
    return p;
}

/**
* Unescape unicode characters at given offset, appending to the given
* out buffer if provided.
*
* @param runtime current runtime
* @param to output buffer; if null, no appending will be done
* @param bytes input bytes
* @param p start position
* @param end end position
* @param encp out param for fixed encoding
* @param str original bytes wrapper
* @param mode error mode
* @return new position after unescaping
*/
private static int unescapeUnicodeList(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding[]encp, ByteList str, ErrorMode mode) {
    // Skip ASCII whitespace before the first code point.
    while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;

    boolean hasUnicode = false;
    while (true) {
        int code = StringSupport.scanHex(bytes, p, end - p);
        int len = StringSupport.hexLength(bytes, p, end - p);
        if (len == 0) break; // no further hex digits: end of the list
        // More than six hex digits can never be a valid code point (max 10FFFF).
        if (len > 6) raisePreprocessError(runtime, str, "invalid Unicode range", mode);
        p += len;
        // Always delegate to appendUtf8, even when no output buffer is given:
        // it skips only the buffer writes for a null buffer, but still
        // validates the code point and records/checks the fixed encoding in
        // encp, exactly as the four-digit BMP form does.
        appendUtf8(runtime, to, code, encp, str, mode);
        hasUnicode = true;
        // Code points in the list are separated by ASCII whitespace.
        while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;
    }

    // An empty list is not a valid escape.
    if (!hasUnicode) raisePreprocessError(runtime, str, "invalid Unicode list", mode);
    return p;
}

/**
* Unescape unicode BMP char at given offset, appending to the specified
* buffer if non-null.
*
* @param runtime current runtime
* @param to output buffer; if null, no appending will be done
* @param bytes input bytes
* @param p start position
* @param end end position
* @param encp out param for fixed encoding
* @param str original bytes wrapper
* @param mode error mode
* @return new position after unescaping
*/
private static int unescapeUnicodeBmp(Ruby runtime, ByteList to, byte[] bytes, int p, int end, Encoding[] encp, ByteList str, ErrorMode mode) {
    // This escape form carries exactly four hex digits.
    final int hexDigits = 4;

    if (end < p + hexDigits) {
        raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
    }

    final int codePoint = StringSupport.scanHex(bytes, p, hexDigits);
    final int scanned = StringSupport.hexLength(bytes, p, hexDigits);
    if (scanned != hexDigits) {
        raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
    }

    // appendUtf8 tolerates a null buffer and still performs its checks.
    appendUtf8(runtime, to, codePoint, encp, str, mode);
    return p + hexDigits;
}

/**
* Append the given utf8 characters to the buffer, if given, checking for
* errors along the way.
*
* @param runtime current runtime
* @param to output buffer; if null, no appending will be done
* @param code utf8 character code
* @param enc output param for new encoding
* @param str original wrapper of source bytes
* @param mode error mode
*/
private static void appendUtf8(Ruby runtime, ByteList to, int code, Encoding[] enc, ByteList str, ErrorMode mode) {
    checkUnicodeRange(runtime, code, str, mode);

    // ASCII code points are written as a hex escape and do not affect the encoding.
    if (code < 0x80) {
        if (to != null) {
            Sprintf.sprintf(runtime, to, "\\x%02X", code);
        }
        return;
    }

    if (to != null) {
        // Reserve room for the encoded bytes, write them directly after the
        // current content, then grow the real size by the number written.
        final int oldSize = to.getRealSize();
        to.ensure(oldSize + 6);
        final int written = Pack.utf8Decode(runtime, to.getUnsafeBytes(), to.getBegin() + to.getRealSize(), code);
        to.setRealSize(to.getRealSize() + written);
    }

    // A non-ASCII code point pins the pattern to UTF-8.
    final Encoding fixed = enc[0];
    if (fixed == null) {
        enc[0] = UTF8Encoding.INSTANCE;
    } else if (!(fixed instanceof UTF8Encoding)) { // do not load the class if not used
        raisePreprocessError(runtime, str, "UTF-8 character in non UTF-8 regexp", mode);
    }
}

/**
 * Reports "invalid Unicode range" when {@code code} is a surrogate
 * (D800..DFFF) or lies above 10FFFF.
 *
 * @param runtime current runtime, forwarded to the error helper
 * @param code code point to check
 * @param str original bytes wrapper, forwarded to the error helper
 * @param mode error mode, forwarded to the error helper
 */
private static void checkUnicodeRange(Ruby runtime, int code, ByteList str, ErrorMode mode) {
    // Unicode code points need at most 21 bits, so an int is enough.
    final boolean surrogate = code >= 0xd800 && code <= 0xdfff;
    final boolean aboveMax = code > 0x10ffff;
    if (surrogate || aboveMax) {
        raisePreprocessError(runtime, str, "invalid Unicode range", mode);
    }
}
}

0 comments on commit 2f7aebd

Please sign in to comment.