Skip to content


Showing 5 changed files with 106 additions and 91 deletions.
17 changes: 12 additions & 5 deletions core/src/main/java/org/jruby/
Original file line number Diff line number Diff line change
@@ -348,26 +348,33 @@ public RubyString chr19(ThreadContext context, IRubyObject arg) {
if (enc == ASCIIEncoding.INSTANCE && value >= 0x80) {
return chr19(context);
return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, (int)value), enc, 0);
return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, value), enc, 0);

private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, int value) {
private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, long value) {
int n;
try {
n = value < 0 ? 0 : enc.codeToMbcLength(value);
n = value < 0 ? 0 : enc.codeToMbcLength((int)value);
} catch (EncodingException ee) {
n = 0;

if (n <= 0) throw runtime.newRangeError(this.toString() + " out of char range");

ByteList bytes = new ByteList(n);

boolean ok = false;
try {
enc.codeToMbc(value, bytes.getUnsafeBytes(), 0);
enc.codeToMbc((int)value, bytes.getUnsafeBytes(), 0);
ok = StringSupport.preciseLength(enc, bytes.unsafeBytes(), 0, n) == n;
} catch (EncodingException e) {
// ok = false, fall through

if (!ok) {
throw runtime.newRangeError("invalid codepoint " + String.format("0x%x in ", value) + enc.getCharsetName());

return bytes;
103 changes: 41 additions & 62 deletions core/src/main/java/org/jruby/
Original file line number Diff line number Diff line change
@@ -40,7 +40,6 @@

import jnr.posix.POSIX;
import org.jcodings.Encoding;
import org.jcodings.EncodingDB;
import org.jcodings.ascii.AsciiTables;
import org.jcodings.exception.EncodingException;
import org.jcodings.specific.ASCIIEncoding;
@@ -50,7 +49,7 @@
import org.jcodings.specific.UTF32BEEncoding;
import org.jcodings.specific.UTF32LEEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jcodings.util.CaseInsensitiveBytesHash;
import org.jcodings.unicode.UnicodeEncoding;
import org.jcodings.util.IntHash;
import org.joni.Matcher;
import org.joni.Option;
@@ -259,14 +258,14 @@ private void copyCodeRange(RubyString from) {
public final int scanForCodeRange() {
int cr = getCodeRange();
if (cr == CR_UNKNOWN) {
cr = codeRangeScan(value.getEncoding(), value);
cr = codeRangeScan(EncodingUtils.getActualEncoding(getEncoding(), value), value);
return cr;

final boolean singleByteOptimizable() {
return StringSupport.isSingleByteOptimizable(this, value.getEncoding());
return StringSupport.isSingleByteOptimizable(this, EncodingUtils.STR_ENC_GET(this));

final boolean singleByteOptimizable(Encoding enc) {
@@ -2036,54 +2035,30 @@ public IRubyObject inspect19() {

public static IRubyObject inspect19(Ruby runtime, ByteList byteList) {
ThreadContext context = runtime.getCurrentContext();

Encoding enc = byteList.getEncoding();
byte bytes[] = byteList.getUnsafeBytes();
int p = byteList.getBegin();
int end = p + byteList.getRealSize();
RubyString result = new RubyString(runtime, runtime.getString(), new ByteList(end - p));
Encoding enc = byteList.getEncoding();

Encoding resultEnc = runtime.getDefaultInternalEncoding();
if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;

boolean isUnicode = StringSupport.isUnicode(enc);
boolean asciiCompat = enc.isAsciiCompatible();

EncodingDB.Entry e = null;
CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = runtime.getEncodingService().getEncodings();
if (enc == encodings.get("UTF-16".getBytes()).getEncoding() && end - p > 1) {
int c0 = bytes[p] & 0xff;
int c1 = bytes[p + 1] & 0xff;

if (c0 == 0xFE && c1 == 0xFF) {
e = encodings.get("UTF-16BE".getBytes());
} else if (c0 == 0xFF && c1 == 0xFE) {
e = encodings.get("UTF-16LE".getBytes());
} else {
e = encodings.get("ASCII-8BIT".getBytes());
isUnicode = false;
} else if (enc == encodings.get("UTF-32".getBytes()).getEncoding() && end - p > 3) {
int c0 = bytes[p] & 0xff;
int c1 = bytes[p + 1] & 0xff;
int c2 = bytes[p + 2] & 0xff;
int c3 = bytes[p + 3] & 0xff;

if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
e = encodings.get("UTF-32BE".getBytes());
} else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
e = encodings.get("UTF-32LE".getBytes());
} else {
e = encodings.get("ASCII-8BIT".getBytes());
isUnicode = false;

if (e != null) enc = e.getEncoding();

if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;

int prev = p;
Encoding actEnc = EncodingUtils.getActualEncoding(enc, byteList);
if (actEnc != enc) {
enc = actEnc;
if (isUnicode) isUnicode = enc instanceof UnicodeEncoding;

while (p < end) {
int cc = 0;

@@ -3708,7 +3683,7 @@ private IRubyObject populateCapturesForScan(Ruby runtime, Matcher matcher, int r
@JRubyMethod(name = "scan", reads = BACKREF, writes = BACKREF)
public IRubyObject scan19(ThreadContext context, IRubyObject arg, Block block) {
Ruby runtime = context.runtime;
Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
final Regex pattern, prepared;
final RubyRegexp regexp;
final int tuFlags;
@@ -3881,11 +3856,11 @@ private boolean end_with_pCommon(IRubyObject arg) {

private static final ByteList SPACE_BYTELIST = new ByteList(ByteList.plain(" "));

private IRubyObject justify19(IRubyObject arg0, int jflag) {
Ruby runtime = getRuntime();
private IRubyObject justify19(ThreadContext context, IRubyObject arg0, int jflag) {
Ruby runtime = context.runtime;
RubyString result = justifyCommon(runtime, SPACE_BYTELIST,
true, value.getEncoding(), RubyFixnum.num2int(arg0), jflag);
true, EncodingUtils.STR_ENC_GET(this), RubyFixnum.num2int(arg0), jflag);
if (getCodeRange() != CR_BROKEN) result.setCodeRange(getCodeRange());
return result;
@@ -3985,7 +3960,7 @@ public IRubyObject ljust(IRubyObject arg0, IRubyObject arg1) {

@JRubyMethod(name = "ljust")
public IRubyObject ljust19(IRubyObject arg0) {
return justify19(arg0, 'l');
return justify19(getRuntime().getCurrentContext(), arg0, 'l');

@JRubyMethod(name = "ljust")
@@ -4006,7 +3981,7 @@ public IRubyObject rjust(IRubyObject arg0, IRubyObject arg1) {

@JRubyMethod(name = "rjust")
public IRubyObject rjust19(IRubyObject arg0) {
return justify19(arg0, 'r');
return justify19(getRuntime().getCurrentContext(), arg0, 'r');

@JRubyMethod(name = "rjust")
@@ -4027,7 +4002,7 @@ public IRubyObject center(IRubyObject arg0, IRubyObject arg1) {

@JRubyMethod(name = "center")
public IRubyObject center19(IRubyObject arg0) {
return justify19(arg0, 'c');
return justify19(getRuntime().getCurrentContext(), arg0, 'c');

@JRubyMethod(name = "center")
@@ -4296,7 +4271,7 @@ public IRubyObject lstrip_bang19(ThreadContext context) {
return runtime.getNil();

Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
int s = value.getBegin();
int end = s + value.getRealSize();
byte[]bytes = value.getUnsafeBytes();
@@ -4364,8 +4339,9 @@ public IRubyObject rstrip_bang19(ThreadContext context) {
return runtime.getNil();

IRubyObject result = singleByteOptimizable(value.getEncoding()) ?
singleByteRStrip19(runtime) : multiByteRStrip19(runtime);
Encoding enc = EncodingUtils.STR_ENC_GET(this);
IRubyObject result = singleByteOptimizable(enc) ?
singleByteRStrip19(runtime) : multiByteRStrip19(runtime, context);

return result;
@@ -4389,11 +4365,11 @@ private IRubyObject singleByteRStrip19(Ruby runtime) {

// In 1.9 we strip any combination of \0 and \s
private IRubyObject multiByteRStrip19(Ruby runtime) {
private IRubyObject multiByteRStrip19(Ruby runtime, ThreadContext context) {
byte[] bytes = value.getUnsafeBytes();
int start = value.getBegin();
int end = start + value.getRealSize();
Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
int endp = end;
int prev;
while ((prev = enc.prevCharHead(bytes, start, endp, end)) != -1) {
@@ -4463,17 +4439,20 @@ public IRubyObject count19(ThreadContext context, IRubyObject arg) {
if (value.getRealSize() == 0) return;

RubyString otherStr = arg.convertToString();
ByteList otherBL = otherStr.getByteList();
Encoding enc = checkEncoding(otherStr);

int c;
if (otherStr.value.length() == 1 && enc.isAsciiCompatible() &&
((c = otherStr.value.unsafeBytes()[otherStr.value.getBegin()] & 0xff)) < 0x80 && scanForCodeRange() != CR_BROKEN) {
if (otherBL.length() == 1 && enc.isAsciiCompatible() &&
enc.isReverseMatchAllowed(otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize()) &&
scanForCodeRange() != CR_BROKEN) {
int n = 0;
int[] len_p = {0};
int c = EncodingUtils.encCodepointLength(runtime, otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize(), len_p, enc);

if (value.length() ==0) return;
byte[]bytes = value.unsafeBytes();
int p = value.getBegin();
int end = p + value.length();
int n = 0;
while (p < end) {
if ((bytes[p++] & 0xff) == c) n++;
@@ -5357,7 +5336,7 @@ private IRubyObject enumerateCodepoints(ThreadContext context, String name, Bloc
ptrBytes = strByteList.unsafeBytes();
ptr = strByteList.begin();
end = ptr + strByteList.getRealSize();
enc = str.getEncoding();
enc = EncodingUtils.STR_ENC_GET(str);

if (block.isGiven()) {
if (wantarray) {
@@ -5478,8 +5457,8 @@ public RubySymbol intern19() {
public IRubyObject ord(ThreadContext context) {
Ruby runtime = context.runtime;
return RubyFixnum.newFixnum(runtime, codePoint(runtime, value.getEncoding(), value.getUnsafeBytes(), value.getBegin(),
value.getBegin() + value.getRealSize()));
return RubyFixnum.newFixnum(runtime, codePoint(runtime, EncodingUtils.STR_ENC_GET(this), value.getUnsafeBytes(), value.getBegin(),
value.getBegin() + value.getRealSize()));

@@ -5789,7 +5768,7 @@ public IRubyObject strScrub(ThreadContext context, IRubyObject repl, Block block
if (cr == CR_7BIT || cr == CR_VALID)
return context.nil;

enc = getEncoding();
enc = EncodingUtils.STR_ENC_GET(this);
if (!repl.isNil()) {
repl = EncodingUtils.strCompatAndValid(context, repl, enc);
8 changes: 7 additions & 1 deletion core/src/main/java/org/jruby/parser/
Original file line number Diff line number Diff line change
@@ -46,6 +46,7 @@
import org.jruby.ast.types.INameNode;
import org.jruby.common.IRubyWarnings;
import org.jruby.common.IRubyWarnings.ID;
import org.jruby.exceptions.RaiseException;
import org.jruby.lexer.yacc.ISourcePosition;
import org.jruby.lexer.yacc.ISourcePositionHolder;
import org.jruby.lexer.yacc.RubyLexer;
@@ -1222,8 +1223,13 @@ public Node arg_append(Node node1, Node node2) {
// MRI: reg_fragment_check
// Applies the regexp encoding to the node via setRegexpEncoding, then validates the
// fragment bytes with RubyRegexp.preprocessCheck using the configured runtime.
// NOTE(review): this listing shows preprocessCheck both as a bare call and inside the
// try block; presumably the bare call is the pre-change line and only the guarded call
// is meant to remain - confirm against the real file.
public void regexpFragmentCheck(RegexpNode end, ByteList value) {
setRegexpEncoding(end, value);
RubyRegexp.preprocessCheck(configuration.getRuntime(), value);
try {
RubyRegexp.preprocessCheck(configuration.getRuntime(), value);
} catch (RaiseException re) {
// NOTE(review): as shown, this catch has no body, so a RaiseException raised by
// preprocessCheck is silently discarded and the invalid fragment is accepted.
// If the failure should surface as a parse error, report re.getMessage() through
// the parser's error channel here - TODO confirm which reporting hook applies.
} // 1.9 mode overrides to do extra checking...

private List<Integer> allocateNamedLocals(RegexpNode regexpNode) {
RubyRegexp pattern = RubyRegexp.newRegexp(configuration.getRuntime(), regexpNode.getValue(), regexpNode.getOptions());
49 changes: 45 additions & 4 deletions core/src/main/java/org/jruby/util/io/
Original file line number Diff line number Diff line change
@@ -4,7 +4,6 @@
import org.jcodings.EncodingDB;
import org.jcodings.Ptr;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF16BEEncoding;
import org.jcodings.specific.UTF16LEEncoding;
import org.jcodings.specific.UTF32BEEncoding;
@@ -16,6 +15,7 @@
import org.jcodings.transcode.Transcoder;
import org.jcodings.transcode.TranscoderDB;
import org.jcodings.transcode.Transcoding;
import org.jcodings.util.CaseInsensitiveBytesHash;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyBasicObject;
@@ -24,7 +24,6 @@
import org.jruby.RubyFixnum;
import org.jruby.RubyHash;
import org.jruby.RubyIO;
import org.jruby.RubyInteger;
import org.jruby.RubyMethod;
import org.jruby.RubyNumeric;
import org.jruby.RubyProc;
@@ -1810,17 +1809,59 @@ public static IRubyObject strCompatAndValid(ThreadContext context, IRubyObject _
throw context.runtime.newArgumentError("replacement must be valid byte sequence '" + str + "'");
else if (cr == StringSupport.CR_7BIT) {
Encoding e = str.getEncoding();
Encoding e = STR_ENC_GET(str);
if (!enc.isAsciiCompatible()) {
throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
Encoding e = str.getEncoding();
Encoding e = STR_ENC_GET(str);
if (enc != e) {
throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
return str;

// MRI: get_encoding
public static Encoding getEncoding(ByteList str) {
return getActualEncoding(str.getEncoding(), str);

// MRI: get_actual_encoding
public static Encoding getActualEncoding(Encoding enc, ByteList byteList) {
byte[] bytes = byteList.unsafeBytes();
int p = byteList.begin();
int end = p + byteList.getRealSize();

CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = EncodingDB.getEncodings();
if (enc == encodings.get("UTF-16".getBytes()).getEncoding() && end - p >= 2) {
int c0 = bytes[p] & 0xff;
int c1 = bytes[p + 1] & 0xff;

if (c0 == 0xFE && c1 == 0xFF) {
return UTF16BEEncoding.INSTANCE;
} else if (c0 == 0xFF && c1 == 0xFE) {
return UTF16LEEncoding.INSTANCE;
return ASCIIEncoding.INSTANCE;
} else if (enc == encodings.get("UTF-32".getBytes()).getEncoding() && end - p >= 4) {
int c0 = bytes[p] & 0xff;
int c1 = bytes[p + 1] & 0xff;
int c2 = bytes[p + 2] & 0xff;
int c3 = bytes[p + 3] & 0xff;

if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
return UTF32BEEncoding.INSTANCE;
} else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
return UTF32LEEncoding.INSTANCE;
return ASCIIEncoding.INSTANCE;
return enc;

public static Encoding STR_ENC_GET(RubyString str) {
return getEncoding(str.getByteList());
20 changes: 1 addition & 19 deletions test/mri/excludes/TestM17N.rb
Original file line number Diff line number Diff line change
@@ -1,35 +1,17 @@
exclude :test_chr, "needs investigation"
exclude :test_delete, "needs investigation"
exclude :test_dynamic_eucjp_regexp, "needs investigation"
exclude :test_dynamic_sjis_regexp, "needs investigation"
exclude :test_dynamic_utf8_regexp, "needs investigation"
exclude :test_end_with, "needs investigation"
exclude :test_euc_tw, "needs investigation"
exclude :test_force_encoding, "needs investigation"
exclude :test_nonascii_method_name, "needs investigation"
exclude :test_object_inspect_external, "needs investigation"
exclude :test_object_utf16_32_inspect, "needs investigation"
exclude :test_regexp_ascii, "needs investigation"
exclude :test_regexp_mixed_unicode, "needs investigation"
exclude :test_regexp_too_short_multibyte_character, "needs investigation"
exclude :test_regexp_unicode, "needs investigation"
exclude :test_regexp_usascii, "needs investigation"
exclude :test_scrub, ""
exclude :test_scrub_bang, ""
exclude :test_count_sjis_trailing_byte, "needs investigation"
exclude :test_sprintf_c, "format string encoding should be used to decode incoming fixnums"
exclude :test_str_concat, "needs investigation"
exclude :test_string_inspect_encoding, "needs investigation"
exclude :test_string_mixed_unicode, "needs investigation"
exclude :test_symbol, "needs investigation"
exclude :test_symbol_op, "random operators setting encoding earlier"
exclude :test_symbol_op, "some symbols are created early and do not have UTF-8 encoding"
exclude :test_union_1_nonascii_string, "needs investigation"
exclude :'test_utf_16_32_codepoints(UTF-16BE)', "needs investigation"
exclude :'test_utf_16_32_codepoints(UTF-16LE)', "needs investigation"
exclude :'test_utf_16_32_codepoints(UTF-32BE)', "needs investigation"
exclude :'test_utf_16_32_codepoints(UTF-32LE)', "needs investigation"
exclude :'test_utf_16_32_ord(UTF-16BE)', "needs investigation"
exclude :'test_utf_16_32_ord(UTF-16LE)', "needs investigation"
exclude :'test_utf_16_32_ord(UTF-32BE)', "needs investigation"
exclude :'test_utf_16_32_ord(UTF-32LE)', "needs investigation"
exclude :test_valid_encoding, "needs investigation"

0 comments on commit 436dd30

Please sign in to comment.