Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: jruby/jruby
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 8574cb3d77bb^
Choose a base ref
...
head repository: jruby/jruby
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 9717929a3c10
Choose a head ref
  • 4 commits
  • 4 files changed
  • 1 contributor

Commits on Mar 13, 2015

  1. Move String#inspect's "actual encoding" logic out and use it.

    This is STR_ENC_GET/get_encoding/get_actual_encoding used in
    several String methods. This fixes issues with the UTF-16/32 dummy
    replicas confusing the transcoding system.
    headius committed Mar 13, 2015
    Copy the full SHA
    8574cb3 View commit details
  2. Fix String#b to force the dup to modify.

    Without modify here, the BINARY encoding gets applied to the
    still-shared ByteList from the original. Caused various issues in
    tests that use #b.
    headius committed Mar 13, 2015
    Copy the full SHA
    f2350fe View commit details
  3. Integer#chr fixes for unusual encodings.

    * Ensure int gets used as unsigned for 32-bit encodings.
    * Double-check resulting character to be sure it is valid.
    headius committed Mar 13, 2015
    Copy the full SHA
    f90bcc9 View commit details
  4. Copy the full SHA
    9717929 View commit details
17 changes: 12 additions & 5 deletions core/src/main/java/org/jruby/RubyInteger.java
Original file line number Diff line number Diff line change
@@ -348,26 +348,33 @@ public RubyString chr19(ThreadContext context, IRubyObject arg) {
if (enc == ASCIIEncoding.INSTANCE && value >= 0x80) {
return chr19(context);
}
return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, (int)value), enc, 0);
return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, value), enc, 0);
}

private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, int value) {
private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, long value) {
int n;
try {
n = value < 0 ? 0 : enc.codeToMbcLength(value);
n = value < 0 ? 0 : enc.codeToMbcLength((int)value);
} catch (EncodingException ee) {
n = 0;
}

if (n <= 0) throw runtime.newRangeError(this.toString() + " out of char range");

ByteList bytes = new ByteList(n);


boolean ok = false;
try {
enc.codeToMbc(value, bytes.getUnsafeBytes(), 0);
enc.codeToMbc((int)value, bytes.getUnsafeBytes(), 0);
ok = StringSupport.preciseLength(enc, bytes.unsafeBytes(), 0, n) == n;
} catch (EncodingException e) {
// ok = false, fall through
}

if (!ok) {
throw runtime.newRangeError("invalid codepoint " + String.format("0x%x in ", value) + enc.getCharsetName());
}

bytes.setRealSize(n);
return bytes;
}
117 changes: 48 additions & 69 deletions core/src/main/java/org/jruby/RubyString.java
Original file line number Diff line number Diff line change
@@ -40,7 +40,6 @@

import jnr.posix.POSIX;
import org.jcodings.Encoding;
import org.jcodings.EncodingDB;
import org.jcodings.ascii.AsciiTables;
import org.jcodings.exception.EncodingException;
import org.jcodings.specific.ASCIIEncoding;
@@ -50,7 +49,7 @@
import org.jcodings.specific.UTF32BEEncoding;
import org.jcodings.specific.UTF32LEEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jcodings.util.CaseInsensitiveBytesHash;
import org.jcodings.unicode.UnicodeEncoding;
import org.jcodings.util.IntHash;
import org.joni.Matcher;
import org.joni.Option;
@@ -257,14 +256,14 @@ private void copyCodeRange(RubyString from) {
public final int scanForCodeRange() {
int cr = getCodeRange();
if (cr == CR_UNKNOWN) {
cr = codeRangeScan(value.getEncoding(), value);
cr = codeRangeScan(EncodingUtils.getActualEncoding(getEncoding(), value), value);
setCodeRange(cr);
}
return cr;
}

final boolean singleByteOptimizable() {
return StringSupport.isSingleByteOptimizable(this, value.getEncoding());
return StringSupport.isSingleByteOptimizable(this, EncodingUtils.STR_ENC_GET(this));
}

final boolean singleByteOptimizable(Encoding enc) {
@@ -2076,54 +2075,30 @@ public IRubyObject inspect19() {
}

public static IRubyObject inspect19(Ruby runtime, ByteList byteList) {
ThreadContext context = runtime.getCurrentContext();

Encoding enc = byteList.getEncoding();
byte bytes[] = byteList.getUnsafeBytes();
int p = byteList.getBegin();
int end = p + byteList.getRealSize();
RubyString result = new RubyString(runtime, runtime.getString(), new ByteList(end - p));
Encoding enc = byteList.getEncoding();

Encoding resultEnc = runtime.getDefaultInternalEncoding();
if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;
result.associateEncoding(resultEnc);

boolean isUnicode = StringSupport.isUnicode(enc);
boolean asciiCompat = enc.isAsciiCompatible();

EncodingDB.Entry e = null;
CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = runtime.getEncodingService().getEncodings();
if (enc == encodings.get("UTF-16".getBytes()).getEncoding() && end - p > 1) {
int c0 = bytes[p] & 0xff;
int c1 = bytes[p + 1] & 0xff;

if (c0 == 0xFE && c1 == 0xFF) {
e = encodings.get("UTF-16BE".getBytes());
} else if (c0 == 0xFF && c1 == 0xFE) {
e = encodings.get("UTF-16LE".getBytes());
} else {
e = encodings.get("ASCII-8BIT".getBytes());
isUnicode = false;
}
} else if (enc == encodings.get("UTF-32".getBytes()).getEncoding() && end - p > 3) {
int c0 = bytes[p] & 0xff;
int c1 = bytes[p + 1] & 0xff;
int c2 = bytes[p + 2] & 0xff;
int c3 = bytes[p + 3] & 0xff;

if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
e = encodings.get("UTF-32BE".getBytes());
} else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
e = encodings.get("UTF-32LE".getBytes());
} else {
e = encodings.get("ASCII-8BIT".getBytes());
isUnicode = false;
}
}

if (e != null) enc = e.getEncoding();

if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;
result.associateEncoding(resultEnc);
result.cat('"');

int prev = p;
Encoding actEnc = EncodingUtils.getActualEncoding(enc, byteList);
if (actEnc != enc) {
enc = actEnc;
if (isUnicode) isUnicode = enc instanceof UnicodeEncoding;
}

while (p < end) {
int cc = 0;

@@ -3737,7 +3712,7 @@ private IRubyObject populateCapturesForScan(Ruby runtime, Matcher matcher, int r
@JRubyMethod(name = "scan", reads = BACKREF, writes = BACKREF)
public IRubyObject scan19(ThreadContext context, IRubyObject arg, Block block) {
Ruby runtime = context.runtime;
Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
final Regex pattern, prepared;
final RubyRegexp regexp;
final int tuFlags;
@@ -3910,11 +3885,11 @@ private boolean end_with_pCommon(IRubyObject arg) {

private static final ByteList SPACE_BYTELIST = new ByteList(ByteList.plain(" "));

private IRubyObject justify19(IRubyObject arg0, int jflag) {
Ruby runtime = getRuntime();
private IRubyObject justify19(ThreadContext context, IRubyObject arg0, int jflag) {
Ruby runtime = context.runtime;
RubyString result = justifyCommon(runtime, SPACE_BYTELIST,
1,
true, value.getEncoding(), RubyFixnum.num2int(arg0), jflag);
1,
true, EncodingUtils.STR_ENC_GET(this), RubyFixnum.num2int(arg0), jflag);
if (getCodeRange() != CR_BROKEN) result.setCodeRange(getCodeRange());
return result;
}
@@ -4014,7 +3989,7 @@ public IRubyObject ljust(IRubyObject arg0, IRubyObject arg1) {

@JRubyMethod(name = "ljust")
public IRubyObject ljust19(IRubyObject arg0) {
return justify19(arg0, 'l');
return justify19(getRuntime().getCurrentContext(), arg0, 'l');
}

@JRubyMethod(name = "ljust")
@@ -4035,7 +4010,7 @@ public IRubyObject rjust(IRubyObject arg0, IRubyObject arg1) {

@JRubyMethod(name = "rjust")
public IRubyObject rjust19(IRubyObject arg0) {
return justify19(arg0, 'r');
return justify19(getRuntime().getCurrentContext(), arg0, 'r');
}

@JRubyMethod(name = "rjust")
@@ -4056,7 +4031,7 @@ public IRubyObject center(IRubyObject arg0, IRubyObject arg1) {

@JRubyMethod(name = "center")
public IRubyObject center19(IRubyObject arg0) {
return justify19(arg0, 'c');
return justify19(getRuntime().getCurrentContext(), arg0, 'c');
}

@JRubyMethod(name = "center")
@@ -4148,26 +4123,26 @@ public IRubyObject chop_bang(ThreadContext context) {
public IRubyObject chop19(ThreadContext context) {
Ruby runtime = context.runtime;
if (value.getRealSize() == 0) return newEmptyString(runtime, getMetaClass(), value.getEncoding()).infectBy(this);
return makeShared19(runtime, 0, choppedLength19(runtime));
return makeShared19(runtime, 0, choppedLength19(runtime, context));
}

@JRubyMethod(name = "chop!")
public IRubyObject chop_bang19(ThreadContext context) {
modifyCheck();
Ruby runtime = context.runtime;
if (value.getRealSize() == 0) return runtime.getNil();
view(0, choppedLength19(runtime));
view(0, choppedLength19(runtime, context));
if (getCodeRange() != CR_7BIT) clearCodeRange();
return this;
}

private int choppedLength19(Ruby runtime) {
private int choppedLength19(Ruby runtime, ThreadContext context) {
int p = value.getBegin();
int end = p + value.getRealSize();

if (p > end) return 0;
byte bytes[] = value.getUnsafeBytes();
Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);

int s = enc.prevCharHead(bytes, p, end, end);
if (s == -1) return 0;
@@ -4345,7 +4320,7 @@ public IRubyObject lstrip_bang19(ThreadContext context) {
return runtime.getNil();
}

Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
int s = value.getBegin();
int end = s + value.getRealSize();
byte[]bytes = value.getUnsafeBytes();
@@ -4413,8 +4388,9 @@ public IRubyObject rstrip_bang19(ThreadContext context) {
return runtime.getNil();
}

IRubyObject result = singleByteOptimizable(value.getEncoding()) ?
singleByteRStrip19(runtime) : multiByteRStrip19(runtime);
Encoding enc = EncodingUtils.STR_ENC_GET(this);
IRubyObject result = singleByteOptimizable(enc) ?
singleByteRStrip19(runtime) : multiByteRStrip19(runtime, context);

keepCodeRange();
return result;
@@ -4438,11 +4414,11 @@ private IRubyObject singleByteRStrip19(Ruby runtime) {
}

// In 1.9 we strip any combination of \0 and \s
private IRubyObject multiByteRStrip19(Ruby runtime) {
private IRubyObject multiByteRStrip19(Ruby runtime, ThreadContext context) {
byte[] bytes = value.getUnsafeBytes();
int start = value.getBegin();
int end = start + value.getRealSize();
Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
int endp = end;
int prev;
while ((prev = enc.prevCharHead(bytes, start, endp, end)) != -1) {
@@ -4512,17 +4488,20 @@ public IRubyObject count19(ThreadContext context, IRubyObject arg) {
if (value.getRealSize() == 0) return RubyFixnum.zero(runtime);

RubyString otherStr = arg.convertToString();
ByteList otherBL = otherStr.getByteList();
Encoding enc = checkEncoding(otherStr);

int c;
if (otherStr.value.length() == 1 && enc.isAsciiCompatible() &&
((c = otherStr.value.unsafeBytes()[otherStr.value.getBegin()] & 0xff)) < 0x80 && scanForCodeRange() != CR_BROKEN) {
if (otherBL.length() == 1 && enc.isAsciiCompatible() &&
enc.isReverseMatchAllowed(otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize()) &&
scanForCodeRange() != CR_BROKEN) {
int n = 0;
int[] len_p = {0};
int c = EncodingUtils.encCodepointLength(runtime, otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize(), len_p, enc);

if (value.length() ==0) return RubyFixnum.zero(runtime);
byte[]bytes = value.unsafeBytes();
int p = value.getBegin();
int end = p + value.length();
int n = 0;
while (p < end) {
if ((bytes[p++] & 0xff) == c) n++;
}
@@ -5371,7 +5350,7 @@ private IRubyObject enumerateCodepoints(ThreadContext context, String name, Bloc
ptrBytes = strByteList.unsafeBytes();
ptr = strByteList.begin();
end = ptr + strByteList.getRealSize();
enc = str.getEncoding();
enc = EncodingUtils.STR_ENC_GET(str);

if (block.isGiven()) {
if (wantarray) {
@@ -5492,8 +5471,8 @@ public RubySymbol intern19() {
@JRubyMethod
public IRubyObject ord(ThreadContext context) {
Ruby runtime = context.runtime;
return RubyFixnum.newFixnum(runtime, codePoint(runtime, value.getEncoding(), value.getUnsafeBytes(), value.getBegin(),
value.getBegin() + value.getRealSize()));
return RubyFixnum.newFixnum(runtime, codePoint(runtime, EncodingUtils.STR_ENC_GET(this), value.getUnsafeBytes(), value.getBegin(),
value.getBegin() + value.getRealSize()));
}

@JRubyMethod
@@ -5680,9 +5659,9 @@ public IRubyObject ascii_only_p(ThreadContext context) {
@JRubyMethod
public IRubyObject b(ThreadContext context) {
Encoding encoding = ASCIIEncoding.INSTANCE;
RubyString dup = (RubyString)dup();
dup.associateEncoding(encoding);
dup.clearCodeRange();
RubyString dup = strDup(context.runtime);
dup.modify19();
dup.setEncoding(encoding);
return dup;
}

@@ -5803,7 +5782,7 @@ public IRubyObject strScrub(ThreadContext context, IRubyObject repl, Block block
if (cr == CR_7BIT || cr == CR_VALID)
return context.nil;

enc = getEncoding();
enc = EncodingUtils.STR_ENC_GET(this);
if (!repl.isNil()) {
repl = EncodingUtils.strCompatAndValid(context, repl, enc);
}
49 changes: 45 additions & 4 deletions core/src/main/java/org/jruby/util/io/EncodingUtils.java
Original file line number Diff line number Diff line change
@@ -4,7 +4,6 @@
import org.jcodings.EncodingDB;
import org.jcodings.Ptr;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF16BEEncoding;
import org.jcodings.specific.UTF16LEEncoding;
import org.jcodings.specific.UTF32BEEncoding;
@@ -16,6 +15,7 @@
import org.jcodings.transcode.Transcoder;
import org.jcodings.transcode.TranscoderDB;
import org.jcodings.transcode.Transcoding;
import org.jcodings.util.CaseInsensitiveBytesHash;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyBasicObject;
@@ -24,7 +24,6 @@
import org.jruby.RubyFixnum;
import org.jruby.RubyHash;
import org.jruby.RubyIO;
import org.jruby.RubyInteger;
import org.jruby.RubyMethod;
import org.jruby.RubyNumeric;
import org.jruby.RubyProc;
@@ -1810,17 +1809,59 @@ public static IRubyObject strCompatAndValid(ThreadContext context, IRubyObject _
throw context.runtime.newArgumentError("replacement must be valid byte sequence '" + str + "'");
}
else if (cr == StringSupport.CR_7BIT) {
Encoding e = str.getEncoding();
Encoding e = STR_ENC_GET(str);
if (!enc.isAsciiCompatible()) {
throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
}
}
else { /* ENC_CODERANGE_VALID */
Encoding e = str.getEncoding();
Encoding e = STR_ENC_GET(str);
if (enc != e) {
throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
}
}
return str;
}

// MRI: get_encoding
/**
 * Resolves the encoding to use for the given bytes by handing the
 * ByteList's own encoding, together with the ByteList itself, to
 * {@link #getActualEncoding(Encoding, ByteList)}.
 *
 * @param str the bytes whose encoding should be resolved
 * @return whatever {@code getActualEncoding} returns for {@code str}
 */
public static Encoding getEncoding(ByteList str) {
    final Encoding declared = str.getEncoding();
    return getActualEncoding(declared, str);
}

// MRI: get_actual_encoding
/**
 * Picks a concrete encoding for content tagged with the encodings
 * registered under the names "UTF-16" or "UTF-32" by inspecting the
 * leading byte-order mark.
 *
 * <ul>
 *   <li>"UTF-16" with at least 2 bytes: FE FF selects UTF-16BE, FF FE
 *       selects UTF-16LE, anything else selects ASCII-8BIT.</li>
 *   <li>"UTF-32" with at least 4 bytes: 00 00 FE FF selects UTF-32BE,
 *       FF FE 00 00 selects UTF-32LE, anything else selects ASCII-8BIT.</li>
 *   <li>Any other encoding, or content too short to hold a mark, is
 *       returned unchanged.</li>
 * </ul>
 *
 * @param enc      the encoding the content is currently tagged with
 * @param byteList the content to sniff; only its first bytes are read
 * @return the encoding selected by the rules above
 */
public static Encoding getActualEncoding(Encoding enc, ByteList byteList) {
    final byte[] raw = byteList.unsafeBytes();
    final int start = byteList.begin();
    final int length = byteList.getRealSize();

    final CaseInsensitiveBytesHash<EncodingDB.Entry> table = EncodingDB.getEncodings();

    if (enc == table.get("UTF-16".getBytes()).getEncoding() && length >= 2) {
        // Pack the first two bytes big-endian so each mark is one constant.
        final int mark = ((raw[start] & 0xff) << 8) | (raw[start + 1] & 0xff);
        switch (mark) {
            case 0xFEFF:
                return UTF16BEEncoding.INSTANCE;
            case 0xFFFE:
                return UTF16LEEncoding.INSTANCE;
            default:
                return ASCIIEncoding.INSTANCE;
        }
    }

    // The "UTF-32" entry is only looked up when the UTF-16 branch did not apply.
    if (enc == table.get("UTF-32".getBytes()).getEncoding() && length >= 4) {
        // Pack the first four bytes big-endian; 0xFFFE0000 is a negative int
        // literal, which matches the wrapped shift of a leading 0xFF byte.
        final int mark = ((raw[start] & 0xff) << 24)
                | ((raw[start + 1] & 0xff) << 16)
                | ((raw[start + 2] & 0xff) << 8)
                | (raw[start + 3] & 0xff);
        switch (mark) {
            case 0x0000FEFF:
                return UTF32BEEncoding.INSTANCE;
            case 0xFFFE0000:
                return UTF32LEEncoding.INSTANCE;
            default:
                return ASCIIEncoding.INSTANCE;
        }
    }

    return enc;
}

/**
 * Resolves the encoding for a string by passing its backing ByteList to
 * {@link #getEncoding(ByteList)}.
 *
 * @param str the string whose encoding should be resolved
 * @return the encoding {@code getEncoding} reports for the string's bytes
 */
public static Encoding STR_ENC_GET(RubyString str) {
    final ByteList contents = str.getByteList();
    return getEncoding(contents);
}
}
13 changes: 1 addition & 12 deletions test/mri/excludes/TestM17N.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
exclude :test_chr, "needs investigation"
exclude :test_delete, "needs investigation"
exclude :test_dynamic_eucjp_regexp, "needs investigation"
exclude :test_dynamic_sjis_regexp, "needs investigation"
@@ -15,21 +14,11 @@
exclude :test_regexp_unicode, "needs investigation"
exclude :test_regexp_usascii, "needs investigation"
exclude :test_scrub, ""
exclude :test_scrub_bang, ""
exclude :test_count_sjis_trailing_byte, "needs investigation"
exclude :test_sprintf_c, "format string encoding should be used to decode incoming fixnums"
exclude :test_str_concat, "needs investigation"
exclude :test_string_inspect_encoding, "needs investigation"
exclude :test_string_mixed_unicode, "needs investigation"
exclude :test_symbol, "needs investigation"
exclude :test_symbol_op, "random operators setting encoding earlier"
exclude :test_symbol_op, "some symbols are created early and do not have UTF-8 encoding"
exclude :test_union_1_nonascii_string, "needs investigation"
exclude :'test_utf_16_32_codepoints(UTF-16BE)', "needs investigation"
exclude :'test_utf_16_32_codepoints(UTF-16LE)', "needs investigation"
exclude :'test_utf_16_32_codepoints(UTF-32BE)', "needs investigation"
exclude :'test_utf_16_32_codepoints(UTF-32LE)', "needs investigation"
exclude :'test_utf_16_32_ord(UTF-16BE)', "needs investigation"
exclude :'test_utf_16_32_ord(UTF-16LE)', "needs investigation"
exclude :'test_utf_16_32_ord(UTF-32BE)', "needs investigation"
exclude :'test_utf_16_32_ord(UTF-32LE)', "needs investigation"
exclude :test_valid_encoding, "needs investigation"