Skip to content

Commit

Permalink
Merge branch 'test-encoding-fixes' into string-quagmire
Browse files Browse the repository at this point in the history
Conflicts:
	core/src/main/java/org/jruby/RubyString.java
  • Loading branch information
headius committed Mar 19, 2015
2 parents fe821de + 3ffcfa9 commit 436dd30
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 91 deletions.
17 changes: 12 additions & 5 deletions core/src/main/java/org/jruby/RubyInteger.java
Expand Up @@ -348,26 +348,33 @@ public RubyString chr19(ThreadContext context, IRubyObject arg) {
if (enc == ASCIIEncoding.INSTANCE && value >= 0x80) {
return chr19(context);
}
return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, (int)value), enc, 0);
return RubyString.newStringNoCopy(runtime, fromEncodedBytes(runtime, enc, value), enc, 0);
}

/**
 * Encodes the code point {@code value} in {@code enc} and returns the encoded bytes.
 *
 * NOTE(review): this listing looks like a rendered diff. Adjacent near-duplicate lines
 * (the two signatures taking int vs. long, the calls with and without the (int) cast)
 * appear to be the before/after versions of one statement; confirm against the real file.
 *
 * @param runtime used to build the RangeError raised on failure
 * @param enc     encoding used to size and write the bytes
 * @param value   code point to encode; negative values are given length 0 and rejected
 * @return a ByteList of capacity {@code n} whose real size is set to {@code n}
 */
private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, int value) {
private ByteList fromEncodedBytes(Ruby runtime, Encoding enc, long value) {
// n is the byte length reported by the encoding for this code point; 0 marks failure.
int n;
try {
n = value < 0 ? 0 : enc.codeToMbcLength(value);
n = value < 0 ? 0 : enc.codeToMbcLength((int)value);
} catch (EncodingException ee) {
// An EncodingException while sizing is handled the same way as a zero length.
n = 0;
}

if (n <= 0) throw runtime.newRangeError(this.toString() + " out of char range");

ByteList bytes = new ByteList(n);


// ok only becomes true when the write succeeds and preciseLength reports exactly n
// for the bytes just written.
boolean ok = false;
try {
enc.codeToMbc(value, bytes.getUnsafeBytes(), 0);
enc.codeToMbc((int)value, bytes.getUnsafeBytes(), 0);
ok = StringSupport.preciseLength(enc, bytes.unsafeBytes(), 0, n) == n;
} catch (EncodingException e) {
// ok = false, fall through
}

if (!ok) {
throw runtime.newRangeError("invalid codepoint " + String.format("0x%x in ", value) + enc.getCharsetName());
}

// The bytes were written straight into the backing array, so the size is set by hand.
bytes.setRealSize(n);
return bytes;
}
Expand Down
103 changes: 41 additions & 62 deletions core/src/main/java/org/jruby/RubyString.java
Expand Up @@ -40,7 +40,6 @@

import jnr.posix.POSIX;
import org.jcodings.Encoding;
import org.jcodings.EncodingDB;
import org.jcodings.ascii.AsciiTables;
import org.jcodings.exception.EncodingException;
import org.jcodings.specific.ASCIIEncoding;
Expand All @@ -50,7 +49,7 @@
import org.jcodings.specific.UTF32BEEncoding;
import org.jcodings.specific.UTF32LEEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jcodings.util.CaseInsensitiveBytesHash;
import org.jcodings.unicode.UnicodeEncoding;
import org.jcodings.util.IntHash;
import org.joni.Matcher;
import org.joni.Option;
Expand Down Expand Up @@ -259,14 +258,14 @@ private void copyCodeRange(RubyString from) {
/**
 * Returns this string's code range. When the current value is CR_UNKNOWN it is computed
 * with codeRangeScan and stored via setCodeRange before being returned.
 *
 * NOTE(review): the two consecutive codeRangeScan lines appear to be the before/after
 * sides of a rendered diff (value.getEncoding() vs. EncodingUtils.getActualEncoding);
 * confirm against the real file.
 */
public final int scanForCodeRange() {
int cr = getCodeRange();
if (cr == CR_UNKNOWN) {
cr = codeRangeScan(value.getEncoding(), value);
cr = codeRangeScan(EncodingUtils.getActualEncoding(getEncoding(), value), value);
setCodeRange(cr);
}
return cr;
}

/**
 * Delegates to StringSupport.isSingleByteOptimizable for this string and its encoding.
 *
 * NOTE(review): the two return lines appear to be the before/after sides of a rendered
 * diff (value.getEncoding() vs. EncodingUtils.STR_ENC_GET(this)); confirm.
 */
final boolean singleByteOptimizable() {
return StringSupport.isSingleByteOptimizable(this, value.getEncoding());
return StringSupport.isSingleByteOptimizable(this, EncodingUtils.STR_ENC_GET(this));
}

final boolean singleByteOptimizable(Encoding enc) {
Expand Down Expand Up @@ -2036,54 +2035,30 @@ public IRubyObject inspect19() {
}

public static IRubyObject inspect19(Ruby runtime, ByteList byteList) {
ThreadContext context = runtime.getCurrentContext();

Encoding enc = byteList.getEncoding();
byte bytes[] = byteList.getUnsafeBytes();
int p = byteList.getBegin();
int end = p + byteList.getRealSize();
RubyString result = new RubyString(runtime, runtime.getString(), new ByteList(end - p));
Encoding enc = byteList.getEncoding();

Encoding resultEnc = runtime.getDefaultInternalEncoding();
if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;
result.associateEncoding(resultEnc);

boolean isUnicode = StringSupport.isUnicode(enc);
boolean asciiCompat = enc.isAsciiCompatible();

EncodingDB.Entry e = null;
CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = runtime.getEncodingService().getEncodings();
if (enc == encodings.get("UTF-16".getBytes()).getEncoding() && end - p > 1) {
int c0 = bytes[p] & 0xff;
int c1 = bytes[p + 1] & 0xff;

if (c0 == 0xFE && c1 == 0xFF) {
e = encodings.get("UTF-16BE".getBytes());
} else if (c0 == 0xFF && c1 == 0xFE) {
e = encodings.get("UTF-16LE".getBytes());
} else {
e = encodings.get("ASCII-8BIT".getBytes());
isUnicode = false;
}
} else if (enc == encodings.get("UTF-32".getBytes()).getEncoding() && end - p > 3) {
int c0 = bytes[p] & 0xff;
int c1 = bytes[p + 1] & 0xff;
int c2 = bytes[p + 2] & 0xff;
int c3 = bytes[p + 3] & 0xff;

if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
e = encodings.get("UTF-32BE".getBytes());
} else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
e = encodings.get("UTF-32LE".getBytes());
} else {
e = encodings.get("ASCII-8BIT".getBytes());
isUnicode = false;
}
}

if (e != null) enc = e.getEncoding();

if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();
if (!resultEnc.isAsciiCompatible()) resultEnc = USASCIIEncoding.INSTANCE;
result.associateEncoding(resultEnc);
result.cat('"');

int prev = p;
Encoding actEnc = EncodingUtils.getActualEncoding(enc, byteList);
if (actEnc != enc) {
enc = actEnc;
if (isUnicode) isUnicode = enc instanceof UnicodeEncoding;
}

while (p < end) {
int cc = 0;

Expand Down Expand Up @@ -3708,7 +3683,7 @@ private IRubyObject populateCapturesForScan(Ruby runtime, Matcher matcher, int r
@JRubyMethod(name = "scan", reads = BACKREF, writes = BACKREF)
public IRubyObject scan19(ThreadContext context, IRubyObject arg, Block block) {
Ruby runtime = context.runtime;
Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
final Regex pattern, prepared;
final RubyRegexp regexp;
final int tuFlags;
Expand Down Expand Up @@ -3881,11 +3856,11 @@ private boolean end_with_pCommon(IRubyObject arg) {

private static final ByteList SPACE_BYTELIST = new ByteList(ByteList.plain(" "));

/**
 * Shared implementation for the one-argument ljust/rjust/center variants: pads with
 * SPACE_BYTELIST by calling justifyCommon with the width taken from {@code arg0}
 * (via RubyFixnum.num2int) and the justification flag {@code jflag}.
 *
 * NOTE(review): this looks like a rendered diff. The two signatures (with and without
 * ThreadContext) and the repeated argument lines appear to be before/after versions;
 * confirm against the real file.
 */
private IRubyObject justify19(IRubyObject arg0, int jflag) {
Ruby runtime = getRuntime();
private IRubyObject justify19(ThreadContext context, IRubyObject arg0, int jflag) {
Ruby runtime = context.runtime;
RubyString result = justifyCommon(runtime, SPACE_BYTELIST,
1,
true, value.getEncoding(), RubyFixnum.num2int(arg0), jflag);
1,
true, EncodingUtils.STR_ENC_GET(this), RubyFixnum.num2int(arg0), jflag);
// The receiver's code range is copied to the result unless it is CR_BROKEN.
if (getCodeRange() != CR_BROKEN) result.setCodeRange(getCodeRange());
return result;
}
Expand Down Expand Up @@ -3985,7 +3960,7 @@ public IRubyObject ljust(IRubyObject arg0, IRubyObject arg1) {

/**
 * One-argument "ljust": forwards to justify19 with the 'l' flag.
 *
 * NOTE(review): the two return lines appear to be the before/after sides of a rendered
 * diff (the second passes the current ThreadContext); confirm.
 */
@JRubyMethod(name = "ljust")
public IRubyObject ljust19(IRubyObject arg0) {
return justify19(arg0, 'l');
return justify19(getRuntime().getCurrentContext(), arg0, 'l');
}

@JRubyMethod(name = "ljust")
Expand All @@ -4006,7 +3981,7 @@ public IRubyObject rjust(IRubyObject arg0, IRubyObject arg1) {

/**
 * One-argument "rjust": forwards to justify19 with the 'r' flag.
 *
 * NOTE(review): the two return lines appear to be the before/after sides of a rendered
 * diff (the second passes the current ThreadContext); confirm.
 */
@JRubyMethod(name = "rjust")
public IRubyObject rjust19(IRubyObject arg0) {
return justify19(arg0, 'r');
return justify19(getRuntime().getCurrentContext(), arg0, 'r');
}

@JRubyMethod(name = "rjust")
Expand All @@ -4027,7 +4002,7 @@ public IRubyObject center(IRubyObject arg0, IRubyObject arg1) {

/**
 * One-argument "center": forwards to justify19 with the 'c' flag.
 *
 * NOTE(review): the two return lines appear to be the before/after sides of a rendered
 * diff (the second passes the current ThreadContext); confirm.
 */
@JRubyMethod(name = "center")
public IRubyObject center19(IRubyObject arg0) {
return justify19(arg0, 'c');
return justify19(getRuntime().getCurrentContext(), arg0, 'c');
}

@JRubyMethod(name = "center")
Expand Down Expand Up @@ -4296,7 +4271,7 @@ public IRubyObject lstrip_bang19(ThreadContext context) {
return runtime.getNil();
}

Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
int s = value.getBegin();
int end = s + value.getRealSize();
byte[]bytes = value.getUnsafeBytes();
Expand Down Expand Up @@ -4364,8 +4339,9 @@ public IRubyObject rstrip_bang19(ThreadContext context) {
return runtime.getNil();
}

IRubyObject result = singleByteOptimizable(value.getEncoding()) ?
singleByteRStrip19(runtime) : multiByteRStrip19(runtime);
Encoding enc = EncodingUtils.STR_ENC_GET(this);
IRubyObject result = singleByteOptimizable(enc) ?
singleByteRStrip19(runtime) : multiByteRStrip19(runtime, context);

keepCodeRange();
return result;
Expand All @@ -4389,11 +4365,11 @@ private IRubyObject singleByteRStrip19(Ruby runtime) {
}

// In 1.9 we strip any combination of \0 and \s
private IRubyObject multiByteRStrip19(Ruby runtime) {
private IRubyObject multiByteRStrip19(Ruby runtime, ThreadContext context) {
byte[] bytes = value.getUnsafeBytes();
int start = value.getBegin();
int end = start + value.getRealSize();
Encoding enc = value.getEncoding();
Encoding enc = EncodingUtils.STR_ENC_GET(this);
int endp = end;
int prev;
while ((prev = enc.prevCharHead(bytes, start, endp, end)) != -1) {
Expand Down Expand Up @@ -4463,17 +4439,20 @@ public IRubyObject count19(ThreadContext context, IRubyObject arg) {
if (value.getRealSize() == 0) return RubyFixnum.zero(runtime);

RubyString otherStr = arg.convertToString();
ByteList otherBL = otherStr.getByteList();
Encoding enc = checkEncoding(otherStr);

int c;
if (otherStr.value.length() == 1 && enc.isAsciiCompatible() &&
((c = otherStr.value.unsafeBytes()[otherStr.value.getBegin()] & 0xff)) < 0x80 && scanForCodeRange() != CR_BROKEN) {
if (otherBL.length() == 1 && enc.isAsciiCompatible() &&
enc.isReverseMatchAllowed(otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize()) &&
scanForCodeRange() != CR_BROKEN) {
int n = 0;
int[] len_p = {0};
int c = EncodingUtils.encCodepointLength(runtime, otherBL.unsafeBytes(), otherBL.begin(), otherBL.begin() + otherBL.getRealSize(), len_p, enc);

if (value.length() ==0) return RubyFixnum.zero(runtime);
byte[]bytes = value.unsafeBytes();
int p = value.getBegin();
int end = p + value.length();
int n = 0;
while (p < end) {
if ((bytes[p++] & 0xff) == c) n++;
}
Expand Down Expand Up @@ -5357,7 +5336,7 @@ private IRubyObject enumerateCodepoints(ThreadContext context, String name, Bloc
ptrBytes = strByteList.unsafeBytes();
ptr = strByteList.begin();
end = ptr + strByteList.getRealSize();
enc = str.getEncoding();
enc = EncodingUtils.STR_ENC_GET(str);

if (block.isGiven()) {
if (wantarray) {
Expand Down Expand Up @@ -5478,8 +5457,8 @@ public RubySymbol intern19() {
/**
 * Returns, as a Fixnum, the result of codePoint applied to this string's bytes from
 * value.getBegin() to value.getBegin() + value.getRealSize().
 *
 * NOTE(review): the two return statements appear to be the before/after sides of a
 * rendered diff (value.getEncoding() vs. EncodingUtils.STR_ENC_GET(this)); confirm.
 */
@JRubyMethod
public IRubyObject ord(ThreadContext context) {
Ruby runtime = context.runtime;
return RubyFixnum.newFixnum(runtime, codePoint(runtime, value.getEncoding(), value.getUnsafeBytes(), value.getBegin(),
value.getBegin() + value.getRealSize()));
return RubyFixnum.newFixnum(runtime, codePoint(runtime, EncodingUtils.STR_ENC_GET(this), value.getUnsafeBytes(), value.getBegin(),
value.getBegin() + value.getRealSize()));
}

@JRubyMethod
Expand Down Expand Up @@ -5789,7 +5768,7 @@ public IRubyObject strScrub(ThreadContext context, IRubyObject repl, Block block
if (cr == CR_7BIT || cr == CR_VALID)
return context.nil;

enc = getEncoding();
enc = EncodingUtils.STR_ENC_GET(this);
if (!repl.isNil()) {
repl = EncodingUtils.strCompatAndValid(context, repl, enc);
}
Expand Down
8 changes: 7 additions & 1 deletion core/src/main/java/org/jruby/parser/ParserSupport.java
Expand Up @@ -46,6 +46,7 @@
import org.jruby.ast.types.INameNode;
import org.jruby.common.IRubyWarnings;
import org.jruby.common.IRubyWarnings.ID;
import org.jruby.exceptions.RaiseException;
import org.jruby.lexer.yacc.ISourcePosition;
import org.jruby.lexer.yacc.ISourcePositionHolder;
import org.jruby.lexer.yacc.RubyLexer;
Expand Down Expand Up @@ -1222,8 +1223,13 @@ public Node arg_append(Node node1, Node node2) {
// MRI: reg_fragment_check
/**
 * Sets the regexp encoding for {@code end} from {@code value}, then runs
 * RubyRegexp.preprocessCheck on {@code value}.
 *
 * NOTE(review): this looks like a rendered diff. The bare preprocessCheck call and the
 * try/catch-wrapped call appear to be the before/after versions of the same statement;
 * confirm against the real file.
 */
public void regexpFragmentCheck(RegexpNode end, ByteList value) {
setRegexpEncoding(end, value);
RubyRegexp.preprocessCheck(configuration.getRuntime(), value);
try {
RubyRegexp.preprocessCheck(configuration.getRuntime(), value);
} catch (RaiseException re) {
// A RaiseException from the check is reported through compile_error with its message.
compile_error(re.getMessage());
}
} // 1.9 mode overrides to do extra checking...

private List<Integer> allocateNamedLocals(RegexpNode regexpNode) {
RubyRegexp pattern = RubyRegexp.newRegexp(configuration.getRuntime(), regexpNode.getValue(), regexpNode.getOptions());
pattern.setLiteral();
Expand Down
49 changes: 45 additions & 4 deletions core/src/main/java/org/jruby/util/io/EncodingUtils.java
Expand Up @@ -4,7 +4,6 @@
import org.jcodings.EncodingDB;
import org.jcodings.Ptr;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF16BEEncoding;
import org.jcodings.specific.UTF16LEEncoding;
import org.jcodings.specific.UTF32BEEncoding;
Expand All @@ -16,6 +15,7 @@
import org.jcodings.transcode.Transcoder;
import org.jcodings.transcode.TranscoderDB;
import org.jcodings.transcode.Transcoding;
import org.jcodings.util.CaseInsensitiveBytesHash;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyBasicObject;
Expand All @@ -24,7 +24,6 @@
import org.jruby.RubyFixnum;
import org.jruby.RubyHash;
import org.jruby.RubyIO;
import org.jruby.RubyInteger;
import org.jruby.RubyMethod;
import org.jruby.RubyNumeric;
import org.jruby.RubyProc;
Expand Down Expand Up @@ -1810,17 +1809,59 @@ public static IRubyObject strCompatAndValid(ThreadContext context, IRubyObject _
throw context.runtime.newArgumentError("replacement must be valid byte sequence '" + str + "'");
}
else if (cr == StringSupport.CR_7BIT) {
Encoding e = str.getEncoding();
Encoding e = STR_ENC_GET(str);
if (!enc.isAsciiCompatible()) {
throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
}
}
else { /* ENC_CODERANGE_VALID */
Encoding e = str.getEncoding();
Encoding e = STR_ENC_GET(str);
if (enc != e) {
throw context.runtime.newEncodingCompatibilityError("incompatible character encodings: " + enc + " and " + e);
}
}
return str;
}

/**
 * Returns the encoding to use for {@code str}: its own encoding passed through
 * {@link #getActualEncoding(Encoding, ByteList)} together with the bytes themselves.
 * <p>
 * MRI: get_encoding
 *
 * @param str the byte list whose encoding is resolved
 * @return whatever {@code getActualEncoding} yields for {@code str}
 */
public static Encoding getEncoding(ByteList str) {
    final Encoding declared = str.getEncoding();
    return getActualEncoding(declared, str);
}

/**
 * Resolves the encoding that should be used to read {@code byteList} when its declared
 * encoding {@code enc} is the "UTF-16" or "UTF-32" entry of the encoding database.
 * <p>
 * For those two encodings the leading bytes are checked for a byte order mark:
 * FE FF / FF FE select UTF-16BE / UTF-16LE, and 00 00 FE FF / FF FE 00 00 select
 * UTF-32BE / UTF-32LE. When enough bytes are present but no mark matches,
 * {@link ASCIIEncoding#INSTANCE} is returned. In every other case (a different
 * encoding, or too few bytes to hold a mark) {@code enc} is returned unchanged.
 * <p>
 * MRI: get_actual_encoding
 *
 * @param enc      the declared encoding
 * @param byteList the bytes whose leading byte order mark, if any, is inspected
 * @return the mark-selected encoding, {@code ASCIIEncoding.INSTANCE}, or {@code enc}
 */
public static Encoding getActualEncoding(Encoding enc, ByteList byteList) {
    final byte[] bytes = byteList.unsafeBytes();
    final int p = byteList.begin();
    final int len = byteList.getRealSize();

    // Encoding names are plain ASCII, so the lookup keys are encoded with an explicit
    // charset. A bare getBytes() uses the platform default; on a default that is not
    // ASCII-compatible the key would not match, get() would return null, and the
    // chained getEncoding() call would throw a NullPointerException.
    final java.nio.charset.Charset ascii = java.nio.charset.StandardCharsets.US_ASCII;
    final CaseInsensitiveBytesHash<EncodingDB.Entry> encodings = EncodingDB.getEncodings();

    // The cheap length test runs first so short strings skip the hash lookup entirely.
    if (len >= 2 && enc == encodings.get("UTF-16".getBytes(ascii)).getEncoding()) {
        final int c0 = bytes[p] & 0xff;
        final int c1 = bytes[p + 1] & 0xff;

        if (c0 == 0xFE && c1 == 0xFF) {
            return UTF16BEEncoding.INSTANCE;
        }
        if (c0 == 0xFF && c1 == 0xFE) {
            return UTF16LEEncoding.INSTANCE;
        }
        return ASCIIEncoding.INSTANCE;
    }

    if (len >= 4 && enc == encodings.get("UTF-32".getBytes(ascii)).getEncoding()) {
        final int c0 = bytes[p] & 0xff;
        final int c1 = bytes[p + 1] & 0xff;
        final int c2 = bytes[p + 2] & 0xff;
        final int c3 = bytes[p + 3] & 0xff;

        if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
            return UTF32BEEncoding.INSTANCE;
        }
        if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
            return UTF32LEEncoding.INSTANCE;
        }
        return ASCIIEncoding.INSTANCE;
    }

    return enc;
}

/**
 * Convenience wrapper: resolves the encoding of a RubyString by handing its byte list
 * to {@link #getEncoding(ByteList)}.
 *
 * @param str the string whose encoding is resolved
 * @return the result of {@code getEncoding} for the string's byte list
 */
public static Encoding STR_ENC_GET(RubyString str) {
    final ByteList contents = str.getByteList();
    return getEncoding(contents);
}
}

0 comments on commit 436dd30

Please sign in to comment.