Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Truffle] Implementing String#scrub
Browse files Browse the repository at this point in the history
Brandon Fish committed Dec 26, 2016
1 parent 9a1b231 commit a2836e9
Showing 4 changed files with 358 additions and 62 deletions.
2 changes: 0 additions & 2 deletions spec/truffle/tags/core/string/encode_tags.txt
Original file line number Diff line number Diff line change
@@ -2,5 +2,3 @@ fails(inherited - rubinius):String#encode given the xml: :text option replaces u
fails(inherited - rubinius):String#encode given the xml: :attr option replaces undefined characters with their upper-case hexadecimal numeric character references
fails(inherited - rubinius):String#encode! given the xml: :text option replaces undefined characters with their upper-case hexadecimal numeric character references
fails(inherited - rubinius):String#encode! given the xml: :attr option replaces undefined characters with their upper-case hexadecimal numeric character references
fails:String#encode when passed options replaces invalid characters when replacing Emacs-Mule encoded strings
fails:String#encode! when passed options replaces invalid characters when replacing Emacs-Mule encoded strings
Original file line number Diff line number Diff line change
@@ -37,6 +37,7 @@
import org.jruby.truffle.core.cast.ToEncodingNode;
import org.jruby.truffle.core.rope.CodeRange;
import org.jruby.truffle.core.rope.Rope;
import org.jruby.truffle.core.string.EncodingUtils;
import org.jruby.truffle.core.string.StringOperations;
import org.jruby.truffle.language.RubyGuards;
import org.jruby.truffle.language.RubyNode;
@@ -458,6 +459,16 @@ public DynamicObject getEncoding(int index) {

}

@Primitive(name = "encoding_minlength", needsSelf = false)
public static abstract class EncodingMinLengthPrimitiveNode extends PrimitiveArrayArgumentsNode {

    /**
     * Primitive backing {@code Truffle.invoke_primitive(:encoding_minlength, enc)}.
     *
     * @param encoding a Ruby Encoding object; its underlying encoding is read via
     *                 {@code Layouts.ENCODING.getEncoding}
     * @return the value reported by {@code EncodingUtils.encMbminlen} for that encoding
     *         (presumably the minimum number of bytes per character; confirm against
     *         {@code EncodingUtils})
     */
    @Specialization
    public int minLength(DynamicObject encoding) {
        final int minimumLength = EncodingUtils.encMbminlen(Layouts.ENCODING.getEncoding(encoding));
        return minimumLength;
    }

}

@NodeChildren({ @NodeChild("first"), @NodeChild("second") })
public static abstract class CheckEncodingNode extends RubyNode {

327 changes: 326 additions & 1 deletion truffle/src/main/java/org/jruby/truffle/core/string/StringNodes.java
Original file line number Diff line number Diff line change
@@ -23,7 +23,7 @@
* Copyright (C) 2006 Ola Bini <ola@ologix.com>
* Copyright (C) 2007 Nick Sieger <nicksieger@gmail.com>
*
* Some of the code in this class is transposed from org.jruby.util.ByteList,
* Some of the code in this class is transposed from org.jruby.util.ByteList and org.jruby.RubyString,
* licensed under the same EPL1.0/GPL 2.0/LGPL 2.1 used throughout.
*
* Copyright (C) 2007-2010 JRuby Community
@@ -83,6 +83,10 @@
import org.jcodings.exception.EncodingException;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF16BEEncoding;
import org.jcodings.specific.UTF16LEEncoding;
import org.jcodings.specific.UTF32BEEncoding;
import org.jcodings.specific.UTF32LEEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.truffle.Layouts;
import org.jruby.truffle.RubyContext;
@@ -148,9 +152,15 @@
import java.util.Arrays;
import java.util.List;

import static org.jruby.truffle.core.rope.CodeRange.CR_7BIT;
import static org.jruby.truffle.core.rope.CodeRange.CR_VALID;
import static org.jruby.truffle.core.rope.RopeConstants.EMPTY_ASCII_8BIT_ROPE;
import static org.jruby.truffle.core.string.StringOperations.encoding;
import static org.jruby.truffle.core.string.StringOperations.rope;
import static org.jruby.truffle.core.string.StringSupport.MBCLEN_CHARFOUND_LEN;
import static org.jruby.truffle.core.string.StringSupport.MBCLEN_CHARFOUND_P;
import static org.jruby.truffle.core.string.StringSupport.MBCLEN_INVALID_P;
import static org.jruby.truffle.core.string.StringSupport.MBCLEN_NEEDMORE_P;

@CoreClass("String")
public abstract class StringNodes {
@@ -1449,6 +1459,321 @@ private int prevCharHead(Encoding enc, byte[]bytes, int p, int s, int end) {
}
}

// Implements the "scrub_internal" core method used by String#scrub / String#scrub!.
//
// The node scans the receiver's bytes and builds a new rope in which every invalid
// byte sequence is replaced. It returns nil (instead of a string) when:
//   * the rope's code range is already CR_7BIT or CR_VALID,
//   * the encoding is a dummy encoding, or
//   * the scan reaches the end of the bytes without having found anything to replace.
//
// Replacement source, ASCII-compatible encodings (in priority order):
//   block (yielded the offending bytes) > explicit repl string > U+FFFD for UTF-8 > "?".
// Replacement source, ASCII-incompatible encodings:
//   explicit repl string > U+FFFD for UTF-16/32 BE/LE > "?".
//
// Some of this code is transposed from org.jruby.RubyString (see the file header).
@CoreMethod(names = "scrub_internal", optional = 1, needsBlock = true)
public abstract static class ScrubNode extends YieldingCoreMethodNode {

// Default replacement bytes: U+FFFD encoded in each Unicode encoding, and '?' otherwise.
private static final byte[] SCRUB_REPL_UTF8 = new byte[]{(byte)0xEF, (byte)0xBF, (byte)0xBD};
private static final byte[] SCRUB_REPL_ASCII = new byte[]{(byte)'?'};
private static final byte[] SCRUB_REPL_UTF16BE = new byte[]{(byte)0xFF, (byte)0xFD};
private static final byte[] SCRUB_REPL_UTF16LE = new byte[]{(byte)0xFD, (byte)0xFF};
private static final byte[] SCRUB_REPL_UTF32BE = new byte[]{(byte)0x00, (byte)0x00, (byte)0xFF, (byte)0xFD};
private static final byte[] SCRUB_REPL_UTF32LE = new byte[]{(byte)0xFD, (byte)0xFF, (byte)0x00, (byte)0x00};


@Child private IsTaintedNode isTaintedNode;
@Child private TaintNode taintNode;
// Dispatches to the Ruby-level "str_compat_and_valid" helper to validate replacements.
@Child private CallDispatchHeadNode strCompatAndValidNode;
@Child private RopeNodes.MakeConcatNode makeConcatNode;

public ScrubNode(RubyContext context, SourceSection sourceSection) {
super(context, sourceSection);
isTaintedNode = IsTaintedNode.create();
taintNode = TaintNode.create();
strCompatAndValidNode = DispatchHeadNodeFactory.createMethodCall(context);
makeConcatNode = RopeNodesFactory.MakeConcatNodeGen.create(null, null, null);
}


// Entry point: normalises a missing replacement argument / block to nil and delegates.
@Specialization
public DynamicObject scrub(VirtualFrame frame, DynamicObject string, Object repl, Object block) {
if(repl == NotProvided.INSTANCE){
repl = nil();
}
if(block == NotProvided.INSTANCE){
block = nil();
}
return scrubDefault(frame, string, repl, block);
}

// Performs the scrub. `repl` and `block` are either nil or the user-supplied
// replacement string / block. Returns the scrubbed string, or nil when no new
// string was built (see the class comment for the cases).
private DynamicObject scrubDefault(VirtualFrame frame, DynamicObject string, Object repl, Object block) {
final DynamicObject rubyEncoding = getContext().getEncodingManager().getRubyEncoding(StringOperations.encoding(string));
final Rope rope = rope(string);
final ByteList value = RopeOperations.getByteListReadOnly(rope);

// NOTE(review): `cr` is updated throughout but never applied to the result; the
// call that would have used it is commented out at the end of this method.
CodeRange cr = rope.getCodeRange();
Encoding enc;
Encoding encidx;
// Accumulates the output; stays null until the first invalid sequence is found.
Rope buf = null;
// Replacement bytes as (array, offset, length); repBytes == null means "yield to the block".
byte[] repBytes;
int rep;
int replen;
// Becomes true if any replacement string (argument or block result) is tainted.
boolean tainted = false;

// Nothing to do for strings already known to be valid.
if (cr == CR_7BIT || cr == CR_VALID)
return nil();

enc = rope.getEncoding();
if (repl != nil()) {
// Validate/convert the replacement via the Ruby helper; it may raise.
repl = strCompatAndValidNode.call(frame, string, "str_compat_and_valid", repl, rubyEncoding);
tainted |= isTaintedNode.executeIsTainted(repl);
}

if (enc.isDummy()) {
return nil();
}
encidx = enc;

if (enc.isAsciiCompatible()) {
// p: scan position, e: end of data, p1: start of the not-yet-copied valid run.
byte[] pBytes = value.unsafeBytes();
int p = value.begin();
int e = p + value.getRealSize();
int p1 = p;
boolean rep7bit_p;
// Choose the replacement source; a block takes precedence over `repl`.
if (block != nil()) {
repBytes = null;
rep = 0;
replen = 0;
rep7bit_p = false;
}
else if (repl != nil()) {
final Rope replRope = rope((DynamicObject) repl);
final ByteList replValue = RopeOperations.getByteListReadOnly(replRope);
repBytes = replValue.unsafeBytes();
rep = replValue.begin();
replen = replValue.getRealSize();
rep7bit_p = (replRope.getCodeRange() == CodeRange.CR_7BIT);
}
else if (encidx == UTF8Encoding.INSTANCE) {
repBytes = SCRUB_REPL_UTF8;
rep = 0;
replen = repBytes.length;
rep7bit_p = false;
}
else {
repBytes = SCRUB_REPL_ASCII;
rep = 0;
replen = repBytes.length;
rep7bit_p = false;
}
cr = CR_7BIT;

// Skip the leading ASCII run; -1 means no non-ASCII byte was found.
p = StringSupport.searchNonAscii(pBytes, p, e);
if (p == -1) {
p = e;
}
while (p < e) {
int ret = enc.length(pBytes, p, e);
if (MBCLEN_NEEDMORE_P(ret)) {
// Truncated character at the end of the data: handled after the loop.
break;
}
else if (MBCLEN_CHARFOUND_P(ret)) {
cr = CR_VALID;
p += MBCLEN_CHARFOUND_LEN(ret);
}
else if (MBCLEN_INVALID_P(ret)) {
/*
* p1~p: valid ascii/multibyte chars
* p ~e: invalid bytes + unknown bytes
*/
int clen = enc.maxLength();
if(buf == null){
buf = RopeConstants.EMPTY_ASCII_8BIT_ROPE;
}
// Flush the valid run that precedes the invalid bytes.
if (p > p1) {
buf = makeConcatNode.executeMake(buf, RopeOperations.ropeFromByteList(new ByteList(pBytes, p1, p - p1)), enc);
}

// Work out how many bytes (clen) to treat as one invalid sequence.
if (e - p < clen){
clen = e - p;
}
if (clen <= 2) {
clen = 1;
}
else {
int q = p;
clen--;
// Shrink clen until the prefix q..q+clen is reported as an incomplete character.
for (; clen > 1; clen--) {
ret = enc.length(pBytes, q, q + clen);
if (MBCLEN_NEEDMORE_P(ret)) break;
if (MBCLEN_INVALID_P(ret)) continue;
}
}
if (repBytes != null) {
buf = makeConcatNode.executeMake(buf, RopeOperations.ropeFromByteList(new ByteList(repBytes, rep, replen)), enc);
if (!rep7bit_p){
cr = CR_VALID;
}
}
else {
// Block form: yield the invalid bytes, validate the result, then append it.
repl = yield(frame, (DynamicObject)block, createString(new ByteList(pBytes, p, clen, enc, true)));
repl = strCompatAndValidNode.call(frame, string, "str_compat_and_valid", repl, rubyEncoding);
tainted |= isTaintedNode.executeIsTainted(repl);
buf = makeConcatNode.executeMake(buf, rope((DynamicObject)repl), enc);
if (rope((DynamicObject) repl).getCodeRange() == CR_VALID){
cr = CR_VALID;
}
}
// Resume after the replaced bytes and skip the following ASCII run.
p += clen;
p1 = p;
p = StringSupport.searchNonAscii(pBytes, p, e);
if (p == -1) {
p = e;
break;
}
}
}
if (buf == null) {
if (p == e) {
// Reached the end without replacing anything: no new string is needed.
// setCodeRange(cr);
return nil();
}
buf = RopeConstants.EMPTY_ASCII_8BIT_ROPE;
}
// Flush the trailing valid run.
if (p1 < p) {
buf = makeConcatNode.executeMake(buf, RopeOperations.ropeFromByteList(new ByteList(pBytes, p1, p - p1)), enc);
}
// p < e here means the loop stopped on an incomplete trailing character:
// replace everything from p to the end with a single replacement.
if (p < e) {
if (repBytes != null) {
buf = makeConcatNode.executeMake(buf, RopeOperations.ropeFromByteList(new ByteList(repBytes, rep, replen)), enc);
if (!rep7bit_p){
cr = CR_VALID;
}
}
else {
repl = yield(frame, (DynamicObject)block, createString(new ByteList(pBytes, p, e - p, enc, true)));
// repl = block.yieldSpecific(context, RubyString.newString(runtime, pBytes, p, e - p, enc));
repl = strCompatAndValidNode.call(frame, string, "str_compat_and_valid", repl, rubyEncoding);//EncodingUtils.strCompatAndValid(getContext(), repl, enc);
tainted |= isTaintedNode.executeIsTainted(repl);
buf = makeConcatNode.executeMake(buf, rope((DynamicObject)repl), enc);
if (rope((DynamicObject) repl).getCodeRange() == CR_VALID) {
cr = CR_VALID;
}
}
}
}
else {
/* ASCII incompatible */
byte[] pBytes = value.unsafeBytes();
int p = value.begin();
int e = p + value.getRealSize();
int p1 = p;
int mbminlen = enc.minLength();
// NOTE(review): unlike the ASCII-compatible path, `block` is not consulted here,
// so repBytes is always non-null in this branch and the yield branches below
// are never taken. Confirm whether ignoring the block is intended.
if (repl != nil()) {
final Rope replRope = rope((DynamicObject) repl);
final ByteList replValue = RopeOperations.getByteListReadOnly(replRope);
repBytes = replValue.unsafeBytes();
rep = replValue.begin();
replen = replValue.getRealSize();
}
else if (encidx == UTF16BEEncoding.INSTANCE) {
repBytes = SCRUB_REPL_UTF16BE;
rep = 0;
replen = repBytes.length;
}
else if (encidx == UTF16LEEncoding.INSTANCE) {
repBytes = SCRUB_REPL_UTF16LE;
rep = 0;
replen = repBytes.length;
}
else if (encidx == UTF32BEEncoding.INSTANCE) {
repBytes = SCRUB_REPL_UTF32BE;
rep = 0;
replen = repBytes.length;
}
else if (encidx == UTF32LEEncoding.INSTANCE) {
repBytes = SCRUB_REPL_UTF32LE;
rep = 0;
replen = repBytes.length;
}
else {
repBytes = SCRUB_REPL_ASCII;
rep = 0;
replen = repBytes.length;
}

while (p < e) {
int ret = StringSupport.preciseLength(enc, pBytes, p, e);
if (MBCLEN_NEEDMORE_P(ret)) {
// Truncated character at the end of the data: handled after the loop.
break;
}
else if (MBCLEN_CHARFOUND_P(ret)) {
p += MBCLEN_CHARFOUND_LEN(ret);
}
else if (MBCLEN_INVALID_P(ret)) {
int q = p;
int clen = enc.maxLength();
if(buf == null){
buf = RopeConstants.EMPTY_ASCII_8BIT_ROPE;
};
// Flush the valid run that precedes the invalid bytes.
if(p > p1){
buf = makeConcatNode.executeMake(buf, RopeOperations.ropeFromByteList(new ByteList(pBytes, p1, p - p1)), enc);
}


// Same length search as the ASCII-compatible path, but stepping in units of mbminlen.
if (e - p < clen) clen = e - p;
if (clen <= mbminlen * 2) {
clen = mbminlen;
}
else {
clen -= mbminlen;
for (; clen > mbminlen; clen-=mbminlen) {
ret = enc.length(pBytes, q, q + clen);
if (MBCLEN_NEEDMORE_P(ret)) break;
if (MBCLEN_INVALID_P(ret)) continue;
}
}
if (repBytes != null) {
buf = makeConcatNode.executeMake(buf, RopeOperations.ropeFromByteList(new ByteList(repBytes, rep, replen)), enc);
}
else {
// NOTE(review): this yields e - p bytes (the rest of the string) while the
// ASCII-compatible path yields clen bytes; currently unreachable, see above.
repl = yield(frame, (DynamicObject)block, createString(new ByteList(pBytes, p, e - p, enc, true)));
repl = strCompatAndValidNode.call(frame, string, "str_compat_and_valid", repl, rubyEncoding);//EncodingUtils.strCompatAndValid(getContext(), repl, enc);
tainted |= isTaintedNode.executeIsTainted(repl);
buf = makeConcatNode.executeMake(buf, rope((DynamicObject)repl), enc);
}
p += clen;
p1 = p;
}
}
if (buf == null) {
if (p == e) {
// Reached the end without replacing anything: no new string is needed.
// setCodeRange(CR_VALID);
return nil();
}
buf = RopeConstants.EMPTY_ASCII_8BIT_ROPE;
}
// Flush the trailing valid run.
if (p1 < p) {
buf = makeConcatNode.executeMake(buf, RopeOperations.ropeFromByteList(new ByteList(pBytes, p1, p - p1)), enc);
}
// Incomplete trailing character: replace everything from p to the end once.
if (p < e) {
if (repBytes != null) {
buf = makeConcatNode.executeMake(buf, RopeOperations.ropeFromByteList(new ByteList(repBytes, rep, replen)), enc);
}
else {
repl = yield(frame, (DynamicObject)block, createString(new ByteList(pBytes, p, e - p, enc, true)));
repl = strCompatAndValidNode.call(frame, string, "str_compat_and_valid", repl, rubyEncoding);
tainted |= isTaintedNode.executeIsTainted(repl);
buf = makeConcatNode.executeMake(buf, rope((DynamicObject)repl), enc);
}
}
cr = CR_VALID;
}

final boolean isTaint = isTaintedNode.executeIsTainted(string);

final DynamicObject resultString = createString(buf);

// The result is tainted if the receiver or any replacement used was tainted.
if(tainted | isTaint){
taintNode.executeTaint(resultString);
}

// ((RubyString)buf).setEncodingAndCodeRange(enc, cr);

return resultString;
}

}

@CoreMethod(names = "swapcase!", raiseIfFrozenSelf = true)
@ImportStatic(StringGuards.class)
public abstract static class SwapcaseBangNode extends CoreMethodArrayArgumentsNode {
80 changes: 21 additions & 59 deletions truffle/src/main/ruby/core/string.rb
Original file line number Diff line number Diff line change
@@ -536,6 +536,10 @@ def encode!(to=undefined, from=undefined, options=undefined)

# TODO: replace this hack with transcoders
if options.kind_of? Hash
case invalid = options[:invalid]
when :replace
self.scrub!
end
case xml = options[:xml]
when :text
gsub!(/[&><]/, '&' => '&amp;', '>' => '&gt;', '<' => '&lt;')
@@ -1195,72 +1199,30 @@ def match(pattern, pos=0)
result
end

# Removes invalid byte sequences from a String, available since Ruby 2.1.
def scrub(replace = nil)
output = ''
input = dup

# The default replacement character is the "Unicode replacement" character.
# (U+FFFD).
if !replace and !block_given?
replace = "\xEF\xBF\xBD".force_encoding("UTF-8")
.encode(self.encoding, :undef => :replace, :replace => '?')
end

if replace
unless replace.is_a?(String)
raise(
TypeError,
"no implicit conversion of #{replace.class} into String"
)
end

unless replace.valid_encoding?
raise(
ArgumentError,
"replacement must be a valid byte sequence '#{replace.inspect}'"
)
end

replace = replace.force_encoding(Encoding::BINARY)
end

# MRI appears to just return a copy of self when the input encoding is
# BINARY/ASCII_8BIT.
if input.encoding == Encoding::BINARY
return input
end

converter = Encoding::Converter.new(input.encoding, Encoding::BINARY)

while input.length > 0
result = converter.primitive_convert(input, output, output.length)

if result == :finished
break
elsif result == :undefined_conversion
output << converter.primitive_errinfo[3]
else
# Blocks can return strings in any encoding so we'll make sure it's the
# same as our buffer for the mean time.
if block_given?
block_output = yield(converter.primitive_errinfo[3])

output << block_output.force_encoding(output.encoding)
else
output << replace
end
end
end

return output.force_encoding(encoding)
# Returns a copy of the receiver with invalid byte sequences replaced.
#
# Delegates to scrub_internal, which returns nil when it did not build a
# new string; in that case a plain duplicate of the receiver is returned.
#
# replace - optional replacement String, forwarded to scrub_internal.
# block   - optional block, forwarded to scrub_internal.
def scrub(replace = nil, &block)
  scrubbed = scrub_internal(replace, &block)
  scrubbed.nil? ? dup : scrubbed
end

# In-place variant of #scrub: replaces invalid byte sequences in the
# receiver and returns the receiver.
#
# The receiver is only modified when scrub_internal actually produced a
# scrubbed string (it returns nil otherwise). This avoids a needless
# dup + replace round trip on strings that need no scrubbing, and mirrors
# MRI's str_scrub_bang, which only replaces the contents when a new string
# was built.
#
# replace - optional replacement String, forwarded to scrub_internal.
# block   - optional block, forwarded to scrub_internal.
def scrub!(replace = nil, &block)
  scrubbed = scrub_internal(replace, &block)
  # Explicit receiver: the parameter named `replace` shadows the bare
  # identifier, so spell out that String#replace is the method being called.
  self.replace(scrubbed) unless scrubbed.nil?
  self
end

# Validates a scrub replacement string against the encoding of the string
# being scrubbed and returns it (after StringValue coercion).
#
# str - the replacement object; coerced with StringValue.
# enc - the Encoding of the string being scrubbed.
#
# Raises ArgumentError if the replacement is not validly encoded.
# Raises Encoding::CompatibilityError if the replacement cannot be spliced
# into a string of encoding +enc+:
#   * an ASCII-only replacement is acceptable only when +enc+ has a minimum
#     character length of 1 byte;
#   * any other replacement must have exactly the encoding +enc+.
def str_compat_and_valid(str, enc)
  res = StringValue(str)
  unless res.valid_encoding?
    raise ArgumentError, "replacement must be valid byte sequence"
  end

  # The decision must depend on the replacement's *content* being 7-bit
  # (as in MRI's str_compatible_and_valid), not merely on its encoding being
  # ASCII-compatible: otherwise e.g. a non-ASCII UTF-8 replacement would be
  # accepted for an ISO-8859-1 string and its raw bytes spliced in.
  incompatible =
    if res.ascii_only?
      Truffle.invoke_primitive(:encoding_minlength, enc) != 1
    else
      enc != res.encoding
    end
  raise Encoding::CompatibilityError, "incompatible character encodings" if incompatible

  res
end

def []=(index, count_or_replacement, replacement=undefined)
Truffle.check_frozen

0 comments on commit a2836e9

Please sign in to comment.