Skip to content

Commit

Permalink
Propagate original bytes (ByteList) from parser to symbols.
Browse files Browse the repository at this point in the history
The first fix here, which fixes #4564, stopped the Symbol operand
from modifying an existing symbol with the same bytes to have a
new Encoding. This was broken because sometimes the encoding it
chose, along some paths, was the wrong one, and it also modified
a shared ByteList causing sym.to_s strings elsewhere to suddenly
change encoding.

Fixing that led to another set of issues that required fixing.

Our Symbol logic in the AST and IR was trying to shove the
original parsed bytes through as a java.lang.String plus encoding.
Unfortunately the parser was providing that string as ISO-8859-1
bytes and then attempting to encode the string using the provided
encoding, resulting in bogus multibyte characters getting into IR.
The fix for #4564 removed broken logic in IR that forced an
existing symbol to have a new encoding, so the broken bytes got
into the table and stayed there.

With this change, Symbols never change encoding, *most* paths to
create them provide the original ByteList, and the remaining
java.lang.String paths encode to bytes properly.

Fixes #4564 (and probably other cases).
headius committed Apr 25, 2017

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent fd0a001 commit 01cd84d
Showing 9 changed files with 62 additions and 79 deletions.
40 changes: 18 additions & 22 deletions core/src/main/java/org/jruby/RubySymbol.java
Original file line number Diff line number Diff line change
@@ -197,44 +197,40 @@ public static RubySymbol newSymbol(Ruby runtime, IRubyObject name) {
} else if (name instanceof RubyString) {
return runtime.getSymbolTable().getSymbol(((RubyString) name).getByteList(), false);
} else {
return newSymbol(runtime, name.asJavaString());
return newSymbol(runtime, name.asString().getByteList());
}
}

public static RubySymbol newHardSymbol(Ruby runtime, IRubyObject name) {
if (name instanceof RubySymbol || name instanceof RubyString) {
return runtime.getSymbolTable().getSymbol(name.asJavaString(), true);
if (name instanceof RubySymbol) {
return runtime.getSymbolTable().getSymbol(((RubySymbol) name).getBytes(), true);
} else if (name instanceof RubyString) {
return runtime.getSymbolTable().getSymbol(((RubyString) name).getByteList(), true);
}

return newSymbol(runtime, name.asJavaString());
return newSymbol(runtime, name.asString().getByteList());
}

public static RubySymbol newSymbol(Ruby runtime, String name) {
return runtime.getSymbolTable().getSymbol(name, false);
}

public static RubySymbol newSymbol(Ruby runtime, ByteList bytes) {
return runtime.getSymbolTable().getSymbol(bytes, false);
}

public static RubySymbol newHardSymbol(Ruby runtime, String name) {
return runtime.getSymbolTable().getSymbol(name, true);
}

// FIXME: same bytesequences will fight over encoding of the symbol once cached. I think largely
// this will only happen in some ISO_8859_?? encodings making symbols at the same time so it should
// be pretty rare.
public static RubySymbol newSymbol(Ruby runtime, String name, Encoding encoding) {
RubySymbol newSymbol = newSymbol(runtime, name);

newSymbol.associateEncoding(encoding);
RubySymbol newSymbol = runtime.getSymbolTable().getSymbol(RubyString.encodeBytelist(name, encoding));

return newSymbol;
}

// FIXME: same bytesequences will fight over encoding of the symbol once cached. I think largely
// this will only happen in some ISO_8859_?? encodings making symbols at the same time so it should
// be pretty rare.
public static RubySymbol newHardSymbol(Ruby runtime, String name, Encoding encoding) {
RubySymbol newSymbol = newHardSymbol(runtime, name);

newSymbol.associateEncoding(encoding);
RubySymbol newSymbol = runtime.getSymbolTable().getSymbol(RubyString.encodeBytelist(name, encoding));

return newSymbol;
}
@@ -386,7 +382,7 @@ private RubyString rubyStringFromString(Ruby runtime) {
@JRubyMethod(name = {"succ", "next"})
public IRubyObject succ(ThreadContext context) {
Ruby runtime = context.runtime;
return newSymbol(runtime, newShared(runtime).succ(context).toString());
return newSymbol(runtime, newShared(runtime).succ(context).asString().getByteList());
}

@JRubyMethod(name = "<=>")
@@ -436,28 +432,28 @@ public IRubyObject empty_p(ThreadContext context) {
public IRubyObject upcase(ThreadContext context) {
Ruby runtime = context.runtime;

return newSymbol(runtime, rubyStringFromString(runtime).upcase19(context).toString());
return newSymbol(runtime, newShared(runtime).upcase19(context).getByteList());
}

@JRubyMethod
public IRubyObject downcase(ThreadContext context) {
Ruby runtime = context.runtime;

return newSymbol(runtime, rubyStringFromString(runtime).downcase19(context).toString());
return newSymbol(runtime, newShared(runtime).downcase19(context).getByteList());
}

@JRubyMethod
public IRubyObject capitalize(ThreadContext context) {
Ruby runtime = context.runtime;

return newSymbol(runtime, rubyStringFromString(runtime).capitalize19(context).toString());
return newSymbol(runtime, ((RubyString) newShared(runtime).capitalize19(context)).getByteList());
}

@JRubyMethod
public IRubyObject swapcase(ThreadContext context) {
Ruby runtime = context.runtime;

return newSymbol(runtime, rubyStringFromString(runtime).swapcase19(context).toString());
return newSymbol(runtime, newShared(runtime).swapcase19(context).getByteList());
}

@JRubyMethod
@@ -614,7 +610,7 @@ public static IRubyObject all_symbols(IRubyObject recv) {
}

public static RubySymbol unmarshalFrom(UnmarshalStream input) throws java.io.IOException {
RubySymbol result = newSymbol(input.getRuntime(), RubyString.byteListToString(input.unmarshalString()));
RubySymbol result = newSymbol(input.getRuntime(), input.unmarshalString());

input.registerLinkTarget(result);

35 changes: 11 additions & 24 deletions core/src/main/java/org/jruby/ast/SymbolNode.java
Original file line number Diff line number Diff line change
@@ -35,52 +35,35 @@
import java.util.List;

import org.jcodings.Encoding;
import org.jcodings.specific.USASCIIEncoding;

import org.jruby.ast.types.ILiteralNode;
import org.jruby.ast.types.INameNode;
import org.jruby.ast.visitor.NodeVisitor;
import org.jruby.lexer.yacc.ISourcePosition;
import org.jruby.util.ByteList;
import org.jruby.util.StringSupport;

/**
* Represents a symbol (:symbol_name).
*/
public class SymbolNode extends Node implements ILiteralNode, INameNode, SideEffectFree {
private final String name;
private final Encoding encoding;
private final ByteList bytes;

// Interned ident path (e.g. [':', ident]).
public SymbolNode(ISourcePosition position, String name, Encoding encoding, int cr) {
super(position, false);
this.name = name; // Assumed all names are already intern'd by lexer.

if (encoding == USASCIIEncoding.INSTANCE || cr == StringSupport.CR_7BIT) {
this.encoding = USASCIIEncoding.INSTANCE;
} else {
this.encoding = encoding;
}
this.bytes = new ByteList(name.getBytes(encoding.getCharset()), encoding);
}

// String path (e.g. [':', str_beg, str_content, str_end])
public SymbolNode(ISourcePosition position, ByteList value) {
super(position, false);
this.name = value.toString().intern();

if (value.getEncoding() != USASCIIEncoding.INSTANCE) {
int size = value.realSize();
this.encoding = value.getEncoding().strLength(value.unsafeBytes(), value.begin(), size) == size ?
USASCIIEncoding.INSTANCE : value.getEncoding();
} else {
this.encoding = USASCIIEncoding.INSTANCE;
}
this.bytes = value;
}

public boolean equals(Object other) {
return other instanceof SymbolNode &&
name.equals(((SymbolNode) other).getName()) &&
encoding == ((SymbolNode) other).getEncoding();
bytes.equals(((SymbolNode) other).bytes) &&
bytes.getEncoding() == ((SymbolNode) other).getEncoding();
}

public NodeType getNodeType() {
@@ -96,14 +79,18 @@ public <T> T accept(NodeVisitor<T> iVisitor) {
* @return Returns a String
*/
public String getName() {
return name;
return new String(bytes.unsafeBytes(), bytes.getEncoding().getCharset());
}

public Encoding getEncoding() {
return encoding;
return bytes.getEncoding();
}

public List<Node> childNodes() {
return EMPTY_LIST;
}

public ByteList getBytes() {
return bytes;
}
}
3 changes: 1 addition & 2 deletions core/src/main/java/org/jruby/ir/IRBuilder.java
Original file line number Diff line number Diff line change
@@ -7,7 +7,6 @@
import org.jruby.ast.*;
import org.jruby.ast.types.INameNode;
import org.jruby.compiler.NotCompilableException;
import org.jruby.lexer.yacc.ISourcePosition;
import org.jruby.runtime.ArgumentDescriptor;
import org.jruby.runtime.ArgumentType;
import org.jruby.ir.instructions.*;
@@ -3821,7 +3820,7 @@ public Operand buildSValue(SValueNode node) {
public Operand buildSymbol(SymbolNode node) {
// Since symbols are interned objects, no need to copyAndReturnValue(...)
// SSS FIXME: Premature opt?
return new Symbol(node.getName(), node.getEncoding());
return new Symbol(node.getBytes());
}

public Operand buildTrue() {
Original file line number Diff line number Diff line change
@@ -17,19 +17,20 @@
import org.jruby.runtime.DynamicScope;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.ByteList;

public class ConstMissingInstr extends CallInstr implements FixedArityInstr {
private final String missingConst;
private final ByteList missingConst;

public ConstMissingInstr(Variable result, Operand currentModule, String missingConst, boolean isPotentiallyRefined) {
public ConstMissingInstr(Variable result, Operand currentModule, ByteList missingConst, boolean isPotentiallyRefined) {
// FIXME: Missing encoding knowledge of the constant name.
super(Operation.CONST_MISSING, CallType.FUNCTIONAL, result, "const_missing", currentModule,
new Operand[]{new Symbol(missingConst, USASCIIEncoding.INSTANCE)}, null, isPotentiallyRefined);
new Operand[]{new Symbol(missingConst)}, null, isPotentiallyRefined);

this.missingConst = missingConst;
}

public String getMissingConst() {
public ByteList getMissingConst() {
return missingConst;
}

@@ -48,7 +49,7 @@ public void encode(IRWriterEncoder e) {
}

public static ConstMissingInstr decode(IRReaderDecoder d) {
return new ConstMissingInstr(d.decodeVariable(), d.decodeOperand(), d.decodeString(), d.getCurrentScope().maybeUsingRefinements());
return new ConstMissingInstr(d.decodeVariable(), d.decodeOperand(), d.decodeByteList(), d.getCurrentScope().maybeUsingRefinements());
}

@Override
@@ -64,7 +65,7 @@ public String[] toStringNonOperandArgs() {
@Override
public Object interpret(ThreadContext context, StaticScope currScope, DynamicScope currDynScope, IRubyObject self, Object[] temp) {
RubyModule module = (RubyModule) getReceiver().retrieve(context, self, currScope, currDynScope, temp);
return module.callMethod(context, "const_missing", context.runtime.fastNewSymbol(missingConst));
return module.callMethod(context, "const_missing", context.runtime.newSymbol(missingConst));
}

@Override
26 changes: 16 additions & 10 deletions core/src/main/java/org/jruby/ir/operands/Symbol.java
Original file line number Diff line number Diff line change
@@ -2,37 +2,43 @@

import org.jcodings.Encoding;
import org.jcodings.specific.ASCIIEncoding;
import org.jruby.RubyString;
import org.jruby.RubySymbol;
import org.jruby.ir.IRVisitor;
import org.jruby.ir.persistence.IRReaderDecoder;
import org.jruby.ir.persistence.IRWriterEncoder;
import org.jruby.runtime.ThreadContext;
import org.jruby.util.ByteList;

public class Symbol extends ImmutableLiteral {
public static final Symbol KW_REST_ARG_DUMMY = new Symbol("", ASCIIEncoding.INSTANCE);

private final String name;
private final Encoding encoding;
private final ByteList bytes;

public Symbol(String name, Encoding encoding) {
super();

this.name = name;
this.encoding = encoding;
this.bytes = new ByteList(name.getBytes(encoding.getCharset()), encoding);
}

public Symbol(ByteList bytes) {
this.bytes = bytes;
}

@Override
public OperandType getOperandType() {
return OperandType.SYMBOL;
}

public String getName() {
return name;
public ByteList getBytes() {
return bytes;
}

public String getString() { return RubyString.byteListToString(bytes); }

@Override
public Object createCacheObject(ThreadContext context) {
return RubySymbol.newSymbol(context.runtime, getName(), encoding);
return RubySymbol.newSymbol(context.runtime, bytes);
}

@Override
@@ -41,18 +47,18 @@ public boolean canCopyPropagate() {
}

public Encoding getEncoding() {
return encoding;
return bytes.getEncoding();
}

@Override
public String toString() {
return ":'" + getName() + "'";
return ":'" + getString() + "'";
}

@Override
public void encode(IRWriterEncoder e) {
super.encode(e);
e.encode(getName());
e.encode(getString());
e.encode(getEncoding());
}

2 changes: 1 addition & 1 deletion core/src/main/java/org/jruby/ir/persistence/IRDumper.java
Original file line number Diff line number Diff line change
@@ -324,7 +324,7 @@ public void ObjectClass(ObjectClass objectclass) { }
public void StandardError(StandardError standarderror) { }
public void StringLiteral(StringLiteral stringliteral) { print(stringliteral.getByteList()); }
public void SValue(SValue svalue) { visit(svalue.getArray()); }
public void Symbol(Symbol symbol) { symbol.getName(); }
public void Symbol(Symbol symbol) { print(symbol.getBytes()); }
public void SymbolProc(SymbolProc symbolproc) { print(symbolproc.getName()); }
public void TemporaryVariable(TemporaryVariable temporaryvariable) { print(temporaryvariable.getName()); }
public void TemporaryLocalVariable(TemporaryLocalVariable temporarylocalvariable) { TemporaryVariable(temporarylocalvariable); }
Original file line number Diff line number Diff line change
@@ -324,10 +324,9 @@ public org.objectweb.asm.Label newLabel() {
*
* Stack required: none
*
* @param sym the symbol's string identifier
* @param encoding the symbol's encoding
* @param bytes the ByteList for the symbol
*/
public abstract void pushSymbol(String sym, Encoding encoding);
public abstract void pushSymbol(ByteList bytes);

/**
* Push a Symbol.to_proc on the stack.
12 changes: 4 additions & 8 deletions core/src/main/java/org/jruby/ir/targets/IRBytecodeAdapter6.java
Original file line number Diff line number Diff line change
@@ -343,17 +343,13 @@ public void run() {
}
}

public void pushSymbol(final String sym, final Encoding encoding) {
cacheValuePermanentlyLoadContext("symbol", RubySymbol.class, keyFor("symbol", sym, encoding), new Runnable() {
public void pushSymbol(final ByteList bytes) {
cacheValuePermanentlyLoadContext("symbol", RubySymbol.class, keyFor("symbol", bytes, bytes.getEncoding()), new Runnable() {
@Override
public void run() {
loadRuntime();
adapter.ldc(sym);
loadContext();
adapter.ldc(encoding.toString());
invokeIRHelper("retrieveJCodingsEncoding", sig(Encoding.class, ThreadContext.class, String.class));

adapter.invokestatic(p(RubySymbol.class), "newSymbol", sig(RubySymbol.class, Ruby.class, String.class, Encoding.class));
pushByteList(bytes);
adapter.invokestatic(p(RubyString.class), "newSymbol", sig(RubyString.class, Ruby.class, ByteList.class));
}
});
}
5 changes: 2 additions & 3 deletions core/src/main/java/org/jruby/ir/targets/JVMVisitor.java
Original file line number Diff line number Diff line change
@@ -1176,8 +1176,7 @@ public void ConstMissingInstr(ConstMissingInstr constmissinginstr) {
jvmAdapter().checkcast("org/jruby/RubyModule");
jvmMethod().loadContext();
jvmAdapter().ldc("const_missing");
// FIXME: This has lost it's encoding info by this point
jvmMethod().pushSymbol(constmissinginstr.getMissingConst(), USASCIIEncoding.INSTANCE);
jvmMethod().pushSymbol(constmissinginstr.getMissingConst());
jvmMethod().invokeVirtual(Type.getType(RubyModule.class), Method.getMethod("org.jruby.runtime.builtin.IRubyObject callMethod(org.jruby.runtime.ThreadContext, java.lang.String, org.jruby.runtime.builtin.IRubyObject)"));
jvmStoreLocal(constmissinginstr.getResult());
}
@@ -2544,7 +2543,7 @@ public void SValue(SValue svalue) {

@Override
public void Symbol(Symbol symbol) {
jvmMethod().pushSymbol(symbol.getName(), symbol.getEncoding());
jvmMethod().pushSymbol(symbol.getBytes());
}

@Override

0 comments on commit 01cd84d

Please sign in to comment.