Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: jruby/jruby
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: c9790ea19b97
Choose a base ref
...
head repository: jruby/jruby
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 2614c773b8f4
Choose a head ref
  • 8 commits
  • 8 files changed
  • 2 contributors

Commits on Jan 14, 2018

  1. Extract parseRegexpFlags to Lexer

    And use the same function from both Ripper and main parser.
    grddev committed Jan 14, 2018
    Copy the full SHA
    0829a85 View commit details
  2. Move Regexp checking from ParserSupport to Lexer

    While not really related to Lexing, this is a component that is shared between Ripper and the main parser, and that seemed like the lesser evil.
    grddev committed Jan 14, 2018
    Copy the full SHA
    54a483b View commit details
  3. Align validation code with MRI

    It seems to have been `!ENCODING_IS_ASCII8BIT(str)` from the beginning, so I'm not sure why it was the opposite here.
    grddev committed Jan 14, 2018
    Copy the full SHA
    57661ef View commit details
  4. Make sure to clear $! when rescuing RaiseException

    The code was copied from ParserSupport, so it was clearly broken before, but it had to be fixed now as parts of the Ripper test suite rely on $! rather than explicitly catching the exception.
    grddev committed Jan 14, 2018
    Copy the full SHA
    a089f8a View commit details
  5. Add support for validating Regexp in Ripper

    This uses the fact that Regexp tokenization is handled by a single StringTerm, and thus all tSTRING_CONTENT fragments are easily collectable until the tREGEXP_END comes with the options that we need for validation.
    
    The validation itself is a copied/simplified version of what is performed by the main parser, as large parts of the validation depended on the AST structure, which we do not have here.
    
    Technically, this doesn't perform the validation at the same point in time as the main parser, as it performs the validation when encountering the tREGEXP_END token rather than when processing the regexp rule.
    
    I speculate that the difference doesn't really matter given that the only thing we could do with the tREGEXP_END token is to apply the regexp rule.
    grddev committed Jan 14, 2018
    Copy the full SHA
    38d526b View commit details
  6. Simplify the regexp validation logic

    Use a separate variable to track whether things are dynamic or not, and use a List to avoid tracking the last element explicitly.
    grddev committed Jan 14, 2018
    Copy the full SHA
    d3a8051 View commit details
  7. Inline the regexp validation inside ParserSupport

    The methods were only retained to provide the old interface, but by directly calling the new methods in the Lexer, we can remove the old methods, given that we don't really need to be backwards compatible here.
    grddev committed Jan 14, 2018
    Copy the full SHA
    836f80e View commit details
  8. Merge pull request #4902 from grddev/ripper-regex

    Add support for validating Regexp in Ripper
    enebo authored Jan 14, 2018
    Copy the full SHA
    2614c77 View commit details
18 changes: 18 additions & 0 deletions core/src/main/java/org/jruby/ext/ripper/RipperLexer.java
Original file line number Diff line number Diff line change
@@ -37,6 +37,7 @@
import org.jruby.lexer.LexingCommon;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.ByteList;
import org.jruby.util.RegexpOptions;
import org.jruby.util.SafeDoubleParser;
import org.jruby.util.StringSupport;
import org.jruby.util.cli.Options;
@@ -405,6 +406,23 @@ protected void setCompileOptionFlag(String name, ByteList value) {
}
}

/**
 * Parses the regexp option letters via {@code parseRegexpFlags(StringBuilder)} and
 * reports every unrecognised letter in one compile error.
 *
 * @return the options produced by the shared flag parser
 * @throws IOException propagated from the shared flag parser
 */
@Override
protected RegexpOptions parseRegexpFlags() throws IOException {
    final StringBuilder rejected = new StringBuilder(10);
    final RegexpOptions parsed = parseRegexpFlags(rejected);

    if (rejected.length() == 0) {
        return parsed;
    }

    // Pluralise "option" only when more than one letter was rejected.
    final String plural = rejected.length() > 1 ? "s" : "";
    compile_error("unknown regexp option" + plural + " - " + rejected);
    return parsed;
}

/**
 * Raises a compile error stating that the regexp encoding option disagrees with
 * the source encoding.
 *
 * @param optionEncoding encoding selected by the regexp option; rendered as its option letter
 * @param encoding       encoding it conflicts with; rendered via string concatenation
 */
@Override
protected void mismatchedRegexpEncodingError(Encoding optionEncoding, Encoding encoding) {
    final char optionLetter = optionsEncodingChar(optionEncoding);
    final String message = "regexp encoding option '" + optionLetter
            + "' differs from source encoding '" + encoding + "'";
    compile_error(message);
}

@Override
protected void setTokenInfo(String name, ByteList value) {

57 changes: 33 additions & 24 deletions core/src/main/java/org/jruby/ext/ripper/StringTerm.java
Original file line number Diff line number Diff line change
@@ -28,10 +28,14 @@
package org.jruby.ext.ripper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jcodings.Encoding;
import org.jruby.Ruby;
import org.jruby.lexer.LexerSource;
import org.jruby.parser.RubyParser;
import org.jruby.util.ByteList;
import org.jruby.util.RegexpOptions;

import static org.jruby.lexer.LexingCommon.*;

@@ -48,11 +52,17 @@ public class StringTerm extends StrTerm {
// How many strings are nested in the current string term
private int nest;

private List<ByteList> regexpFragments;
private boolean regexpDynamic;

public StringTerm(int flags, int begin, int end) {
this.flags = flags;
this.begin = (char) begin;
this.end = (char) end;
this.nest = 0;
if ((flags & STR_FUNC_REGEXP) != 0) {
this.regexpFragments = new ArrayList<>();
}
}

public int getFlags() {
@@ -69,11 +79,27 @@ private int endFound(RipperLexer lexer) throws IOException {
return ' ';
}

if ((flags & STR_FUNC_REGEXP) != 0) return parseRegexpFlags(lexer);
if ((flags & STR_FUNC_REGEXP) != 0) {
validateRegexp(lexer);
return RubyParser.tREGEXP_END;
}

return RubyParser.tSTRING_END;
}

/**
 * Parses the trailing regexp flags, then validates the fragments collected for
 * this regexp. Every fragment is checked individually; a full syntax check is run
 * only when the regexp is a single, non-dynamic fragment. The collected state is
 * reset afterwards.
 *
 * @param lexer lexer used to read the flags and perform the checks
 * @throws IOException propagated from the flag parsing
 */
private void validateRegexp(RipperLexer lexer) throws IOException {
    final Ruby runtime = lexer.getRuntime();
    final RegexpOptions parsedOptions = lexer.parseRegexpFlags();

    final int fragmentCount = regexpFragments.size();
    for (int i = 0; i < fragmentCount; i++) {
        lexer.checkRegexpFragment(runtime, regexpFragments.get(i), parsedOptions);
    }

    final boolean singleStaticFragment = !regexpDynamic && fragmentCount == 1;
    if (singleStaticFragment) {
        lexer.checkRegexpSyntax(runtime, regexpFragments.get(0), parsedOptions);
    }

    // Reset the per-regexp bookkeeping.
    regexpFragments.clear();
    regexpDynamic = false;
}

@Override
public int parseString(RipperLexer lexer, LexerSource src) throws IOException {
boolean spaceSeen = false;
@@ -108,6 +134,9 @@ public int parseString(RipperLexer lexer, LexerSource src) throws IOException {
int token = lexer.peekVariableName(RubyParser.tSTRING_DVAR, RubyParser.tSTRING_DBEG);

if (token != 0) {
if ((flags & STR_FUNC_REGEXP) != 0) {
regexpDynamic = true;
}
return token;
} else {
buffer.append(c);
@@ -129,33 +158,13 @@ public int parseString(RipperLexer lexer, LexerSource src) throws IOException {
}

lexer.setValue(lexer.createStr(buffer, flags));
if ((flags & STR_FUNC_REGEXP) != 0) {
regexpFragments.add(buffer);
}
lexer.flush_string_content(enc[0]);
return RubyParser.tSTRING_CONTENT;
}

/**
 * Consumes the run of letters that follows a regexp literal and reports the
 * unrecognised ones. The letters i, x, m, o, n, e, s and u are accepted and
 * otherwise ignored; every other letter is collected and reported through a
 * single compile_error call.
 *
 * @param lexer lexer to read characters from and to report errors on
 * @return always RubyParser.tREGEXP_END
 * @throws IOException declared by this method; presumably raised by the lexer's reads
 */
private int parseRegexpFlags(RipperLexer lexer) throws IOException {
int c;
StringBuilder unknownFlags = new StringBuilder(10);

// Read while the next character is a letter; stop at EOF or the first non-letter.
for (c = lexer.nextc(); c != EOF
&& Character.isLetter(c); c = lexer.nextc()) {
switch (c) {
case 'i': case 'x': case 'm': case 'o': case 'n':
case 'e': case 's': case 'u':
break;
default:
unknownFlags.append((char) c);
break;
}
}
// The character that ended the loop is not a flag; hand it back to the lexer.
lexer.pushback(c);
if (unknownFlags.length() != 0) {
lexer.compile_error("unknown regexp option" + (unknownFlags.length() > 1 ? "s" : "") + " - " + unknownFlags.toString());
}

return RubyParser.tREGEXP_END;
}

/**
 * Reports a compile error for two encodings being mixed within one literal.
 *
 * @param lexer          lexer on which the error is reported
 * @param foundEncoding  the encoding that was encountered; named first in the message
 * @param parserEncoding the encoding it was mixed into; named last in the message
 */
private void mixedEscape(RipperLexer lexer, Encoding foundEncoding, Encoding parserEncoding) {
    // Include the encountered encoding so the message is not just " mixed within X";
    // this matches the text produced by the main parser's StringTerm.mixedEscape.
    lexer.compile_error(foundEncoding + " mixed within " + parserEncoding);
}
121 changes: 121 additions & 0 deletions core/src/main/java/org/jruby/lexer/LexingCommon.java
Original file line number Diff line number Diff line change
@@ -13,11 +13,16 @@
import org.jruby.Ruby;
import org.jruby.RubyEncoding;
import org.jruby.RubyRegexp;
import org.jruby.exceptions.RaiseException;
import org.jruby.javasupport.ext.JavaLang;
import org.jruby.lexer.yacc.ISourcePosition;
import org.jruby.lexer.yacc.SimpleSourcePosition;
import org.jruby.lexer.yacc.StackState;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.ByteList;
import org.jruby.util.KCode;
import org.jruby.util.RegexpOptions;
import org.jruby.util.StringSupport;
import org.jruby.util.io.EncodingUtils;

@@ -1087,4 +1092,120 @@ protected boolean onMagicComment(String name, ByteList value) {
}
return false;
}

/**
 * Parses the option letters that follow a regexp literal and returns the resulting options.
 * Implementations are expected to decide how unknown letters are reported; the shared
 * scanning logic lives in {@code parseRegexpFlags(StringBuilder)}.
 *
 * @return the parsed regexp options
 * @throws IOException declared for implementations that read from the source
 */
protected abstract RegexpOptions parseRegexpFlags() throws IOException;

/**
 * Scans the run of letters at the current position and turns the recognised ones
 * into a {@link RegexpOptions}. Letters that are not recognised are appended to
 * {@code unknownFlags}, leaving error reporting to the caller. The first character
 * that is not a letter (or EOF) is pushed back.
 *
 * @param unknownFlags receives every unrecognised letter, in input order
 * @return options reflecting the recognised letters
 * @throws IOException declared by this method; presumably raised by the character reads
 */
protected RegexpOptions parseRegexpFlags(StringBuilder unknownFlags) throws IOException {
    final RegexpOptions result = new RegexpOptions();

    newtok(true);
    int ch = nextc();
    while (ch != EOF && Character.isLetter(ch)) {
        switch (ch) {
            case 'i':
                result.setIgnorecase(true);
                break;
            case 'x':
                result.setExtended(true);
                break;
            case 'm':
                result.setMultiline(true);
                break;
            case 'o':
                result.setOnce(true);
                break;
            case 'n':
                result.setExplicitKCode(KCode.NONE);
                break;
            case 'e':
                result.setExplicitKCode(KCode.EUC);
                break;
            case 's':
                result.setExplicitKCode(KCode.SJIS);
                break;
            case 'u':
                result.setExplicitKCode(KCode.UTF8);
                break;
            case 'j':
                result.setJava(true);
                break;
            default:
                unknownFlags.append((char) ch);
                break;
        }
        ch = nextc();
    }
    // Whatever ended the scan is not a flag; return it to the input.
    pushback(ch);

    return result;
}

/**
 * Applies the regexp encoding rules to {@code value} and runs the regexp
 * preprocess check on it. A {@link RaiseException} from the check is converted
 * into a compile error, after restoring the error info that was current before
 * the check ran.
 *
 * @param runtime the runtime supplying the current thread context
 * @param value   regexp fragment to check; its encoding may be changed
 * @param options regexp options used for the encoding setup
 */
public void checkRegexpFragment(Ruby runtime, ByteList value, RegexpOptions options) {
    setRegexpEncoding(runtime, value, options);

    final ThreadContext current = runtime.getCurrentContext();
    final IRubyObject previousErrorInfo = current.getErrorInfo();
    try {
        RubyRegexp.preprocessCheck(runtime, value);
    } catch (RaiseException failure) {
        // Put the earlier error info back before reporting the failure.
        current.setErrorInfo(previousErrorInfo);
        compile_error(failure.getMessage());
    }
}

/**
 * Syntax-checks a complete regexp source by building a regexp from it. Sources
 * starting with {@code (?u)}, {@code (?a)} or {@code (?d)} are skipped. A
 * {@link RaiseException} is converted into a compile error after restoring the
 * error info that was current before the check ran.
 *
 * @param runtime the runtime supplying the current thread context
 * @param value   regexp source to check
 * @param options regexp options; a clone is handed to the regexp construction
 */
public void checkRegexpSyntax(Ruby runtime, ByteList value, RegexpOptions options) {
    final String source = value.toString();

    // Joni doesn't support these modifiers - but we can fix up in some cases - let the error delay until we try that
    final boolean startsWithUnsupportedModifier =
            source.startsWith("(?u)") || source.startsWith("(?a)") || source.startsWith("(?d)");
    if (startsWithUnsupportedModifier) {
        return;
    }

    final ThreadContext current = runtime.getCurrentContext();
    final IRubyObject previousErrorInfo = current.getErrorInfo();
    try {
        // Only a syntax check, but as a side effect this creates an entry in the regexp cache.
        RubyRegexp.newRegexpParser(runtime, value, (RegexpOptions) options.clone());
    } catch (RaiseException failure) {
        current.setErrorInfo(previousErrorInfo);
        compile_error(failure.getMessage());
    }
}

/**
 * Reports that the encoding selected by a regexp option conflicts with the encoding
 * of the regexp content. Invoked from {@code setRegexpEncoding}.
 *
 * @param optionEncoding encoding associated with the regexp option
 * @param encoding       encoding of the regexp content being checked
 */
protected abstract void mismatchedRegexpEncodingError(Encoding optionEncoding, Encoding encoding);

/**
 * Sets the encoding of a regexp fragment according to the regexp options,
 * reporting a mismatch when the fragment is not 7-bit ASCII and its encoding
 * conflicts with the one requested.
 *
 * <ul>
 *   <li>Options yield an encoding: {@code value} is switched to it.</li>
 *   <li>Options request "no encoding": {@code value} is switched to ASCII-8BIT.</li>
 *   <li>Otherwise, when the lexer encoding is US-ASCII: 7-bit content becomes
 *       ASCII-8BIT and anything else is tagged US-ASCII.</li>
 * </ul>
 *
 * @param runtime runtime passed to the options setup
 * @param value   regexp fragment whose encoding is checked and updated in place
 * @param options regexp options; {@code setup} is invoked on them
 */
// MRI: reg_fragment_setenc_gen
public void setRegexpEncoding(Ruby runtime, ByteList value, RegexpOptions options) {
    Encoding optionsEncoding = options.setup(runtime);

    // Change encoding to one specified by regexp options as long as the string is compatible.
    if (optionsEncoding != null) {
        if (optionsEncoding != value.getEncoding() && !is7BitASCII(value)) {
            mismatchedRegexpEncodingError(optionsEncoding, value.getEncoding());
        }

        value.setEncoding(optionsEncoding);
    } else if (options.isEncodingNone()) {
        if (value.getEncoding() != ASCII8BIT_ENCODING && !is7BitASCII(value)) {
            // optionsEncoding is always null in this branch, which would render the option
            // as ' ' in the message. MRI reports 'n' here; optionsEncodingChar maps
            // US-ASCII to 'n', so pass that instead of null.
            mismatchedRegexpEncodingError(USASCII_ENCODING, value.getEncoding());
        }
        value.setEncoding(ASCII8BIT_ENCODING);
    } else if (getEncoding() == USASCII_ENCODING) {
        if (!is7BitASCII(value)) {
            value.setEncoding(USASCII_ENCODING); // This will raise later
        } else {
            value.setEncoding(ASCII8BIT_ENCODING);
        }
    }
}

/**
 * Tells whether a code-range scan of {@code value}, using its own encoding,
 * yields {@code StringSupport.CR_7BIT}.
 *
 * @param value bytes to scan
 * @return {@code true} when the scan result is {@code CR_7BIT}
 */
private boolean is7BitASCII(ByteList value) {
    return StringSupport.CR_7BIT == StringSupport.codeRangeScan(value.getEncoding(), value);
}

// TODO: move this mapping somewhere more consolidated.
/**
 * Maps a regexp option encoding to the option letter used in error messages.
 *
 * @param optionEncoding encoding to translate
 * @return 'n' for US-ASCII, 'e' for EUC-JP, 's' for Shift_JIS, 'u' for UTF-8,
 *         and a space for anything else
 */
protected char optionsEncodingChar(Encoding optionEncoding) {
    final char letter;
    if (optionEncoding == USASCII_ENCODING) {
        letter = 'n';
    } else if (optionEncoding == org.jcodings.specific.EUCJPEncoding.INSTANCE) {
        letter = 'e';
    } else if (optionEncoding == org.jcodings.specific.SJISEncoding.INSTANCE) {
        letter = 's';
    } else if (optionEncoding == UTF8_ENCODING) {
        letter = 'u';
    } else {
        letter = ' ';
    }
    return letter;
}
}
18 changes: 18 additions & 0 deletions core/src/main/java/org/jruby/lexer/yacc/RubyLexer.java
Original file line number Diff line number Diff line change
@@ -62,6 +62,7 @@
import org.jruby.parser.ParserSupport;
import org.jruby.parser.RubyParser;
import org.jruby.util.ByteList;
import org.jruby.util.RegexpOptions;
import org.jruby.util.SafeDoubleParser;
import org.jruby.util.StringSupport;
import org.jruby.util.cli.Options;
@@ -438,6 +439,23 @@ protected void setCompileOptionFlag(String name, ByteList value) {
parserSupport.getConfiguration().setFrozenStringLiteral(b == 1);
}

/**
 * Parses the regexp option letters via {@code parseRegexpFlags(StringBuilder)} and
 * reports every unrecognised letter in one {@code PID.REGEXP_UNKNOWN_OPTION}
 * compile error.
 *
 * @return the options produced by the shared flag parser
 * @throws IOException propagated from the shared flag parser
 */
@Override
protected RegexpOptions parseRegexpFlags() throws IOException {
    final StringBuilder rejected = new StringBuilder(10);
    final RegexpOptions parsed = parseRegexpFlags(rejected);

    if (rejected.length() == 0) {
        return parsed;
    }

    // Pluralise "option" only when more than one letter was rejected.
    final String plural = rejected.length() > 1 ? "s" : "";
    compile_error(PID.REGEXP_UNKNOWN_OPTION, "unknown regexp option" + plural + " - " + rejected);
    return parsed;
}

/**
 * Raises a {@code PID.REGEXP_ENCODING_MISMATCH} compile error stating that the
 * regexp encoding option disagrees with the source encoding.
 *
 * @param optionEncoding encoding selected by the regexp option; rendered as its option letter
 * @param encoding       encoding it conflicts with; rendered via string concatenation
 */
@Override
protected void mismatchedRegexpEncodingError(Encoding optionEncoding, Encoding encoding) {
    final char optionLetter = optionsEncodingChar(optionEncoding);
    final String message = "regexp encoding option '" + optionLetter
            + "' differs from source encoding '" + encoding + "'";
    compile_error(PID.REGEXP_ENCODING_MISMATCH, message);
}

private final ByteList TRUE = new ByteList(new byte[] {'t', 'r', 'u', 'e'});
private final ByteList FALSE = new ByteList(new byte[] {'f', 'a', 'l', 's', 'e'});
protected int asTruth(String name, ByteList value) {
52 changes: 1 addition & 51 deletions core/src/main/java/org/jruby/lexer/yacc/StringTerm.java
Original file line number Diff line number Diff line change
@@ -33,7 +33,6 @@
import org.jruby.lexer.yacc.SyntaxException.PID;
import org.jruby.parser.RubyParser;
import org.jruby.util.ByteList;
import org.jruby.util.KCode;
import org.jruby.util.RegexpOptions;

import static org.jruby.lexer.LexingCommon.*;
@@ -80,7 +79,7 @@ private int endFound(RubyLexer lexer) throws IOException {
}

if ((flags & STR_FUNC_REGEXP) != 0) {
RegexpOptions options = parseRegexpFlags(lexer);
RegexpOptions options = lexer.parseRegexpFlags();
ByteList regexpBytelist = ByteList.create("");

lexer.setValue(new RegexpNode(lexer.getPosition(), regexpBytelist, options));
@@ -140,55 +139,6 @@ public int parseString(RubyLexer lexer) throws IOException {
return RubyParser.tSTRING_CONTENT;
}

/**
 * Reads the run of letters that follows a regexp literal and converts the
 * recognised ones into a RegexpOptions. Unrecognised letters are collected and
 * reported through a single PID.REGEXP_UNKNOWN_OPTION compile error.
 *
 * @param lexer lexer to read characters from and to report errors on
 * @return options reflecting the recognised letters
 * @throws IOException declared by this method; presumably raised by the lexer's reads
 */
private RegexpOptions parseRegexpFlags(RubyLexer lexer) throws IOException {
RegexpOptions options = new RegexpOptions();
int c;
StringBuilder unknownFlags = new StringBuilder(10);

lexer.newtok(true);
// Read while the next character is a letter; stop at EOF or the first non-letter.
for (c = lexer.nextc(); c != EOF
&& Character.isLetter(c); c = lexer.nextc()) {
switch (c) {
case 'i':
options.setIgnorecase(true);
break;
case 'x':
options.setExtended(true);
break;
case 'm':
options.setMultiline(true);
break;
case 'o':
options.setOnce(true);
break;
case 'n':
options.setExplicitKCode(KCode.NONE);
break;
case 'e':
options.setExplicitKCode(KCode.EUC);
break;
case 's':
options.setExplicitKCode(KCode.SJIS);
break;
case 'u':
options.setExplicitKCode(KCode.UTF8);
break;
case 'j':
options.setJava(true);
break;
default:
unknownFlags.append((char) c);
break;
}
}
// The character that ended the loop is not a flag; hand it back to the lexer.
lexer.pushback(c);
if (unknownFlags.length() != 0) {
lexer.compile_error(PID.REGEXP_UNKNOWN_OPTION, "unknown regexp option" +
(unknownFlags.length() > 1 ? "s" : "") + " - " + unknownFlags);
}
return options;
}

/**
 * Reports a {@code PID.MIXED_ENCODING} compile error naming both encodings.
 *
 * @param lexer          lexer on which the error is reported
 * @param foundEncoding  encoding named first in the message
 * @param parserEncoding encoding named last in the message
 */
private void mixedEscape(RubyLexer lexer, Encoding foundEncoding, Encoding parserEncoding) {
    final String message = foundEncoding + " mixed within " + parserEncoding;
    lexer.compile_error(PID.MIXED_ENCODING, message);
}
Loading