jruby · Apr 5, 2015 · Apr 5, 2015 · Apr 5, 2015 · Apr 5, 2015 · Apr 5, 2015
diff --git a/core/src/main/java/org/jruby/ext/ripper/HeredocTerm.java b/core/src/main/java/org/jruby/ext/ripper/HeredocTerm.java
@@ -1,36 +1,39 @@
 /*
  ***** BEGIN LICENSE BLOCK *****
- * Version: CPL 1.0/GPL 2.0/LGPL 2.1
+ * Version: EPL 1.0/GPL 2.0/LGPL 2.1
  *
- * The contents of this file are subject to the Common Public
+ * The contents of this file are subject to the Eclipse Public
  * License Version 1.0 (the "License"); you may not use this file
  * except in compliance with the License. You may obtain a copy of
- * the License at http://www.eclipse.org/legal/cpl-v10.html
+ * the License at http://www.eclipse.org/legal/epl-v10.html
  *
  * Software distributed under the License is distributed on an "AS
  * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
  * implied. See the License for the specific language governing
  * rights and limitations under the License.
  *
- * Copyright (C) 2013 The JRuby Team (jruby@jruby.org)
- * 
+ * Copyright (C) 2004 Jan Arne Petersen <jpetersen@uni-bonn.de>
+ * Copyright (C) 2004-2007 Thomas E Enebo <enebo@acm.org>
+ *
  * Alternatively, the contents of this file may be used under the terms of
  * either of the GNU General Public License Version 2 or later (the "GPL"),
  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the CPL, indicate your
+ * use your version of this file under the terms of the EPL, indicate your
  * decision by deleting the provisions above and replace them with the notice
  * and other provisions required by the GPL or the LGPL. If you do not delete
  * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the CPL, the GPL or the LGPL.
+ * the terms of any one of the EPL, the GPL or the LGPL.
  ***** END LICENSE BLOCK *****/
 package org.jruby.ext.ripper;
 
 import org.jcodings.Encoding;
+import org.jruby.lexer.LexerSource;
 import org.jruby.util.ByteList;
 
+import static org.jruby.lexer.LexingCommon.*;
 
 /**
  * A lexing unit for scanning a heredoc element.
@@ -93,18 +96,18 @@ protected int restore(RipperLexer lexer) {
         lexer.heredoc_restore(this);
         lexer.setStrTerm(null);
 
-        return RipperLexer.EOF;        
+        return EOF;
     }
 
     @Override
     public int parseString(RipperLexer lexer, LexerSource src) throws java.io.IOException {
         ByteList str = null;
         ByteList eos = nd_lit;
         int len = nd_lit.length() - 1;
-        boolean indent = (flags & RipperLexer.STR_FUNC_INDENT) != 0;
+        boolean indent = (flags & STR_FUNC_INDENT) != 0;
         int c = lexer.nextc();
 
-        if (c == RipperLexer.EOF) return error(lexer, len, str, eos);
+        if (c == EOF) return error(lexer, len, str, eos);
 
         // Found end marker for this heredoc
         if (lexer.was_bol() && lexer.whole_match_p(nd_lit, indent)) {
@@ -113,21 +116,23 @@ public int parseString(RipperLexer lexer, LexerSource src) throws java.io.IOExce
             return Tokens.tSTRING_END;
         }
 
-        if ((flags & RipperLexer.STR_FUNC_EXPAND) == 0) {
+        if ((flags & STR_FUNC_EXPAND) == 0) {
             do {
                 ByteList lbuf = lexer.lex_lastline;
                 int p = 0;
                 int pend = lexer.lex_pend;
                 if (pend > p) {
-                    switch(lexer.p(pend-1)) { // ENEBO: This seems wrong.
+                    switch(lexer.p(pend-1)) {
                         case '\n':
                             pend--;
                             if (pend == p || lexer.p(pend-1) == '\r') {
                                 pend++;
                                 break;
                             }
+                            break;
                         case '\r':
                             pend--;
+                            break;
                     }
                 }
                 if (str != null) {
@@ -138,12 +143,8 @@ public int parseString(RipperLexer lexer, LexerSource src) throws java.io.IOExce
 
                 if (pend < lexer.lex_pend) str.append('\n');
                 lexer.lex_goto_eol();
-                if (lexer.nextc() == -1) {
-                    if (str != null) {
-                        str = null;
-                        return error(lexer, len, str, eos);
-                    }
-                }
+                // MRI null-checks str here, but str always seems to be non-null by this point (TODO confirm), so we skip the check.
+                if (lexer.nextc() == -1) return error(lexer, len, null, eos);
             } while(!lexer.whole_match_p(eos, indent));
         } else {
             ByteList tok = new ByteList();
@@ -168,7 +169,7 @@ public int parseString(RipperLexer lexer, LexerSource src) throws java.io.IOExce
                 Encoding enc[] = new Encoding[1];
                 enc[0] = lexer.getEncoding();
 
-                if ((c = new StringTerm(flags, '\0', '\n').parseStringIntoBuffer(lexer, src, tok, enc)) == RipperLexer.EOF) {
+                if ((c = new StringTerm(flags, '\0', '\n').parseStringIntoBuffer(lexer, src, tok, enc)) == EOF) {
                     if (lexer.eofp) return error(lexer, len, str, eos);
                     return restore(lexer);
                 }
@@ -179,7 +180,7 @@ public int parseString(RipperLexer lexer, LexerSource src) throws java.io.IOExce
                 }
                 tok.append(lexer.nextc());
 
-                if ((c = lexer.nextc()) == RipperLexer.EOF) return error(lexer, len, str, eos);
+                if ((c = lexer.nextc()) == EOF) return error(lexer, len, str, eos);
             } while (!lexer.whole_match_p(eos, indent));
             str = tok;
         }

diff --git a/core/src/main/java/org/jruby/ext/ripper/RipperLexer.java b/core/src/main/java/org/jruby/ext/ripper/RipperLexer.java
@@ -32,42 +32,30 @@
 import java.math.BigDecimal;
 import java.util.HashMap;
 import org.jcodings.Encoding;
-import org.jcodings.specific.ASCIIEncoding;
-import org.jcodings.specific.USASCIIEncoding;
-import org.jcodings.specific.UTF8Encoding;
 import org.joni.Matcher;
 import org.joni.Option;
 import org.joni.Regex;
 import org.jruby.Ruby;
 import org.jruby.RubyRegexp;
+import org.jruby.lexer.LexerSource;
 import org.jruby.lexer.yacc.StackState;
 import org.jruby.runtime.builtin.IRubyObject;
 import org.jruby.util.ByteList;
 import org.jruby.util.SafeDoubleParser;
 import org.jruby.util.StringSupport;
 
+import static org.jruby.lexer.LexingCommon.*;
+import static org.jruby.lexer.LexingCommon.parseMagicComment;
+
 /**
  *
  * @author enebo
  */
 public class RipperLexer {
-    public static final Encoding UTF8_ENCODING = UTF8Encoding.INSTANCE;
-    public static final Encoding USASCII_ENCODING = USASCIIEncoding.INSTANCE;
-    public static final Encoding ASCII8BIT_ENCODING = ASCIIEncoding.INSTANCE;
-
-    private static ByteList END_MARKER = new ByteList(new byte[] {'_', '_', 'E', 'N', 'D', '_', '_'});
-    private static ByteList BEGIN_DOC_MARKER = new ByteList(new byte[] {'b', 'e', 'g', 'i', 'n'});
-    private static ByteList END_DOC_MARKER = new ByteList(new byte[] {'e', 'n', 'd'});
-    private static ByteList CODING = new ByteList(new byte[] {'c', 'o', 'd', 'i', 'n', 'g'});
     private static final HashMap<String, Keyword> map;
 
-    private static final int SUFFIX_R = 1<<0;
-    private static final int SUFFIX_I = 1<<1;
-    private static final int SUFFIX_ALL = 3;
-
     static {
-        map = new HashMap<String, Keyword>();
-
+        map = new HashMap<>();
         map.put("end", Keyword.END);
         map.put("else", Keyword.ELSE);
         map.put("case", Keyword.CASE);
@@ -222,12 +210,12 @@ public enum Keyword {
         WHILE ("while", Tokens.kWHILE, Tokens.kWHILE_MOD, LexState.EXPR_BEG),
         ALIAS ("alias", Tokens.kALIAS, Tokens.kALIAS, LexState.EXPR_FNAME),
         __ENCODING__("__ENCODING__", Tokens.k__ENCODING__, Tokens.k__ENCODING__, LexState.EXPR_END);
-        
+
         public final String name;
         public final int id0;
         public final int id1;
         public final LexState state;
-        
+
         Keyword(String name, int id0, int id1, LexState state) {
             this.name = name;
             this.id0 = id0;
@@ -277,25 +265,9 @@ public static Keyword getKeyword(String str) {
     private StrTerm lex_strterm;
     public boolean commandStart;
 
-    // Give a name to a value.  Enebo: This should be used more.
-    static final int EOF = -1; // 0 in MRI
-
-    // ruby constants for strings (should this be moved somewhere else?)
-    static final int STR_FUNC_ESCAPE=0x01;
-    static final int STR_FUNC_EXPAND=0x02;
-    static final int STR_FUNC_REGEXP=0x04;
-    static final int STR_FUNC_QWORDS=0x08;
-    static final int STR_FUNC_SYMBOL=0x10;
     // When the heredoc identifier specifies <<-EOF that indents before ident. are ok (the '-').
     static final int STR_FUNC_INDENT=0x20;
 
-    private static final int str_squote = 0;
-    private static final int str_dquote = STR_FUNC_EXPAND;
-    private static final int str_xquote = STR_FUNC_EXPAND;
-    private static final int str_regexp = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND;
-    private static final int str_ssym   = STR_FUNC_SYMBOL;
-    private static final int str_dsym   = STR_FUNC_SYMBOL | STR_FUNC_EXPAND;
-
     // Count of nested parentheses (1.9 only)
     private int parenNest = 0;
     // 1.9 only
@@ -441,7 +413,7 @@ public int column() {
     }
 
     public int lineno() {
-        return ruby_sourceline + src.getLineOffset();
+        return ruby_sourceline + src.getLineOffset() - 1;
     }
 
     public void dispatchHeredocEnd() {
@@ -905,6 +877,7 @@ private int hereDocumentIdentifier() throws IOException {
                 return 0;
             }
             markerValue = new ByteList();
+            markerValue.setEncoding(current_enc);
             term = '"';
             func |= str_dquote;
             do {
@@ -927,43 +900,6 @@ private void arg_ambiguous() {
         parser.dispatch("on_arg_ambiguous");
     }
 
-
-    /* MRI: magic_comment_marker */
-    /* This impl is a little sucky.  We basically double scan the same bytelist twice.  Once here
-     * and once in parseMagicComment.
-     */
-    private int magicCommentMarker(ByteList str, int begin) {
-        int i = begin;
-        int len = str.length();
-
-        while (i < len) {
-            switch (str.charAt(i)) {
-                case '-':
-                    if (i >= 2 && str.charAt(i - 1) == '*' && str.charAt(i - 2) == '-') return i + 1;
-                    i += 2;
-                    break;
-                case '*':
-                    if (i + 1 >= len) return -1;
-
-                    if (str.charAt(i + 1) != '-') {
-                        i += 4;
-                    } else if (str.charAt(i - 1) != '-') {
-                        i += 2;
-                    } else {
-                        return i + 2;
-                    }
-                    break;
-                default:
-                    i += 3;
-                    break;
-            }
-        }
-        return -1;
-    }
-
-    private static final String magicString = "([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*";
-    private static final Regex magicRegexp = new Regex(magicString.getBytes(), 0, magicString.length(), 0, Encoding.load("ASCII"));
-
     private boolean comment_at_top() {
         int p = lex_pbeg;
         int pend = lex_p - 1;
@@ -974,35 +910,6 @@ private boolean comment_at_top() {
         }
         return true;
     }
-
-    // MRI: parser_magic_comment
-    protected boolean parseMagicComment(ByteList magicLine) throws IOException {
-        int length = magicLine.length();
-        if (length <= 7) return false;
-        int beg = magicCommentMarker(magicLine, 0);
-        if (beg < 0) return false;
-        int end = magicCommentMarker(magicLine, beg);
-        if (end < 0) return false;
-
-        // We only use a regex if -*- ... -*- is found.  Not too hot a path?
-        int realSize = magicLine.getRealSize();
-        int begin = magicLine.getBegin();
-        Matcher matcher = magicRegexp.matcher(magicLine.getUnsafeBytes(), begin, begin + realSize);
-        int result = RubyRegexp.matcherSearch(getRuntime(), matcher, begin, begin + realSize, Option.NONE);
-        if (result < 0) return false;
-
-        // Regexp is guarateed to have three matches
-        int begs[] = matcher.getRegion().beg;
-        int ends[] = matcher.getRegion().end;
-        String name = magicLine.subSequence(begs[1], ends[1]).toString();
-        if (!name.equalsIgnoreCase("encoding")) return false;
-
-        ByteList val = new ByteList(magicLine.getUnsafeBytes(), begs[2], ends[2] - begs[2]);
-
-        parser.dispatch("on_magic_comment", parser.getRuntime().newString(name), createStr(val, 0));
-
-        return true;
-    }
 
     protected void set_file_encoding(int str, int send) {
         boolean sep = false;
@@ -1181,7 +1088,7 @@ private String printToken(int token) {
             case Tokens.tLABEL: return "tLABEL("+ ((Token) value()).getValue() +":),";
             case '\n': return "NL";
             case EOF: return "EOF";
-            default: return "'" + (char)token + " [" + (int) token + "',";
+            default: return "'" + (char)token + " [" + token + "',";
         }
     }
 
@@ -1472,17 +1379,20 @@ private int yylex() throws IOException {
                 dispatchScanEvent(Tokens.tSP);
                 continue;
             }
-            case '#':		/* it's a comment */
-                if (!parseMagicComment(lexb.makeShared(lex_p, lex_pend - lex_p))) {
-                    if (comment_at_top()) {
-                        set_file_encoding(lex_p, lex_pend);
-                    }
+            case '#': { /* it's a comment */
+                ByteList encodingName = parseMagicComment(getRuntime(), lexb.makeShared(lex_p, lex_pend - lex_p));
+                // FIXME: Add a boolean marking that we should stop searching for a magic comment (one was already found or we are too far into the file).
+                if (encodingName != null) {
+                    setEncoding(encodingName);
+                } else if (comment_at_top()) {
+                    set_file_encoding(lex_p, lex_pend);
                 }
                 lex_p = lex_pend;
                 dispatchScanEvent(Tokens.tCOMMENT);
-                    
+
                 fallthru = true;
-                /* fall through */
+            }
+            /* fall through */
             case '\n':
                 switch (lex_state) {
                     case EXPR_BEG:
@@ -2839,8 +2749,6 @@ public void readUTFEscapeRegexpLiteral(ByteList buffer) throws IOException {
         buffer.setEncoding(UTF8_ENCODING);
     }
 
-    private byte[] mbcBuf = new byte[6];
-
     // mri: parser_tokadd_mbchar
     // This is different than MRI in that we return a boolean since we only care whether it was added
     // or not.  The MRI version returns the byte supplied which is never used as a value.

diff --git a/core/src/main/java/org/jruby/ext/ripper/RipperParser.java b/core/src/main/java/org/jruby/ext/ripper/RipperParser.java
@@ -32,6 +32,7 @@
 package org.jruby.ext.ripper;
 
 import org.jruby.RubyArray;
+import org.jruby.lexer.LexerSource;
 import org.jruby.runtime.ThreadContext;
 import org.jruby.runtime.builtin.IRubyObject;
 import org.jruby.ext.ripper.RipperLexer.LexState;
@@ -40,7 +41,7 @@ public class RipperParser extends RipperParserBase {
     public RipperParser(ThreadContext context, IRubyObject ripper, LexerSource source) {
         super(context, ripper, source);
     }
-					// line 44 "-"
+					// line 45 "-"
   // %token constants
   public static final int kCLASS = 257;
   public static final int kMODULE = 258;
@@ -4645,6 +4646,6 @@ public Object yyparse (RipperLexer yyLex) throws java.io.IOException {
   }
 };
 }
-					// line 2069 "RipperParser.y"
+					// line 2070 "RipperParser.y"
 }
-					// line 9136 "-"
+					// line 9137 "-"
diff --git a/core/src/main/java/org/jruby/ext/ripper/RipperParser.y b/core/src/main/java/org/jruby/ext/ripper/RipperParser.y
@@ -29,6 +29,7 @@
 package org.jruby.ext.ripper;
 
 import org.jruby.RubyArray;
+import org.jruby.lexer.LexerSource;
 import org.jruby.runtime.ThreadContext;
 import org.jruby.runtime.builtin.IRubyObject;
 import org.jruby.ext.ripper.RipperLexer.LexState;