Skip to content

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: jruby/jruby
Failed to load repositories. Confirm that selected base ref is valid, then try again.
base: 745fcccc6841
Choose a base ref
head repository: jruby/jruby
Failed to load repositories. Confirm that selected head ref is valid, then try again.
compare: c6e43c076899
Choose a head ref
  • 2 commits
  • 3 files changed
  • 1 contributor

Commits on Mar 4, 2016

  1. Verified

    This commit was signed with the committer’s verified signature.
    makenowjust Hiroya Fujinami
    Copy the full SHA
    efae5ad View commit details
  2. Copy the full SHA
    c6e43c0 View commit details
Showing with 114 additions and 228 deletions.
  1. +114 −106 core/src/main/java/org/jruby/ext/nkf/RubyNKF.java
  2. +0 −118 test/jruby/test_nkf.rb
  3. +0 −4 test/mri/excludes/TestKconv.rb
220 changes: 114 additions & 106 deletions core/src/main/java/org/jruby/ext/nkf/RubyNKF.java
Original file line number Diff line number Diff line change
@@ -40,6 +40,9 @@

import org.jcodings.Encoding;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jcodings.transcode.EConv;
import org.jcodings.transcode.EConvFlags;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyModule;
@@ -53,15 +56,16 @@
import org.jruby.util.ByteList;
import org.jruby.util.KCode;
import org.jruby.util.Pack;

public class RubyNKF {
public static enum NKFCharset {
AUTO(0, "x-JISAutoDetect"),
// no ISO-2022-JP in jcodings
JIS(1, "iso-2022-jp"),
JIS(1, "ISO-2022-JP"),
EUC(2, "EUC-JP"),
SJIS(3, "Windows-31J"),
SJIS(3, "Shift_JIS"),
BINARY(4, null),
NOCONV(4, null),
UNKNOWN(0, null),
@@ -119,6 +123,10 @@ public static void createNKF(Ruby runtime) {

@JRubyMethod(name = "guess", required = 1, module = true)
public static IRubyObject guess(ThreadContext context, IRubyObject recv, IRubyObject s) {
return charsetMappedValue(context.runtime, guess(context, s));

public static NKFCharset guess(ThreadContext context, IRubyObject s) {
// TODO: Fix charset usage for JRUBY-4553
Ruby runtime = context.runtime;
if (!s.respondsTo("to_str")) {
@@ -134,28 +142,39 @@ public static IRubyObject guess(ThreadContext context, IRubyObject recv, IRubyOb
try {

if ( ! decoder.isCharsetDetected() ) {
return NKFCharset.UNKNOWN;
Charset charset = decoder.detectedCharset();
String name =;
if ("Shift_JIS".equals(name)) {
return NKFCharset.SJIS;
if ("Windows-31j".equalsIgnoreCase(name)) {
return NKFCharset.JIS;
if ("EUC-JP".equals(name)) {
return NKFCharset.EUC;
if ("ISO-2022-JP".equals(name)) {
return NKFCharset.JIS;
catch (CharacterCodingException e) {
return charsetMappedValue(runtime, UNKNOWN);
if ( ! decoder.isCharsetDetected() ) {
return charsetMappedValue(runtime, UNKNOWN);
Charset charset = decoder.detectedCharset();
String name =;
if ("Shift_JIS".equals(name)) {
return charsetMappedValue(runtime, SJIS);
// fall through and try direct encoding
if ("windows-31j".equals(name)) {
return charsetMappedValue(runtime, SJIS);

if (bytes.getEncoding() == UTF8Encoding.INSTANCE) {
return NKFCharset.UTF8;
if ("EUC-JP".equals(name)) {
return charsetMappedValue(runtime, EUC);
if (bytes.getEncoding().toString().startsWith("UTF-16")) {
return NKFCharset.UTF16;
if ("ISO-2022-JP".equals(name)) {
return charsetMappedValue(runtime, JIS);
if (bytes.getEncoding().toString().startsWith("UTF-32")) {
return NKFCharset.UTF32;
return charsetMappedValue(runtime, UNKNOWN);
return NKFCharset.UNKNOWN;

private static IRubyObject charsetMappedValue(final Ruby runtime, final NKFCharset charset) {
@@ -195,15 +214,8 @@ public static IRubyObject nkf(ThreadContext context, IRubyObject recv, IRubyObje

Map<String, NKFCharset> options = parseOpt(opt.convertToString().toString());

if (options.get("input").getValue() == AUTO.getValue()) {
KCode kcode = runtime.getKCode();
if (kcode == KCode.SJIS) {
options.put("input", SJIS);
} else if (kcode == KCode.EUC) {
options.put("input", EUC);
} else if (kcode == KCode.UTF8) {
options.put("input", UTF8);
if (options.get("input").getValue() == NKFCharset.AUTO.getValue()) {
options.put("input", guess(context, str));

ByteList bstr = str.convertToString().getByteList();
@@ -216,9 +228,9 @@ public static IRubyObject nkf(ThreadContext context, IRubyObject recv, IRubyObje

RubyString result = converter.convert(bstr);

if (options.get("mime-encode") == BASE64) {
if (options.get("mime-encode") == NKFCharset.BASE64) {
result = Converter.encodeMimeString(runtime, result, PACK_BASE64);
} else if (options.get("mime-encode") == QENCODE) {
} else if (options.get("mime-encode") == NKFCharset.QENCODE) {
result = Converter.encodeMimeString(runtime, result, PACK_QENCODE);

@@ -290,123 +302,127 @@ private static Map<String, NKFCharset> parseOpt(String s) {
Map<String, NKFCharset> options = new HashMap<String, NKFCharset>();

// default options
options.put("input", AUTO);
options.put("output", JIS);
options.put("mime-decode", MIME_DETECT);
options.put("mime-encode", NOCONV);
options.put("input", NKFCharset.AUTO);
options.put("output", NKFCharset.JIS);
options.put("mime-decode", NKFCharset.MIME_DETECT);
options.put("mime-encode", NKFCharset.NOCONV);

Command cmd = parseOption(s);
if (cmd.hasOption("j")) {
options.put("output", JIS);
options.put("output", NKFCharset.JIS);
if (cmd.hasOption("s")) {
options.put("output", SJIS);
options.put("output", NKFCharset.SJIS);
if (cmd.hasOption("e")) {
options.put("output", EUC);
options.put("output", NKFCharset.EUC);
if (cmd.hasOption("w")) {
Option opt = cmd.getOption("w");
if ("32".equals(opt.getValue())) {
options.put("output", UTF32);
options.put("output", NKFCharset.UTF32);
} else if("16".equals(opt.getValue())) {
options.put("output", UTF16);
options.put("output", NKFCharset.UTF16);
} else {
options.put("output", UTF8);
options.put("output", NKFCharset.UTF8);
if (cmd.hasOption("J")) {
options.put("input", JIS);
options.put("input", NKFCharset.JIS);
if (cmd.hasOption("S")) {
options.put("input", SJIS);
options.put("input", NKFCharset.SJIS);
if (cmd.hasOption("E")) {
options.put("input", EUC);
options.put("input", NKFCharset.EUC);
if (cmd.hasOption("W")) {
Option opt = cmd.getOption("W");
if ("32".equals(opt.getValue())) {
options.put("input", UTF32);
options.put("input", NKFCharset.UTF32);
} else if("16".equals(opt.getValue())) {
options.put("input", UTF16);
options.put("input", NKFCharset.UTF16);
} else {
options.put("input", UTF8);
options.put("input", NKFCharset.UTF8);
if (cmd.hasOption("m")) {
Option opt = cmd.getOption("m");
if (opt.getValue() == null) {
options.put("mime-decode", MIME_DETECT);
options.put("mime-decode", NKFCharset.MIME_DETECT);
} else if ("B".equals(opt.getValue())) {
options.put("mime-decode", BASE64);
options.put("mime-decode", NKFCharset.BASE64);
} else if ("Q".equals(opt.getValue())) {
options.put("mime-decode", QENCODE);
options.put("mime-decode", NKFCharset.QENCODE);
} else if ("N".equals(opt.getValue())) {
// TODO: non-strict option
} else if ("0".equals(opt.getValue())) {
options.put("mime-decode", NOCONV);
options.put("mime-decode", NKFCharset.NOCONV);
if (cmd.hasOption("M")) {
Option opt = cmd.getOption("M");
if (opt.getValue() == null) {
options.put("mime-encode", NOCONV);
options.put("mime-encode", NKFCharset.NOCONV);
} else if ("B".equals(opt.getValue())) {
options.put("mime-encode", BASE64);
options.put("mime-encode", NKFCharset.BASE64);
} else if ("Q".equals(opt.getValue())) {
options.put("mime-encode", QENCODE);
options.put("mime-encode", NKFCharset.QENCODE);
if (cmd.hasOption("base64")) {
options.put("mime-encode", BASE64);
options.put("mime-encode", NKFCharset.BASE64);
if (cmd.hasOption("oc")) {
Option opt = cmd.getOption("oc");
if ("ISO-2022-JP".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", JIS);
options.put("output", NKFCharset.JIS);
} else if ("EUC-JP".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", EUC);
options.put("output", NKFCharset.EUC);
} else if ("CP932".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", SJIS);
options.put("output", NKFCharset.SJIS);
} else if ("Shift_JIS".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", SJIS);
options.put("output", NKFCharset.SJIS);
} else if ("Windows-31J".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", NKFCharset.JIS);
} else if ("UTF-8".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", UTF8);
options.put("output", NKFCharset.UTF8);
} else if ("UTF-8N".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", UTF8);
options.put("output", NKFCharset.UTF8);
} else if ("UTF-16".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", UTF16);
options.put("output", NKFCharset.UTF16);
} else if ("UTF-16BE-BOM".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", UTF16);
options.put("output", NKFCharset.UTF16);
} else if ("UTF-32".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", UTF32);
options.put("output", NKFCharset.UTF32);
} else if ("UTF-32BE-BOM".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("output", UTF32);
options.put("output", NKFCharset.UTF32);
if (cmd.hasOption("ic")) {
Option opt = cmd.getOption("ic");
if ("ISO-2022-JP".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", JIS);
options.put("input", NKFCharset.JIS);
} else if ("EUC-JP".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", EUC);
options.put("input", NKFCharset.EUC);
} else if ("CP932".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", SJIS);
options.put("input", NKFCharset.SJIS);
} else if ("Shift_JIS".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", SJIS);
options.put("input", NKFCharset.SJIS);
} else if ("Windows-31J".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", NKFCharset.SJIS);
} else if ("UTF-8".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", UTF8);
options.put("input", NKFCharset.UTF8);
} else if ("UTF-8N".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", UTF8);
options.put("input", NKFCharset.UTF8);
} else if ("UTF-16".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", UTF16);
options.put("input", NKFCharset.UTF16);
} else if ("UTF-16BE-BOM".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", UTF16);
options.put("input", NKFCharset.UTF16);
} else if ("UTF-32".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", UTF32);
options.put("input", NKFCharset.UTF32);
} else if ("UTF-32BE-BOM".compareToIgnoreCase(opt.getValue()) == 0) {
options.put("input", UTF32);
options.put("input", NKFCharset.UTF32);

@@ -427,7 +443,7 @@ static boolean isMimeText(ByteList str, Map<String, NKFCharset> options) {
if (str.length() <= 6) {
return false;
if (options.get("mime-decode") == NOCONV) {
if (options.get("mime-decode") == NKFCharset.NOCONV) {
return false;
if (str.indexOf(BEGIN_MIME_STRING) < 0) {
@@ -448,36 +464,28 @@ private static RubyString encodeMimeString(Ruby runtime, RubyString str, ByteLis

ByteList convert_byte(ByteList str, String inputCharset, NKFCharset output) {
String outputCharset = output.getCharset();
CharsetDecoder decoder;
CharsetEncoder encoder;

try {
decoder = Charset.forName(inputCharset).newDecoder();
encoder = Charset.forName(outputCharset).newEncoder();
} catch (UnsupportedCharsetException e) {
throw context.runtime.newArgumentError("invalid charset");

ByteBuffer buf = ByteBuffer.wrap(str.getUnsafeBytes(), str.begin(), str.length());
if (inputCharset == null) {
inputCharset = str.getEncoding().toString();

try {
CharBuffer cbuf = decoder.decode(buf);
buf = encoder.encode(cbuf);
} catch (CharacterCodingException e) {
throw context.runtime.newArgumentError("invalid encoding");
if (outputCharset.equals(inputCharset)) {
return str.dup();
byte[] arr = buf.array();
ByteList r = new ByteList(arr, 0, buf.limit());
if (outputCharset.equalsIgnoreCase("Windows-31J")) outputCharset = "Shift_JIS";
if (outputCharset.equalsIgnoreCase("UTF-16")) outputCharset = "UTF-16BE";
Ruby ruby = context.runtime;
Encoding enc = ruby.getEncodingService().findEncoding(ruby.newString(outputCharset));
if (enc != null) {

byte[] outCharsetBytes = outputCharset.getBytes();

EConv ec = EncodingUtils.econvOpenOpts(context, inputCharset.getBytes(), outCharsetBytes, 0, context.nil);

if (ec == null) {
throw context.runtime.newArgumentError("invalid encoding pair: " + inputCharset + " to " + outputCharset);

return r;
ByteList converted = EncodingUtils.econvStrConvert(context, ec, str, EConvFlags.INVALID_REPLACE);


return converted;

@@ -504,14 +512,14 @@ public MimeConverter(ThreadContext ctx, Map<String, NKFCharset> opt) {

private String detectCharset(String charset) {
if (charset.compareToIgnoreCase(UTF8.getCharset()) == 0) {
return UTF8.getCharset();
} else if (charset.compareToIgnoreCase(JIS.getCharset()) == 0) {
return JIS.getCharset();
} else if (charset.compareToIgnoreCase(EUC.getCharset()) == 0) {
return EUC.getCharset();
if (charset.compareToIgnoreCase(NKFCharset.UTF8.getCharset()) == 0) {
return NKFCharset.UTF8.getCharset();
} else if (charset.compareToIgnoreCase(NKFCharset.JIS.getCharset()) == 0) {
return NKFCharset.JIS.getCharset();
} else if (charset.compareToIgnoreCase(NKFCharset.EUC.getCharset()) == 0) {
return NKFCharset.EUC.getCharset();
} else {
return ASCII.getCharset();
return NKFCharset.ASCII.getCharset();

118 changes: 0 additions & 118 deletions test/jruby/test_nkf.rb

This file was deleted.

4 changes: 0 additions & 4 deletions test/mri/excludes/TestKconv.rb

This file was deleted.