Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: crystal-lang/crystal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: de680fbc168f
Choose a base ref
...
head repository: crystal-lang/crystal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 1def7738aecd
Choose a head ref
  • 2 commits
  • 7 files changed
  • 1 contributor

Commits on Nov 22, 2016

  1. String#reverse: iterate grapheme clusters for correct output when com…

    …bining characters are used
    Ary Borenszweig committed Nov 22, 2016
    Copy the full SHA
    da21f0b View commit details
  2. Unicode: consider special case conversions like the uppercase of "ffl" …

    …and the downcase of "İ"
    Ary Borenszweig committed Nov 22, 2016
    Copy the full SHA
    1def773 View commit details
Showing with 349 additions and 16 deletions.
  1. +34 −0 scripts/generate_unicode_data.cr
  2. +33 −1 scripts/unicode_data.ecr
  3. +11 −0 spec/std/string_spec.cr
  4. +37 −0 src/char.cr
  5. +25 −14 src/string.cr
  6. +132 −1 src/unicode/data.cr
  7. +77 −0 src/unicode/unicode.cr
34 changes: 34 additions & 0 deletions scripts/generate_unicode_data.cr
Original file line number Diff line number Diff line change
@@ -15,6 +15,10 @@ record Entry,
upcase : Int32?,
downcase : Int32?

# A one-to-many case mapping read from SpecialCasing.txt: `value` holds the
# codepoints that `codepoint` maps to.
record SpecialCase,
codepoint : Int32,
value : Array(Int32)

record CaseRange, low : Int32, high : Int32, delta : Int32
record AlternateRange, low : Int32, high : Int32
record Stride, low : Int32, high : Int32, stride : Int32
@@ -123,6 +127,8 @@ def strides(entries, targets)
end

entries = [] of Entry
special_cases_downcase = [] of SpecialCase
special_cases_upcase = [] of SpecialCase

url = "http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt"
body = HTTP::Client.get(url).body
@@ -139,6 +145,34 @@ body.each_line do |line|
entries << Entry.new(codepoint, name, general_category, upcase, downcase)
end

# Read SpecialCasing.txt and collect the one-to-many case mappings, i.e. the
# codepoints whose lower/upper form is not a single codepoint.
url = "http://www.unicode.org/Public/9.0.0/ucd/SpecialCasing.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  # Nothing from the "Conditional Mappings" header onwards is read.
  break if line.starts_with?("# Conditional Mappings")
  next if line.empty? || line.starts_with?('#')

  pieces = line.split(';')
  codepoint = pieces[0].to_i(16)

  # Field 1 feeds the downcase table, field 3 the upcase table.
  lower = pieces[1].split.map(&.to_i(16))
  upper = pieces[3].split.map(&.to_i(16))

  # A mapping to exactly one codepoint is not a special case. Anything else
  # is padded with 0 up to three entries before being recorded.
  unless lower.size == 1
    (3 - lower.size).times { lower << 0 }
    special_cases_downcase << SpecialCase.new(codepoint, lower)
  end

  unless upper.size == 1
    (3 - upper.size).times { upper << 0 }
    special_cases_upcase << SpecialCase.new(codepoint, upper)
  end
end

downcase_ranges = case_ranges entries, &.downcase
downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }

34 changes: 33 additions & 1 deletion scripts/unicode_data.ecr
Original file line number Diff line number Diff line change
@@ -68,10 +68,42 @@ module Unicode
end
<%- end %>

# Special downcase transformations that involve mapping a codepoint
# to multiple codepoints. The maximum transformation is always 3
# codepoints, so we store them all as 3 codepoints and 0 means end.
@@special_cases_downcase : Hash(Int32, {Int32, Int32, Int32})?
private def self.special_cases_downcase
# Built on first use and memoized in the class variable.
@@special_cases_downcase ||= begin
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_downcase.size %>)
<%- special_cases_downcase.each do |a_case| -%>
put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
<%- end %>
data
end
end

# Special upcase transformations that involve mapping a codepoint
# to multiple codepoints. The maximum transformation is always 3
# codepoints, so we store them all as 3 codepoints and 0 means end.
@@special_cases_upcase : Hash(Int32, {Int32, Int32, Int32})?
private def self.special_cases_upcase
# Built on first use and memoized in the class variable.
@@special_cases_upcase ||= begin
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_upcase.size %>)
<%- special_cases_upcase.each do |a_case| -%>
put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
<%- end %>
data
end
end

# TODO: this is needed to avoid generating lots of allocas
# in LLVM, which makes LLVM really slow. The compiler should
# try to avoid/reuse temporary allocas.
private def self.put(array, *values) : Nil
private def self.put(array : Array, *values) : Nil
array << values
end

# Hash overload of `put`: stores the splatted `values` under `key`.
private def self.put(hash : Hash, key, *values) : Nil
hash[key] = values
end
end
11 changes: 11 additions & 0 deletions spec/std/string_spec.cr
Original file line number Diff line number Diff line change
@@ -461,6 +461,7 @@ describe "String" do
assert { "ÁÉÍÓÚĀ".downcase.should eq("áéíóúā") }
assert { "AEIİOU".downcase(Unicode::CaseOptions::Turkic).should eq("aeıiou") }
assert { "ÁEÍOÚ".downcase(Unicode::CaseOptions::ASCII).should eq("ÁeÍoÚ") }
# U+0130 downcases to two codepoints: 'i' followed by U+0307 (combining dot above).
assert { "İ".downcase.should eq("i̇") }
end

describe "upcase" do
@@ -469,12 +470,15 @@ describe "String" do
assert { "áéíóúā".upcase.should eq("ÁÉÍÓÚĀ") }
assert { "aeıiou".upcase(Unicode::CaseOptions::Turkic).should eq("AEIİOU") }
assert { "áeíoú".upcase(Unicode::CaseOptions::ASCII).should eq("áEíOú") }
assert { "baffle".upcase.should eq("BAFFLE") }
# U+FB00 (the "ff" ligature) upcases to the two codepoints 'F', 'F'.
assert { "ﬀ".upcase.should eq("FF") }
end

describe "capitalize" do
  assert { "HELLO!".capitalize.should eq("Hello!") }
  assert { "HELLO MAN!".capitalize.should eq("Hello man!") }
  assert { "".capitalize.should eq("") }
  # The first char is the single ligature U+FB04, which upcases to "FFL";
  # the trailing U+0130 downcases to 'i' + U+0307.
  assert { "ﬄİ".capitalize.should eq("FFLi̇") }
end

describe "chomp" do
@@ -870,6 +874,13 @@ describe "String" do
reversed.should eq("はちいんこ")
end

it "reverses taking grapheme clusters into account" do
reversed = "noël".reverse
reversed.bytesize.should eq("noël".bytesize)
reversed.size.should eq("noël".size)
reversed.should eq("lëon")
end

describe "sub" do
it "subs char with char" do
replaced = "foobar".sub('o', 'e')
37 changes: 37 additions & 0 deletions src/char.cr
Original file line number Diff line number Diff line change
@@ -366,6 +366,13 @@ struct Char

# Returns the downcase equivalent of this char.
#
# Note that this only works for characters whose downcase
# equivalent yields a single codepoint. There are a few
# characters, like 'İ', that when downcased result in multiple
# characters (in this case: 'i' and the dot mark).
#
# For a more correct method see the method that receives a block.
#
# ```
# 'Z'.downcase # => 'z'
# 'x'.downcase # => 'x'
@@ -375,8 +382,24 @@ struct Char
Unicode.downcase(self, options)
end

# Yields each char for the downcase equivalent of this char.
#
# This method takes into account the possibility that a downcase
# version of a char might result in multiple chars, like for
# 'İ', which results in 'i' and a dot mark.
#
# ```
# 'Z'.downcase { |v| puts v } # prints 'z'
# 'İ'.downcase { |v| puts v } # prints 'i' and the combining dot above
# ```
def downcase(options = Unicode::CaseOptions::None)
Unicode.downcase(self, options) { |char| yield char }
end

# Returns the upcase equivalent of this char.
#
# Note that this only works for characters whose upcase
# equivalent yields a single codepoint. There are a few
# characters, like 'ﬄ', that when upcased result in multiple
# characters (in this case: 'F', 'F', 'L').
#
# For a more correct method see the method that receives a block.
#
# ```
# 'z'.upcase # => 'Z'
# 'X'.upcase # => 'X'
@@ -386,6 +409,20 @@ struct Char
Unicode.upcase(self, options)
end

# Yields each char for the upcase equivalent of this char.
#
# This method takes into account the possibility that an upcase
# version of a char might result in multiple chars, like for
# 'ﬄ', which results in 'F', 'F' and 'L'.
#
# ```
# 'z'.upcase { |v| puts v } # prints 'Z'
# 'ﬄ'.upcase { |v| puts v } # prints 'F', 'F', 'L'
# ```
def upcase(options = Unicode::CaseOptions::None)
Unicode.upcase(self, options) { |char| yield char }
end

# Returns this char's codepoint.
def hash
ord
39 changes: 25 additions & 14 deletions src/string.cr
Original file line number Diff line number Diff line change
@@ -877,7 +877,9 @@ class String
def downcase(options = Unicode::CaseOptions::None)
String.build(bytesize) do |io|
each_char do |char|
io << char.downcase(options)
char.downcase(options) do |res|
io << res
end
end
end
end
@@ -891,7 +893,9 @@ class String
def upcase(options = Unicode::CaseOptions::None)
String.build(bytesize) do |io|
each_char do |char|
io << char.upcase(options)
char.upcase(options) do |res|
io << res
end
end
end
end
@@ -908,9 +912,9 @@ class String
String.build(bytesize) do |io|
each_char_with_index do |char, i|
if i == 0
io << char.upcase
char.upcase { |c| io << c }
else
io << char.downcase
char.downcase { |c| io << c }
end
end
end
@@ -2748,18 +2752,25 @@ class String
# "racecar".reverse # => "racecar"
# ```
def reverse
String.new(bytesize) do |buffer|
buffer += bytesize
reader = Char::Reader.new(self)
reader.each do |char|
buffer -= reader.current_char_width
i = 0
char.each_byte do |byte|
buffer[i] = byte
i += 1
if ascii_only?
String.new(bytesize) do |buffer|
bytesize.times do |i|
buffer[i] = self.to_unsafe[bytesize - i - 1]
end
{@bytesize, @length}
end
else
# Iterate graphemes to reverse the string,
# so combining characters are placed correctly
String.new(bytesize) do |buffer|
buffer += bytesize
scan(/\X/) do |match|
grapheme = match[0]
buffer -= grapheme.bytesize
buffer.copy_from(grapheme.to_unsafe, grapheme.bytesize)
end
{@bytesize, @length}
end
{@bytesize, @length}
end
end

133 changes: 132 additions & 1 deletion src/unicode/data.cr
Original file line number Diff line number Diff line change
@@ -1309,10 +1309,141 @@ module Unicode
end
end

# Special downcase transformations that involve mapping a codepoint
# to multiple codepoints. The maximum transformation is always 3
# codepoints, so we store them all as 3 codepoints and 0 means end.
@@special_cases_downcase : Hash(Int32, {Int32, Int32, Int32})?
private def self.special_cases_downcase
# Built on first use and memoized in the class variable.
@@special_cases_downcase ||= begin
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 1)
# Codepoints are decimal: 304 (U+0130) => 105 ('i'), 775 (U+0307)
put(data, 304, 105, 775, 0)

data
end
end

# Special upcase transformations that involve mapping a codepoint
# to multiple codepoints. The maximum transformation is always 3
# codepoints, so we store them all as 3 codepoints and 0 means end.
@@special_cases_upcase : Hash(Int32, {Int32, Int32, Int32})?
private def self.special_cases_upcase
# Built on first use and memoized in the class variable.
@@special_cases_upcase ||= begin
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 102)
# Keys and values are decimal codepoints,
# e.g. 223 (U+00DF) => 83, 83 ('S', 'S').
put(data, 223, 83, 83, 0)
put(data, 64256, 70, 70, 0)
put(data, 64257, 70, 73, 0)
put(data, 64258, 70, 76, 0)
put(data, 64259, 70, 70, 73)
put(data, 64260, 70, 70, 76)
put(data, 64261, 83, 84, 0)
put(data, 64262, 83, 84, 0)
put(data, 1415, 1333, 1362, 0)
put(data, 64275, 1348, 1350, 0)
put(data, 64276, 1348, 1333, 0)
put(data, 64277, 1348, 1339, 0)
put(data, 64278, 1358, 1350, 0)
put(data, 64279, 1348, 1341, 0)
put(data, 329, 700, 78, 0)
put(data, 912, 921, 776, 769)
put(data, 944, 933, 776, 769)
put(data, 496, 74, 780, 0)
put(data, 7830, 72, 817, 0)
put(data, 7831, 84, 776, 0)
put(data, 7832, 87, 778, 0)
put(data, 7833, 89, 778, 0)
put(data, 7834, 65, 702, 0)
put(data, 8016, 933, 787, 0)
put(data, 8018, 933, 787, 768)
put(data, 8020, 933, 787, 769)
put(data, 8022, 933, 787, 834)
put(data, 8118, 913, 834, 0)
put(data, 8134, 919, 834, 0)
put(data, 8146, 921, 776, 768)
put(data, 8147, 921, 776, 769)
put(data, 8150, 921, 834, 0)
put(data, 8151, 921, 776, 834)
put(data, 8162, 933, 776, 768)
put(data, 8163, 933, 776, 769)
put(data, 8164, 929, 787, 0)
put(data, 8166, 933, 834, 0)
put(data, 8167, 933, 776, 834)
put(data, 8182, 937, 834, 0)
put(data, 8064, 7944, 921, 0)
put(data, 8065, 7945, 921, 0)
put(data, 8066, 7946, 921, 0)
put(data, 8067, 7947, 921, 0)
put(data, 8068, 7948, 921, 0)
put(data, 8069, 7949, 921, 0)
put(data, 8070, 7950, 921, 0)
put(data, 8071, 7951, 921, 0)
put(data, 8072, 7944, 921, 0)
put(data, 8073, 7945, 921, 0)
put(data, 8074, 7946, 921, 0)
put(data, 8075, 7947, 921, 0)
put(data, 8076, 7948, 921, 0)
put(data, 8077, 7949, 921, 0)
put(data, 8078, 7950, 921, 0)
put(data, 8079, 7951, 921, 0)
put(data, 8080, 7976, 921, 0)
put(data, 8081, 7977, 921, 0)
put(data, 8082, 7978, 921, 0)
put(data, 8083, 7979, 921, 0)
put(data, 8084, 7980, 921, 0)
put(data, 8085, 7981, 921, 0)
put(data, 8086, 7982, 921, 0)
put(data, 8087, 7983, 921, 0)
put(data, 8088, 7976, 921, 0)
put(data, 8089, 7977, 921, 0)
put(data, 8090, 7978, 921, 0)
put(data, 8091, 7979, 921, 0)
put(data, 8092, 7980, 921, 0)
put(data, 8093, 7981, 921, 0)
put(data, 8094, 7982, 921, 0)
put(data, 8095, 7983, 921, 0)
put(data, 8096, 8040, 921, 0)
put(data, 8097, 8041, 921, 0)
put(data, 8098, 8042, 921, 0)
put(data, 8099, 8043, 921, 0)
put(data, 8100, 8044, 921, 0)
put(data, 8101, 8045, 921, 0)
put(data, 8102, 8046, 921, 0)
put(data, 8103, 8047, 921, 0)
put(data, 8104, 8040, 921, 0)
put(data, 8105, 8041, 921, 0)
put(data, 8106, 8042, 921, 0)
put(data, 8107, 8043, 921, 0)
put(data, 8108, 8044, 921, 0)
put(data, 8109, 8045, 921, 0)
put(data, 8110, 8046, 921, 0)
put(data, 8111, 8047, 921, 0)
put(data, 8115, 913, 921, 0)
put(data, 8124, 913, 921, 0)
put(data, 8131, 919, 921, 0)
put(data, 8140, 919, 921, 0)
put(data, 8179, 937, 921, 0)
put(data, 8188, 937, 921, 0)
put(data, 8114, 8122, 921, 0)
put(data, 8116, 902, 921, 0)
put(data, 8130, 8138, 921, 0)
put(data, 8132, 905, 921, 0)
put(data, 8178, 8186, 921, 0)
put(data, 8180, 911, 921, 0)
put(data, 8119, 913, 834, 921)
put(data, 8135, 919, 834, 921)
put(data, 8183, 937, 834, 921)

data
end
end

# TODO: this is needed to avoid generating lots of allocas
# in LLVM, which makes LLVM really slow. The compiler should
# try to avoid/reuse temporary allocas.
private def self.put(array, *values) : Nil
private def self.put(array : Array, *values) : Nil
array << values
end

# Hash overload of `put`: stores the splatted `values` under `key`.
private def self.put(hash : Hash, key, *values) : Nil
hash[key] = values
end
end
77 changes: 77 additions & 0 deletions src/unicode/unicode.cr
Original file line number Diff line number Diff line change
@@ -25,21 +25,59 @@ module Unicode
end

# Single-char upcase: the ASCII rule is tried first, then the Turkic rule,
# and the range lookup is the fallback. The multi-codepoint special cases
# are not consulted by this overload.
def self.upcase(char : Char, options : CaseOptions)
  check_upcase_ascii(char, options) ||
    check_upcase_turkic(char, options) ||
    check_upcase_ranges(char)
end

# Block form of upcase: yields every char of the upcased result.
#
# The ASCII and Turkic rules are tried first (in that order) and yield one
# char. Otherwise a special-case entry, if any, yields each of its non-zero
# codepoints, and the range lookup is the final fallback.
def self.upcase(char : Char, options : CaseOptions)
  single = check_upcase_ascii(char, options) || check_upcase_turkic(char, options)
  if single
    yield single
    return
  end

  if codepoints = special_cases_upcase[char.ord]?
    codepoints.each do |codepoint|
      # 0 marks an unused slot in the fixed-size tuple.
      yield codepoint.unsafe_chr unless codepoint == 0
    end
    return
  end

  yield check_upcase_ranges(char)
end

# Returns the ASCII-rule upcase of *char*, or nil when the rule does not apply.
#
# The rule applies when the ASCII option is set, or when *char* is ASCII and
# no options were given. Chars that are not ASCII lowercase come back unchanged.
private def self.check_upcase_ascii(char, options)
  applies = (char.ascii? && options == Unicode::CaseOptions::None) || options.ascii?
  return nil unless applies

  char.ascii_lowercase? ? (char.ord - 32).unsafe_chr : char
end

# Returns the Turkic upcase of *char* when the Turkic option is set and
# *char* is 'ı' or 'i'; returns nil otherwise.
private def self.check_upcase_turkic(char, options)
  return nil unless options.turkic?

  case char
  when 'ı' then 'I'
  when 'i' then 'İ'
  else          nil
  end
end

private def self.check_upcase_ranges(char)
result = search_ranges(upcase_ranges, char.ord)
return char + result if result

@@ -50,6 +88,38 @@ module Unicode
end

# Single-char downcase: the ASCII rule is tried first, then the Turkic rule,
# and the range lookup is the fallback. The multi-codepoint special cases
# are not consulted by this overload.
def self.downcase(char : Char, options : CaseOptions)
  check_downcase_ascii(char, options) ||
    check_downcase_turkic(char, options) ||
    check_downcase_ranges(char)
end

# Block form of downcase: yields every char of the downcased result.
#
# The ASCII and Turkic rules are tried first (in that order) and yield one
# char. Otherwise a special-case entry, if any, yields each of its non-zero
# codepoints, and the range lookup is the final fallback.
def self.downcase(char : Char, options : CaseOptions)
  single = check_downcase_ascii(char, options) || check_downcase_turkic(char, options)
  if single
    yield single
    return
  end

  if codepoints = special_cases_downcase[char.ord]?
    codepoints.each do |codepoint|
      # 0 marks an unused slot in the fixed-size tuple.
      yield codepoint.unsafe_chr unless codepoint == 0
    end
    return
  end

  yield check_downcase_ranges(char)
end

private def self.check_downcase_ascii(char, options)
if (char.ascii? && options == Unicode::CaseOptions::None) || options.ascii?
if char.ascii_uppercase?
return (char.ord + 32).unsafe_chr
@@ -58,13 +128,20 @@ module Unicode
end
end

nil
end

# Returns the Turkic downcase of *char* when the Turkic option is set and
# *char* is 'I' or 'İ'; returns nil otherwise.
private def self.check_downcase_turkic(char, options)
  return nil unless options.turkic?

  case char
  when 'I' then 'ı'
  when 'İ' then 'i'
  else          nil
  end
end

private def self.check_downcase_ranges(char)
result = search_ranges(downcase_ranges, char.ord)
return char + result if result