Skip to content

Commit 295ddc3

Browse files
straight-shoota authored and asterite committed Jan 16, 2018
Add overload to String.from_utf16 with pointer
1 parent 244da57 commit 295ddc3

File tree

2 files changed

+40
-1
lines changed

2 files changed

+40
-1
lines changed
 

Diff for: ‎spec/std/string/utf16_spec.cr

+6
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,11 @@ describe "String UTF16" do
4343
input = Slice[0xdc00_u16, 0xd800_u16]
4444
String.from_utf16(input).should eq("\u{fffd}\u{fffd}")
4545
end
46+
47+
it "handles null bytes" do
48+
slice = Slice[104_u16, 105_u16, 0_u16, 55296_u16, 56485_u16]
49+
String.from_utf16(slice).should eq("hi\0000𐂥")
50+
String.from_utf16(slice.to_unsafe).should eq("hi")
51+
end
4652
end
4753
end

Diff for: ‎src/string/utf16.cr

+34-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,15 @@ class String
5353
# slice = Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
5454
# String.from_utf16(slice) # => "hi 𐂥"
5555
# ```
56-
def self.from_utf16(slice : Slice(UInt16)) : String
56+
#
57+
# If *slice* is a pointer, the string ends when a zero value is found.
58+
#
59+
# ```
60+
# slice = Slice[104_u16, 105_u16, 0_u16, 55296_u16, 56485_u16]
61+
# String.from_utf16(slice) # => "hi\0000𐂥"
62+
# String.from_utf16(slice.to_unsafe) # => "hi"
63+
# ```
64+
def self.from_utf16(slice : Slice(UInt16) | Pointer(UInt16)) : String
5765
bytesize = 0
5866
size = 0
5967

@@ -97,4 +105,29 @@ class String
97105
i += 1
98106
end
99107
end
108+
109+
# Yields each decoded char read from the given pointer, stopping at the first zero (null) UTF-16 code unit.
110+
private def self.each_utf16_char(pointer : Pointer(UInt16))
111+
loop do
112+
byte = pointer.value.to_i
113+
break if byte == 0
114+
115+
if byte < 0xd800 || byte >= 0xe000
116+
# Single code unit (not a surrogate)
117+
codepoint = byte
118+
elsif 0xd800 <= byte < 0xdc00 &&
119+
0xdc00 <= (pointer + 1).value <= 0xdfff
120+
# Surrogate pair
121+
pointer = pointer + 1
122+
codepoint = ((byte - 0xd800) << 10) + (pointer.value - 0xdc00) + 0x10000
123+
else
124+
# Invalid (unpaired) surrogate
125+
codepoint = 0xfffd
126+
end
127+
128+
yield codepoint.chr
129+
130+
pointer = pointer + 1
131+
end
132+
end
100133
end

0 commit comments

Comments
 (0)
Please sign in to comment.