Skip to content

Commit 843529d

Browse files
committed Jul 18, 2018
implement proper utf16leToUtf8
·
0.15.10.3.0
1 parent cbfe9a4 commit 843529d

File tree

1 file changed

+83
-6
lines changed

1 file changed

+83
-6
lines changed
 

‎src-self-hosted/compilation.zig‎

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ const io = std.io;
44
const mem = std.mem;
55
const Allocator = mem.Allocator;
66
const Buffer = std.Buffer;
7+
const unicode = std.unicode;
78
const llvm = @import("llvm.zig");
89
const c = @import("c.zig");
910
const builtin = @import("builtin");
@@ -998,14 +999,90 @@ test "getAppDataDir" {
998999
std.debug.warn("{}...", result);
9991000
}
10001001

1001-
// TODO full utf-16 LE support
1002+
// TODO: put general purpose stuff in std.unicode
10021003
// Converts a sequence of UTF-16LE code units into UTF-8.
//
// The returned slice is allocated with `allocator` and handed to the caller.
//
// Errors:
//   error.DanglingSurrogateHalf         - input ends right after a first (high) surrogate half
//   error.ExpectedSecondSurrogateHalf   - a first surrogate half is not followed by a second half
//   error.UnexpectedSecondSurrogateHalf - a second (low) surrogate half appears on its own
//   plus any error produced while growing the output buffer.
fn utf16leToUtf8(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 {
    var result = ArrayList(u8).init(allocator);
    // Every error return below would otherwise leak the partially built buffer.
    errdefer result.deinit();
    // optimistically guess that it will all be ascii.
    try result.ensureCapacity(utf16le.len);

    // The code units are read byte-wise as little-endian rather than used as
    // native u16 values.
    const utf16le_as_bytes = @sliceToBytes(utf16le);
    var i: usize = 0;
    while (i < utf16le_as_bytes.len) : (i += 2) {
        // decode
        const c0: u32 = mem.readIntLE(u16, utf16le_as_bytes[i..i + 2]);
        var codepoint: u32 = undefined;
        if ((c0 & ~u32(0x03ff)) == 0xd800) {
            // surrogate pair: consume the following code unit as well
            i += 2;
            if (i >= utf16le_as_bytes.len) return error.DanglingSurrogateHalf;
            const c1: u32 = mem.readIntLE(u16, utf16le_as_bytes[i..i + 2]);
            if ((c1 & ~u32(0x03ff)) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
            // each half contributes 10 payload bits; pairs start at U+10000
            codepoint = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff));
        } else if ((c0 & ~u32(0x03ff)) == 0xdc00) {
            return error.UnexpectedSecondSurrogateHalf;
        } else {
            codepoint = c0;
        }

        // encode
        // The decoder above never yields a surrogate value or anything above
        // 0x10ffff, which is why the `catch unreachable`s are used here.
        const utf8_len = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
        const start = result.len;
        try result.resize(start + utf8_len);
        _ = unicode.utf8Encode(codepoint, result.items[start..]) catch unreachable;
    }

    return result.toOwnedSlice();
}
1037+
1038+
// Exercises utf16leToUtf8 on pairs of code units written explicitly as
// little-endian bytes, covering both the successful conversions and each of
// the surrogate-related error returns.
test "utf16leToUtf8" {
    var utf16le: [2]u16 = undefined;
    const utf16le_as_bytes = @sliceToBytes(utf16le[0..]);

    {
        // plain ascii
        mem.writeInt(utf16le_as_bytes[0..], u16('A'), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16('a'), builtin.Endian.Little);
        const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, "Aa"));
    }

    {
        // a two-byte and a three-byte utf-8 sequence
        mem.writeInt(utf16le_as_bytes[0..], u16(0x80), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xffff), builtin.Endian.Little);
        const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, "\xc2\x80" ++ "\xef\xbf\xbf"));
    }

    {
        // the values just outside the surrogate half range
        mem.writeInt(utf16le_as_bytes[0..], u16(0xd7ff), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xe000), builtin.Endian.Little);
        const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, "\xed\x9f\xbf" ++ "\xee\x80\x80"));
    }

    {
        // smallest surrogate pair
        mem.writeInt(utf16le_as_bytes[0..], u16(0xd800), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little);
        const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, "\xf0\x90\x80\x80"));
    }

    {
        // largest surrogate pair
        mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xdfff), builtin.Endian.Little);
        const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, "\xf4\x8f\xbf\xbf"));
    }

    {
        // largest first half combined with the smallest second half
        mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little);
        const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, "\xf4\x8f\xb0\x80"));
    }

    {
        // input that ends right after a first surrogate half
        mem.writeInt(utf16le_as_bytes[0..], u16(0xd800), builtin.Endian.Little);
        if (utf16leToUtf8(std.debug.global_allocator, utf16le[0..1])) |_| {
            unreachable;
        } else |err| {
            assert(err == error.DanglingSurrogateHalf);
        }
    }

    {
        // a first surrogate half followed by a non-surrogate code unit
        mem.writeInt(utf16le_as_bytes[0..], u16(0xd800), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16('a'), builtin.Endian.Little);
        if (utf16leToUtf8(std.debug.global_allocator, utf16le)) |_| {
            unreachable;
        } else |err| {
            assert(err == error.ExpectedSecondSurrogateHalf);
        }
    }

    {
        // a second surrogate half with no first half before it
        mem.writeInt(utf16le_as_bytes[0..], u16(0xdc00), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16('a'), builtin.Endian.Little);
        if (utf16leToUtf8(std.debug.global_allocator, utf16le)) |_| {
            unreachable;
        } else |err| {
            assert(err == error.UnexpectedSecondSurrogateHalf);
        }
    }
}
10101087

10111088
fn utf16lePtrSlice(ptr: [*]const u16) []const u16 {

0 commit comments

Comments
 (0)
Please sign in to comment.