@@ -4,6 +4,7 @@ const io = std.io;
4
4
const mem = std .mem ;
5
5
const Allocator = mem .Allocator ;
6
6
const Buffer = std .Buffer ;
7
+ const unicode = std .unicode ;
7
8
const llvm = @import ("llvm.zig" );
8
9
const c = @import ("c.zig" );
9
10
const builtin = @import ("builtin" );
@@ -998,14 +999,90 @@ test "getAppDataDir" {
998
999
std .debug .warn ("{}..." , result );
999
1000
}
1000
1001
1001
// TODO: put general purpose stuff in std.unicode
/// Converts a UTF-16LE encoded slice to a newly allocated UTF-8 byte slice.
/// Caller owns the returned slice (allocated with `allocator`).
/// Errors:
///   - DanglingSurrogateHalf: input ends after a leading surrogate (0xd800-0xdbff).
///   - ExpectedSecondSurrogateHalf: a leading surrogate is not followed by a
///     trailing surrogate (0xdc00-0xdfff).
///   - UnexpectedSecondSurrogateHalf: a trailing surrogate appears with no
///     preceding leading surrogate.
///   - plus any allocation failure from growing the result.
fn utf16leToUtf8(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 {
    var result = ArrayList(u8).init(allocator);
    // optimistically guess that it will all be ascii.
    try result.ensureCapacity(utf16le.len);

    // Reinterpret the u16 slice as bytes so each code unit can be read
    // explicitly little-endian, independent of host endianness.
    const utf16le_as_bytes = @sliceToBytes(utf16le);
    var i: usize = 0; // byte index into utf16le_as_bytes (always even)
    var out_index: usize = 0; // byte index of the next write position in result
    while (i < utf16le_as_bytes.len) : (i += 2) {
        // decode
        const c0: u32 = mem.readIntLE(u16, utf16le_as_bytes[i .. i + 2]);
        var codepoint: u32 = undefined;
        // Masking off the low 10 bits isolates the surrogate range marker:
        // 0xd800 => leading (high) surrogate, 0xdc00 => trailing (low) surrogate.
        if (c0 & ~u32(0x03ff) == 0xd800) {
            // surrogate pair
            i += 2;
            // Note: i is always even here, so i >= len is the precise
            // "no second code unit available" condition.
            if (i >= utf16le_as_bytes.len) return error.DanglingSurrogateHalf;
            const c1: u32 = mem.readIntLE(u16, utf16le_as_bytes[i .. i + 2]);
            if (c1 & ~u32(0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
            // Combine the two 10-bit halves; pairs encode U+10000..U+10FFFF.
            codepoint = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff));
        } else if (c0 & ~u32(0x03ff) == 0xdc00) {
            return error.UnexpectedSecondSurrogateHalf;
        } else {
            // BMP code point, stored directly as one code unit.
            codepoint = c0;
        }

        // encode
        // Both unicode calls cannot fail here: the decode above only produces
        // valid scalar values (surrogates are rejected), hence `catch unreachable`.
        const utf8_len = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
        try result.resize(result.len + utf8_len);
        _ = unicode.utf8Encode(codepoint, result.items[out_index..]) catch unreachable;
        out_index += utf8_len;
    }

    return result.toOwnedSlice();
}
1037
+
1038
+ test "utf16leToUtf8" {
1039
+ var utf16le : [2 ]u16 = undefined ;
1040
+ const utf16le_as_bytes = @sliceToBytes (utf16le [0.. ]);
1041
+
1042
+ {
1043
+ mem .writeInt (utf16le_as_bytes [0.. ], u16 ('A' ), builtin .Endian .Little );
1044
+ mem .writeInt (utf16le_as_bytes [2.. ], u16 ('a' ), builtin .Endian .Little );
1045
+ const utf8 = try utf16leToUtf8 (std .debug .global_allocator , utf16le );
1046
+ assert (mem .eql (u8 , utf8 , "Aa" ));
1047
+ }
1048
+
1049
+ {
1050
+ mem .writeInt (utf16le_as_bytes [0.. ], u16 (0x80 ), builtin .Endian .Little );
1051
+ mem .writeInt (utf16le_as_bytes [2.. ], u16 (0xffff ), builtin .Endian .Little );
1052
+ const utf8 = try utf16leToUtf8 (std .debug .global_allocator , utf16le );
1053
+ assert (mem .eql (u8 , utf8 , "\xc2\x80 " ++ "\xef\xbf\xbf " ));
1054
+ }
1055
+
1056
+ {
1057
+ // the values just outside the surrogate half range
1058
+ mem .writeInt (utf16le_as_bytes [0.. ], u16 (0xd7ff ), builtin .Endian .Little );
1059
+ mem .writeInt (utf16le_as_bytes [2.. ], u16 (0xe000 ), builtin .Endian .Little );
1060
+ const utf8 = try utf16leToUtf8 (std .debug .global_allocator , utf16le );
1061
+ assert (mem .eql (u8 , utf8 , "\xed\x9f\xbf " ++ "\xee\x80\x80 " ));
1062
+ }
1063
+
1064
+ {
1065
+ // smallest surrogate pair
1066
+ mem .writeInt (utf16le_as_bytes [0.. ], u16 (0xd800 ), builtin .Endian .Little );
1067
+ mem .writeInt (utf16le_as_bytes [2.. ], u16 (0xdc00 ), builtin .Endian .Little );
1068
+ const utf8 = try utf16leToUtf8 (std .debug .global_allocator , utf16le );
1069
+ assert (mem .eql (u8 , utf8 , "\xf0\x90\x80\x80 " ));
1070
+ }
1071
+
1072
+ {
1073
+ // largest surrogate pair
1074
+ mem .writeInt (utf16le_as_bytes [0.. ], u16 (0xdbff ), builtin .Endian .Little );
1075
+ mem .writeInt (utf16le_as_bytes [2.. ], u16 (0xdfff ), builtin .Endian .Little );
1076
+ const utf8 = try utf16leToUtf8 (std .debug .global_allocator , utf16le );
1077
+ assert (mem .eql (u8 , utf8 , "\xf4\x8f\xbf\xbf " ));
1078
+ }
1079
+
1080
+ {
1081
+ mem .writeInt (utf16le_as_bytes [0.. ], u16 (0xdbff ), builtin .Endian .Little );
1082
+ mem .writeInt (utf16le_as_bytes [2.. ], u16 (0xdc00 ), builtin .Endian .Little );
1083
+ const utf8 = try utf16leToUtf8 (std .debug .global_allocator , utf16le );
1084
+ assert (mem .eql (u8 , utf8 , "\xf4\x8f\xb0\x80 " ));
1007
1085
}
1008
- return utf8_bytes ;
1009
1086
}
1010
1087
1011
1088
fn utf16lePtrSlice (ptr : [* ]const u16 ) []const u16 {
0 commit comments