Skip to content

Commit 5e392cf

Browse files
committedFeb 2, 2021
Refactor utf8_to_wide/wide_to_utf8 functions
1 parent 7ebd5da commit 5e392cf

File tree

3 files changed

+40
-38
lines changed

3 files changed

+40
-38
lines changed
 

‎src/unittest/test_utilities.cpp

+12-3
Original file line numberDiff line numberDiff line change
@@ -302,9 +302,18 @@ void TestUtilities::testAsciiPrintableHelper()
302302

303303
void TestUtilities::testUTF8()
304304
{
305-
UASSERT(wide_to_utf8(utf8_to_wide("")) == "");
306-
UASSERT(wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!"))
307-
== "the shovel dug a crumbly node!");
305+
UASSERT(utf8_to_wide("¤") == L"¤");
306+
307+
UASSERT(wide_to_utf8(L"¤") == "¤");
308+
309+
UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("")), "");
310+
UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!")),
311+
"the shovel dug a crumbly node!");
312+
UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("-ä-")),
313+
"-ä-");
314+
UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("-\xF0\xA0\x80\x8B-")),
315+
"-\xF0\xA0\x80\x8B-");
316+
308317
}
309318

310319
void TestUtilities::testRemoveEscapes()

‎src/util/string.cpp

+24-33
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ static bool parseNamedColorString(const std::string &value, video::SColor &color
5050

5151
#ifndef _WIN32
5252

53-
bool convert(const char *to, const char *from, char *outbuf,
54-
size_t outbuf_size, char *inbuf, size_t inbuf_size)
53+
static bool convert(const char *to, const char *from, char *outbuf,
54+
size_t *outbuf_size, char *inbuf, size_t inbuf_size)
5555
{
5656
iconv_t cd = iconv_open(to, from);
5757

@@ -60,15 +60,14 @@ bool convert(const char *to, const char *from, char *outbuf,
6060
#else
6161
char *inbuf_ptr = inbuf;
6262
#endif
63-
6463
char *outbuf_ptr = outbuf;
6564

6665
size_t *inbuf_left_ptr = &inbuf_size;
67-
size_t *outbuf_left_ptr = &outbuf_size;
6866

67+
const size_t old_outbuf_size = *outbuf_size;
6968
size_t old_size = inbuf_size;
7069
while (inbuf_size > 0) {
71-
iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_left_ptr);
70+
iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_size);
7271
if (inbuf_size == old_size) {
7372
iconv_close(cd);
7473
return false;
@@ -77,70 +76,65 @@ bool convert(const char *to, const char *from, char *outbuf,
7776
}
7877

7978
iconv_close(cd);
79+
*outbuf_size = old_outbuf_size - *outbuf_size;
8080
return true;
8181
}
8282

8383
#ifdef __ANDROID__
84-
// Android need manual caring to support the full character set possible with wchar_t
84+
// On Android, iconv disagrees about how big a wchar_t is (for whatever reason)
8585
const char *DEFAULT_ENCODING = "UTF-32LE";
8686
#else
8787
const char *DEFAULT_ENCODING = "WCHAR_T";
8888
#endif
8989

9090
std::wstring utf8_to_wide(const std::string &input)
9191
{
92-
size_t inbuf_size = input.length() + 1;
92+
const size_t inbuf_size = input.length();
9393
// maximum possible size, every character is sizeof(wchar_t) bytes
94-
size_t outbuf_size = (input.length() + 1) * sizeof(wchar_t);
94+
size_t outbuf_size = input.length() * sizeof(wchar_t);
9595

96-
char *inbuf = new char[inbuf_size];
96+
char *inbuf = new char[inbuf_size]; // intentionally NOT null-terminated
9797
memcpy(inbuf, input.c_str(), inbuf_size);
98-
char *outbuf = new char[outbuf_size];
99-
memset(outbuf, 0, outbuf_size);
98+
std::wstring out;
99+
out.resize(outbuf_size / sizeof(wchar_t));
100100

101101
#ifdef __ANDROID__
102-
// Android need manual caring to support the full character set possible with wchar_t
103102
SANITY_CHECK(sizeof(wchar_t) == 4);
104103
#endif
105104

106-
if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, outbuf_size, inbuf, inbuf_size)) {
105+
char *outbuf = reinterpret_cast<char*>(&out[0]);
106+
if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, &outbuf_size, inbuf, inbuf_size)) {
107107
infostream << "Couldn't convert UTF-8 string 0x" << hex_encode(input)
108108
<< " into wstring" << std::endl;
109109
delete[] inbuf;
110-
delete[] outbuf;
111110
return L"<invalid UTF-8 string>";
112111
}
113-
std::wstring out((wchar_t *)outbuf);
114-
115112
delete[] inbuf;
116-
delete[] outbuf;
117113

114+
out.resize(outbuf_size / sizeof(wchar_t));
118115
return out;
119116
}
120117

121118
std::string wide_to_utf8(const std::wstring &input)
122119
{
123-
size_t inbuf_size = (input.length() + 1) * sizeof(wchar_t);
124-
// maximum possible size: utf-8 encodes codepoints using 1 up to 6 bytes
125-
size_t outbuf_size = (input.length() + 1) * 6;
120+
const size_t inbuf_size = input.length() * sizeof(wchar_t);
121+
// maximum possible size: utf-8 encodes codepoints using 1 up to 4 bytes
122+
size_t outbuf_size = input.length() * 4;
126123

127-
char *inbuf = new char[inbuf_size];
124+
char *inbuf = new char[inbuf_size]; // intentionally NOT null-terminated
128125
memcpy(inbuf, input.c_str(), inbuf_size);
129-
char *outbuf = new char[outbuf_size];
130-
memset(outbuf, 0, outbuf_size);
126+
std::string out;
127+
out.resize(outbuf_size);
131128

132-
if (!convert("UTF-8", DEFAULT_ENCODING, outbuf, outbuf_size, inbuf, inbuf_size)) {
129+
if (!convert("UTF-8", DEFAULT_ENCODING, &out[0], &outbuf_size, inbuf, inbuf_size)) {
133130
infostream << "Couldn't convert wstring 0x" << hex_encode(inbuf, inbuf_size)
134131
<< " into UTF-8 string" << std::endl;
135132
delete[] inbuf;
136-
delete[] outbuf;
137-
return "<invalid wstring>";
133+
return "<invalid wide string>";
138134
}
139-
std::string out(outbuf);
140-
141135
delete[] inbuf;
142-
delete[] outbuf;
143136

137+
out.resize(outbuf_size);
144138
return out;
145139
}
146140

@@ -172,15 +166,12 @@ std::string wide_to_utf8(const std::wstring &input)
172166

173167
#endif // _WIN32
174168

175-
// You must free the returned string!
176-
// The returned string is allocated using new
177169
wchar_t *utf8_to_wide_c(const char *str)
178170
{
179171
std::wstring ret = utf8_to_wide(std::string(str));
180172
size_t len = ret.length();
181173
wchar_t *ret_c = new wchar_t[len + 1];
182-
memset(ret_c, 0, (len + 1) * sizeof(wchar_t));
183-
memcpy(ret_c, ret.c_str(), len * sizeof(wchar_t));
174+
memcpy(ret_c, ret.c_str(), (len + 1) * sizeof(wchar_t));
184175
return ret_c;
185176
}
186177

‎src/util/string.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,13 @@ struct FlagDesc {
6464
u32 flag;
6565
};
6666

67-
// try not to convert between wide/utf8 encodings; this can result in data loss
68-
// try to only convert between them when you need to input/output stuff via Irrlicht
67+
// Try to avoid converting between wide and UTF-8 unless you need to
68+
// input/output stuff via Irrlicht
6969
std::wstring utf8_to_wide(const std::string &input);
7070
std::string wide_to_utf8(const std::wstring &input);
7171

72+
// You must free the returned string!
73+
// The returned string is allocated using new[], so release it with delete[]
7274
wchar_t *utf8_to_wide_c(const char *str);
7375

7476
// NEVER use those two functions unless you have a VERY GOOD reason to

0 commit comments

Comments
 (0)
Please sign in to comment.