Sun, 26 Jan 2025 10:23:32 +0100
implement decoder for utf16 surrogate pairs in unescape_string
src/json.c | file | annotate | diff | comparison | revisions | |
tests/test_json.c | file | annotate | diff | comparison | revisions |
--- a/src/json.c Sat Jan 25 16:27:48 2025 +0100 +++ b/src/json.c Sun Jan 26 10:23:32 2025 +0100 @@ -353,6 +353,7 @@ return CX_JSON_INCOMPLETE_DATA; } +// converts a unicode codepoint (up to U+10FFFF) to utf8 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { if (codepoint <= 0x7F) { *output_buf = (char)codepoint; @@ -366,11 +367,22 @@ output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); return 3; + } else if (codepoint <= 0x10FFFF) { + output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07)); + output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F)); + output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + output_buf[3] = (char)(0x80 | (codepoint & 0x3F)); + return 4; } return 0; } +// converts a utf16 surrogate pair to a unicode codepoint +static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) { + return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000; +} + static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { // note: this function expects that str contains the enclosing quotes! 
@@ -402,16 +414,36 @@ if (i+4 < str.length - 1) { cxstring codepoint_str = { str.ptr + i + 1, 4}; uint32_t codepoint; - if(!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { + if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { char utf8buf[4]; - int utf8len = codepoint_to_utf8(codepoint, utf8buf); + int utf8len = 0; + if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { + // character is encoded as a surrogate pair + // get next 6 bytes + if (i + 10 < str.length - 1) { + char *surrogate2 = str.ptr+i+5; + if (surrogate2[0] == '\\' && surrogate2[1] == 'u') { + cxstring c2_str = { surrogate2 + 2, 4 }; + uint32_t c2; + if (!cx_strtou32_lc(c2_str, &c2, 16, "")) { + codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2); + utf8len = codepoint_to_utf8(codepoint, utf8buf); + i += 6; + } + } + } + } else { + // character is in the Basic Multilingual Plane + // and encoded as a single utf16 char + utf8len = codepoint_to_utf8(codepoint, utf8buf); + } if(utf8len > 0) { // add all bytes from utf8buf expect the last char // to the result utf8len--; c = utf8buf[utf8len]; - for(int i=0;i<utf8len;i++) { - result.ptr[result.length++] = utf8buf[i]; + for(int x=0;x<utf8len;x++) { + result.ptr[result.length++] = utf8buf[x]; } } i += 4;
--- a/tests/test_json.c Sat Jan 25 16:27:48 2025 +0100 +++ b/tests/test_json.c Sun Jan 26 10:23:32 2025 +0100 @@ -149,7 +149,10 @@ "\"ascii\":\"\\u0041\\u0053\\u0043\\u0049\\u0049\",\n" "\"unicode\":\"\\u00df\\u00DF\",\n" "\"mixed\":\"mixed ä ö \\u00e4 \\u00f6\",\n" - "\"wide\":\"\\u03a3\\u29b0\"" + "\"wide\":\"\\u03a3\\u29b0\",\n" + "\"surrogatepair1\":\"\\ud83e\\udff5\",\n" + "\"surrogatepair2\":\"test\\ud83e\\udff1AA\"\n," + "\"mixed2\":\"123\\u03a3\\ud83e\\udfc5\\u00df\"" "}" ); @@ -190,6 +193,27 @@ CX_STR("\u03a3\u29b0")) ); + CxJsonValue *surrogatepair1 = cxJsonObjGet(obj, "surrogatepair1"); + CX_TEST_ASSERT(cxJsonIsString(surrogatepair1)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(surrogatepair1), + CX_STR("\xf0\x9f\xaf\xb5")) + ); + + CxJsonValue *surrogatepair2 = cxJsonObjGet(obj, "surrogatepair2"); + CX_TEST_ASSERT(cxJsonIsString(surrogatepair2)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(surrogatepair2), + CX_STR("test\xf0\x9f\xaf\xb1" "AA")) + ); + + CxJsonValue *mixed2 = cxJsonObjGet(obj, "mixed2"); + CX_TEST_ASSERT(cxJsonIsString(mixed2)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(mixed2), + CX_STR("123\u03a3\xf0\x9f\xaf\x85ß")) + ); + cxJsonValueFree(obj); } cxJsonDestroy(&json);