# HG changeset patch # User Olaf Wintermann # Date 1737883412 -3600 # Node ID e4af44b488bc91511a17b3a85cae44c4fb05985b # Parent 60113356a7de15391acc31d95b203eba01acca7f implement decoder for utf16 surrogate pairs in unescape_string diff -r 60113356a7de -r e4af44b488bc src/json.c --- a/src/json.c Sat Jan 25 16:27:48 2025 +0100 +++ b/src/json.c Sun Jan 26 10:23:32 2025 +0100 @@ -353,6 +353,7 @@ return CX_JSON_INCOMPLETE_DATA; } +// converts a unicode codepoint (up to U+10FFFF) to utf8 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { if (codepoint <= 0x7F) { *output_buf = (char)codepoint; @@ -366,11 +367,22 @@ output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); return 3; + } else if (codepoint <= 0x10FFFF) { + output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07)); + output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F)); + output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + output_buf[3] = (char)(0x80 | (codepoint & 0x3F)); + return 4; } return 0; } +// converts a utf16 surrogate pair to a unicode codepoint +static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) { + return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000; +} + static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { // note: this function expects that str contains the enclosing quotes! 
@@ -402,16 +414,36 @@ if (i+4 < str.length - 1) { cxstring codepoint_str = { str.ptr + i + 1, 4}; uint32_t codepoint; - if(!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { + if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { char utf8buf[4]; - int utf8len = codepoint_to_utf8(codepoint, utf8buf); + int utf8len = 0; + if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { + // character is encoded as a surrogate pair + // get next 6 bytes + if (i + 10 < str.length - 1) { + char *surrogate2 = str.ptr+i+5; + if (surrogate2[0] == '\\' && surrogate2[1] == 'u') { + cxstring c2_str = { surrogate2 + 2, 4 }; + uint32_t c2; + if (!cx_strtou32_lc(c2_str, &c2, 16, "")) { + codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2); + utf8len = codepoint_to_utf8(codepoint, utf8buf); + i += 6; + } + } + } + } else { + // character is in the Basic Multilingual Plane + // and encoded as a single utf16 char + utf8len = codepoint_to_utf8(codepoint, utf8buf); + } if(utf8len > 0) { // add all bytes from utf8buf expect the last char // to the result utf8len--; c = utf8buf[utf8len]; - for(int i=0;i