Sat, 25 Jan 2025 16:13:28 +0100
implement unicode escape sequences in json unescape_string function
src/json.c | file | annotate | diff | comparison | revisions | |
tests/test_json.c | file | annotate | diff | comparison | revisions |
--- a/src/json.c Wed Jan 22 21:02:46 2025 +0100 +++ b/src/json.c Sat Jan 25 16:13:28 2025 +0100 @@ -353,6 +353,24 @@ return CX_JSON_INCOMPLETE_DATA; } +static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { + if (codepoint <= 0x7F) { + *output_buf = (char)codepoint; + return 1; + } else if (codepoint <= 0x7FF) { + output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F)); + output_buf[1] = (char)(0x80 | (codepoint & 0x3F)); + return 2; + } else if (codepoint <= 0xFFFF) { + output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F)); + output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); + return 3; + } + + return 0; +} + static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { // note: this function expects that str contains the enclosing quotes! @@ -380,8 +398,27 @@ c = '\f'; } else if (c == 'b') { c = '\b'; + } else if (c == 'u') { + if (i+4 < str.length) { + cxstring codepoint_str = { str.ptr + i + 1, 4}; + uint32_t codepoint; + if(!cx_strtou32_lc_(codepoint_str, &codepoint, 16, "")) { + char utf8buf[4]; + int utf8len = codepoint_to_utf8(codepoint, utf8buf); + if(utf8len > 0) { + // add all bytes from utf8buf except the last char + // to the result + utf8len--; + c = utf8buf[utf8len]; + for(int i=0;i<utf8len;i++) { + result.ptr[result.length++] = utf8buf[i]; + } + } + i += 4; + } + } } - // TODO: support \uXXXX escape sequences + // TODO: discuss the behavior for unrecognized escape sequences // most parsers throw an error here result.ptr[result.length++] = c;
--- a/tests/test_json.c Wed Jan 22 21:02:46 2025 +0100 +++ b/tests/test_json.c Sat Jan 25 16:13:28 2025 +0100 @@ -143,6 +143,58 @@ cxJsonDestroy(&json); } +CX_TEST(test_json_escaped_unicode_strings) { + cxstring text = cx_str( + "{\n" + "\"ascii\":\"\\u0041\\u0053\\u0043\\u0049\\u0049\",\n" + "\"unicode\":\"\\u00df\\u00DF\",\n" + "\"mixed\":\"mixed ä ö \\u00e4 \\u00f6\",\n" + "\"wide\":\"\\u03a3\\u29b0\"" + "}" + ); + + CxJson json; + cxJsonInit(&json, NULL); + CX_TEST_DO { + cxJsonFill(&json, text); + CxJsonValue *obj; + CxJsonStatus result = cxJsonNext(&json, &obj); + CX_TEST_ASSERT(result == CX_JSON_NO_ERROR); + CX_TEST_ASSERT(cxJsonIsObject(obj)); + + CxJsonValue *ascii = cxJsonObjGet(obj, "ascii"); + CX_TEST_ASSERT(cxJsonIsString(ascii)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(ascii), + CX_STR("ASCII")) + ); + + CxJsonValue *unicode = cxJsonObjGet(obj, "unicode"); + CX_TEST_ASSERT(cxJsonIsString(unicode)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(unicode), + CX_STR("ßß")) + ); + + CxJsonValue *mixed = cxJsonObjGet(obj, "mixed"); + CX_TEST_ASSERT(cxJsonIsString(mixed)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(mixed), + CX_STR("mixed ä ö ä ö")) + ); + + CxJsonValue *wide = cxJsonObjGet(obj, "wide"); + CX_TEST_ASSERT(cxJsonIsString(wide)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(wide), + CX_STR("\u03a3\u29b0")) + ); + + cxJsonValueFree(obj); + } + cxJsonDestroy(&json); +} + CX_TEST(test_json_escaped_end_of_string) { CxJson json; cxJsonInit(&json, NULL); @@ -1042,6 +1094,7 @@ cx_test_register(suite, test_json_init_default); cx_test_register(suite, test_json_simple_object); cx_test_register(suite, test_json_escaped_strings); + cx_test_register(suite, test_json_escaped_unicode_strings); cx_test_register(suite, test_json_escaped_end_of_string); cx_test_register(suite, test_json_object_incomplete_token); cx_test_register(suite, test_json_token_wrongly_completed);