Sun, 26 Jan 2025 13:20:05 +0100
add test and implementation for malformed escape sequences
src/json.c | file | annotate | diff | comparison | revisions | |
tests/test_json.c | file | annotate | diff | comparison | revisions |
--- a/src/json.c Sun Jan 26 12:24:49 2025 +0100 +++ b/src/json.c Sun Jan 26 13:20:05 2025 +0100 @@ -353,8 +353,8 @@ return CX_JSON_INCOMPLETE_DATA; } -// converts a unicode (up to U+FFFF) codepoint to utf8 -static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { +// converts a Unicode codepoint to utf8 +static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) { if (codepoint <= 0x7F) { *output_buf = (char)codepoint; return 1; @@ -375,7 +375,7 @@ return 4; } - return 0; + return 0; // LCOV_EXCL_LINE } // converts a utf16 surrogate pair to utf8 @@ -398,6 +398,8 @@ u = false; if (c == 'n') { c = '\n'; + } else if (c == '"') { + c = '"'; } else if (c == 't') { c = '\t'; } else if (c == 'r') { @@ -411,48 +413,54 @@ } else if (c == 'b') { c = '\b'; } else if (c == 'u') { - if (i+4 < str.length - 1) { - cxstring codepoint_str = { str.ptr + i + 1, 4}; - uint32_t codepoint; - if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { - char utf8buf[4]; - int utf8len = 0; - if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { + if (i + 4 < str.length - 1) { + cxstring ustr1 = { str.ptr + i + 1, 4}; + uint16_t utf16a, utf16b; + char utf8buf[4]; + unsigned utf8len = 0; + if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { + uint32_t codepoint; + if (utf16a >= 0xD800 && utf16a <= 0xDFFF) { // character is encoded as a surrogate pair // get next 6 bytes if (i + 10 < str.length - 1) { - char *surrogate2 = str.ptr+i+5; - if (surrogate2[0] == '\\' && surrogate2[1] == 'u') { - cxstring c2_str = { surrogate2 + 2, 4 }; - uint32_t c2; - if (!cx_strtou32_lc(c2_str, &c2, 16, "")) { - codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2); + if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') { + cxstring ustr2 = { str.ptr+i+7, 4 }; + if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") + && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { + codepoint = utf16pair_to_codepoint(utf16a, utf16b); utf8len = codepoint_to_utf8(codepoint, utf8buf); - i += 6; + i += 10; } } } } else { // 
character is in the Basic Multilingual Plane // and encoded as a single utf16 char + codepoint = utf16a; utf8len = codepoint_to_utf8(codepoint, utf8buf); + i += 4; } - if(utf8len > 0) { - // add all bytes from utf8buf expect the last char - // to the result - utf8len--; - c = utf8buf[utf8len]; - for(int x=0;x<utf8len;x++) { - result.ptr[result.length++] = utf8buf[x]; - } + } + if(utf8len > 0) { + // add all bytes from utf8buf except the last char + // to the result (last char will be added below) + utf8len--; + c = utf8buf[utf8len]; + for (unsigned x = 0; x < utf8len; x++) { + result.ptr[result.length++] = utf8buf[x]; } - i += 4; + } else { + // decoding failed, ignore the entire sequence + result.ptr[result.length++] = '\\'; } } + } else { + // TODO: discuss the behavior for unrecognized escape sequences + // most parsers throw an error here - we just ignore it + result.ptr[result.length++] = '\\'; } - - // TODO: discuss the behavior for unrecognized escape sequences - // most parsers throw an error here + result.ptr[result.length++] = c; } else { if (c == '\\') {
--- a/tests/test_json.c Sun Jan 26 12:24:49 2025 +0100 +++ b/tests/test_json.c Sun Jan 26 13:20:05 2025 +0100 @@ -226,6 +226,64 @@ cxJsonDestroy(&json); } +CX_TEST(test_json_escaped_unicode_malformed) { + CxJson json; + cxJsonInit(&json, NULL); + CxJsonValue *obj; + CxJsonStatus result; + CX_TEST_DO { + cxJsonFill(&json, "\"too few \\u123 digits\""); + result = cxJsonNext(&json, &obj); + CX_TEST_ASSERT(result == CX_JSON_NO_ERROR); + CX_TEST_ASSERT(cxJsonIsString(obj)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(obj), + CX_STR("too few \\u123 digits") + )); + cxJsonFill(&json, "\"too many \\u00E456 digits\""); + result = cxJsonNext(&json, &obj); + CX_TEST_ASSERT(result == CX_JSON_NO_ERROR); + CX_TEST_ASSERT(cxJsonIsString(obj)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(obj), + CX_STR("too many ä56 digits") + )); + cxJsonFill(&json, "\"only high \\uD800 surrogate\""); + result = cxJsonNext(&json, &obj); + CX_TEST_ASSERT(result == CX_JSON_NO_ERROR); + CX_TEST_ASSERT(cxJsonIsString(obj)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(obj), + CX_STR("only high \\uD800 surrogate") + )); + cxJsonFill(&json, "\"only low \\uDC00 surrogate\""); + result = cxJsonNext(&json, &obj); + CX_TEST_ASSERT(result == CX_JSON_NO_ERROR); + CX_TEST_ASSERT(cxJsonIsString(obj)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(obj), + CX_STR("only low \\uDC00 surrogate") + )); + cxJsonFill(&json, "\"two high \\uD800\\uD800 surrogates\""); + result = cxJsonNext(&json, &obj); + CX_TEST_ASSERT(result == CX_JSON_NO_ERROR); + CX_TEST_ASSERT(cxJsonIsString(obj)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(obj), + CX_STR("two high \\uD800\\uD800 surrogates") + )); + cxJsonFill(&json, "\"high plus bullshit \\uD800\\u567 foo\""); + result = cxJsonNext(&json, &obj); + CX_TEST_ASSERT(result == CX_JSON_NO_ERROR); + CX_TEST_ASSERT(cxJsonIsString(obj)); + CX_TEST_ASSERT(0 == cx_strcmp( + cxJsonAsCxString(obj), + CX_STR("high plus bullshit \\uD800\\u567 foo") + )); + 
} + cxJsonDestroy(&json); +} + CX_TEST(test_json_escaped_end_of_string) { CxJson json; cxJsonInit(&json, NULL); @@ -1126,6 +1184,7 @@ cx_test_register(suite, test_json_simple_object); cx_test_register(suite, test_json_escaped_strings); cx_test_register(suite, test_json_escaped_unicode_strings); + cx_test_register(suite, test_json_escaped_unicode_malformed); cx_test_register(suite, test_json_escaped_end_of_string); cx_test_register(suite, test_json_object_incomplete_token); cx_test_register(suite, test_json_token_wrongly_completed);