Mercurial > hg > ucx / changeset

--- a/src/json.c	Sun Jan 26 12:24:49 2025 +0100
+++ b/src/json.c	Sun Jan 26 13:20:05 2025 +0100
@@ -353,8 +353,8 @@
     return CX_JSON_INCOMPLETE_DATA;
 }

-// converts a unicode (up to U+FFFF) codepoint to utf8
-static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
+// converts a Unicode codepoint to utf8
+static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
     if (codepoint <= 0x7F) {
         *output_buf = (char)codepoint;
         return 1;
@@ -375,7 +375,7 @@
         return 4;
     }

-    return 0;
+    return 0; // LCOV_EXCL_LINE
 }

 // converts a utf16 surrogate pair to utf8
@@ -398,6 +398,8 @@
             u = false;
             if (c == 'n') {
                 c = '\n';
+            } else if (c == '"') {
+                c = '"';
             } else if (c == 't') {
                 c = '\t';
             } else if (c == 'r') {
@@ -411,48 +413,54 @@
             } else if (c == 'b') {
                 c = '\b';
             } else if (c == 'u') {
-                if (i+4 < str.length - 1) {
-                    cxstring codepoint_str = { str.ptr + i + 1, 4};
-                    uint32_t codepoint;
-                    if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
-                        char utf8buf[4];
-                        int utf8len = 0;
-                        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+                if (i + 4 < str.length - 1) {
+                    cxstring ustr1 = { str.ptr + i + 1, 4};
+                    uint16_t utf16a, utf16b;
+                    char utf8buf[4];
+                    unsigned utf8len = 0;
+                    if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
+                        uint32_t codepoint;
+                        if (utf16a >= 0xD800 && utf16a <= 0xDFFF) {
                             // character is encoded as a surrogate pair
                             // get next 6 bytes
                             if (i + 10 < str.length - 1) {
-                                char *surrogate2 = str.ptr+i+5;
-                                if (surrogate2[0] == '\\' && surrogate2[1] == 'u') {
-                                    cxstring c2_str = { surrogate2 + 2, 4 };
-                                    uint32_t c2;
-                                    if (!cx_strtou32_lc(c2_str, &c2, 16, "")) {
-                                        codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2);
+                                if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') {
+                                    cxstring ustr2 = { str.ptr+i+7, 4 };
+                                    if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
+                                            && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
+                                        codepoint = utf16pair_to_codepoint(utf16a, utf16b);
                                         utf8len = codepoint_to_utf8(codepoint, utf8buf);
-                                        i += 6;
+                                        i += 10;
                                     }
                                 }
                             }
                         } else {
                             // character is in the Basic Multilingual Plane
                             // and encoded as a single utf16 char
+                            codepoint = utf16a;
                             utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                            i += 4;
                         }
-                        if(utf8len > 0) {
-                            // add all bytes from utf8buf expect the last char
-                            // to the result
-                            utf8len--;
-                            c = utf8buf[utf8len];
-                            for(int x=0;x<utf8len;x++) {
-                                result.ptr[result.length++] = utf8buf[x];
-                            }
+                    }
+                    if(utf8len > 0) {
+                        // add all bytes from utf8buf except the last char
+                        // to the result (last char will be added below)
+                        utf8len--;
+                        c = utf8buf[utf8len];
+                        for (unsigned x = 0; x < utf8len; x++) {
+                            result.ptr[result.length++] = utf8buf[x];
                         }
-                        i += 4;
+                    } else {
+                        // decoding failed, keep the escape sequence verbatim
+                        result.ptr[result.length++] = '\\';
                     }
                 }
+            } else {
+                // TODO: discuss the behavior for unrecognized escape sequences
+                //       most parsers throw an error here - we keep it verbatim
+                result.ptr[result.length++] = '\\';
             }
-
-            // TODO: discuss the behavior for unrecognized escape sequences
-            //       most parsers throw an error here
+
             result.ptr[result.length++] = c;
         } else {
             if (c == '\\') {
--- a/tests/test_json.c	Sun Jan 26 12:24:49 2025 +0100
+++ b/tests/test_json.c	Sun Jan 26 13:20:05 2025 +0100
@@ -226,6 +226,64 @@
     cxJsonDestroy(&json);
 }

+CX_TEST(test_json_escaped_unicode_malformed) {
+    CxJson json;
+    cxJsonInit(&json, NULL);
+    CxJsonValue *obj;
+    CxJsonStatus result;
+    CX_TEST_DO {
+        cxJsonFill(&json, "\"too few \\u123 digits\"");
+        result = cxJsonNext(&json, &obj);
+        CX_TEST_ASSERT(result == CX_JSON_NO_ERROR);
+        CX_TEST_ASSERT(cxJsonIsString(obj));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(obj),
+            CX_STR("too few \\u123 digits")
+        ));
+        cxJsonFill(&json, "\"too many \\u00E456 digits\"");
+        result = cxJsonNext(&json, &obj);
+        CX_TEST_ASSERT(result == CX_JSON_NO_ERROR);
+        CX_TEST_ASSERT(cxJsonIsString(obj));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(obj),
+            CX_STR("too many ä56 digits")
+        ));
+        cxJsonFill(&json, "\"only high \\uD800 surrogate\"");
+        result = cxJsonNext(&json, &obj);
+        CX_TEST_ASSERT(result == CX_JSON_NO_ERROR);
+        CX_TEST_ASSERT(cxJsonIsString(obj));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(obj),
+            CX_STR("only high \\uD800 surrogate")
+        ));
+        cxJsonFill(&json, "\"only low \\uDC00 surrogate\"");
+        result = cxJsonNext(&json, &obj);
+        CX_TEST_ASSERT(result == CX_JSON_NO_ERROR);
+        CX_TEST_ASSERT(cxJsonIsString(obj));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(obj),
+            CX_STR("only low \\uDC00 surrogate")
+        ));
+        cxJsonFill(&json, "\"two high \\uD800\\uD800 surrogates\"");
+        result = cxJsonNext(&json, &obj);
+        CX_TEST_ASSERT(result == CX_JSON_NO_ERROR);
+        CX_TEST_ASSERT(cxJsonIsString(obj));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(obj),
+            CX_STR("two high \\uD800\\uD800 surrogates")
+        ));
+        cxJsonFill(&json, "\"high plus bullshit \\uD800\\u567 foo\"");
+        result = cxJsonNext(&json, &obj);
+        CX_TEST_ASSERT(result == CX_JSON_NO_ERROR);
+        CX_TEST_ASSERT(cxJsonIsString(obj));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(obj),
+            CX_STR("high plus bullshit \\uD800\\u567 foo")
+        ));
+    }
+    cxJsonDestroy(&json);
+}
+
 CX_TEST(test_json_escaped_end_of_string) {
     CxJson json;
     cxJsonInit(&json, NULL);
@@ -1126,6 +1184,7 @@
     cx_test_register(suite, test_json_simple_object);
     cx_test_register(suite, test_json_escaped_strings);
     cx_test_register(suite, test_json_escaped_unicode_strings);
+    cx_test_register(suite, test_json_escaped_unicode_malformed);
     cx_test_register(suite, test_json_escaped_end_of_string);
     cx_test_register(suite, test_json_object_incomplete_token);
     cx_test_register(suite, test_json_token_wrongly_completed);
src/json.c		file \| annotate \| diff \| comparison \| revisions
tests/test_json.c		file \| annotate \| diff \| comparison \| revisions