src/json.c

changeset 1156
96f16b5a0029
parent 1152
e4af44b488bc
child 1158
fa2811e9ab19
--- a/src/json.c	Sun Jan 26 12:24:49 2025 +0100
+++ b/src/json.c	Sun Jan 26 13:20:05 2025 +0100
@@ -353,8 +353,8 @@
     return CX_JSON_INCOMPLETE_DATA;
 }
 
-// converts a unicode (up to U+FFFF) codepoint to utf8
-static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
+// converts a Unicode codepoint to utf8
+static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
     if (codepoint <= 0x7F) {
         *output_buf = (char)codepoint;
         return 1;
@@ -375,7 +375,7 @@
         return 4;
     }
     
-    return 0;
+    return 0; // LCOV_EXCL_LINE
 }
 
 // converts a utf16 surrogate pair to utf8
@@ -398,6 +398,8 @@
             u = false;
             if (c == 'n') {
                 c = '\n';
+            } else if (c == '"') {
+                c = '"';
             } else if (c == 't') {
                 c = '\t';
             } else if (c == 'r') {
@@ -411,48 +413,54 @@
             } else if (c == 'b') {
                 c = '\b';
             } else if (c == 'u') {
-                if (i+4 < str.length - 1) {
-                    cxstring codepoint_str = { str.ptr + i + 1, 4};
-                    uint32_t codepoint;
-                    if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
-                        char utf8buf[4];
-                        int utf8len = 0;
-                        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+                if (i + 4 < str.length - 1) {
+                    cxstring ustr1 = { str.ptr + i + 1, 4};
+                    uint16_t utf16a, utf16b;
+                    char utf8buf[4];
+                    unsigned utf8len = 0;
+                    if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
+                        uint32_t codepoint;
+                        if (utf16a >= 0xD800 && utf16a <= 0xDFFF) {
                             // character is encoded as a surrogate pair
                             // get next 6 bytes
                             if (i + 10 < str.length - 1) {
-                                char *surrogate2 = str.ptr+i+5;
-                                if (surrogate2[0] == '\\' && surrogate2[1] == 'u') {
-                                    cxstring c2_str = { surrogate2 + 2, 4 };
-                                    uint32_t c2;
-                                    if (!cx_strtou32_lc(c2_str, &c2, 16, "")) {
-                                        codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2);
+                                if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') {
+                                    cxstring ustr2 = { str.ptr+i+7, 4 };
+                                    if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
+                                            && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
+                                        codepoint = utf16pair_to_codepoint(utf16a, utf16b);
                                         utf8len = codepoint_to_utf8(codepoint, utf8buf);
-                                        i += 6;
+                                        i += 10;
                                     }
                                 }
                             }
                         } else {
                             // character is in the Basic Multilingual Plane
                             // and encoded as a single utf16 char
+                            codepoint = utf16a;
                             utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                            i += 4;
                         }
-                        if(utf8len > 0) {
-                            // add all bytes from utf8buf expect the last char
-                            // to the result
-                            utf8len--;
-                            c = utf8buf[utf8len];
-                            for(int x=0;x<utf8len;x++) {
-                                result.ptr[result.length++] = utf8buf[x];
-                            }
+                    }
+                    if(utf8len > 0) {
+                        // add all bytes from utf8buf except the last char
+                        // to the result (last char will be added below)
+                        utf8len--;
+                        c = utf8buf[utf8len];
+                        for (unsigned x = 0; x < utf8len; x++) {
+                            result.ptr[result.length++] = utf8buf[x];
                         }
-                        i += 4;
+                    } else {
+                        // decoding failed: keep the escape unchanged (backslash here, the 'u' below)
+                        result.ptr[result.length++] = '\\';
                     }
                 }
+            } else {
+                // TODO: discuss the behavior for unrecognized escape sequences
+                //       most parsers throw an error here - we keep the backslash and the character as-is
+                result.ptr[result.length++] = '\\';
             }
-            
-            // TODO: discuss the behavior for unrecognized escape sequences
-            //       most parsers throw an error here
+
             result.ptr[result.length++] = c;
         } else {
             if (c == '\\') {

mercurial