Mercurial > hg > ucx / changeset

--- a/src/json.c	Sun Jan 26 13:22:58 2025 +0100
+++ b/src/json.c	Sun Jan 26 14:13:48 2025 +0100
@@ -383,6 +383,42 @@
     return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000;
 }

+// Decodes the escape at the start of str, which is supposed to be "\uXXXX"
+// or a surrogate pair "\uXXXX\uXXXX"; remaining bytes in str are ignored.
+// The UTF-8 encoding is written to utf8buf (the caller provides 4 bytes).
+// Returns the number of bytes in utf8buf, or 0 when decoding failed.
+static unsigned unescape_unicode_string(cxstring str, char *utf8buf) {
+    if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') {
+        return 0;
+    }
+
+    unsigned utf8len = 0;
+    cxstring ustr1 = { str.ptr + 2, 4};
+    uint16_t utf16a, utf16b;
+    // a zero result from cx_strtou16_lc is treated as a successful parse
+    if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
+        uint32_t codepoint;
+        if (utf16a < 0xD800 || utf16a > 0xDFFF) {
+            // not a surrogate: character is in the Basic Multilingual
+            // Plane (this includes U+E000) and is a single utf16 char
+            codepoint = utf16a;
+            utf8len = codepoint_to_utf8(codepoint, utf8buf);
+        } else if (utf16a <= 0xDBFF && str.length >= 12
+                && str.ptr[6] == '\\' && str.ptr[7] == 'u') {
+            // high surrogate: the next 6 bytes must hold the low surrogate;
+            // exactly 12 bytes are sufficient, nothing needs to follow
+            cxstring ustr2 = { str.ptr + 8, 4 };
+            if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
+                    && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
+                codepoint = utf16pair_to_codepoint(utf16a, utf16b);
+                utf8len = codepoint_to_utf8(codepoint, utf8buf);
+            }
+        }
+        // a lone low surrogate (0xDC00..0xDFFF) is invalid: utf8len stays 0
+    }
+    return utf8len;
+}
+
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
     // note: this function expects that str contains the enclosing quotes!

@@ -413,47 +449,23 @@
             } else if (c == 'b') {
                 c = '\b';
             } else if (c == 'u') {
-                if (i + 4 < str.length - 1) {
-                    cxstring ustr1 = { str.ptr + i + 1, 4};
-                    uint16_t utf16a, utf16b;
-                    char utf8buf[4];
-                    unsigned utf8len = 0;
-                    if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
-                        uint32_t codepoint;
-                        if (utf16a >= 0xD800 && utf16a <= 0xDFFF) {
-                            // character is encoded as a surrogate pair
-                            // get next 6 bytes
-                            if (i + 10 < str.length - 1) {
-                                if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') {
-                                    cxstring ustr2 = { str.ptr+i+7, 4 };
-                                    if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
-                                            && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
-                                        codepoint = utf16pair_to_codepoint(utf16a, utf16b);
-                                        utf8len = codepoint_to_utf8(codepoint, utf8buf);
-                                        i += 10;
-                                    }
-                                }
-                            }
-                        } else {
-                            // character is in the Basic Multilingual Plane
-                            // and encoded as a single utf16 char
-                            codepoint = utf16a;
-                            utf8len = codepoint_to_utf8(codepoint, utf8buf);
-                            i += 4;
-                        }
+                char utf8buf[4];
+                unsigned utf8len = unescape_unicode_string(
+                    cx_strn(str.ptr + i - 1, str.length + 1 - i),
+                    utf8buf
+                );
+                if(utf8len > 0) {
+                    i += utf8len < 4 ? 4 : 10;
+                    // add all bytes from utf8buf except the last char
+                    // to the result (last char will be added below)
+                    utf8len--;
+                    c = utf8buf[utf8len];
+                    for (unsigned x = 0; x < utf8len; x++) {
+                        result.ptr[result.length++] = utf8buf[x];
                     }
-                    if(utf8len > 0) {
-                        // add all bytes from utf8buf except the last char
-                        // to the result (last char will be added below)
-                        utf8len--;
-                        c = utf8buf[utf8len];
-                        for (unsigned x = 0; x < utf8len; x++) {
-                            result.ptr[result.length++] = utf8buf[x];
-                        }
-                    } else {
-                        // decoding failed, ignore the entire sequence
-                        result.ptr[result.length++] = '\\';
-                    }
+                } else {
+                    // decoding failed, ignore the entire sequence
+                    result.ptr[result.length++] = '\\';
                 }
             } else {
                 // TODO: discuss the behavior for unrecognized escape sequences