ucx: comparison src/json.c

-:60113356a7de
+:e4af44b488bc
 }
 return CX_JSON_INCOMPLETE_DATA;
 }
+// converts a Unicode codepoint (up to U+10FFFF) to UTF-8; returns the number of bytes written, or 0 if the codepoint is out of range
 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
 if (codepoint <= 0x7F) {
 *output_buf = (char)codepoint;
 return 1;
 } else if (codepoint <= 0x7FF) {
 } else if (codepoint <= 0xFFFF) {
 output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F));
 output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
 output_buf[2] = (char)(0x80 | (codepoint & 0x3F));
 return 3;
+} else if (codepoint <= 0x10FFFF) {
+output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07));
+output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
+output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
+return 4;
 }
 return 0;
+}
+// converts a UTF-16 surrogate pair (c0 = high surrogate, c1 = low surrogate) to a Unicode codepoint
+static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
+return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000;
 }
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
 // note: this function expects that str contains the enclosing quotes!
 c = '\b';
 } else if (c == 'u') {
 if (i+4 < str.length - 1) {
 cxstring codepoint_str = { str.ptr + i + 1, 4};
 uint32_t codepoint;
-if(!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
+if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
 char utf8buf[4];
-int utf8len = codepoint_to_utf8(codepoint, utf8buf);
+int utf8len = 0;
+if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+// character is encoded as a surrogate pair
+// get next 6 bytes
+if (i + 10 < str.length - 1) {
+char *surrogate2 = str.ptr+i+5;
+if (surrogate2[0] == '\\' && surrogate2[1] == 'u') {
+cxstring c2_str = { surrogate2 + 2, 4 };
+uint32_t c2;
+if (!cx_strtou32_lc(c2_str, &c2, 16, "")) {
+codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2);
+utf8len = codepoint_to_utf8(codepoint, utf8buf);
+i += 6;
+}
+}
+}
+} else {
+// character is in the Basic Multilingual Plane
+// and encoded as a single utf16 char
+utf8len = codepoint_to_utf8(codepoint, utf8buf);
+}
 if(utf8len > 0) {
 // add all bytes from utf8buf except the last char
 // to the result
 utf8len--;
 c = utf8buf[utf8len];
-for(int i=0;i<utf8len;i++) {
+for(int x=0;x<utf8len;x++) {
-result.ptr[result.length++] = utf8buf[i];
+result.ptr[result.length++] = utf8buf[x];
 }
 }
 i += 4;
 }
 }

Mercurial > hg > ucx / file comparison

comparison: src/json.c

src/json.c