| 351 } |
351 } |
| 352 |
352 |
| 353 return CX_JSON_INCOMPLETE_DATA; |
353 return CX_JSON_INCOMPLETE_DATA; |
| 354 } |
354 } |
| 355 |
355 |
| |
356 // converts a unicode codepoint (up to U+10FFFF) to utf8 |
| 356 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { |
357 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { |
| 357 if (codepoint <= 0x7F) { |
358 if (codepoint <= 0x7F) { |
| 358 *output_buf = (char)codepoint; |
359 *output_buf = (char)codepoint; |
| 359 return 1; |
360 return 1; |
| 360 } else if (codepoint <= 0x7FF) { |
361 } else if (codepoint <= 0x7FF) { |
| 364 } else if (codepoint <= 0xFFFF) { |
365 } else if (codepoint <= 0xFFFF) { |
| 365 output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F)); |
366 output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F)); |
| 366 output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); |
367 output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); |
| 367 output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); |
368 output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); |
| 368 return 3; |
369 return 3; |
| |
370 } else if (codepoint <= 0x10FFFF) { |
| |
371 output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07)); |
| |
372 output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F)); |
| |
373 output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); |
| |
374 output_buf[3] = (char)(0x80 | (codepoint & 0x3F)); |
| |
375 return 4; |
| 369 } |
376 } |
| 370 |
377 |
| 371 return 0; |
378 return 0; |
| |
379 } |
| |
380 |
| |
381 // converts a utf16 surrogate pair to a unicode codepoint |
| |
382 static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) { |
| |
383 return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000; |
| 372 } |
384 } |
| 373 |
385 |
| 374 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { |
386 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { |
| 375 // note: this function expects that str contains the enclosing quotes! |
387 // note: this function expects that str contains the enclosing quotes! |
| 376 |
388 |
| 400 c = '\b'; |
412 c = '\b'; |
| 401 } else if (c == 'u') { |
413 } else if (c == 'u') { |
| 402 if (i+4 < str.length - 1) { |
414 if (i+4 < str.length - 1) { |
| 403 cxstring codepoint_str = { str.ptr + i + 1, 4}; |
415 cxstring codepoint_str = { str.ptr + i + 1, 4}; |
| 404 uint32_t codepoint; |
416 uint32_t codepoint; |
| 405 if(!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { |
417 if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { |
| 406 char utf8buf[4]; |
418 char utf8buf[4]; |
| 407 int utf8len = codepoint_to_utf8(codepoint, utf8buf); |
419 int utf8len = 0; |
| |
420 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { |
| |
421 // character is encoded as a surrogate pair |
| |
422 // get next 6 bytes |
| |
423 if (i + 10 < str.length - 1) { |
| |
424 char *surrogate2 = str.ptr+i+5; |
| |
425 if (surrogate2[0] == '\\' && surrogate2[1] == 'u') { |
| |
426 cxstring c2_str = { surrogate2 + 2, 4 }; |
| |
427 uint32_t c2; |
| |
428 if (!cx_strtou32_lc(c2_str, &c2, 16, "")) { |
| |
429 codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2); |
| |
430 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
| |
431 i += 6; |
| |
432 } |
| |
433 } |
| |
434 } |
| |
435 } else { |
| |
436 // character is in the Basic Multilingual Plane |
| |
437 // and encoded as a single utf16 char |
| |
438 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
| |
439 } |
| 408 if(utf8len > 0) { |
440 if(utf8len > 0) { |
| 409 // add all bytes from utf8buf except the last char |
441 // add all bytes from utf8buf except the last char |
| 410 // to the result |
442 // to the result |
| 411 utf8len--; |
443 utf8len--; |
| 412 c = utf8buf[utf8len]; |
444 c = utf8buf[utf8len]; |
| 413 for(int i=0;i<utf8len;i++) { |
445 for(int x=0;x<utf8len;x++) { |
| 414 result.ptr[result.length++] = utf8buf[i]; |
446 result.ptr[result.length++] = utf8buf[x]; |
| 415 } |
447 } |
| 416 } |
448 } |
| 417 i += 4; |
449 i += 4; |
| 418 } |
450 } |
| 419 } |
451 } |