351 } |
351 } |
352 |
352 |
353 return CX_JSON_INCOMPLETE_DATA; |
353 return CX_JSON_INCOMPLETE_DATA; |
354 } |
354 } |
355 |
355 |
|
356 // converts a Unicode codepoint (up to U+10FFFF) to UTF-8 |
356 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { |
357 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { |
357 if (codepoint <= 0x7F) { |
358 if (codepoint <= 0x7F) { |
358 *output_buf = (char)codepoint; |
359 *output_buf = (char)codepoint; |
359 return 1; |
360 return 1; |
360 } else if (codepoint <= 0x7FF) { |
361 } else if (codepoint <= 0x7FF) { |
364 } else if (codepoint <= 0xFFFF) { |
365 } else if (codepoint <= 0xFFFF) { |
365 output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F)); |
366 output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F)); |
366 output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); |
367 output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); |
367 output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); |
368 output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); |
368 return 3; |
369 return 3; |
|
370 } else if (codepoint <= 0x10FFFF) { |
|
371 output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07)); |
|
372 output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F)); |
|
373 output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); |
|
374 output_buf[3] = (char)(0x80 | (codepoint & 0x3F)); |
|
375 return 4; |
369 } |
376 } |
370 |
377 |
371 return 0; |
378 return 0; |
|
379 } |
|
380 |
|
// Combines a UTF-16 surrogate pair into the Unicode codepoint it encodes.
//
// c0 is the high (leading) surrogate, expected in 0xD800..0xDBFF;
// c1 is the low (trailing) surrogate, expected in 0xDC00..0xDFFF.
// For a well-formed pair the result lies in 0x10000..0x10FFFF.
//
// The ranges are not checked here. The arithmetic is done in uint32_t so
// that an out-of-range unit wraps around (well-defined) instead of
// left-shifting a negative int, which is undefined behavior.
static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
    const uint32_t high = (uint32_t)c0 - 0xD800u;
    const uint32_t low = (uint32_t)c1 - 0xDC00u;
    return (high << 10) + low + 0x10000u;
}
373 |
385 |
374 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { |
386 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { |
375 // note: this function expects that str contains the enclosing quotes! |
387 // note: this function expects that str contains the enclosing quotes! |
376 |
388 |
400 c = '\b'; |
412 c = '\b'; |
401 } else if (c == 'u') { |
413 } else if (c == 'u') { |
402 if (i+4 < str.length - 1) { |
414 if (i+4 < str.length - 1) { |
403 cxstring codepoint_str = { str.ptr + i + 1, 4}; |
415 cxstring codepoint_str = { str.ptr + i + 1, 4}; |
404 uint32_t codepoint; |
416 uint32_t codepoint; |
405 if(!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { |
417 if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { |
406 char utf8buf[4]; |
418 char utf8buf[4]; |
407 int utf8len = codepoint_to_utf8(codepoint, utf8buf); |
419 int utf8len = 0; |
|
420 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { |
|
421 // character is encoded as a surrogate pair |
|
422 // get next 6 bytes |
|
423 if (i + 10 < str.length - 1) { |
|
424 char *surrogate2 = str.ptr+i+5; |
|
425 if (surrogate2[0] == '\\' && surrogate2[1] == 'u') { |
|
426 cxstring c2_str = { surrogate2 + 2, 4 }; |
|
427 uint32_t c2; |
|
428 if (!cx_strtou32_lc(c2_str, &c2, 16, "")) { |
|
429 codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2); |
|
430 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
431 i += 6; |
|
432 } |
|
433 } |
|
434 } |
|
435 } else { |
|
436 // character is in the Basic Multilingual Plane |
|
437 // and encoded as a single utf16 char |
|
438 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
439 } |
408 if(utf8len > 0) { |
440 if(utf8len > 0) { |
409 // add all bytes from utf8buf except the last char |
441 // add all bytes from utf8buf except the last char |
410 // to the result |
442 // to the result |
411 utf8len--; |
443 utf8len--; |
412 c = utf8buf[utf8len]; |
444 c = utf8buf[utf8len]; |
413 for(int i=0;i<utf8len;i++) { |
445 for(int x=0;x<utf8len;x++) { |
414 result.ptr[result.length++] = utf8buf[i]; |
446 result.ptr[result.length++] = utf8buf[x]; |
415 } |
447 } |
416 } |
448 } |
417 i += 4; |
449 i += 4; |
418 } |
450 } |
419 } |
451 } |