src/json.c

changeset 1152
e4af44b488bc
parent 1151
60113356a7de
child 1156
96f16b5a0029
equal deleted inserted replaced
1151:60113356a7de 1152:e4af44b488bc
351 } 351 }
352 352
353 return CX_JSON_INCOMPLETE_DATA; 353 return CX_JSON_INCOMPLETE_DATA;
354 } 354 }
355 355
356 // converts a unicode codepoint (up to U+10FFFF) to utf8
356 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { 357 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
357 if (codepoint <= 0x7F) { 358 if (codepoint <= 0x7F) {
358 *output_buf = (char)codepoint; 359 *output_buf = (char)codepoint;
359 return 1; 360 return 1;
360 } else if (codepoint <= 0x7FF) { 361 } else if (codepoint <= 0x7FF) {
364 } else if (codepoint <= 0xFFFF) { 365 } else if (codepoint <= 0xFFFF) {
365 output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F)); 366 output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F));
366 output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); 367 output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
367 output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); 368 output_buf[2] = (char)(0x80 | (codepoint & 0x3F));
368 return 3; 369 return 3;
370 } else if (codepoint <= 0x10FFFF) {
371 output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07));
372 output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
373 output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
374 output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
375 return 4;
369 } 376 }
370 377
371 return 0; 378 return 0;
379 }
380
381 // converts a utf16 surrogate pair to a unicode codepoint
382 static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
383 return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000;
372 } 384 }
373 385
374 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { 386 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
375 // note: this function expects that str contains the enclosing quotes! 387 // note: this function expects that str contains the enclosing quotes!
376 388
400 c = '\b'; 412 c = '\b';
401 } else if (c == 'u') { 413 } else if (c == 'u') {
402 if (i+4 < str.length - 1) { 414 if (i+4 < str.length - 1) {
403 cxstring codepoint_str = { str.ptr + i + 1, 4}; 415 cxstring codepoint_str = { str.ptr + i + 1, 4};
404 uint32_t codepoint; 416 uint32_t codepoint;
405 if(!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { 417 if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
406 char utf8buf[4]; 418 char utf8buf[4];
407 int utf8len = codepoint_to_utf8(codepoint, utf8buf); 419 int utf8len = 0;
420 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
421 // character is encoded as a surrogate pair
422 // get next 6 bytes
423 if (i + 10 < str.length - 1) {
424 char *surrogate2 = str.ptr+i+5;
425 if (surrogate2[0] == '\\' && surrogate2[1] == 'u') {
426 cxstring c2_str = { surrogate2 + 2, 4 };
427 uint32_t c2;
428 if (!cx_strtou32_lc(c2_str, &c2, 16, "")) {
429 codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2);
430 utf8len = codepoint_to_utf8(codepoint, utf8buf);
431 i += 6;
432 }
433 }
434 }
435 } else {
436 // character is in the Basic Multilingual Plane
437 // and encoded as a single utf16 char
438 utf8len = codepoint_to_utf8(codepoint, utf8buf);
439 }
408 if(utf8len > 0) { 440 if(utf8len > 0) {
409 // add all bytes from utf8buf except the last char 441 // add all bytes from utf8buf except the last char
410 // to the result 442 // to the result
411 utf8len--; 443 utf8len--;
412 c = utf8buf[utf8len]; 444 c = utf8buf[utf8len];
413 for(int i=0;i<utf8len;i++) { 445 for(int x=0;x<utf8len;x++) {
414 result.ptr[result.length++] = utf8buf[i]; 446 result.ptr[result.length++] = utf8buf[x];
415 } 447 }
416 } 448 }
417 i += 4; 449 i += 4;
418 } 450 }
419 } 451 }

mercurial