add more escape sequences to unescape function

Sat, 11 Jan 2025 12:56:54 +0100

author
Mike Becker <universe@uap-core.de>
date
Sat, 11 Jan 2025 12:56:54 +0100
changeset 1122
49ab92de9a13
parent 1121
7fd2672199d7
child 1123
2b83302d595a

add more escape sequences to unescape function

Also rename token_start to token_part_start to express more clearly what it actually is
(the start of the currently parsed PART of the token).

src/json.c file | annotate | diff | comparison | revisions
--- a/src/json.c	Sat Jan 11 12:33:10 2025 +0100
+++ b/src/json.c	Sat Jan 11 12:56:54 2025 +0100
@@ -252,7 +252,7 @@
 
     // current token type and start index
     CxJsonTokenType ttype = json->uncompleted.tokentype;
-    size_t token_start = json->buffer.pos;
+    size_t token_part_start = json->buffer.pos;
 
     for (size_t i = json->buffer.pos; i < json->buffer.size; i++) {
         char c = json->buffer.space[i];
@@ -266,7 +266,7 @@
                 } else if (ctype == CX_JSON_TOKEN_STRING) {
                     // begin string
                     ttype = CX_JSON_TOKEN_STRING;
-                    token_start = i;
+                    token_part_start = i;
                 } else if (ctype != CX_JSON_NO_TOKEN) {
                     // single-char token
                     json->buffer.pos = i + 1;
@@ -274,12 +274,12 @@
                     return CX_JSON_NO_ERROR;
                 } else {
                     ttype = CX_JSON_TOKEN_LITERAL; // number or literal
-                    token_start = i;
+                    token_part_start = i;
                 }
             } else {
                 // finish token
                 if (ctype != CX_JSON_NO_TOKEN) {
-                    *result = token_create(json, false, token_start, i);
+                    *result = token_create(json, false, token_part_start, i);
                     if (result->tokentype == CX_JSON_NO_TOKEN) {
                         return CX_JSON_BUFFER_ALLOC_FAILED; // LCOV_EXCL_LINE
                     }
@@ -296,7 +296,7 @@
                 json->tokenizer_escape = false;
             } else {
                 if (c == '"') {
-                    *result = token_create(json, true, token_start, i + 1);
+                    *result = token_create(json, true, token_part_start, i + 1);
                     if (result->tokentype == CX_JSON_NO_TOKEN) {
                         return CX_JSON_BUFFER_ALLOC_FAILED; // LCOV_EXCL_LINE
                     }
@@ -311,13 +311,13 @@
 
     if (ttype != CX_JSON_NO_TOKEN) {
         // uncompleted token
-        size_t uncompleted_len = json->buffer.size - token_start;
+        size_t uncompleted_len = json->buffer.size - token_part_start;
         if (json->uncompleted.tokentype == CX_JSON_NO_TOKEN) {
             // current token is uncompleted
             // save current token content
             CxJsonToken uncompleted = {
                 ttype, true,
-                cx_strdup(cx_strn(json->buffer.space + token_start, uncompleted_len))
+                cx_strdup(cx_strn(json->buffer.space + token_part_start, uncompleted_len))
             };
             if (uncompleted.content.ptr == NULL) {
                 return CX_JSON_BUFFER_ALLOC_FAILED; // LCOV_EXCL_LINE
@@ -328,7 +328,7 @@
             // combine the uncompleted token with the current token
             assert(json->uncompleted.allocated);
             cxmutstr str = cx_strcat_m(json->uncompleted.content, 1,
-                cx_strn(json->buffer.space + token_start, uncompleted_len));
+                cx_strn(json->buffer.space + token_part_start, uncompleted_len));
             if (str.ptr == NULL) {
                 return CX_JSON_BUFFER_ALLOC_FAILED; // LCOV_EXCL_LINE
             }
@@ -342,8 +342,8 @@
 }
 
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
-    // TODO: support more escape sequences
-    // TODO: to be consistent with escape_string() we might want to expect that the enclosing quotes were already removed
+    // note: this function expects that str contains the enclosing quotes!
+
     cxmutstr result;
     result.length = 0;
     result.ptr = cxMalloc(a, str.length - 1);
@@ -358,7 +358,20 @@
                 c = '\n';
             } else if (c == 't') {
                 c = '\t';
+            } else if (c == 'r') {
+                c = '\r';
+            } else if (c == '\\') {
+                c = '\\';
+            } else if (c == '/') {
+                c = '/'; // always unescape, we don't need settings here
+            } else if (c == 'f') {
+                c = '\f';
+            } else if (c == 'b') {
+                c = '\b';
             }
+            // TODO: support \uXXXX escape sequences
+            // TODO: discuss the behavior for unrecognized escape sequences
+            //       most parsers throw an error here
             result.ptr[result.length++] = c;
         } else {
             if (c == '\\') {
@@ -374,6 +387,8 @@
 }
 
 static cxmutstr escape_string(cxmutstr str) {
+    // note: this function produces the string without enclosing quotes
+    // the reason is that we don't want to allocate memory just for that
     CxBuffer buf = {0};
 
     bool all_printable = true;

mercurial