Sat, 02 Nov 2024 20:17:38 +0100
fix copy-pasted parameter name
/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright 2024 Mike Becker, Olaf Wintermann All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <string.h>
#include <ctype.h>
#include <stdlib.h> // malloc, realloc, free, calloc, strtoll, strtod

#include "cx/json.h"

/*
 * RFC 8259
 * https://tools.ietf.org/html/rfc8259
 */

#define PARSER_READVALUE_ALLOC 32

static CxJsonValue cx_json_value_nothing = {.type = CX_JSON_NOTHING};

static int token_append(CxJsonToken *token, const char *buf, size_t len) {
    if (len == 0) {
        return 0;
    }
    size_t newlen = token->length + len;
    if (token->alloc < newlen) {
        char *newbuf = realloc(
                token->alloc == 0 ?
                        NULL : (char *) token->content,
                newlen);
        if (!newbuf) {
            return 1;
        }
        token->content = newbuf;
        token->alloc = newlen;
    }
    memcpy((char *) token->content + token->length, buf, len);
    token->length = newlen;
    return 0;
}

static CxJsonToken get_content(CxJson *p, size_t start, size_t end) {
    CxJsonToken token = {0};
    size_t part2 = end - start;
    if (p->uncompleted.tokentype == CX_JSON_NO_TOKEN) {
        token.content = p->buffer + start;
        token.length = part2;
    } else if (part2 == 0) {
        token = p->uncompleted;
    } else {
        if (token_append(&p->uncompleted, p->buffer + start, end - start)) {
            // TODO: this does certainly not lead to correct error handling
            return (CxJsonToken){0};
        }
        token = p->uncompleted;
    }
    p->uncompleted = (CxJsonToken){0};
    return token;
}

static int token_isliteral(const char *content, size_t length) {
    if (length == 4) {
        if (!memcmp(content, "true", 4)) {
            return 1;
        } else if (!memcmp(content, "null", 4)) {
            return 1;
        }
    } else if (length == 5 && !memcmp(content, "false", 5)) {
        return 1;
    }
    return 0;
}

static int num_isexp(const char *content, size_t length, size_t pos) {
    if (pos >= length) {
        return 0;
    }
    int ok = 0;
    for (size_t i = pos; i < length; i++) {
        char c = content[i];
        if (isdigit(c)) {
            ok = 1;
        } else if (i == pos) {
            if (!(c == '+' || c == '-')) {
                return 0;
            }
        } else {
            return 0;
        }
    }
    return ok;
}

static CxJsonTokenType token_numbertype(const char *content, size_t length) {
    if (length == 0) return CX_JSON_TOKEN_ERROR;

    if (content[0] != '-' && !isdigit(content[0])) {
        return CX_JSON_TOKEN_ERROR;
    }

    CxJsonTokenType type = CX_JSON_TOKEN_INTEGER;
    for (size_t i = 1; i < length; i++) {
        if (content[i] == '.') {
            if (type == CX_JSON_TOKEN_NUMBER) {
                return CX_JSON_TOKEN_ERROR; // more than one decimal separator
            }
            type = CX_JSON_TOKEN_NUMBER;
        } else if (content[i] == 'e' || content[i] == 'E') {
            return num_isexp(content, length, i + 1) ? CX_JSON_TOKEN_NUMBER : CX_JSON_TOKEN_ERROR;
        } else if (!isdigit(content[i])) {
            return CX_JSON_TOKEN_ERROR; // char is not a digit, decimal separator or exponent sep
        }
    }
    return type;
}

static CxJsonToken get_token(CxJson *p, size_t start, size_t end) {
    CxJsonToken token = get_content(p, start, end);
    if (token_isliteral(token.content, token.length)) {
        token.tokentype = CX_JSON_TOKEN_LITERAL;
    } else {
        token.tokentype = token_numbertype(token.content, token.length);
    }
    p->pos = end;
    return token;
}

static CxJsonTokenType char2ttype(char c) {
    switch (c) {
        case '[': {
            return CX_JSON_TOKEN_BEGIN_ARRAY;
        }
        case '{': {
            return CX_JSON_TOKEN_BEGIN_OBJECT;
        }
        case ']': {
            return CX_JSON_TOKEN_END_ARRAY;
        }
        case '}': {
            return CX_JSON_TOKEN_END_OBJECT;
        }
        case ':': {
            return CX_JSON_TOKEN_NAME_SEPARATOR;
        }
        case ',': {
            return CX_JSON_TOKEN_VALUE_SEPARATOR;
        }
        case '"': {
            return CX_JSON_TOKEN_STRING;
        }
        default: {
            if (isspace(c)) {
                return CX_JSON_TOKEN_SPACE;
            }
        }
    }
    return CX_JSON_NO_TOKEN;
}

static CxJsonToken json_parser_next_token(CxJson *p) {
    // current token type and start index
    CxJsonTokenType ttype = p->uncompleted.tokentype;
    size_t token_start = p->pos;
    for (size_t i = p->pos; i < p->size; i++) {
        char c = p->buffer[i];
        if (ttype != CX_JSON_TOKEN_STRING) {
            // currently non-string token
            CxJsonTokenType ctype = char2ttype(c);
            // start of new token?
            if (ttype == CX_JSON_NO_TOKEN) {
                if (ctype == CX_JSON_TOKEN_SPACE) {
                    continue;
                } else if (ctype == CX_JSON_TOKEN_STRING) {
                    // begin string
                    ttype = CX_JSON_TOKEN_STRING;
                    token_start = i;
                } else if (ctype != CX_JSON_NO_TOKEN) {
                    // single-char token
                    p->pos = i + 1;
                    CxJsonToken token = {ctype, NULL, 0, 0};
                    return token;
                } else {
                    ttype = CX_JSON_TOKEN_LITERAL; // number or literal
                    token_start = i;
                }
            } else {
                // finish token
                if (ctype != CX_JSON_NO_TOKEN) {
                    return get_token(p, token_start, i);
                }
            }
        } else {
            // currently inside a string
            if (!p->tokenizer_escape) {
                if (c == '"') {
                    CxJsonToken ret = get_content(p, token_start, i + 1);
                    ret.tokentype = CX_JSON_TOKEN_STRING;
                    p->pos = i + 1;
                    return ret;
                } else if (c == '\\') {
                    p->tokenizer_escape = 1;
                }
            } else {
                p->tokenizer_escape = 0;
            }
        }
    }

    if (ttype != CX_JSON_NO_TOKEN) {
        // uncompleted token
        size_t uncompleted_len = p->size - token_start;
        if (p->uncompleted.tokentype == CX_JSON_NO_TOKEN) {
            // current token is uncompleted
            // save current token content in p->uncompleted
            CxJsonToken uncompleted;
            uncompleted.tokentype = ttype;
            uncompleted.length = uncompleted_len;
            uncompleted.alloc = uncompleted_len + 16;
            char *tmp = malloc(uncompleted.alloc);
            if (tmp) {
                memcpy(tmp, p->buffer + token_start, uncompleted_len);
                uncompleted.content = tmp;
                p->uncompleted = uncompleted;
            } else {
                p->error = 1;
            }
        } else {
            // previously we also had an uncompleted token
            // combine the uncompleted token with the current token
            if (token_append(&p->uncompleted, p->buffer + token_start, uncompleted_len)) {
                p->error = 1;
            }
        }
    }

    CxJsonToken ret = {CX_JSON_NO_TOKEN, NULL, 0, 0};
    return ret;
}

static cxmutstr unescape_string(const char *str, size_t len) {
    // TODO: support more escape sequences
    // we know that the unescaped string will be shorter by at least 2 chars
    cxmutstr result;
    result.length = 0;
    result.ptr = malloc(len - 1);
    if (result.ptr == NULL) {
        // TODO: check if this actually leads to correct error handling
        return result;
    }

    bool u = false;
    for (size_t i = 1; i < len - 1; i++) {
        char c = str[i];
        if (u) {
            u = false;
            if (c == 'n') {
                c = '\n';
            } else if (c == 't') {
                c = '\t';
            }
            result.ptr[result.length++] = c;
        } else {
            if (c == '\\') {
                u = true;
            } else {
                result.ptr[result.length++] = c;
            }
        }
    }
    result.ptr[result.length] = 0;

    return result;
}

static int parse_number(const char *str, size_t len, void *value, bool asint) {
    char *endptr = NULL;
    char buf[32];
    if (len > 30) {
        return 1;
    }
    // TODO: if we can guarantee that we are working on a copied string already, we can avoid this memcpy
    memcpy(buf, str, len);
    buf[len] = 0;

    if (asint) {
        long long v = strtoll(buf, &endptr, 10);
        *((int64_t*)value) = (int64_t) v;
    } else {
        // TODO: proper JSON spec number parser
        double v = strtod(buf, &endptr);
        *((double*)value) = v;
    }

    return (endptr != &buf[len]);
}

static int add_state(CxJson *p, int state) {
    CxArrayReallocator alloc = cx_array_reallocator(NULL, p->states_internal);
    size_t size = p->nstates + 1;
    size_t capacity = p->states_alloc;
    // TODO: fix that nstates does not denote the size of the array
    // TODO: replace with a 16 bit (or maybe even 8 bit) version of cx_array_add()
    int result = cx_array_add(
            &p->states,
            &size,
            &capacity,
            sizeof(int),
            &state,
            &alloc
    );
    if (result == 0) {
        p->nstates = size - 1;
        p->states_alloc = capacity;
    }
    return result;
}

static void end_elm(CxJson *p, CxJsonReaderType type) {
    p->reader_type = type;
    p->nstates--;
}

#define JP_STATE_VALUE_BEGIN 0
#define JP_STATE_VALUE_BEGIN_OBJ 1
#define JP_STATE_VALUE_BEGIN_AR 2
#define JP_STATE_ARRAY_SEP_OR_CLOSE 3
#define JP_STATE_OBJ_NAME_OR_CLOSE 4
#define JP_STATE_OBJ_NAME 5
#define JP_STATE_OBJ_COLON 6
#define JP_STATE_OBJ_SEP_OR_CLOSE 7

static int next_state_after_value(int current) {
    switch (current) {
        default: return -1;
        // after value JSON complete, expect nothing
        case JP_STATE_VALUE_BEGIN: return -1;
        // after obj value, expect ',' or '}'
        case JP_STATE_VALUE_BEGIN_OBJ: return JP_STATE_OBJ_SEP_OR_CLOSE;
        // after array value, expect ',' or ']'
        case JP_STATE_VALUE_BEGIN_AR: return JP_STATE_ARRAY_SEP_OR_CLOSE;
    }
}

static void clear_valuename(CxJson *p) {
    free(p->value_name);
    p->value_name = NULL;
    p->value_name_len = 0;
}

static void clear_values(CxJson *p) {
    free(p->value_str);
    p->value_str = NULL;
    p->value_str_len = 0;
    p->value_int = 0;
    p->value_double = 0;
}

static int json_read(CxJson *p) {
    int state = p->states[p->nstates];
    clear_values(p);

    CxJsonToken token = json_parser_next_token(p);
    p->reader_token = token;
    p->value_ready = 0;
    if (token.tokentype == CX_JSON_NO_TOKEN) {
        return 0;
    }

    int ret = 1;

    // 0 JP_STATE_VALUE_BEGIN         value begin
    // 1 JP_STATE_VALUE_BEGIN_OBJ     value begin (inside object)
    // 2 JP_STATE_VALUE_BEGIN_AR      value begin (inside array)
    // 3 JP_STATE_ARRAY_SEP_OR_CLOSE  array, expect separator or arrayclose
    // 4 JP_STATE_OBJ_NAME_OR_CLOSE   object, expect name or objclose
    // 5 JP_STATE_OBJ_NAME            object, expect name
    // 6 JP_STATE_OBJ_COLON           object, expect ':'
    // 7 JP_STATE_OBJ_SEP_OR_CLOSE    object, expect separator, objclose

    if (state == JP_STATE_VALUE_BEGIN_AR || state == JP_STATE_OBJ_SEP_OR_CLOSE) {
        clear_valuename(p);
    }

    if (state < 3) {
        // expect value
        p->states[p->nstates] = next_state_after_value(state);
        p->value_ready = 1;
        switch (token.tokentype) {
            case CX_JSON_TOKEN_BEGIN_ARRAY: {
                p->reader_type = CX_JSON_READER_ARRAY_BEGIN;
                ret = add_state(p, JP_STATE_VALUE_BEGIN_AR) ? -1 : 1;
                break;
            }
            case CX_JSON_TOKEN_BEGIN_OBJECT: {
                p->reader_type = CX_JSON_READER_OBJECT_BEGIN;
                ret = add_state(p, JP_STATE_OBJ_NAME_OR_CLOSE) ?
                        -1 : 1;
                break;
            }
            case CX_JSON_TOKEN_END_ARRAY: {
                p->value_ready = 0;
                end_elm(p, CX_JSON_READER_ARRAY_END);
                break;
            }
            case CX_JSON_TOKEN_STRING: {
                p->reader_type = CX_JSON_READER_STRING;
                cxmutstr str = unescape_string(token.content, token.length);
                if (str.ptr) {
                    p->value_str = str.ptr;
                    p->value_str_len = str.length;
                } else {
                    ret = -1;
                }
                break;
            }
            case CX_JSON_TOKEN_INTEGER: {
                p->reader_type = CX_JSON_READER_INTEGER;
                if (parse_number(token.content, token.length, &p->value_int, true)) {
                    ret = -1;
                }
                break;
            }
            case CX_JSON_TOKEN_NUMBER: {
                p->reader_type = CX_JSON_READER_NUMBER;
                if (parse_number(token.content, token.length, &p->value_double, false)) {
                    ret = -1;
                }
                break;
            }
            case CX_JSON_TOKEN_LITERAL: {
                p->reader_type = CX_JSON_READER_LITERAL;
                break;
            }
            default:
                ret = -1;
        }
    } else if (state == JP_STATE_ARRAY_SEP_OR_CLOSE) {
        // expect ',' or ']'
        if (token.tokentype == CX_JSON_TOKEN_VALUE_SEPARATOR) {
            p->states[p->nstates] = JP_STATE_VALUE_BEGIN_AR;
            ret = json_read(p);
        } else if (token.tokentype == CX_JSON_TOKEN_END_ARRAY) {
            end_elm(p, CX_JSON_READER_ARRAY_END);
        } else {
            ret = -1;
        }
    } else if (state == JP_STATE_OBJ_NAME_OR_CLOSE || state == JP_STATE_OBJ_NAME) {
        if (state == JP_STATE_OBJ_NAME_OR_CLOSE && token.tokentype == CX_JSON_TOKEN_END_OBJECT) {
            clear_valuename(p);
            end_elm(p, CX_JSON_READER_OBJECT_END);
        } else {
            // expect string
            if (token.tokentype != CX_JSON_TOKEN_STRING) return -1;
            if (p->value_name) free(p->value_name);
            cxmutstr valname = unescape_string(token.content, token.length);
            p->value_name = valname.ptr;
            p->value_name_len = valname.length;
            // next state
            p->states[p->nstates] = JP_STATE_OBJ_COLON;
            ret = json_read(p);
        }
    } else if (state == JP_STATE_OBJ_COLON) {
        // expect ':'
        if (token.tokentype != CX_JSON_TOKEN_NAME_SEPARATOR) return -1;
        // next state
        p->states[p->nstates] = JP_STATE_VALUE_BEGIN_OBJ;
        ret = json_read(p);
    } else if (state == JP_STATE_OBJ_SEP_OR_CLOSE) {
        // expect ',' or '}'
        if (token.tokentype == CX_JSON_TOKEN_VALUE_SEPARATOR) {
            p->states[p->nstates] = JP_STATE_OBJ_NAME;
            ret = json_read(p);
        } else if (token.tokentype == CX_JSON_TOKEN_END_OBJECT) {
            end_elm(p, CX_JSON_READER_OBJECT_END);
        } else {
            ret = -1;
        }
    }

    if (token.alloc > 0) {
        free((char*)token.content);
    }

    return ret;
}

static CxJsonLiteral json_reader_literal(CxJson *p) {
    const char *l = p->reader_token.content;
    size_t token_len = p->reader_token.length;
    if (token_len == 4 && !memcmp(l, "true", 4)) {
        return CX_JSON_TRUE;
    } else if (token_len == 5 && !memcmp(l, "false", 5)) {
        return CX_JSON_FALSE;
    }
    return CX_JSON_NULL;
}

/* -------------------- read value functions -------------------- */

static int setup_read_value(CxJson *p) {
    p->readvalue_alloc = PARSER_READVALUE_ALLOC;
    p->readvalue_nelm = 0;
    p->readvalue_stack = calloc(p->readvalue_alloc, sizeof(CxJsonValue *));
    if (!p->readvalue_stack) return -1;
    p->read_value = NULL;
    p->readvalue_stack[0] = NULL;
    return 0;
}

static int add_to_parent(CxJson *p, CxJsonValue *parent, CxJsonValue *v) {
    if (!parent) {
        return -1; // shouldn't happen but who knows
    }
    if (parent->type == CX_JSON_OBJECT) {
        if (!p->value_name || p->value_name_len == 0) {
            return -1;
        }
        char *valuename = p->value_name;
        p->value_name = NULL;
        CxJsonObjValue newvalue;
        newvalue.name = valuename;
        newvalue.value = v;
        return cx_array_simple_add(parent->value.object.values, newvalue);
    } else if (parent->type == CX_JSON_ARRAY) {
        return cx_array_simple_add(parent->value.array.array, v);
    } else {
        return -1; // should also never happen
    }
}

static int readvaluestack_add(CxJson *p, CxJsonValue *v) {
    if (p->readvalue_nelm ==
            p->readvalue_alloc) {
        p->readvalue_alloc *= 2;
        if (cx_reallocate(&p->readvalue_stack, sizeof(CxJsonValue *) * p->readvalue_alloc)) {
            return -1;
        }
    }
    p->readvalue_stack[p->readvalue_nelm++] = v;
    return 0;
}

void cxJsonInit(CxJson *json) {
    memset(json, 0, sizeof(CxJson));
    json->states = json->states_internal;
    json->states_alloc = cx_nmemb(json->states_internal);
    // TODO: find better way to configure the initial allocation size for arrays and objects
    json->reader_array_alloc = 8;
}

void cxJsonDestroy(CxJson *p) {
    if (p->states != p->states_internal) {
        free(p->states);
    }
    free(p->readvalue_stack);
    cxJsonValueFree(p->read_value);
    free(p->value_name);
    free(p->value_str);
}

int cxJsonFilln(CxJson *p, const char *buf, size_t size) {
    // TODO: implement rescue buffer like in CxProperties to allow subsequent fills
    p->buffer = buf;
    p->size = size;
    p->pos = 0;
    return 0;
}

int cxJsonNext(CxJson *p, CxJsonValue **value) {
    // TODO: replace int with a status enum like in CxProperties
    *value = NULL; // TODO: maybe better initialize with NOTHING?

    if (!p->readvalue_stack) {
        if (setup_read_value(p)) return -1;
    }

    while (p->readvalue_nelm > 0 || !p->read_value) {
        if (p->value_ready) {
            // value available without another read
            CxJsonValue *v = calloc(1, sizeof(CxJsonValue));
            if (!v) return -1;
            if (p->readvalue_nelm > 0) {
                if (add_to_parent(p, p->readvalue_stack[p->readvalue_nelm - 1], v)) {
                    free(v);
                    return -1;
                }
            } else {
                // set this value as root
                p->read_value = v;
            }

            switch (p->reader_type) {
                case CX_JSON_READER_OBJECT_BEGIN: {
                    v->type = CX_JSON_OBJECT;
                    if (readvaluestack_add(p, v)) {
                        return -1;
                    }
                    break;
                }
                case CX_JSON_READER_OBJECT_END:
                    return -1; // should not happen
                case CX_JSON_READER_ARRAY_BEGIN: {
                    v->type = CX_JSON_ARRAY;
                    if (readvaluestack_add(p, v)) {
                        return -1;
                    }
                    break;
                }
                case CX_JSON_READER_ARRAY_END:
                    return -1; // should not happen
                case CX_JSON_READER_STRING: {
                    v->type = CX_JSON_STRING;
                    if (p->value_str) {
                        v->value.string.ptr = p->value_str;
                        v->value.string.length = p->value_str_len;
                        p->value_str = NULL;
                    }
                    break;
                }
                case CX_JSON_READER_INTEGER: {
                    v->type = CX_JSON_INTEGER;
                    v->value.integer = p->value_int;
                    break;
                }
                case CX_JSON_READER_NUMBER: {
                    v->type = CX_JSON_NUMBER;
                    v->value.number = p->value_double;
                    break;
                }
                case CX_JSON_READER_LITERAL: {
                    v->type = CX_JSON_LITERAL;
                    v->value.literal = json_reader_literal(p);
                    break;
                }
            }
        } else if (p->readvalue_initialized) {
            CxJsonReaderType rt = p->reader_type;
            if (rt == CX_JSON_READER_OBJECT_END || rt == CX_JSON_READER_ARRAY_END) {
                p->readvalue_nelm--;
            }
            // else: p->value_ready is 1, this will be handled in the next run
        }

        if (p->readvalue_nelm > 0 || !p->read_value) {
            int r = json_read(p);
            if (r != 1) {
                p->readvalue_initialized = 0;
                return r;
            }
            p->readvalue_initialized = 1;
        }
    }

    *value = p->read_value;
    p->readvalue_initialized = 0;
    p->read_value = NULL;
    return 1;
}

void cxJsonValueFree(CxJsonValue *value) {
    if (value == NULL || value == &cx_json_value_nothing) return;
    // TODO: discuss if we should keep freeing the stuff recursively
    switch (value->type) {
        case CX_JSON_OBJECT: {
            CxJsonObject obj = value->value.object;
            for (size_t i = 0; i < obj.values_size; i++) {
                cxJsonValueFree(obj.values[i].value);
                free(obj.values[i].name);
            }
            free(obj.values);
            break;
        }
        case CX_JSON_ARRAY: {
            CxJsonArray array = value->value.array;
            for (size_t i = 0; i < array.array_size; i++) {
                cxJsonValueFree(array.array[i]);
            }
            free(array.array);
            break;
        }
        case CX_JSON_STRING: {
            free(value->value.string.ptr);
            break;
        }
        default: {
            break;
        }
    }
    free(value);
}

CxJsonValue *cxJsonArrGet(CxJsonValue *value,
                          size_t index) {
    if (index >= value->value.array.array_size) {
        return &cx_json_value_nothing;
    }
    return value->value.array.array[index];
}

CxJsonValue *cxJsonObjGet(CxJsonValue *value, const char *name) {
    CxJsonObject *obj = &(value->value.object);
    // TODO: think about sorting the object so that we can use binary search here
    for (size_t i = 0; i < obj->values_size; i++) {
        // TODO: we might want to store names as cxmutstr
        if (0 == strcmp(name, obj->values[i].name)) {
            return obj->values[i].value;
        }
    }
    return &cx_json_value_nothing;
}
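
For orientation, a minimal usage sketch of the public functions defined above (cxJsonInit, cxJsonFilln, cxJsonNext, cxJsonObjGet, cxJsonValueFree, cxJsonDestroy). The example document and the exact error handling are illustrative assumptions, and it presumes the whole JSON text is passed in a single fill, since cxJsonFilln does not yet buffer across calls (see the TODO above); this is not part of json.c itself.

// Usage sketch (assumed example, not part of json.c): parse one complete
// document from a single buffer and read a member of the root object.
#include <stdio.h>
#include <inttypes.h>
#include <string.h>
#include "cx/json.h"

int main(void) {
    const char *text = "{\"answer\": 42}"; // hypothetical input

    CxJson json;
    cxJsonInit(&json);
    cxJsonFilln(&json, text, strlen(text));

    CxJsonValue *root = NULL;
    // per cxJsonNext above: 1 = value read, 0 = more data needed, -1 = error
    int status = cxJsonNext(&json, &root);
    if (status == 1 && root->type == CX_JSON_OBJECT) {
        CxJsonValue *answer = cxJsonObjGet(root, "answer");
        if (answer->type == CX_JSON_INTEGER) {
            printf("answer = %" PRId64 "\n", answer->value.integer);
        }
    }

    cxJsonValueFree(root);
    cxJsonDestroy(&json);
    return status == 1 ? 0 : 1;
}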