Sat, 28 Dec 2024 15:06:15 +0100
implement string to integer conversions
relates to #532
CHANGELOG | file | annotate | diff | comparison | revisions | |
src/string.c | file | annotate | diff | comparison | revisions | |
tests/test_string.c | file | annotate | diff | comparison | revisions |
--- a/CHANGELOG Fri Dec 27 13:01:31 2024 +0100 +++ b/CHANGELOG Sat Dec 28 15:06:15 2024 +0100 @@ -4,6 +4,7 @@ * adds properties.h * adds tree.h * adds json.h + * adds locale-independent string to number conversion functions * adds reallocarray() like functions to allocator.h * adds cxIterator() to create iterators over raw C arrays * adds cx_array_reallocator() and cx_array_default_reallocator
// src/string.c -- locale-independent string to integer conversion

/**
 * Converts a string to a signed long long.
 *
 * The string is first passed through cx_strtrim(). An optional leading '-'
 * marks a negative number; the remaining text (optional '+', optional base
 * prefix, digits and group separators) is parsed by cx_strtoull_lc().
 *
 * @param str      the string to convert
 * @param output   receives the converted value; only written on success
 * @param base     numeric base, must be 2, 8, 10, or 16
 * @param groupsep characters that are skipped as group separators
 *                 (NULL is treated like an empty string)
 * @return zero on success; non-zero on failure with errno set to
 *         EINVAL (not a valid number) or ERANGE (value does not fit)
 */
int cx_strtoll_lc(cxstring str, long long *output, int base, const char *groupsep) {
    // strategy: parse the magnitude as unsigned, check the range, then negate
    str = cx_strtrim(str);
    if (str.length == 0) {
        errno = EINVAL;
        return -1;
    }

    // consume a negative sign, if present
    bool neg = false;
    if (str.ptr[0] == '-') {
        neg = true;
        str = cx_strn(str.ptr + 1, str.length - 1);
        // a lone '-' and the sequence "-+" are not numbers
        if (str.length == 0 || str.ptr[0] == '+') {
            errno = EINVAL;
            return -1;
        }
    }

    // parse the magnitude
    unsigned long long v;
    int ret = cx_strtoull_lc(str, &v, base, groupsep);
    if (ret != 0) return ret;

    if (neg) {
        // the largest allowed magnitude is that of LLONG_MIN, i.e. LLONG_MAX + 1
        if (v > (unsigned long long) LLONG_MAX + 1) {
            errno = ERANGE;
            return -1;
        }
        // -(v - 1) - 1 never overflows, not even for LLONG_MIN;
        // zero is special-cased because v - 1 would wrap around
        *output = v == 0 ? 0 : -(long long) (v - 1) - 1;
    } else {
        if (v > LLONG_MAX) {
            errno = ERANGE;
            return -1;
        }
        *output = (long long) v;
    }
    return 0;
}

/**
 * Converts a string to an unsigned long long.
 *
 * The string is first passed through cx_strtrim(). A single leading '+' is
 * accepted. For base 2 the prefixes "b", "B", "0b", and "0B" are accepted;
 * for base 16 the prefixes "x", "X", "#", "0x", and "0X" are accepted.
 * Characters contained in groupsep are skipped wherever they appear between
 * the prefix and the end of the string.
 *
 * @param str      the string to convert
 * @param output   receives the converted value; only written on success
 * @param base     numeric base, must be 2, 8, 10, or 16
 * @param groupsep characters that are skipped as group separators
 *                 (NULL is treated like an empty string)
 * @return zero on success; -1 on failure with errno set to EINVAL
 *         (unsupported base, no digits, or invalid character) or
 *         ERANGE (value does not fit into unsigned long long)
 */
int cx_strtoull_lc(cxstring str, unsigned long long *output, int base, const char *groupsep) {
    // some sanity checks
    str = cx_strtrim(str);
    if (str.length == 0) {
        errno = EINVAL;
        return -1;
    }
    if (!(base == 2 || base == 8 || base == 10 || base == 16)) {
        errno = EINVAL;
        return -1;
    }
    if (groupsep == NULL) groupsep = "";

    // skip an explicit positive sign
    if (str.ptr[0] == '+') {
        str.ptr++;
        str.length--;
        if (str.length == 0) {
            errno = EINVAL;
            return -1;
        }
    }

    // for base 2 and base 16 an optional prefix may appear
    size_t start = 0;
    if (base == 2) {
        if (str.ptr[0] == 'b' || str.ptr[0] == 'B') {
            start = 1;
        } else if (str.ptr[0] == '0' && str.length > 1) {
            if (str.ptr[1] == 'b' || str.ptr[1] == 'B') {
                start = 2;
            }
        }
    } else if (base == 16) {
        if (str.ptr[0] == 'x' || str.ptr[0] == 'X' || str.ptr[0] == '#') {
            start = 1;
        } else if (str.ptr[0] == '0' && str.length > 1) {
            if (str.ptr[1] == 'x' || str.ptr[1] == 'X') {
                start = 2;
            }
        }
    }

    // there must be something left after the prefix
    if (start >= str.length) {
        errno = EINVAL;
        return -1;
    }

    // now parse the number
    const unsigned long long ubase = (unsigned long long) base;
    unsigned long long result = 0;
    bool have_digit = false;
    for (size_t i = start; i < str.length; i++) {
        unsigned char c = (unsigned char) str.ptr[i];

        // ignore group separators; the explicit NUL test is required because
        // strchr() would otherwise match the terminator of groupsep
        if (c != '\0' && strchr(groupsep, str.ptr[i]) != NULL) continue;

        // determine the digit value; every range is bounded on both sides so
        // that the characters between '9' and 'A' are not mistaken for digits
        unsigned digit;
        if (c >= '0' && c <= '9') digit = (unsigned) (c - '0');
        else if (c >= 'a' && c <= 'z') digit = 10u + (unsigned) (c - 'a');
        else if (c >= 'A' && c <= 'Z') digit = 10u + (unsigned) (c - 'A');
        else digit = 255u;
        if (digit >= ubase) {
            errno = EINVAL;
            return -1;
        }

        // check that result * base + digit fits before computing it
        if (result > (ULLONG_MAX - digit) / ubase) {
            errno = ERANGE;
            return -1;
        }
        result = result * ubase + digit;
        have_digit = true;
    }

    // a string consisting only of group separators is not a number
    if (!have_digit) {
        errno = EINVAL;
        return -1;
    }

    *output = result;
    return 0;
}
--- a/tests/test_string.c Fri Dec 27 13:01:31 2024 +0100 +++ b/tests/test_string.c Sat Dec 28 15:06:15 2024 +0100 @@ -1050,13 +1050,54 @@ // TODO: roll out base 2 tests, but that needs C23 // do some special case tests + // -------------------------- // can fit only in unsigned long long errno = 0; CX_TEST_ASSERT(0 != cx_strtoll(cx_str("0x8df9CE03AbC90815"), &ll, 16)); CX_TEST_ASSERT(errno == ERANGE); - // TODO: implement more special cases + // edge case: only the sign bit is set + errno = 0; + CX_TEST_ASSERT(0 != cx_strtoi16(cx_str("0x8000"), &i16, 16)); + CX_TEST_ASSERT(errno == ERANGE); + errno = 0; + CX_TEST_ASSERT(0 == cx_strtoi16(cx_str("-0x8000"), &i16, 16)); + CX_TEST_ASSERT(errno == 0); + CX_TEST_ASSERT(i16 == INT16_MIN); + errno = 0; + CX_TEST_ASSERT(0 != cx_strtoi64(cx_str("X8000000000000000"), &i64, 16)); + CX_TEST_ASSERT(errno == ERANGE); + errno = 0; + CX_TEST_ASSERT(0 == cx_strtoi64(cx_str("-X8000000000000000"), &i64, 16)); + CX_TEST_ASSERT(errno == 0); + CX_TEST_ASSERT(i64 == INT64_MIN); + + // group separators + CX_TEST_ASSERT(0 == cx_strtoi32(cx_str(" -123,456"), &i32, 10)); + CX_TEST_ASSERT(i32 == -123456); + errno = 0; + CX_TEST_ASSERT(0 != cx_strtoi16_lc(cx_str(" -Xab,cd"), &i16, 16, "'")); + CX_TEST_ASSERT(errno == EINVAL); + errno = 0; + CX_TEST_ASSERT(0 != cx_strtoi16_lc(cx_str(" -X'ab'cd"), &i16, 16, "'")); + CX_TEST_ASSERT(errno == ERANGE); + errno = 0; + CX_TEST_ASSERT(0 == cx_strtoi16_lc(cx_str(" -X'67'89"), &i16, 16, "'")); + CX_TEST_ASSERT(errno == 0); + CX_TEST_ASSERT(i16 == -0x6789); + + // binary and (unusual notation of) signed binary + errno = 0; + CX_TEST_ASSERT(0 != cx_strtoi8_lc(cx_str(" -1010 1011"), &i8, 2, " ")); + CX_TEST_ASSERT(errno == ERANGE); + errno = 0; + CX_TEST_ASSERT(0 != cx_strtoi8_lc(cx_str(" 1010 1011"), &i8, 2, " ")); + CX_TEST_ASSERT(errno == ERANGE); + errno = 0; + CX_TEST_ASSERT(0 == cx_strtoi8_lc(cx_str(" -0101 0101"), &i8, 2, " ")); + CX_TEST_ASSERT(errno == 0); + CX_TEST_ASSERT(i8 == -0x55); } } @@ 
-1098,8 +1139,27 @@ // TODO: roll out base 2 tests, but that needs C23 // do some special case tests + // -------------------------- - // TODO: implement tests + // group separators + CX_TEST_ASSERT(0 == cx_strtou32(cx_str(" 123,456"), &u32, 10)); + CX_TEST_ASSERT(u32 == 123456); + errno = 0; + CX_TEST_ASSERT(0 != cx_strtou16_lc(cx_str(" ab,cd"), &u16, 16, "'")); + CX_TEST_ASSERT(errno == EINVAL); + errno = 0; + CX_TEST_ASSERT(0 == cx_strtou16_lc(cx_str(" ab'cd"), &u16, 16, "'")); + CX_TEST_ASSERT(errno == 0); + CX_TEST_ASSERT(u16 == 0xabcd); + + // binary + errno = 0; + CX_TEST_ASSERT(0 != cx_strtou8_lc(cx_str("1 1010 1011"), &u8, 2, " ")); + CX_TEST_ASSERT(errno == ERANGE); + errno = 0; + CX_TEST_ASSERT(0 == cx_strtou8_lc(cx_str(" 1010 1011"), &u8, 2, " ")); + CX_TEST_ASSERT(errno == 0); + CX_TEST_ASSERT(u8 == 0xAB); } } @@ -1108,6 +1168,15 @@ CX_TEST_DO { CX_TEST_ASSERT(0 == cx_strtof(cx_str("11.3"), &f)); CX_TEST_ASSERT(11.3f == f); + + CX_TEST_ASSERT(0 == cx_strtof(cx_str("1.67262192595e-27"), &f)); + CX_TEST_ASSERT(1.67262192595e-27f == f); + + CX_TEST_ASSERT(0 == cx_strtof_lc(cx_str("138,339.4"), &f, '.', ",")); + CX_TEST_ASSERT(138339.4f == f); + + CX_TEST_ASSERT(0 == cx_strtof_lc(cx_str("138,339.4"), &f, ',', ".")); + CX_TEST_ASSERT(138.3394f == f); } } @@ -1116,6 +1185,15 @@ CX_TEST_DO { CX_TEST_ASSERT(0 == cx_strtod(cx_str("11.3"), &d)); CX_TEST_ASSERT(11.3 == d); + + CX_TEST_ASSERT(0 == cx_strtod(cx_str("1.67262192595e-27"), &d)); + CX_TEST_ASSERT(1.67262192595e-27 == d); + + CX_TEST_ASSERT(0 == cx_strtod_lc(cx_str("138,339.4"), &d, '.', ",")); + CX_TEST_ASSERT(138339.4 == d); + + CX_TEST_ASSERT(0 == cx_strtod_lc(cx_str("138,339.4"), &d, ',', ".")); + CX_TEST_ASSERT(138.3394 == d); } }