implement string to integer conversions

Sat, 28 Dec 2024 15:06:15 +0100

author
Mike Becker <universe@uap-core.de>
date
Sat, 28 Dec 2024 15:06:15 +0100
changeset 1061
c7d23892eab5
parent 1060
0a7c1bb2372d
child 1062
8baed9b38bc6

implement string to integer conversions

relates to #532

CHANGELOG file | annotate | diff | comparison | revisions
src/string.c file | annotate | diff | comparison | revisions
tests/test_string.c file | annotate | diff | comparison | revisions
--- a/CHANGELOG	Fri Dec 27 13:01:31 2024 +0100
+++ b/CHANGELOG	Sat Dec 28 15:06:15 2024 +0100
@@ -4,6 +4,7 @@
  * adds properties.h
  * adds tree.h
  * adds json.h
+ * adds locale-independent string to number conversion functions
  * adds reallocarray() like functions to allocator.h
  * adds cxIterator() to create iterators over raw C arrays
  * adds cx_array_reallocator() and cx_array_default_reallocator
--- a/src/string.c	Fri Dec 27 13:01:31 2024 +0100
+++ b/src/string.c	Sat Dec 28 15:06:15 2024 +0100
@@ -839,17 +839,49 @@
 }
 
 int cx_strtoll_lc(cxstring str, long long *output, int base, const char *groupsep) {
-    // TODO: replace temporary implementation
-    (void) groupsep; // unused in temp impl
-    char *s = malloc(str.length + 1);
-    memcpy(s, str.ptr, str.length);
-    s[str.length] = '\0';
-    char *e;
-    errno = 0;
-    *output = strtoll(s, &e, base);
-    int r = errno || !(e && *e == '\0');
-    free(s);
-    return r;
+    // strategy: parse the magnitude as unsigned, check its range, negate if required
+    bool neg = false;
+    size_t start_unsigned = 0;
+
+    // trim first, so that a sign character can only appear at index zero
+    str = cx_strtrim(str);
+    if (str.length == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    // test if we have a negative sign character
+    if (str.ptr[start_unsigned] == '-') {
+        neg = true;
+        start_unsigned++;
+        // a lone '-' or a '-' directly followed by '+' is not a number
+        if (str.length == 1 || str.ptr[start_unsigned] == '+') {
+            errno = EINVAL;
+            return -1;
+        }
+    }
+
+    // now parse the magnitude (everything after the '-') with cx_strtoull_lc
+    unsigned long long v;
+    cxstring ustr = start_unsigned == 0 ? str
+        : cx_strn(str.ptr + start_unsigned, str.length - start_unsigned);
+    int ret = cx_strtoull_lc(ustr, &v, base, groupsep);
+    if (ret != 0) return ret; // propagate the failure, *output stays untouched
+    if (neg) {
+        if (v > (unsigned long long) LLONG_MAX + 1) { // largest magnitude is |LLONG_MIN|
+            errno = ERANGE;
+            return -1;
+        }
+        *output = v == 0 ? 0 : -(long long) (v - 1) - 1; // v - 1 always fits, so no overflow
+        return 0;
+    } else {
+        if (v > LLONG_MAX) {
+            errno = ERANGE;
+            return -1;
+        }
+        *output = (long long) v;
+        return 0;
+    }
 }
 
 int cx_strtoi8_lc(cxstring str, int8_t *output, int base, const char *groupsep) {
@@ -904,16 +936,83 @@
 }
 
 int cx_strtoull_lc(cxstring str, unsigned long long *output, int base, const char *groupsep) {
-    // TODO: replace temporary implementation
-    (void) groupsep; // unused in temp impl
-    char *s = malloc(str.length + 1);
-    memcpy(s, str.ptr, str.length);
-    s[str.length] = '\0';
-    char *e;
-    *output = strtoull(s, &e, base);
-    int r = !(e && *e == '\0');
-    free(s);
-    return r;
+    // sanity checks: non-empty input (after trimming) and a supported base (2, 8, 10, 16)
+    str = cx_strtrim(str);
+    if (str.length == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (!(base == 2 || base == 8 || base == 10 || base == 16)) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (groupsep == NULL) groupsep = ""; // NULL means: no group separators
+
+    // find the actual start of the number (skip one optional '+' sign)
+    if (str.ptr[0] == '+') {
+        str.ptr++;
+        str.length--;
+        if (str.length == 0) {
+            errno = EINVAL;
+            return -1;
+        }
+    }
+    size_t start = 0;
+
+    // if base is 2 or 16, a prefix may appear: b, B, 0b, 0B or x, X, #, 0x, 0X
+    if (base == 2) {
+        if (str.ptr[0] == 'b' || str.ptr[0] == 'B') {
+            start = 1;
+        } else if (str.ptr[0] == '0' && str.length > 1) {
+            if (str.ptr[1] == 'b' || str.ptr[1] == 'B') {
+                start = 2;
+            }
+        }
+    } else if (base == 16) {
+        if (str.ptr[0] == 'x' || str.ptr[0] == 'X' || str.ptr[0] == '#') {
+            start = 1;
+        } else if (str.ptr[0] == '0' && str.length > 1) {
+            if (str.ptr[1] == 'x' || str.ptr[1] == 'X') {
+                start = 2;
+            }
+        }
+    }
+
+    // check if there are digits left
+    if (start >= str.length) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    // now parse the number
+    unsigned long long result = 0;
+    for (size_t i = start; i < str.length; i++) {
+        // ignore group separators (a NUL byte is never one; strchr would match the terminator)
+        if (str.ptr[i] != '\0' && strchr(groupsep, str.ptr[i])) continue;
+
+        // determine the digit value of the character (255 marks an invalid character)
+        unsigned char c = str.ptr[i];
+        if (c >= '0' && c <= '9') c = c - '0';
+        else if (c >= 'a' && c <= 'z') c = 10 + (c - 'a');
+        else if (c >= 'A' && c <= 'Z') c = 10 + (c - 'A');
+        else c = 255;
+        if (c >= base) {
+            errno = EINVAL;
+            return -1;
+        }
+
+        // compute result * base + c in two parts (upper bits / lowest byte) to detect overflow
+        unsigned long right = (result & 0xff) * base + c;
+        unsigned long long left = (result >> 8) * base + (right >> 8);
+        if (left > (ULLONG_MAX >> 8)) {
+            errno = ERANGE;
+            return -1;
+        }
+        result = (left << 8) + (right & 0xff);
+    }
+
+    *output = result;
+    return 0;
 }
 
 int cx_strtou8_lc(cxstring str, uint8_t *output, int base, const char *groupsep) {
--- a/tests/test_string.c	Fri Dec 27 13:01:31 2024 +0100
+++ b/tests/test_string.c	Sat Dec 28 15:06:15 2024 +0100
@@ -1050,13 +1050,54 @@
         // TODO: roll out base 2 tests, but that needs C23
 
         // do some special case tests
+        // --------------------------
 
         // can fit only in unsigned long long
         errno = 0;
         CX_TEST_ASSERT(0 != cx_strtoll(cx_str("0x8df9CE03AbC90815"), &ll, 16));
         CX_TEST_ASSERT(errno == ERANGE);
 
-        // TODO: implement more special cases
+        // edge case: only the sign bit is set
+        errno = 0;
+        CX_TEST_ASSERT(0 != cx_strtoi16(cx_str("0x8000"), &i16, 16));
+        CX_TEST_ASSERT(errno == ERANGE);
+        errno = 0;
+        CX_TEST_ASSERT(0 == cx_strtoi16(cx_str("-0x8000"), &i16, 16));
+        CX_TEST_ASSERT(errno == 0);
+        CX_TEST_ASSERT(i16 == INT16_MIN);
+        errno = 0;
+        CX_TEST_ASSERT(0 != cx_strtoi64(cx_str("X8000000000000000"), &i64, 16));
+        CX_TEST_ASSERT(errno == ERANGE);
+        errno = 0;
+        CX_TEST_ASSERT(0 == cx_strtoi64(cx_str("-X8000000000000000"), &i64, 16));
+        CX_TEST_ASSERT(errno == 0);
+        CX_TEST_ASSERT(i64 == INT64_MIN);
+
+        // group separators
+        CX_TEST_ASSERT(0 == cx_strtoi32(cx_str("  -123,456"), &i32, 10));
+        CX_TEST_ASSERT(i32 == -123456);
+        errno = 0;
+        CX_TEST_ASSERT(0 != cx_strtoi16_lc(cx_str("  -Xab,cd"), &i16, 16, "'"));
+        CX_TEST_ASSERT(errno == EINVAL);
+        errno = 0;
+        CX_TEST_ASSERT(0 != cx_strtoi16_lc(cx_str("  -X'ab'cd"), &i16, 16, "'"));
+        CX_TEST_ASSERT(errno == ERANGE);
+        errno = 0;
+        CX_TEST_ASSERT(0 == cx_strtoi16_lc(cx_str("  -X'67'89"), &i16, 16, "'"));
+        CX_TEST_ASSERT(errno == 0);
+        CX_TEST_ASSERT(i16 == -0x6789);
+
+        // binary and (unusual notation of) signed binary
+        errno = 0;
+        CX_TEST_ASSERT(0 != cx_strtoi8_lc(cx_str(" -1010 1011"), &i8, 2, " "));
+        CX_TEST_ASSERT(errno == ERANGE);
+        errno = 0;
+        CX_TEST_ASSERT(0 != cx_strtoi8_lc(cx_str(" 1010 1011"), &i8, 2, " "));
+        CX_TEST_ASSERT(errno == ERANGE);
+        errno = 0;
+        CX_TEST_ASSERT(0 == cx_strtoi8_lc(cx_str(" -0101 0101"), &i8, 2, " "));
+        CX_TEST_ASSERT(errno == 0);
+        CX_TEST_ASSERT(i8 == -0x55);
     }
 }
 
@@ -1098,8 +1139,27 @@
         // TODO: roll out base 2 tests, but that needs C23
 
         // do some special case tests
+        // --------------------------
 
-        // TODO: implement tests
+        // group separators
+        CX_TEST_ASSERT(0 == cx_strtou32(cx_str("  123,456"), &u32, 10));
+        CX_TEST_ASSERT(u32 == 123456);
+        errno = 0;
+        CX_TEST_ASSERT(0 != cx_strtou16_lc(cx_str("  ab,cd"), &u16, 16, "'"));
+        CX_TEST_ASSERT(errno == EINVAL);
+        errno = 0;
+        CX_TEST_ASSERT(0 == cx_strtou16_lc(cx_str("  ab'cd"), &u16, 16, "'"));
+        CX_TEST_ASSERT(errno == 0);
+        CX_TEST_ASSERT(u16 == 0xabcd);
+
+        // binary
+        errno = 0;
+        CX_TEST_ASSERT(0 != cx_strtou8_lc(cx_str("1 1010 1011"), &u8, 2, " "));
+        CX_TEST_ASSERT(errno == ERANGE);
+        errno = 0;
+        CX_TEST_ASSERT(0 == cx_strtou8_lc(cx_str(" 1010 1011"), &u8, 2, " "));
+        CX_TEST_ASSERT(errno == 0);
+        CX_TEST_ASSERT(u8 == 0xAB);
     }
 }
 
@@ -1108,6 +1168,15 @@
     CX_TEST_DO {
         CX_TEST_ASSERT(0 == cx_strtof(cx_str("11.3"), &f));
         CX_TEST_ASSERT(11.3f == f);
+
+        CX_TEST_ASSERT(0 == cx_strtof(cx_str("1.67262192595e-27"), &f));
+        CX_TEST_ASSERT(1.67262192595e-27f == f);
+
+        CX_TEST_ASSERT(0 == cx_strtof_lc(cx_str("138,339.4"), &f, '.', ","));
+        CX_TEST_ASSERT(138339.4f == f);
+
+        CX_TEST_ASSERT(0 == cx_strtof_lc(cx_str("138,339.4"), &f, ',', "."));
+        CX_TEST_ASSERT(138.3394f == f);
     }
 }
 
@@ -1116,6 +1185,15 @@
     CX_TEST_DO {
         CX_TEST_ASSERT(0 == cx_strtod(cx_str("11.3"), &d));
         CX_TEST_ASSERT(11.3 == d);
+
+        CX_TEST_ASSERT(0 == cx_strtod(cx_str("1.67262192595e-27"), &d));
+        CX_TEST_ASSERT(1.67262192595e-27 == d);
+
+        CX_TEST_ASSERT(0 == cx_strtod_lc(cx_str("138,339.4"), &d, '.', ","));
+        CX_TEST_ASSERT(138339.4 == d);
+
+        CX_TEST_ASSERT(0 == cx_strtod_lc(cx_str("138,339.4"), &d, ',', "."));
+        CX_TEST_ASSERT(138.3394 == d);
     }
 }
 

mercurial