Sun, 15 Dec 2024 11:11:16 +0100
and changelog entry for issue #523
/* * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. * * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "cx/string.h" #include <string.h> #include <stdarg.h> #include <ctype.h> #ifndef _WIN32 #include <strings.h> // for strncasecmp() #endif // _WIN32 cxmutstr cx_mutstr(char *cstring) { return (cxmutstr) {cstring, strlen(cstring)}; } cxmutstr cx_mutstrn( char *cstring, size_t length ) { return (cxmutstr) {cstring, length}; } cxstring cx_str(const char *cstring) { return (cxstring) {cstring, strlen(cstring)}; } cxstring cx_strn( const char *cstring, size_t length ) { return (cxstring) {cstring, length}; } cxstring cx_strcast(cxmutstr str) { return (cxstring) {str.ptr, str.length}; } void cx_strfree(cxmutstr *str) { if (str == NULL) return; free(str->ptr); str->ptr = NULL; str->length = 0; } void cx_strfree_a( const CxAllocator *alloc, cxmutstr *str ) { if (str == NULL) return; cxFree(alloc, str->ptr); str->ptr = NULL; str->length = 0; } size_t cx_strlen( size_t count, ... ) { if (count == 0) return 0; va_list ap; va_start(ap, count); size_t size = 0; for (size_t i = 0; i < count; i++) { cxstring str = va_arg(ap, cxstring); size += str.length; } va_end(ap); return size; } cxmutstr cx_strcat_ma( const CxAllocator *alloc, cxmutstr str, size_t count, ... ) { if (count == 0) return str; cxstring strings_stack[8]; cxstring *strings; if (count > 8) { strings = calloc(count, sizeof(cxstring)); if (strings == NULL) { return (cxmutstr) {NULL, 0}; } } else { strings = strings_stack; } va_list ap; va_start(ap, count); // get all args and overall length size_t slen = str.length; for (size_t i = 0; i < count; i++) { cxstring s = va_arg (ap, cxstring); strings[i] = s; slen += s.length; } va_end(ap); // reallocate or create new string char *newstr; if (str.ptr == NULL) { newstr = cxMalloc(alloc, slen + 1); } else { newstr = cxRealloc(alloc, str.ptr, slen + 1); } if (newstr == NULL) { free(strings); return (cxmutstr) {NULL, 0}; } str.ptr = newstr; // concatenate strings size_t pos = str.length; str.length = slen; for (size_t i = 0; i < count; i++) { cxstring s = strings[i]; memcpy(str.ptr + pos, s.ptr, s.length); pos += s.length; } // terminate string str.ptr[str.length] = '\0'; // free temporary array if (strings != strings_stack) { free(strings); } return str; } cxstring cx_strsubs( cxstring string, size_t start ) { return cx_strsubsl(string, start, string.length - start); } cxmutstr cx_strsubs_m( cxmutstr string, size_t start ) { return cx_strsubsl_m(string, start, string.length - start); } cxstring cx_strsubsl( cxstring string, size_t start, size_t length ) { if (start > string.length) { return (cxstring) {NULL, 0}; } size_t rem_len = string.length - start; if (length > rem_len) { length = rem_len; } return (cxstring) {string.ptr + start, length}; } cxmutstr cx_strsubsl_m( cxmutstr string, size_t start, size_t length ) { cxstring result = cx_strsubsl(cx_strcast(string), start, length); return (cxmutstr) {(char *) result.ptr, result.length}; } cxstring cx_strchr( cxstring string, int chr ) { chr = 0xFF & chr; // TODO: improve by comparing multiple bytes at once for (size_t i = 0; i < string.length; i++) { if (string.ptr[i] == chr) { return cx_strsubs(string, i); } } return (cxstring) {NULL, 0}; } cxmutstr cx_strchr_m( cxmutstr string, int chr ) { cxstring result = cx_strchr(cx_strcast(string), chr); return (cxmutstr) {(char *) result.ptr, result.length}; } cxstring cx_strrchr( cxstring string, int chr ) { chr = 0xFF & chr; size_t i = string.length; while (i > 0) { i--; // TODO: improve by comparing multiple bytes at once if (string.ptr[i] == chr) { return cx_strsubs(string, i); } } return (cxstring) {NULL, 0}; } cxmutstr cx_strrchr_m( cxmutstr string, int chr ) { cxstring result = cx_strrchr(cx_strcast(string), chr); return (cxmutstr) {(char *) result.ptr, result.length}; } #ifndef CX_STRSTR_SBO_SIZE #define CX_STRSTR_SBO_SIZE 512 #endif const unsigned cx_strstr_sbo_size = CX_STRSTR_SBO_SIZE; cxstring cx_strstr( cxstring haystack, cxstring needle ) { if (needle.length == 0) { return haystack; } // optimize for single-char needles if (needle.length == 1) { return cx_strchr(haystack, *needle.ptr); } /* * IMPORTANT: * Our prefix table contains the prefix length PLUS ONE * this is our decision, because we want to use the full range of size_t. * The original algorithm needs a (-1) at one single place, * and we want to avoid that. */ // local prefix table size_t s_prefix_table[CX_STRSTR_SBO_SIZE]; // check needle length and use appropriate prefix table // if the pattern exceeds static prefix table, allocate on the heap bool useheap = needle.length >= CX_STRSTR_SBO_SIZE; register size_t *ptable = useheap ? calloc(needle.length + 1, sizeof(size_t)) : s_prefix_table; // keep counter in registers register size_t i, j; // fill prefix table i = 0; j = 0; ptable[i] = j; while (i < needle.length) { while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) { j = ptable[j - 1]; } i++; j++; ptable[i] = j; } // search cxstring result = {NULL, 0}; i = 0; j = 1; while (i < haystack.length) { while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) { j = ptable[j - 1]; } i++; j++; if (j - 1 == needle.length) { size_t start = i - needle.length; result.ptr = haystack.ptr + start; result.length = haystack.length - start; break; } } // if prefix table was allocated on the heap, free it if (ptable != s_prefix_table) { free(ptable); } return result; } cxmutstr cx_strstr_m( cxmutstr haystack, cxstring needle ) { cxstring result = cx_strstr(cx_strcast(haystack), needle); return (cxmutstr) {(char *) result.ptr, result.length}; } size_t cx_strsplit( cxstring string, cxstring delim, size_t limit, cxstring *output ) { // special case: output limit is zero if (limit == 0) return 0; // special case: delimiter is empty if (delim.length == 0) { output[0] = string; return 1; } // special cases: delimiter is at least as large as the string if (delim.length >= string.length) { // exact match if (cx_strcmp(string, delim) == 0) { output[0] = cx_strn(string.ptr, 0); output[1] = cx_strn(string.ptr + string.length, 0); return 2; } else { // no match possible output[0] = string; return 1; } } size_t n = 0; cxstring curpos = string; while (1) { ++n; cxstring match = cx_strstr(curpos, delim); if (match.length > 0) { // is the limit reached? if (n < limit) { // copy the current string to the array cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr); output[n - 1] = item; size_t processed = item.length + delim.length; curpos.ptr += processed; curpos.length -= processed; } else { // limit reached, copy the _full_ remaining string output[n - 1] = curpos; break; } } else { // no more matches, copy last string output[n - 1] = curpos; break; } } return n; } size_t cx_strsplit_a( const CxAllocator *allocator, cxstring string, cxstring delim, size_t limit, cxstring **output ) { // find out how many splits we're going to make and allocate memory size_t n = 0; cxstring curpos = string; while (1) { ++n; cxstring match = cx_strstr(curpos, delim); if (match.length > 0) { // is the limit reached? if (n < limit) { size_t processed = match.ptr - curpos.ptr + delim.length; curpos.ptr += processed; curpos.length -= processed; } else { // limit reached break; } } else { // no more matches break; } } *output = cxCalloc(allocator, n, sizeof(cxstring)); return cx_strsplit(string, delim, n, *output); } size_t cx_strsplit_m( cxmutstr string, cxstring delim, size_t limit, cxmutstr *output ) { return cx_strsplit(cx_strcast(string), delim, limit, (cxstring *) output); } size_t cx_strsplit_ma( const CxAllocator *allocator, cxmutstr string, cxstring delim, size_t limit, cxmutstr **output ) { return cx_strsplit_a(allocator, cx_strcast(string), delim, limit, (cxstring **) output); } int cx_strcmp( cxstring s1, cxstring s2 ) { if (s1.length == s2.length) { return memcmp(s1.ptr, s2.ptr, s1.length); } else if (s1.length > s2.length) { return 1; } else { return -1; } } int cx_strcasecmp( cxstring s1, cxstring s2 ) { if (s1.length == s2.length) { #ifdef _WIN32 return _strnicmp(s1.ptr, s2.ptr, s1.length); #else return strncasecmp(s1.ptr, s2.ptr, s1.length); #endif } else if (s1.length > s2.length) { return 1; } else { return -1; } } int cx_strcmp_p( const void *s1, const void *s2 ) { const cxstring *left = s1; const cxstring *right = s2; return cx_strcmp(*left, *right); } int cx_strcasecmp_p( const void *s1, const void *s2 ) { const cxstring *left = s1; const cxstring *right = s2; return cx_strcasecmp(*left, *right); } cxmutstr cx_strdup_a( const CxAllocator *allocator, cxstring string ) { cxmutstr result = { cxMalloc(allocator, string.length + 1), string.length }; if (result.ptr == NULL) { result.length = 0; return result; } memcpy(result.ptr, string.ptr, string.length); result.ptr[string.length] = '\0'; return result; } cxstring cx_strtrim(cxstring string) { cxstring result = string; // TODO: optimize by comparing multiple bytes at once while (result.length > 0 && isspace(*result.ptr)) { result.ptr++; result.length--; } while (result.length > 0 && isspace(result.ptr[result.length - 1])) { result.length--; } return result; } cxmutstr cx_strtrim_m(cxmutstr string) { cxstring result = cx_strtrim(cx_strcast(string)); return (cxmutstr) {(char *) result.ptr, result.length}; } bool cx_strprefix( cxstring string, cxstring prefix ) { if (string.length < prefix.length) return false; return memcmp(string.ptr, prefix.ptr, prefix.length) == 0; } bool cx_strsuffix( cxstring string, cxstring suffix ) { if (string.length < suffix.length) return false; return memcmp(string.ptr + string.length - suffix.length, suffix.ptr, suffix.length) == 0; } bool cx_strcaseprefix( cxstring string, cxstring prefix ) { if (string.length < prefix.length) return false; #ifdef _WIN32 return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0; #else return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0; #endif } bool cx_strcasesuffix( cxstring string, cxstring suffix ) { if (string.length < suffix.length) return false; #ifdef _WIN32 return _strnicmp(string.ptr+string.length-suffix.length, suffix.ptr, suffix.length) == 0; #else return strncasecmp(string.ptr + string.length - suffix.length, suffix.ptr, suffix.length) == 0; #endif } void cx_strlower(cxmutstr string) { for (size_t i = 0; i < string.length; i++) { string.ptr[i] = (char) tolower(string.ptr[i]); } } void cx_strupper(cxmutstr string) { for (size_t i = 0; i < string.length; i++) { string.ptr[i] = (char) toupper(string.ptr[i]); } } #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64 #endif struct cx_strreplace_ibuf { size_t *buf; struct cx_strreplace_ibuf *next; unsigned int len; }; static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) { while (buf) { struct cx_strreplace_ibuf *next = buf->next; free(buf->buf); free(buf); buf = next; } } cxmutstr cx_strreplacen_a( const CxAllocator *allocator, cxstring str, cxstring pattern, cxstring replacement, size_t replmax ) { if (pattern.length == 0 || pattern.length > str.length || replmax == 0) return cx_strdup_a(allocator, str); // Compute expected buffer length size_t ibufmax = str.length / pattern.length; size_t ibuflen = replmax < ibufmax ? replmax : ibufmax; if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) { ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE; } // Allocate first index buffer struct cx_strreplace_ibuf *firstbuf, *curbuf; firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf)); if (!firstbuf) return cx_mutstrn(NULL, 0); firstbuf->buf = calloc(ibuflen, sizeof(size_t)); if (!firstbuf->buf) { free(firstbuf); return cx_mutstrn(NULL, 0); } // Search occurrences cxstring searchstr = str; size_t found = 0; do { cxstring match = cx_strstr(searchstr, pattern); if (match.length > 0) { // Allocate next buffer in chain, if required if (curbuf->len == ibuflen) { struct cx_strreplace_ibuf *nextbuf = calloc(1, sizeof(struct cx_strreplace_ibuf)); if (!nextbuf) { cx_strrepl_free_ibuf(firstbuf); return cx_mutstrn(NULL, 0); } nextbuf->buf = calloc(ibuflen, sizeof(size_t)); if (!nextbuf->buf) { free(nextbuf); cx_strrepl_free_ibuf(firstbuf); return cx_mutstrn(NULL, 0); } curbuf->next = nextbuf; curbuf = nextbuf; } // Record match index found++; size_t idx = match.ptr - str.ptr; curbuf->buf[curbuf->len++] = idx; searchstr.ptr = match.ptr + pattern.length; searchstr.length = str.length - idx - pattern.length; } else { break; } } while (searchstr.length > 0 && found < replmax); // Allocate result string cxmutstr result; { ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length; size_t rcount = 0; curbuf = firstbuf; do { rcount += curbuf->len; curbuf = curbuf->next; } while (curbuf); result.length = str.length + rcount * adjlen; result.ptr = cxMalloc(allocator, result.length + 1); if (!result.ptr) { cx_strrepl_free_ibuf(firstbuf); return cx_mutstrn(NULL, 0); } } // Build result string curbuf = firstbuf; size_t srcidx = 0; char *destptr = result.ptr; do { for (size_t i = 0; i < curbuf->len; i++) { // Copy source part up to next match size_t idx = curbuf->buf[i]; size_t srclen = idx - srcidx; if (srclen > 0) { memcpy(destptr, str.ptr + srcidx, srclen); destptr += srclen; srcidx += srclen; } // Copy the replacement and skip the source pattern srcidx += pattern.length; memcpy(destptr, replacement.ptr, replacement.length); destptr += replacement.length; } curbuf = curbuf->next; } while (curbuf); memcpy(destptr, str.ptr + srcidx, str.length - srcidx); // Result is guaranteed to be zero-terminated result.ptr[result.length] = '\0'; // Free index buffer cx_strrepl_free_ibuf(firstbuf); return result; } CxStrtokCtx cx_strtok( cxstring str, cxstring delim, size_t limit ) { CxStrtokCtx ctx; ctx.str = str; ctx.delim = delim; ctx.limit = limit; ctx.pos = 0; ctx.next_pos = 0; ctx.delim_pos = 0; ctx.found = 0; ctx.delim_more = NULL; ctx.delim_more_count = 0; return ctx; } CxStrtokCtx cx_strtok_m( cxmutstr str, cxstring delim, size_t limit ) { return cx_strtok(cx_strcast(str), delim, limit); } bool cx_strtok_next( CxStrtokCtx *ctx, cxstring *token ) { // abortion criteria if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) { return false; } // determine the search start cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos); // search the next delimiter cxstring delim = cx_strstr(haystack, ctx->delim); // if found, make delim capture exactly the delimiter if (delim.length > 0) { delim.length = ctx->delim.length; } // if more delimiters are specified, check them now if (ctx->delim_more_count > 0) { for (size_t i = 0; i < ctx->delim_more_count; i++) { cxstring d = cx_strstr(haystack, ctx->delim_more[i]); if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) { delim.ptr = d.ptr; delim.length = ctx->delim_more[i].length; } } } // store the token information and adjust the context ctx->found++; ctx->pos = ctx->next_pos; token->ptr = &ctx->str.ptr[ctx->pos]; ctx->delim_pos = delim.length == 0 ? ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr); token->length = ctx->delim_pos - ctx->pos; ctx->next_pos = ctx->delim_pos + delim.length; return true; } bool cx_strtok_next_m( CxStrtokCtx *ctx, cxmutstr *token ) { return cx_strtok_next(ctx, (cxstring *) token); } void cx_strtok_delim( CxStrtokCtx *ctx, const cxstring *delim, size_t count ) { ctx->delim_more = delim; ctx->delim_more_count = count; }