Thu, 23 Feb 2017 14:30:12 +0100
improves sstrstr function by using KMP string search algorithm
test/string_tests.c | file | annotate | diff | comparison | revisions | |
ucx/string.c | file | annotate | diff | comparison | revisions |
--- a/test/string_tests.c Mon Feb 20 17:28:58 2017 +0100 +++ b/test/string_tests.c Thu Feb 23 14:30:12 2017 +0100 @@ -81,6 +81,34 @@ UCX_TEST(test_sstrstr) { sstr_t str = ST("find the match in this string"); + sstr_t longstr = ST( + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkl" + "mnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwx" + "yzabcdeababababnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij" + "klmnopqrstuvwxyzaababababababababrstuvwxyzabcdefghijklmnopqrstuv" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "wxyz1234567890"); + sstr_t longstrpattern = ST( + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + ); + sstr_t longstrresult = ST( + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "abababababababababababababababababababababababababababababababab" + "wxyz1234567890" + ); UCX_TEST_BEGIN sstr_t notfound = sstrstr(str, S("no match")); @@ -97,6 +125,12 @@ UCX_TEST_ASSERT(!strcmp(str.ptr, result.ptr), "sstrstr with empty match string did not return the original string"); + result = sstrstr(longstr, longstrpattern); + UCX_TEST_ASSERT(result.length == longstrresult.length, + "long string result length incorrect"); + UCX_TEST_ASSERT(!strcmp(result.ptr, longstrresult.ptr), + "long string result content incorrect"); + UCX_TEST_END }
--- a/ucx/string.c Mon Feb 20 17:28:58 2017 +0100 +++ b/ucx/string.c Thu Feb 23 14:30:12 2017 +0100 @@ -29,6 +29,7 @@ #include <stdlib.h> #include <string.h> #include <stdarg.h> +#include <stdint.h> #include <ctype.h> #include "string.h" @@ -175,22 +176,91 @@ return n; } +typedef size_t(*prefix_table_rfun)(void*,size_t); +typedef void(*prefix_table_wfun)(void*,size_t,size_t); + +static size_t s_prefix_table_r(void* table, size_t index) { + return (size_t) ((uint8_t*)table)[index]; +} +static void s_prefix_table_w(void* table, size_t index, size_t value) { + ((uint8_t*)table)[index] = (uint8_t) value; +} +static size_t h_prefix_table_r(void* table, size_t index) { + return ((size_t*)table)[index]; +} +static void h_prefix_table_w(void* table, size_t index, size_t value) { + ((size_t*)table)[index] = value; +} + sstr_t sstrstr(sstr_t string, sstr_t match) { if (match.length == 0) { return string; } - for (size_t i = 0 ; i < string.length ; i++) { - sstr_t substr = sstrsubs(string, i); - if (sstrprefix(substr, match)) { - return substr; + /* prepare default return value in case of no match */ + sstr_t result = sstrn(NULL, 0); + + /* + * IMPORTANT: + * our prefix table contains the prefix length PLUS ONE + * this is our decision, because we want to use the full range of size_t + * the original algorithm needs a (-1) at one single place + * and we want to avoid that + */ + + /* static prefix table */ + static uint8_t s_prefix_table[256]; + + /* check pattern length and use appropriate prefix table */ + register void* ptable; + prefix_table_rfun ptable_r; + prefix_table_wfun ptable_w; + if (match.length > 255) { + /* pattern exceeds static prefix table, allocate on the heap */ + ptable = calloc(match.length+1, sizeof(size_t)); + ptable_r = h_prefix_table_r; + ptable_w = h_prefix_table_w; + } else { + ptable = s_prefix_table; + ptable_r = s_prefix_table_r; + ptable_w = s_prefix_table_w; + } + + /* keep counter in registers */ + register size_t i, j; + + /* fill prefix table */ + i = 0; j = 0; + ptable_w(ptable, i, j); + while (i < match.length) { + while (j >= 1 && match.ptr[j-1] != match.ptr[i]) { + j = ptable_r(ptable, j-i); + } + i++; j++; + ptable_w(ptable, i, j); + } + + /* search */ + i = 0; j = 1; + while (i < string.length) { + while (j >= 1 && string.ptr[i] != match.ptr[j-1]) { + j = ptable_r(ptable, j-1); + } + i++; j++; + if (j-1 == match.length) { + size_t start = i - match.length; + result.ptr = string.ptr + start; + result.length = string.length - start; + break; } } + + /* if prefix table was allocated on the heap, free it */ + if (ptable != s_prefix_table) { + free(ptable); + } - sstr_t emptystr; - emptystr.length = 0; - emptystr.ptr = NULL; - return emptystr; + return result; } sstr_t* sstrsplit(sstr_t s, sstr_t d, ssize_t *n) {