improves sstrstr function by using KMP string search algorithm

2017-02-23

author
Mike Becker <universe@uap-core.de>
date
Thu, 23 Feb 2017 14:30:12 +0100 (2017-02-23)
changeset 236
ffc6d0910342
parent 235
7cf1e41833a2
child 237
5ba9de6361ff

improves sstrstr function by using KMP string search algorithm

test/string_tests.c file | annotate | diff | comparison | revisions
ucx/string.c file | annotate | diff | comparison | revisions
--- a/test/string_tests.c	Mon Feb 20 17:28:58 2017 +0100
+++ b/test/string_tests.c	Thu Feb 23 14:30:12 2017 +0100
@@ -81,6 +81,34 @@
 
 UCX_TEST(test_sstrstr) {
     sstr_t str = ST("find the match in this string");
+    sstr_t longstr = ST(
+            "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkl"
+            "mnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwx"
+            "yzabcdeababababnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij"
+            "klmnopqrstuvwxyzaababababababababrstuvwxyzabcdefghijklmnopqrstuv"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "wxyz1234567890");
+    sstr_t longstrpattern = ST(
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            );
+    sstr_t longstrresult = ST(
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "abababababababababababababababababababababababababababababababab"
+            "wxyz1234567890"
+            );
     UCX_TEST_BEGIN
     
     sstr_t notfound = sstrstr(str, S("no match"));
@@ -97,6 +125,12 @@
     UCX_TEST_ASSERT(!strcmp(str.ptr, result.ptr),
         "sstrstr with empty match string did not return the original string");
     
+    result = sstrstr(longstr, longstrpattern);
+    UCX_TEST_ASSERT(result.length == longstrresult.length,
+            "long string result length incorrect");
+    UCX_TEST_ASSERT(!strcmp(result.ptr, longstrresult.ptr),
+            "long string result content incorrect");
+    
     UCX_TEST_END
 }
 
--- a/ucx/string.c	Mon Feb 20 17:28:58 2017 +0100
+++ b/ucx/string.c	Thu Feb 23 14:30:12 2017 +0100
@@ -29,6 +29,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdarg.h>
+#include <stdint.h>
 #include <ctype.h>
 
 #include "string.h"
@@ -175,22 +176,91 @@
     return n;
 }
 
+typedef size_t(*prefix_table_rfun)(void*,size_t);
+typedef void(*prefix_table_wfun)(void*,size_t,size_t);
+
+static size_t s_prefix_table_r(void* table, size_t index) {
+    return (size_t) ((uint8_t*)table)[index];
+}
+static void s_prefix_table_w(void* table, size_t index, size_t value) {
+    ((uint8_t*)table)[index] = (uint8_t) value;
+}
+static size_t h_prefix_table_r(void* table, size_t index) {
+    return ((size_t*)table)[index];
+}
+static void h_prefix_table_w(void* table, size_t index, size_t value) {
+    ((size_t*)table)[index] = value;
+}
+
 sstr_t sstrstr(sstr_t string, sstr_t match) {
     if (match.length == 0) {
         return string;
     }
     
-    for (size_t i = 0 ; i < string.length ; i++) {
-        sstr_t substr = sstrsubs(string, i);
-        if (sstrprefix(substr, match)) {
-            return substr;
+    /* prepare default return value in case of no match */
+    sstr_t result = sstrn(NULL, 0);
+    
+    /*
+     * IMPORTANT:
+     * our prefix table contains the prefix length PLUS ONE
+     * this is our decision, because we want to use the full range of size_t
+     * the original algorithm needs a (-1) at one single place
+     * and we want to avoid that
+     */
+    
+    /* static prefix table */
+    static uint8_t s_prefix_table[256];
+    
+    /* check pattern length and use appropriate prefix table */
+    register void* ptable;
+    prefix_table_rfun ptable_r;
+    prefix_table_wfun ptable_w;
+    if (match.length > 255) {
+        /* pattern exceeds static prefix table, allocate on the heap */
+        ptable = calloc(match.length+1, sizeof(size_t));
+        ptable_r = h_prefix_table_r;
+        ptable_w = h_prefix_table_w;
+    } else {
+        ptable = s_prefix_table;
+        ptable_r = s_prefix_table_r;
+        ptable_w = s_prefix_table_w;
+    }
+    
+    /* keep counter in registers */
+    register size_t i, j;
+    
+    /* fill prefix table */
+    i = 0; j = 0;
+    ptable_w(ptable, i, j);
+    while (i < match.length) {
+        while (j >= 1 && match.ptr[j-1] != match.ptr[i]) {
+            j = ptable_r(ptable, j-i);
+        }
+        i++; j++;
+        ptable_w(ptable, i, j);
+    }
+
+    /* search */
+    i = 0; j = 1;
+    while (i < string.length) {
+        while (j >= 1 && string.ptr[i] != match.ptr[j-1]) {
+            j = ptable_r(ptable, j-1);
+        }
+        i++; j++;
+        if (j-1 == match.length) {
+            size_t start = i - match.length;
+            result.ptr = string.ptr + start;
+            result.length = string.length - start;
+            break;
         }
     }
+
+    /* if prefix table was allocated on the heap, free it */
+    if (ptable != s_prefix_table) {
+        free(ptable);
+    }
     
-    sstr_t emptystr;
-    emptystr.length = 0;
-    emptystr.ptr = NULL;
-    return emptystr;
+    return result;
 }
 
 sstr_t* sstrsplit(sstr_t s, sstr_t d, ssize_t *n) {

mercurial