# HG changeset patch # User Mike Becker # Date 1760190168 -7200 # Node ID 9a72258446cdc0f62709706287d9e7da2071cbd1 # Parent 8bfccb342895bd8b6c302912ee0d82c314dbdbe8 fixes various bugs related to skipping duplicates in insert_unique - relates to #557 diff -r 8bfccb342895 -r 9a72258446cd src/array_list.c --- a/src/array_list.c Sat Oct 11 11:55:46 2025 +0200 +++ b/src/array_list.c Sat Oct 11 15:42:48 2025 +0200 @@ -291,7 +291,7 @@ *target, oldcap, newcap, elem_size, reallocator ); if (newmem == NULL) { - return 1; + return 1; // LCOV_EXCL_LINE } // repair src pointer, if necessary @@ -420,29 +420,60 @@ bptr, cmp_func ); - // handle the identical elements - size_t skip_len = 0; - while (copy_len < elem_count - si + // binary search gives us the smallest index; + // we also want to include equal elements here + while (si + copy_len < elem_count && cmp_func(bptr, src+copy_len*elem_size) == 0) { copy_len++; - if (!allow_duplicates) { - skip_len++; - } } - copy_len -= skip_len; // copy the source elements - bytes_copied = copy_len * elem_size; - memcpy(dest, src, bytes_copied); - dest += bytes_copied; - src += bytes_copied; - si += copy_len; - di += copy_len; - - // skip duplicates when they are not allowed - src += skip_len * elem_size; - si += skip_len; - *size -= skip_len; + if (copy_len > 0) { + if (allow_duplicates) { + // we can copy the entire chunk + bytes_copied = copy_len * elem_size; + memcpy(dest, src, bytes_copied); + dest += bytes_copied; + src += bytes_copied; + si += copy_len; + di += copy_len; + } else { + // first, check the end of the source chunk + // for being a duplicate of the bptr + const char *end_of_src = src + (copy_len - 1) * elem_size; + size_t skip_len = 0; + while (copy_len > 0 && cmp_func(bptr, end_of_src) == 0) { + end_of_src -= elem_size; + skip_len++; + copy_len--; + } + char *last = dest == *target ? NULL : dest - elem_size; + // then iterate through the source chunk + // and skip all duplicates with the last element in the array + size_t more_skipped = 0; + for (unsigned j = 0; j < copy_len; j++) { + if (last != NULL && cmp_func(last, src) == 0) { + // duplicate - skip + src += elem_size; + si++; + more_skipped++; + } else { + memcpy(dest, src, elem_size); + src += elem_size; + last = dest; + dest += elem_size; + si++; + di++; + } + } + // skip the previously identified elements as well + src += skip_len * elem_size; + si += skip_len; + skip_len += more_skipped; + // reduce the actual size by the number of skipped elements + *size -= skip_len; + } + } // when all source elements are in place, we are done if (si >= elem_count) break; @@ -463,23 +494,63 @@ bptr += bytes_copied; di += copy_len; bi += copy_len; + } - // check if the last restored element is a duplicate of the next source - if (!allow_duplicates && copy_len > 0) { - char *last = dest - elem_size; - while (si < elem_count && cmp_func(last, src) == 0) { - src += elem_size; - si++; - (*size)--; + // still source elements left? + if (si < elem_count) { + if (allow_duplicates) { + // duplicates allowed or nothing inserted yet: simply copy everything + memcpy(dest, src, elem_size * (elem_count - si)); + } else { + if (dest != *target) { + // skip all source elements that equal the last element + char *last = dest - elem_size; + while (si < elem_count) { + if (last != NULL && cmp_func(last, src) == 0) { + src += elem_size; + si++; + (*size)--; + } else { + break; + } + } + } + // we must check the elements in the chunk one by one + while (si < elem_count) { + // find a chain of elements that can be copied + size_t copy_len = 1, skip_len = 0; + { + const char *left_src = src; + while (si + copy_len < elem_count) { + const char *right_src = left_src + elem_size; + int d = cmp_func(left_src, right_src); + if (d < 0) { + if (skip_len > 0) { + // new larger element found; + // handle it in the next cycle + break; + } + left_src += elem_size; + copy_len++; + } else if (d == 0) { + left_src += elem_size; + skip_len++; + } else { + break; + } + } + } + size_t bytes_copied = copy_len * elem_size; + memcpy(dest, src, bytes_copied); + dest += bytes_copied; + src += bytes_copied + skip_len * elem_size; + si += copy_len + skip_len; + di += copy_len; + *size -= skip_len; } } } - // still source elements left? simply append them - if (si < elem_count) { - memcpy(dest, src, elem_size * (elem_count - si)); - } - // buffered elements need to be moved when we skipped duplicates size_t total_skipped = new_size - *size; if (bi < new_size && total_skipped > 0) { diff -r 8bfccb342895 -r 9a72258446cd src/cx/array_list.h --- a/src/cx/array_list.h Sat Oct 11 11:55:46 2025 +0200 +++ b/src/cx/array_list.h Sat Oct 11 15:42:48 2025 +0200 @@ -592,7 +592,6 @@ * * If either the target or the source array is not already sorted with respect * to the specified @p cmp_func, the behavior is undefined. - * Also, the @p src array must not contain duplicates within itself. * * If the capacity is insufficient to hold the new data, a reallocation * attempt is made. diff -r 8bfccb342895 -r 9a72258446cd src/cx/linked_list.h --- a/src/cx/linked_list.h Sat Oct 11 11:55:46 2025 +0200 +++ b/src/cx/linked_list.h Sat Oct 11 15:42:48 2025 +0200 @@ -448,7 +448,6 @@ * * If either the list starting with the node pointed to by @p begin or the list * starting with @p insert_begin is not sorted, the behavior is undefined. - * Also, the chain to be inserted must not contain duplicates within itself. * * @attention In contrast to cx_linked_list_insert_sorted(), not all nodes of the * chain might be added. This function returns a new chain consisting of all the duplicates. @@ -459,7 +458,7 @@ * @param loc_next the location of a @c next pointer within your node struct (required) * @param insert_begin a pointer to the first node of the chain that shall be inserted * @param cmp_func a compare function that will receive the node pointers - * @return a pointer to a new chain with all duplicates which were not inserted (or @c NULL when there were no duplicates) + * @return a pointer to a new chain with all duplicates that were not inserted (or @c NULL when there were no duplicates) */ cx_attr_nonnull_arg(1, 5, 6) cx_attr_nodiscard diff -r 8bfccb342895 -r 9a72258446cd src/cx/list.h --- a/src/cx/list.h Sat Oct 11 11:55:46 2025 +0200 +++ b/src/cx/list.h Sat Oct 11 15:42:48 2025 +0200 @@ -283,7 +283,6 @@ * Default implementation of an array insert where only elements are inserted when they don't exist in the list. * * This function is similar to cx_list_default_insert_sorted(), except it skips elements that are already in the list. - * The @p sorted_data itself must not contain duplicates. * * @note The return value of this function denotes the number of elements from the @p sorted_data that are definitely * contained in the list after completing the call. It is @em not the number of elements that were newly inserted. @@ -625,7 +624,7 @@ * be an array of pointers. * * If the list is not sorted already, the behavior is undefined. - * This is also the case when the @p array is not sorted or already contains duplicates. + * This is also the case when the @p array is not sorted. * * @param list the list * @param array a pointer to the elements to add diff -r 8bfccb342895 -r 9a72258446cd src/linked_list.c --- a/src/linked_list.c Sat Oct 11 11:55:46 2025 +0200 +++ b/src/linked_list.c Sat Oct 11 15:42:48 2025 +0200 @@ -258,117 +258,132 @@ assert(loc_next >= 0); assert(insert_begin != NULL); - // track currently observed nodes - void *dup_start = NULL; - void *dup_last = NULL; - void *dest_prev = NULL; - void *dest = *begin; - void *src = insert_begin; + // strategy: build completely new chains from scratch + void *source_original = *begin; + void *source_argument = insert_begin; + void *new_begin = NULL; + void *new_end = NULL; + void *dup_begin = NULL; + void *dup_end = NULL; - // special case: list is empty - if (dest == NULL) { - *begin = src; - if (end != NULL) { - *end = cx_linked_list_last(src, loc_next); + // determine the new start + { + int d = source_original == NULL ? 1 : cmp_func(source_original, source_argument); + if (d <= 0) { + // the new chain starts with the original chain + new_begin = new_end = source_original; + source_original = ll_next(source_original); + if (d == 0) { + if (allow_duplicates) { + // duplicate allowed, append it to the chain + cx_linked_list_link(new_end, source_argument, loc_prev, loc_next); + new_end = source_argument; + } else { + // duplicate is not allowed, start a duplicate chain with the argument + dup_begin = dup_end = source_argument; + } + source_argument = ll_next(source_argument); + } + } else { + // input is smaller, or there is no original chain; + // start the new chain with the source argument + new_begin = new_end = source_argument; + source_argument = ll_next(source_argument); } - return NULL; } - // search the list for insertion points - while (dest != NULL && src != NULL) { - // compare current list node with source node - // if less or equal, skip - { - int d = cmp_func(dest, src); - if (d <= 0) { - if (!allow_duplicates && d == 0) { - if (dup_last == NULL) { - dup_start = src; - dup_last = src; - if (loc_prev >= 0) { - ll_prev(dup_start) = NULL; - } + // now successively compare the elements and add them to the correct chains + while (source_original != NULL && source_argument != NULL) { + int d = cmp_func(source_original, source_argument); + if (d <= 0) { + // the original is not larger, add it to the chain + cx_linked_list_link(new_end, source_original, loc_prev, loc_next); + new_end = source_original; + source_original = ll_next(source_original); + if (d == 0) { + if (allow_duplicates) { + // duplicate allowed, append it to the chain + cx_linked_list_link(new_end, source_argument, loc_prev, loc_next); + new_end = source_argument; + } else { + // duplicate is not allowed, append it to the duplicate chain + if (dup_end == NULL) { + dup_begin = dup_end = source_argument; } else { - cx_linked_list_link(dup_last, src, loc_prev, loc_next); - dup_last = src; - } - src = ll_next(src); - while (src != NULL && cmp_func(dest, src) == 0) { - dup_last = src; - src = ll_next(src); + cx_linked_list_link(dup_end, source_argument, loc_prev, loc_next); + dup_end = source_argument; } } - - dest_prev = dest; - dest = ll_next(dest); - continue; - } - } - - // determine chain of elements that can be inserted - void *end_of_chain = src; - void *next_in_chain = ll_next(src); - while (next_in_chain != NULL) { - // once we become larger than the list elem, break - if (cmp_func(dest, next_in_chain) <= 0) { - break; + source_argument = ll_next(source_argument); } - // otherwise, we can insert one more - end_of_chain = next_in_chain; - next_in_chain = ll_next(next_in_chain); + } else { + // the original is larger, append the source argument to the chain + // check if we must discard the source argument as duplicate + if (!allow_duplicates && cmp_func(new_end, source_argument) == 0) { + if (dup_end == NULL) { + dup_begin = dup_end = source_argument; + } else { + cx_linked_list_link(dup_end, source_argument, loc_prev, loc_next); + dup_end = source_argument; + } + } else { + // no duplicate or duplicates allowed + cx_linked_list_link(new_end, source_argument, loc_prev, loc_next); + new_end = source_argument; + } + source_argument = ll_next(source_argument); } - - // insert the elements - if (dest_prev == NULL) { - // new begin - *begin = src; - } else { - cx_linked_list_link(dest_prev, src, loc_prev, loc_next); - } - cx_linked_list_link(end_of_chain, dest, loc_prev, loc_next); - - // continue with next - src = next_in_chain; - dest_prev = dest; - dest = ll_next(dest); } - // skip duplicates in the remaining items - if (!allow_duplicates && src != NULL) { - if (cmp_func(dest_prev, src) == 0) { - if (dup_last == NULL) { - dup_start = src; - dup_last = src; - if (loc_prev >= 0) { - ll_prev(dup_start) = NULL; + if (source_original != NULL) { + // something is left from the original chain, append it + cx_linked_list_link(new_end, source_original, loc_prev, loc_next); + new_end = cx_linked_list_last(source_original, loc_next); + } else if (source_argument != NULL) { + // something is left from the input chain; + // when we allow duplicates, append it + if (allow_duplicates) { + cx_linked_list_link(new_end, source_argument, loc_prev, loc_next); + new_end = cx_linked_list_last(source_argument, loc_next); + } else { + // otherwise we must check one-by-one + while (source_argument != NULL) { + if (cmp_func(new_end, source_argument) == 0) { + if (dup_end == NULL) { + dup_begin = dup_end = source_argument; + } else { + cx_linked_list_link(dup_end, source_argument, loc_prev, loc_next); + dup_end = source_argument; + } + } else { + cx_linked_list_link(new_end, source_argument, loc_prev, loc_next); + new_end = source_argument; } - } else { - cx_linked_list_link(dup_last, src, loc_prev, loc_next); - dup_last = src; - } - src = ll_next(src); - while (src != NULL && cmp_func(dest_prev, src) == 0) { - dup_last = src; - src = ll_next(src); + source_argument = ll_next(source_argument); } } } - // insert remaining items - if (src != NULL) { - cx_linked_list_link(dest_prev, src, loc_prev, loc_next); + // null the next pointers at the end of the chain + ll_next(new_end) = NULL; + if (dup_end != NULL) { + ll_next(dup_end) = NULL; } - // determine new end of list, if requested - if (end != NULL) { - *end = cx_linked_list_last( - dest != NULL ? dest : dest_prev, loc_next); + // null the optional prev pointers + if (loc_prev >= 0) { + ll_prev(new_begin) = NULL; + if (dup_begin != NULL) { + ll_prev(dup_begin) = NULL; + } } - if (dup_last != NULL) { - ll_next(dup_last) = NULL; + // output + *begin = new_begin; + if (end != NULL) { + *end = new_end; } - return dup_start; + return dup_begin; } void cx_linked_list_insert_sorted_chain( @@ -379,10 +394,9 @@ void *insert_begin, cx_compare_func cmp_func ) { - void *ret = cx_linked_list_insert_sorted_chain_impl( + cx_linked_list_insert_sorted_chain_impl( begin, end, loc_prev, loc_next, insert_begin, cmp_func, true); - assert(ret == NULL); } int cx_linked_list_insert_unique( diff -r 8bfccb342895 -r 9a72258446cd src/list.c --- a/src/list.c Sat Oct 11 11:55:46 2025 +0200 +++ b/src/list.c Sat Oct 11 15:42:48 2025 +0200 @@ -327,38 +327,54 @@ const char *src = sorted_data; // track indices and number of inserted items - size_t di = 0, si = 0, inserted = 0; + size_t di = 0, si = 0, processed = 0; // search the list for insertion points - for (; di < list->collection.size; di++) { + while (di < list->collection.size) { const void *list_elm = invoke_list_func(at, list, di); - // compare current list element with first source element - // if less or equal, skip + // compare the current list element with the first source element + // if less, skip the list elements + // if equal, skip the list elements and optionally the source elements { int d = cmp(list_elm, src); if (d <= 0) { if (!allow_duplicates && d == 0) { src += elem_size; si++; - inserted++; // we also count duplicates for the return value + processed++; // we also count duplicates for the return value while (si < n && cmp(list_elm, src) == 0) { src += elem_size; si++; - inserted++; + processed++; } - if (inserted == n) { - return inserted; + if (processed == n) { + return processed; } } + di++; continue; } } - // determine number of consecutive elements that can be inserted - size_t ins = 1; + // determine the number of consecutive elements that can be inserted + size_t ins = 1, skip = 0; const char *next = src; while (++si < n) { + if (!allow_duplicates) { + // skip duplicates within the source + if (cmp(next, next + elem_size) == 0) { + next += elem_size; + skip++; + continue; + } else { + if (skip > 0) { + // if we had to skip something, we must wait for the next run + next += elem_size; + break; + } + } + } next += elem_size; // once we become larger than the list elem, break if (cmp(list_elm, next) <= 0) { @@ -371,30 +387,46 @@ // insert the elements at location si if (ins == 1) { if (NULL == invoke_list_func(insert_element, list, di, src)) { - return inserted; // LCOV_EXCL_LINE + return processed; // LCOV_EXCL_LINE } } else { size_t r = invoke_list_func(insert_array, list, di, src, ins); if (r < ins) { - return inserted + r; // LCOV_EXCL_LINE + return processed + r; // LCOV_EXCL_LINE } } - inserted += ins; + processed += ins + skip; di += ins; // everything inserted? - if (inserted == n) { - return inserted; + if (processed == n) { + return processed; } src = next; } // insert remaining items if (si < n) { - inserted += invoke_list_func(insert_array, list, di, src, n - si); + if (allow_duplicates) { + processed += invoke_list_func(insert_array, list, di, src, n - si); + } else { + const void *last = di == 0 ? NULL : invoke_list_func(at, list, di - 1); + for (; si < n; si++) { + // skip duplicates within the source + if (last == NULL || cmp(last, src) != 0) { + if (NULL == invoke_list_func(insert_element, list, di, src)) { + return processed; // LCOV_EXCL_LINE + } + last = src; + di++; + } + processed++; + src += elem_size; + } + } } - return inserted; + return processed; } size_t cx_list_default_insert_sorted( diff -r 8bfccb342895 -r 9a72258446cd tests/test_list.c --- a/tests/test_list.c Sat Oct 11 11:55:46 2025 +0200 +++ b/tests/test_list.c Sat Oct 11 15:42:48 2025 +0200 @@ -36,6 +36,7 @@ #include #include +#include CX_TEST(test_array_add) { CX_ARRAY_DECLARE(int, arr); @@ -2456,6 +2457,73 @@ cxListFree(list); } +CX_TEST(test_list_use_insert_unique_to_remove_duplicates) { + CxList *linked_list = cxLinkedListCreate(cxDefaultAllocator, cx_cmp_int, sizeof(int)); + CxList *array_list = cxArrayListCreate(cxDefaultAllocator, cx_cmp_int, sizeof(int), 8); + CxList *defaulted_list = cxArrayListCreate(cxDefaultAllocator, cx_cmp_int, sizeof(int), 8); + do_set_default_class_funcs(defaulted_list); + int test_array[23] = { + 120, -13, 100, -90, 13, -56, 74, 20, 28, 80, 18, -56, 130, 12, 15, 0, 39, 100, 0, 29, 28, 85, 20 + }; + int test_array_unique[18] = { + -90, -56, -13, 0, 12, 13, 15, 18, 20, 28, 29, 39, 74, 80, 85, 100, 120, 130 + }; + CX_TEST_DO { + qsort(test_array, 23, sizeof(int), cx_cmp_int); + cxListInsertUniqueArray(linked_list, test_array, 23); + cxListInsertUniqueArray(array_list, test_array, 23); + cxListInsertUniqueArray(defaulted_list, test_array, 23); + CX_TEST_ASSERT(cxListSize(linked_list) == 18); + CX_TEST_ASSERT(cxListSize(array_list) == 18); + CX_TEST_ASSERT(cxListSize(defaulted_list) == 18); + for (unsigned i = 0; i < 18; i++) { + CX_TEST_ASSERT(*(int *) cxListAt(linked_list, i) == test_array_unique[i]); + CX_TEST_ASSERT(*(int *) cxListAt(array_list, i) == test_array_unique[i]); + CX_TEST_ASSERT(*(int *) cxListAt(defaulted_list, i) == test_array_unique[i]); + } + } + cxListFree(defaulted_list); + cxListFree(linked_list); + cxListFree(array_list); +} + +CX_TEST(test_list_use_insert_unique_with_duplicates_in_source) { + CxList *linked_list = cxLinkedListCreate(cxDefaultAllocator, cx_cmp_int, sizeof(int)); + CxList *array_list = cxArrayListCreate(cxDefaultAllocator, cx_cmp_int, sizeof(int), 8); + CxList *defaulted_list = cxArrayListCreate(cxDefaultAllocator, cx_cmp_int, sizeof(int), 8); + do_set_default_class_funcs(defaulted_list); + int pre_filled[10] = {-13, 5, 18, 30, 40, 45, 50, 80, 85, 110}; + cxListInsertSortedArray(linked_list, pre_filled, 10); + cxListInsertSortedArray(array_list, pre_filled, 10); + cxListInsertSortedArray(defaulted_list, pre_filled, 10); + int test_array[23] = { + 120, -13, 100, -90, 13, -56, 74, 20, 28, 80, 18, -56, 130, 12, 15, 0, 39, 100, 0, 29, 28, 85, 20 + }; + int test_array_unique[24] = { + -90, -56, -13, 0, 5, 12, 13, 15, 18, 20, 28, 29, 30, 39, 40, 45, 50, 74, 80, 85, 100, 110, 120, 130 + }; + CX_TEST_DO { + qsort(test_array, 23, sizeof(int), cx_cmp_int); + cxListInsertUniqueArray(linked_list, test_array, 23); + cxListInsertUniqueArray(array_list, test_array, 23); + cxListInsertUniqueArray(defaulted_list, test_array, 23); + CX_TEST_ASSERT(cxListSize(linked_list) == 24); + CX_TEST_ASSERT(cxListSize(array_list) == 24); + CX_TEST_ASSERT(cxListSize(defaulted_list) == 24); + for (unsigned i = 0; i < 24; i++) { + CX_TEST_ASSERT(*(int *) cxListAt(linked_list, i) == test_array_unique[i]); + CX_TEST_ASSERT(*(int *) cxListAt(array_list, i) == test_array_unique[i]); + CX_TEST_ASSERT(*(int *) cxListAt(defaulted_list, i) == test_array_unique[i]); + } + CX_TEST_ASSERT(*(int*)cxListLast(linked_list) == 130); + CX_TEST_ASSERT(*(int*)cxListLast(array_list) == 130); + CX_TEST_ASSERT(*(int*)cxListLast(defaulted_list) == 130); + } + cxListFree(defaulted_list); + cxListFree(linked_list); + cxListFree(array_list); +} + CxTestSuite *cx_test_suite_array_list(void) { CxTestSuite *suite = cx_test_suite_new("array_list"); @@ -2747,6 +2815,8 @@ CxTestSuite *suite = cx_test_suite_new("list corner cases"); cx_test_register(suite, test_list_pointer_list_supports_null); + cx_test_register(suite, test_list_use_insert_unique_with_duplicates_in_source); + cx_test_register(suite, test_list_use_insert_unique_to_remove_duplicates); return suite; }