From 1c222f77cecc100719bbc87c7a2ecd13402fe6db Mon Sep 17 00:00:00 2001
From: Madelyn Olson <madelyneolson@gmail.com>
Date: Thu, 31 Oct 2024 11:37:53 -0700
Subject: [PATCH] Improve performance of sdssplitargs (#1230)

The current implementation of `sdssplitargs` does repeated `sdscatlen`
to build the parsed arguments, which isn't very efficient because it
does a lot of extra reallocations and moves through the sds code a lot.
It also typically results in memory overhead, because `sdscatlen`
over-allocates, which is usually not needed since args are usually not
modified after being created.

The new implementation of sdssplitargs does two passes: the first parses
the argument to figure out its final length, and the second actually
copies the string. It's generally about 2x faster for larger strings
(~100 bytes), and about 20% faster for small strings (~10 bytes). Doing
two passes is still generally faster because, as long as everything is in
the CPU cache, reading the input a second time is cheap.

There are a couple of sanity tests, where none existed before, as well as
some fuzzing which was used to find some bugs and also to do the
benchmarking. The original benchmarking code can be seen at
https://github.com/valkey-io/valkey/pull/1230/commits/6576aeb86adfb5afa74aefb5bc2d2becde95ce4c.

```
test_sdssplitargs_benchmark - unit/test_sds.c:530] Using random seed: 1729883235
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.44%, new:13039us, old:29930us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.58%, new:12057us, old:27771us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 59.18%, new:9048us, old:22165us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 54.61%, new:12381us, old:27278us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 51.17%, new:16012us, old:32793us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 49.18%, new:16041us, old:31563us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 58.40%, new:12450us, old:29930us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.49%, new:13066us, old:30031us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 58.75%, new:12744us, old:30894us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 52.44%, new:16885us, old:35504us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 62.57%, new:8107us, old:21659us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 62.12%, new:8320us, old:21966us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 45.23%, new:13960us, old:25487us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 57.95%, new:9188us, old:21849us
```

---------

Signed-off-by: Madelyn Olson <madelyneolson@gmail.com>
---
 src/sds.c             | 181 +++++++++++++++++++++++-------------------
 src/unit/test_files.h |   3 +-
 src/unit/test_sds.c   |  41 ++++++++++
 3 files changed, 143 insertions(+), 82 deletions(-)

diff --git a/src/sds.c b/src/sds.c
index e14f4bd0bd..4dd7d709aa 100644
--- a/src/sds.c
+++ b/src/sds.c
@@ -1032,6 +1032,86 @@ int hex_digit_to_int(char c) {
     }
 }
 
+/* Helper function for sdssplitargs that parses a single argument. If len is
+ * provided it is incremented once per parsed byte, so the caller must zero it
+ * first. If dst is provided the parsed bytes are written to it; no null
+ * terminator is appended. Returns the number of input characters consumed, or
+ * 0 if the argument can't be parsed (unterminated quotes, or a closing quote
+ * followed by a non-space character). */
+static int sdsparsearg(const char *arg, unsigned int *len, char *dst) {
+    const char *p = arg;
+    int inq = 0;  /* set to 1 if we are in "quotes" */
+    int insq = 0; /* set to 1 if we are in 'single quotes' */
+    int done = 0;
+
+    while (!done) {
+        int new_char = -1; /* nothing to emit; bytes are taken as unsigned char so 0xff never equals -1 */
+        if (inq) {
+            if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) {
+                new_char = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3));
+                p += 3; /* the shared p++ at the bottom of the loop skips the last hex digit */
+            } else if (*p == '\\' && *(p + 1)) {
+                p++;
+                switch (*p) {
+                case 'n': new_char = '\n'; break;
+                case 'r': new_char = '\r'; break;
+                case 't': new_char = '\t'; break;
+                case 'b': new_char = '\b'; break;
+                case 'a': new_char = '\a'; break;
+                default: new_char = (unsigned char)*p; break;
+                }
+            } else if (*p == '"') {
+                /* closing quote must be followed by a space or
+                 * nothing at all. */
+                if (*(p + 1) && !isspace((unsigned char)*(p + 1))) return 0;
+                done = 1;
+            } else if (!*p) {
+                /* unterminated quotes */
+                return 0;
+            } else {
+                new_char = (unsigned char)*p;
+            }
+        } else if (insq) {
+            if (*p == '\\' && *(p + 1) == '\'') {
+                p++;
+                new_char = (unsigned char)*p;
+            } else if (*p == '\'') {
+                /* closing quote must be followed by a space or
+                 * nothing at all. */
+                if (*(p + 1) && !isspace((unsigned char)*(p + 1))) return 0;
+                done = 1;
+            } else if (!*p) {
+                /* unterminated quotes */
+                return 0;
+            } else {
+                new_char = (unsigned char)*p;
+            }
+        } else {
+            switch (*p) {
+            case ' ':
+            case '\n':
+            case '\r':
+            case '\t':
+            case '\0': done = 1; break;
+            case '"': inq = 1; break;
+            case '\'': insq = 1; break;
+            default: new_char = (unsigned char)*p; break;
+            }
+        }
+        if (new_char != -1) {
+            if (len) (*len)++;
+            if (dst) {
+                *dst = (char)new_char;
+                dst++;
+            }
+        }
+        if (*p) {
+            p++;
+        }
+    }
+    return p - arg;
+}
+
 /* Split a line into arguments, where every argument can be in the
  * following programming-language REPL-alike form:
  *
@@ -1049,103 +1129,42 @@ int hex_digit_to_int(char c) {
  * The function returns the allocated tokens on success, even when the
  * input string is empty, or NULL if the input contains unbalanced
  * quotes or closed quotes followed by non space characters
- * as in: "foo"bar or "foo'
+ * as in: "foo"bar or "foo'.
+ *
+ * The sds strings returned by this function are not initialized with
+ * extra space.
  */
 sds *sdssplitargs(const char *line, int *argc) {
     const char *p = line;
-    char *current = NULL;
     char **vector = NULL;
 
     *argc = 0;
-    while (1) {
+    while (*p) {
         /* skip blanks */
         while (*p && isspace(*p)) p++;
-        if (*p) {
-            /* get a token */
-            int inq = 0;  /* set to 1 if we are in "quotes" */
-            int insq = 0; /* set to 1 if we are in 'single quotes' */
-            int done = 0;
-
-            if (current == NULL) current = sdsempty();
-            while (!done) {
-                if (inq) {
-                    if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) {
-                        unsigned char byte;
-
-                        byte = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3));
-                        current = sdscatlen(current, (char *)&byte, 1);
-                        p += 3;
-                    } else if (*p == '\\' && *(p + 1)) {
-                        char c;
-
-                        p++;
-                        switch (*p) {
-                        case 'n': c = '\n'; break;
-                        case 'r': c = '\r'; break;
-                        case 't': c = '\t'; break;
-                        case 'b': c = '\b'; break;
-                        case 'a': c = '\a'; break;
-                        default: c = *p; break;
-                        }
-                        current = sdscatlen(current, &c, 1);
-                    } else if (*p == '"') {
-                        /* closing quote must be followed by a space or
-                         * nothing at all. */
-                        if (*(p + 1) && !isspace(*(p + 1))) goto err;
-                        done = 1;
-                    } else if (!*p) {
-                        /* unterminated quotes */
-                        goto err;
-                    } else {
-                        current = sdscatlen(current, p, 1);
-                    }
-                } else if (insq) {
-                    if (*p == '\\' && *(p + 1) == '\'') {
-                        p++;
-                        current = sdscatlen(current, "'", 1);
-                    } else if (*p == '\'') {
-                        /* closing quote must be followed by a space or
-                         * nothing at all. */
-                        if (*(p + 1) && !isspace(*(p + 1))) goto err;
-                        done = 1;
-                    } else if (!*p) {
-                        /* unterminated quotes */
-                        goto err;
-                    } else {
-                        current = sdscatlen(current, p, 1);
-                    }
-                } else {
-                    switch (*p) {
-                    case ' ':
-                    case '\n':
-                    case '\r':
-                    case '\t':
-                    case '\0': done = 1; break;
-                    case '"': inq = 1; break;
-                    case '\'': insq = 1; break;
-                    default: current = sdscatlen(current, p, 1); break;
-                    }
-                }
-                if (*p) p++;
-            }
+        if (!(*p)) break; /* only trailing blanks were left */
+        unsigned int len = 0; /* sdsparsearg only increments len, so it has to start at zero */
+        if (sdsparsearg(p, &len, NULL)) { /* first pass: validate the argument and measure it */
+            sds current = sdsnewlen(SDS_NOINIT, len);
+            int parsedlen = sdsparsearg(p, NULL, current); /* second pass: copy the parsed bytes */
+            assert(parsedlen > 0);
+            p += parsedlen;
+
             /* add the token to the vector */
             vector = s_realloc(vector, ((*argc) + 1) * sizeof(char *));
             vector[*argc] = current;
             (*argc)++;
             current = NULL;
         } else {
-            /* Even on empty input string return something not NULL. */
-            if (vector == NULL) vector = s_malloc(sizeof(void *));
-            return vector;
+            while ((*argc)--) sdsfree(vector[*argc]); /* parse error: release the tokens collected so far */
+            s_free(vector);
+            *argc = 0;
+            return NULL;
         }
     }
-
-err:
-    while ((*argc)--) sdsfree(vector[*argc]);
-    s_free(vector);
-    if (current) sdsfree(current);
-    *argc = 0;
-    return NULL;
+    /* NULL is reserved for parse errors, so an empty or all-blank line still
+     * gets a non-NULL (one slot) vector with *argc left at 0. */
+    if (vector == NULL) vector = s_malloc(sizeof(void *));
+    return vector;
 }
 
 /* Modify the string substituting all the occurrences of the set of
diff --git a/src/unit/test_files.h b/src/unit/test_files.h
index cd2e0c5b92..c2b062039a 100644
--- a/src/unit/test_files.h
+++ b/src/unit/test_files.h
@@ -99,6 +99,7 @@ int test_raxFuzz(int argc, char **argv, int flags);
 int test_sds(int argc, char **argv, int flags);
 int test_typesAndAllocSize(int argc, char **argv, int flags);
 int test_sdsHeaderSizes(int argc, char **argv, int flags);
+int test_sdssplitargs(int argc, char **argv, int flags);
 int test_sha1(int argc, char **argv, int flags);
 int test_string2ll(int argc, char **argv, int flags);
 int test_string2l(int argc, char **argv, int flags);
@@ -157,7 +158,7 @@ unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEnco
 unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}};
 unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", 
test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}};
 unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}};
-unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {NULL, NULL}};
+unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}};
 unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}};
 unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {NULL, NULL}};
 unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, {"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", 
test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}};
diff --git a/src/unit/test_sds.c b/src/unit/test_sds.c
index 19b5c7d73f..b97d0d9d32 100644
--- a/src/unit/test_sds.c
+++ b/src/unit/test_sds.c
@@ -328,3 +328,44 @@ int test_sdsHeaderSizes(int argc, char **argv, int flags) {
 
     return 0;
 }
+
+int test_sdssplitargs(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+    /* Sanity checks for sdssplitargs: plain words, empty input, quoted arguments and hex escapes. */
+    int len; /* number of arguments reported by sdssplitargs */
+    sds *sargv; /* argument vector returned by sdssplitargs */
+
+    sargv = sdssplitargs("Testing one two three", &len); /* unquoted, space-separated words */
+    TEST_ASSERT(4 == len);
+    TEST_ASSERT(!strcmp("Testing", sargv[0]));
+    TEST_ASSERT(!strcmp("one", sargv[1]));
+    TEST_ASSERT(!strcmp("two", sargv[2]));
+    TEST_ASSERT(!strcmp("three", sargv[3]));
+    sdsfreesplitres(sargv, len);
+
+    sargv = sdssplitargs("", &len); /* empty input: zero arguments but still a non-NULL vector */
+    TEST_ASSERT(0 == len);
+    TEST_ASSERT(sargv != NULL);
+    sdsfreesplitres(sargv, len);
+
+    sargv = sdssplitargs("\"Testing split strings\" \'Another split string\'", &len);
+    TEST_ASSERT(2 == len); /* spaces inside double or single quotes do not split the argument */
+    TEST_ASSERT(!strcmp("Testing split strings", sargv[0]));
+    TEST_ASSERT(!strcmp("Another split string", sargv[1]));
+    sdsfreesplitres(sargv, len);
+
+    sargv = sdssplitargs("\"Hello\" ", &len); /* a trailing space after the closing quote is accepted */
+    TEST_ASSERT(1 == len);
+    TEST_ASSERT(!strcmp("Hello", sargv[0]));
+    sdsfreesplitres(sargv, len);
+
+    char *binary_string = "\"\\x73\\x75\\x70\\x65\\x72\\x20\\x00\\x73\\x65\\x63\\x72\\x65\\x74\\x20\\x70\\x61\\x73\\x73\\x77\\x6f\\x72\\x64\"";
+    sargv = sdssplitargs(binary_string, &len);
+    TEST_ASSERT(1 == len);
+    TEST_ASSERT(22 == sdslen(sargv[0])); /* all 22 escaped bytes are kept, including the embedded \x00 */
+    sdsfreesplitres(sargv, len);
+
+    return 0;
+}