Skip to content

Commit

Permalink
Improve performance of sdssplitargs (valkey-io#1230)
Browse files Browse the repository at this point in the history
The current implementation of `sdssplitargs` does repeated `sdscatlen`
to build the parsed arguments, which isn't very efficient because it
does a lot of extra reallocations and moves through the sds code a lot.
It also typically results in memory overhead, because `sdscatlen`
over-allocates, which is usually not needed since args are usually not
modified after being created.

The new implementation of sdssplitargs does two passes, the first to
parse the argument to figure out the final length and the second to
actually copy the string. It's generally about 2x faster for larger
strings (~100 bytes), and about 20% faster for small strings (~10
bytes). This is generally faster since as long as everything is in the
CPU cache, it's going to be fast.

There are a couple of sanity tests, none existed before, as well as some
fuzzying which was used to find some bugs and also to do the
benchmarking. The original benchmarking code can be seen
valkey-io@6576aeb.

```
test_sdssplitargs_benchmark - unit/test_sds.c:530] Using random seed: 1729883235
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.44%, new:13039us, old:29930us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.58%, new:12057us, old:27771us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 59.18%, new:9048us, old:22165us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 54.61%, new:12381us, old:27278us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 51.17%, new:16012us, old:32793us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 49.18%, new:16041us, old:31563us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 58.40%, new:12450us, old:29930us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.49%, new:13066us, old:30031us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 58.75%, new:12744us, old:30894us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 52.44%, new:16885us, old:35504us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 62.57%, new:8107us, old:21659us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 62.12%, new:8320us, old:21966us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 45.23%, new:13960us, old:25487us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 57.95%, new:9188us, old:21849us
```

---------

Signed-off-by: Madelyn Olson <[email protected]>
  • Loading branch information
madolson authored Oct 31, 2024
1 parent 91cbf77 commit 1c222f7
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 82 deletions.
181 changes: 100 additions & 81 deletions src/sds.c
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,86 @@ int hex_digit_to_int(char c) {
}
}

/* Helper function for sdssplitargs that parses a single argument. It
* populates the number characters needed to store the parsed argument
* in len, if provided, or will copy the parsed string into dst, if provided.
* If the string is able to be parsed, this function returns the number of
* characters that were parsed. If the argument can't be parsed, it
* returns 0. */
static int sdsparsearg(const char *arg, unsigned int *len, char *dst) {
const char *p = arg;
int inq = 0; /* set to 1 if we are in "quotes" */
int insq = 0; /* set to 1 if we are in 'single quotes' */
int done = 0;

while (!done) {
int new_char = -1;
if (inq) {
if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) {
new_char = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3));
p += 3;
} else if (*p == '\\' && *(p + 1)) {
p++;
switch (*p) {
case 'n': new_char = '\n'; break;
case 'r': new_char = '\r'; break;
case 't': new_char = '\t'; break;
case 'b': new_char = '\b'; break;
case 'a': new_char = '\a'; break;
default: new_char = *p; break;
}
} else if (*p == '"') {
/* closing quote must be followed by a space or
* nothing at all. */
if (*(p + 1) && !isspace(*(p + 1))) return 0;
done = 1;
} else if (!*p) {
/* unterminated quotes */
return 0;
} else {
new_char = *p;
}
} else if (insq) {
if (*p == '\\' && *(p + 1) == '\'') {
p++;
new_char = *p;
} else if (*p == '\'') {
/* closing quote must be followed by a space or
* nothing at all. */
if (*(p + 1) && !isspace(*(p + 1))) return 0;
done = 1;
} else if (!*p) {
/* unterminated quotes */
return 0;
} else {
new_char = *p;
}
} else {
switch (*p) {
case ' ':
case '\n':
case '\r':
case '\t':
case '\0': done = 1; break;
case '"': inq = 1; break;
case '\'': insq = 1; break;
default: new_char = *p; break;
}
}
if (new_char != -1) {
if (len) (*len)++;
if (dst) {
*dst = (char)new_char;
dst++;
}
}
if (*p) {
p++;
}
}
return p - arg;
}

/* Split a line into arguments, where every argument can be in the
* following programming-language REPL-alike form:
*
Expand All @@ -1049,103 +1129,42 @@ int hex_digit_to_int(char c) {
* The function returns the allocated tokens on success, even when the
* input string is empty, or NULL if the input contains unbalanced
* quotes or closed quotes followed by non space characters
* as in: "foo"bar or "foo'
* as in: "foo"bar or "foo'.
*
* The sds strings returned by this function are not initialized with
* extra space.
*/
sds *sdssplitargs(const char *line, int *argc) {
const char *p = line;
char *current = NULL;
char **vector = NULL;

*argc = 0;
while (1) {
while (*p) {
/* skip blanks */
while (*p && isspace(*p)) p++;
if (*p) {
/* get a token */
int inq = 0; /* set to 1 if we are in "quotes" */
int insq = 0; /* set to 1 if we are in 'single quotes' */
int done = 0;

if (current == NULL) current = sdsempty();
while (!done) {
if (inq) {
if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) {
unsigned char byte;

byte = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3));
current = sdscatlen(current, (char *)&byte, 1);
p += 3;
} else if (*p == '\\' && *(p + 1)) {
char c;

p++;
switch (*p) {
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'b': c = '\b'; break;
case 'a': c = '\a'; break;
default: c = *p; break;
}
current = sdscatlen(current, &c, 1);
} else if (*p == '"') {
/* closing quote must be followed by a space or
* nothing at all. */
if (*(p + 1) && !isspace(*(p + 1))) goto err;
done = 1;
} else if (!*p) {
/* unterminated quotes */
goto err;
} else {
current = sdscatlen(current, p, 1);
}
} else if (insq) {
if (*p == '\\' && *(p + 1) == '\'') {
p++;
current = sdscatlen(current, "'", 1);
} else if (*p == '\'') {
/* closing quote must be followed by a space or
* nothing at all. */
if (*(p + 1) && !isspace(*(p + 1))) goto err;
done = 1;
} else if (!*p) {
/* unterminated quotes */
goto err;
} else {
current = sdscatlen(current, p, 1);
}
} else {
switch (*p) {
case ' ':
case '\n':
case '\r':
case '\t':
case '\0': done = 1; break;
case '"': inq = 1; break;
case '\'': insq = 1; break;
default: current = sdscatlen(current, p, 1); break;
}
}
if (*p) p++;
}
if (!(*p)) break;
unsigned int len = 0;
if (sdsparsearg(p, &len, NULL)) {
sds current = sdsnewlen(SDS_NOINIT, len);
int parsedlen = sdsparsearg(p, NULL, current);
assert(parsedlen > 0);
p += parsedlen;

/* add the token to the vector */
vector = s_realloc(vector, ((*argc) + 1) * sizeof(char *));
vector[*argc] = current;
(*argc)++;
current = NULL;
} else {
/* Even on empty input string return something not NULL. */
if (vector == NULL) vector = s_malloc(sizeof(void *));
return vector;
while ((*argc)--) sdsfree(vector[*argc]);
s_free(vector);
*argc = 0;
return NULL;
}
}

err:
while ((*argc)--) sdsfree(vector[*argc]);
s_free(vector);
if (current) sdsfree(current);
*argc = 0;
return NULL;
/* Even on empty input string return something not NULL. */
if (vector == NULL) vector = s_malloc(sizeof(void *));
return vector;
}

/* Modify the string substituting all the occurrences of the set of
Expand Down
3 changes: 2 additions & 1 deletion src/unit/test_files.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ int test_raxFuzz(int argc, char **argv, int flags);
int test_sds(int argc, char **argv, int flags);
int test_typesAndAllocSize(int argc, char **argv, int flags);
int test_sdsHeaderSizes(int argc, char **argv, int flags);
int test_sdssplitargs(int argc, char **argv, int flags);
int test_sha1(int argc, char **argv, int flags);
int test_string2ll(int argc, char **argv, int flags);
int test_string2l(int argc, char **argv, int flags);
Expand Down Expand Up @@ -157,7 +158,7 @@ unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEnco
unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}};
unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}};
unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}};
unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {NULL, NULL}};
unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}};
unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}};
unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {NULL, NULL}};
unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, {"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}};
Expand Down
41 changes: 41 additions & 0 deletions src/unit/test_sds.c
Original file line number Diff line number Diff line change
Expand Up @@ -328,3 +328,44 @@ int test_sdsHeaderSizes(int argc, char **argv, int flags) {

return 0;
}

int test_sdssplitargs(int argc, char **argv, int flags) {
UNUSED(argc);
UNUSED(argv);
UNUSED(flags);

int len;
sds *sargv;

sargv = sdssplitargs("Testing one two three", &len);
TEST_ASSERT(4 == len);
TEST_ASSERT(!strcmp("Testing", sargv[0]));
TEST_ASSERT(!strcmp("one", sargv[1]));
TEST_ASSERT(!strcmp("two", sargv[2]));
TEST_ASSERT(!strcmp("three", sargv[3]));
sdsfreesplitres(sargv, len);

sargv = sdssplitargs("", &len);
TEST_ASSERT(0 == len);
TEST_ASSERT(sargv != NULL);
sdsfreesplitres(sargv, len);

sargv = sdssplitargs("\"Testing split strings\" \'Another split string\'", &len);
TEST_ASSERT(2 == len);
TEST_ASSERT(!strcmp("Testing split strings", sargv[0]));
TEST_ASSERT(!strcmp("Another split string", sargv[1]));
sdsfreesplitres(sargv, len);

sargv = sdssplitargs("\"Hello\" ", &len);
TEST_ASSERT(1 == len);
TEST_ASSERT(!strcmp("Hello", sargv[0]));
sdsfreesplitres(sargv, len);

char *binary_string = "\"\\x73\\x75\\x70\\x65\\x72\\x20\\x00\\x73\\x65\\x63\\x72\\x65\\x74\\x20\\x70\\x61\\x73\\x73\\x77\\x6f\\x72\\x64\"";
sargv = sdssplitargs(binary_string, &len);
TEST_ASSERT(1 == len);
TEST_ASSERT(22 == sdslen(sargv[0]));
sdsfreesplitres(sargv, len);

return 0;
}

0 comments on commit 1c222f7

Please sign in to comment.