From 5cda17f446664ef09908c0d3b25f5c9b8ae6a150 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Wed, 29 Apr 2026 15:11:05 -0400 Subject: [PATCH 1/2] Remove unrolling of last iteration of loop for x86 decode It is not clear why this was originally done. There is no comment in the source code, but it does not seem to improve performance. --- src/streamvbyte_x64_decode.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/src/streamvbyte_x64_decode.c b/src/streamvbyte_x64_decode.c index d079285..b2a24f3 100644 --- a/src/streamvbyte_x64_decode.c +++ b/src/streamvbyte_x64_decode.c @@ -50,7 +50,7 @@ static inline const uint8_t *svb_decode_sse41_simple(uint32_t *out, const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; uint64_t nextkeys; memcpy(&nextkeys, keyPtr64 + Offset, sizeof(nextkeys)); - for (; Offset != 0; ++Offset) { + for (; Offset != 1; ++Offset) { uint64_t keys = nextkeys; memcpy(&nextkeys, keyPtr64 + Offset + 1, sizeof(nextkeys)); @@ -77,34 +77,6 @@ static inline const uint8_t *svb_decode_sse41_simple(uint32_t *out, Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); svb_write_sse41(out + 28, Data); - out += 32; - } - { - uint64_t keys = nextkeys; - - Data = svb_decode_sse41((keys & 0xFF), &dataPtr); - svb_write_sse41(out, Data); - Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); - svb_write_sse41(out + 4, Data); - - keys >>= 16; - Data = svb_decode_sse41((keys & 0xFF), &dataPtr); - svb_write_sse41(out + 8, Data); - Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); - svb_write_sse41(out + 12, Data); - - keys >>= 16; - Data = svb_decode_sse41((keys & 0xFF), &dataPtr); - svb_write_sse41(out + 16, Data); - Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); - svb_write_sse41(out + 20, Data); - - keys >>= 16; - Data = svb_decode_sse41((keys & 0xFF), &dataPtr); - svb_write_sse41(out + 24, Data); - Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); - svb_write_sse41(out + 28, Data); - out += 32; } } From 8e79192d06610e6229b4ca59ef16ad6c7e0cd714 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Thu, 30 Apr 2026 07:48:06 -0400 Subject: [PATCH 2/2] Load keys for current loop iteration instead of next iteration --- src/streamvbyte_x64_decode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/streamvbyte_x64_decode.c b/src/streamvbyte_x64_decode.c index b2a24f3..b2e954c 100644 --- a/src/streamvbyte_x64_decode.c +++ b/src/streamvbyte_x64_decode.c @@ -48,11 +48,9 @@ static inline const uint8_t *svb_decode_sse41_simple(uint32_t *out, int64_t Offset = -(int64_t)keybytes / 8 + 1; const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; - uint64_t nextkeys; - memcpy(&nextkeys, keyPtr64 + Offset, sizeof(nextkeys)); + uint64_t keys; for (; Offset != 1; ++Offset) { - uint64_t keys = nextkeys; - memcpy(&nextkeys, keyPtr64 + Offset + 1, sizeof(nextkeys)); + memcpy(&keys, keyPtr64 + Offset, sizeof(keys)); Data = svb_decode_sse41((keys & 0xFF), &dataPtr); svb_write_sse41(out, Data);