From 4629ca4be0c59a271f5cb74d6e93d9efbc91f831 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Fri, 19 May 2023 17:22:02 -0700 Subject: [PATCH] Optimize ZSTD_decodeSequence when ofBits==0 This patch adds a branch to previously branchless code in the decompression hot loop to handle the case where `ofBits == 0`. Even though a branch is added, the branch saves on instructions that introduce memory dependencies and unneeded memory operations when the condition isn't met. Testing on Intel Skylake shows positive decompression speed improvements across different corpora and compilers, with speed improvements of 1% to 7%. On an M1 MacBook Pro, performance is mostly neutral with a possible very small regression. --- lib/decompress/zstd_decompress_block.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 5028a52f103..e4c1e369645 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -1287,8 +1287,10 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) U32 const ll0 = (llDInfo->baseValue == 0); if (LIKELY((ofBits == 0))) { offset = seqState->prevOffset[ll0]; - seqState->prevOffset[1] = seqState->prevOffset[!ll0]; - seqState->prevOffset[0] = offset; + if(ll0) { + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } } else { offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];