Skip to content

Commit f406b48

Browse files
authored
Split decoding loop (#34)
1 parent 797304d commit f406b48

File tree

1 file changed

+213
-69
lines changed

1 file changed

+213
-69
lines changed

src/decompress.rs

Lines changed: 213 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,219 @@ impl Decompressor {
415415
output: &mut [u8],
416416
mut output_index: usize,
417417
) -> Result<usize, DecompressionError> {
418+
// Fast decoding loop.
419+
//
420+
// This loop is optimized for speed and is the main decoding loop for the decompressor,
421+
// which is used when there are at least 8 bytes of input and output data available. It
422+
// assumes that the bitbuffer is full (nbits >= 56) and that litlen_entry has been loaded.
423+
//
424+
// These assumptions enable a few optimizations:
425+
// - Nearly all checks for nbits are avoided.
426+
// - Checking the input size is optimized out in the refill function call.
427+
// - The litlen_entry for the next loop iteration can be loaded in parallel with refilling
428+
// the bit buffer. This is because when the input is non-empty, the bit buffer actually
429+
// has 64 bits of valid data (even though nbits will be in 56..=63).
430+
self.fill_buffer(remaining_input);
431+
let mut litlen_entry = self.compression.litlen_table[(self.buffer & 0xfff) as usize];
432+
while self.state == State::CompressedData
433+
&& output_index + 8 <= output.len()
434+
&& remaining_input.len() >= 8
435+
{
436+
// First check whether the next symbol is a literal. This code does up to 2 additional
437+
// table lookups to decode more literals.
438+
let mut bits;
439+
let mut litlen_code_bits = litlen_entry as u8;
440+
if litlen_entry & LITERAL_ENTRY != 0 {
441+
let litlen_entry2 = self.compression.litlen_table
442+
[(self.buffer >> litlen_code_bits & 0xfff) as usize];
443+
let litlen_code_bits2 = litlen_entry2 as u8;
444+
let litlen_entry3 = self.compression.litlen_table
445+
[(self.buffer >> (litlen_code_bits + litlen_code_bits2) & 0xfff) as usize];
446+
let litlen_code_bits3 = litlen_entry3 as u8;
447+
let litlen_entry4 = self.compression.litlen_table[(self.buffer
448+
>> (litlen_code_bits + litlen_code_bits2 + litlen_code_bits3)
449+
& 0xfff)
450+
as usize];
451+
452+
let advance_output_bytes = ((litlen_entry & 0xf00) >> 8) as usize;
453+
output[output_index] = (litlen_entry >> 16) as u8;
454+
output[output_index + 1] = (litlen_entry >> 24) as u8;
455+
output_index += advance_output_bytes;
456+
457+
if litlen_entry2 & LITERAL_ENTRY != 0 {
458+
let advance_output_bytes2 = ((litlen_entry2 & 0xf00) >> 8) as usize;
459+
output[output_index] = (litlen_entry2 >> 16) as u8;
460+
output[output_index + 1] = (litlen_entry2 >> 24) as u8;
461+
output_index += advance_output_bytes2;
462+
463+
if litlen_entry3 & LITERAL_ENTRY != 0 {
464+
let advance_output_bytes3 = ((litlen_entry3 & 0xf00) >> 8) as usize;
465+
output[output_index] = (litlen_entry3 >> 16) as u8;
466+
output[output_index + 1] = (litlen_entry3 >> 24) as u8;
467+
output_index += advance_output_bytes3;
468+
469+
litlen_entry = litlen_entry4;
470+
self.consume_bits(litlen_code_bits + litlen_code_bits2 + litlen_code_bits3);
471+
self.fill_buffer(remaining_input);
472+
continue;
473+
} else {
474+
self.consume_bits(litlen_code_bits + litlen_code_bits2);
475+
litlen_entry = litlen_entry3;
476+
litlen_code_bits = litlen_code_bits3;
477+
self.fill_buffer(remaining_input);
478+
bits = self.buffer;
479+
}
480+
} else {
481+
self.consume_bits(litlen_code_bits);
482+
bits = self.buffer;
483+
litlen_entry = litlen_entry2;
484+
litlen_code_bits = litlen_code_bits2;
485+
if self.nbits < 48 {
486+
self.fill_buffer(remaining_input);
487+
}
488+
}
489+
} else {
490+
bits = self.buffer;
491+
}
492+
493+
// The next symbol is either a 13+ bit literal, back-reference, or an EOF symbol.
494+
let (length_base, length_extra_bits, litlen_code_bits) =
495+
if litlen_entry & EXCEPTIONAL_ENTRY == 0 {
496+
(
497+
litlen_entry >> 16,
498+
(litlen_entry >> 8) as u8,
499+
litlen_code_bits,
500+
)
501+
} else if litlen_entry & SECONDARY_TABLE_ENTRY != 0 {
502+
let secondary_table_index =
503+
(litlen_entry >> 16) + ((bits >> 12) as u32 & (litlen_entry & 0xff));
504+
let secondary_entry =
505+
self.compression.secondary_table[secondary_table_index as usize];
506+
let litlen_symbol = secondary_entry >> 4;
507+
let litlen_code_bits = (secondary_entry & 0xf) as u8;
508+
509+
match litlen_symbol {
510+
0..=255 => {
511+
self.consume_bits(litlen_code_bits);
512+
litlen_entry =
513+
self.compression.litlen_table[(self.buffer & 0xfff) as usize];
514+
self.fill_buffer(remaining_input);
515+
output[output_index] = litlen_symbol as u8;
516+
output_index += 1;
517+
continue;
518+
}
519+
256 => {
520+
self.consume_bits(litlen_code_bits);
521+
self.state = match self.last_block {
522+
true => State::Checksum,
523+
false => State::BlockHeader,
524+
};
525+
break;
526+
}
527+
_ => (
528+
LEN_SYM_TO_LEN_BASE[litlen_symbol as usize - 257] as u32,
529+
LEN_SYM_TO_LEN_EXTRA[litlen_symbol as usize - 257],
530+
litlen_code_bits,
531+
),
532+
}
533+
} else if litlen_code_bits == 0 {
534+
return Err(DecompressionError::InvalidLiteralLengthCode);
535+
} else {
536+
self.consume_bits(litlen_code_bits);
537+
self.state = match self.last_block {
538+
true => State::Checksum,
539+
false => State::BlockHeader,
540+
};
541+
break;
542+
};
543+
bits >>= litlen_code_bits;
544+
545+
let length_extra_mask = (1 << length_extra_bits) - 1;
546+
let length = length_base as usize + (bits & length_extra_mask) as usize;
547+
bits >>= length_extra_bits;
548+
549+
let dist_entry = self.compression.dist_table[(bits & 0x1ff) as usize];
550+
let (dist_base, dist_extra_bits, dist_code_bits) = if dist_entry & LITERAL_ENTRY != 0 {
551+
(
552+
(dist_entry >> 16) as u16,
553+
(dist_entry >> 8) as u8 & 0xf,
554+
dist_entry as u8,
555+
)
556+
} else if dist_entry >> 8 == 0 {
557+
return Err(DecompressionError::InvalidDistanceCode);
558+
} else {
559+
let secondary_table_index =
560+
(dist_entry >> 16) + ((bits >> 9) as u32 & (dist_entry & 0xff));
561+
let secondary_entry =
562+
self.compression.dist_secondary_table[secondary_table_index as usize];
563+
let dist_symbol = (secondary_entry >> 4) as usize;
564+
if dist_symbol >= 30 {
565+
return Err(DecompressionError::InvalidDistanceCode);
566+
}
567+
568+
(
569+
DIST_SYM_TO_DIST_BASE[dist_symbol],
570+
DIST_SYM_TO_DIST_EXTRA[dist_symbol],
571+
(secondary_entry & 0xf) as u8,
572+
)
573+
};
574+
bits >>= dist_code_bits;
575+
576+
let dist = dist_base as usize + (bits & ((1 << dist_extra_bits) - 1)) as usize;
577+
if dist > output_index {
578+
return Err(DecompressionError::DistanceTooFarBack);
579+
}
580+
581+
self.consume_bits(
582+
litlen_code_bits + length_extra_bits + dist_code_bits + dist_extra_bits,
583+
);
584+
self.fill_buffer(remaining_input);
585+
litlen_entry = self.compression.litlen_table[(self.buffer & 0xfff) as usize];
586+
587+
let copy_length = length.min(output.len() - output_index);
588+
if dist == 1 {
589+
let last = output[output_index - 1];
590+
output[output_index..][..copy_length].fill(last);
591+
592+
if copy_length < length {
593+
self.queued_rle = Some((last, length - copy_length));
594+
output_index = output.len();
595+
break;
596+
}
597+
} else if output_index + length + 15 <= output.len() {
598+
let start = output_index - dist;
599+
output.copy_within(start..start + 16, output_index);
600+
601+
if length > 16 || dist < 16 {
602+
for i in (0..length).step_by(dist.min(16)).skip(1) {
603+
output.copy_within(start + i..start + i + 16, output_index + i);
604+
}
605+
}
606+
} else {
607+
if dist < copy_length {
608+
for i in 0..copy_length {
609+
output[output_index + i] = output[output_index + i - dist];
610+
}
611+
} else {
612+
output.copy_within(
613+
output_index - dist..output_index + copy_length - dist,
614+
output_index,
615+
)
616+
}
617+
618+
if copy_length < length {
619+
self.queued_backref = Some((dist, length - copy_length));
620+
output_index = output.len();
621+
break;
622+
}
623+
}
624+
output_index += copy_length;
625+
}
626+
627+
// Careful decoding loop.
628+
//
629+
// This loop processes the remaining input when we're too close to the end of the input or
630+
// output to use the fast loop.
418631
while let State::CompressedData = self.state {
419632
self.fill_buffer(remaining_input);
420633
if output_index == output.len() {
@@ -426,74 +639,10 @@ impl Decompressor {
426639
let litlen_code_bits = litlen_entry as u8;
427640

428641
if litlen_entry & LITERAL_ENTRY != 0 {
429-
// Ultra-fast path: do 3 more consecutive table lookups and bail if any of them need the slow path.
430-
if self.nbits >= 48 {
431-
let litlen_entry2 =
432-
self.compression.litlen_table[(bits >> litlen_code_bits & 0xfff) as usize];
433-
let litlen_code_bits2 = litlen_entry2 as u8;
434-
let litlen_entry3 = self.compression.litlen_table
435-
[(bits >> (litlen_code_bits + litlen_code_bits2) & 0xfff) as usize];
436-
let litlen_code_bits3 = litlen_entry3 as u8;
437-
let litlen_entry4 = self.compression.litlen_table[(bits
438-
>> (litlen_code_bits + litlen_code_bits2 + litlen_code_bits3)
439-
& 0xfff)
440-
as usize];
441-
let litlen_code_bits4 = litlen_entry4 as u8;
442-
if litlen_entry2 & litlen_entry3 & litlen_entry4 & LITERAL_ENTRY != 0 {
443-
let advance_output_bytes = ((litlen_entry & 0xf00) >> 8) as usize;
444-
let advance_output_bytes2 = ((litlen_entry2 & 0xf00) >> 8) as usize;
445-
let advance_output_bytes3 = ((litlen_entry3 & 0xf00) >> 8) as usize;
446-
let advance_output_bytes4 = ((litlen_entry4 & 0xf00) >> 8) as usize;
447-
if output_index
448-
+ advance_output_bytes
449-
+ advance_output_bytes2
450-
+ advance_output_bytes3
451-
+ advance_output_bytes4
452-
< output.len()
453-
{
454-
self.consume_bits(
455-
litlen_code_bits
456-
+ litlen_code_bits2
457-
+ litlen_code_bits3
458-
+ litlen_code_bits4,
459-
);
460-
461-
output[output_index] = (litlen_entry >> 16) as u8;
462-
output[output_index + 1] = (litlen_entry >> 24) as u8;
463-
output_index += advance_output_bytes;
464-
output[output_index] = (litlen_entry2 >> 16) as u8;
465-
output[output_index + 1] = (litlen_entry2 >> 24) as u8;
466-
output_index += advance_output_bytes2;
467-
output[output_index] = (litlen_entry3 >> 16) as u8;
468-
output[output_index + 1] = (litlen_entry3 >> 24) as u8;
469-
output_index += advance_output_bytes3;
470-
output[output_index] = (litlen_entry4 >> 16) as u8;
471-
output[output_index + 1] = (litlen_entry4 >> 24) as u8;
472-
output_index += advance_output_bytes4;
473-
continue;
474-
}
475-
}
476-
}
477-
478642
// Fast path: the next symbol is <= 12 bits and a literal, the table specifies the
479643
// output bytes and we can directly write them to the output buffer.
480644
let advance_output_bytes = ((litlen_entry & 0xf00) >> 8) as usize;
481645

482-
// match advance_output_bytes {
483-
// 1 => println!("[{output_index}] LIT1 {}", litlen_entry >> 16),
484-
// 2 => println!(
485-
// "[{output_index}] LIT2 {} {} {}",
486-
// (litlen_entry >> 16) as u8,
487-
// litlen_entry >> 24,
488-
// bits & 0xfff
489-
// ),
490-
// n => println!(
491-
// "[{output_index}] LIT{n} {} {}",
492-
// (litlen_entry >> 16) as u8,
493-
// litlen_entry >> 24,
494-
// ),
495-
// }
496-
497646
if self.nbits < litlen_code_bits {
498647
break;
499648
} else if output_index + 1 < output.len() {
@@ -536,14 +685,11 @@ impl Decompressor {
536685
if self.nbits < litlen_code_bits {
537686
break;
538687
} else if litlen_symbol < 256 {
539-
// println!("[{output_index}] LIT1b {} (val={:04x})", litlen_symbol, self.peak_bits(15));
540-
541688
self.consume_bits(litlen_code_bits);
542689
output[output_index] = litlen_symbol as u8;
543690
output_index += 1;
544691
continue;
545692
} else if litlen_symbol == 256 {
546-
// println!("[{output_index}] EOF");
547693
self.consume_bits(litlen_code_bits);
548694
self.state = match self.last_block {
549695
true => State::Checksum,
@@ -563,7 +709,6 @@ impl Decompressor {
563709
if self.nbits < litlen_code_bits {
564710
break;
565711
}
566-
// println!("[{output_index}] EOF");
567712
self.consume_bits(litlen_code_bits);
568713
self.state = match self.last_block {
569714
true => State::Checksum,
@@ -618,7 +763,6 @@ impl Decompressor {
618763
return Err(DecompressionError::DistanceTooFarBack);
619764
}
620765

621-
// println!("[{output_index}] BACKREF len={} dist={} {:x}", length, dist, dist_entry);
622766
self.consume_bits(total_bits);
623767

624768
let copy_length = length.min(output.len() - output_index);

0 commit comments

Comments
 (0)