diff --git a/benches/bench.rs b/benches/bench.rs
index 25d6bc3..4f7e639 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -5,14 +5,17 @@ pub const BLUETOOTH: Crc<u8> = Crc::<u8>::new(&CRC_8_BLUETOOTH);
 pub const BLUETOOTH_SLICE16: Crc<u8, Table<16>> = Crc::<u8, Table<16>>::new(&CRC_8_BLUETOOTH);
 pub const BLUETOOTH_BYTEWISE: Crc<u8, Table<1>> = Crc::<u8, Table<1>>::new(&CRC_8_BLUETOOTH);
 pub const BLUETOOTH_NOLOOKUP: Crc<u8, NoTable> = Crc::<u8, NoTable>::new(&CRC_8_BLUETOOTH);
+pub const BLUETOOTH_SIMD: Crc<u8, Simd> = Crc::<u8, Simd>::new(&CRC_8_BLUETOOTH);
 pub const X25: Crc<u16> = Crc::<u16>::new(&CRC_16_IBM_SDLC);
 pub const X25_SLICE16: Crc<u16, Table<16>> = Crc::<u16, Table<16>>::new(&CRC_16_IBM_SDLC);
 pub const X25_BYTEWISE: Crc<u16, Table<1>> = Crc::<u16, Table<1>>::new(&CRC_16_IBM_SDLC);
 pub const X25_NOLOOKUP: Crc<u16, NoTable> = Crc::<u16, NoTable>::new(&CRC_16_IBM_SDLC);
+pub const X25_SIMD: Crc<u16, Simd> = Crc::<u16, Simd>::new(&CRC_16_IBM_SDLC);
 pub const ISCSI: Crc<u32> = Crc::<u32>::new(&CRC_32_ISCSI);
 pub const ISCSI_SLICE16: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&CRC_32_ISCSI);
 pub const ISCSI_BYTEWISE: Crc<u32, Table<1>> = Crc::<u32, Table<1>>::new(&CRC_32_ISCSI);
 pub const ISCSI_NOLOOKUP: Crc<u32, NoTable> = Crc::<u32, NoTable>::new(&CRC_32_ISCSI);
+pub const ISCSI_SIMD: Crc<u32, Simd> = Crc::<u32, Simd>::new(&CRC_32_ISCSI);
 pub const GSM_40: Crc<u64> = Crc::<u64>::new(&CRC_40_GSM);
 pub const ECMA: Crc<u64> = Crc::<u64>::new(&CRC_64_ECMA_182);
 pub const ECMA_SLICE16: Crc<u64, Table<16>> = Crc::<u64, Table<16>>::new(&CRC_64_ECMA_182);
@@ -51,6 +54,9 @@ fn checksum(c: &mut Criterion) {
         })
         .bench_function("slice16", |b| {
             b.iter(|| BLUETOOTH_SLICE16.checksum(black_box(&bytes)))
+        })
+        .bench_function("simd", |b| {
+            b.iter(|| BLUETOOTH_SIMD.checksum(black_box(&bytes)))
         });
 
     c.benchmark_group("crc16")
@@ -64,7 +70,8 @@ fn checksum(c: &mut Criterion) {
         })
         .bench_function("slice16", |b| {
             b.iter(|| X25_SLICE16.checksum(black_box(&bytes)))
-        });
+        })
+        .bench_function("simd", |b| b.iter(|| X25_SIMD.checksum(black_box(&bytes))));
 
     c.benchmark_group("crc32")
         .throughput(Throughput::Bytes(size as u64))
@@ -77,6 +84,9 @@ fn checksum(c: &mut Criterion) {
         })
         .bench_function("slice16", |b| {
             b.iter(|| ISCSI_SLICE16.checksum(black_box(&bytes)))
+        })
+        .bench_function("simd", |b| {
+            b.iter(|| ISCSI_SIMD.checksum(black_box(&bytes)))
         });
 
     c.benchmark_group("crc64")
diff --git a/src/crc128.rs b/src/crc128.rs
index 480061f..9bde2f5 100644
--- a/src/crc128.rs
+++ b/src/crc128.rs
@@ -169,20 +169,19 @@ const fn update_slice16(
 #[cfg(test)]
 mod test {
     use crate::*;
-    use crc_catalog::{Algorithm, CRC_82_DARC};
 
     /// Test this optimized version against the well known implementation to ensure correctness
     #[test]
    fn correctness() {
        let data: &[&str] = &[
-            "",
-            "1",
-            "1234",
-            "123456789",
-            "0123456789ABCDE",
-            "01234567890ABCDEFGHIJK",
-            "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
-        ];
+            "",
+            "1",
+            "1234",
+            "123456789",
+            "0123456789ABCDE",
+            "01234567890ABCDEFGHIJK",
+            "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
+        ];
 
        pub const CRC_82_DARC_NONREFLEX: Algorithm<u128> = Algorithm {
            width: 82,
            refin: false,
            refout: true,
            xorout: 0x000000000000000000000,
-            check: 0x09ea83f625023801fd612,
+            check: 0x12e0b19fa447c0bf627ac,
            residue: 0x000000000000000000000,
        };
 
        let algs_to_test = [&CRC_82_DARC, &CRC_82_DARC_NONREFLEX];
 
+        // Check if the baseline is as expected.
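+        // (The catalog `check` value is defined as the CRC of the ASCII bytes of
+        // "123456789", so this validates the reference implementation itself.)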
+        for alg in algs_to_test {
+            assert_eq!(
+                Crc::<u128, Table<1>>::new(alg).checksum("123456789".as_bytes()),
+                alg.check
+            );
+        }
+
         for alg in algs_to_test {
             for data in data {
                 let crc_slice16 = Crc::<u128, Table<16>>::new(alg);
diff --git a/src/crc16.rs b/src/crc16.rs
index 392cd81..734dc6d 100644
--- a/src/crc16.rs
+++ b/src/crc16.rs
@@ -5,6 +5,13 @@ mod bytewise;
 mod nolookup;
 mod slice16;
 
+#[cfg(all(
+    target_feature = "sse2",
+    target_feature = "sse4.1",
+    target_feature = "pclmulqdq",
+))]
+mod simd;
+
 const fn init(algorithm: &Algorithm<u16>, initial: u16) -> u16 {
     if algorithm.refin {
         initial.reverse_bits() >> (16u8 - algorithm.width)
@@ -141,7 +148,6 @@ const fn update_slice16(
 #[cfg(test)]
 mod test {
     use crate::*;
-    use crc_catalog::{Algorithm, CRC_16_IBM_SDLC};
 
     /// Test this optimized version against the well known implementation to ensure correctness
     #[test]
@@ -156,28 +162,72 @@ mod test {
             "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
         ];
 
-        pub const CRC_16_IBM_SDLC_NONREFLEX: Algorithm<u16> = Algorithm {
-            width: 16,
-            poly: 0x1021,
-            init: 0xffff,
-            refin: false,
-            refout: true,
-            xorout: 0xffff,
-            check: 0x906e,
-            residue: 0xf0b8,
-        };
+        let algs_to_test = &[
+            CRC_10_ATM,
+            CRC_10_CDMA2000,
+            CRC_10_GSM,
+            CRC_11_FLEXRAY,
+            CRC_11_UMTS,
+            CRC_12_CDMA2000,
+            CRC_12_DECT,
+            CRC_12_GSM,
+            CRC_12_UMTS,
+            CRC_13_BBC,
+            CRC_14_DARC,
+            CRC_14_GSM,
+            CRC_15_CAN,
+            CRC_15_MPT1327,
+            CRC_16_ARC,
+            CRC_16_CDMA2000,
+            CRC_16_CMS,
+            CRC_16_DDS_110,
+            CRC_16_DECT_R,
+            CRC_16_DECT_X,
+            CRC_16_DNP,
+            CRC_16_EN_13757,
+            CRC_16_GENIBUS,
+            CRC_16_GSM,
+            CRC_16_IBM_3740,
+            CRC_16_IBM_SDLC,
+            CRC_16_ISO_IEC_14443_3_A,
+            CRC_16_KERMIT,
+            CRC_16_LJ1200,
+            CRC_16_MAXIM_DOW,
+            CRC_16_MCRF4XX,
+            CRC_16_MODBUS,
+            CRC_16_NRSC_5,
+            CRC_16_OPENSAFETY_A,
+            CRC_16_OPENSAFETY_B,
+            CRC_16_PROFIBUS,
+            CRC_16_RIELLO,
+            CRC_16_SPI_FUJITSU,
+            CRC_16_T10_DIF,
+            CRC_16_TELEDISK,
+            CRC_16_TMS37157,
+            CRC_16_UMTS,
+            CRC_16_USB,
+            CRC_16_XMODEM,
+        ];
 
-        let algs_to_test = [&CRC_16_IBM_SDLC, &CRC_16_IBM_SDLC_NONREFLEX];
+        // Check if the baseline is as expected.
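+        // Only reflected (`refin = true`) algorithms take the clmul code path;
+        // the others exercise the bytewise fallback inside the SIMD implementation.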
+        for alg in algs_to_test {
+            assert_eq!(
+                Crc::<u16, Table<1>>::new(alg).checksum("123456789".as_bytes()),
+                alg.check
+            );
+        }
 
         for alg in algs_to_test {
             for data in data {
                 let crc_slice16 = Crc::<u16, Table<16>>::new(alg);
                 let crc_nolookup = Crc::<u16, NoTable>::new(alg);
+                let crc_clmul = Crc::<u16, Simd>::new(alg);
                 let expected = Crc::<u16, Table<1>>::new(alg).checksum(data.as_bytes());
 
                 // Check that doing all at once works as expected
                 assert_eq!(crc_slice16.checksum(data.as_bytes()), expected);
                 assert_eq!(crc_nolookup.checksum(data.as_bytes()), expected);
+                assert_eq!(crc_clmul.checksum(data.as_bytes()), expected);
 
                 let mut digest = crc_slice16.digest();
                 digest.update(data.as_bytes());
@@ -187,6 +237,10 @@ mod test {
                 digest.update(data.as_bytes());
                 assert_eq!(digest.finalize(), expected);
 
+                let mut digest = crc_clmul.digest();
+                digest.update(data.as_bytes());
+                assert_eq!(digest.finalize(), expected);
+
                 // Check that we didn't break updating from multiple sources
                 if data.len() > 2 {
                     let data = data.as_bytes();
diff --git a/src/crc16/simd.rs b/src/crc16/simd.rs
new file mode 100644
index 0000000..4ecb26f
--- /dev/null
+++ b/src/crc16/simd.rs
@@ -0,0 +1,66 @@
+use crate::crc16::{finalize, init, update_bytewise};
+use crate::*;
+use crate::{simd::crc32_coeff, table::crc16_table};
+
+use self::simd::{crc32_update_refin, Value};
+
+impl Crc<u16, Simd> {
+    pub const fn new(algorithm: &'static Algorithm<u16>) -> Self {
+        let table = crc16_table(algorithm.width, algorithm.poly, algorithm.refin);
+        let coeff = crc32_coeff(algorithm.width, algorithm.poly as u32);
+        Self {
+            algorithm,
+            data: (table, coeff),
+        }
+    }
+
+    pub fn checksum(&self, bytes: &[u8]) -> u16 {
+        let mut crc = init(self.algorithm, self.algorithm.init);
+        crc = self.update(crc, bytes);
+        finalize(self.algorithm, crc)
+    }
+
+    fn update(&self, mut crc: u16, bytes: &[u8]) -> u16 {
+        if !self.algorithm.refin {
+            return update_bytewise(crc, self.algorithm.refin, &self.data.0, bytes);
+        }
+
+        // SAFETY: The chunks returned by `align_to` are always suitably aligned for
+        // this platform; 64 bytes are reinterpreted as 4 x 128-bit values, and the
+        // lifetime and mutability do not change.
+        let (bytes_before, chunks, bytes_after) = unsafe { bytes.align_to::<[Value; 4]>() };
+        crc = update_bytewise(crc, self.algorithm.refin, &self.data.0, bytes_before);
+        if let Some(first_chunk) = chunks.first() {
+            crc = crc32_update_refin(crc as u32, &self.data.1, first_chunk, &chunks[1..]) as u16;
+        }
+        update_bytewise(crc, self.algorithm.refin, &self.data.0, bytes_after)
+    }
+
+    pub const fn digest(&self) -> Digest<u16, Simd> {
+        self.digest_with_initial(self.algorithm.init)
+    }
+
+    /// Construct a `Digest` with a given initial value.
+    ///
+    /// This overrides the initial value specified by the algorithm.
+    /// The effects of the algorithm's properties `refin` and `width`
+    /// are applied to the custom initial value.
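+    ///
+    /// A minimal usage sketch (assuming a reflected algorithm such as
+    /// `CRC_16_IBM_SDLC`, whose catalog `init` is `0xffff`):
+    ///
+    /// ```ignore
+    /// let crc = Crc::<u16, Simd>::new(&CRC_16_IBM_SDLC);
+    /// let mut digest = crc.digest_with_initial(0xffff);
+    /// digest.update(b"123456789");
+    /// let checksum = digest.finalize();
+    /// ```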
+    pub const fn digest_with_initial(&self, initial: u16) -> Digest<u16, Simd> {
+        let value = init(self.algorithm, initial);
+        Digest::new(self, value)
+    }
+}
+
+impl<'a> Digest<'a, u16, Simd> {
+    const fn new(crc: &'a Crc<u16, Simd>, value: u16) -> Self {
+        Digest { crc, value }
+    }
+
+    pub fn update(&mut self, bytes: &[u8]) {
+        self.value = self.crc.update(self.value, bytes);
+    }
+
+    pub const fn finalize(self) -> u16 {
+        finalize(self.crc.algorithm, self.value)
+    }
+}
diff --git a/src/crc32.rs b/src/crc32.rs
index 4385685..a5c6418 100644
--- a/src/crc32.rs
+++ b/src/crc32.rs
@@ -5,6 +5,13 @@ mod bytewise;
 mod nolookup;
 mod slice16;
 
+#[cfg(all(
+    target_feature = "sse2",
+    target_feature = "sse4.1",
+    target_feature = "pclmulqdq",
+))]
+mod simd;
+
 // init is shared between all impls
 const fn init(algorithm: &Algorithm<u32>, initial: u32) -> u32 {
     if algorithm.refin {
@@ -152,44 +159,65 @@ const fn update_slice16(
 #[cfg(test)]
 mod test {
     use crate::*;
-    use crc_catalog::{Algorithm, CRC_32_ISCSI};
 
     /// Test this optimized version against the well known implementation to ensure correctness
     #[test]
     fn correctness() {
         let data: &[&str] = &[
-            "",
-            "1",
-            "1234",
-            "123456789",
-            "0123456789ABCDE",
-            "01234567890ABCDEFGHIJK",
-            "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
-        ];
-
-        pub const CRC_32_ISCSI_NONREFLEX: Algorithm<u32> = Algorithm {
-            width: 32,
-            poly: 0x1edc6f41,
-            init: 0xffffffff,
-            // This is the only flag that affects the optimized code path
-            refin: false,
-            refout: true,
-            xorout: 0xffffffff,
-            check: 0xe3069283,
-            residue: 0xb798b438,
-        };
-
-        let algs_to_test = [&CRC_32_ISCSI, &CRC_32_ISCSI_NONREFLEX];
+            "",
+            "1",
+            "1234",
+            "123456789",
+            "0123456789ABCDE",
+            "01234567890ABCDEFGHIJK",
+            "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
+        ];
+
+        let algs_to_test = &[
+            CRC_17_CAN_FD,
+            CRC_21_CAN_FD,
+            CRC_24_BLE,
+            CRC_24_FLEXRAY_A,
+            CRC_24_FLEXRAY_B,
+            CRC_24_INTERLAKEN,
+            CRC_24_LTE_A,
+            CRC_24_LTE_B,
+            CRC_24_OPENPGP,
+            CRC_24_OS_9,
+            CRC_30_CDMA,
+            CRC_31_PHILIPS,
+            CRC_32_AIXM,
+            CRC_32_AUTOSAR,
+            CRC_32_BASE91_D,
+            CRC_32_BZIP2,
+            CRC_32_CD_ROM_EDC,
+            CRC_32_CKSUM,
+            CRC_32_ISCSI,
+            CRC_32_ISO_HDLC,
+            CRC_32_JAMCRC,
+            CRC_32_MPEG_2,
+            CRC_32_XFER,
+        ];
+
+        // Check if the baseline is as expected.
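+        // The list mixes reflected algorithms (e.g. CRC_32_ISCSI, CRC_32_ISO_HDLC),
+        // which use the clmul path, with non-reflected ones (e.g. CRC_32_BZIP2,
+        // CRC_32_MPEG_2), which fall back to the bytewise update.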
+        for alg in algs_to_test {
+            assert_eq!(
+                Crc::<u32, Table<1>>::new(alg).checksum("123456789".as_bytes()),
+                alg.check
+            );
+        }
 
         for alg in algs_to_test {
             for data in data {
                 let crc_slice16 = Crc::<u32, Table<16>>::new(alg);
                 let crc_nolookup = Crc::<u32, NoTable>::new(alg);
+                let crc_clmul = Crc::<u32, Simd>::new(alg);
                 let expected = Crc::<u32, Table<1>>::new(alg).checksum(data.as_bytes());
 
                 // Check that doing all at once works as expected
                 assert_eq!(crc_slice16.checksum(data.as_bytes()), expected);
                 assert_eq!(crc_nolookup.checksum(data.as_bytes()), expected);
+                assert_eq!(crc_clmul.checksum(data.as_bytes()), expected);
 
                 let mut digest = crc_slice16.digest();
                 digest.update(data.as_bytes());
@@ -199,6 +227,10 @@ mod test {
                 digest.update(data.as_bytes());
                 assert_eq!(digest.finalize(), expected);
 
+                let mut digest = crc_clmul.digest();
+                digest.update(data.as_bytes());
+                assert_eq!(digest.finalize(), expected);
+
                 // Check that we didn't break updating from multiple sources
                 if data.len() > 2 {
                     let data = data.as_bytes();
@@ -212,6 +244,10 @@ mod test {
                     digest.update(data1);
                     digest.update(data2);
                     assert_eq!(digest.finalize(), expected);
+                    let mut digest = crc_clmul.digest();
+                    digest.update(data1);
+                    digest.update(data2);
+                    assert_eq!(digest.finalize(), expected);
                 }
             }
         }
diff --git a/src/crc32/simd.rs b/src/crc32/simd.rs
new file mode 100644
index 0000000..3f841da
--- /dev/null
+++ b/src/crc32/simd.rs
@@ -0,0 +1,67 @@
+use crate::*;
+use crate::{simd::crc32_coeff, table::crc32_table};
+
+use crate::crc32::{finalize, init, update_bytewise};
+
+use self::simd::{crc32_update_refin, Value};
+
+impl Crc<u32, Simd> {
+    pub const fn new(algorithm: &'static Algorithm<u32>) -> Self {
+        let table = crc32_table(algorithm.width, algorithm.poly, algorithm.refin);
+        let coeff = crc32_coeff(algorithm.width, algorithm.poly);
+        Self {
+            algorithm,
+            data: (table, coeff),
+        }
+    }
+
+    pub fn checksum(&self, bytes: &[u8]) -> u32 {
+        let mut crc = init(self.algorithm, self.algorithm.init);
+        crc = self.update(crc, bytes);
+        finalize(self.algorithm, crc)
+    }
+
+    fn update(&self, mut crc: u32, bytes: &[u8]) -> u32 {
+        if !self.algorithm.refin {
+            return update_bytewise(crc, self.algorithm.refin, &self.data.0, bytes);
+        }
+
+        // SAFETY: The chunks returned by `align_to` are always suitably aligned for
+        // this platform; 64 bytes are reinterpreted as 4 x 128-bit values, and the
+        // lifetime and mutability do not change.
+        let (bytes_before, chunks, bytes_after) = unsafe { bytes.align_to::<[Value; 4]>() };
+        crc = update_bytewise(crc, self.algorithm.refin, &self.data.0, bytes_before);
+        if let Some(first_chunk) = chunks.first() {
+            crc = crc32_update_refin(crc, &self.data.1, first_chunk, &chunks[1..]);
+        }
+        update_bytewise(crc, self.algorithm.refin, &self.data.0, bytes_after)
+    }
+
+    pub const fn digest(&self) -> Digest<u32, Simd> {
+        self.digest_with_initial(self.algorithm.init)
+    }
+
+    /// Construct a `Digest` with a given initial value.
+    ///
+    /// This overrides the initial value specified by the algorithm.
+    /// The effects of the algorithm's properties `refin` and `width`
+    /// are applied to the custom initial value.
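+    ///
+    /// A minimal usage sketch (assuming a reflected algorithm such as
+    /// `CRC_32_ISCSI`, whose catalog `init` is `0xffffffff`):
+    ///
+    /// ```ignore
+    /// let crc = Crc::<u32, Simd>::new(&CRC_32_ISCSI);
+    /// let mut digest = crc.digest_with_initial(0xffffffff);
+    /// digest.update(b"123456789");
+    /// assert_eq!(digest.finalize(), CRC_32_ISCSI.check);
+    /// ```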
+    pub const fn digest_with_initial(&self, initial: u32) -> Digest<u32, Simd> {
+        let value = init(self.algorithm, initial);
+        Digest::new(self, value)
+    }
+}
+
+impl<'a> Digest<'a, u32, Simd> {
+    const fn new(crc: &'a Crc<u32, Simd>, value: u32) -> Self {
+        Digest { crc, value }
+    }
+
+    pub fn update(&mut self, bytes: &[u8]) {
+        self.value = self.crc.update(self.value, bytes);
+    }
+
+    pub const fn finalize(self) -> u32 {
+        finalize(self.crc.algorithm, self.value)
+    }
+}
diff --git a/src/crc64.rs b/src/crc64.rs
index 6793453..4fd4bd0 100644
--- a/src/crc64.rs
+++ b/src/crc64.rs
@@ -154,33 +154,35 @@ const fn update_slice16(
 #[cfg(test)]
 mod test {
     use crate::*;
-    use crc_catalog::{Algorithm, CRC_64_ECMA_182};
 
     /// Test this optimized version against the well known implementation to ensure correctness
     #[test]
     fn correctness() {
         let data: &[&str] = &[
-            "",
-            "1",
-            "1234",
-            "123456789",
-            "0123456789ABCDE",
-            "01234567890ABCDEFGHIJK",
-            "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
-        ];
-
-        pub const CRC_64_ECMA_182_REFLEX: Algorithm<u64> = Algorithm {
-            width: 64,
-            poly: 0x42f0e1eba9ea3693,
-            init: 0x0000000000000000,
-            refin: true,
-            refout: false,
-            xorout: 0x0000000000000000,
-            check: 0x6c40df5f0b497347,
-            residue: 0x0000000000000000,
-        };
-
-        let algs_to_test = [&CRC_64_ECMA_182, &CRC_64_ECMA_182_REFLEX];
+            "",
+            "1",
+            "1234",
+            "123456789",
+            "0123456789ABCDE",
+            "01234567890ABCDEFGHIJK",
+            "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
+        ];
+
+        let algs_to_test = &[
+            CRC_40_GSM,
+            CRC_64_ECMA_182,
+            CRC_64_GO_ISO,
+            CRC_64_WE,
+            CRC_64_XZ,
+        ];
+
+        // Check if the baseline is as expected.
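+        // CRC_40_GSM is covered here as well, since widths above 32 bits share the
+        // u64 implementations; no SIMD variant is added for crc64 in this change.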
+        for alg in algs_to_test {
+            assert_eq!(
+                Crc::<u64, Table<1>>::new(alg).checksum("123456789".as_bytes()),
+                alg.check
+            );
+        }
 
         for alg in algs_to_test {
             for data in data {
diff --git a/src/crc8.rs b/src/crc8.rs
index 9e38289..b2b4e93 100644
--- a/src/crc8.rs
+++ b/src/crc8.rs
@@ -5,6 +5,13 @@ mod bytewise;
 mod nolookup;
 mod slice16;
 
+#[cfg(all(
+    target_feature = "sse2",
+    target_feature = "sse4.1",
+    target_feature = "pclmulqdq",
+))]
+mod simd;
+
 const fn init(algorithm: &Algorithm<u8>, initial: u8) -> u8 {
     if algorithm.refin {
         initial.reverse_bits() >> (8u8 - algorithm.width)
@@ -88,7 +95,6 @@ const fn update_slice16(mut crc: u8, table: &[[u8; 256]; 16], bytes: &[u8]) -> u8 {
 #[cfg(test)]
 mod test {
     use crate::*;
-    use crc_catalog::{Algorithm, CRC_8_BLUETOOTH};
 
     /// Test this optimized version against the well known implementation to ensure correctness
     #[test]
@@ -103,28 +109,62 @@ mod test {
             "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
         ];
 
-        pub const CRC_8_BLUETOOTH_NONREFLEX: Algorithm<u8> = Algorithm {
-            width: 8,
-            poly: 0xa7,
-            init: 0x00,
-            refin: false,
-            refout: true,
-            xorout: 0x00,
-            check: 0x26,
-            residue: 0x00,
-        };
+        let algs_to_test = &[
+            CRC_3_GSM,
+            CRC_3_ROHC,
+            CRC_4_G_704,
+            CRC_4_INTERLAKEN,
+            CRC_5_EPC_C1G2,
+            CRC_5_G_704,
+            CRC_5_USB,
+            CRC_6_CDMA2000_A,
+            CRC_6_CDMA2000_B,
+            CRC_6_DARC,
+            CRC_6_G_704,
+            CRC_6_GSM,
+            CRC_7_MMC,
+            CRC_7_ROHC,
+            CRC_7_UMTS,
+            CRC_8_AUTOSAR,
+            CRC_8_BLUETOOTH,
+            CRC_8_CDMA2000,
+            CRC_8_DARC,
+            CRC_8_DVB_S2,
+            CRC_8_GSM_A,
+            CRC_8_GSM_B,
+            CRC_8_I_432_1,
+            CRC_8_I_CODE,
+            CRC_8_LTE,
+            CRC_8_MAXIM_DOW,
+            CRC_8_MIFARE_MAD,
+            CRC_8_NRSC_5,
+            CRC_8_OPENSAFETY,
+            CRC_8_ROHC,
+            CRC_8_SAE_J1850,
+            CRC_8_SMBUS,
+            CRC_8_TECH_3250,
+            CRC_8_WCDMA,
+        ];
 
-        let algs_to_test = [&CRC_8_BLUETOOTH, &CRC_8_BLUETOOTH_NONREFLEX];
+        // Check if the baseline is as expected.
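+        // Widths from 3 to 8 bits are all backed by the u8 implementations, so the
+        // whole sub-byte part of the catalog is exercised against the same baseline.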
+        for alg in algs_to_test {
+            assert_eq!(
+                Crc::<u8, Table<1>>::new(alg).checksum("123456789".as_bytes()),
+                alg.check
+            );
+        }
 
         for alg in algs_to_test {
             for data in data {
                 let crc_slice16 = Crc::<u8, Table<16>>::new(alg);
                 let crc_nolookup = Crc::<u8, NoTable>::new(alg);
+                let crc_clmul = Crc::<u8, Simd>::new(alg);
                 let expected = Crc::<u8, Table<1>>::new(alg).checksum(data.as_bytes());
 
                 // Check that doing all at once works as expected
                 assert_eq!(crc_slice16.checksum(data.as_bytes()), expected);
                 assert_eq!(crc_nolookup.checksum(data.as_bytes()), expected);
+                assert_eq!(crc_clmul.checksum(data.as_bytes()), expected);
 
                 let mut digest = crc_slice16.digest();
                 digest.update(data.as_bytes());
@@ -134,6 +174,10 @@ mod test {
                 digest.update(data.as_bytes());
                 assert_eq!(digest.finalize(), expected);
 
+                let mut digest = crc_clmul.digest();
+                digest.update(data.as_bytes());
+                assert_eq!(digest.finalize(), expected);
+
                 // Check that we didn't break updating from multiple sources
                 if data.len() > 2 {
                     let data = data.as_bytes();
@@ -147,6 +191,10 @@ mod test {
                     digest.update(data1);
                     digest.update(data2);
                     assert_eq!(digest.finalize(), expected);
+                    let mut digest = crc_clmul.digest();
+                    digest.update(data1);
+                    digest.update(data2);
+                    assert_eq!(digest.finalize(), expected);
                 }
             }
         }
diff --git a/src/crc8/simd.rs b/src/crc8/simd.rs
new file mode 100644
index 0000000..21d629d
--- /dev/null
+++ b/src/crc8/simd.rs
@@ -0,0 +1,66 @@
+use crate::crc8::{finalize, init, update_bytewise};
+use crate::*;
+use crate::{simd::crc32_coeff, table::crc8_table};
+
+use self::simd::{crc32_update_refin, Value};
+
+impl Crc<u8, Simd> {
+    pub const fn new(algorithm: &'static Algorithm<u8>) -> Self {
+        let table = crc8_table(algorithm.width, algorithm.poly, algorithm.refin);
+        let coeff = crc32_coeff(algorithm.width, algorithm.poly as u32);
+        Self {
+            algorithm,
+            data: (table, coeff),
+        }
+    }
+
+    pub fn checksum(&self, bytes: &[u8]) -> u8 {
+        let mut crc = init(self.algorithm, self.algorithm.init);
+        crc = self.update(crc, bytes);
+        finalize(self.algorithm, crc)
+    }
+
+    fn update(&self, mut crc: u8, bytes: &[u8]) -> u8 {
+        if !self.algorithm.refin {
+            return update_bytewise(crc, &self.data.0, bytes);
+        }
+
+        // SAFETY: The chunks returned by `align_to` are always suitably aligned for
+        // this platform; 64 bytes are reinterpreted as 4 x 128-bit values, and the
+        // lifetime and mutability do not change.
+        let (bytes_before, chunks, bytes_after) = unsafe { bytes.align_to::<[Value; 4]>() };
+        crc = update_bytewise(crc, &self.data.0, bytes_before);
+        if let Some(first_chunk) = chunks.first() {
+            crc = crc32_update_refin(crc as u32, &self.data.1, first_chunk, &chunks[1..]) as u8;
+        }
+        update_bytewise(crc, &self.data.0, bytes_after)
+    }
+
+    pub const fn digest(&self) -> Digest<u8, Simd> {
+        self.digest_with_initial(self.algorithm.init)
+    }
+
+    /// Construct a `Digest` with a given initial value.
+    ///
+    /// This overrides the initial value specified by the algorithm.
+    /// The effects of the algorithm's properties `refin` and `width`
+    /// are applied to the custom initial value.
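+    ///
+    /// A minimal usage sketch (assuming a reflected algorithm such as
+    /// `CRC_8_BLUETOOTH`, whose catalog `init` is `0x00`):
+    ///
+    /// ```ignore
+    /// let crc = Crc::<u8, Simd>::new(&CRC_8_BLUETOOTH);
+    /// let mut digest = crc.digest_with_initial(0x00);
+    /// digest.update(b"123456789");
+    /// assert_eq!(digest.finalize(), CRC_8_BLUETOOTH.check);
+    /// ```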
+    pub const fn digest_with_initial(&self, initial: u8) -> Digest<u8, Simd> {
+        let value = init(self.algorithm, initial);
+        Digest::new(self, value)
+    }
+}
+
+impl<'a> Digest<'a, u8, Simd> {
+    const fn new(crc: &'a Crc<u8, Simd>, value: u8) -> Self {
+        Digest { crc, value }
+    }
+
+    pub fn update(&mut self, bytes: &[u8]) {
+        self.value = self.crc.update(self.value, bytes);
+    }
+
+    pub const fn finalize(self) -> u8 {
+        finalize(self.crc.algorithm, self.value)
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 20549e7..83460d4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -26,7 +26,7 @@
 //! assert_eq!(digest.finalize(), 0xaee7);
 //! ```
 #![no_std]
-#![forbid(unsafe_code)]
+//#![forbid(unsafe_code)]
 
 pub use crc_catalog::algorithm::*;
 pub use crc_catalog::{Algorithm, Width};
@@ -39,6 +39,13 @@ mod crc8;
 mod table;
 mod util;
 
+#[cfg(all(
+    target_feature = "sse2",
+    target_feature = "sse4.1",
+    target_feature = "pclmulqdq"
+))]
+mod simd;
+
 /// A trait for CRC implementations.
 pub trait Implementation: private::Sealed {
     /// Associated data necessary for the implementation (e.g. lookup tables).
@@ -49,6 +56,29 @@ pub trait Implementation: private::Sealed {
 /// The number of entries in the lookup table is `L * 256`.
 pub struct Table<const L: usize> {}
 
+/// A carry-less-multiplication (PCLMULQDQ) based implementation of the CRC
+/// algorithm that folds several 128-bit lanes at a time and only requires
+/// 8 coefficients plus a 256-entry lookup table for its fallback path.
+#[cfg(any(
+    doc,
+    all(
+        target_feature = "sse2",
+        target_feature = "sse4.1",
+        target_feature = "pclmulqdq"
+    )
+))]
+pub struct Simd {}
+
+#[cfg(not(any(
+    doc,
+    all(
+        target_feature = "sse2",
+        target_feature = "sse4.1",
+        target_feature = "pclmulqdq"
+    )
+)))]
+pub type Simd = DefaultImpl;
+
 /// An implementation of the CRC algorithm with no lookup table.
 pub type NoTable = Table<0>;
 
@@ -58,9 +88,25 @@ impl<const L: usize> Implementation for Table<L> {
     type Data<W> = [[W; 256]; L];
 }
 
+#[cfg(all(
+    target_feature = "sse2",
+    target_feature = "sse4.1",
+    target_feature = "pclmulqdq"
+))]
+impl Implementation for Simd {
+    type Data<W> = ([W; 256], [simd::Value; 4]);
+}
+
 mod private {
     pub trait Sealed {}
     impl<const L: usize> Sealed for super::Table<L> {}
+
+    #[cfg(all(
+        target_feature = "sse2",
+        target_feature = "sse4.1",
+        target_feature = "pclmulqdq"
+    ))]
+    impl Sealed for super::Simd {}
 }
 
 /// Crc instance with a specific width, algorithm, and implementation.
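With the `Simd` marker and its feature-gated fallback in place, downstream code can opt into the new backend the same way it selects a table size today. A minimal sketch, assuming the `Crc<u32, Simd>` spelling used in the benchmark above (the `CASTAGNOLI` constant name is illustrative):

```rust
use crc::{Crc, Simd, CRC_32_ISCSI};

// Hypothetical constant name; on targets without sse2/sse4.1/pclmulqdq the
// `Simd` alias resolves to the default table-driven implementation, so this
// builds and produces the same checksum either way.
const CASTAGNOLI: Crc<u32, Simd> = Crc::<u32, Simd>::new(&CRC_32_ISCSI);

fn main() {
    assert_eq!(CASTAGNOLI.checksum(b"123456789"), CRC_32_ISCSI.check);
}
```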
diff --git a/src/simd.rs b/src/simd.rs
new file mode 100644
index 0000000..4d0d083
--- /dev/null
+++ b/src/simd.rs
@@ -0,0 +1,112 @@
+#[cfg(all(
+    target_feature = "sse2",
+    target_feature = "sse4.1",
+    target_feature = "pclmulqdq",
+))]
+mod x86;
+
+#[cfg(all(
+    target_feature = "sse2",
+    target_feature = "sse4.1",
+    target_feature = "pclmulqdq",
+))]
+pub(crate) use x86::Value;
+
+trait ValueOps {
+    fn xor(self, value: u64) -> Self;
+
+    fn fold_16(self, x_mod_p: Self, value: Self) -> Self;
+
+    fn fold_8(self, x_mod_p: Self) -> Self;
+
+    fn fold_4(self, x_mod_p: Self) -> Self;
+
+    fn barret_reduction_32(self, px_u: Self) -> u32;
+}
+
+pub(crate) const fn crc32_coeff(width: u8, poly: u32) -> [Value; 4] {
+    const fn xt_mod_px(mut t: u32, px: u64) -> u64 {
+        if t < 32 {
+            return 0;
+        }
+        t -= 31;
+
+        let mut n = 0x80000000;
+        let mut i = 0;
+        while i < t {
+            n <<= 1;
+            if n & 0x100000000 != 0 {
+                n ^= px;
+            }
+            i += 1;
+        }
+        n << 32
+    }
+
+    const fn u(px: u64) -> u64 {
+        let mut q = 0;
+        let mut n = 0x100000000;
+        let mut i = 0;
+        while i < 33 {
+            q <<= 1;
+            if n & 0x100000000 != 0 {
+                q |= 1;
+                n ^= px;
+            }
+            n <<= 1;
+            i += 1;
+        }
+        q
+    }
+
+    let px = (poly as u64) << (u32::BITS as u8 - width);
+    unsafe {
+        // SAFETY: This will be evaluated during compile-time and therefore the alignment
+        // doesn't matter, the type is transmuted from 2*u64 to u64x2 simd type.
+        core::mem::transmute([
+            xt_mod_px(4 * 128 + 32, px).reverse_bits() << 1,
+            xt_mod_px(4 * 128 - 32, px).reverse_bits() << 1,
+            xt_mod_px(128 + 32, px).reverse_bits() << 1,
+            xt_mod_px(128 - 32, px).reverse_bits() << 1,
+            xt_mod_px(64, px).reverse_bits() << 1,
+            xt_mod_px(32, px).reverse_bits() << 1,
+            px.reverse_bits() >> 31,
+            u(px).reverse_bits() >> 31,
+        ])
+    }
+}
+
+pub(crate) fn crc32_update_refin(
+    crc: u32,
+    coeff: &[Value; 4],
+    first_chunk: &[Value; 4],
+    chunks: &[[Value; 4]],
+) -> u32 {
+    let mut x4 = *first_chunk;
+
+    // Apply initial crc value
+    x4[0] = x4[0].xor(crc as u64);
+
+    // Iteratively Fold by 4:
+    let k1_k2 = coeff[0];
+    for chunk in chunks {
+        for (x, value) in x4.iter_mut().zip(chunk.iter()) {
+            *x = x.fold_16(k1_k2, *value)
+        }
+    }
+
+    // Iteratively Fold by 1:
+    let k3_k4 = coeff[1];
+    let mut x = x4[0].fold_16(k3_k4, x4[1]);
+    x = x.fold_16(k3_k4, x4[2]);
+    x = x.fold_16(k3_k4, x4[3]);
+
+    // Final Reduction of 128-bits
+    let k5_k6 = coeff[2];
+    x = x.fold_8(k3_k4);
+    x = x.fold_4(k5_k6);
+
+    // Barrett reduction
+    let px_u = coeff[3];
+    x.barret_reduction_32(px_u)
+}
diff --git a/src/simd/x86.rs b/src/simd/x86.rs
new file mode 100644
index 0000000..2ec815e
--- /dev/null
+++ b/src/simd/x86.rs
@@ -0,0 +1,68 @@
+use crate::simd::ValueOps;
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86 as arch;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64 as arch;
+use core::mem;
+
+#[derive(Copy, Clone)]
+pub struct Value(arch::__m128i);
+
+impl ValueOps for Value {
+    #[inline]
+    fn xor(self, value: u64) -> Self {
+        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
+        unsafe {
+            Self(arch::_mm_xor_si128(
+                self.0,
+                arch::_mm_set_epi64x(0, value as i64),
+            ))
+        }
+    }
+
+    #[inline]
+    fn fold_16(self, x_mod_p: Self, value: Self) -> Self {
+        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
+        unsafe {
+            Self(arch::_mm_xor_si128(
+                arch::_mm_clmulepi64_si128(self.0, x_mod_p.0, 0x00),
+                arch::_mm_xor_si128(arch::_mm_clmulepi64_si128(self.0, x_mod_p.0, 0x11), value.0),
+            ))
+        }
+    }
+
+    #[inline]
+    fn fold_8(self, x_mod_p: Self) -> Self {
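+        // Folds the 128-bit value: carry-less multiply of the low 64 bits with one
+        // half of `x_mod_p`, XORed with the upper 64 bits shifted down into place.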
+        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
+        unsafe {
+            Self(arch::_mm_xor_si128(
+                arch::_mm_clmulepi64_si128(self.0, x_mod_p.0, 0x10),
+                arch::_mm_srli_si128(self.0, 8),
+            ))
+        }
+    }
+
+    #[inline]
+    fn fold_4(self, x_mod_p: Self) -> Self {
+        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
+        unsafe {
+            Self(arch::_mm_xor_si128(
+                arch::_mm_clmulepi64_si128(arch::_mm_and_si128(self.0, MASK), x_mod_p.0, 0x00),
+                arch::_mm_srli_si128(self.0, 4),
+            ))
+        }
+    }
+
+    #[inline]
+    fn barret_reduction_32(self, px_u: Self) -> u32 {
+        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
+        unsafe {
+            let t1 = arch::_mm_clmulepi64_si128(arch::_mm_and_si128(self.0, MASK), px_u.0, 0x10);
+            let t2 = arch::_mm_clmulepi64_si128(arch::_mm_and_si128(t1, MASK), px_u.0, 0x00);
+            arch::_mm_extract_epi32(arch::_mm_xor_si128(self.0, t2), 1) as u32
+        }
+    }
+}
+
+const MASK: arch::__m128i = unsafe { mem::transmute((1u128 << 32) - 1) };
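+
+// Note: `MASK` keeps only the low 32 bits of a 128-bit lane; `fold_4` and
+// `barret_reduction_32` apply it before their carry-less multiplications.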