Skip to content

Commit

Permalink
Use xor_into3 in CBC
Browse files Browse the repository at this point in the history
  • Loading branch information
reynir committed Feb 27, 2024
1 parent 3ebc0e3 commit c03de64
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 5 deletions.
10 changes: 5 additions & 5 deletions src/cipher_block.ml
Original file line number Diff line number Diff line change
Expand Up @@ -172,15 +172,15 @@ module Modes = struct

(* CBC encryption of [src] under [key], chained from [iv].

   A fresh buffer with the same length as [src] is allocated and returned;
   [src] is only read. For each block, [Native.xor_into3] writes
   (chaining value XOR plaintext block) into the output, and that output
   block is then encrypted in place. The chaining value is [iv] for the
   first block and, for every later block, the ciphertext block produced by
   the previous step -- which lives in [dst], never in [src]. *)
let encrypt ~key:(key, _) ~iv src =
  bounds_check ~iv src ;
  let msg = Cstruct.create_unsafe (Cstruct.length src) in
  let dst = msg.buffer in
  let rec loop iv iv_i src_i dst_i = function
    | 0 -> ()
    | b ->
      Native.xor_into3 iv iv_i src.buffer src_i dst dst_i block ;
      Core.encrypt ~key ~blocks:1 dst dst_i dst dst_i ;
      (* Chain on the ciphertext just written at [dst_i]; passing
         [src.buffer src_i] here would chain on the plaintext instead. *)
      loop dst dst_i (src_i + block) (dst_i + block) (b - 1)
  in
  loop iv.buffer iv.off src.off msg.off (msg.len / block) ; msg

let decrypt ~key:(_, key) ~iv src =
bounds_check ~iv src ;
Expand Down
1 change: 1 addition & 0 deletions src/native.ml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ end
(* XXX TODO
* Unsolved: bounds-checked XORs are slowing things down considerably... *)
external xor_into : buffer -> off -> buffer -> off -> size -> unit = "mc_xor_into" [@@noalloc]
(* [xor_into3 src1 off1 src2 off2 dst off3 n] stores, for each of [n] bytes,
   the XOR of the byte at [src1]+[off1] and the byte at [src2]+[off2] into
   [dst]+[off3]. Two stub names are listed because an external taking more
   than five arguments needs a dedicated bytecode entry point (first name)
   next to the native one (second name).
   NOTE(review): the C stubs show no range checks, so callers presumably
   have to validate offsets and length themselves -- confirm. *)
external xor_into3 : buffer -> off -> buffer -> off -> buffer -> off -> size -> unit = "mc_xor_into3_bytecode" "mc_xor_into3" [@@noalloc]

external count8be : bytes -> buffer -> off -> blocks:size -> unit = "mc_count_8_be" [@@noalloc]
external count16be : bytes -> buffer -> off -> blocks:size -> unit = "mc_count_16_be" [@@noalloc]
Expand Down
3 changes: 3 additions & 0 deletions src/native/mirage_crypto.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ mc_ghash_generic (value m, value hash, value src, value off, value len);
CAMLprim value
mc_xor_into_generic (value b1, value off1, value b2, value off2, value n);

/* Three-operand XOR stub implemented in misc.c: XORs n bytes taken from
 * (b1, off1) and (b2, off2) and stores them at (b3, off3); returns unit.
 * misc_sse.c passes a call to this function as one of the two alternatives
 * handed to _mc_switch_accel. */
CAMLprim value
mc_xor_into3_generic (value b1, value off1, value b2, value off2, value b3, value off3, value n);

CAMLprim value
mc_count_16_be_4_generic (value ctr, value dst, value off, value blocks);

Expand Down
27 changes: 27 additions & 0 deletions src/native/misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,21 @@ static inline void xor_into (uint8_t *src, uint8_t *dst, size_t n) {
for (; n --; ++ src, ++ dst) *dst = *src ^ *dst;
}

/* Writes dst[i] = src1[i] ^ src2[i] for every i in [0, n).
 *
 * The buffers may have any alignment: each wide load and each wide store
 * goes through memcpy (see issue #70 #81 for alignment considerations), so
 * no pointer is dereferenced at a type wider than one byte. */
static inline void xor_into3 (uint8_t *src1, uint8_t *src2, uint8_t *dst, size_t n) {
#ifdef ARCH_64BIT
  for (; n >= 8; n -= 8, src1 += 8, src2 += 8, dst += 8) {
    uint64_t a, b;
    memcpy (&a, src1, 8);
    memcpy (&b, src2, 8);
    a ^= b;
    memcpy (dst, &a, 8);
  }
#endif

  /* All three cursors have to advance together: if src2 stayed behind, the
   * remaining bytes of src1 would be combined with stale bytes of src2. */
  for (; n >= 4; n -= 4, src1 += 4, src2 += 4, dst += 4) {
    uint32_t a, b;
    memcpy (&a, src1, 4);
    memcpy (&b, src2, 4);
    a ^= b;
    memcpy (dst, &a, 4);
  }

  /* Byte-wise tail for the last 0..3 bytes. */
  for (; n --; ++ src1, ++ src2, ++ dst) *dst = *src1 ^ *src2;
}

static inline void _mc_count_8_be (uint64_t *init, uint64_t *dst, size_t blocks) {
uint64_t qw = be64_to_cpu (*init);
while (blocks --) *(dst ++) = cpu_to_be64 (qw ++);
Expand Down Expand Up @@ -59,6 +74,18 @@ mc_xor_into_generic (value b1, value off1, value b2, value off2, value n) {
return Val_unit;
}

/* OCaml stub for the portable three-operand XOR: resolves each
 * (buffer, offset) pair to a byte pointer and XORs n bytes of the first two
 * into the third. Always returns unit. */
CAMLprim value
mc_xor_into3_generic (value b1, value off1, value b2, value off2, value b3, value off3, value n) {
  uint8_t *lhs = _ba_uint8_off (b1, off1);
  uint8_t *rhs = _ba_uint8_off (b2, off2);
  uint8_t *out = _ba_uint8_off (b3, off3);

  xor_into3 (lhs, rhs, out, Int_val (n));
  return Val_unit;
}

/* Array-style entry point: unpacks the seven arguments from argv and
 * forwards them, in order, to mc_xor_into3_generic. argn is not consulted. */
CAMLprim value
mc_xor_into3_generic_bytecode (value *argv, int argn) {
  value b1 = argv[0], off1 = argv[1];
  value b2 = argv[2], off2 = argv[3];
  value b3 = argv[4], off3 = argv[5];
  value n  = argv[6];

  (void) argn;
  return mc_xor_into3_generic (b1, off1, b2, off2, b3, off3, n);
}

#define __export_counter(name, f) \
CAMLprim value name (value ctr, value dst, value off, value blocks) { \
f ( (uint64_t*) Bp_val (ctr), \
Expand Down
37 changes: 37 additions & 0 deletions src/native/misc_sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,29 @@ static inline void xor_into (uint8_t *src, uint8_t *dst, size_t n) {
for (; n --; ++ src, ++ dst) *dst = *src ^ *dst;
}

/* Writes dst[i] = src1[i] ^ src2[i] for every i in [0, n), using 16-byte
 * SSE operations first (64-bit builds only), then 8-, 4- and 1-byte steps.
 *
 * The buffers may have any alignment (see issue #70 #81): scalar loads and
 * stores go through memcpy, and the vector store uses the unaligned
 * _mm_storeu_si128, so dst is never dereferenced through a casted
 * uint64_t* / uint32_t*. */
static inline void xor_into3 (uint8_t *src1, uint8_t *src2, uint8_t *dst, size_t n) {
#ifdef ARCH_64BIT
  for (; n >= 16; n -= 16, src1 += 16, src2 += 16, dst += 16) {
    __m128i a, b;
    memcpy (&a, src1, 16);
    memcpy (&b, src2, 16);
    _mm_storeu_si128 ((__m128i*) dst, _mm_xor_si128 (a, b));
  }

  for (; n >= 8; n -= 8, src1 += 8, src2 += 8, dst += 8) {
    uint64_t a, b;
    memcpy (&a, src1, 8);
    memcpy (&b, src2, 8);
    a ^= b;
    memcpy (dst, &a, 8);
  }
#endif

  for (; n >= 4; n -= 4, src1 += 4, src2 += 4, dst += 4) {
    uint32_t a, b;
    memcpy (&a, src1, 4);
    memcpy (&b, src2, 4);
    a ^= b;
    memcpy (dst, &a, 4);
  }

  /* Byte-wise tail for the last 0..3 bytes. */
  for (; n --; ++ src1, ++ src2, ++ dst) *dst = *src1 ^ *src2;
}

/* The GCM counter. Counts on the last 32 bits, ignoring carry. */
static inline void _mc_count_16_be_4 (uint64_t *init, uint64_t *dst, size_t blocks) {

Expand All @@ -47,6 +70,20 @@ mc_xor_into (value b1, value off1, value b2, value off2, value n) {
return Val_unit;
}

/* OCaml stub for the three-operand XOR (native-code entry point named in
 * the `xor_into3` external). It hands two alternatives to _mc_switch_accel:
 * a call to the portable mc_xor_into3_generic, and a direct call to this
 * file's xor_into3 on the three (buffer, offset) pairs with length n.
 * NOTE(review): presumably the macro selects the second alternative when
 * SSSE3 is usable and the first otherwise -- confirm against the macro's
 * definition. Always returns unit. */
CAMLprim value
mc_xor_into3 (value b1, value off1, value b2, value off2, value b3, value off3, value n) {
_mc_switch_accel(ssse3,
mc_xor_into3_generic(b1, off1, b2, off2, b3, off3, n),
xor_into3 (_ba_uint8_off (b1, off1), _ba_uint8_off (b2, off2), _ba_uint8_off (b3, off3), Int_val (n)))
return Val_unit;
}

/* Array-style entry point (the first stub name in the `xor_into3`
 * external): unpacks the seven arguments from argv and forwards them, in
 * order, to mc_xor_into3. argn is not consulted. */
CAMLprim value
mc_xor_into3_bytecode (value *argv, int argn) {
  value b1 = argv[0], off1 = argv[1];
  value b2 = argv[2], off2 = argv[3];
  value b3 = argv[4], off3 = argv[5];
  value n  = argv[6];

  (void) argn;
  return mc_xor_into3 (b1, off1, b2, off2, b3, off3, n);
}

#define __export_counter(name, f) \
CAMLprim value name (value ctr, value dst, value off, value blocks) { \
_mc_switch_accel(ssse3, \
Expand Down

0 comments on commit c03de64

Please sign in to comment.