From 1c317e2877ce2b07011bb90fc8b52c20ffa50702 Mon Sep 17 00:00:00 2001
From: Nirjhar Roy
Date: Fri, 15 Mar 2024 14:33:49 +0000
Subject: [PATCH] Added a stress test for memory allocation (snmalloc)

---
 .github/workflows/build.yml         |   3 +
 examples/mem-alloc-test/.gitignore  |   1 +
 examples/mem-alloc-test/Cargo.toml  |  16 ++
 examples/mem-alloc-test/src/main.rs | 293 ++++++++++++++++++++++++++++
 4 files changed, 313 insertions(+)
 create mode 100644 examples/mem-alloc-test/.gitignore
 create mode 100644 examples/mem-alloc-test/Cargo.toml
 create mode 100644 examples/mem-alloc-test/src/main.rs

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3f031bc4..2af1c531 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -99,3 +99,6 @@ jobs:
 
     - name: Generate API docs
       run: ./doc/generate-api-docs.sh
+    - name: Run memory allocator stress test
+      run: cd ./examples/mem-alloc-test && cargo run
+
diff --git a/examples/mem-alloc-test/.gitignore b/examples/mem-alloc-test/.gitignore
new file mode 100644
index 00000000..ea8c4bf7
--- /dev/null
+++ b/examples/mem-alloc-test/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/examples/mem-alloc-test/Cargo.toml b/examples/mem-alloc-test/Cargo.toml
new file mode 100644
index 00000000..e6a2aa04
--- /dev/null
+++ b/examples/mem-alloc-test/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "mem-alloc-test"
+version = "1.0.0"
+edition = "2021"
+authors = ["Fortanix, Inc."]
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+rand = "0.8.4"
+num_cpus = "1.14.0"
+
+[package.metadata.fortanix-sgx]
+# heap size (in bytes), the default heap size is 0x2000000.
+heap-size=0x4000000
+debug=false
\ No newline at end of file
diff --git a/examples/mem-alloc-test/src/main.rs b/examples/mem-alloc-test/src/main.rs
new file mode 100644
index 00000000..478ed63d
--- /dev/null
+++ b/examples/mem-alloc-test/src/main.rs
@@ -0,0 +1,293 @@
+use rand::Rng;
+use std::sync::{Arc, Condvar, Mutex};
+use std::thread;
+use std::time::Instant;
+
+#[derive(Debug, PartialEq)]
+enum MemSize {
+    Large,
+    Medium,
+    Small,
+}
+
+/* These three fields store the average latency of allocation + access +
+ * deallocation per thread type, i.e., if there are 2 small threads, 3 medium
+ * threads and 4 large threads, each of which runs 10, 20 and 30 iterations
+ * respectively, then avg_duration_small_thread stores the average latency of
+ * 2*10=20 iterations, avg_duration_medium_thread the average of 3*20=60
+ * iterations, and avg_duration_large_thread the average of 4*30=120
+ * iterations.
+ */
+struct Counters {
+    avg_duration_small_thread: f64,
+    avg_duration_medium_thread: f64,
+    avg_duration_large_thread: f64,
+    global_average: f64,
+}
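+
+/* A concrete instance of the averaging described above (hypothetical
+ * numbers): with LIMIT_SMALL_THREAD = 2 and two small threads whose
+ * per-thread average latencies come out to 100ns and 200ns, each worker adds
+ * its own average into avg_duration_small_thread, and spawn_threads later
+ * divides the sum by the number of small threads, giving
+ * (100 + 200) / 2 = 150ns.
+ */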
+
+const PAGE_SIZE: usize = 4096;
+const TO_KB: usize = 1024;
+const TO_MB: usize = TO_KB * 1024;
+const TO_GB: usize = TO_MB * 1024;
+
+/* Set of configurable parameters. These will be adjusted as necessary while
+ * recording the performance numbers.
+ */
+const NUM_CPUS: usize = 2;
+
+const LIMIT_SMALL_THREAD: i32 = 2;
+const LIMIT_MEDIUM_THREAD: i32 = 2;
+const LIMIT_LARGE_THREAD: i32 = 2;
+
+const SCAN_INTERVAL_SMALL_THREAD: usize = 1 * TO_KB;
+const SCAN_INTERVAL_MEDIUM_THREAD: usize = 1 * TO_MB;
+const SCAN_INTERVAL_LARGE_THREAD: usize = 1 * TO_MB;
+
+const SMALL_THREAD_MEM_START: usize = 1;
+const SMALL_THREAD_MEM_END: usize = 512;
+const MEDIUM_THREAD_MEM_START: usize = 1;
+const MEDIUM_THREAD_MEM_END: usize = 2;
+const LARGE_THREAD_MEM_START: usize = 1;
+const LARGE_THREAD_MEM_END: usize = 2;
+
+const MAX_INDEX_CHECKS_PER_BUFFER: usize = 32;
+
+fn calculate_and_print_stat(
+    shared_mutex_clone: Arc<Mutex<Counters>>,
+    tid: i32,
+    memsize: &MemSize,
+    avg_thread_latency: f64,
+) {
+    /* TODO: Record more detailed statistics than just the average, such as
+     * the standard deviation, minimum and maximum time, and p95/p99/p99.9
+     * latency. (See the percentile sketch at the bottom of this file.)
+     */
+    let _ = tid; // kept for the debug print below
+    // println!("thread {} took {}", tid, avg_thread_latency);
+    let mut data = shared_mutex_clone.lock().unwrap();
+    /* Note that this is an intermediate value: once we have the sum of the
+     * individual averages of all the threads, we divide it by the number of
+     * threads of the corresponding memsize type.
+     */
+    match memsize {
+        MemSize::Large => {
+            data.avg_duration_large_thread += avg_thread_latency;
+        }
+        MemSize::Medium => {
+            data.avg_duration_medium_thread += avg_thread_latency;
+        }
+        MemSize::Small => {
+            data.avg_duration_small_thread += avg_thread_latency;
+        }
+    }
+}
+
+fn get_random_num(start: usize, end: usize) -> usize {
+    let mut rng = rand::thread_rng();
+    rng.gen_range(start..=end)
+}
+
+/* Block until the main thread flips the shared flag, so that all worker
+ * threads start their timed loops together.
+ */
+fn wait_per_thread(pair_clone: Arc<(Mutex<bool>, Condvar)>) {
+    let (lock, cvar) = &*pair_clone;
+    let mut started = lock.lock().unwrap();
+    while !*started {
+        started = cvar.wait(started).unwrap();
+    }
+}
+
+fn wakeup_all_child_threads(pair_clone: Arc<(Mutex<bool>, Condvar)>) {
+    let (lock, cvar) = &*pair_clone;
+    let mut started = lock.lock().unwrap();
+    *started = true;
+    cvar.notify_all();
+}
+
+fn traverse_buffer(buf: &mut Vec<u8>, _scan_interval: usize) {
+    /* Check a random number of random indices, at most
+     * MAX_INDEX_CHECKS_PER_BUFFER of them. Note that depending on the random
+     * number generator we may check the same index multiple times. We could
+     * have checked every index, but that would be too time consuming.
+     */
+    let num_indices_checks = get_random_num(1, MAX_INDEX_CHECKS_PER_BUFFER);
+    for _ in 0..num_indices_checks {
+        let index = get_random_num(0, buf.len() - 1);
+        buf[index] = buf[index].wrapping_mul(2);
+    }
+}
+
+fn worker_thread(
+    tid: i32,
+    shared_mutex_clone: Arc<Mutex<Counters>>,
+    memsize: MemSize,
+    pair_clone: Arc<(Mutex<bool>, Condvar)>,
+) {
+    /* Wait for all the threads to be created, so that they all start together */
+    wait_per_thread(pair_clone);
+
+    let mut count = 0;
+    let mut tot_time_ns: u128 = 0;
+
+    /* Once the thread's allocation and deallocation operations begin, we
+     * shouldn't take any lock: the allocator we are trying to test is a
+     * multithreaded allocator, and we should allow as many threads as
+     * possible to contend for it.
+     */
+    loop {
+        /* Pick a random size depending on the memory type */
+        let (scan_interval, size, limit) = match memsize {
+            MemSize::Large => {
+                /* buffer size will be from 1GB to 2GB */
+                (
+                    SCAN_INTERVAL_LARGE_THREAD,
+                    TO_GB * get_random_num(LARGE_THREAD_MEM_START, LARGE_THREAD_MEM_END),
+                    LIMIT_LARGE_THREAD,
+                )
+            }
+            MemSize::Medium => {
+                /* buffer size will be from 1MB to 2MB */
+                (
+                    SCAN_INTERVAL_MEDIUM_THREAD,
+                    TO_MB * get_random_num(MEDIUM_THREAD_MEM_START, MEDIUM_THREAD_MEM_END),
+                    LIMIT_MEDIUM_THREAD,
+                )
+            }
+            MemSize::Small => {
+                /* buffer size will be from 1KB to 512KB */
+                (
+                    SCAN_INTERVAL_SMALL_THREAD,
+                    TO_KB * get_random_num(SMALL_THREAD_MEM_START, SMALL_THREAD_MEM_END),
+                    LIMIT_SMALL_THREAD,
+                )
+            }
+        };
+
+        let start_time = Instant::now();
+
+        /* Allocate a zero-initialized buffer of the chosen size */
+        let mut large_vector: Vec<u8> = vec![0; size];
+
+        /* Access the buffer so that its pages actually get allocated */
+        traverse_buffer(&mut large_vector, scan_interval);
+
+        /* Deallocate */
+        drop(large_vector);
+
+        /* Record the metrics */
+        tot_time_ns += start_time.elapsed().as_nanos();
+
+        count += 1;
+        if count >= limit {
+            break;
+        }
+    }
+
+    /* At this point the thread's allocation and deallocation operations are
+     * complete, so it is okay to take a lock.
+     */
+    let avg_thread_latency = tot_time_ns as f64 / count as f64;
+    calculate_and_print_stat(shared_mutex_clone, tid, &memsize, avg_thread_latency);
+}
+
+fn spawn_threads(thread_count: i32) {
+    let mut handles = vec![];
+    let shared_variable = Arc::new(Mutex::new(Counters {
+        avg_duration_large_thread: 0.0,
+        avg_duration_medium_thread: 0.0,
+        avg_duration_small_thread: 0.0,
+        global_average: 0.0,
+    }));
+
+    let pair = Arc::new((Mutex::new(false), Condvar::new()));
+    let (mut num_small_threads, mut num_medium_threads, mut num_large_threads) = (0, 0, 0);
+    for i in 0..thread_count {
+        let shared_mutex_clone = Arc::clone(&shared_variable);
+        let pair_clone = Arc::clone(&pair);
+
+        /* Round-robin over the three thread types. Note: this must be i % 3,
+         * not i % 2, otherwise the Large arm is unreachable and no large
+         * threads are ever spawned.
+         */
+        let memtype = match i % 3 {
+            0 => {
+                num_small_threads += 1;
+                MemSize::Small
+            }
+            1 => {
+                num_medium_threads += 1;
+                MemSize::Medium
+            }
+            2 => {
+                num_large_threads += 1;
+                MemSize::Large
+            }
+            _ => unreachable!(),
+        };
+
+        /* Spawn a thread that waits until the start condition is signalled */
+        let handle = thread::spawn(move || {
+            worker_thread(i, shared_mutex_clone, memtype, pair_clone);
+        });
+        handles.push(handle);
+    }
+
+    /* Start all the threads */
+    wakeup_all_child_threads(pair);
+
+    /* Wait for all threads to finish */
+    for handle in handles {
+        handle.join().unwrap();
+    }
+
+    /* Calculate the final means. Guard against division by zero: with a small
+     * thread_count not every type is necessarily spawned.
+     */
+    let mut data = shared_variable.lock().unwrap();
+    if num_large_threads != 0 {
+        data.avg_duration_large_thread /= num_large_threads as f64;
+    }
+    if num_medium_threads != 0 {
+        data.avg_duration_medium_thread /= num_medium_threads as f64;
+    }
+    if num_small_threads != 0 {
+        data.avg_duration_small_thread /= num_small_threads as f64;
+    }
+
+    /* The global average weights each per-type average by its total number of
+     * iterations (threads of that type times that type's iteration limit).
+     */
+    data.global_average = (data.avg_duration_small_thread
+        * num_small_threads as f64
+        * LIMIT_SMALL_THREAD as f64
+        + data.avg_duration_medium_thread * num_medium_threads as f64 * LIMIT_MEDIUM_THREAD as f64
+        + data.avg_duration_large_thread * num_large_threads as f64 * LIMIT_LARGE_THREAD as f64)
+        / (num_large_threads * LIMIT_LARGE_THREAD
+            + num_medium_threads * LIMIT_MEDIUM_THREAD
+            + num_small_threads * LIMIT_SMALL_THREAD) as f64;
+    println!(
+        "{},{},{},{},{}",
+        thread_count,
+        data.avg_duration_small_thread,
+        data.avg_duration_medium_thread,
+        data.avg_duration_large_thread,
+        data.global_average
+    );
+}
+
+fn get_num_processors() -> usize {
+    // num_cpus::get()
+    /* TODO: Currently this uses a hardcoded value. We need to add a special
+     * service to make it work properly.
+     */
+    NUM_CPUS
+}
+
+/* If there are n processors available, we record the numbers with 2n threads,
+ * then n threads, then n/2, and so on.
+ */
+fn start_tests() {
+    println!("NUM_THREADS,LATENCY_SMALL_THREADS,LATENCY_MEDIUM_THREADS,LATENCY_LARGE_THREADS,GLOBAL_AVERAGE");
+    let num_processors = get_num_processors();
+    let mut num_threads = num_processors * 2;
+    while num_threads >= 3 {
+        spawn_threads(num_threads as i32);
+        num_threads >>= 1;
+    }
+}
+
+fn main() {
+    start_tests();
+}
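+
+/* A minimal sketch of the percentile computation mentioned in the TODO in
+ * calculate_and_print_stat. It assumes the test is extended so that each
+ * worker keeps its raw per-iteration latencies (in nanoseconds) rather than
+ * only a running sum; it is not wired into the test yet. Usage would be e.g.
+ * percentile_ns(&mut latencies, 99.0) for the p99 latency.
+ */
+#[allow(dead_code)]
+fn percentile_ns(samples: &mut [u128], p: f64) -> u128 {
+    assert!(!samples.is_empty() && (0.0..=100.0).contains(&p));
+    samples.sort_unstable();
+    /* Nearest-rank method: the smallest sample such that at least p percent
+     * of all samples are less than or equal to it.
+     */
+    let rank = ((p / 100.0) * samples.len() as f64).ceil() as usize;
+    samples[rank.saturating_sub(1)]
+}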