diff --git a/CMakeLists.txt b/CMakeLists.txt
index c260e98..6b7dd8f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,19 +1,79 @@
-cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+ # Modifications Copyright (C) 2023 Intel Corporation
+ #
+ # This Program is subject to the terms of The Unlicense.​
+ # If a copy of the license was not distributed with this file, ​
+ # you can obtain one at https://spdx.org/licenses/Unlicense.html​
+ #​
+ #
+ # SPDX-License-Identifier: Unlicense
+ #
 
-project(SimpleConcurrentGPUHashTable LANGUAGES CXX CUDA)
+cmake_minimum_required(VERSION 3.10)
 
-# put predefined cmake projects in their own solution folder
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+project(hashtable_sycl LANGUAGES CXX)
 
-string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_35,code=sm_35")
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
-set(SOURCE_FILES src/main.cpp src/test.cpp src/linearprobing.h src/linearprobing.cu)
+option(GPU_AOT                 "Build AOT for Intel GPU"      OFF)
+option(USE_NVIDIA_BACKEND      "Build for NVIDIA backend"     OFF)
+option(USE_AMDHIP_BACKEND      "Build for AMD HIP backend"    OFF)
+option(USE_SM                  "Build for specific SM"        OFF)
+
+set(INTEL_GPU_CXX_FLAGS  " -O2 -fsycl -Wall -Wextra -Wno-unused-parameter ")
+set(NVIDIA_GPU_CXX_FLAGS " -O3 -fsycl -Wall -Wextra -Wno-unused-parameter ")
+set(AMD_GPU_CXX_FLAGS    " -O3 -fsycl -Wall -Wextra -Wno-unused-parameter ")
+
+set(USE_DEFAULT_FLAGS ON)
+if("${CMAKE_CXX_FLAGS}" STREQUAL "")
+    message(STATUS "Using DEFAULT compilation flags")
+else()
+    message(STATUS "Overriding DEFAULT compilation flags")
+    set(USE_DEFAULT_FLAGS OFF)
+endif()
+
+# AOT compilation
+if(GPU_AOT)
+    message(STATUS "Enabling INTEL backend")
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${INTEL_GPU_CXX_FLAGS}") # Default flags for Intel backend
+    endif()
+    if( (${GPU_AOT} STREQUAL "pvc") OR (${GPU_AOT} STREQUAL "PVC") )
+        message(STATUS "Enabling Intel GPU AOT compilation for ${GPU_AOT}")
+        string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=spir64_gen -Xs \"-device 0x0bd5 -revision_id 0x2f\" ")
+    else()
+        message(STATUS "Using custom AOT compilation flag ${GPU_AOT}")
+        string(APPEND CMAKE_CXX_FLAGS " ${GPU_AOT} ")
+    endif()
+elseif(USE_NVIDIA_BACKEND)
+    message(STATUS "Enabling NVIDIA backend")
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${NVIDIA_GPU_CXX_FLAGS}") # Default flags for NV backend
+    endif()
+    if(USE_SM)
+        message("-- Building for SM_${USE_SM} compatibility")
+        string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_${USE_SM} ")
+    else()
+        message("-- Building for SM_80 compatibility (DEFAULT)")
+        string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 ")
+    endif()
+elseif(USE_AMDHIP_BACKEND)
+    message(STATUS "Enabling AMD HIP backend for ${USE_AMDHIP_BACKEND} AMD architecture")
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${AMD_GPU_CXX_FLAGS}") # Default flags for AMD backend (gfx90a for MI250)
+    endif()
+    string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${USE_AMDHIP_BACKEND} ")
+endif()
+
+set(SOURCES
+    ${CMAKE_SOURCE_DIR}/src/main.cpp
+    ${CMAKE_SOURCE_DIR}/src/test.cpp
+    ${CMAKE_SOURCE_DIR}/src/linearprobing.cpp
+)
 
 include_directories(${CMAKE_SOURCE_DIR}/src)
 
-add_executable(test ${SOURCE_FILES})
+add_executable(${PROJECT_NAME} ${SOURCES})
 
-# visual studio project should mimic directory structure
-# this isn't working for me; I think because
-# https://developercommunity.visualstudio.com/content/problem/777578/source-grouptree-no-longer-works.html
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES})
+target_link_libraries(${PROJECT_NAME} sycl OpenCL stdc++fs)
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index fdddb29..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,24 +0,0 @@
-This is free and unencumbered software released into the public domain.
-
-Anyone is free to copy, modify, publish, use, compile, sell, or
-distribute this software, either in source code form or as a compiled
-binary, for any purpose, commercial or non-commercial, and by any
-means.
-
-In jurisdictions that recognize copyright laws, the author or authors
-of this software dedicate any and all copyright interest in the
-software to the public domain. We make this dedication for the benefit
-of the public at large and to the detriment of our heirs and
-successors. We intend this dedication to be an overt act of
-relinquishment in perpetuity of all present and future rights to this
-software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to <https://unlicense.org>
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..2977253
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,8 @@
+Modifications Copyright (C) 2023 Intel Corporation
+
+This Program is subject to the terms of The Unlicense.​
+If a copy of the license was not distributed with this file, ​
+you can obtain one at https://spdx.org/licenses/Unlicense.html​
+
+
+SPDX-License-Identifier: Unlicense
diff --git a/README.md b/README.md
index ce2a8c1..46028ed 100644
--- a/README.md
+++ b/README.md
@@ -1,89 +1,64 @@
-![](screenshot.png)
+# hashtable
 
-# About
+hashtable implements a simple hash table in GPU (original CUDA source code is from [here](https://github.com/nosferalatu/SimpleGPUHashTable)).
 
-This project shows how to implement a simple GPU hash table. Thanks to the high bandwidth and massive parallelism of
-GPU's, the result is a high performance hash table capable of hundreds of millions of operations per second.
 
-The code achieves an average insertion rate of 326 million key/second on my development laptop with an NVIDIA GTX 1060,
-measured by inserting 64 million elements.
+## SYCL version
 
-[Read my blog post about the code here](http://nosferalatu.com/SimpleGPUHashTable.html) for more information about the
-implementation.
+- The CUDA code was migrated using Intel DPCT, and then the resulting code was modified to remove the DPCT headers.
+- Timing code was later added for performance measurement purpose.
+- The same SYCL code runs on Intel GPUs & CPUs as well as NVIDIA (tested on A100 and H100) and AMD (tested on MI100 and MI250) GPUs.
 
-The code implements a lock free hash table using linear probing. Concurrent inserts, deletes, and lookups are supported by
-this hash table. The hash table works on 32 bit keys and 32 bit values (although 0xffffffff is reserved for both keys
-and values). The load factor of the table is set to 50% in the code, and the table size must be a power of two.
+# Build Instructions
+- icpx compiler mentioned below is included in oneAPI Base Toolkit available [here](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html).
+- clang++ compiler mentioned below is available [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md).
+## To build for SYCL
 
-Atomic operations are used to insert key/value pairs into the hash table on multiple GPU threads. It uses CUDA for ease
-of development, but could easily be ported to HLSL or GLSL. 64 bit keys and/or values could be supported using 64 bit
-atomics.
-
-Resizing the hash table is not implemented (it's a *simple* hash table!) although this can be achieved by inserting the
-contents of a table into another, larger table.
-
-The code was kept simple for readability. There are many optimizations that can be done, but they muddy the waters. I
-wanted to illustrate the basic design of the lock free hash table and how it can be implemented on a GPU.
-
-# How To Use
-
-If you build and run the executable, it enters an infinite loop of inserting and deleting random numbers into the
-GPU hash table and verifying that the results are correct. The seed used to generate random numbers changes every time
-you run the executable, but you can set the seed to a specific value in code if you'd like to reproduce results across
-runs.
-
-This is how you insert a vector of `KeyValue` pairs into the hash table and then retrieve all the `KeyValue` pairs back:
-
-```cpp
-    std::vector<KeyValue> things_to_insert = { {0,1}, {1,2}, {2,3}, {3,4} };
-
-    KeyValue* pHashTable = create_hashtable();
-    insert_hashtable(pHashTable, things_to_insert.data(), (uint32_t)things_to_insert.size());
-    std::vector<KeyValue> result = iterate_hashtable(pHashTable);
-    destroy_hashtable(pHashTable);
-```
-
-After that runs, the vectors `things_to_insert` and `result` should be the same, but possibly in a different order.
-
-# Prerequisites
-
-* CMake
-* CUDA
-
-This has been tested on Windows with Visual Studio Community 2019 on a machine with an NVIDIA GTX 1060.
-An easy way to get CMake is to open a Visual Studio command prompt (in Windows, run "x64 Native Tools Command Prompt for
-VS 2019"; that will put CMake in your path).
-
-This should work on other CUDA-supported platforms, but I have not tested this.
-
-# Cloning
+For Intel GPU -  
+First, source icpx compiler. Then,
 
 ```
-git clone https://github.com/nosferalatu/SimpleConcurrentGPUHashTable.git SimpleConcurrentGPUHashTable
+mkdir build
+cd build
+CXX=icpx cmake -DGPU_AOT=pvc ..
+make -sj
 ```
+Note:
+- To enable AOT compilation, please use the flag `-DGPU_AOT=pvc` for PVC.
 
-# Generating Build Files
-
-Run the following commands to generate .sln and .vcxproj's that can be opened in Visual Studio:
-
+For AMD GPU -  
+First source clang++ compiler. Then,
 ```
-cd ConcurrentHashTables
-md build
+mkdir build
 cd build
-cmake ..
+CXX=clang++ cmake -DUSE_AMDHIP_BACKEND=gfx90a ..
+make -sj
 ```
+Note:
+- We use the flag `-DUSE_AMDHIP_BACKEND=gfx90a` for MI250. Use the correct value for your GPU.
 
-You can now open `SimpleConcurrentGPUHashTable.sln` in Visual Studio.
-
-If CMake fails to find CUDA above, then run a CMake generator for 64 bit builds:
+For NVIDIA GPU -  
+First source clang++ compiler. Then,
 ```
-cmake -G "Visual Studio 16 2019 Win64" ..
+mkdir build
+cd build
+CXX=clang++ cmake -DUSE_NVIDIA_BACKEND=YES -DUSE_SM=80 ..
+make -sj
 ```
+Note:
+- We use the flag `-DUSE_SM=80` for A100 or `-DUSE_SM=90` for H100.
 
-# Building
+# Run instructions
 
-You can build within Visual Studio, or from the command line with:
+After building, to run the workload, cd into the build folder. Then
 
 ```
-cmake --build . --config Release
+./hashtable_sycl
+```
+By default a verification is done and that takes some time. To skip verification:
 ```
+./hashtable_sycl --no-verify
+```
+# Output
+
+Output gives number of keys per second.
diff --git a/screenshot.png b/screenshot.png
deleted file mode 100644
index b8c09fd..0000000
Binary files a/screenshot.png and /dev/null differ
diff --git a/src/acas.h b/src/acas.h
new file mode 100644
index 0000000..1a755bd
--- /dev/null
+++ b/src/acas.h
@@ -0,0 +1,66 @@
+/* Modifications Copyright (C) 2023 Intel Corporation
+ *
+ * This Program is subject to the terms of The Unlicense.​
+ * If a copy of the license was not distributed with this file, ​
+ * you can obtain one at https://spdx.org/licenses/Unlicense.html​
+ *​
+ *
+ * SPDX-License-Identifier: Unlicense
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Following code is copied from atomic.hpp of dpct
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/sycl.hpp>
+
+namespace acas {
+
+/// Atomically compare the value at \p addr to the value expected and exchange
+/// with the value desired if the value at \p addr is equal to the value expected.
+/// Returns the value at the \p addr before the call.
+/// \param [in, out] addr Multi_ptr.
+/// \param expected The value to compare against the value at \p addr.
+/// \param desired The value to assign to \p addr if the value at \p addr is expected.
+/// \param success The memory ordering used when comparison succeeds.
+/// \param fail The memory ordering used when comparison fails.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace = sycl::access::address_space::global_space>
+T atomic_compare_exchange_strong(
+    sycl::multi_ptr<T, sycl::access::address_space::global_space> addr,
+    T expected,
+    T desired,
+    sycl::memory_order success = sycl::memory_order::relaxed,
+    sycl::memory_order fail    = sycl::memory_order::relaxed
+) {
+    // sycl::atomic_ref<T, addressSpace> obj(addr);
+    sycl::atomic_ref<T, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> obj(addr[0]);
+    obj.compare_exchange_strong(expected, desired, success, fail);
+    return expected;
+}
+
+/// Atomically compare the value at \p addr to the value expected and exchange
+/// with the value desired if the value at \p addr is equal to the value expected.
+/// Returns the value at the \p addr before the call.
+/// \param [in] addr The pointer to the data.
+/// \param expected The value to compare against the value at \p addr.
+/// \param desired The value to assign to \p addr if the value at \p addr is expected.
+/// \param success The memory ordering used when comparison succeeds.
+/// \param fail The memory ordering used when comparison fails.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace = sycl::access::address_space::global_space>
+T atomic_compare_exchange_strong(
+    T* addr,
+    T expected,
+    T desired,
+    sycl::memory_order success = sycl::memory_order::relaxed,
+    sycl::memory_order fail    = sycl::memory_order::relaxed
+) {
+    return atomic_compare_exchange_strong(sycl::multi_ptr<T, addressSpace>(addr), expected, desired, success, fail);
+}
+
+} // namespace acas
diff --git a/src/linearprobing.cpp b/src/linearprobing.cpp
new file mode 100644
index 0000000..0642443
--- /dev/null
+++ b/src/linearprobing.cpp
@@ -0,0 +1,313 @@
+/* Modifications Copyright (C) 2023 Intel Corporation
+ *
+ * This Program is subject to the terms of The Unlicense.​
+ * If a copy of the license was not distributed with this file, ​
+ * you can obtain one at https://spdx.org/licenses/Unlicense.html​
+ *​
+ *
+ * SPDX-License-Identifier: Unlicense
+ */
+
+#include "stdio.h"
+#include "stdint.h"
+#include "vector"
+
+#define CPP_MODULE "KERNEL"
+#include "linearprobing.h"
+
+#include <sycl/sycl.hpp>
+#include <chrono>
+#include "acas.h"
+
+// 32 bit Murmur3 hash
+uint32_t hash(uint32_t k)
+{
+    k ^= k >> 16;
+    k *= 0x85ebca6b;
+    k ^= k >> 13;
+    k *= 0xc2b2ae35;
+    k ^= k >> 16;
+    return k & (kHashTableCapacity - 1);
+}
+
+// Create a hash table. For linear probing, this is just an array of KeyValues
+KeyValue* create_hashtable(sycl::queue& qht)
+{
+    KeyValue* hashtable;
+
+    try {
+        // Allocate memory
+        hashtable = sycl::malloc_device<KeyValue>(kHashTableCapacity, qht);
+
+        // Initialize hash table to empty
+        static_assert(kEmpty == 0xFFFFFFFF, "memset expected kEmpty=0xFFFFFFFF");
+        qht.memset(hashtable, 0xFF, sizeof(KeyValue) * kHashTableCapacity);
+        qht.wait();
+    } catch (std::exception const& e) {
+        LOG_ERROR("Exception caught, \'" << e.what() << "\'");
+    } catch (...) {
+        LOG_ERROR("Unknown exception caught, bailing...");
+    }
+
+    return hashtable;
+}
+
+// Insert the key/values in kvs into the hashtable
+void gpu_hashtable_insert(
+    KeyValue* hashtable,
+    const KeyValue* kvs,
+    unsigned int numkvs,
+    sycl::nd_item<1> item)
+{
+    unsigned int tid = item.get_global_id(0);
+    if (tid < numkvs) {
+        uint32_t key   = kvs[tid].key;
+        uint32_t value = kvs[tid].value;
+        uint32_t slot  = hash(key);
+
+        while (true) {
+            uint32_t prev = acas::atomic_compare_exchange_strong(&hashtable[slot].key, kEmpty, key);
+            if (prev == kEmpty || prev == key) {
+                hashtable[slot].value = value;
+                return;
+            }
+
+            slot = (slot + 1) & (kHashTableCapacity - 1);
+        }
+    }
+}
+
+void insert_hashtable(
+    KeyValue* pHashTable, // hashtable
+    const KeyValue* kvs,  // starting position for this batch of key-value pairs
+    uint32_t num_kvs,     // number of key-value pairs in this batch
+    sycl::queue& qht)
+{
+    try {
+        // Copy this batch of key-value pairs to the device
+        KeyValue* device_kvs;
+        device_kvs = sycl::malloc_device<KeyValue>(num_kvs, qht);
+        auto e1 = qht.memcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs);
+
+        int threadblocksize = 256; // perf does not seem to vary w/ thread block size (for all kernels in hashtable)
+
+        // Create events for GPU timing
+        qht.parallel_for(
+            sycl::nd_range<1>(num_kvs, threadblocksize), std::move(e1),
+            [=](sycl::nd_item<1> item) {
+
+                gpu_hashtable_insert(
+                    pHashTable,
+                    device_kvs,
+                    (uint32_t)num_kvs,
+                    item);
+            }
+        );
+        qht.wait();
+
+        sycl::free(device_kvs, qht);
+    } catch (std::exception const& e) {
+        LOG_ERROR("Exception caught, \'" << e.what() << "\'");
+    } catch (...) {
+        LOG_ERROR("Unknown exception caught, bailing...");
+    }
+}
+
+// Lookup keys in the hashtable, and return the values
+void gpu_hashtable_lookup(
+    KeyValue* hashtable,
+    KeyValue* kvs,
+    unsigned int numkvs,
+    sycl::nd_item<1> item)
+{
+    unsigned int tid = item.get_global_id(0);
+    if (tid < numkvs) {
+        uint32_t key  = kvs[tid].key;
+        uint32_t slot = hash(key);
+
+        while (true) {
+            if (hashtable[slot].key == key) {
+                kvs[tid].value = hashtable[slot].value;
+                return;
+            }
+            if (hashtable[slot].key == kEmpty) {
+                kvs[tid].value = kEmpty;
+                return;
+            }
+            slot = (slot + 1) & (kHashTableCapacity - 1);
+        }
+    }
+}
+
+void lookup_hashtable(
+    KeyValue* pHashTable,
+    KeyValue* kvs,
+    uint32_t num_kvs,
+    sycl::queue& qht)
+{
+    try {
+        // Copy this batch of key-value pairs to the device
+        KeyValue* device_kvs;
+        device_kvs = sycl::malloc_device<KeyValue>(num_kvs, qht);
+        auto e1 = qht.memcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs);
+
+        int threadblocksize = 256;
+
+        qht.parallel_for(
+            sycl::nd_range<1>(num_kvs, threadblocksize), std::move(e1),
+            [=](sycl::nd_item<1> item) {
+
+                gpu_hashtable_lookup(
+                    pHashTable,
+                    device_kvs,
+                    (uint32_t)num_kvs,
+                    item);
+            }
+        );
+        qht.wait();
+
+        sycl::free(device_kvs, qht);
+    } catch (std::exception const& e) {
+        LOG_ERROR("Exception caught, \'" << e.what() << "\'");
+    } catch (...) {
+        LOG_ERROR("Unknown exception caught, bailing...");
+    }
+}
+
+// Delete each key in kvs from the hash table, if the key exists
+// A deleted key is left in the hash table, but its value is set to kEmpty
+// Deleted keys are not reused; once a key is assigned a slot, it never moves
+void gpu_hashtable_delete(
+    KeyValue* hashtable,
+    const KeyValue* kvs,
+    unsigned int numkvs,
+    sycl::nd_item<1> item)
+{
+    unsigned int tid = item.get_global_id(0);
+    if (tid < numkvs) {
+        uint32_t key  = kvs[tid].key;
+        uint32_t slot = hash(key);
+
+        while (true) {
+            if (hashtable[slot].key == key) {
+                hashtable[slot].value = kEmpty;
+                return;
+            }
+            if (hashtable[slot].key == kEmpty) {
+                return;
+            }
+            slot = (slot + 1) & (kHashTableCapacity - 1);
+        }
+    }
+}
+
+void delete_hashtable(
+    KeyValue* pHashTable,
+    const KeyValue* kvs,
+    uint32_t num_kvs,
+    sycl::queue& qht)
+{
+    try {
+        // Copy the keyvalues to the GPU
+        KeyValue* device_kvs;
+        device_kvs = sycl::malloc_device<KeyValue>(num_kvs, qht);
+        auto e1 = qht.memcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs);
+
+        int threadblocksize = 256;
+
+        qht.parallel_for(
+            sycl::nd_range<1>(num_kvs, threadblocksize), std::move(e1),
+            [=](sycl::nd_item<1> item) {
+
+                gpu_hashtable_delete(
+                    pHashTable,
+                    device_kvs,
+                    (uint32_t)num_kvs,
+                    item) ;
+            }
+        );
+        qht.wait();
+
+        sycl::free(device_kvs, qht);
+    } catch (std::exception const& e) {
+        LOG_ERROR("Exception caught, \'" << e.what() << "\'");
+    } catch (...) {
+        LOG_ERROR("Unknown exception caught, bailing...");
+    }
+}
+
+// Iterate over every item in the hashtable; return non-empty key/values
+void gpu_iterate_hashtable(
+    KeyValue* pHashTable,
+    KeyValue* kvs,
+    uint32_t* kvs_size,
+    sycl::nd_item<1> item)
+{
+    unsigned int tid = item.get_global_id(0);
+    if (tid < kHashTableCapacity) {
+        if (pHashTable[tid].key != kEmpty) {
+            uint32_t value = pHashTable[tid].value;
+            if (value != kEmpty) {
+                // uint32_t size = sycl::atomic<uint32_t>(sycl::global_ptr<uint32_t>(kvs_size)).fetch_add(1);
+                uint32_t size = sycl::atomic_ref<uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>(kvs_size[0]).fetch_add(1);
+                kvs[size] = pHashTable[tid];
+            }
+        }
+    }
+}
+
+std::vector<KeyValue> iterate_hashtable(
+    KeyValue* pHashTable,
+    sycl::queue& qht)
+{
+    std::vector<KeyValue> kvs;
+
+    try {
+        uint32_t* device_num_kvs;
+        KeyValue* device_kvs;
+        device_num_kvs = sycl::malloc_device<uint32_t>(1, qht);
+        device_kvs = sycl::malloc_device<KeyValue>(kNumKeyValues, qht);
+
+        auto e1 = qht.memset(device_num_kvs, 0, sizeof(uint32_t));
+
+        int threadblocksize = 256;
+
+        auto e2 = qht.parallel_for(
+            sycl::nd_range<1>(kHashTableCapacity, threadblocksize), std::move(e1),
+            [=](sycl::nd_item<1> item) {
+
+                gpu_iterate_hashtable(
+                    pHashTable,
+                    device_kvs,
+                    device_num_kvs,
+                    item);
+            }
+        );
+
+        uint32_t num_kvs;
+        qht.memcpy(&num_kvs, device_num_kvs, sizeof(uint32_t), std::move(e2));
+        qht.wait();
+
+        kvs.resize(num_kvs);
+
+        qht.memcpy(kvs.data(), device_kvs, sizeof(KeyValue) * num_kvs);
+        qht.wait();
+
+        sycl::free(device_kvs, qht);
+        sycl::free(device_num_kvs, qht);
+    } catch (std::exception const& e) {
+        LOG_ERROR("Exception caught, \'" << e.what() << "\'");
+    } catch (...) {
+        LOG_ERROR("Unknown exception caught, bailing...");
+    }
+
+    return kvs;
+}
+
+// Free the memory of the hashtable
+void destroy_hashtable(
+    KeyValue* pHashTable,
+    sycl::queue& qht)
+{
+    sycl::free(pHashTable, qht);
+}
diff --git a/src/linearprobing.cu b/src/linearprobing.cu
deleted file mode 100644
index ac75d9d..0000000
--- a/src/linearprobing.cu
+++ /dev/null
@@ -1,268 +0,0 @@
-#include "stdio.h"
-#include "stdint.h"
-#include "vector"
-#include "linearprobing.h"
-
-// 32 bit Murmur3 hash
-__device__ uint32_t hash(uint32_t k)
-{
-    k ^= k >> 16;
-    k *= 0x85ebca6b;
-    k ^= k >> 13;
-    k *= 0xc2b2ae35;
-    k ^= k >> 16;
-    return k & (kHashTableCapacity-1);
-}
-
-// Create a hash table. For linear probing, this is just an array of KeyValues
-KeyValue* create_hashtable() 
-{
-    // Allocate memory
-    KeyValue* hashtable;
-    cudaMalloc(&hashtable, sizeof(KeyValue) * kHashTableCapacity);
-
-    // Initialize hash table to empty
-    static_assert(kEmpty == 0xffffffff, "memset expected kEmpty=0xffffffff");
-    cudaMemset(hashtable, 0xff, sizeof(KeyValue) * kHashTableCapacity);
-
-    return hashtable;
-}
-
-// Insert the key/values in kvs into the hashtable
-__global__ void gpu_hashtable_insert(KeyValue* hashtable, const KeyValue* kvs, unsigned int numkvs)
-{
-    unsigned int threadid = blockIdx.x*blockDim.x + threadIdx.x;
-    if (threadid < numkvs)
-    {
-        uint32_t key = kvs[threadid].key;
-        uint32_t value = kvs[threadid].value;
-        uint32_t slot = hash(key);
-
-        while (true)
-        {
-            uint32_t prev = atomicCAS(&hashtable[slot].key, kEmpty, key);
-            if (prev == kEmpty || prev == key)
-            {
-                hashtable[slot].value = value;
-                return;
-            }
-
-            slot = (slot + 1) & (kHashTableCapacity-1);
-        }
-    }
-}
- 
-void insert_hashtable(KeyValue* pHashTable, const KeyValue* kvs, uint32_t num_kvs)
-{
-    // Copy the keyvalues to the GPU
-    KeyValue* device_kvs;
-    cudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs);
-    cudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice);
-
-    // Have CUDA calculate the thread block size
-    int mingridsize;
-    int threadblocksize;
-    cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0);
-
-    // Create events for GPU timing
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    cudaEventRecord(start);
-
-    // Insert all the keys into the hash table
-    int gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize;
-    gpu_hashtable_insert<<<gridsize, threadblocksize>>>(pHashTable, device_kvs, (uint32_t)num_kvs);
-
-    cudaEventRecord(stop);
-
-    cudaEventSynchronize(stop);
-
-    float milliseconds = 0;
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    float seconds = milliseconds / 1000.0f;
-    printf("    GPU inserted %d items in %f ms (%f million keys/second)\n", 
-        num_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f);
-
-    cudaFree(device_kvs);
-}
-
-// Lookup keys in the hashtable, and return the values
-__global__ void gpu_hashtable_lookup(KeyValue* hashtable, KeyValue* kvs, unsigned int numkvs)
-{
-    unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (threadid < numkvs)
-    {
-        uint32_t key = kvs[threadid].key;
-        uint32_t slot = hash(key);
-
-        while (true)
-        {
-            if (hashtable[slot].key == key)
-            {
-                kvs[threadid].value = hashtable[slot].value;
-                return;
-            }
-            if (hashtable[slot].key == kEmpty)
-            {
-                kvs[threadid].value = kEmpty;
-                return;
-            }
-            slot = (slot + 1) & (kHashTableCapacity - 1);
-        }
-    }
-}
-
-void lookup_hashtable(KeyValue* pHashTable, KeyValue* kvs, uint32_t num_kvs)
-{
-    // Copy the keyvalues to the GPU
-    KeyValue* device_kvs;
-    cudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs);
-    cudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice);
-
-    // Have CUDA calculate the thread block size
-    int mingridsize;
-    int threadblocksize;
-    cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0);
-
-    // Create events for GPU timing
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    cudaEventRecord(start);
-
-    // Insert all the keys into the hash table
-    int gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize;
-    gpu_hashtable_lookup << <gridsize, threadblocksize >> > (pHashTable, device_kvs, (uint32_t)num_kvs);
-
-    cudaEventRecord(stop);
-
-    cudaEventSynchronize(stop);
-
-    float milliseconds = 0;
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    float seconds = milliseconds / 1000.0f;
-    printf("    GPU lookup %d items in %f ms (%f million keys/second)\n",
-        num_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f);
-
-    cudaFree(device_kvs);
-}
-
-// Delete each key in kvs from the hash table, if the key exists
-// A deleted key is left in the hash table, but its value is set to kEmpty
-// Deleted keys are not reused; once a key is assigned a slot, it never moves
-__global__ void gpu_hashtable_delete(KeyValue* hashtable, const KeyValue* kvs, unsigned int numkvs)
-{
-    unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (threadid < numkvs)
-    {
-        uint32_t key = kvs[threadid].key;
-        uint32_t slot = hash(key);
-
-        while (true)
-        {
-            if (hashtable[slot].key == key)
-            {
-                hashtable[slot].value = kEmpty;
-                return;
-            }
-            if (hashtable[slot].key == kEmpty)
-            {
-                return;
-            }
-            slot = (slot + 1) & (kHashTableCapacity - 1);
-        }
-    }
-}
-
-void delete_hashtable(KeyValue* pHashTable, const KeyValue* kvs, uint32_t num_kvs)
-{
-    // Copy the keyvalues to the GPU
-    KeyValue* device_kvs;
-    cudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs);
-    cudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice);
-
-    // Have CUDA calculate the thread block size
-    int mingridsize;
-    int threadblocksize;
-    cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0);
-
-    // Create events for GPU timing
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    cudaEventRecord(start);
-
-    // Insert all the keys into the hash table
-    int gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize;
-    gpu_hashtable_delete<< <gridsize, threadblocksize >> > (pHashTable, device_kvs, (uint32_t)num_kvs);
-
-    cudaEventRecord(stop);
-
-    cudaEventSynchronize(stop);
-
-    float milliseconds = 0;
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    float seconds = milliseconds / 1000.0f;
-    printf("    GPU delete %d items in %f ms (%f million keys/second)\n",
-        num_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f);
-
-    cudaFree(device_kvs);
-}
-
-// Iterate over every item in the hashtable; return non-empty key/values
-__global__ void gpu_iterate_hashtable(KeyValue* pHashTable, KeyValue* kvs, uint32_t* kvs_size)
-{
-    unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (threadid < kHashTableCapacity) 
-    {
-        if (pHashTable[threadid].key != kEmpty) 
-        {
-            uint32_t value = pHashTable[threadid].value;
-            if (value != kEmpty)
-            {
-                uint32_t size = atomicAdd(kvs_size, 1);
-                kvs[size] = pHashTable[threadid];
-            }
-        }
-    }
-}
-
-std::vector<KeyValue> iterate_hashtable(KeyValue* pHashTable)
-{
-    uint32_t* device_num_kvs;
-    cudaMalloc(&device_num_kvs, sizeof(uint32_t));
-    cudaMemset(device_num_kvs, 0, sizeof(uint32_t));
-
-    KeyValue* device_kvs;
-    cudaMalloc(&device_kvs, sizeof(KeyValue) * kNumKeyValues);
-
-    int mingridsize;
-    int threadblocksize;
-    cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_iterate_hashtable, 0, 0);
-
-    int gridsize = (kHashTableCapacity + threadblocksize - 1) / threadblocksize;
-    gpu_iterate_hashtable<<<gridsize, threadblocksize>>>(pHashTable, device_kvs, device_num_kvs);
-
-    uint32_t num_kvs;
-    cudaMemcpy(&num_kvs, device_num_kvs, sizeof(uint32_t), cudaMemcpyDeviceToHost);
-
-    std::vector<KeyValue> kvs;
-    kvs.resize(num_kvs);
-
-    cudaMemcpy(kvs.data(), device_kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyDeviceToHost);
-
-    cudaFree(device_kvs);
-    cudaFree(device_num_kvs);
-
-    return kvs;
-}
-
-// Free the memory of the hashtable
-void destroy_hashtable(KeyValue* pHashTable)
-{
-    cudaFree(pHashTable);
-}
diff --git a/src/linearprobing.h b/src/linearprobing.h
index 6487e8b..5cadba3 100644
--- a/src/linearprobing.h
+++ b/src/linearprobing.h
@@ -1,25 +1,63 @@
+/* Modifications Copyright (C) 2023 Intel Corporation
+ *
+ * This Program is subject to the terms of The Unlicense.​
+ * If a copy of the license was not distributed with this file, ​
+ * you can obtain one at https://spdx.org/licenses/Unlicense.html​
+ *​
+ *
+ * SPDX-License-Identifier: Unlicense
+ */
+
 #pragma once
 
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <sstream>
+
+#ifndef CPP_MODULE
+#define CPP_MODULE "UNKN"
+#endif
+
 struct KeyValue
 {
     uint32_t key;
     uint32_t value;
 };
 
-const uint32_t kHashTableCapacity = 128 * 1024 * 1024;
+const uint32_t kHashTableCapacity = 256 * 1024 * 1024;
 
 const uint32_t kNumKeyValues = kHashTableCapacity / 2;
 
-const uint32_t kEmpty = 0xffffffff;
+const uint32_t kEmpty = 0xFFFFFFFF;
+
+const uint32_t NUM_LOOPS = 1;
 
-KeyValue* create_hashtable();
+KeyValue* create_hashtable(sycl::queue& qht);
 
-void insert_hashtable(KeyValue* hashtable, const KeyValue* kvs, uint32_t num_kvs);
+void insert_hashtable(KeyValue* hashtable, const KeyValue* kvs, uint32_t num_kvs, sycl::queue& qht);
+void lookup_hashtable(KeyValue* hashtable,       KeyValue* kvs, uint32_t num_kvs, sycl::queue& qht);
+void delete_hashtable(KeyValue* hashtable, const KeyValue* kvs, uint32_t num_kvs, sycl::queue& qht);
 
-void lookup_hashtable(KeyValue* hashtable, KeyValue* kvs, uint32_t num_kvs);
+std::vector<KeyValue> iterate_hashtable(KeyValue* hashtable, sycl::queue& qht);
 
-void delete_hashtable(KeyValue* hashtable, const KeyValue* kvs, uint32_t num_kvs);
+void destroy_hashtable(KeyValue* hashtable, sycl::queue& qht);
 
-std::vector<KeyValue> iterate_hashtable(KeyValue* hashtable);
+#define checkCUDA(expression)                                   \
+{                                                               \
+    cudaError_t const status(expression);                       \
+    if (status != cudaSuccess) {                                \
+        std::stringstream sErrorMessage;                        \
+        sErrorMessage << "Error on line " << __LINE__ << ": "   \
+                      << cudaGetErrorString(status) << "\n";    \
+        throw std::runtime_error(sErrorMessage.str());          \
+        std::exit(EXIT_FAILURE);                                \
+    }                                                           \
+}
 
-void destroy_hashtable(KeyValue* hashtable);
+#define LOG_ERROR(msg)                                                              \
+{                                                                                   \
+    std::stringstream sErrorMessage;                                                \
+    sErrorMessage << CPP_MODULE << " ERROR(" << __LINE__<< "): " <<  msg << "\n";   \
+    std::cerr << sErrorMessage.str();                                               \
+    throw std::runtime_error(sErrorMessage.str());                                  \
+}
diff --git a/src/main.cpp b/src/main.cpp
index 9b15dc2..c7f8e32 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,3 +1,13 @@
+/* Modifications Copyright (C) 2023 Intel Corporation
+ *
+ * This Program is subject to the terms of The Unlicense.​
+ * If a copy of the license was not distributed with this file, ​
+ * you can obtain one at https://spdx.org/licenses/Unlicense.html​
+ *​
+ *
+ * SPDX-License-Identifier: Unlicense
+ */
+
 #include "algorithm"
 #include "random"
 #include "stdint.h"
@@ -6,11 +16,48 @@
 #include "unordered_set"
 #include "vector"
 #include "chrono"
+#include <cstring>
+
+// #define DEBUG_TIME
+#define CPP_MODULE "MAIN"
 #include "linearprobing.h"
 
+#define TIMER_START() time_start = std::chrono::steady_clock::now();
+#define TIMER_END()                                                                         \
+    time_end = std::chrono::steady_clock::now();                                            \
+    time_total  = std::chrono::duration<double, std::milli>(time_end - time_start).count();
+#define TIMER_PRINT(name) std::cout << name <<": " << time_total / 1e3 << " s\n";
+
+#ifdef DEBUG_TIME
+#define START_TIMER() start_time = std::chrono::steady_clock::now();
+#define STOP_TIMER()                                                                        \
+    stop_time = std::chrono::steady_clock::now();                                           \
+    duration  = std::chrono::duration<double, std::milli>(stop_time - start_time).count();  \
+    tot_time += duration;
+#define PRINT_TIMER(name) std::cout <<name <<"      : " << duration << " ms\n";
+#endif
+
+using Time = std::chrono::time_point<std::chrono::steady_clock>;
+
+Time start_timer() 
+{
+    return std::chrono::steady_clock::now();
+}
+
+double get_elapsed_time(Time start) 
+{
+    Time end = std::chrono::steady_clock::now();
+
+    std::chrono::duration<double> d = end - start;
+    std::chrono::microseconds us = std::chrono::duration_cast<std::chrono::microseconds>(d);
+    return us.count() / 1000.0f;
+}
+
 // Create random keys/values in the range [0, kEmpty)
 // kEmpty is used to indicate an empty slot
-std::vector<KeyValue> generate_random_keyvalues(std::mt19937& rnd, uint32_t numkvs)
+std::vector<KeyValue> generate_random_keyvalues(
+    std::mt19937& rnd,
+    uint32_t numkvs)
 {
     std::uniform_int_distribution<uint32_t> dis(0, kEmpty - 1);
 
@@ -28,7 +75,10 @@ std::vector<KeyValue> generate_random_keyvalues(std::mt19937& rnd, uint32_t numk
 }
 
 // return numshuffledkvs random items from kvs
-std::vector<KeyValue> shuffle_keyvalues(std::mt19937& rnd, std::vector<KeyValue> kvs, uint32_t numshuffledkvs)
+std::vector<KeyValue> shuffle_keyvalues(
+    std::mt19937& rnd,
+    std::vector<KeyValue> kvs,
+    uint32_t numshuffledkvs)
 {
     std::shuffle(kvs.begin(), kvs.end(), rnd);
 
@@ -40,23 +90,9 @@ std::vector<KeyValue> shuffle_keyvalues(std::mt19937& rnd, std::vector<KeyValue>
     return shuffled_kvs;
 }
 
-using Time = std::chrono::time_point<std::chrono::high_resolution_clock>;
-
-Time start_timer() 
-{
-    return std::chrono::high_resolution_clock::now();
-}
-
-double get_elapsed_time(Time start) 
-{
-    Time end = std::chrono::high_resolution_clock::now();
-
-    std::chrono::duration<double> d = end - start;
-    std::chrono::microseconds us = std::chrono::duration_cast<std::chrono::microseconds>(d);
-    return us.count() / 1000.0f;
-}
-
-void test_unordered_map(std::vector<KeyValue> insert_kvs, std::vector<KeyValue> delete_kvs) 
+void test_unordered_map(
+    std::vector<KeyValue> insert_kvs,
+    std::vector<KeyValue> delete_kvs) 
 {
     Time timer = start_timer();
 
@@ -78,69 +114,161 @@ void test_unordered_map(std::vector<KeyValue> insert_kvs, std::vector<KeyValue>
 
     double milliseconds = get_elapsed_time(timer);
     double seconds = milliseconds / 1000.0f;
-    printf("Total time for std::unordered_map: %f ms (%f million keys/second)\n", 
+    printf("Total time for std::unordered_map: %f ms (%f Mkeys/second)\n",
         milliseconds, kNumKeyValues / seconds / 1000000.0f);
 }
 
-void test_correctness(std::vector<KeyValue>, std::vector<KeyValue>, std::vector<KeyValue>);
+void test_correctness(
+    std::vector<KeyValue>,
+    std::vector<KeyValue>,
+    std::vector<KeyValue>);
 
-int main() 
+int main(int argc, char* argv[])
 {
+    std::chrono::steady_clock::time_point time_start;
+    std::chrono::steady_clock::time_point time_end;
+    double time_total = 0.0;
+
+    try {
+
     // To recreate the same random numbers across runs of the program, set seed to a specific
     // number instead of a number from random_device
     std::random_device rd;
     uint32_t seed = rd();
     std::mt19937 rnd(seed);  // mersenne_twister_engine
 
-    printf("Random number generator seed = %u\n", seed);
+    // printf("Random number generator seed = %u\n", seed);
 
-    while (true)
-    {
-        printf("Initializing keyvalue pairs with random numbers...\n");
+    // double seconds;
+    // for (uint32_t n = 0; n < NUM_LOOPS; ++n) {
+        // printf("Initializing keyvalue pairs with random numbers...\n");
+
+#ifdef DEBUG_TIME
+        std::chrono::steady_clock::time_point start_time;
+        std::chrono::steady_clock::time_point stop_time;
+        double duration = 0.0;
+        double tot_time = 0.0;
 
+        START_TIMER();
+#endif
         std::vector<KeyValue> insert_kvs = generate_random_keyvalues(rnd, kNumKeyValues);
+        std::vector<KeyValue> lookup_kvs = shuffle_keyvalues(rnd, insert_kvs, kNumKeyValues / 2);
         std::vector<KeyValue> delete_kvs = shuffle_keyvalues(rnd, insert_kvs, kNumKeyValues / 2);
 
-        // Begin test
-        printf("Testing insertion/deletion of %d/%d elements into GPU hash table...\n",
-            (uint32_t)insert_kvs.size(), (uint32_t)delete_kvs.size());
-
-        Time timer = start_timer();
-
-        KeyValue* pHashTable = create_hashtable();
-
-        // Insert items into the hash table
+#ifdef DEBUG_TIME
+STOP_TIMER();
+PRINT_TIMER("generate_hashtable ");
+#endif
+
+        TIMER_START()
+
+#ifdef DEBUG_TIME
+START_TIMER();
+#endif
+        sycl::queue qht;
+#ifdef DEBUG_TIME
+STOP_TIMER();
+PRINT_TIMER("init               ");
+
+START_TIMER();
+#endif
+        // Allocates device memory for the hashtable and
+        // fills every byte with 0xFF (so each key (and value) is set to 0xFFFFFFFF)
+        KeyValue* pHashTable = create_hashtable(qht);
+#ifdef DEBUG_TIME
+STOP_TIMER();
+PRINT_TIMER("create_hashtable   ");
+
+START_TIMER();
+#endif
+        // Insert items into the hash table in batches of num_inserts_per_batch
         const uint32_t num_insert_batches = 16;
         uint32_t num_inserts_per_batch = (uint32_t)insert_kvs.size() / num_insert_batches;
-        for (uint32_t i = 0; i < num_insert_batches; i++)
-        {
-            insert_hashtable(pHashTable, insert_kvs.data() + i * num_inserts_per_batch, num_inserts_per_batch);
+        for (uint32_t i = 0; i < num_insert_batches; i++) {
+
+            insert_hashtable(
+                pHashTable,
+                insert_kvs.data() + i * num_inserts_per_batch,
+                num_inserts_per_batch,
+                qht);
+        }
+#ifdef DEBUG_TIME
+STOP_TIMER();
+PRINT_TIMER("insert_hashtable   ");
+
+START_TIMER();
+#endif
+        // Look up items from the hash table in batches of num_lookups_per_batch
+        const uint32_t num_lookup_batches = 8;
+        uint32_t num_lookups_per_batch = (uint32_t)lookup_kvs.size() / num_lookup_batches;
+        for (uint32_t i = 0; i < num_lookup_batches; i++) {
+
+            lookup_hashtable(
+                pHashTable,
+                lookup_kvs.data() + i * num_lookups_per_batch,
+                num_lookups_per_batch,
+                qht);
         }
+#ifdef DEBUG_TIME
+STOP_TIMER();
+PRINT_TIMER("lookup_hashtable   ");
 
-        // Delete items from the hash table
+START_TIMER();
+#endif
+        // Delete items from the hash table in batches of num_deletes_per_batch
         const uint32_t num_delete_batches = 8;
         uint32_t num_deletes_per_batch = (uint32_t)delete_kvs.size() / num_delete_batches;
-        for (uint32_t i = 0; i < num_delete_batches; i++)
-        {
-            delete_hashtable(pHashTable, delete_kvs.data() + i * num_deletes_per_batch, num_deletes_per_batch);
+        for (uint32_t i = 0; i < num_delete_batches; i++) {
+
+            delete_hashtable(
+                pHashTable,
+                delete_kvs.data() + i * num_deletes_per_batch,
+                num_deletes_per_batch,
+                qht);
         }
+#ifdef DEBUG_TIME
+STOP_TIMER();
+PRINT_TIMER("delete_hashtable   ");
 
+START_TIMER();
+#endif
         // Get all the key-values from the hash table
-        std::vector<KeyValue> kvs = iterate_hashtable(pHashTable);
-
-        destroy_hashtable(pHashTable);
+        std::vector<KeyValue> kvs = iterate_hashtable(pHashTable, qht);
+#ifdef DEBUG_TIME
+STOP_TIMER();
+PRINT_TIMER("iterate_hashtable  ");
+#endif
+// std::cout << "tot_time: " << tot_time << " ms" << std::endl;
 
         // Summarize results
-        double milliseconds = get_elapsed_time(timer);
-        double seconds = milliseconds / 1000.0f;
-        printf("Total time (including memory copies, readback, etc): %f ms (%f million keys/second)\n", milliseconds,
-            kNumKeyValues / seconds / 1000000.0f);
+        // double milliseconds = get_elapsed_time(timer);
+        // seconds = milliseconds / 1000.0f;
 
-        test_unordered_map(insert_kvs, delete_kvs);
+        destroy_hashtable(pHashTable, qht);
 
-        test_correctness(insert_kvs, delete_kvs, kvs);
+    TIMER_END()
+    TIMER_PRINT("hashtable - total time for whole calculation")
+    printf("%f million keys/second\n", kNumKeyValues / (time_total / 1000.0f) / 1000000.0f);
 
-        printf("Success\n");
+        bool verify = true;
+        if (argc > 1 && strcmp(argv[1], "--no-verify") == 0) {
+            verify = false;
+        }
+        if (verify) {
+            test_unordered_map(insert_kvs, delete_kvs);
+            test_correctness(insert_kvs, delete_kvs, std::move(kvs));
+            printf("Success\n");
+        }
+    // }
+    // printf("Total time: %f s\n", seconds);
+    // printf("%f million keys/second\n", kNumKeyValues / seconds / 1000000.0f);
+    
+    } catch (std::exception const& e) {
+        std::cout << "Exception caught, \'" << e.what() << "\'";
+        return 1;
+    } catch (...) {
+        std::cout << "Unknown exception caught, bailing...";
+        return 2;
     }
 
     return 0;
diff --git a/src/test.cpp b/src/test.cpp
index ad454a4..4b8aef4 100644
--- a/src/test.cpp
+++ b/src/test.cpp
@@ -1,3 +1,13 @@
+/* Modifications Copyright (C) 2023 Intel Corporation
+ *
+ * This Program is subject to the terms of The Unlicense.​
+ * If a copy of the license was not distributed with this file, ​
+ * you can obtain one at https://spdx.org/licenses/Unlicense.html​
+ *​
+ *
+ * SPDX-License-Identifier: Unlicense
+ */
+
 #include "stdio.h"
 #include "stdint.h"
 #include "unordered_set"
@@ -7,7 +17,10 @@
 #include "random"
 #include "linearprobing.h"
 
-void test_correctness(std::vector<KeyValue> insert_kvs, std::vector<KeyValue> delete_kvs, std::vector<KeyValue> kvs)
+void test_correctness(
+    std::vector<KeyValue> insert_kvs,
+    std::vector<KeyValue> delete_kvs,
+    std::vector<KeyValue> kvs)
 {
     printf("Testing that there are no duplicate keys...\n");
     std::unordered_set<uint32_t> unique_keys;
@@ -27,7 +40,7 @@ void test_correctness(std::vector<KeyValue> insert_kvs, std::vector<KeyValue> de
 
     printf("Building unordered_map from original list...\n");
     std::unordered_map<uint32_t, std::vector<uint32_t>> all_kvs_map;
-    for (int i = 0; i < insert_kvs.size(); i++)
+    for (uint32_t i = 0; i < insert_kvs.size(); i++)
     {
         if (i % 10000000 == 0)
             printf("    Inserting %d/%d\n", i, (uint32_t)insert_kvs.size());
@@ -43,7 +56,7 @@ void test_correctness(std::vector<KeyValue> insert_kvs, std::vector<KeyValue> de
         }
     }
 
-    for (int i = 0; i < delete_kvs.size(); i++)
+    for (uint32_t i = 0; i < delete_kvs.size(); i++)
     {
         if (i % 10000000 == 0)
             printf("    Deleting %d/%d\n", i, (uint32_t)delete_kvs.size());