diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 36145a85..df7786fd 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -20,7 +20,8 @@ jobs:
 
     - name: Checkout source code
       uses: actions/checkout@v4
-
+    - name: Install MPI
+      run: sudo apt-get update -q && sudo apt-get install -y -q build-essential mpich libmpich-dev python-is-python3 python3
     - name: Cache conda
       uses: actions/cache@v4
       env:
@@ -34,7 +35,7 @@ jobs:
     - name: Cache depends/
       uses: actions/cache@v4
       with:
-        key: ${{ runner.os }}-build-${{ hashFiles('depends/Makefile', 'depends/install_pfft.sh') }}
+        key: ${{ runner.os }}-build-${{ hashFiles('depends/Makefile') }}
         path: |
           ~/depends/install
           ~/depends/include
diff --git a/LICENSE b/LICENSE
index 446508f6..46041b7c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -14,6 +14,4 @@ Redistribution and use in source and binary forms, with or without modification,
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-MP-Gadget is also available under the terms of the GNU Public License v2 or later at the option of the user. MP-Gadget is a derived code of Gadget-2 and has been re-licensed under the above license with the permission of all original copyright holders, including Volker Springel, the author of Gadget-2. 
-
-Note that as MP-Gadget depends on PFFT and the GNU scientific library, a compiled version is implicitly distributed under the terms of the GNU General Public License version under which the version of the library llinked against is available.
+Shenqi is a derived code of Gadget-2 and has been re-licensed under the above license with the permission of all original copyright holders, including Volker Springel, the author of Gadget-2. 
diff --git a/Makefile b/Makefile
index 322699bb..1c5a2eed 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,6 @@ all: $(CONFIG)
 	cd libgenic; $(MAKE)
 	cd gadget; $(MAKE)
 	cd genic; $(MAKE)
-
 clean :
 	cd libgadget; $(MAKE) clean
 	cd libgenic; $(MAKE) clean
@@ -59,9 +58,7 @@ tag:
 sdist:
 	(git rev-parse --abbrev-ref HEAD | grep master )|| (echo "Must be on master" && exit 1);
 	git checkout -B "rc-$(VERSION)";
-	git add -f depends/pfft-1.0.8-alpha2-fftw3.tar.gz
 	git commit -m "rc-$(VERSION) packaging"
-	git rm --cached depends/pfft-1.0.8-alpha2-fftw3.tar.gz
 	git commit -m "rc-$(VERSION) cleanup"
 	bash maintainer/git-archive-all.sh --prefix MPGadget-$(VERSION)/ -- - | gzip -c > MPGadget-$(VERSION).tar.gz
 	git checkout master
diff --git a/Makefile.rules b/Makefile.rules
index 76afb24c..5a9a59d8 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -1,35 +1,57 @@
 # vim: set ft=make:
 #
 AR ?= ar
-MPICC ?= mpicc
+MPICC ?= mpic++
+NVOPTIMIZE ?=  
 LOW_PRECISION ?= double
 
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
-GSL_INCL ?= $(shell pkg-config --cflags gsl)
-GSL_LIBS ?= $(shell pkg-config --libs gsl)
+
 ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
     # If found, set FITSIO_INCL with the cfitsio flags
     FITSIO_INCL ?= $(shell pkg-config --cflags cfitsio)
     FITSIO_LIBS ?= $(shell pkg-config --libs cfitsio)
 endif
 
+ifneq ($(findstring -DUSE_CUDA, $(OPT)),)
+    CUDA_INCL ?= 
+    CUDA_LIBS ?= -lcudart
+    CUFFTMP_INCL ?= 
+    CUFFTMP_LIBS ?= -lcufftMp
+    NVSHMEM_INCL ?= 
+    NVSHMEM_LIBS ?= -lnvshmem_host
+    NVCC ?= nvcc
+    # NVOPTIMIZE is already defined (empty) above, so '?=' would never apply; default to -O3 only when it is empty.
+    NVOPTIMIZE := $(if $(strip $(NVOPTIMIZE)),$(NVOPTIMIZE),-O3)
+    MPI_INCL ?=
+endif
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(GSL_INCL) $(FITSIO_INCL)
+CFLAGS = $(OPTIONS) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 #For tests
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
-BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
-LIBS  = -lm $(GSL_LIBS) $(FITSIO_LIBS)
+#Flags for nvcc (.cu sources): NVOPTIMIZE carries optimisation/-arch flags, the rest are include paths.
+CUDAFLAGS = $(NVOPTIMIZE) $(BOOST_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL) $(MPI_INCL)
+BUNDLEDLIBS = -lbigfile-mpi -lbigfile 
+LIBS  = -lm $(BOOST_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS) $(CUFFTMP_LIBS) $(NVSHMEM_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 
-.objs/%.o: %.c Makefile $(CONFIG)
-	@cmd="$(MPICC) -MMD -c -o $@ $(CFLAGS) $<"; \
+.objs/%.o: %.c $(INCL) Makefile $(CONFIG)
+	@cmd="$(MPICC) -c -o $@ $(CFLAGS) $<"; \
 	if test "x$(V)" = "x1" ; then echo $$cmd; fi; \
 	mkdir -p `dirname $@`; \
 	echo Compiling $<; $$cmd
+
+#Compile CUDA sources with nvcc. Like the C rule above, create the object directory first so a fresh tree builds.
+.objs/%.o: %.cu Makefile $(CONFIG)
+	@cmd="$(NVCC) -MMD -c -o $@ $(CUDAFLAGS) $<"; \
+	if test "x$(V)" = "x1" ; then echo $$cmd; fi; \
+	mkdir -p `dirname $@`; \
+	echo Compiling $<; $$cmd
+
diff --git a/Options.mk.example b/Options.mk.example
index 3a91d5e2..65eea878 100644
--- a/Options.mk.example
+++ b/Options.mk.example
@@ -1,7 +1,10 @@
 #These variables are set to useful defaults, but may be overriden if needed
-#MPICC=mpicc
-#GSL_LIBS=
-#GSL_INCL=
+#MPICC=mpic++
+#MPICCDEP=mpicc
+
+#NVCC=nvcc
+#NVOPTIMIZE = -O3 -arch=sm_61 # specify the architecture according to your GPU model; sm_90 should be used for Vista's H100
+
 #This is a good optimized build default for gcc
 OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math
 #This is a good non-optimized default for debugging
@@ -12,7 +15,7 @@ OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math
 #OPT += -DDEBUG      # print a lot of debugging messages
 #Disable openmp locking. This means no threading.
 #OPT += -DNO_OPENMP_SPINLOCK
-
+#OPT += -DUSE_CUDA  #Enable GPU-specific CUDA code
 #-----------
 #OPT += -DEXCUR_REION  # reionization with excursion set
 
diff --git a/README.rst b/README.rst
index 17c4af28..5aaf5d37 100644
--- a/README.rst
+++ b/README.rst
@@ -12,7 +12,7 @@ Description
 
 This version of Gadget is derived from main P-Gadget / Gadget-2, with the gravity solver algorithm from Gadget-4.
 It is the source code used to run the BlueTides and ASTRID simulations (http://bluetides-project.org).
-MP-Gadget requires GSL and a C compiler with OpenMP 4.5 support.
+MP-Gadget requires a C++ compiler with OpenMP 4.5 support.
 
 The infrastructure is heavily reworked. As a summary:
 
@@ -50,17 +50,6 @@ First time users:
     make -j
 
 The Makefile will automatically copy Options.mk.example to Options.mk. The default compile flags are appropriate for a linux using gcc, but may not be optimal.
-We will need gsl. On HPC systems with the modules command,
-usually it can be loaded with
-
-.. code:: bash
-
-    module load gsl
-
-    env | grep GSL  # check if GSL path is reasonable
-
-On a common PC/Linux system, refer to your package vendor how to
-install gsl and gsl-devel.
 
 If you wish to perform compile-time customisation (to, eg, change optimizations or use different compilers), you need an Options.mk file. The initial defaults are stored in Options.mk.example.
 
@@ -81,8 +70,6 @@ Compile-time options may be set in Options.mk. The remaining compile time option
 - EXCUR_REION enables the excursion set reionization model.
 - USE_CFITSIO enables the output of lenstools compatible potential planes using cfitsio,
 
-If compilation fails with errors related to the GSL, you may also need to set the GSL_INC or GSL_LIB variables in Options.mk to the filesystem path containing the GSL headers and libraries.
-
 To run a N-Body sim, use IC files with no gas particles.
 
 Now we are ready to build
@@ -140,15 +127,6 @@ Refer to https://github.com/rainwoodman/bigfile for usage.
 Otherwise directly open the blocks with Fortran or C, noting the data-type
 information and attributes in header and attrs files (in plain text)
 
-GLIBC 2.22
-----------
-
-Cray updated their GLIBC to 2.22+ recently.
-A good move but it happens to be a buggy version of GLIBC:
-https://sourceware.org/bugzilla/show_bug.cgi?id=19590
-causing non-existing symbols like `_ZGVcN4v___log_finite`.
-Adding `-lmvec -lmvec_nonshared` to GSL_LIBS works around the issue.
-
 Bigfile
 -------
 
@@ -192,7 +170,7 @@ For usage of the code, here is a DOI for this repository that you can cite
 Licence
 -------
 
-MP-Gadget is distributed under the terms of a 3-clause BSD license or the GNU General Public License v2 or later, at the option of the user. The use of PFFT and GSL libraries usually forces distribution under the terms of the GNU General Public License v3.
+MP-Gadget is distributed under the terms of a 3-clause BSD license or the GNU General Public License v2 or later, at the option of the user.
 
 Status
 ------
diff --git a/depends/Makefile b/depends/Makefile
index d18b2f16..85b6d56d 100644
--- a/depends/Makefile
+++ b/depends/Makefile
@@ -2,34 +2,22 @@ CONFIG = ../Options.mk
 include $(CONFIG)
 
 .PHONY: depends
-.INTERMEDIATE: pfft
-MPICC ?= mpicc
+# The bundled dependencies are plain C: build them with MPICCDEP (a C compiler wrapper), not MPICC.
+MPICCDEP ?= mpicc
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
 LIBRARIES=lib/libbigfile-mpi.a
-FFTLIBRARIES=lib/libpfft_omp.a lib/libfftw3_mpi.a lib/libfftw3_omp.a
-depends: $(LIBRARIES) $(FFTLIBRARIES)
-$(FFTLIBRARIES): pfft
+depends: $(LIBRARIES)
 
 lib/libbigfile-mpi.a: bigfile/src/bigfile-mpi.c
 	mkdir -p lib; \
 	mkdir -p include; \
 	cd bigfile/src; \
-	make install PREFIX=$(PWD) CC="$(MPICC)" MPICC="$(MPICC)" CFLAGS="$(OPTIMIZE)" AR="$(AR)"
+	make install PREFIX=$(PWD) CC="$(MPICCDEP)" MPICC="$(MPICCDEP)" CFLAGS="$(OPTIMIZE)" AR="$(AR)"
 
-pfft: install_pfft.sh
-	mkdir -p lib; \
-	mkdir -p include; \
-	#Using -ipo causes icc to crash.
-	MPICC="$(MPICC)" CC="$(MPICC)" CFLAGS="$(filter-out -ipo,$(OPTIMIZE)) -I $(PWD)/include -L$(PWD)/lib" AR="$(AR)" RANLIB=$(RANLIB) \
-        sh $(PWD)/install_pfft.sh $(PWD)/
 
-clean: clean-fast clean-fft
+clean: clean-fast
 
 clean-fast:
 	rm -rf $(LIBRARIES)
 	cd bigfile/src; make clean
 
-clean-fft:
-	rm -rf $(FFTLIBRARIES)
-	rm -rf tmp-pfft-*/double
-	rm -rf tmp-pfft-*/single
diff --git a/depends/bigfile/.github/workflows/main.yaml b/depends/bigfile/.github/workflows/main.yaml
index b0c7c170..366351ab 100644
--- a/depends/bigfile/.github/workflows/main.yaml
+++ b/depends/bigfile/.github/workflows/main.yaml
@@ -65,7 +65,7 @@ jobs:
                numpy=${{ matrix.numpy-version }} \
                nose cython mpi4py \
                compilers
-        conda install -q -y cmake gsl
+        conda install -q -y cmake
         conda install -q -y runtests
 
     - name: Build C
diff --git a/depends/bigfile/CMakeLists.txt b/depends/bigfile/CMakeLists.txt
index 48527264..843bc005 100644
--- a/depends/bigfile/CMakeLists.txt
+++ b/depends/bigfile/CMakeLists.txt
@@ -4,7 +4,6 @@ project(bigfile)
 
 # Finding optional dependencies
 find_package(MPI)
-find_package(GSL)
 
 # Add library subdirectoy
 add_subdirectory(src)
diff --git a/depends/bigfile/utils/CMakeLists.txt b/depends/bigfile/utils/CMakeLists.txt
index ce157662..540787e6 100644
--- a/depends/bigfile/utils/CMakeLists.txt
+++ b/depends/bigfile/utils/CMakeLists.txt
@@ -55,17 +55,6 @@ if(${MPI_C_FOUND})
     
     install(TARGETS bigfile-copy-mpi bigfile-iosim
             RUNTIME DESTINATION bin)
-    
-    if(${GSL_FOUND})
-        include_directories(${GSL_INCLUDE_DIRS})
-
-        # bigfile-sample-mpi
-        add_executable(bigfile-sample-mpi bigfile-sample-mpi.c)
-        target_link_libraries(bigfile-sample-mpi bigfile-mpi bigfile ${GSL_LIBRARIES} ${MPI_C_LIBRARIES})
-        
-        install(TARGETS bigfile-sample-mpi
-                RUNTIME DESTINATION bin)
-    endif()
 endif()
 
 # Install bash scripts
diff --git a/depends/bigfile/utils/Makefile b/depends/bigfile/utils/Makefile
index 4a5771a4..6b09206d 100644
--- a/depends/bigfile/utils/Makefile
+++ b/depends/bigfile/utils/Makefile
@@ -4,7 +4,6 @@ all: \
 	bigfile-set-attr \
 	bigfile-copy \
 	bigfile-copy-mpi \
-	bigfile-sample-mpi \
 	bigfile-cat \
 	bigfile-create \
 	bigfile-ls \
@@ -19,8 +18,6 @@ bigfile-copy: bigfile-copy.c ../src/libbigfile.a
 	$(CC) -o $@ $< ../src/libbigfile.a -I../src
 bigfile-copy-mpi: bigfile-copy-mpi.c ../src/libbigfile.a ../src/libbigfile-mpi.a
 	$(CC) -o $@ $< ../src/libbigfile-mpi.a ../src/libbigfile.a -I../src
-bigfile-sample-mpi: bigfile-sample-mpi.c ../src/libbigfile.a ../src/libbigfile-mpi.a
-	$(CC) -o $@ $< ../src/libbigfile-mpi.a ../src/libbigfile.a -I../src -lgsl -lgslcblas -lm
 bigfile-cat: bigfile-cat.c ../src/libbigfile.a
 	$(CC) -o $@ $< ../src/libbigfile.a -I../src
 bigfile-create: bigfile-create.c ../src/libbigfile.a
diff --git a/depends/bigfile/utils/bigfile-sample-mpi.c b/depends/bigfile/utils/bigfile-sample-mpi.c
deleted file mode 100644
index 3dcf31b6..00000000
--- a/depends/bigfile/utils/bigfile-sample-mpi.c
+++ /dev/null
@@ -1,280 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-#include <stddef.h>
-#include <unistd.h>
-#include <math.h>
-#include "bigfile-mpi.h"
-#include <gsl/gsl_rng.h>
-#include <gsl/gsl_randist.h>
-
-void usage() {
-    fprintf(stderr, "usage: bigfile-sample-mpi [-r ratio] [-N Nfile] [-f newfilepath] filepath block newblock\n");
-    exit(1);
-
-}
-#define DONE_TAG 1293
-#define ERROR_TAG 1295
-#define DIE_TAG 1290
-#define WORK_TAG 1291
-
-MPI_Datatype MPI_TYPE_WORK;
-BigFile bf = {0};
-BigFile bfnew = {0};
-BigBlock bb = {0};
-BigBlock bbnew = {0};
-int verbose = 0;
-int Nfile = -1;
-size_t CHUNKSIZE = 1 * 1024 * 1024;
-int ThisTask, NTask;
-char * newfilepath = NULL;
-void slave(void);
-void server(void);
-
-double ratio = 1.0;
-struct work {
-    int64_t offset;
-    int64_t seed;
-    int64_t chunksize;
-    int64_t offsetnew;
-    int64_t nsel;
-};
-
-static size_t filesize();
-
-int main(int argc, char * argv[]) {
-
-    MPI_Init(&argc, &argv);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-    MPI_Comm_size(MPI_COMM_WORLD, &NTask);
-
-    MPI_Type_contiguous(sizeof(struct work), MPI_BYTE, &MPI_TYPE_WORK);
-    MPI_Type_commit(&MPI_TYPE_WORK);
-
-    int ch;
-    while(-1 != (ch = getopt(argc, argv, "n:N:vf:r:"))) {
-        switch(ch) {
-            case 'r':
-                ratio = atof(optarg);
-                break;
-            case 'N':
-            case 'n':
-                Nfile = atoi(optarg);
-                break;
-            case 'f':
-                newfilepath = optarg;
-                break;
-            case 'v':
-                verbose = 1;
-                break;
-            default:
-                usage();
-        }
-    }
-    if(argc - optind + 1 != 4) {
-        usage();
-    }
-    argv += optind - 1;
-    if(0 != big_file_mpi_open(&bf, argv[1], MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to open: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-    if(0 != big_file_mpi_open_block(&bf, &bb, argv[2], MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to open: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-    if(Nfile == -1 || bb.Nfile == 0) {
-        Nfile = bb.Nfile;
-    }
-    if(newfilepath == NULL) {
-        newfilepath = argv[1];
-    }
-    if(0 != big_file_mpi_create(&bfnew, newfilepath, MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to open: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-    size_t newsize = filesize();
-    if(0 != big_file_mpi_create_block(&bfnew, &bbnew, argv[3], bb.dtype, bb.nmemb, Nfile, newsize, MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to create temp: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-
-    /* copy attrs */
-    size_t nattr;
-    BigAttr * attrs = big_block_list_attrs(&bb, &nattr);
-    int i;
-    for(i = 0; i < nattr; i ++) {
-        BigAttr * attr = &attrs[i];
-        big_block_set_attr(&bbnew, attr->name, attr->data, attr->dtype, attr->nmemb);
-    }
-
-    if(bb.nmemb > 0 && bb.size > 0) {
-    /* copy data */
-        if(ThisTask == 0) {
-            server();
-        } else {
-            slave();
-        }
-    }
-    if(0 != big_block_mpi_close(&bbnew, MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to close new: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-    big_block_mpi_close(&bb, MPI_COMM_WORLD);
-    big_file_mpi_close(&bf, MPI_COMM_WORLD);
-    big_file_mpi_close(&bfnew, MPI_COMM_WORLD);
-    return 0;
-}
-static size_t filesize() {
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(rng, 1984);
-    int64_t offset = 0;
-    int64_t offsetnew = 0;
-    struct work work;
-    for(offset = 0; offset < bb.size; ) {
-        int64_t chunksize = CHUNKSIZE;
-
-        /* never read beyond my end (read_simple caps at EOF) */
-        if(offset + chunksize >= bb.size) {
-            /* this is the last chunk */
-            chunksize = bb.size - offset;
-        }
-        work.offset = offset;
-        work.chunksize = chunksize;
-        work.seed = gsl_rng_get(rng);
-        work.offsetnew = offsetnew;
-        if(ratio == 1.0) {
-            work.nsel = chunksize;
-        } else {
-            work.nsel = gsl_ran_poisson(rng, chunksize * ratio);
-        }
-
-        offset += chunksize;
-        offsetnew += work.nsel;
-    }
-    return offsetnew;
-}
-void server() {
-    int64_t offset = 0;
-    int64_t offsetnew = 0;
-    struct work work;
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(rng, 1984);
-    for(offset = 0; offset < bb.size; ) {
-        int64_t chunksize = CHUNKSIZE;
-        MPI_Status status;
-        int result = 0;
-        MPI_Recv(&result, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
-                &status);
-        if(status.MPI_TAG == ERROR_TAG) {
-            break;
-        }
-
-        /* never read beyond my end (read_simple caps at EOF) */
-        if(offset + chunksize >= bb.size) {
-            /* this is the last chunk */
-            chunksize = bb.size - offset;
-        }
-        work.offset = offset;
-        work.chunksize = chunksize;
-        work.seed = gsl_rng_get(rng);
-        work.offsetnew = offsetnew;
-        if(ratio == 1.0) {
-            work.nsel = chunksize;
-        } else {
-            work.nsel = gsl_ran_poisson(rng, chunksize * ratio);
-        }
-        MPI_Send(&work, 1, MPI_TYPE_WORK, status.MPI_SOURCE, WORK_TAG, MPI_COMM_WORLD);
-
-        offset += chunksize;
-        offsetnew += work.nsel;
-        if(verbose) {
-            fprintf(stderr, "%td / %td done (%0.4g%%)\r", offset, bb.size, (100. / bb.size) * offset);
-        }
-    }
-    int i;
-    for(i = 1; i < NTask; i ++) {
-        struct work work;
-        MPI_Send(&work, 1, MPI_TYPE_WORK, i, DIE_TAG, MPI_COMM_WORLD);
-    }
-
-}
-void slave() {
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937);
-    int result = 0;
-    MPI_Send(&result, 1, MPI_INT, 0, DONE_TAG, MPI_COMM_WORLD);
-    while(1) {
-        struct work work;
-        MPI_Status status;
-        MPI_Recv(&work, 1, MPI_TYPE_WORK, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
-
-        if(status.MPI_TAG == DIE_TAG) {
-            break;
-        }
-        gsl_rng_set(rng, work.seed);
-
-        int64_t offset = work.offset;
-        int64_t chunksize = work.chunksize;
-        int64_t offsetnew = work.offsetnew;
-        int64_t nsel = work.nsel;
-        BigArray array;
-        BigBlockPtr ptrnew;
-        BigArray arraynew;
-
-        size_t dims[2];
-        void * buffer = malloc(dtype_itemsize(bb.dtype) * bb.nmemb * nsel);
-        dims[0] = nsel;
-        dims[1] = bb.nmemb;
-        big_array_init(&arraynew, buffer, bb.dtype, 2, dims, NULL);
-
-        ptrdiff_t i;
-        size_t step = dtype_itemsize(bb.dtype) * bb.nmemb;
-        size_t leftover = chunksize;
-        char * p = buffer;
-        char * q;
-        if(0 != big_block_read_simple(&bb, offset, chunksize, &array, NULL)) {
-            fprintf(stderr, "failed to read original: %s\n", big_file_get_error_message());
-            result = -1;
-            goto bad;
-        }
-        q = array.data;
-
-//        printf("%ld %ld\n", nsel, leftover);
-        for(i = 0; i < chunksize; i ++) {
-            int64_t r = gsl_rng_uniform_int(rng, leftover);
-            if(r < nsel) {
-                memcpy(p, q, step);
-                p += step;
-                nsel --;
-            }
-            if(nsel == 0) break;
-            leftover --;
-            q += step;
-        }
-        if(nsel != 0) abort();
-        free(array.data);
-        if(0 != big_block_seek(&bbnew, &ptrnew, offsetnew)) {
-            fprintf(stderr, "failed to seek new: %s\n", big_file_get_error_message());
-            result = -1;
-            free(arraynew.data);
-            goto bad;
-        }
-
-        if(0 != big_block_write(&bbnew, &ptrnew, &arraynew)) {
-            fprintf(stderr, "failed to write new: %s\n", big_file_get_error_message());
-            result = -1;
-            free(arraynew.data);
-            goto bad;
-        }
-
-        free(arraynew.data);
-        MPI_Send(&result, 1, MPI_INT, 0, DONE_TAG, MPI_COMM_WORLD);
-        continue;
-    bad:
-        MPI_Send(&result, 1, MPI_INT, 0, ERROR_TAG, MPI_COMM_WORLD);
-        continue;
-    }
-    return;
-}
diff --git a/depends/install_pfft.sh b/depends/install_pfft.sh
deleted file mode 100644
index 9be12703..00000000
--- a/depends/install_pfft.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh -e
-
-PREFIX="$1"
-shift
-OPTIMIZE="$*"
-OPTIMIZE1="$*"
-echo "Optimization for double" ${OPTIMIZE}
-
-PFFT_VERSION=1.0.8-alpha3-fftw3-2don2d
-TMP="tmp-pfft-$PFFT_VERSION"
-LOGFILE="build.log"
-
-mkdir $TMP 
-ROOT=`dirname $0`/../
-if ! [ -f $ROOT/depends/pfft-$PFFT_VERSION.tar.gz ]; then
-wget https://github.com/rainwoodman/pfft/releases/download/$PFFT_VERSION/pfft-$PFFT_VERSION.tar.gz \
-    -O $ROOT/depends/pfft-$PFFT_VERSION.tar.gz 
-fi
-
-gzip -dc $ROOT/depends/pfft-$PFFT_VERSION.tar.gz | tar xf - -C $TMP
-cd $TMP
-
-(
-mkdir -p double;cd double
-
-../pfft-${PFFT_VERSION}/configure --prefix=$PREFIX --disable-shared --enable-static --enable-openmp \
---disable-fortran --disable-dependency-tracking --disable-doc --enable-mpi ${OPTIMIZE} &&
-make -j 8   &&
-make install && echo "PFFT_DONE"
-) 2>&1 > ${LOGFILE}.double
-
-if ! grep PFFT_DONE ${LOGFILE}.double > /dev/null; then
-    tail ${LOGFILE}.double
-    exit 1
-fi
diff --git a/gadget/main.c b/gadget/main.c
index f3e84eee..1cf28fa0 100644
--- a/gadget/main.c
+++ b/gadget/main.c
@@ -6,7 +6,6 @@
 #include <sys/resource.h>
 #include <unistd.h>
 #include <math.h>
-#include <gsl/gsl_errno.h>
 #include <omp.h>
 
 #include <libgadget/slotsmanager.h>
@@ -21,11 +20,6 @@
 
 #include "params.h"
 
-void gsl_handler (const char * reason, const char * file, int line, int gsl_errno)
-{
-    endrun(2001,"GSL_ERROR in file: %s, line %d, errno:%d, error: %s\n",file, line, gsl_errno, reason);
-}
-
 /*! \file main.c
  *  \brief start of the program
  */
@@ -107,9 +101,6 @@ int main(int argc, char **argv)
         endrun(0, "Need to give the snapshot number if FOF is selected for output\n");
     }
 
-    /*Set up GSL so it gives a proper MPI termination*/
-    gsl_set_error_handler(gsl_handler);
-
     /*Initialize the memory manager*/
     mymalloc_init(MaxMemSizePerNode);
 
diff --git a/gadget/params.c b/gadget/params.c
index b7773196..b0087ef9 100644
--- a/gadget/params.c
+++ b/gadget/params.c
@@ -21,10 +21,7 @@
 #include <libgadget/timebinmgr.h>
 #include <libgadget/petaio.h>
 #include <libgadget/cooling_qso_lightup.h>
-#include <libgadget/metal_return.h>
-#include <libgadget/uvbg.h>
 #include <libgadget/stats.h>
-#include <libgadget/plane.h>
 
 static int
 BlackHoleFeedbackMethodAction (ParameterSet * ps, const char * name, void * data)
@@ -420,7 +417,6 @@ void read_parameter_file(char *fname, int * ShowBacktrace, double * MaxMemSizePe
 
     /*Initialize per-module parameters.*/
     set_all_global_params(ps);
-    set_plane_params(ps);
     set_init_params(ps);
     set_petaio_params(ps);
     set_timestep_params(ps);
@@ -434,11 +430,9 @@ void read_parameter_file(char *fname, int * ShowBacktrace, double * MaxMemSizePe
     set_domain_params(ps);
     set_sfr_params(ps);
     set_sync_params(ps);
-    set_uvbg_params(ps);
     set_winds_params(ps);
     set_fof_params(ps);
     set_blackhole_params(ps);
-    set_metal_return_params(ps);
     set_stats_params(ps);
     parameter_set_free(ps);
 }
diff --git a/genic/main.c b/genic/main.c
index 61b800d5..3efd1bb5 100644
--- a/genic/main.c
+++ b/genic/main.c
@@ -8,7 +8,6 @@
 #include <bigfile-mpi.h>
 #include <libgenic/allvars.h>
 #include <libgenic/proto.h>
-#include <libgenic/thermal.h>
 #include <libgadget/walltime.h>
 #include <libgadget/physconst.h>
 #include <libgadget/petapm.h>
@@ -63,18 +62,10 @@ int main(int argc, char **argv)
   const double meanspacing = All2.BoxSize / DMAX(All2.Ngrid, All2.NgridGas);
   double shift_gas = -All2.ProduceGas * 0.5 * (CP.Omega0 - CP.OmegaBaryon) / CP.Omega0 * meanspacing;
   double shift_dm = All2.ProduceGas * 0.5 * CP.OmegaBaryon / CP.Omega0 * meanspacing;
-  
-  double shift_nu = 0;
-  if(!All2.ProduceGas && All2.NGridNu > 0) {
-      double OmegaNu = get_omega_nu(&CP.ONu, 1);
-      shift_nu = -0.5 * (CP.Omega0 - OmegaNu) / CP.Omega0 * meanspacing;
-      shift_dm = 0.5 * OmegaNu / CP.Omega0 * meanspacing;
-  }
-    
+
   if(All2.PrePosGridCenter){
       shift_dm += 0.5 * meanspacing;
       shift_gas += 0.5 * meanspacing;
-      shift_nu += 0.5 * meanspacing;
   }
 
   /*Write the header*/
@@ -88,15 +79,6 @@ int main(int argc, char **argv)
 
   const int64_t TotNu = (int64_t) All2.NGridNu*All2.NGridNu*All2.NGridNu;
   double total_nufrac = 0;
-  struct thermalvel nu_therm;
-  if(TotNu > 0) {
-    const double kBMNu = 3*CP.ONu.kBtnu / (CP.MNu[0]+CP.MNu[1]+CP.MNu[2]);
-    double v_th = NU_V0(All2.TimeIC, kBMNu, All2.units.UnitVelocity_in_cm_per_s);
-    if(!All2.UsePeculiarVelocity)
-        v_th /= sqrt(All2.TimeIC);
-    total_nufrac = init_thermalvel(&nu_therm, v_th, All2.Max_nuvel/v_th, 0);
-    message(0,"F-D velocity scale: %g. Max particle vel: %g. Fraction of mass in particles: %g\n",v_th*sqrt(All2.TimeIC), All2.Max_nuvel*sqrt(All2.TimeIC), total_nufrac);
-  }
   saveheader(&bf, TotNumPart, TotNumPartGas, TotNu, total_nufrac, All2.BoxSize, &CP, All2);
 
   /*Save the transfer functions*/
@@ -163,33 +145,6 @@ int main(int argc, char **argv)
 
   if(NumPartCDM > 0) {
     displacement_fields(pm, DMType, ICP, NumPartCDM, &CP, All2);
-
-    /*Add a thermal velocity to WDM particles*/
-    if(All2.WDM_therm_mass > 0){
-        int i;
-        double v_th = WDM_V0(All2.TimeIC, All2.WDM_therm_mass, CP.Omega0 - CP.OmegaBaryon - get_omega_nu(&CP.ONu, 1), CP.HubbleParam, All2.units.UnitVelocity_in_cm_per_s);
-        if(!All2.UsePeculiarVelocity)
-           v_th /= sqrt(All2.TimeIC);
-        struct thermalvel WDM;
-        init_thermalvel(&WDM, v_th, 10000/v_th, 0);
-        unsigned int * seedtable = init_rng(All2.Seed+1,All2.Ngrid);
-        gsl_rng * g_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-        /*Seed the random number table with the Id.*/
-        gsl_rng_set(g_rng, seedtable[0]);
-
-        for(i = 0; i < NumPartCDM; i++) {
-             /*Find the slab, and reseed if it has zero z rank*/
-             if(i % All2.Ngrid == 0) {
-                  uint64_t id = idgen_create_id_from_index(idgen_cdm, i);
-                  /*Seed the random number table with x,y index.*/
-                  gsl_rng_set(g_rng, seedtable[id / All2.Ngrid]);
-             }
-             add_thermal_speeds(&WDM, g_rng, ICP[i].Vel);
-        }
-        gsl_rng_free(g_rng);
-        myfree(seedtable);
-    }
-
     write_particle_data(idgen_cdm, 1, &bf, 0, All2.SavePrePos, All2.NumFiles, All2.NumWriters, ICP);
   }
 
@@ -200,43 +155,6 @@ int main(int argc, char **argv)
   }
   myfree(ICP);
 
-  /*Now add random velocity neutrino particles*/
-  if(All2.NGridNu > 0) {
-      int i;
-      IDGenerator idgen_nu[1];
-      idgen_init(idgen_nu, pm, All2.NGridNu, All2.BoxSize);
-
-      int NumPartNu = idgen_nu->NumPart;
-      ICP = (struct ic_part_data *) mymalloc("PartTable", NumPartNu*sizeof(struct ic_part_data));
-
-      NumPartNu = setup_grid(idgen_nu, shift_nu, mass[2], ICP);
-
-	  /*Write initial positions into ICP struct (for neutrinos)*/
-	  for(j=0; j<NumPartNu; j++)
-		  for(k=0; k<3; k++)
-		      ICP[j].PrePos[k] = ICP[j].Pos[k];
-
-      displacement_fields(pm, NuType, ICP, NumPartNu, &CP, All2);
-      unsigned int * seedtable = init_rng(All2.Seed+2,All2.NGridNu);
-      gsl_rng * g_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-      /*Just in case*/
-      gsl_rng_set(g_rng, seedtable[0]);
-      for(i = 0; i < NumPartNu; i++) {
-           /*Find the slab, and reseed if it has zero z rank*/
-           if(i % All2.NGridNu == 0) {
-                uint64_t id = idgen_create_id_from_index(idgen_nu, i);
-                /*Seed the random number table with x,y index.*/
-                gsl_rng_set(g_rng, seedtable[id / All2.NGridNu]);
-           }
-           add_thermal_speeds(&nu_therm, g_rng, ICP[i].Vel);
-      }
-      gsl_rng_free(g_rng);
-      myfree(seedtable);
-
-      write_particle_data(idgen_nu, 2, &bf, TotNumPart+TotNumPartGas, All2.SavePrePos, All2.NumFiles, All2.NumWriters, ICP);
-      myfree(ICP);
-  }
-
   petapm_destroy(pm);
   big_file_mpi_close(&bf, MPI_COMM_WORLD);
 
diff --git a/genic/params.c b/genic/params.c
index 8fa2fa07..25b23c88 100644
--- a/genic/params.c
+++ b/genic/params.c
@@ -28,7 +28,7 @@ create_parameters(void)
     param_declare_int(ps, "Nmesh", OPTIONAL, 0, "Size of the FFT grid used to estimate displacements. Should be > Ngrid.");
     param_declare_int(ps, "Ngrid", REQUIRED, 0, "Size of regular grid on which the undisplaced CDM particles are created.");
     param_declare_int(ps, "NgridGas", OPTIONAL, -1, "Size of regular grid on which the undisplaced gas particles are created.");
-    param_declare_int(ps, "NgridNu", OPTIONAL, 0, "Number of neutrino particles created for hybrid neutrinos.");
+    param_declare_int(ps, "NgridNu", OPTIONAL, 0, "Number of neutrino particles created for hybrid neutrinos. Not supported in this version.");
     param_declare_int(ps, "Seed", REQUIRED, 0, "Random number generator seed used for the phases of the Gaussian random field.");
     param_declare_int(ps, "MakeGlassGas", OPTIONAL, -1, "Generate Glass IC for gas instead of Grid IC.");
     param_declare_int(ps, "MakeGlassCDM", OPTIONAL, 0, "Generate Glass IC for CDM instead of Grid IC.");
@@ -43,7 +43,7 @@ create_parameters(void)
     param_declare_double(ps, "MNue", OPTIONAL, 0, "First neutrino mass in eV.");
     param_declare_double(ps, "MNum", OPTIONAL, 0, "Second neutrino mass in eV.");
     param_declare_double(ps, "MNut", OPTIONAL, 0, "Third neutrino mass in eV.");
-    param_declare_double(ps, "MWDM_therm", OPTIONAL, 0, "Assign a thermal velocity to the DM. Specifies WDM particle mass in keV.");
+    param_declare_double(ps, "MWDM_therm", OPTIONAL, 0, "Not supported in this version.");
     param_declare_double(ps, "Max_nuvel", OPTIONAL, 5000, "Maximum neutrino velocity sampled from the F-D distribution.");
 
     param_declare_int(ps, "DifferentTransferFunctions", OPTIONAL, 1, "Use species specific transfer functions for baryon and CDM.");
diff --git a/libgadget/Makefile b/libgadget/Makefile
index d8f3684b..f3ca903d 100644
--- a/libgadget/Makefile
+++ b/libgadget/Makefile
@@ -19,7 +19,6 @@ TESTED = hci \
 	timebinmgr \
 	neutrinos_lra \
 	omega_nu_single \
-	metal_return \
 	cooling_rates \
 	density \
 	gravity \
@@ -52,20 +51,18 @@ GADGET_OBJS =  \
 	 run.o drift.o stats.o \
 	 timestep.o init.o checkpoint.o \
 	 sfr_eff.o cooling.o cooling_rates.o cooling_uvfluc.o cooling_qso_lightup.o \
-	 winds.o veldisp.o density.o metal_return.o \
+	 winds.o veldisp.o density.o \
 	 treewalk.o cosmology.o \
 	 gravshort-tree.o gravshort-pair.o hydra.o  timefac.o \
 	 gravpm.o powerspectrum.o \
 	 forcetree.o \
-	 petapm.o gravity.o \
-	 densitykernel.o lightcone.o walltime.o\
+	 petapm.o pm_kernel.o gravity.o \
+	 densitykernel.o walltime.o\
 	 runtests.o \
 	 neutrinos_lra.o \
      omega_nu_single.o \
 	 config.o \
-	 uvbg.o \
-	 plane.o\
-	 lenstools.o
+	 plane.o 
 
 GADGET_UTILS_OBJS= \
 utils/endrun.o \
@@ -109,9 +106,6 @@ all: libgadget.a libgadget-utils.a
 .objs/test_density: tests/test_density.c .objs/density.o libgadget.a ../tests/stub.c ../tests/cmocka.c libgadget-utils.a
 	$(MPICC) $(TCFLAGS) -I../tests/ $^ $(LIBS) -o $@
 
-.objs/test_metal_return: tests/test_metal_return.c .objs/metal_return.o libgadget.a ../tests/stub.c ../tests/cmocka.c libgadget-utils.a
-	$(MPICC) $(TCFLAGS) -I../tests/ $^ $(LIBS) -o $@
-
 .objs/test_cooling: tests/test_cooling.c .objs/cooling.o .objs/cooling_rates.o .objs/cooling_uvfluc.o ../tests/stub.c ../tests/cmocka.c libgadget-utils.a
 	$(MPICC) $(TCFLAGS) -I../tests/ $^ $(LIBS) -o $@
 
diff --git a/libgadget/box_iterator.cpp b/libgadget/box_iterator.cpp
new file mode 100644
index 00000000..139597f9
--- /dev/null
+++ b/libgadget/box_iterator.cpp
@@ -0,0 +1,2 @@
+
+
diff --git a/libgadget/box_iterator.hpp b/libgadget/box_iterator.hpp
new file mode 100644
index 00000000..2764bbf6
--- /dev/null
+++ b/libgadget/box_iterator.hpp
@@ -0,0 +1,238 @@
+#ifndef __CUFFTMP_BOX_ITERATOR_HPP__
+#define __CUFFTMP_BOX_ITERATOR_HPP__
+
+#include <iterator>
+#include <cstddef> 
+#include <cufftXt.h>
+#include <tuple>
+
+
+/**
+ * This iterator lets one iterate through the underlying data
+ * associated to a (lower, upper, strides) box, and exposes the mapping
+ * between global 3D coordinates (x, y, z) and local linear
+ * indices.
+ * 
+ * This iterator can be used in __host__ or __device__ code
+ */
+
+using int64 = long long int;
+
+struct Box3D {
+    int64 lower[3];
+    int64 upper[3];
+    int64 strides[3];
+};
+
+template<typename T>
+struct BoxIterator 
+{
+    using iterator_category = std::random_access_iterator_tag;
+    using difference_type   = std::ptrdiff_t;
+    using value_type        = T;
+    using pointer           = T*;
+    using reference         = T&;
+
+    __host__ __device__ __forceinline__
+    BoxIterator(int64 i, Box3D box, T* ptr) : i_(i), box_(box), ptr_(ptr), 
+                                                    lx_(box.upper[0] - box.lower[0]),
+                                                    ly_(box.upper[1] - box.lower[1]),
+                                                    lz_(box.upper[2] - box.lower[2]) {
+        linear_to_box3d(i_, &x_, &y_, &z_);
+    };
+
+    __host__ __device__ __forceinline__
+    BoxIterator& operator++() { increment(1); return *this; } 
+
+    __host__ __device__
+    BoxIterator operator++(int) { 
+        BoxIterator tmp = *this; 
+        ++(*this); 
+        return tmp; 
+    } 
+
+    __host__ __device__ __forceinline__
+    BoxIterator& operator--() { increment(-1); return *this; } 
+
+    __host__ __device__
+    BoxIterator operator--(int) { 
+        BoxIterator tmp = *this; 
+        --(*this); 
+        return tmp; 
+    }  
+    
+    __host__ __device__ __forceinline__
+    BoxIterator& operator+=(difference_type rhs) { increment(rhs); return *this; }
+
+    __host__ __device__ __forceinline__
+    BoxIterator& operator-=(difference_type rhs) { increment(-rhs); return *this; }
+
+
+    __host__ __device__ __forceinline__
+    reference operator*() const { return ptr_[i()]; }
+
+    __host__ __device__ __forceinline__
+    pointer operator->() { return ptr_ + i(); }
+
+    __host__ __device__ __forceinline__  // it[n] yields the element n positions ahead, i.e. *(it + n)
+    reference operator[](difference_type rhs) const { return *BoxIterator(i_ + rhs, box_, ptr_); }
+
+    __host__ __device__ __forceinline__ 
+    friend difference_type operator-(const BoxIterator& a, const BoxIterator& b) {return a.i_ - b.i_; }
+
+    __host__ __device__ __forceinline__  // returns a new iterator n positions back; the const operand is left untouched
+    friend BoxIterator operator-(const BoxIterator& a, difference_type n) { return BoxIterator(a.i_ - n, a.box_, a.ptr_); }
+
+    __host__ __device__ __forceinline__  // returns a new iterator n positions ahead; the const operand is left untouched
+    friend BoxIterator operator+(const BoxIterator& a, difference_type n) { return BoxIterator(a.i_ + n, a.box_, a.ptr_); }
+
+    __host__ __device__ __forceinline__ 
+    friend BoxIterator operator+(difference_type n, const BoxIterator& a) { return a+n; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator==(const BoxIterator& a, const BoxIterator& b) { return a.i_ == b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator!=(const BoxIterator& a, const BoxIterator& b) { return a.i_ != b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator>(const BoxIterator& a, const BoxIterator& b) { return a.i_ > b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator<(const BoxIterator& a, const BoxIterator& b) { return a.i_ < b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator>=(const BoxIterator& a, const BoxIterator& b) { return a.i_ >= b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator<=(const BoxIterator& a, const BoxIterator& b) { return a.i_ <= b.i_; }
+
+    /**
+     * Return the global X coordinate of the iterator
+     */
+    __host__ __device__ __forceinline__
+    int64 x() const { return x_; }
+
+    /**
+     * Return the global Y coordinate of the iterator
+     */
+    __host__ __device__ __forceinline__
+    int64 y() const { return y_; }
+
+    /**
+     * Return the global Z coordinate of the iterator
+     */
+    __host__ __device__ __forceinline__
+    int64 z() const { return z_; }
+
+    /**
+     * Return the linear position of the iterator
+     * in the local data buffer
+     */
+    __host__ __device__ __forceinline__
+    int64 i() const {
+        return (x_ - box_.lower[0]) * box_.strides[0] + (y_ - box_.lower[1]) * box_.strides[1] + (z_ - box_.lower[2]) * box_.strides[2]; 
+    }
+
+private:
+
+    // Current 3D global index in the box
+    int64 x_, y_, z_;
+    // Current linear 3D index (not the location in memory)
+    int64 i_;
+    // Global box lower and upper corner and local strides
+    const Box3D box_;
+    // Underlying data pointer
+    T* ptr_;
+    // Length of the X, Y and Z dimensions
+    const int64 lx_, ly_, lz_;
+
+    // Linear to 3D coordinates
+    __host__ __device__ __forceinline__
+    void linear_to_box3d(int64 i, int64* x, int64* y, int64* z) {
+        if(lx_ * ly_ * lz_ > 0) {
+            *x  =   i  / (ly_ * lz_);
+            i  -= (*x) * (ly_ * lz_);
+            *y  =   i  / (lz_);
+            i  -= (*y) * (lz_);
+            *z  =   i;
+        } else {
+            *x = 0;
+            *y = 0;
+            *z = 0;
+        }
+        *x += box_.lower[0];
+        *y += box_.lower[1];
+        *z += box_.lower[2];
+    }
+
+    // Increment/decrement by n
+    __host__ __device__ __forceinline__
+    void increment(difference_type n) {
+        i_ += n;
+        linear_to_box3d(i_, &x_, &y_, &z_);
+    }
+
+};
+
+inline int64 slabs_displacement(int64 length, int rank, int size) {
+    const int64 base = length / size, extra = length % size;  // the first `extra` ranks hold base + 1 entries each
+    return rank * base + (rank < extra ? rank : extra);       // entries held by all lower-numbered ranks
+}
+
+inline Box3D buildBox3D(cufftXtSubFormat format, cufftType type, int rank, int size, int64 nx, int64 ny, int64 nz) {
+    // Complex-to-complex transforms keep the full Z extent; the other types pad or shorten it.
+    const bool is_c2c = (type == CUFFT_C2C || type == CUFFT_Z2Z);
+    if(format != CUFFT_XT_FORMAT_INPLACE) {
+        // Slabs are cut along Y; Z holds nz/2 + 1 entries unless the transform is complex-to-complex.
+        const int64 y_lo = slabs_displacement(ny, rank,   size);
+        const int64 y_hi = slabs_displacement(ny, rank+1, size);
+        const int64 z_len = is_c2c ? nz : (nz/2 + 1);
+        return {
+            {0, y_lo, 0}, {nx, y_hi, z_len}, {(y_hi - y_lo) * z_len, z_len, 1}
+        };
+    }
+    // CUFFT_XT_FORMAT_INPLACE: slabs are cut along X; the Z stride is padded to 2*(nz/2 + 1)
+    // unless the transform is complex-to-complex.
+    const int64 x_lo = slabs_displacement(nx, rank,   size);
+    const int64 x_hi = slabs_displacement(nx, rank+1, size);
+    const int64 z_stride = is_c2c ? nz : 2*(nz/2 + 1);
+    return {
+        {x_lo, 0, 0}, {x_hi, ny, nz}, {ny * z_stride, z_stride, 1}
+    };
+}
+
+
+template<typename T> __host__ __device__ __forceinline__
+BoxIterator<T> BoxIteratorBegin(Box3D box, T* ptr) {
+    return BoxIterator<T>(0, box, ptr);  // linear index 0 is the first element of the box
+}
+
+template<typename T> __host__ __device__ __forceinline__
+BoxIterator<T> BoxIteratorEnd(Box3D box, T* ptr) {  // past-the-end: linear index equals the number of elements in the box
+    return BoxIterator<T>( (box.upper[0] - box.lower[0]) * (box.upper[1] - box.lower[1]) * (box.upper[2] - box.lower[2]), box, ptr);
+}
+
+template<typename T>
+std::pair<BoxIterator<T>,BoxIterator<T>> BoxIterators(Box3D box, T* ptr) {  // (begin, end) pair over one box
+    return std::pair<BoxIterator<T>,BoxIterator<T>>(BoxIteratorBegin<T>(box, ptr), BoxIteratorEnd<T>(box, ptr));
+}
+
+template<typename T>
+BoxIterator<T> BoxIteratorBegin(cufftXtSubFormat format, cufftType type, int rank, int size, int64 nx, int64 ny, int64 nz, T* ptr) {
+    // Build this rank's box from the FFT layout, then return the iterator at its linear index 0.
+    return BoxIteratorBegin<T>(buildBox3D(format, type, rank, size, nx, ny, nz), ptr);
+}
+
+template<typename T>
+BoxIterator<T> BoxIteratorEnd(cufftXtSubFormat format, cufftType type, int rank, int size, int64 nx, int64 ny, int64 nz, T* ptr) {
+    // Build this rank's box from the FFT layout, then return the past-the-end iterator for it.
+    return BoxIteratorEnd<T>(buildBox3D(format, type, rank, size, nx, ny, nz), ptr);
+}
+
+template<typename T>
+std::pair<BoxIterator<T>,BoxIterator<T>> BoxIterators(cufftXtSubFormat format, cufftType type, int rank, int size, int64 nx, int64 ny, int64 nz, T* ptr) {
+    return BoxIterators<T>(buildBox3D(format, type, rank, size, nx, ny, nz), ptr);  // same box for both ends of the range
+}
+
+#endif // __CUFFTMP_BOX_ITERATOR_HPP__
\ No newline at end of file
diff --git a/libgadget/cooling_qso_lightup.c b/libgadget/cooling_qso_lightup.c
index a2e7a7ca..6b674a94 100644
--- a/libgadget/cooling_qso_lightup.c
+++ b/libgadget/cooling_qso_lightup.c
@@ -30,7 +30,7 @@
 #include <mpi.h>
 #include <string.h>
 #include <omp.h>
-#include <gsl/gsl_interp.h>
+#include <boost/math/interpolators/barycentric_rational.hpp>
 #include "physconst.h"
 #include "slotsmanager.h"
 #include "partmanager.h"
@@ -47,6 +47,9 @@
 #define E0_HeII 54.4 /* HeII ionization potential in eV*/
 #define HEMASS 4.002602 /* Helium mass in amu*/
 
+static boost::math::interpolators::barycentric_rational<double>* HeIII_intp; /* XHeIII as a function of He_zz; file-local like the pointer it replaces */
+static boost::math::interpolators::barycentric_rational<double>* LMFP_intp;  /* LMFP as a function of He_zz; file-local like the pointer it replaces */
+
 typedef struct
 {
     TreeWalkQueryBase base;
@@ -83,8 +86,6 @@ static int Nreionhist;
 static double * He_zz;
 static double * XHeIII;
 static double * LMFP;
-static gsl_interp * HeIII_intp;
-static gsl_interp * LMFP_intp;
 
 /*This is a helper for the tests*/
 void set_qso_lightup_par(struct qso_lightup_params qso)
@@ -226,11 +227,11 @@ load_heii_reion_hist(const char * reion_hist_file)
     /*Broadcast data to other processors*/
     MPI_Bcast(He_zz, 3 * Nreionhist, MPI_DOUBLE, 0, MPI_COMM_WORLD);
     MPI_Bcast(&qso_inst_heating, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
-    /* Initialize the interpolators*/
-    HeIII_intp = gsl_interp_alloc(gsl_interp_linear,Nreionhist);
-    LMFP_intp = gsl_interp_alloc(gsl_interp_linear,Nreionhist);
-    gsl_interp_init(HeIII_intp, He_zz, XHeIII, Nreionhist);
-    gsl_interp_init(LMFP_intp, He_zz, LMFP, Nreionhist);
+
+    // Initialize HeIII interpolation using barycentric rational interpolation
+    HeIII_intp = new boost::math::interpolators::barycentric_rational<double>(He_zz, XHeIII, Nreionhist);
+    // Initialize LMFP interpolation
+    LMFP_intp = new boost::math::interpolators::barycentric_rational<double>(He_zz, LMFP, Nreionhist);
 
     QSOLightupParams.heIIIreion_start = 1/He_zz[0]-1;
 
@@ -271,7 +272,7 @@ get_long_mean_free_path_heating(double redshift)
     if(atime > He_zz[Nreionhist-1])
         return 0;
 
-    double long_mfp_heating = gsl_interp_eval(LMFP_intp, He_zz, LMFP, atime, NULL);
+    double long_mfp_heating = (*LMFP_intp)(atime);
 
     last_zz = redshift;
     last_long_mfp_heating = long_mfp_heating;
@@ -529,7 +530,8 @@ turn_on_quasars(double atime, FOFGroups * fof, ForceTree * gasTree, Cosmology *
     int * qso_cand = NULL;
     int64_t n_gas_tot=0, tot_n_ionized=0, ncand_tot=0;
     MPI_Allreduce(&SlotsManager->info[0].size, &n_gas_tot, 1, MPI_INT64, MPI_SUM, MPI_COMM_WORLD);
-    double desired_ion_frac = gsl_interp_eval(HeIII_intp, He_zz, XHeIII, atime, NULL);
+    // Evaluate the interpolators
+    double desired_ion_frac = (*HeIII_intp)(atime);
     struct QSOPriv priv;
     priv.fof = fof;
     priv.uu_in_cgs = uu_in_cgs;
@@ -663,7 +665,7 @@ do_heiii_reionization(double atime, FOFGroups * fof, ForceTree * gasTree, Cosmol
 int
 need_change_helium_ionization_fraction(double atime)
 {
-    double desired_ion_frac = gsl_interp_eval(HeIII_intp, He_zz, XHeIII, atime, NULL);
+    double desired_ion_frac = (*HeIII_intp)(atime);
     double curionfrac = gas_ionization_fraction();
     if(curionfrac < desired_ion_frac)
         return 1;
diff --git a/libgadget/cooling_rates.c b/libgadget/cooling_rates.c
index 9e41bbe0..b9399a04 100644
--- a/libgadget/cooling_rates.c
+++ b/libgadget/cooling_rates.c
@@ -59,7 +59,15 @@
 #include <mpi.h>
 #include <stdio.h>
 #include <string.h>
-#include <gsl/gsl_interp.h>
+// Boost headers may use P as an ordinary identifier, so hide any project macro of
+// that name while they are parsed and put it back unchanged afterwards.
+#pragma push_macro("P")
+#undef P
+
+#include <boost/math/interpolators/barycentric_rational.hpp>
+
+// Restores P exactly as it was (defined or undefined) before the Boost include.
+#pragma pop_macro("P")
 #include "physconst.h"
 #include "utils/endrun.h"
 #include "utils/paramset.h"
@@ -67,7 +75,7 @@
 
 static struct cooling_params CoolingParams;
 
-static gsl_interp * GrayOpac;
+static boost::math::interpolators::barycentric_rational<double>* GrayOpac; /* file-local, like the static gsl_interp pointer it replaces */
 
 /*Tables for the self-shielding correction. Note these are not well-measured for z > 5!*/
 #define NGRAY 6
@@ -75,11 +83,11 @@ static gsl_interp * GrayOpac;
 static const double GrayOpac_ydata[NGRAY] = { 2.59e-18, 2.37e-18, 2.27e-18, 2.15e-18, 2.02e-18, 1.94e-18};
 static const double GrayOpac_zz[NGRAY] = {0, 1, 2, 3, 4, 5};
 
-/*Convenience structure bundling together the gsl interpolation routines.*/
+/*Convenience structure bundling together the interpolation routines.*/
 struct itp_type
 {
     double * ydata;
-    gsl_interp * intp;
+    boost::math::interpolators::barycentric_rational<double>* intp;
 };
 /*Interpolation objects for the redshift evolution of the UVB.*/
 /*Number of entries in the table*/
@@ -119,8 +127,7 @@ static double * cool_freefree1;
 static void
 init_itp_type(double * xarr, struct itp_type * Gamma, int Nelem)
 {
-    Gamma->intp = gsl_interp_alloc(gsl_interp_linear,Nelem);
-    gsl_interp_init(Gamma->intp, xarr, Gamma->ydata, Nelem);
+    Gamma->intp = new boost::math::interpolators::barycentric_rational<double>(xarr, Gamma->ydata, Nelem);
 }
 
 /* Helper function to correctly load a value in the TREECOOL file*/
@@ -325,7 +332,7 @@ get_photo_rate(double redshift, struct itp_type * Gamma_tab)
     else if (log1z < Gamma_log1z[0])
         photo_rate = Gamma_tab->ydata[0];
     else {
-        photo_rate = gsl_interp_eval(Gamma_tab->intp, Gamma_log1z, Gamma_tab->ydata, log1z, NULL);
+        photo_rate = (*Gamma_tab->intp)(log1z);
     }
     return pow(10, photo_rate) * CoolingParams.PhotoIonizeFactor;
 }
@@ -355,7 +362,7 @@ get_self_shield_dens(double redshift, const struct UVBG * uvbg)
     else if (redshift >= GrayOpac_zz[NGRAY-1])
         greyopac = GrayOpac_ydata[NGRAY-1];
     else {
-        greyopac = gsl_interp_eval(GrayOpac, GrayOpac_zz, GrayOpac_ydata,redshift, NULL);
+        greyopac = (*GrayOpac)(redshift);
     }
     return 6.73e-3 * pow(greyopac / 2.49e-18, -2./3)*pow(G12, 2./3)*pow(CoolingParams.fBar/0.17,-1./3);
 }
@@ -408,7 +415,7 @@ get_photorate_coeff(double alpha, struct itp_type * Gamma_tab)
     else if (alpha < Gamma_alpha[0])
         photo_rate = Gamma_tab->ydata[0];
     else {
-        photo_rate = gsl_interp_eval(Gamma_tab->intp, Gamma_alpha, Gamma_tab->ydata, alpha, NULL);
+        photo_rate = (*Gamma_tab->intp)(alpha);
     }
     //pow 10 here because the treecool load does log10
     return pow(10,photo_rate) * CoolingParams.PhotoIonizeFactor;
@@ -1107,10 +1114,8 @@ init_cooling_rates(const char * TreeCoolFile, const char * J21CoeffFile, const c
     CoolingParams.fBar = CP->OmegaBaryon / CP->OmegaCDM;
     CoolingParams.rho_crit_baryon = CP->OmegaBaryon * 3.0 * pow(CP->HubbleParam*HUBBLE,2.0) /(8.0*M_PI*GRAVITY);
 
-    /* Initialize the interpolation for the self-shielding module as a function of redshift.
-     * A crash has been observed in GSL with a cspline interpolator. */
-    GrayOpac = gsl_interp_alloc(gsl_interp_linear,NGRAY);
-    gsl_interp_init(GrayOpac,GrayOpac_zz,GrayOpac_ydata, NGRAY);
+    /* Initialize the interpolation for the self-shielding module as a function of redshift.*/
+    GrayOpac = new boost::math::interpolators::barycentric_rational<double>(GrayOpac_zz,GrayOpac_ydata, NGRAY);
 
     if(!TreeCoolFile || strnlen(TreeCoolFile,100) == 0) {
         CoolingParams.PhotoIonizationOn = 0;
diff --git a/libgadget/cosmology.c b/libgadget/cosmology.c
index b4abd76e..04c9223a 100644
--- a/libgadget/cosmology.c
+++ b/libgadget/cosmology.c
@@ -1,11 +1,9 @@
 #include <math.h>
-#include <gsl/gsl_integration.h>
-#include <gsl/gsl_errno.h>
-#include <gsl/gsl_odeiv2.h>
-
+#include <boost/numeric/odeint.hpp>
 #include "cosmology.h"
 #include "physconst.h"
 #include "utils.h"
+#include "timefac.h"
 
 /*Stefan-Boltzmann constant in cgs units*/
 #define  STEFAN_BOLTZMANN 5.670373e-5
@@ -90,11 +88,13 @@ double GrowthFactor(Cosmology * CP, double astart, double aend)
     return growth(CP, astart, NULL) / growth(CP, aend, NULL);
 }
 
-int growth_ode(double a, const double yy[], double dyda[], void * params)
+// Define the ODE system for the growth factor
+void growth_ode(const std::vector<double> &yy, std::vector<double> &dyda, double a, void * params)
 {
     Cosmology * CP = (Cosmology *) params;
-    const double hub = hubble_function(CP, a)/CP->Hubble;
-    dyda[0] = yy[1]/pow(a,3)/hub;
+    const double hub = hubble_function(CP, a) / CP->Hubble;
+
+    dyda[0] = yy[1] / pow(a, 3) / hub;
     /*Only use gravitating part*/
     /* Note: we do not include neutrinos
      * here as they are free-streaming at the initial time.
@@ -103,8 +103,7 @@ int growth_ode(double a, const double yy[], double dyda[], void * params)
      * and we need to numerically differentiate. In practice the box will either be larger
      * than the horizon, and so need radiation perturbations, or the neutrino
      * mass will be larger than current constraints allow, so we just warn for now.*/
-    dyda[1] = yy[0] * 1.5 * a * (CP->OmegaCDM + CP->OmegaBaryon)/(a*a*a) / hub;
-    return GSL_SUCCESS;
+    dyda[1] = yy[0] * 1.5 * a * (CP->OmegaCDM + CP->OmegaBaryon) / (a * a * a) / hub;
 }
 
 /** The growth function is given as a 2nd order DE in Peacock 1999, Cosmological Physics.
@@ -114,39 +113,59 @@ int growth_ode(double a, const double yy[], double dyda[], void * params)
  * Define F = a^3 H dD/da
  * and we have: dF/da = 1.5 a H D
  */
-double growth(Cosmology * CP, double a, double * dDda)
+
+double growth(Cosmology *CP, double a, double *dDda)
 {
-  gsl_odeiv2_system FF;
-  FF.function = &growth_ode;
-  FF.jacobian = NULL;
-  FF.params = CP;
-  FF.dimension = 2;
-  gsl_odeiv2_driver * drive = gsl_odeiv2_driver_alloc_standard_new(&FF,gsl_odeiv2_step_rkf45, 1e-5, 1e-8,1e-8,1,1);
-   /* We start early to avoid lambda.*/
-  double curtime = 1e-5;
-  /* Handle even earlier times*/
-  if(a < curtime)
-      curtime = a / 10;
-  /* Initial velocity chosen so that D = Omegar + 3/2 Omega_m a,
-   * the solution for a matter/radiation universe.*
-   * Note the normalisation of D is arbitrary
-   * and never seen outside this function.*/
-  double yinit[2] = {1.5 * (CP->OmegaCDM + CP->OmegaBaryon)/(curtime*curtime), pow(curtime,3)*hubble_function(CP, curtime)/CP->Hubble * 1.5 * (CP->OmegaCDM + CP->OmegaBaryon)/(curtime*curtime*curtime)};
-  if(CP->RadiationOn)
-      yinit[0] += CP->OmegaG/pow(curtime, 4)+get_omega_nu(&CP->ONu, curtime);
-
-  int stat = gsl_odeiv2_driver_apply(drive, &curtime,a, yinit);
-  if (stat != GSL_SUCCESS) {
-      endrun(1,"gsl_odeiv in growth: %d. Result at %g is %g %g\n",stat, curtime, yinit[0], yinit[1]);
-  }
-  gsl_odeiv2_driver_free(drive);
-  /*Store derivative of D if needed.*/
-  if(dDda) {
-      *dDda = yinit[1]/pow(a,3)/(hubble_function(CP, a)/CP->Hubble);
-  }
-  return yinit[0];
-}
+    using namespace boost::numeric::odeint;
+
+    // Define a default start time (scale factor)
+    double curtime = 1e-5;
+
+    // Adjust `curtime` if `a` is smaller than the default
+    if (a < curtime) {
+        curtime = a / 10.0;  // Ensure `curtime` is smaller than the target `a`
+    }
+
+    // Initial conditions for the growth factor
+    std::vector<double> yinit(2);
+
+    // Initial state at curtime: [D, F] with F = a^3 * (H / CP->Hubble) * dD/da
+    yinit[0] = 1.5 * (CP->OmegaCDM + CP->OmegaBaryon) / (curtime * curtime);  
+    yinit[1] = pow(curtime, 3) * hubble_function(CP, curtime) / CP->Hubble *
+               1.5 * (CP->OmegaCDM + CP->OmegaBaryon) / (curtime * curtime * curtime); 
+
+    // Include radiation if enabled
+    if (CP->RadiationOn) {
+        yinit[0] += CP->OmegaG / pow(curtime, 4) + get_omega_nu(&CP->ONu, curtime);
+    }
 
+    // Define the ODE system (as a lambda function)
+    auto growth_system = [&CP](const std::vector<double> &yy, std::vector<double> &dyda, double a) {
+        growth_ode(yy, dyda, a, CP);
+    };
+
+    // Use Boost odeint's Cash-Karp 5(4) Runge-Kutta stepper with adaptive step-size control
+    runge_kutta_cash_karp54<std::vector<double>> stepper;
+    double abs_error = 1e-8;
+    double rel_error = 1e-8;
+    double step_size = 1e-5;
+
+    try {
+        // Integrate the ODE from curtime to the requested scale factor `a`
+        integrate_adaptive(make_controlled(abs_error, rel_error, stepper),
+                           growth_system, yinit, curtime, a, step_size);
+    } catch (...) {
+        endrun(1, "Boost ODE solver failed during integration\n");
+    }
+
+    // If the derivative is needed, store it in dDda
+    if (dDda) {
+        *dDda = yinit[1] / pow(a, 3) / (hubble_function(CP, a) / CP->Hubble);
+    }
+
+    // Return the growth factor D(a)
+    return yinit[0];
+}
 /*
  * This is the Zeldovich approximation prefactor,
  * f1 = d ln D1 / dlna = a / D (dD/da)
@@ -236,19 +255,22 @@ double function_of_k_eval(FunctionOfK * fk, double k)
     }
 }
 
-double function_of_k_tophat_sigma(FunctionOfK * fk, double R)
+// Adapted function to use Tanh-Sinh adaptive integration
+double function_of_k_tophat_sigma(FunctionOfK *fk, double R)
 {
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (1000);
+    // Create the parameter structure
     struct sigma2_params params = {fk, R};
-    double result,abserr;
-    gsl_function F;
-    F.function = &sigma2_int;
-    F.params = &params;
-
-    /* note: 500/R is here chosen as integration boundary (infinity) */
-    gsl_integration_qags (&F, 0, 500. / R, 0, 1e-4,1000,w,&result, &abserr);
-    //   printf("gsl_integration_qng in TopHatSigma2. Result %g, error: %g, intervals: %lu\n",result, abserr,w->size);
-    gsl_integration_workspace_free (w);
+    double abserr;  // To hold the estimated error
+
+    // Define the integrand as a lambda function wrapping the original `sigma2_int`
+    auto integrand = [&params](double k) -> double {
+        return sigma2_int(k, (void*)&params);
+    };
+
+    // Perform the Tanh-Sinh adaptive integration
+    double result = tanh_sinh_integrate_adaptive(integrand, 0, 500.0 / R, &abserr, 1e-4);
+
+    // Return the square root of the result
     return sqrt(result);
 }
 
diff --git a/libgadget/domain.c b/libgadget/domain.c
index 571b3371..c8980e1b 100644
--- a/libgadget/domain.c
+++ b/libgadget/domain.c
@@ -323,8 +323,8 @@ int domain_maintain(DomainDecomp * ddecomp, struct DriftData * drift)
     walltime_measure("/Domain/drift");
 
     /* Try a domain exchange. Note ExchangeList is freed inside.*/
-    int errno = domain_exchange(domain_layoutfunc, ddecomp, ExchangeData, PartManager, SlotsManager, 10000, ddecomp->DomainComm);
-    return errno;
+    int exchange_status = domain_exchange(domain_layoutfunc, ddecomp, ExchangeData, PartManager, SlotsManager, 10000, ddecomp->DomainComm);
+    return exchange_status;
 }
 
 /* this function generates several domain decomposition policies for attempting
diff --git a/libgadget/gravity.h b/libgadget/gravity.h
index e5d2dcf1..adeb04e8 100644
--- a/libgadget/gravity.h
+++ b/libgadget/gravity.h
@@ -58,9 +58,9 @@ void grav_short_pair(const ActiveParticles * act, PetaPM * pm, ForceTree * tree,
 void grav_short_tree(const ActiveParticles * act, PetaPM * pm, ForceTree * tree, MyFloat (* AccelStore)[3], double rho0, inttime_t Ti_Current);
 
 /*Read the power spectrum, without changing the input value.*/
-void measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value);
+void measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value);
 
 /* Compute the power spectrum of the Fourier transformed grid in value.*/
-void powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], pfft_complex * const value, const double invwindow, double Nmesh);
+void powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], cufftComplex * const value, const double invwindow, double Nmesh);
 
 #endif
diff --git a/libgadget/gravpm.c b/libgadget/gravpm.c
index 3130d117..cdc2c69a 100644
--- a/libgadget/gravpm.c
+++ b/libgadget/gravpm.c
@@ -20,11 +20,11 @@ static int pm_mark_region_for_node(int startno, int rid, int * RegionInd, const
 static void convert_node_to_region(PetaPM * pm, PetaPMRegion * r, struct NODE * Nodes);
 
 static int hybrid_nu_gravpm_is_active(int i);
-static void potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void compute_neutrino_power(PetaPM * pm);
-static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_potential(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_force_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_force_y(PetaPM * pm, int i, double * mesh, double weight);
@@ -62,8 +62,8 @@ gravpm_force(PetaPM * pm, DomainDecomp * ddecomp, Cosmology * CP, double Time, d
     PetaPMParticleStruct pstruct = {
         P,
         sizeof(P[0]),
-        (char*) &P[0].Pos[0]  - (char*) P,
-        (char*) &P[0].Mass  - (char*) P,
+        static_cast<size_t>((char*) &P[0].Pos[0]  - (char*) P),
+        static_cast<size_t>((char*) &P[0].Mass  - (char*) P),
         /* Regions allocated inside _prepare*/
         NULL,
         /* By default all particles are active. For hybrid neutrinos set below.*/
@@ -318,9 +318,7 @@ static void compute_neutrino_power(PetaPM * pm) {
     delta_nu_from_power(ps, GravPM.CP, GravPM.Time, GravPM.TimeIC);
 
     /*Initialize the interpolation for the neutrinos*/
-    ps->nu_spline = gsl_interp_alloc(gsl_interp_linear,ps->nonzero);
-    ps->nu_acc = gsl_interp_accel_alloc();
-    gsl_interp_init(ps->nu_spline,ps->logknu,ps->delta_nu_ratio,ps->nonzero);
+    ps->nu_spline = new boost::math::interpolators::barycentric_rational<double>(ps->logknu, ps->delta_nu_ratio, ps->nonzero);
     /*Zero power spectrum, which is stored with the neutrinos*/
     powerspectrum_zero(ps);
 }
@@ -328,11 +326,11 @@ static void compute_neutrino_power(PetaPM * pm) {
 /* Compute the power spectrum of the fourier transformed grid in value.
  * Store it in the PowerSpectrum structure */
 void
-powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], pfft_complex * const value, const double invwindow, double Nmesh)
+powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], cufftComplex * const value, const double invwindow, double Nmesh)
 {
     if(k2 == 0) {
         /* Save zero mode corresponding to the mean as the normalisation factor.*/
-        PowerSpectrum->Norm = (value[0][0] * value[0][0] + value[0][1] * value[0][1]);
+        PowerSpectrum->Norm = (value[0].x * value[0].x + value[0].y * value[0].y);
         return;
     }
     /* Measure power spectrum: we don't want the zero mode.
@@ -344,7 +342,7 @@ powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3
         int kint=floor(binsperunit*log(k2)/2.);
         int w;
         const double keff = sqrt(kpos[0]*kpos[0]+kpos[1]*kpos[1]+kpos[2]*kpos[2]);
-        const double m = (value[0][0] * value[0][0] + value[0][1] * value[0][1]);
+        const double m = (value[0].x * value[0].x + value[0].y * value[0].y);
         /*Make sure we do not overflow (although this should never happen)*/
         if(kint >= PowerSpectrum->size)
             return;
@@ -362,7 +360,7 @@ powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3
 
 /*Just read the power spectrum, without changing the input value.*/
 void
-measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value) {
+measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value) {
     double f = 1.0;
     /* the CIC deconvolution kernel is
      *
@@ -380,8 +378,10 @@ measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value
     powerspectrum_add_mode(pm->ps, k2, kpos, value, f, pm->Nmesh);
 }
 
+
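+/* Per-mode transfer function: rescales value[0] in place and zeroes the mode corresponding to the mean. */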
 static void
-potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
+potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value)
 {
     const double asmth2 = pow((2 * M_PI) * pm->Asmth / pm->Nmesh,2);
     double f = 1.0;
@@ -430,10 +430,9 @@ potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
          *            = (M_cdm + M_nu) * delta_t
          * This is correct for the forces, and gives the right power spectrum,
          * once we multiply PowerSpectrum.Norm by (Omega0 / (Omega0 - OmegaNu))**2 */
-        const double nufac = 1 + ps->nu_prefac * gsl_interp_eval(ps->nu_spline,ps->logknu,
-                                                                       ps->delta_nu_ratio,logk2,ps->nu_acc);
-        value[0][0] *= nufac;
-        value[0][1] *= nufac;
+        const double nufac = 1 + ps->nu_prefac * (*ps->nu_spline)(logk2);
+        value[0].x *= nufac;
+        value[0].y *= nufac;
     }
 
     /*Compute the power spectrum*/
@@ -444,13 +443,13 @@ potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
             ps->Norm *= MtotbyMcdm*MtotbyMcdm;
         }
         /* Remove zero mode corresponding to the mean.*/
-        value[0][0] = 0.0;
-        value[0][1] = 0.0;
+        value[0].x = 0.0;
+        value[0].y = 0.0;
         return;
     }
 
-    value[0][0] *= fac;
-    value[0][1] *= fac;
+    value[0].x *= fac;
+    value[0].y *= fac;
 }
 
 /* the transfer functions for force in fourier space applied to potential */
@@ -473,7 +472,7 @@ static int hybrid_nu_gravpm_is_active(int i) {
         return 1;
 }
 
-static void force_transfer(PetaPM * pm, int k, pfft_complex * value) {
+static void force_transfer(PetaPM * pm, int k, cufftComplex * value) {
     double tmp0;
     double tmp1;
     /*
@@ -482,18 +481,18 @@ static void force_transfer(PetaPM * pm, int k, pfft_complex * value) {
      * filter is   i K(w)
      * */
     double fac = -1 * diff_kernel (k * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
-    tmp0 = - value[0][1] * fac;
-    tmp1 = value[0][0] * fac;
-    value[0][0] = tmp0;
-    value[0][1] = tmp1;
+    tmp0 = - value[0].y * fac;
+    tmp1 = value[0].x * fac;
+    value[0].x = tmp0;
+    value[0].y = tmp1;
 }
-static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[0], value);
 }
-static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[1], value);
 }
-static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[2], value);
 }
 static void readout_potential(PetaPM * pm, int i, double * mesh, double weight) {
diff --git a/libgadget/lenstools.c b/libgadget/lenstools.c
index 5cc2b735..a58cac7f 100644
--- a/libgadget/lenstools.c
+++ b/libgadget/lenstools.c
@@ -298,6 +298,7 @@ void savePotentialPlane(double *data, int rows, int cols, const char * const fil
     double Lbox_Mpc = Lbox * UnitLength_in_cm / CM_PER_MPC;  // Box size in Mpc/h
     double comoving_distance_Mpc = comoving_distance * UnitLength_in_cm / CM_PER_MPC;
     double Ode0 = CP->OmegaLambda > 0 ? CP->OmegaLambda : CP->Omega_fld;
+    char unit[] = "rad2    ";  // Mutable string for the UNIT keyword
     // Insert a blank line as a separator
     fits_write_record(fptr, "        ", &status);
     // Add headers to the FITS file
@@ -313,7 +314,7 @@ void savePotentialPlane(double *data, int rows, int cols, const char * const fil
     fits_update_key(fptr, TDOUBLE, "CHI", (&comoving_distance_Mpc), "Comoving distance in Mpc/h", &status);
     fits_update_key(fptr, TDOUBLE, "SIDE", &(Lbox_Mpc), "Side length in Mpc/h", &status);
     fits_update_key(fptr, TLONGLONG, "NPART", &num_particles, "Number of particles on the plane", &status);
-    fits_update_key(fptr, TSTRING, "UNIT", "rad2    ", "Pixel value unit", &status);
+    fits_update_key(fptr, TSTRING, "UNIT", unit, "Pixel value unit", &status);
 
     // Write the 2D array of doubles to the image
     long fpixel[2] = {1, 1};  // first pixel to write (1-based indexing)
diff --git a/libgadget/lightcone.c b/libgadget/lightcone.c
deleted file mode 100644
index c5bc6322..00000000
--- a/libgadget/lightcone.c
+++ /dev/null
@@ -1,268 +0,0 @@
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <gsl/gsl_integration.h>
-/*For mkdir*/
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#include "utils.h"
-
-#include "timefac.h"
-#include "partmanager.h"
-#include "cosmology.h"
-#include "physconst.h"
-
-#define NENTRY 4096
-static double tab_loga[NENTRY];
-static double dloga;
-static double tab_Dc[NENTRY];
-/*
- * light cone on the fly:
- *
- * assuming the origin is at (0, 0, 0)
- *
- * */
-
-/*
- * replicas to consider, function of redshift;
- *
- * */
-static int Nreplica;
-static int BoxBoost = 20;
-static double Reps[8192][3];
-static double HorizonDistance2;
-static double HorizonDistance;
-static double HorizonDistancePrev;
-static double HorizonDistance2Prev;
-static double HorizonDistanceRef;
-static double zmin = 0.1;
-static double zmax = 80.0;
-static double ReferenceRedshift = 2.0; /* write all particles below this redshift; write a fraction above this. */
-static double SampleFraction; /* current fraction of particle gets written */
-static FILE * fd_lightcone;
-
-static double lightcone_get_horizon(double a);
-static void lightcone_cross(int p, double ddrift, const RandTable * const rnd);
-static void lightcone_set_time(double a, const double BoxSize);
-/*
-M, L = self.M, self.L
-  logx = numpy.linspace(log10amin, 0, Np)
-  def kernel(log10a):
-    a = numpy.exp(log10a)
-    return 1 / self.Ea(a) * a ** -1 # dz = - 1 / a dlog10a
-  y = numpy.array( [romberg(kernel, log10a, 0, vec_func=True, divmax=10) for log10a in logx])
-*/
-static double kernel(double loga, void * params) {
-    double a = exp(loga);
-      Cosmology * CP = (Cosmology *) params;
-    return 1 / hubble_function(CP, a) * CP->Hubble / a;
-}
-
-static void lightcone_init_entry(Cosmology * CP, int i, const double UnitLength_in_cm) {
-    tab_loga[i] = - dloga * (NENTRY - i - 1);
-
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (1000);
-
-    double result, error;
-
-    gsl_function F;
-    F.function = &kernel;
-    F.params = CP;
-    gsl_integration_qags (&F, tab_loga[i], 0, 0, 1e-7, 1000,
-            w, &result, &error);
-
-    /* result is in DH, hubble distance */
-    /* convert to cm / h */
-    result *= LIGHTCGS / HUBBLE;
-    /* convert to Kpc/h or internal units */
-    result /= UnitLength_in_cm;
-
-    gsl_integration_workspace_free (w);
-    tab_Dc[i] = result;
-//    double a = exp(tab_loga[i]);
-//    double z = 1 / a - 1;
-//    printf("a = %g z = %g Dc = %g\n", a, z, result);
-}
-
-void lightcone_init(Cosmology * CP, double timeBegin, const double UnitLength_in_cm, const char * OutputDir)
-{
-    int i;
-    dloga = (0.0 - log(timeBegin)) / (NENTRY - 1);
-    for(i = 0; i < NENTRY; i ++) {
-        lightcone_init_entry(CP, i, UnitLength_in_cm);
-    };
-    char buf[1024];
-    int chunk = 100;
-    int ThisTask;
-    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-
-    sprintf(buf, "%s/lightcone/", OutputDir);
-    mkdir(buf, 02755);
-    sprintf(buf, "%s/lightcone/%03d/", OutputDir, (int)(ThisTask / chunk));
-    mkdir(buf, 02755);
-    sprintf(buf, "%s/lightcone/%03d/lightcone-%05d.raw", OutputDir, (int)(ThisTask / chunk), ThisTask);
-
-    fd_lightcone = fopen(buf, "a+");
-    if(fd_lightcone == NULL) {
-        endrun(1, "failed to open %s\n", buf);
-    }
-    HorizonDistanceRef = lightcone_get_horizon(1 / (1 + ReferenceRedshift));
-    printf("lightcone reference redshift = %g distance = %g\n",
-            ReferenceRedshift, HorizonDistanceRef);
-}
-
-/* returns the horizon distance */
-static double lightcone_get_horizon(double a) {
-    double loga = log(a);
-    int bin = (log(a) -tab_loga[0]) / dloga;
-    if (bin < 0) {
-        return tab_Dc[0];
-    }
-    if (bin >= NENTRY - 1) {
-        return tab_Dc[NENTRY - 1];
-    }
-    double u1 = loga - tab_loga[bin];
-    double u2 = tab_loga[bin + 1] - loga;
-    u1 /= (tab_loga[bin + 1] - tab_loga[bin]);
-    u2 /= (tab_loga[bin + 1] - tab_loga[bin]);
-    return tab_Dc[bin] * u2 + tab_Dc[bin + 1] * u1;
-}
-
-/* fill in the table of box offsets for current time */
-static void update_replicas(double a, double BoxSize) {
-    int Nmax = BoxBoost * BoxBoost * BoxBoost;
-    int i;
-    int rx, ry, rz;
-    rx = ry = rz = 0;
-    Nreplica = 0;
-
-    for(i = 0; i < Nmax; i ++) {
-        double dx = BoxSize * rx;
-        double dy = BoxSize * ry;
-        double dz = BoxSize * rz;
-        double d1, d2;
-        d1 = dx * dx + dy * dy + dz * dz;
-        dx += BoxSize;
-        dy += BoxSize;
-        dz += BoxSize;
-        d2 = dx * dx + dy * dy + dz * dz;
-        if(d1 <= HorizonDistance2 && d2 >= HorizonDistance2) {
-            Reps[Nreplica][0] = rx * BoxSize;
-            Reps[Nreplica][1] = ry * BoxSize;
-            Reps[Nreplica][2] = rz * BoxSize;
-            Nreplica ++;
-            if(Nreplica > 1000) {
-                endrun(951234, "too many replica");
-            }
-        }
-        rz ++;
-        if(rz == BoxBoost) {
-            rz = 0;
-            ry ++;
-        }
-        if(ry == BoxBoost) {
-            ry = 0;
-            rx ++;
-        }
-    }
-}
-
-/* Compute a list of particles which crossed
- * the lightcone boundaries on this timestep and
- * write them to the lightcone file*/
-void lightcone_compute(double a, double BoxSize, Cosmology * CP, inttime_t ti_curr, inttime_t ti_next, const RandTable * const rnd)
-{
-    int i;
-    lightcone_set_time(a, BoxSize);
-    const double ddrift = get_exact_drift_factor(CP, ti_curr, ti_next);
-    #pragma omp parallel for
-    for(i = 0; i < PartManager->NumPart; i++)
-    {
-        lightcone_cross(i, ddrift, rnd);
-    }
-}
-
-void lightcone_set_time(double a, const double BoxSize) {
-    double z = 1 / a - 1;
-    if(z > zmin && z < zmax) {
-        HorizonDistancePrev = HorizonDistance;
-        HorizonDistance2Prev = HorizonDistance2;
-        HorizonDistance = lightcone_get_horizon(a);
-        HorizonDistance2 = HorizonDistance * HorizonDistance;
-        update_replicas(a, BoxSize);
-        fflush(fd_lightcone);
-        if (z < ReferenceRedshift) {
-            SampleFraction = 1.0;
-        } else {
-            /* write a smaller fraction of the points at high redshift
-             */
-            /* This is the angular resolution rule */
-            SampleFraction = HorizonDistanceRef / HorizonDistance;
-            SampleFraction *= SampleFraction;
-            SampleFraction *= SampleFraction;
-            /* This is the luminosity resolution rule */
-#if 0
-            SampleFraction = HorizonDistanceRef / HorizonDistance;
-            SampleFraction *= (1 + ReferenceRedshift) / (1 + z);
-            SampleFraction *= SampleFraction;
-
-#endif
-        }
-        message(0,"RefRedeshit=%g, SampleFraction=%g HorizonDistance=%g\n", ReferenceRedshift, SampleFraction, HorizonDistance);
-    } else {
-        SampleFraction = 0;
-    }
-}
-
-/* check crossing of the horizon, write the particle */
-static void lightcone_cross(int p, double ddrift, const RandTable * const rnd) {
-    if(SampleFraction <= 0.0) return;
-    int i;
-    int k;
-    /* DM only */
-    if(P[p].Type != 1) return;
-
-    for(i = 0; i < Nreplica; i++) {
-        double r = get_random_number(P[p].ID + i, rnd);
-        if(r > SampleFraction) continue;
-
-        double pnew[3];
-        double pold[3];
-        double p3[4];
-        double dnew = 0, dold = 0;
-        for(k = 0; k < 3; k ++) {
-            pold[k] = P[p].Pos[k] + Reps[i][k] - PartManager->CurrentParticleOffset[k];
-            pnew[k] = P[p].Pos[k] + P[i].Vel[k] * ddrift - PartManager->CurrentParticleOffset[k];
-            dnew += pnew[k] * pnew[k];
-            dold += pold[k] * pold[k];
-        }
-        if(
-            (dold <= HorizonDistance2Prev && dnew >= HorizonDistance2)
-         ) {
-            double u1, u2;
-            if(dold != dnew) {
-                double cnew, cold;
-                dnew = sqrt(dnew);
-                dold = sqrt(dold);
-                cnew = dnew - HorizonDistance;
-                cold = dold - HorizonDistancePrev;
-                u1 = -cold / (cnew - cold);
-                u2 = cnew / (cnew - cold);
-            } else {
-                /* really should write all particles along the line:
-                 * this partilce is moving along the horizon! */
-                u1 = u2 = 0.5;
-            }
-
-            /* write particle position */
-            for(k = 0; k < 3; k ++) {
-                p3[k] = pold[k] * u2 + pnew[k] * u1;
-            }
-            p3[3] = SampleFraction;
-            fwrite(p3, sizeof(double), 4, fd_lightcone);
-        }
-    }
-}
diff --git a/libgadget/lightcone.h b/libgadget/lightcone.h
deleted file mode 100644
index 5f254100..00000000
--- a/libgadget/lightcone.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef LIGHTCONE_H
-#define LIGHTCONE_H
-
-/* Initialise the lightcone code module. */
-void lightcone_init(Cosmology * CP, double timeBegin, const double UnitLength_in_cm, const char * OutputDir);
-void lightcone_compute(double a, double BoxSize, Cosmology * CP, inttime_t ti_curr, inttime_t ti_next, const RandTable * const rnd);
-#endif
diff --git a/libgadget/metal_return.c b/libgadget/metal_return.c
deleted file mode 100644
index 39af0088..00000000
--- a/libgadget/metal_return.c
+++ /dev/null
@@ -1,1005 +0,0 @@
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <gsl/gsl_roots.h>
-#include <gsl/gsl_errno.h>
-#include <omp.h>
-
-#include "physconst.h"
-#include "walltime.h"
-#include "slotsmanager.h"
-#include "treewalk.h"
-#include "metal_return.h"
-#include "densitykernel.h"
-#include "density.h"
-#include "cosmology.h"
-#include "winds.h"
-#include "utils/spinlocks.h"
-#include "metal_tables.h"
-
-/*! \file metal_return.c
- *  \brief Compute the mass return rate of metals from stellar evolution.
- *
- *  This file returns metals from stars with some delay.
- *  Delayed sources followed are AGB stars, SNII and Sn1a.
- *  9 Species specific yields are stored in the stars and the gas particles.
- *  Gas enrichment is not run every timestep, but only for stars that have
- *  significant enrichment, or are young.
- *  The model closely follows Illustris-TNG, https://arxiv.org/abs/1703.02970
- *  However the tables used are slightly different: we consider SNII between 8 and 40 Msun
- *  following Kobayashi 2006, where they use a hybrid of Kobayashi and Portinari.
- *  AGB yields are from Karakas 2010, like TNG, but stars with mass > 6.5 are
- *  from Doherty 2014, not Fishlock 2014. More details of the model can be found in
- *  the Illustris model Vogelsberger 2013: https://arxiv.org/abs/1305.2913
- *  As the Kobayashi table only goes to 13 Msun, stars with masses 8-13 Msun
- *  are assumed to yield like a 13 Msun star, but scaled by a factor of (M/13).
- */
-
-#if NMETALS != NSPECIES
-    #pragma error " Inconsistency in metal number between slots and metals"
-#endif
-
-static struct metal_return_params
-{
-    double Sn1aN0;
-    int SPHWeighting;
-    double MaxNgbDeviation;
-} MetalParams;
-
-/* For tests*/
-void set_metal_params(double Sn1aN0)
-{
-    MetalParams.Sn1aN0 = Sn1aN0;
-}
-
-/*Set the parameters of the hydro module*/
-void
-set_metal_return_params(ParameterSet * ps)
-{
-    int ThisTask;
-    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-    if(ThisTask == 0) {
-        MetalParams.Sn1aN0 = param_get_double(ps, "MetalsSn1aN0");
-        MetalParams.SPHWeighting = param_get_int(ps, "MetalsSPHWeighting");
-        MetalParams.MaxNgbDeviation = param_get_double(ps, "MetalsMaxNgbDeviation");
-    }
-    MPI_Bcast(&MetalParams, sizeof(struct metal_return_params), MPI_BYTE, 0, MPI_COMM_WORLD);
-}
-
-/* Build the interpolators for each yield table. We use bilinear interpolation
- * so there is no extra memory allocation and we never free the tables*/
-void setup_metal_table_interp(struct interps * interp)
-{
-    interp->lifetime_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, LIFE_NMET, LIFE_NMASS);
-    gsl_interp2d_init(interp->lifetime_interp, lifetime_metallicity, lifetime_masses, lifetime, LIFE_NMET, LIFE_NMASS);
-    interp->agb_mass_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, AGB_NMET, AGB_NMASS);
-    gsl_interp2d_init(interp->agb_mass_interp, agb_metallicities, agb_masses, agb_total_mass, AGB_NMET, AGB_NMASS);
-    interp->agb_metallicity_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, AGB_NMET, AGB_NMASS);
-    gsl_interp2d_init(interp->agb_metallicity_interp, agb_metallicities, agb_masses, agb_total_metals, AGB_NMET, AGB_NMASS);
-    int i;
-    for(i=0; i<NMETALS; i++) {
-        interp->agb_metals_interp[i] = gsl_interp2d_alloc(gsl_interp2d_bilinear, AGB_NMET, AGB_NMASS);
-        gsl_interp2d_init(interp->agb_metals_interp[i], agb_metallicities, agb_masses, agb_yield[i], AGB_NMET, AGB_NMASS);
-    }
-    interp->snii_mass_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, SNII_NMET, SNII_NMASS);
-    gsl_interp2d_init(interp->snii_mass_interp, snii_metallicities, snii_masses, snii_total_mass, SNII_NMET, SNII_NMASS);
-    interp->snii_metallicity_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, SNII_NMET, SNII_NMASS);
-    gsl_interp2d_init(interp->snii_metallicity_interp, snii_metallicities, snii_masses, snii_total_metals, SNII_NMET, SNII_NMASS);
-    for(i=0; i<NMETALS; i++) {
-        interp->snii_metals_interp[i] = gsl_interp2d_alloc(gsl_interp2d_bilinear, SNII_NMET, SNII_NMASS);
-        gsl_interp2d_init(interp->snii_metals_interp[i], snii_metallicities, snii_masses, snii_yield[i], SNII_NMET, SNII_NMASS);
-    }
-}
-
-#define METALS_GET_PRIV(tw) ((struct MetalReturnPriv*) ((tw)->priv))
-
-typedef struct {
-    TreeWalkQueryBase base;
-    MyFloat Metallicity;
-    MyFloat Mass;
-    MyFloat Hsml;
-    MyFloat StarVolumeSPH;
-    /* This is the metal/mass generated this timestep.*/
-    MyFloat MetalSpeciesGenerated[NMETALS];
-    MyFloat MassGenerated;
-    MyFloat MetalGenerated;
-} TreeWalkQueryMetals;
-
-typedef struct {
-    TreeWalkResultBase base;
-    /* This is the total mass returned to
-     * the surrounding gas particles, for mass conservation.*/
-    MyFloat MassReturn;
-} TreeWalkResultMetals;
-
-typedef struct {
-    TreeWalkNgbIterBase base;
-    DensityKernel kernel;
-} TreeWalkNgbIterMetals;
-
-static int
-metal_return_haswork(int n, TreeWalk * tw);
-
-static void
-metal_return_ngbiter(
-    TreeWalkQueryMetals * I,
-    TreeWalkResultMetals * O,
-    TreeWalkNgbIterMetals * iter,
-    LocalTreeWalk * lv
-   );
-
-static void
-metal_return_copy(int place, TreeWalkQueryMetals * input, TreeWalk * tw);
-
-static void
-metal_return_postprocess(int place, TreeWalk * tw);
-
-static void
-metal_return_reduce(const int place, TreeWalkResultMetals * remote, const enum TreeWalkReduceMode mode, TreeWalk * tw);
-
-/* The Chabrier IMF used for computing SnII and AGB yields.
- * See 1305.2913 eq 3*/
-static double chabrier_imf(double mass)
-{
-    if(mass <= 1) {
-        return 0.852464 / mass * exp(- pow(log(mass / 0.079)/ 0.69, 2)/2);
-    }
-    else {
-        return 0.237912 * pow(mass, -2.3);
-    }
-}
-
-double atime_integ(double atime, void * params)
-{
-    Cosmology * CP = (Cosmology *) params;
-    return 1/(hubble_function(CP, atime) * atime);
-}
-
-/* Compute the difference in internal time units between two scale factors.*/
-static double atime_to_myr(Cosmology *CP, double atime1, double atime2, gsl_integration_workspace * gsl_work)
-{
-    /* t = dt/da da = 1/(Ha) da*/
-    /* Approximate hubble function as constant here: we only care
-     * about metal return over a single timestep*/
-    gsl_function ff = {atime_integ, CP};
-    double tmyr, abserr;
-    gsl_integration_qag(&ff, atime1, atime2, 1e-4, 0, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &tmyr, &abserr);
-    return tmyr * CP->UnitTime_in_s / SEC_PER_MEGAYEAR;
-}
-
-/* Functions for the root finder*/
-struct massbin_find_params
-{
-    double dtfind;
-    double stellarmetal;
-    gsl_interp2d * lifetime_tables;
-    gsl_interp_accel * metalacc;
-    gsl_interp_accel * massacc;
-};
-
-/* This is the inverse of the lifetime function from the tables.
- * Need to find the stars with a given lifetime*/
-double
-massendlife (double mass, void *params)
-{
-  struct massbin_find_params *p = (struct massbin_find_params *) params;
-  double tlife = gsl_interp2d_eval(p->lifetime_tables, lifetime_metallicity, lifetime_masses, lifetime, p->stellarmetal, mass, p->metalacc, p->massacc);
-  double tlifemyr = tlife/1e6;
-  return tlifemyr - p->dtfind;
-}
-
-/* Solve the lifetime function to find the lowest and highest mass bin that dies this timestep*/
-double do_rootfinding(struct massbin_find_params *p, double mass_low, double mass_high)
-{
-    int iter = 0;
-    gsl_function F;
-
-    F.function = &massendlife;
-    F.params = p;
-
-    const gsl_root_fsolver_type *T = gsl_root_fsolver_falsepos;
-    gsl_root_fsolver * s = gsl_root_fsolver_alloc (T);
-    gsl_root_fsolver_set (s, &F, mass_low, mass_high);
-
-    /* Iterate until we have an idea of the mass bins dying this timestep.
-     * No check is done for success, but it should always be close enough.*/
-    for(iter = 0; iter < MAXITER; iter++)
-    {
-      gsl_root_fsolver_iterate (s);
-      mass_low = gsl_root_fsolver_x_lower (s);
-      mass_high = gsl_root_fsolver_x_upper (s);
-      int status = gsl_root_test_interval (mass_low, mass_high,
-                                       0, 0.005);
-      //message(4, "lo %g hi %g root %g val %g\n", mass_low, mass_high, gsl_root_fsolver_root(s), massendlife(gsl_root_fsolver_root(s), p));
-      if (status == GSL_SUCCESS)
-        break;
-  }
-  double root = gsl_root_fsolver_root(s);
-  gsl_root_fsolver_free (s);
-  return root;
-}
-
-/* Find the mass bins which die in this timestep using the lifetime table.
- * dtstart, dtend - time at start and end of timestep in Myr.
- * stellarmetal - metallicity of the star.
- * lifetime_tables - 2D interpolation table of the lifetime.
- * masshigh, masslow - pointers in which to store the high and low lifetime limits
- */
-void find_mass_bin_limits(double * masslow, double * masshigh, const double dtstart, const double dtend, double stellarmetal, gsl_interp2d * lifetime_tables)
-{
-    /* Clamp metallicities to the table values.*/
-    if(stellarmetal < lifetime_metallicity[0])
-        stellarmetal = lifetime_metallicity[0];
-    if(stellarmetal > lifetime_metallicity[LIFE_NMET-1])
-        stellarmetal = lifetime_metallicity[LIFE_NMET-1];
-
-    /* Find the root with GSL routines. */
-    struct massbin_find_params p = {0};
-    p.metalacc = gsl_interp_accel_alloc();
-    p.massacc = gsl_interp_accel_alloc();
-    p.lifetime_tables = lifetime_tables;
-    p.stellarmetal = stellarmetal;
-    /* First find stars that died before the end of this timebin*/
-    p.dtfind = dtend;
-    /* If no stars have died yet*/
-    if(massendlife (MAXMASS, &p) >= 0)
-    {
-        *masslow = MAXMASS;
-        *masshigh = MAXMASS;
-        return;
-    }
-    /* All stars die before the end of this timestep*/
-    if(massendlife (agb_masses[0], &p) <= 0)
-        *masslow = lifetime_masses[0];
-    else
-        *masslow = do_rootfinding(&p, agb_masses[0], MAXMASS);
-
-    /* Now find stars that died before the start of this timebin*/
-    p.dtfind = dtstart;
-    /* Now we know that life(masslow) = dtend, so life(masslow) > dtstart, so life(masslow) - dtstart > 0
-     * This is when no stars have died at the beginning of this timestep.*/
-    if(massendlife (MAXMASS, &p) >= 0)
-        *masshigh = MAXMASS;
-    /* This can sometimes happen due to root finding inaccuracy.
-     * Just do this star next timestep.*/
-    else if(massendlife (*masslow, &p) <= 0)
-        *masshigh = *masslow;
-    else
-        *masshigh = do_rootfinding(&p, *masslow, MAXMASS);
-    gsl_interp_accel_free(p.metalacc);
-    gsl_interp_accel_free(p.massacc);
-}
-
-/* Parameters of the interpolator
- * to hand to the imf integral.
- * Use different interpolation structures
- * for mass return, metal return and yield.*/
-struct imf_integ_params
-{
-    gsl_interp2d * interp;
-    const double * masses;
-    const double * metallicities;
-    const double * weights;
-    double metallicity;
-};
-
-/* Integrand for a function which computes a Chabrier IMF weighted quantity.*/
-double chabrier_imf_integ (double mass, void * params)
-{
-    struct imf_integ_params * para = (struct imf_integ_params * ) params;
-    /* This is needed so that the yield for SNII with masses between 8 and 13 Msun
-     * are the same as the smallest mass in the table, 13 Msun,
-     * but they still contribute their number density to the IMF.*/
-    double intpmass = mass;
-    if(mass < para->masses[0])
-        intpmass = para->masses[0];
-    if(mass > para->masses[para->interp->ysize-1])
-        intpmass = para->masses[para->interp->ysize-1];
-    double weight = gsl_interp2d_eval(para->interp, para->metallicities, para->masses, para->weights, para->metallicity, intpmass, NULL, NULL);
-    /* This rescales the return by the original mass of the star, if it was outside the table.
-     * It means that, for example, an 8 Msun star does not return more than 8 Msun. */
-    weight *= (mass/intpmass);
-    return weight * chabrier_imf(mass);
-}
-
-/* Helper for the IMF normalisation*/
-double chabrier_mass(double mass, void * params)
-{
-    return mass * chabrier_imf(mass);
-}
-
-/* Compute factor to normalise the total mass in the IMF to unity.*/
-double compute_imf_norm(gsl_integration_workspace * gsl_work)
-{
-    double norm, abserr;
-    gsl_function ff = {chabrier_mass, NULL};
-    gsl_integration_qag(&ff, MINMASS, MAXMASS, 1e-4, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &norm, &abserr);
-    return norm;
-}
-
-/* Compute number of Sn1a: has units of N0 = 1.3e-3, which is SN1A/(unit initial mass in M_sun).
- * Zero for age < 40 Myr. */
-double sn1a_number(double dtmyrstart, double dtmyrend, double hub)
-{
-    /* Number of Sn1a events follows a delay time distribution (1305.2913, eq. 10) */
-    const double sn1aindex = 1.12;
-    const double tau8msun = 40;
-    if(dtmyrend < tau8msun)
-        return 0;
-    /* Lower integration limit modelling formation time of WDs*/
-    if(dtmyrstart < tau8msun)
-        dtmyrstart  = tau8msun;
-    /* Total number of Sn1a events from this star: integral evaluated from t=tau8msun to t=hubble time.*/
-    const double totalSN1a = 1- pow(1/(hub*HUBBLE * SEC_PER_MEGAYEAR)/tau8msun, 1-sn1aindex);
-    /* This is the integral of the DTD, normalised to the N0 rate which is in SN/M_sun.*/
-    double Nsn1a = MetalParams.Sn1aN0 /totalSN1a * (pow(dtmyrstart / tau8msun, 1-sn1aindex) - pow(dtmyrend / tau8msun, 1-sn1aindex));
-    return Nsn1a;
-}
-
-/* Compute yield of AGB stars: this is normalised to the yield which has units of Msun / (unit Msun in the initial SSP and so is really dimensionless.)*/
-double compute_agb_yield(gsl_interp2d * agb_interp, const double * agb_weights, double stellarmetal, double masslow, double masshigh, gsl_integration_workspace * gsl_work )
-{
-    struct imf_integ_params para;
-    gsl_function ff = {chabrier_imf_integ, &para};
-    double agbyield = 0, abserr;
-    /* Only return AGB metals for the range of AGB stars*/
-    if (masshigh > SNAGBSWITCH)
-        masshigh = SNAGBSWITCH;
-    if (masslow < agb_masses[0])
-        masslow = agb_masses[0];
-    if (stellarmetal > agb_metallicities[AGB_NMET-1])
-        stellarmetal = agb_metallicities[AGB_NMET-1];
-    if (stellarmetal < agb_metallicities[0])
-        stellarmetal = agb_metallicities[0];
-    /* This happens if no bins in range had dying stars this timestep*/
-    if(masslow >= masshigh)
-        return 0;
-    para.interp = agb_interp;
-    para.masses = agb_masses;
-    para.metallicities = agb_metallicities;
-    para.metallicity = stellarmetal;
-    para.weights = agb_weights;
-    gsl_integration_qag(&ff, masslow, masshigh, 1e-7, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &agbyield, &abserr);
-    return agbyield;
-}
-
-double compute_snii_yield(gsl_interp2d * snii_interp, const double * snii_weights, double stellarmetal, double masslow, double masshigh, gsl_integration_workspace * gsl_work )
-{
-    struct imf_integ_params para;
-    gsl_function ff = {chabrier_imf_integ, &para};
-    double yield = 0, abserr;
-    /* Only return metals for the range of SNII stars.*/
-    if (masshigh > snii_masses[SNII_NMASS-1])
-        masshigh = snii_masses[SNII_NMASS-1];
-    if (masslow < SNAGBSWITCH)
-        masslow = SNAGBSWITCH;
-    if (stellarmetal > snii_metallicities[SNII_NMET-1])
-        stellarmetal = snii_metallicities[SNII_NMET-1];
-    if (stellarmetal < snii_metallicities[0])
-        stellarmetal = snii_metallicities[0];
-    para.interp = snii_interp;
-    para.masses = snii_masses;
-    para.metallicities = snii_metallicities;
-    para.metallicity = stellarmetal;
-    para.weights = snii_weights;
-    /* This happens if no bins in range had dying stars this timestep*/
-    if(masslow >= masshigh)
-        return 0;
-    gsl_integration_qag(&ff, masslow, masshigh, 1e-7, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &yield, &abserr);
-    return yield;
-}
-
-/* Compute the total mass yield for this star in this timestep*/
-static double mass_yield(double dtmyrstart, double dtmyrend, double stellarmetal, double hub, struct interps * interp, double imf_norm, gsl_integration_workspace * gsl_work, double masslow, double masshigh)
-{
-    /* Number of AGB stars/SnII by integrating the IMF*/
-    double agbyield = compute_agb_yield(interp->agb_mass_interp, agb_total_mass, stellarmetal, masslow, masshigh, gsl_work);
-    double sniiyield = compute_snii_yield(interp->snii_mass_interp, snii_total_mass, stellarmetal, masslow, masshigh, gsl_work);
-    /* Fraction of the IMF which goes off this timestep. Normalised by the total IMF so we get a fraction of the SSP.*/
-    double massyield = (agbyield + sniiyield)/imf_norm;
-    /* Mass yield from Sn1a*/
-    double Nsn1a = sn1a_number(dtmyrstart, dtmyrend, hub);
-    massyield += Nsn1a * sn1a_total_metals;
-    //message(3, "masslow %g masshigh %g stellarmetal %g dystart %g dtend %g agb %g snii %g sn1a %g imf_norm %g\n",
-    //        masslow, masshigh, stellarmetal, dtmyrstart, dtmyrend, agbyield, sniiyield, Nsn1a * sn1a_total_metals, imf_norm);
-    return massyield;
-}
-
-/* Compute the total metal yield for this star in this timestep*/
-static double metal_yield(double dtmyrstart, double dtmyrend, double stellarmetal, double hub, struct interps * interp, MyFloat * MetalYields, double imf_norm, gsl_integration_workspace * gsl_work, double masslow, double masshigh)
-{
-    double MetalGenerated = 0;
-    /* Number of AGB stars/SnII by integrating the IMF*/
-    MetalGenerated += compute_agb_yield(interp->agb_metallicity_interp, agb_total_metals, stellarmetal, masslow, masshigh, gsl_work);
-    MetalGenerated += compute_snii_yield(interp->snii_metallicity_interp, snii_total_metals, stellarmetal, masslow, masshigh, gsl_work);
-    MetalGenerated /= imf_norm;
-
-    int i;
-    for(i = 0; i < NMETALS; i++)
-    {
-        MetalYields[i] = 0;
-        MetalYields[i] += compute_agb_yield(interp->agb_metals_interp[i], agb_yield[i], stellarmetal, masslow, masshigh, gsl_work);
-        MetalYields[i] += compute_snii_yield(interp->snii_metals_interp[i], snii_yield[i], stellarmetal, masslow, masshigh, gsl_work);
-        MetalYields[i] /= imf_norm;
-    }
-    double Nsn1a = sn1a_number(dtmyrstart, dtmyrend, hub);
-    for(i = 0; i < NMETALS; i++)
-        MetalYields[i] += Nsn1a * sn1a_yields[i];
-    MetalGenerated += Nsn1a * sn1a_total_metals;
-
-    return MetalGenerated;
-}
-
-/* Initialise the private structure, finding stellar mass return and ages*/
-int64_t
-metal_return_init(const ActiveParticles * act, Cosmology * CP, struct MetalReturnPriv * priv, const double atime)
-{
-    int nthread = omp_get_max_threads();
-    priv->gsl_work = ta_malloc("gsl_work", gsl_integration_workspace *, nthread);
-    int i;
-    /* Allocate a workspace for each thread*/
-    for(i=0; i < nthread; i++)
-        priv->gsl_work[i] = gsl_integration_workspace_alloc(GSL_WORKSPACE);
-    priv->hub = CP->HubbleParam;
-
-    /* Initialize*/
-    setup_metal_table_interp(&priv->interp);
-    priv->StellarAges = (MyFloat *) mymalloc("StellarAges", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->MassReturn = (MyFloat *) mymalloc("MassReturn", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->LowDyingMass = (MyFloat *) mymalloc("LowDyingMass", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->HighDyingMass = (MyFloat *) mymalloc("HighDyingMass", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->StarVolumeSPH = (MyFloat *) mymalloc("StarVolumeSPH", SlotsManager->info[4].size * sizeof(MyFloat));
-
-    priv->imf_norm = compute_imf_norm(priv->gsl_work[0]);
-    /* Maximum possible mass return for below*/
-    double maxmassfrac = mass_yield(0, 1/(CP->HubbleParam*HUBBLE * SEC_PER_MEGAYEAR), snii_metallicities[SNII_NMET-1], CP->HubbleParam, &priv->interp, priv->imf_norm, priv->gsl_work[0],agb_masses[0], MAXMASS);
-
-    int64_t haswork = 0;
-    /* First find the mass return as a fraction of the total mass and the age of the star.
-     * This is done first so we can skip density computation for not active stars*/
-    #pragma omp parallel for reduction(+: haswork)
-    for(i=0; i < act->NumActiveParticle;i++)
-    {
-        int p_i = act->ActiveParticle ? act->ActiveParticle[i] : i;
-        if(P[p_i].Type != 4)
-            continue;
-        int tid = omp_get_thread_num();
-        const int slot = P[p_i].PI;
-        priv->StellarAges[slot] = atime_to_myr(CP, STARP(p_i).FormationTime, atime, priv->gsl_work[tid]);
-        /* Note this takes care of units*/
-        double initialmass = P[p_i].Mass + STARP(p_i).TotalMassReturned;
-        find_mass_bin_limits(&priv->LowDyingMass[slot], &priv->HighDyingMass[slot], STARP(p_i).LastEnrichmentMyr, priv->StellarAges[P[p_i].PI], STARP(p_i).Metallicity, priv->interp.lifetime_interp);
-
-        priv->MassReturn[slot] = initialmass * mass_yield(STARP(p_i).LastEnrichmentMyr, priv->StellarAges[P[p_i].PI], STARP(p_i).Metallicity, CP->HubbleParam, &priv->interp, priv->imf_norm, priv->gsl_work[tid],priv->LowDyingMass[slot], priv->HighDyingMass[slot]);
-        //message(3, "Particle %d PI %d massgen %g mass %g initmass %g\n", p_i, P[p_i].PI, priv->MassReturn[P[p_i].PI], P[p_i].Mass, initialmass);
-        /* Guard against making a zero mass particle and warn since this should not happen.*/
-        if(STARP(p_i).TotalMassReturned + priv->MassReturn[slot] > initialmass * maxmassfrac) {
-            if(priv->MassReturn[slot] / STARP(p_i).TotalMassReturned > 0.01)
-                message(1, "Large mass return id %ld %g from %d mass %g initial %g (maxfrac %g) age %g lastenrich %g metal %g dymass %g %g\n",
-                    P[p_i].ID, priv->MassReturn[slot], p_i, STARP(p_i).TotalMassReturned, initialmass, maxmassfrac, priv->StellarAges[P[p_i].PI], STARP(p_i).LastEnrichmentMyr, STARP(p_i).Metallicity, priv->LowDyingMass[slot], priv->HighDyingMass[slot]);
-            priv->MassReturn[slot] = initialmass * maxmassfrac - STARP(p_i).TotalMassReturned;
-            if(priv->MassReturn[slot] < 0) {
-                priv->MassReturn[slot] = 0;
-            }
-            /* Ensure that we skip this step*/
-            if(!metals_haswork(p_i, priv->MassReturn))
-                STARP(p_i).LastEnrichmentMyr = priv->StellarAges[P[p_i].PI];
-
-        }
-        /* Keep count of how much work we need to do*/
-        if(metals_haswork(p_i, priv->MassReturn))
-            haswork++;
-    }
-    return haswork;
-}
-
-/* Free memory allocated by metal_return_init */
-void
-metal_return_priv_free(struct MetalReturnPriv * priv)
-{
-    myfree(priv->StarVolumeSPH);
-    myfree(priv->HighDyingMass);
-    myfree(priv->LowDyingMass);
-    myfree(priv->MassReturn);
-    myfree(priv->StellarAges);
-
-    int i;
-    for(i=0; i < omp_get_max_threads(); i++)
-        gsl_integration_workspace_free(priv->gsl_work[i]);
-
-    ta_free(priv->gsl_work);
-}
-
-/*! This function is the driver routine for the calculation of metal return. */
-void
-metal_return(const ActiveParticles * act, ForceTree * gasTree, Cosmology * CP, const double atime, const double AvgGasMass)
-{
-    /* Do nothing if no stars yet*/
-    int64_t totstar;
-    MPI_Allreduce(&SlotsManager->info[4].size, &totstar, 1, MPI_INT64, MPI_SUM, MPI_COMM_WORLD);
-    if(totstar == 0)
-        return;
-
-    struct MetalReturnPriv priv[1];
-
-    int64_t nwork = metal_return_init(act, CP, priv, atime);
-
-    /* Maximum mass of a gas particle after enrichment: cap it at a few times the initial mass.
-     * FIXME: Ideally we should here fork a new particle with a smaller gas mass. We should
-     * figure out then how set the gas entropy. A possibly better idea is to add
-     * a generic routine to split gas particles into the density code.*/
-    priv->MaxGasMass = 4* AvgGasMass;
-
-    int64_t totwork;
-    MPI_Allreduce(&nwork, &totwork, 1, MPI_INT64, MPI_SUM, MPI_COMM_WORLD);
-
-    walltime_measure("/SPH/Metals/Init");
-
-    if(totwork == 0) {
-        metal_return_priv_free(priv);
-        return;
-    }
-
-    if(!gasTree->tree_allocated_flag || !(gasTree->mask & GASMASK))
-        endrun(5, "metal_return called with bad tree allocated %d mask %d\n", gasTree->tree_allocated_flag, gasTree->mask);
-    /* Compute total number of weights around each star for actively returning stars*/
-    stellar_density(act, priv->StarVolumeSPH, priv->MassReturn, gasTree);
-
-    /* Do the metal return*/
-    TreeWalk tw[1] = {{0}};
-
-    tw->ev_label = "METALS";
-    tw->visit = (TreeWalkVisitFunction) treewalk_visit_ngbiter;
-    tw->ngbiter = (TreeWalkNgbIterFunction) metal_return_ngbiter;
-    tw->ngbiter_type_elsize = sizeof(TreeWalkNgbIterMetals);
-    tw->haswork = metal_return_haswork;
-    tw->fill = (TreeWalkFillQueryFunction) metal_return_copy;
-    tw->reduce = (TreeWalkReduceResultFunction) metal_return_reduce;
-    tw->postprocess = (TreeWalkProcessFunction) metal_return_postprocess;
-    tw->query_type_elsize = sizeof(TreeWalkQueryMetals);
-    tw->result_type_elsize = sizeof(TreeWalkResultMetals);
-    tw->tree = gasTree;
-    tw->priv = priv;
-
-    priv->spin = init_spinlocks(SlotsManager->info[0].size);
-    treewalk_run(tw, act->ActiveParticle, act->NumActiveParticle);
-    free_spinlocks(priv->spin);
-
-    metal_return_priv_free(priv);
-
-    /* collect some timing information */
-    walltime_measure("/SPH/Metals/Yield");
-}
-
-/* This function is unusually important:
- * it computes the total amount of metals to be returned in this timestep.*/
-static void
-metal_return_copy(int place, TreeWalkQueryMetals * input, TreeWalk * tw)
-{
-    input->Metallicity = STARP(place).Metallicity;
-    input->Mass = P[place].Mass;
-    input->Hsml = P[place].Hsml;
-    int pi = P[place].PI;
-    input->StarVolumeSPH = METALS_GET_PRIV(tw)->StarVolumeSPH[pi];
-    double InitialMass = P[place].Mass + STARP(place).TotalMassReturned;
-    double dtmyrend = METALS_GET_PRIV(tw)->StellarAges[pi];
-    double dtmyrstart = STARP(place).LastEnrichmentMyr;
-    int tid = omp_get_thread_num();
-    /* This is the total mass returned from this stellar population this timestep. Note this is already in the desired units.*/
-    input->MassGenerated = METALS_GET_PRIV(tw)->MassReturn[pi];
-    /* This returns the total amount of metal produced this timestep, and also fills out MetalSpeciesGenerated, which is an
-     * element by element table of the metal produced by dying stars this timestep.*/
-    double total_z_yield = metal_yield(dtmyrstart, dtmyrend, input->Metallicity, METALS_GET_PRIV(tw)->hub, &METALS_GET_PRIV(tw)->interp, input->MetalSpeciesGenerated, METALS_GET_PRIV(tw)->imf_norm, METALS_GET_PRIV(tw)->gsl_work[tid], METALS_GET_PRIV(tw)->LowDyingMass[pi], METALS_GET_PRIV(tw)->HighDyingMass[pi]);
-    /* The total metal returned is the metal ejected into the ISM this timestep. total_z_yield is given as a fraction of the initial SSP.*/
-    input->MetalGenerated = InitialMass * total_z_yield;
-    //message(3, "Particle %d PI %d z %g massgen %g metallicity %g\n", pi, P[pi].PI, total_z_yield, METALS_GET_PRIV(tw)->MassReturn[pi], STARP(place).Metallicity);
-    /* It should be positive! If it is not, this is some integration error
-     * in the yield table as we cannot destroy metal which is not present.*/
-    if(input->MetalGenerated < 0)
-        input->MetalGenerated = 0;
-    /* Similarly for all the other metal species*/
-    int i;
-    for(i = 0; i < NMETALS; i++) {
-        input->MetalSpeciesGenerated[i] *= InitialMass;
-        if(input->MetalSpeciesGenerated[i] < 0)
-            input->MetalSpeciesGenerated[i] = 0;
-    }
-}
-
-/* Update the mass return variable to contain the amount of mass actually returned.*/
-static void
-metal_return_reduce(int place, TreeWalkResultMetals * remote, enum TreeWalkReduceMode mode, TreeWalk * tw)
-{
-    TREEWALK_REDUCE(METALS_GET_PRIV(tw)->MassReturn[P[place].PI], remote->MassReturn);
-}
-
-/* Update the mass and enrichment variables for the star.
- * Note that the stellar metallicity is not updated, as the
- * metal-forming stars are now dead and their metals in the gas.*/
-static void
-metal_return_postprocess(int place, TreeWalk * tw)
-{
-    /* Conserve mass returned*/
-    P[place].Mass -= METALS_GET_PRIV(tw)->MassReturn[P[place].PI];
-    STARP(place).TotalMassReturned += METALS_GET_PRIV(tw)->MassReturn[P[place].PI];
-    /* Update the last enrichment time*/
-    STARP(place).LastEnrichmentMyr = METALS_GET_PRIV(tw)->StellarAges[P[place].PI];
-}
-
-/*! For all gas particles within the density radius of this star,
- * add a fraction of the total mass and metals generated,
- * weighted by the SPH kernel distance from the star.
- */
-static void
-metal_return_ngbiter(
-    TreeWalkQueryMetals * I,
-    TreeWalkResultMetals * O,
-    TreeWalkNgbIterMetals * iter,
-    LocalTreeWalk * lv
-   )
-{
-    if(iter->base.other == -1) {
-        /* Only return metals to gas*/
-        iter->base.mask = GASMASK;
-        iter->base.Hsml = I->Hsml;
-        iter->base.symmetric = NGB_TREEFIND_ASYMMETRIC;
-        /* Initialise the mass lost by this star in this timestep*/
-        O->MassReturn = 0;
-        density_kernel_init(&iter->kernel, I->Hsml, GetDensityKernelType());
-        return;
-    }
-
-    const int other = iter->base.other;
-    const double r2 = iter->base.r2;
-    const double r = iter->base.r;
-
-    if(r2 > 0 && r2 < iter->kernel.HH)
-    {
-        double wk = 1;
-        const double u = r * iter->kernel.Hinv;
-
-        if(MetalParams.SPHWeighting)
-            wk = density_kernel_wk(&iter->kernel, u);
-        double ThisMetals[NMETALS];
-        if(I->StarVolumeSPH ==0)
-            endrun(3, "StarVolumeSPH %g hsml %g\n", I->StarVolumeSPH, I->Hsml);
-        double newmass;
-        int pi = P[other].PI;
-        lock_spinlock(pi, METALS_GET_PRIV(lv->tw)->spin);
-        /* Volume of particle weighted by the SPH kernel*/
-        double volume = P[other].Mass / SPHP(other).Density;
-        double returnfraction = wk * volume / I->StarVolumeSPH;
-        double thismass = returnfraction * I->MassGenerated;
-        /* Ensure that the gas particles don't become overweight.
-         * If there are few gas particles around, the star clusters
-         * will hold onto their metals.*/
-        if(P[other].Mass + thismass > METALS_GET_PRIV(lv->tw)->MaxGasMass) {
-            unlock_spinlock(pi, METALS_GET_PRIV(lv->tw)->spin);
-            return;
-        }
-        /* Add metals weighted by SPH kernel*/
-        int i;
-        for(i = 0; i < NMETALS; i++)
-            ThisMetals[i] = returnfraction * I->MetalSpeciesGenerated[i];
-        double thismetal = returnfraction * I->MetalGenerated;
-        /* Add the metals to the particle.*/
-        for(i = 0; i < NMETALS; i++)
-            SPHP(other).Metals[i] = (SPHP(other).Metals[i] * P[other].Mass + ThisMetals[i])/(P[other].Mass + thismass);
-        /* Update total metallicity*/
-        SPHP(other).Metallicity = (SPHP(other).Metallicity * P[other].Mass + thismetal)/(P[other].Mass + thismass);
-        /* Update mass*/
-        double massfrac = (P[other].Mass + thismass) / P[other].Mass;
-        P[other].Mass *= massfrac;
-        /* Density also needs a correction so the volume fraction is unchanged.
-         * This ensures that volume = Mass/Density is unchanged for the next particle
-         * and thus the weighting still sums to unity.*/
-        SPHP(other).Density *= massfrac;
-        /* Keep track of how much was returned for conservation purposes*/
-        O->MassReturn += thismass;
-        newmass = P[other].Mass;
-        unlock_spinlock(pi, METALS_GET_PRIV(lv->tw)->spin);
-        if(newmass <= 0)
-            endrun(3, "New mass %g new metal %g in particle %d id %ld from star mass %g metallicity %g\n",
-                   newmass, SPHP(other).Metallicity, other, P[other].ID, I->Mass, I->Metallicity);
-    }
-}
-
-/* Find stars returning enough metals to the gas.
- * This is a wrapper function to allow for
- * different private structs in different treewalks*/
-int
-metals_haswork(int i, MyFloat * MassReturn)
-{
-    if(P[i].Type != 4)
-        return 0;
-    int pi = P[i].PI;
-    /* Don't do enrichment from all stars, just those with significant enrichment*/
-    if(MassReturn[pi] < 1e-3 * (P[i].Mass + STARP(i).TotalMassReturned))
-        return 0;
-    return 1;
-}
-
-static int
-metal_return_haswork(int i, TreeWalk * tw)
-{
-    return metals_haswork(i, METALS_GET_PRIV(tw)->MassReturn);
-}
-
-/* Number of densities to evaluate simultaneously*/
-#define NHSML 10
-
-typedef struct {
-    TreeWalkNgbIterBase base;
-    DensityKernel kernel[NHSML];
-    double kernel_volume[NHSML];
-} TreeWalkNgbIterStellarDensity;
-
-typedef struct
-{
-    TreeWalkQueryBase base;
-    MyFloat Hsml[NHSML];
-} TreeWalkQueryStellarDensity;
-
-typedef struct {
-    TreeWalkResultBase base;
-    MyFloat VolumeSPH[NHSML];
-    MyFloat Ngb[NHSML];
-    int maxcmpte;
-    int _alignment;
-} TreeWalkResultStellarDensity;
-
-struct StellarDensityPriv {
-    /* Current number of neighbours*/
-    MyFloat (*NumNgb)[NHSML];
-    /* Lower and upper bounds on smoothing length*/
-    MyFloat *Left, *Right;
-    MyFloat (*VolumeSPH)[NHSML];
-    /* For haswork*/
-    MyFloat *MassReturn;
-    /*!< Desired number of SPH neighbours */
-    double DesNumNgb;
-    /* Maximum index where NumNgb is valid. */
-    int * maxcmpte;
-};
-
-#define STELLAR_DENSITY_GET_PRIV(tw) ((struct StellarDensityPriv*) ((tw)->priv))
-
-static int
-stellar_density_haswork(int i, TreeWalk * tw)
-{
-    return metals_haswork(i, STELLAR_DENSITY_GET_PRIV(tw)->MassReturn);
-}
-
-/* Get Hsml for one of the evaluations*/
-static inline double
-effhsml(int place, int i, TreeWalk * tw)
-{
-    int pi = P[place].PI;
-    double left = STELLAR_DENSITY_GET_PRIV(tw)->Left[pi];
-    double right = STELLAR_DENSITY_GET_PRIV(tw)->Right[pi];
-    /* If somehow Hsml has become zero through underflow, use something non-zero
-     * to make sure we converge. */
-    if(left == 0 && right > 0.99*tw->tree->BoxSize && P[place].Hsml == 0) {
-        int fat = force_get_father(place, tw->tree);
-        P[place].Hsml = tw->tree->Nodes[fat].len;
-        if(P[place].Hsml == 0)
-            P[place].Hsml = tw->tree->BoxSize / pow(PartManager->NumPart, 1./3)/4.;
-    }
-    /* Use slightly past the current Hsml as the right most boundary*/
-    if(right > 0.99*tw->tree->BoxSize)
-        right = P[place].Hsml * ((1.+NHSML)/NHSML);
-    /* Use 1/2 of current Hsml for left. The asymmetry is because it is free
-     * to compute extra densities for h < Hsml, but not for h > Hsml.*/
-    if(left == 0)
-        left = 0.1 * P[place].Hsml;
-    /* From left + 1/N  to right - 1/N, evenly spaced in volume,
-     * since NumNgb ~ h^3.*/
-    double rvol = pow(right, 3);
-    double lvol = pow(left, 3);
-    return pow((1.*i+1)/(1.*NHSML+1) * (rvol - lvol) + lvol, 1./3);
-}
-
-static void
-stellar_density_copy(int place, TreeWalkQueryStellarDensity * I, TreeWalk * tw)
-{
-    int i;
-    for(i = 0; i < NHSML; i++)
-        I->Hsml[i] = effhsml(place, i, tw);
-}
-
-static void
-stellar_density_reduce(int place, TreeWalkResultStellarDensity * remote, enum TreeWalkReduceMode mode, TreeWalk * tw)
-{
-    int pi = P[place].PI;
-    int i;
-    if(mode == TREEWALK_PRIMARY || STELLAR_DENSITY_GET_PRIV(tw)->maxcmpte[pi] > remote->maxcmpte)
-        STELLAR_DENSITY_GET_PRIV(tw)->maxcmpte[pi] = remote->maxcmpte;
-    for(i = 0; i < remote->maxcmpte; i++) {
-        TREEWALK_REDUCE(STELLAR_DENSITY_GET_PRIV(tw)->NumNgb[pi][i], remote->Ngb[i]);
-        TREEWALK_REDUCE(STELLAR_DENSITY_GET_PRIV(tw)->VolumeSPH[pi][i], remote->VolumeSPH[i]);
-    }
-}
-
-void stellar_density_check_neighbours (int i, TreeWalk * tw)
-{
-    MyFloat * Left = STELLAR_DENSITY_GET_PRIV(tw)->Left;
-    MyFloat * Right = STELLAR_DENSITY_GET_PRIV(tw)->Right;
-
-    int pi = P[i].PI;
-    int tid = omp_get_thread_num();
-    double desnumngb = STELLAR_DENSITY_GET_PRIV(tw)->DesNumNgb;
-
-    const int maxcmpt = STELLAR_DENSITY_GET_PRIV(tw)->maxcmpte[pi];
-    int j;
-    double evalhsml[NHSML];
-    evalhsml[0] = effhsml(i, 0, tw);
-    for(j = 1; j < maxcmpt; j++)
-        evalhsml[j] = effhsml(i, j, tw);
-
-    int close = 0;
-    P[i].Hsml = ngb_narrow_down(&Right[pi],&Left[pi],evalhsml,STELLAR_DENSITY_GET_PRIV(tw)->NumNgb[pi],maxcmpt,desnumngb,&close,tw->tree->BoxSize);
-    double numngb = STELLAR_DENSITY_GET_PRIV(tw)->NumNgb[pi][close];
-
-    /* Save VolumeSPH*/
-    STELLAR_DENSITY_GET_PRIV(tw)->VolumeSPH[pi][0] = STELLAR_DENSITY_GET_PRIV(tw)->VolumeSPH[pi][close];
-
-    /* now check whether we had enough neighbours */
-    if(numngb < (desnumngb - MetalParams.MaxNgbDeviation) ||
-            (numngb > (desnumngb + MetalParams.MaxNgbDeviation)))
-    {
-        /* This condition is here to prevent the density code looping forever if it encounters
-         * multiple particles at the same position. If this happens you likely have worse
-         * problems anyway, so warn also. */
-        if((Right[pi] - Left[pi]) < 1.0e-4 * Left[pi])
-        {
-            /* If this happens probably the exchange is screwed up and all your particles have moved to (0,0,0)*/
-            message(1, "Very tight Hsml bounds for i=%d ID=%lu type %d Hsml=%g Left=%g Right=%g Ngbs=%g des = %g Right-Left=%g pos=(%g|%g|%g)\n",
-             i, P[i].ID, P[i].Type, evalhsml[0], Left[pi], Right[pi], numngb, desnumngb, Right[pi] - Left[pi], P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]);
-            return;
-        }
-        /* More work needed: add this particle to the redo queue*/
-        tw->NPRedo[tid][tw->NPLeft[tid]] = i;
-        tw->NPLeft[tid] ++;
-        if(tw->Niteration >= 10)
-            message(1, "i=%d ID=%lu Hsml=%g lastdhsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g pos=(%g|%g|%g)\n",
-             i, P[i].ID, P[i].Hsml, evalhsml[close], Left[pi], Right[pi], numngb, Right[pi] - Left[pi], P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]);
-
-    }
-    if(tw->maxnumngb[tid] < numngb)
-        tw->maxnumngb[tid] = numngb;
-    if(tw->minnumngb[tid] > numngb)
-        tw->minnumngb[tid] = numngb;
-
-}
-
-static void
-stellar_density_ngbiter(
-        TreeWalkQueryStellarDensity * I,
-        TreeWalkResultStellarDensity * O,
-        TreeWalkNgbIterStellarDensity * iter,
-        LocalTreeWalk * lv)
-{
-    if(iter->base.other == -1) {
-        int i;
-        for(i = 0; i < NHSML; i++) {
-            density_kernel_init(&iter->kernel[i], I->Hsml[i], GetDensityKernelType());
-            iter->kernel_volume[i] = density_kernel_volume(&iter->kernel[i]);
-        }
-        iter->base.Hsml = I->Hsml[NHSML-1];
-        iter->base.mask = GASMASK; /* gas only */
-        iter->base.symmetric = NGB_TREEFIND_ASYMMETRIC;
-        O->maxcmpte = NHSML;
-        return;
-    }
-    const int other = iter->base.other;
-    const double r = iter->base.r;
-    const double r2 = iter->base.r2;
-
-    int i;
-    for(i = 0; i < O->maxcmpte; i++) {
-        if(r2 < iter->kernel[i].HH)
-        {
-            const double u = r * iter->kernel[i].Hinv;
-            double wk = density_kernel_wk(&iter->kernel[i], u);
-            O->Ngb[i] += wk * iter->kernel_volume[i];
-            /* For stars we need the total weighting, sum(w_k m_k / rho_k).*/
-            double thisvol = P[other].Mass / SPHP(other).Density;
-            if(MetalParams.SPHWeighting)
-                thisvol *= wk;
-            O->VolumeSPH[i] += thisvol;
-        }
-    }
-    double desnumngb = STELLAR_DENSITY_GET_PRIV(lv->tw)->DesNumNgb;
-    /* If there is an entry which is above desired DesNumNgb,
-     * we don't need to search past it. After this point
-     * all entries in the Ngb table above O->Ngb are invalid.*/
-    for(i = 0; i < NHSML; i++) {
-        if(O->Ngb[i] > desnumngb) {
-            O->maxcmpte = i+1;
-            iter->base.Hsml = I->Hsml[i];
-            break;
-        }
-    }
-
-}
-
-void
-stellar_density(const ActiveParticles * act, MyFloat * StarVolumeSPH, MyFloat * MassReturn, const ForceTree * const tree)
-{
-    TreeWalk tw[1] = {{0}};
-    struct StellarDensityPriv priv[1];
-
-    tw->ev_label = "STELLAR_DENSITY";
-    tw->visit = treewalk_visit_nolist_ngbiter;
-    tw->NoNgblist = 1;
-    tw->ngbiter_type_elsize = sizeof(TreeWalkNgbIterStellarDensity);
-    tw->ngbiter = (TreeWalkNgbIterFunction) stellar_density_ngbiter;
-    tw->haswork = stellar_density_haswork;
-    tw->fill = (TreeWalkFillQueryFunction) stellar_density_copy;
-    tw->reduce = (TreeWalkReduceResultFunction) stellar_density_reduce;
-    tw->postprocess = (TreeWalkProcessFunction) stellar_density_check_neighbours;
-    tw->query_type_elsize = sizeof(TreeWalkQueryStellarDensity);
-    tw->result_type_elsize = sizeof(TreeWalkResultStellarDensity);
-    tw->priv = priv;
-    tw->tree = tree;
-
-    int i;
-
-    priv->MassReturn = MassReturn;
-
-    priv->Left = (MyFloat *) mymalloc("DENS_PRIV->Left", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->Right = (MyFloat *) mymalloc("DENS_PRIV->Right", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->NumNgb = (MyFloat (*) [NHSML]) mymalloc("DENS_PRIV->NumNgb", SlotsManager->info[4].size * sizeof(priv->NumNgb[0]));
-    priv->VolumeSPH = (MyFloat (*) [NHSML]) mymalloc("DENS_PRIV->VolumeSPH", SlotsManager->info[4].size * sizeof(priv->VolumeSPH[0]));
-    priv->maxcmpte = (int *) mymalloc("maxcmpte", SlotsManager->info[4].size * sizeof(int));
-
-    priv->DesNumNgb = GetNumNgb(GetDensityKernelType());
-
-    #pragma omp parallel for
-    for(i = 0; i < act->NumActiveParticle; i++) {
-        int a = act->ActiveParticle ? act->ActiveParticle[i] : i;
-        /* Skip the garbage particles */
-        if(P[a].IsGarbage)
-            continue;
-        if(!stellar_density_haswork(a, tw))
-            continue;
-        int pi = P[a].PI;
-        priv->Left[pi] = 0;
-        priv->Right[pi] = tree->BoxSize;
-    }
-
-    /* allocate buffers to arrange communication */
-
-    treewalk_do_hsml_loop(tw, act->ActiveParticle, act->NumActiveParticle, 1);
-    #pragma omp parallel for
-    for(i = 0; i < act->NumActiveParticle; i++) {
-        int a = act->ActiveParticle ? act->ActiveParticle[i] : i;
-        /* Skip the garbage particles */
-        if(P[a].IsGarbage)
-            continue;
-        if(!stellar_density_haswork(a, tw))
-            continue;
-        /* Copy the Star Volume SPH*/
-        StarVolumeSPH[P[a].PI] = priv->VolumeSPH[P[a].PI][0];
-        if(priv->VolumeSPH[P[a].PI][0] == 0)
-            endrun(3, "i = %d pi = %d StarVolumeSPH %g hsml %g\n", a, P[a].PI, priv->VolumeSPH[P[a].PI][0], P[a].Hsml);
-    }
-
-    myfree(priv->maxcmpte);
-    myfree(priv->VolumeSPH);
-    myfree(priv->NumNgb);
-    myfree(priv->Right);
-    myfree(priv->Left);
-
-    double timeall = walltime_measure(WALLTIME_IGNORE);
-
-    double timecomp = tw->timecomp0 + tw->timecomp3 + tw->timecomp1 + tw->timecomp2;
-    walltime_add("/SPH/Metals/Density/Compute", timecomp);
-    walltime_add("/SPH/Metals/Density/Wait", tw->timewait1);
-    walltime_add("/SPH/Metals/Density/Reduce", tw->timecommsumm);
-    walltime_add("/SPH/Metals/Density/Misc", timeall - (timecomp + tw->timewait1 + tw->timecommsumm));
-
-    return;
-}
diff --git a/libgadget/metal_return.h b/libgadget/metal_return.h
deleted file mode 100644
index f5acd9c8..00000000
--- a/libgadget/metal_return.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef METAL_RETURN_H
-#define METAL_RETURN_H
-
-#include "forcetree.h"
-#include "timestep.h"
-#include "utils/paramset.h"
-#include <gsl/gsl_interp2d.h>
-#include <gsl/gsl_integration.h>
-#include "slotsmanager.h"
-
-struct interps
-{
-    gsl_interp2d * lifetime_interp;
-    gsl_interp2d * agb_mass_interp;
-    gsl_interp2d * agb_metallicity_interp;
-    gsl_interp2d * agb_metals_interp[NMETALS];
-    gsl_interp2d * snii_mass_interp;
-    gsl_interp2d * snii_metallicity_interp;
-    gsl_interp2d * snii_metals_interp[NMETALS];
-};
-
-/* Build the interpolators for each yield table. We use bilinear interpolation
- * so there is no extra memory allocation and we never free the tables*/
-void setup_metal_table_interp(struct interps * interp);
-
-struct MetalReturnPriv {
-    gsl_integration_workspace ** gsl_work;
-    MyFloat * StellarAges;
-    MyFloat * MassReturn;
-    MyFloat * LowDyingMass;
-    MyFloat * HighDyingMass;
-    double imf_norm;
-    double hub;
-    /* Maximum of the new gas mass*/
-    double MaxGasMass;
-    Cosmology *CP;
-    MyFloat * StarVolumeSPH;
-    struct interps interp;
-    struct SpinLocks * spin;
-};
-
-void metal_return(const ActiveParticles * act, ForceTree * gasTree, Cosmology * CP, const double atime, const double AvgGasMass);
-
-void set_metal_return_params(ParameterSet * ps);
-
-/* Initialise the metal private structure, finding mass return.*/
-int64_t metal_return_init(const ActiveParticles * act, Cosmology * CP, struct MetalReturnPriv * priv, const double atime);
-/* Free memory allocated in metal_return_init*/
-void metal_return_priv_free(struct MetalReturnPriv * priv);
-
-/* Find stellar density, returning the total SPH Volume weights for each particle.*/
-void stellar_density(const ActiveParticles * act, MyFloat * StarVolumeSPH, MyFloat * MassReturn, const ForceTree * const tree);
-
-/* Determines whether metal return runs for this star this timestep*/
-int metals_haswork(int i, MyFloat * MassReturn);
-#endif
diff --git a/libgadget/metal_tables.h b/libgadget/metal_tables.h
deleted file mode 100644
index 9d265753..00000000
--- a/libgadget/metal_tables.h
+++ /dev/null
@@ -1,439 +0,0 @@
-#ifndef METAL_TABLES_H
-#define METAL_TABLES_H
-
-/* Metals followed:
- * H, He, C, N, O, Ne, Mg, Si, Fe (9, following 1703.02970)
- */
-#define NSPECIES 9
-/* Largest mass in the IMF normalisation*/
-#define MAXMASS 40
-/* Only used for IMF normalisation*/
-#define MINMASS 0.1
-/* Mass in solar at which the yield tables switch from AGB stars to SNII*/
-#define SNAGBSWITCH 8
-/* Metallicity values (in terms of metal yield, not solar metallicity)
- * for the stellar lifetime table. Columns of lifetime.*/
-#define LIFE_NMET 5
-#define LIFE_NMASS 30
-static const double lifetime_metallicity[LIFE_NMET] = { 0.0004 , 0.004 , 0.008, 0.02, 0.05 };
-/* Mass values in solar masses for the stellar lifetime table. Rows of lifetime*/
-static const double lifetime_masses[LIFE_NMASS] = {0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5,
-    1.6, 1.7, 1.8, 1.9, 2.0, 2.5, 3 , 4 , 5 , 6 , 7 , 9 , 12 , 15 , 20 , 30 , 40 , 60 , 100, 120};
-/* Stellar lifetimes as a function of mass and metallicity in years.
- * Table 14 of Portinari et al, astro-ph/9711337 */
-static const double lifetime[LIFE_NMASS*LIFE_NMET] = {
-4.28e+10,   5.35E+10,   6.47E+10,   7.92E+10,   7.18E+10,
-2.37E+10,   2.95E+10,   3.54E+10,   4.45E+10,   4.00E+10,
-1.41E+10,   1.73E+10,   2.09E+10,   2.61E+10,   2.33E+10,
-8.97E+09,   1.09E+10,   1.30E+10,   1.59E+10,   1.42E+10,
-6.03E+09,   7.13E+09,   8.46E+09,   1.03E+10,   8.88E+09,
-4.23E+09,   4.93E+09,   5.72E+09,   6.89E+09,   5.95E+09,
-3.08E+09,   3.52E+09,   4.12E+09,   4.73E+09,   4.39E+09,
-2.34E+09,   2.64E+09,   2.92E+09,   3.59E+09,   3.37E+09,
-1.92E+09,   2.39E+09,   2.36E+09,   2.87E+09,   3.10E+09,
-1.66E+09,   1.95E+09,   2.18E+09,   2.64E+09,   2.51E+09,
-1.39E+09,   1.63E+09,   1.82E+09,   2.18E+09,   2.06E+09,
-1.18E+09,   1.28E+09,   1.58E+09,   1.84E+09,   1.76E+09,
-1.11E+09,   1.25E+09,   1.41E+09,   1.59E+09,   1.51E+09,
-9.66E+08,   1.23E+09,   1.25E+09,   1.38E+09,   1.34E+09,
-8.33E+08,   1.08E+09,   1.23E+09,   1.21E+09,   1.24E+09,
-4.64E+08,   5.98E+08,   6.86E+08,   7.64E+08,   6.58E+08,
-3.03E+08,   3.67E+08,   4.12E+08,   4.56E+08,   3.81E+08,
-1.61E+08,   1.82E+08,   1.93E+08,   2.03E+08,   1.64E+08,
-1.01E+08,   1.11E+08,   1.15E+08,   1.15E+08,   8.91E+07,
-7.15E+07,   7.62E+07,   7.71E+07,   7.45E+07,   5.67E+07,
-5.33E+07,   5.61E+07,   5.59E+07,   5.31E+07,   3.97E+07,
-3.42E+07,   3.51E+07,   3.44E+07,   3.17E+07,   2.33E+07,
-2.13E+07,   2.14E+07,   2.10E+07,   1.89E+07,   1.39E+07,
-1.54E+07,   1.52E+07,   1.49E+07,   1.33E+07,   9.95E+06,
-1.06E+07,   1.05E+07,   1.01E+07,   9.15E+06,   6.99E+06,
-6.90E+06,   6.85E+06,   6.65E+06,   6.13E+06,   5.15E+06,
-5.45E+06,   5.44E+06,   5.30E+06,   5.12E+06,   4.34E+06,
-4.20E+06,   4.19E+06,   4.15E+06,   4.12E+06,   3.62E+06,
-3.32E+06,   3.38E+06,   3.44E+06,   3.39E+06,   3.11E+06,
-3.11E+06,   3.23E+06,   3.32E+06,   3.23E+06,   3.11E+06};
-
-/* Sn1a yields from the W7 model of Nomoto et al 1997 https://arxiv.org/abs/astro-ph/9706025
- * I extracted this from the latex source of their table 1 by hand.
- * total_metals is just the sum of all metal masses in the table.
- */
-static const double sn1a_total_metals = 1.3743416565891;
-static const double sn1a_yields[NSPECIES] = {0, 0, 4.83E-02, 1.16E-06 , 1.43E-01 , 4.51E-03, 8.57E-03, 1.53E-01, 7.43e-01};
-
-/* AGB yields from Karakas 2010, 0912.2142 Tables A2 - A5. These have been parsed by the script in tools/extract_yields.py
- * Massive stars are from Doherty 2014, https://doi.org/10.1093/mnras/stt1877 and https://doi.org/10.1093/mnras/stu571
- * Some of the metallicities in Karakas are listed at M = 2 and some at M = 2.1. I have altered them all to be at M = 2,
- * a change which is within the uncertainty of the calculation.
- */
-#define AGB_NMET 4
-#define AGB_NMASS 18
-static const double agb_masses[AGB_NMASS] = { 1.00,1.25,1.50,1.75,1.90,2.00,2.25,2.50,3.00,3.50,4.00,4.50,5.00,5.50,6.00,6.50,7.00,7.50 };
-static const double agb_metallicities[AGB_NMET] = { 0.0001,0.0040,0.0080,0.0200 };
-static const double agb_total_mass[AGB_NMET*AGB_NMASS] = {
-0.280,0.390,0.423,0.436,
-0.582,0.608,0.650,0.676,
-0.839,0.872,0.867,0.900,
-1.086,1.120,1.114,1.135,
-1.219,1.260,1.260,1.270,
-1.315,1.450,1.456,1.360,
-1.537,1.586,1.598,1.590,
-1.768,1.829,1.837,1.837,
-2.187,2.269,2.306,2.318,
-2.646,2.686,2.734,2.782,
-3.126,3.148,3.164,3.208,
-3.603,3.628,3.639,3.648,
-4.071,4.095,4.114,4.121,
-4.534,4.568,4.593,4.600,
-4.994,5.023,5.052,5.071,
-5.401,5.494,5.548,5.537,
-5.827,5.936,6.001,6.033,
-6.269,6.342,6.442,6.489,
-};
-
-static const double agb_total_metals[AGB_NMET*AGB_NMASS] = {
-2.939e-04,1.485e-03,3.221e-03,8.302e-03,
-3.962e-03,2.500e-03,4.963e-03,1.290e-02,
-1.255e-02,5.246e-03,6.885e-03,1.721e-02,
-2.588e-02,1.014e-02,1.164e-02,2.172e-02,
-3.383e-02,1.416e-02,1.501e-02,2.431e-02,
-4.129e-02,1.806e-02,2.186e-02,2.603e-02,
-5.727e-02,3.139e-02,2.788e-02,3.145e-02,
-5.650e-02,4.520e-02,3.684e-02,4.116e-02,
-2.582e-02,5.087e-02,5.969e-02,6.170e-02,
-2.710e-02,3.576e-02,4.916e-02,7.676e-02,
-3.387e-02,4.534e-02,4.494e-02,7.494e-02,
-2.771e-02,6.428e-02,6.819e-02,8.330e-02,
-2.863e-02,8.468e-02,8.391e-02,9.272e-02,
-3.246e-02,6.702e-02,7.944e-02,1.112e-01,
-4.058e-02,6.991e-02,7.896e-02,1.164e-01,
-1.007e-01,4.809e-02,7.900e-02,1.224e-01,
-4.971e-02,3.936e-02,6.945e-02,1.349e-01,
-3.387e-02,3.916e-02,6.827e-02,1.432e-01,
-
-};
-
-static const double agb_yield[NSPECIES][AGB_NMET*AGB_NMASS] = {
-{2.098917e-01,2.837685e-01,3.030716e-01,2.917916e-01,
-4.181523e-01,4.402004e-01,4.647374e-01,4.526932e-01,
-5.871375e-01,6.310413e-01,6.216631e-01,6.062396e-01,
-7.231110e-01,8.003024e-01,7.952502e-01,7.689168e-01,
-7.975993e-01,8.940117e-01,8.979415e-01,8.627354e-01,
-8.418292e-01,1.031207e+00,1.025440e+00,9.253072e-01,
-9.679005e-01,1.086865e+00,1.113040e+00,1.078653e+00,
-1.159112e+00,1.219034e+00,1.258521e+00,1.233581e+00,
-1.581442e+00,1.538121e+00,1.534155e+00,1.524957e+00,
-1.893222e+00,1.915657e+00,1.904093e+00,1.819895e+00,
-2.169540e+00,2.193368e+00,2.242207e+00,2.143635e+00,
-2.481520e+00,2.449144e+00,2.483966e+00,2.400789e+00,
-2.761358e+00,2.636303e+00,2.707931e+00,2.652623e+00,
-3.037755e+00,2.989116e+00,3.035734e+00,2.888854e+00,
-3.240722e+00,3.205465e+00,3.278734e+00,3.136674e+00,
-3.129000e+00,3.531000e+00,3.577000e+00,3.389015e+00,
-3.581000e+00,3.824000e+00,3.870000e+00,3.770000e+00,
-3.959000e+00,4.085000e+00,4.144000e+00,4.017000e+00,
-
-},
-{
-6.976463e-02,1.046962e-01,1.166566e-01,1.358523e-01,
-1.598219e-01,1.652351e-01,1.802360e-01,2.103396e-01,
-2.392354e-01,2.356368e-01,2.383760e-01,2.764771e-01,
-3.369195e-01,3.094681e-01,3.070183e-01,3.442725e-01,
-3.874754e-01,3.517370e-01,3.469518e-01,3.828618e-01,
-4.317810e-01,4.006273e-01,4.085950e-01,4.085601e-01,
-5.117177e-01,4.676308e-01,4.569640e-01,4.797870e-01,
-5.522625e-01,5.646438e-01,5.415177e-01,5.621283e-01,
-5.795870e-01,6.798629e-01,7.120018e-01,7.311886e-01,
-7.255036e-01,7.344115e-01,7.805668e-01,8.851663e-01,
-9.223878e-01,9.090908e-01,8.766484e-01,9.892259e-01,
-1.093530e+00,1.114345e+00,1.086616e+00,1.163685e+00,
-1.280739e+00,1.373748e+00,1.321897e+00,1.375409e+00,
-1.463488e+00,1.511581e+00,1.477543e+00,1.599672e+00,
-1.712394e+00,1.747269e+00,1.693976e+00,1.817578e+00,
-2.171000e+00,1.915004e+00,1.892005e+00,2.025270e+00,
-2.196000e+00,2.073004e+00,2.062005e+00,2.128016e+00,
-2.276000e+00,2.218006e+00,2.230006e+00,2.329018e+00,
-
-},
-{
-2.548307e-04,2.007317e-04,4.499520e-04,1.188860e-03,
-3.733844e-03,4.464190e-04,6.100381e-04,1.652404e-03,
-1.161688e-02,2.206198e-03,9.797027e-04,2.018329e-03,
-2.314672e-02,6.015899e-03,3.854005e-03,2.424878e-03,
-2.968959e-02,9.332017e-03,6.109907e-03,2.706045e-03,
-3.570287e-02,1.239873e-02,1.124944e-02,2.891137e-03,
-4.844249e-02,2.385695e-02,1.585494e-02,4.315774e-03,
-4.921946e-02,3.498655e-02,2.250193e-02,9.550566e-03,
-2.395744e-02,3.907886e-02,3.964889e-02,2.093392e-02,
-7.499905e-03,2.477810e-02,2.842070e-02,2.737855e-02,
-5.496491e-03,1.091623e-02,2.201131e-02,1.921261e-02,
-3.847183e-03,6.090742e-03,7.251039e-03,1.996513e-02,
-3.193289e-03,5.875254e-03,4.790409e-03,1.786269e-02,
-2.979007e-03,6.065955e-03,6.545073e-03,8.263152e-03,
-1.564622e-03,5.016426e-03,5.207464e-03,6.271126e-03,
-3.853600e-03,4.820900e-03,6.313500e-03,6.291284e-03,
-2.563200e-03,3.790400e-03,5.379300e-03,7.561300e-03,
-2.565400e-03,3.831900e-03,5.280200e-03,8.279700e-03,
-
-},
-{
-5.571887e-06,1.361231e-04,2.725037e-04,6.631505e-04,
-2.301341e-05,2.542933e-04,5.158191e-04,1.252953e-03,
-4.452569e-05,3.972116e-04,7.765967e-04,1.880492e-03,
-5.958874e-05,5.543761e-04,1.064622e-03,2.507770e-03,
-7.103837e-05,6.368285e-04,1.191760e-03,2.815089e-03,
-7.813261e-05,6.860504e-04,1.421203e-03,3.024347e-03,
-8.142337e-05,8.018469e-04,1.559451e-03,3.694201e-03,
-5.836106e-05,9.885715e-04,1.904179e-03,4.309833e-03,
-8.782401e-05,1.217263e-03,2.468241e-03,5.661651e-03,
-1.805374e-02,1.648287e-03,3.023767e-03,7.118027e-03,
-2.634043e-02,2.366309e-02,3.487779e-03,8.631895e-03,
-2.241779e-02,4.579481e-02,3.820866e-02,1.046125e-02,
-2.405146e-02,6.564541e-02,5.544794e-02,1.642645e-02,
-2.796230e-02,4.863977e-02,4.800098e-02,3.906971e-02,
-3.745824e-02,5.402872e-02,4.988818e-02,4.182413e-02,
-9.045626e-02,3.020097e-02,4.330114e-02,4.271098e-02,
-4.486655e-02,2.116088e-02,3.290107e-02,4.301120e-02,
-2.942762e-02,1.918132e-02,2.870115e-02,4.533135e-02,
-
-},
-{
-2.490971e-05,7.457671e-04,1.624870e-03,4.196570e-03,
-1.234212e-04,1.169008e-03,2.494431e-03,6.505379e-03,
-3.160840e-04,1.706989e-03,3.330410e-03,8.659561e-03,
-4.793708e-04,2.266871e-03,4.320108e-03,1.092500e-02,
-5.378037e-04,2.564215e-03,4.910615e-03,1.222423e-02,
-5.795720e-04,2.952074e-03,5.648813e-03,1.308835e-02,
-7.052051e-04,3.252561e-03,6.188843e-03,1.514512e-02,
-7.763611e-04,3.616575e-03,6.863281e-03,1.731160e-02,
-5.221822e-04,4.374406e-03,8.296453e-03,2.123638e-02,
-5.783944e-04,5.266665e-03,9.988093e-03,2.512805e-02,
-7.999336e-04,6.077652e-03,1.176262e-02,2.924345e-02,
-6.513333e-04,6.113582e-03,1.287679e-02,3.280990e-02,
-6.109250e-04,5.122539e-03,1.201272e-02,3.599422e-02,
-5.915777e-04,5.593279e-03,1.320244e-02,3.827719e-02,
-6.622771e-04,3.847015e-03,1.164147e-02,4.077411e-02,
-1.920547e-03,5.879672e-03,1.483454e-02,4.364789e-02,
-1.101692e-03,7.012498e-03,1.623960e-02,4.776699e-02,
-1.245540e-03,8.272425e-03,1.848258e-02,5.049371e-02,
-
-},
-{
-3.771378e-06,1.366522e-04,2.967020e-04,7.653311e-04,
-7.088088e-05,2.154086e-04,4.557682e-04,1.186591e-03,
-5.502761e-04,3.397790e-04,6.141479e-04,1.578606e-03,
-2.107102e-03,5.352139e-04,8.800038e-04,1.988661e-03,
-3.370820e-03,7.546288e-04,1.072599e-03,2.223432e-03,
-4.668795e-03,1.022163e-03,1.535058e-03,2.380030e-03,
-7.470195e-03,2.341272e-03,2.068702e-03,2.850746e-03,
-5.582660e-03,4.206462e-03,2.998946e-03,3.696213e-03,
-9.685537e-04,4.331039e-03,5.897057e-03,5.878638e-03,
-6.893982e-04,1.962699e-03,3.747073e-03,7.487394e-03,
-8.870731e-04,2.101035e-03,3.135203e-03,6.760196e-03,
-5.356187e-04,3.045464e-03,4.243832e-03,7.393902e-03,
-4.735030e-04,3.782563e-03,5.084168e-03,8.110790e-03,
-4.972539e-04,2.788993e-03,4.618498e-03,9.353449e-03,
-4.242995e-04,2.676837e-03,4.541172e-03,9.767407e-03,
-1.414044e-03,2.517535e-03,5.249992e-03,1.032522e-02,
-3.300342e-04,2.519584e-03,5.119844e-03,1.234227e-02,
-1.866221e-04,2.662918e-03,5.306701e-03,1.307199e-02,
-
-},
-{
-9.455019e-07,5.151808e-05,1.117829e-04,2.882281e-04,
-2.232505e-06,8.029951e-05,1.717731e-04,4.469390e-04,
-6.836223e-06,1.151794e-04,2.290949e-04,5.949837e-04,
-3.313334e-05,1.483363e-04,2.945120e-04,7.503477e-04,
-6.736667e-05,1.679304e-04,3.335481e-04,8.395954e-04,
-1.247014e-04,1.948988e-04,3.878231e-04,8.990899e-04,
-3.621839e-04,2.331503e-04,4.297748e-04,1.051289e-03,
-7.180272e-04,3.186369e-04,5.078613e-04,1.216990e-03,
-2.311555e-04,5.492023e-04,7.524818e-04,1.556575e-03,
-2.196609e-04,5.836734e-04,9.220928e-04,1.920263e-03,
-2.767304e-04,7.972837e-04,1.015380e-03,2.198472e-03,
-1.887811e-04,1.148456e-03,1.511265e-03,2.545408e-03,
-2.277470e-04,1.842968e-03,1.907678e-03,2.872014e-03,
-3.411340e-04,1.315514e-03,1.897493e-03,3.416546e-03,
-3.504029e-04,1.486829e-03,2.012915e-03,3.675149e-03,
-2.503435e-03,1.097400e-03,2.109200e-03,3.984398e-03,
-6.246600e-04,1.042920e-03,2.058100e-03,4.734800e-03,
-2.920890e-04,1.122820e-03,2.193400e-03,5.149200e-03,
-
-},
-{
-1.003708e-06,5.549328e-05,1.204001e-04,3.104456e-04,
-2.090670e-06,8.651391e-05,1.850165e-04,4.813928e-04,
-3.027505e-06,1.240917e-04,2.468060e-04,6.408511e-04,
-3.979750e-06,1.594153e-04,3.171329e-04,8.081894e-04,
-4.539297e-06,1.793837e-04,3.587247e-04,9.043193e-04,
-5.069056e-06,2.064813e-04,4.146332e-04,9.684054e-04,
-7.413180e-06,2.262163e-04,4.552052e-04,1.132337e-03,
-1.456918e-05,2.615355e-04,5.236178e-04,1.308336e-03,
-1.150049e-05,3.277709e-04,6.597087e-04,1.651721e-03,
-1.354274e-05,3.924938e-04,7.852946e-04,1.983755e-03,
-1.726750e-05,4.656181e-04,9.124538e-04,2.287597e-03,
-1.690565e-05,5.476943e-04,1.065059e-03,2.604582e-03,
-1.957340e-05,6.417297e-04,1.222199e-03,2.943220e-03,
-2.466138e-05,6.879804e-04,1.350983e-03,3.299326e-03,
-2.855312e-05,7.600837e-04,1.487615e-03,3.634983e-03,
-1.379400e-04,9.100400e-04,1.834180e-03,3.968367e-03,
-7.630900e-05,9.734000e-04,1.966520e-03,4.892300e-03,
-4.450300e-05,1.039640e-03,2.101830e-03,5.266300e-03,
-
-},
-{
-1.778093e-06,9.938364e-05,2.156261e-04,5.559812e-04,
-3.689621e-06,1.549358e-04,3.313467e-04,8.621287e-04,
-5.302608e-06,2.222145e-04,4.420010e-04,1.147705e-03,
-6.825926e-06,2.854161e-04,5.678945e-04,1.447389e-03,
-7.663427e-06,3.210919e-04,6.423236e-04,1.619551e-03,
-8.258400e-06,3.695058e-04,7.422465e-04,1.734322e-03,
-9.513126e-06,4.039695e-04,8.146398e-04,2.027870e-03,
-1.076578e-05,4.652282e-04,9.363337e-04,2.342809e-03,
-1.365988e-05,5.732944e-04,1.173206e-03,2.956308e-03,
-1.657160e-05,6.786403e-04,1.387979e-03,3.546689e-03,
-1.955280e-05,7.936696e-04,1.606529e-03,4.088933e-03,
-2.264573e-05,9.098115e-04,1.837633e-03,4.647151e-03,
-2.560047e-05,1.020177e-03,2.071569e-03,5.248905e-03,
-2.846764e-05,1.148467e-03,2.319909e-03,5.850658e-03,
-3.131396e-05,1.263550e-03,2.554443e-03,6.452388e-03,
-3.726125e-05,1.565131e-03,3.160576e-03,7.047671e-03,
-4.105254e-05,1.694741e-03,3.425661e-03,8.622082e-03,
-4.456126e-05,1.811054e-03,3.679863e-03,9.274617e-03,
-}
-};
-
-/* Supernova II yields are from Kobayashi 2006. There is a mass gap from 8 - 13 Msun, between AGB and SNII,
- * for which we extrapolate Kobayashi 2006 yields to lower masses.*/
-#define SNII_NMET 4
-#define SNII_NMASS 7
-static const double snii_masses[SNII_NMASS] = { 13.00,15.00,18.00,20.00,25.00,30.00,40.00 };
-static const double snii_metallicities[SNII_NMET] = { 0.0000,0.0010,0.0040,0.0200 };
-static const double snii_total_mass[SNII_NMET*SNII_NMASS] = {
-11.430,11.280,11.250,11.130,
-13.520,13.390,12.890,12.640,
-16.350,16.140,14.980,15.180,
-18.340,17.870,17.760,16.810,
-23.080,22.510,22.350,19.940,
-27.930,26.990,25.000,22.480,
-37.110,34.640,30.120,19.620,
-
-};
-
-static const double snii_total_metals[SNII_NMET*SNII_NMASS] = {
-8.208e-01,9.820e-01,8.451e-01,6.734e-01,
-1.539e+00,7.860e-01,8.312e-01,6.045e-01,
-2.501e+00,1.135e+00,1.451e+00,1.539e+00,
-3.634e+00,3.489e+00,1.778e+00,2.110e+00,
-4.411e+00,5.733e+00,3.703e+00,4.296e+00,
-6.712e+00,7.570e+00,6.944e+00,5.378e+00,
-1.120e+01,1.088e+01,1.171e+01,1.136e+01,
-
-};
-
-static const double snii_yield[NSPECIES][SNII_NMET*SNII_NMASS] = {
-{6.600000e+00,6.440000e+00,6.370000e+00,6.160000e+00,
-7.580000e+00,7.450000e+00,7.110000e+00,6.790000e+00,
-8.430000e+00,8.460000e+00,7.470000e+00,7.530000e+00,
-8.770000e+00,8.430000e+00,8.950000e+00,7.930000e+00,
-1.060000e+01,9.800000e+00,1.020000e+01,8.410000e+00,
-1.170000e+01,1.110000e+01,1.010000e+01,8.750000e+00,
-1.400000e+01,1.290000e+01,1.030000e+01,3.550000e+00,
-
-},
-{
-4.010041e+00,3.860143e+00,4.040170e+00,4.300196e+00,
-4.400041e+00,5.160153e+00,4.950159e+00,5.250218e+00,
-5.420033e+00,6.540157e+00,6.060224e+00,6.110230e+00,
-5.940048e+00,5.940160e+00,7.030175e+00,6.760238e+00,
-8.030211e+00,6.970126e+00,8.480185e+00,7.240221e+00,
-9.520206e+00,8.380144e+00,7.920184e+00,8.360212e+00,
-1.190003e+01,1.090012e+01,8.120180e+00,4.710051e+00,
-
-},
-{
-7.410008e-02,1.071670e-01,8.798800e-02,1.080000e-01,
-1.720001e-01,8.505380e-02,8.830900e-02,6.625000e-02,
-2.190000e-01,1.300720e-01,1.653000e-01,1.373800e-01,
-2.110000e-01,1.280196e-01,9.769200e-02,2.464500e-01,
-2.940000e-01,2.150981e-01,1.323830e-01,2.186000e-01,
-3.380000e-01,1.210820e-01,1.823390e-01,2.519200e-01,
-4.290000e-01,7.398200e-02,4.583680e-01,5.964310e-01,
-
-},
-{
-1.830064e-03,9.077570e-03,9.086840e-03,4.804090e-02,
-1.860069e-03,3.580859e-03,1.290870e-02,6.155970e-02,
-1.890240e-04,4.470921e-03,1.262000e-01,6.611530e-02,
-5.421130e-05,1.290137e-02,1.842780e-02,7.212400e-02,
-5.911180e-04,9.207240e-03,3.159530e-02,1.306000e-01,
-1.656800e-06,6.190379e-03,2.010498e-02,1.020066e-01,
-1.218000e-06,8.692450e-03,2.600501e-02,5.810572e-02,
-
-},
-{
-4.500017e-01,5.058796e-01,3.870375e-01,2.223680e-01,
-7.730065e-01,2.943916e-01,2.930546e-01,1.653520e-01,
-1.380005e+00,4.223302e-01,5.741100e-01,7.825760e-01,
-2.110000e+00,2.180030e+00,9.953840e-01,1.056171e+00,
-2.790002e+00,3.820098e+00,2.200960e+00,2.435640e+00,
-4.810000e+00,5.330076e+00,4.790164e+00,3.227870e+00,
-8.380000e+00,8.370055e+00,7.960996e+00,7.343272e+00,
-
-},
-{
-1.530074e-02,6.751500e-02,1.332350e-01,3.944500e-02,
-3.270537e-01,1.903415e-01,1.258970e-01,3.575000e-02,
-4.941169e-01,1.775626e-01,2.051700e-01,1.558320e-01,
-9.121122e-01,6.283170e-01,2.794180e-01,4.048500e-01,
-5.330335e-01,1.221979e+00,8.249530e-01,8.713900e-01,
-8.511408e-01,1.452171e+00,9.439010e-01,9.585700e-01,
-3.070175e-01,2.879870e-01,1.884040e+00,2.225870e+00,
-
-},
-{
-8.642770e-02,6.583400e-02,4.642000e-02,2.994000e-02,
-6.889700e-02,6.572000e-02,7.848000e-02,4.110000e-02,
-1.584600e-01,6.117300e-02,8.396000e-02,1.159800e-01,
-1.503540e-01,2.468400e-01,1.005800e-01,9.487000e-02,
-1.200906e-01,1.827300e-01,2.457500e-01,2.766000e-01,
-2.273760e-01,2.938200e-01,2.321700e-01,2.472000e-01,
-4.785540e-01,7.073200e-01,4.043000e-01,4.562000e-01,
-
-},
-{
-8.257000e-02,9.317000e-02,6.229700e-02,7.784000e-02,
-7.358800e-02,4.370700e-02,1.054100e-01,8.875000e-02,
-1.167870e-01,1.541350e-01,1.006900e-01,1.147800e-01,
-9.969200e-02,1.298860e-01,1.268800e-01,6.768000e-02,
-3.513464e-01,1.207150e-01,1.225100e-01,1.412500e-01,
-2.488430e-01,1.667390e-01,4.031900e-01,2.579800e-01,
-1.036660e+00,8.971400e-01,5.340500e-01,2.607500e-01,
-
-},
-{
-7.172600e-02,7.559680e-02,7.493770e-02,8.746100e-02,
-7.238000e-02,7.327010e-02,7.540890e-02,8.976000e-02,
-7.227800e-02,7.444550e-02,9.409600e-02,9.294600e-02,
-7.228700e-02,7.404090e-02,7.768170e-02,9.375600e-02,
-7.377700e-02,7.395780e-02,7.734500e-02,9.664700e-02,
-7.457300e-02,7.518580e-02,8.269400e-02,1.038800e-01,
-8.000101e-02,8.257700e-02,8.525500e-02,8.967500e-02,
-}
-};
-
-/* These definitions are here for the tests*/
-#define GSL_WORKSPACE 1000
-
-double compute_imf_norm(gsl_integration_workspace * gsl_work);
-double compute_agb_yield(gsl_interp2d * agb_interp, const double * agb_weights, double stellarmetal, double masslow, double masshigh, gsl_integration_workspace * gsl_work );
-double compute_snii_yield(gsl_interp2d * snii_interp, const double * snii_weights, double stellarmetal, double masslow, double masshigh, gsl_integration_workspace * gsl_work );
-
-double chabrier_mass(double mass, void * params);
-
-double sn1a_number(double dtmyrstart, double dtmyrend, double hub);
-
-void set_metal_params(double Sn1aN0);
-
-void find_mass_bin_limits(double * masslow, double * masshigh, const double dtstart, const double dtend, double stellarmetal, gsl_interp2d * lifetime_tables);
-
-#endif
diff --git a/libgadget/neutrinos_lra.c b/libgadget/neutrinos_lra.c
index 166e1b6b..5af9b3e3 100644
--- a/libgadget/neutrinos_lra.c
+++ b/libgadget/neutrinos_lra.c
@@ -9,10 +9,8 @@
 #include <math.h>
 #include <string.h>
 #include <bigfile-mpi.h>
-#include <gsl/gsl_integration.h>
-#include <gsl/gsl_errno.h>
-#include <gsl/gsl_interp.h>
-#include <gsl/gsl_sf_bessel.h>
+#include <boost/math/interpolators/barycentric_rational.hpp>
+#include <boost/math/special_functions/bessel.hpp>
 
 #include "neutrinos_lra.h"
 
@@ -23,11 +21,10 @@
 #include "cosmology.h"
 #include "powerspectrum.h"
 #include "physconst.h"
+#include "timefac.h"
 
 /** Floating point accuracy*/
 #define FLOAT_ACC   1e-6
-/** Number of bins in integrations*/
-#define GSL_VAL 400
 
 /** Update the last value of delta_tot in the table with a new value computed
  from the given delta_cdm_curr and delta_nu_curr.
@@ -100,19 +97,16 @@ static void delta_tot_first_init(_delta_tot_table * const d_tot, const int nk_in
     d_tot->nk=nk_in;
     const double OmegaNua3=get_omega_nu_nopart(d_tot->omnu, d_tot->TimeTransfer)*pow(d_tot->TimeTransfer,3);
     const double OmegaNu1 = get_omega_nu(d_tot->omnu, 1);
-    gsl_interp_accel *acc = gsl_interp_accel_alloc();
-    gsl_interp * spline;
-    if(t_init->NPowerTable > 2)
-        spline = gsl_interp_alloc(gsl_interp_cspline,t_init->NPowerTable);
-    else
-        spline = gsl_interp_alloc(gsl_interp_linear,t_init->NPowerTable);
-    gsl_interp_init(spline,t_init->logk,t_init->T_nu,t_init->NPowerTable);
+    /* Automatic storage: the interpolator is destroyed when this function returns,
+     * which replaces the gsl_interp_free() call that this change removes (no leak). */
+    boost::math::interpolators::barycentric_rational<double> spline(t_init->logk,t_init->T_nu,t_init->NPowerTable);
 
     /*Check we have a long enough power table: power tables are in log_10*/
     if(log10(wavenum[d_tot->nk-1]) > t_init->logk[t_init->NPowerTable-1])
         endrun(2,"Want k = %g but maximum in CLASS table is %g\n",wavenum[d_tot->nk-1], pow(10, t_init->logk[t_init->NPowerTable-1]));
     for(ik=0;ik<d_tot->nk;ik++) {
             /* T_nu contains T_nu / T_cdm.*/
-            double T_nubyT_nonu = gsl_interp_eval(spline,t_init->logk,t_init->T_nu,log10(wavenum[ik]),acc);
+            double T_nubyT_nonu = spline(log10(wavenum[ik]));
             /*Initialise delta_nu_init to use the first timestep's delta_cdm_curr
              * so that it includes potential Rayleigh scattering. */
             d_tot->delta_nu_init[ik] = delta_cdm_curr[ik]*T_nubyT_nonu;
@@ -122,8 +116,6 @@ static void delta_tot_first_init(_delta_tot_table * const d_tot, const int nk_in
             /*Set up the wavenumber array*/
             d_tot->wavenum[ik] = wavenum[ik];
     }
-    gsl_interp_accel_free(acc);
-    gsl_interp_free(spline);
 
     /*If we are not restarting, make sure we set the scale factor*/
     d_tot->scalefact[0]=log(TimeIC);
@@ -155,19 +147,20 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
         double * logPower = (double *) mymalloc("logpk", PowerSpectrum->nonzero * sizeof(double));
         for(i = 0; i < PowerSpectrum->nonzero; i++)
             logPower[i] = log(PowerSpectrum->Power[i]);
-        gsl_interp * pkint = gsl_interp_alloc(gsl_interp_linear, PowerSpectrum->nonzero);
-        gsl_interp_init(pkint, PowerSpectrum->logknu, logPower, PowerSpectrum->nonzero);
-        gsl_interp_accel * pkacc = gsl_interp_accel_alloc();
+
+        boost::math::interpolators::barycentric_rational<double> pkint(PowerSpectrum->logknu, logPower, PowerSpectrum->nonzero);
+        double xmin = PowerSpectrum->logknu[0];
+        double xmax = PowerSpectrum->logknu[PowerSpectrum->nonzero-1];
+
         for(i = 0; i < delta_tot_table.nk; i++) {
             double logk = log(delta_tot_table.wavenum[i]);
-            if(pkint->xmax < logk || pkint->xmin > logk)
+            if(xmax < logk || xmin > logk)
                 Power_in[i] = delta_tot_table.delta_tot[i][delta_tot_table.ia-1];
             else
-                Power_in[i] = exp(gsl_interp_eval(pkint, PowerSpectrum->logknu, logPower, logk, pkacc));
+                Power_in[i] = exp(pkint(logk));
+
         }
         myfree(logPower);
-        gsl_interp_accel_free(pkacc);
-        gsl_interp_free(pkint);
     }
 
     const double partnu = particle_nu_fraction(&CP->ONu.hybnu, Time, 0);
@@ -202,8 +195,7 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
     }
     double * delta_nu_ratio = (double *) mymalloc2("dnu_rat", delta_tot_table.nk * sizeof(double));
     double * logwavenum = (double *) mymalloc2("logwavenum", delta_tot_table.nk * sizeof(double));
-    gsl_interp * pkint = gsl_interp_alloc(gsl_interp_linear, delta_tot_table.nk);
-    gsl_interp_accel * pkacc = gsl_interp_accel_alloc();
+
     /*We want to interpolate in log space*/
     for(i=0; i < delta_tot_table.nk; i++) {
         if(isnan(delta_tot_table.delta_nu_last[i]))
@@ -216,7 +208,11 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
     }
     if(delta_tot_table.nk != PowerSpectrum->nonzero)
         myfree(Power_in);
-    gsl_interp_init(pkint, logwavenum, delta_nu_ratio, delta_tot_table.nk);
+
+    /* Interpolate the ratio in log(k). Only the upper bound is kept: the loop below clamps logk to xmax. */
+    boost::math::interpolators::barycentric_rational<double> pkint(logwavenum, delta_nu_ratio, delta_tot_table.nk);
+
+    double xmax = logwavenum[delta_tot_table.nk-1];
 
     /*We want to interpolate in log space*/
     for(i=0; i < PowerSpectrum->nonzero; i++) {
@@ -224,14 +220,12 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
             PowerSpectrum->delta_nu_ratio[i] = delta_nu_ratio[i];
         else {
             double logk = PowerSpectrum->logknu[i];
-            if(logk > pkint->xmax)
-                logk = pkint->xmax;
-            PowerSpectrum->delta_nu_ratio[i] = gsl_interp_eval(pkint, logwavenum, delta_nu_ratio, logk, pkacc);
+            if(logk > xmax)
+                logk = xmax;
+            PowerSpectrum->delta_nu_ratio[i] = pkint(logk);
         }
     }
 
-    gsl_interp_accel_free(pkacc);
-    gsl_interp_free(pkint);
     myfree(logwavenum);
     myfree(delta_nu_ratio);
 }
@@ -257,9 +251,6 @@ void powerspectrum_nu_save(struct _powerspectrum * PowerSpectrum, const char * O
     }
     fclose(fp);
     myfree(fname);
-    /*Clean up the neutrino memory now we saved the power spectrum.*/
-    gsl_interp_free(PowerSpectrum->nu_spline);
-    gsl_interp_accel_free(PowerSpectrum->nu_acc);
 }
 
 void petaio_save_neutrinos(BigFile * bf, int ThisTask)
@@ -552,17 +543,19 @@ Result is in Unit_Length/Unit_Time.
 ******************************************************************************************************/
 double fslength(Cosmology * CP, const double logai, const double logaf, const double light)
 {
-  double abserr;
-  double fslength_val;
-  gsl_function F;
-  gsl_integration_workspace * w = gsl_integration_workspace_alloc (GSL_VAL);
-  F.function = &fslength_int;
-  F.params = CP;
-  if(logai >= logaf)
-      return 0;
-  gsl_integration_qag (&F, logai, logaf, 0, 1e-6,GSL_VAL,6,w,&(fslength_val), &abserr);
-  gsl_integration_workspace_free (w);
-  return light*fslength_val;
+    double abserr;
+    if (logai >= logaf)
+        return 0;
+
+    // Adapt fslength_int's (double, void *) kernel signature to a one-argument callable, passing CP as its params
+    auto integrand = [CP](double loga) {
+        return fslength_int(loga, (void *)CP);
+    };
+
+    // Integrate over [logai, logaf]; 1e-6 is the value the removed gsl_integration_qag call used as its relative error
+    double fslength_val = tanh_sinh_integrate_adaptive(integrand, logai, logaf, &abserr, 1e-6);
+
+    return light * fslength_val;
 }
 
 /**************************************************************************************************
@@ -589,7 +582,9 @@ static inline double specialJ_fit(const double x)
 /*Asymptotic series expansion from YAH. Not good when qc * x is small, but fine otherwise.*/
 static inline double II(const double x, const double qc, const int n)
 {
-    return (n*n+n*n*n*qc+n*qc*x*x - x*x)* qc*gsl_sf_bessel_j0(qc*x) + (2*n+n*n*qc+qc*x*x)*cos(qc*x);
+    /* gsl_sf_bessel_j0 is the *spherical* Bessel function j0(z) = sin(z)/z, so the Boost equivalent is sph_bessel, not cyl_bessel_j. */
+    using boost::math::sph_bessel;
+    return (n*n+n*n*n*qc+n*qc*x*x - x*x) * qc * sph_bessel(0u, qc * x) + (2 * n + n * n * qc + qc * x * x) * cos(qc * x);
 }
 
 /* Fourier transform of truncated Fermi Dirac distribution, with support on q > qc only.
@@ -628,12 +623,10 @@ struct _delta_nu_int_params
     double k;
     /**Neutrino mass divided by k_B T_nu*/
     double mnubykT;
-    gsl_interp_accel *acc;
-    gsl_interp *spline;
+    boost::math::interpolators::barycentric_rational<double>* spline;
     Cosmology * CP;
     /**Precomputed free-streaming lengths*/
-    gsl_interp_accel *fs_acc;
-    gsl_interp *fs_spline;
+    boost::math::interpolators::barycentric_rational<double>* fs_spline;
     double * fslengths;
     double * fsscales;
     /**Make sure this is at the same k as above*/
@@ -648,12 +641,12 @@ struct _delta_nu_int_params
 };
 typedef struct _delta_nu_int_params delta_nu_int_params;
 
-/**GSL integration kernel for get_delta_nu*/
+/**integration kernel for get_delta_nu*/
 double get_delta_nu_int(double logai, void * params)
 {
     delta_nu_int_params * p = (delta_nu_int_params *) params;
-    double fsl_aia = gsl_interp_eval(p->fs_spline,p->fsscales,p->fslengths,logai,p->fs_acc);
-    double delta_tot_at_a = gsl_interp_eval(p->spline,p->scale,p->delta_tot,logai,p->acc);
+    double fsl_aia = (*p->fs_spline)(logai);
+    double delta_tot_at_a = (*p->spline)(logai);
     double specJ = specialJ(p->k*fsl_aia/p->mnubykT, p->qc, p->nufrac_low);
     double ai = exp(logai);
     return fsl_aia/(ai*hubble_function(p->CP, ai)) * specJ * delta_tot_at_a;
@@ -709,19 +702,7 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
   /*If neutrino mass is zero, we are not accurate, just use the initial conditions piece*/
   if(Na > 1 && mnubykT > 0){
         delta_nu_int_params params;
-        params.acc = gsl_interp_accel_alloc();
-        gsl_integration_workspace * w = gsl_integration_workspace_alloc (GSL_VAL);
-        gsl_function F;
-        F.function = &get_delta_nu_int;
-        F.params=&params;
-        /*Use cubic interpolation*/
-        if(Na > 2) {
-                params.spline=gsl_interp_alloc(gsl_interp_cspline,Na);
-        }
-        /*Unless we have only two points*/
-        else {
-                params.spline=gsl_interp_alloc(gsl_interp_linear,Na);
-        }
+
         params.scale=d_tot->scalefact;
         params.mnubykT=mnubykT;
         params.qc = qc;
@@ -731,34 +712,42 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
          * which is exactly where it doesn't matter, but
          * we still want to be safe. */
         int Nfs = Na*16;
-        params.fs_acc = gsl_interp_accel_alloc();
-        params.fs_spline=gsl_interp_alloc(gsl_interp_cspline,Nfs);
+
         params.CP = CP;
         /*Pre-compute the free-streaming lengths, which are scale-independent*/
         double * fslengths = (double *) mymalloc("fslengths", Nfs* sizeof(double));
         double * fsscales = (double *) mymalloc("fsscales", Nfs* sizeof(double));
         for(ik=0; ik < Nfs; ik++) {
             fsscales[ik] = log(d_tot->TimeTransfer) + ik*(log(a) - log(d_tot->TimeTransfer))/(Nfs-1.);
+            if (ik == Nfs-1)
+                fsscales[ik] = log(a); // Make sure the last point is exactly a without precision loss
             fslengths[ik] = fslength(CP, fsscales[ik], log(a),d_tot->light);
         }
         params.fslengths = fslengths;
         params.fsscales = fsscales;
 
-        if(!params.spline || !params.acc || !w || !params.fs_spline || !params.fs_acc || !fslengths || !fsscales)
-              endrun(2016,"Error initialising and allocating memory for gsl interpolator and integrator.\n");
+        /* Automatic storage: the free-streaming interpolator is destroyed when this block exits, so it cannot leak. */
+        boost::math::interpolators::barycentric_rational<double> fs_interp(params.fsscales,params.fslengths,Nfs);
+        params.fs_spline = &fs_interp;
+
+        /* Default approximation order is 3; with fewer than 4 points lower it to Na - 1 (Na > 1 is checked above). */
+        size_t approx_order = 3;
+        if (Na < 4) approx_order = Na - 1;
 
-        gsl_interp_init(params.fs_spline,params.fsscales,params.fslengths,Nfs);
         for (ik = 0; ik < d_tot->nk; ik++) {
             double abserr,d_nu_tmp;
             params.k=d_tot->wavenum[ik];
             params.delta_tot=d_tot->delta_tot[ik];
-            gsl_interp_init(params.spline,params.scale,params.delta_tot,Na);
-            gsl_integration_qag (&F, log(d_tot->TimeTransfer), log(a), 0, relerr,GSL_VAL,6,w,&d_nu_tmp, &abserr);
+            /* Built per wavenumber on the stack, so every iteration releases its interpolator instead of leaking it. */
+            boost::math::interpolators::barycentric_rational<double> dtot_interp(params.scale,params.delta_tot,Na,approx_order);
+            params.spline = &dtot_interp;
+            // Define the integrand as a lambda function wrapping get_delta_nu_int
+            auto integrand = [&params](double logai) {
+                return get_delta_nu_int(logai, (void *)&params);
+            };
+            d_nu_tmp = tanh_sinh_integrate_adaptive(integrand, log(d_tot->TimeTransfer), log(a), &abserr, relerr);
             delta_nu_curr[ik] += d_tot->delta_nu_prefac * d_nu_tmp;
          }
-         gsl_integration_workspace_free (w);
-         gsl_interp_free(params.spline);
-         gsl_interp_accel_free(params.acc);
          myfree(fsscales);
          myfree(fslengths);
    }
diff --git a/libgadget/omega_nu_single.c b/libgadget/omega_nu_single.c
index 8cfc849a..083cd2ed 100644
--- a/libgadget/omega_nu_single.c
+++ b/libgadget/omega_nu_single.c
@@ -1,11 +1,11 @@
 #include "omega_nu_single.h"
 
 #include <math.h>
-#include <gsl/gsl_integration.h>
 #include <string.h>
 #include "physconst.h"
 #include "utils/mymalloc.h"
 #include "utils/endrun.h"
+#include "timefac.h"
 
 #define HBAR    6.582119e-16  /*hbar in units of eV s*/
 #define STEFAN_BOLTZMANN 5.670373e-5
@@ -13,8 +13,6 @@
 #define NRHOTAB 200
 /** Floating point accuracy*/
 #define FLOAT_ACC   1e-6
-/** Number of bins in integrations*/
-#define GSL_VAL 200
 
 void init_omega_nu(_omega_nu * omnu, const double MNu[], const double a0, const double HubbleParam, const double tcmb0)
 {
@@ -129,8 +127,7 @@ void rho_nu_init(_rho_nu_single * const rho_nu_tab, double a0, const double mnu,
      /*Make the table over a slightly wider range than requested, in case there is roundoff error*/
      const double logA0=log(a0)-log(1.2);
      const double logaf=log(NU_SW*kBtnu/mnu)+log(1.2);
-     gsl_function F;
-     F.function = &rho_nu_int;
+
      /*Initialise constants*/
      rho_nu_tab->mnu = mnu;
      /*Shortcircuit if we don't need to do the integration*/
@@ -140,23 +137,27 @@ void rho_nu_init(_rho_nu_single * const rho_nu_tab, double a0, const double mnu,
      /*Allocate memory for arrays*/
      rho_nu_tab->loga = (double *) mymalloc("rho_nu_table",2*NRHOTAB*sizeof(double));
      rho_nu_tab->rhonu = rho_nu_tab->loga+NRHOTAB;
-     rho_nu_tab->acc = gsl_interp_accel_alloc();
-     rho_nu_tab->interp=gsl_interp_alloc(gsl_interp_cspline,NRHOTAB);
-     if(!rho_nu_tab->interp || !rho_nu_tab->acc || !rho_nu_tab->loga)
+     if(!rho_nu_tab->loga)
          endrun(2035,"Could not initialise tables for neutrino matter density\n");
 
-     gsl_integration_workspace * w = gsl_integration_workspace_alloc (GSL_VAL);
      for(i=0; i< NRHOTAB; i++){
         double param[2];
         rho_nu_tab->loga[i]=logA0+i*(logaf-logA0)/(NRHOTAB-1);
         param[0]=mnu*exp(rho_nu_tab->loga[i]);
         param[1] = kBtnu;
-        F.params = &param;
-        gsl_integration_qag (&F, 0, 500*kBtnu,0 , 1e-9,GSL_VAL,6,w,&(rho_nu_tab->rhonu[i]), &abserr);
-        rho_nu_tab->rhonu[i]=rho_nu_tab->rhonu[i]/pow(exp(rho_nu_tab->loga[i]),4)*get_rho_nu_conversion();
+
+        // Define the integrand for rho_nu_int
+        auto integrand = [param](double q) {
+            return rho_nu_int(q, (void *)param);
+        };
+
+        // Perform the Tanh-Sinh adaptive integration
+        double result = tanh_sinh_integrate_adaptive(integrand, 0, 500 * kBtnu, &abserr, 1e-9);
+
+        rho_nu_tab->rhonu[i] = result / pow(exp(rho_nu_tab->loga[i]), 4) * get_rho_nu_conversion();
      }
-     gsl_integration_workspace_free (w);
-     gsl_interp_init(rho_nu_tab->interp,rho_nu_tab->loga,rho_nu_tab->rhonu,NRHOTAB);
+
+     rho_nu_tab->interp = new boost::math::interpolators::barycentric_rational<double>(rho_nu_tab->loga, rho_nu_tab->rhonu, NRHOTAB);
      return;
 }
 
@@ -200,7 +201,7 @@ double rho_nu(const _rho_nu_single * rho_nu_tab, const double a, const double kT
             if (!rho_nu_tab->loga || loga < rho_nu_tab->loga[0])
                 rho_nu_val = rel_rho_nu(a,kT);
             else
-                rho_nu_val=gsl_interp_eval(rho_nu_tab->interp,rho_nu_tab->loga,rho_nu_tab->rhonu,loga,rho_nu_tab->acc);
+                rho_nu_val=(*rho_nu_tab->interp)(loga);
         }
         return rho_nu_val;
 }
@@ -217,17 +218,17 @@ double fermi_dirac_kernel(double x, void * params)
  * This is integral f_0(q) q^2 dq between 0 and qc to compute the fraction of OmegaNu which is in particles.*/
 double nufrac_low(const double qc)
 {
-    /*These functions are so smooth that we don't need much space*/
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (100);
     double abserr;
-    gsl_function F;
-    F.function = &fermi_dirac_kernel;
-    F.params = NULL;
-    double total_fd;
-    gsl_integration_qag (&F, 0, qc, 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(total_fd), &abserr);
+    // Define the integrand for Fermi-Dirac kernel
+    auto integrand = [](double x) {
+        return fermi_dirac_kernel(x, NULL);
+    };
+
+    // Use Tanh-Sinh adaptive integration for the Fermi-Dirac kernel
+    double total_fd = tanh_sinh_integrate_adaptive(integrand, 0, qc, &abserr, 1e-6);
     /*divided by the total F-D probability (which is 3 Zeta(3)/2 ~ 1.8 if MAX_FERMI_DIRAC is large enough*/
     total_fd /= 1.5*1.202056903159594;
-    gsl_integration_workspace_free (w);
+
     return total_fd;
 }
 
diff --git a/libgadget/omega_nu_single.h b/libgadget/omega_nu_single.h
index 9cbdbd34..ff24a896 100644
--- a/libgadget/omega_nu_single.h
+++ b/libgadget/omega_nu_single.h
@@ -3,7 +3,11 @@
 /** \file
  * Routines for computing the matter density in a single neutrino species*/
 
-#include <gsl/gsl_interp.h>
+// Undefine P before including Boost
+#ifdef P
+#undef P
+#endif
+#include <boost/math/interpolators/barycentric_rational.hpp>
 
 /** Ratio between the massless neutrino temperature and the CMB temperature.
  * Note there is a slight correction from 4/11
@@ -24,8 +28,7 @@
 struct _rho_nu_single {
     double * loga;
     double * rhonu;
-    gsl_interp * interp;
-    gsl_interp_accel * acc;
+    boost::math::interpolators::barycentric_rational<double>* interp;
     /*Neutrino mass for this structure*/
     double mnu;
 };
diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index aeda7bb3..3bd2fb09 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -7,10 +7,12 @@
 
 #include "types.h"
 #include "petapm.h"
+#include "pm_kernel.cuh"
 
 #include "utils.h"
 #include "walltime.h"
 
+
 static void
 layout_prepare(PetaPM * pm,
                struct Layout * L,
@@ -19,7 +21,7 @@ layout_prepare(PetaPM * pm,
                const int Nregions,
                MPI_Comm comm);
 static void layout_finish(struct Layout * L);
-static void layout_build_and_exchange_cells_to_pfft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
+static void layout_build_and_exchange_cells_to_fft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
 static void layout_build_and_exchange_cells_to_local(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
 
 /* cell_iterator needs to be thread safe !*/
@@ -46,10 +48,10 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 static MPI_Datatype MPI_PENCIL;
 
 /*Used only in MP-GenIC*/
-pfft_complex *
+cufftDoubleComplex *
 petapm_alloc_rhok(PetaPM * pm)
 {
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
+    cufftDoubleComplex * rho_k = (cufftDoubleComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
     memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
     return rho_k;
 }
@@ -57,18 +59,10 @@ petapm_alloc_rhok(PetaPM * pm)
 static void pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions);
 
 static PetaPMParticleStruct * CPS; /* stored by petapm_force, how to access the P array */
-static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access other properties in P, SphP, and Fof */
 #define POS(i) ((double*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_pos]))
 #define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
 #define INACTIVE(i) (CPS->active && !CPS->active(i))
 
-/* (jdavies) reion defs */
-#define TYPE(i) ((int*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS_R->offset_type]))
-#define PI(i) ((int*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS_R->offset_pi]))
-/* NOTE: These are 'myfloat' types */
-#define FESC(i) ((double*) (&((char*)CPS_R->Starslot)[CPS_R->star_elsize * *PI(i) + CPS_R->offset_fesc]))
-#define FESCSPH(i) ((double*) (&((char*)CPS_R->Sphslot)[CPS_R->sph_elsize * *PI(i) + CPS_R->offset_fesc_sph]))
-#define SFR(i) ((double*)  (&((char*)CPS_R->Sphslot)[CPS_R->sph_elsize * *PI(i) + CPS_R->offset_sfr]))
 
 PetaPMRegion * petapm_get_fourier_region(PetaPM * pm) {
     return &pm->fourier_space_region;
@@ -90,11 +84,16 @@ int *petapm_get_ntask2d(PetaPM * pm) {
 void
 petapm_module_init(int Nthreads)
 {
-    pfft_init();
+    // CUDA Device Initialization if necessary (optional if only one GPU is used)
+    int device_id = 0;
+    cudaSetDevice(device_id);  // Set the active GPU device
 
-    pfft_plan_with_nthreads(Nthreads);
+    // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
+    // #ifdef _OPENMP
+    // omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
+    // #endif
+    // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
 
-    /* initialize the MPI Datatype of pencil */
     MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
     MPI_Type_commit(&MPI_PENCIL);
 }
@@ -110,97 +109,143 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     pm->CellSize = BoxSize / Nmesh;
     pm->comm = comm;
 
-    ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
-    ptrdiff_t np[2];
-
     int ThisTask;
     int NTask;
-
     pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
     pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
-
     MPI_Comm_rank(comm, &ThisTask);
     MPI_Comm_size(comm, &NTask);
+    
+    int ndevices = 0;
+    CUDA_CHECK(cudaGetDeviceCount(&ndevices)); /* check device enumeration instead of ignoring its status */
+    CUDA_CHECK(cudaSetDevice(ThisTask % (ndevices > 0 ? ndevices : 1))); /* never take a modulo by zero devices */
 
-    /* try to find a square 2d decomposition */
-    int i;
-    int k;
-    for(i = sqrt(NTask) + 1; i >= 0; i --) {
-        if(NTask % i == 0) break;
-    }
-    np[0] = i;
-    np[1] = NTask / i;
+    message(0, "Cuda Devices %d \n", ndevices);
 
-    message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
-    if( pfft_create_procmesh_2d(comm, np[0], np[1], &pm->priv->comm_cart_2d) ){
-        endrun(0, "Error: This test file only works with %td processes.\n", np[0]*np[1]);
+    /* try to find a square 2d decomposition */
+    /* CUDA NOTE: CufftMp only supports square decomposition, 
+    so Ntask has to be a perfect square*/
+    /* Integer division cannot detect non-squares (e.g. NTask = 5 gives 5/2 == 2), so compare the square itself. */
+    int nranks1d = (int) sqrt(NTask);
+    if (nranks1d * nranks1d != NTask) {
+        endrun(0, "Error: The number of MPI ranks has to be a perfect square for CufftMp\n");
     }
 
-    int periods_unused[2];
-    MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
-
-    if(pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1])
-        endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
-
-    pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
-           PFFT_TRANSPOSED_OUT,
-           pm->real_space_region.size, pm->real_space_region.offset,
-           pm->fourier_space_region.size, pm->fourier_space_region.offset);
-
-    /*
-     * In fourier space, the transposed array is ordered in
-     * are in (y, z, x). The strides and sizes returned
-     * from local size is in (Nx, Ny, Nz), hence we roll them once
-     * so that the strides will give correct linear indexing for
-     * integer coordinates given in order of (y, z, x).
-     * */
-
-#define ROLL(a, N, j) { \
-    typeof(a[0]) tmp[N]; \
-    ptrdiff_t k; \
-    for(k = 0; k < N; k ++) tmp[k] = a[k]; \
-    for(k = 0; k < N; k ++) a[k] = tmp[(k + j)% N]; \
+    message(0, "Using 2D Task mesh %d x %d \n", nranks1d, nranks1d);
+    
+    // Define custom data distribution
+    int64 nx               = Nmesh;
+    int64 ny               = Nmesh;
+    int64 nz               = Nmesh;
+    int64 nz_real          = nz;
+    int64 nz_complex       = (nz/2+1);
+    int64 nz_real_padded   = 2*nz_complex;
+
+    // create 2D cartesian MPI comm without pfft
+    int dims[2] = {nranks1d, nranks1d};
+    int periods[2] = {0, 0};  // non-periodic in both dimensions
+    // Allow the ranks to be reordered by MPI for efficiency
+    // Actually don't allow reordering for now to be safe
+    int reorder = 0;
+    MPI_Cart_create(comm, 2, dims, periods, reorder, &pm->priv->comm_cart_2d);
+    if (pm->priv->comm_cart_2d == MPI_COMM_NULL) {
+        endrun(0, "Error: comm_cart_2d is MPI_COMM_NULL\n");
     }
-
-    ROLL(pm->fourier_space_region.offset, 3, 1);
-    ROLL(pm->fourier_space_region.size, 3, 1);
-
-#undef ROLL
-
-    /* calculate the strides */
-    petapm_region_init_strides(&pm->real_space_region);
-    petapm_region_init_strides(&pm->fourier_space_region);
-
-    /* planning the fft; need temporary arrays */
-
-    double * real = (double * ) mymalloc("PMreal", pm->priv->fftsize * sizeof(double));
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
-    pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-
-    pm->priv->plan_forw = pfft_plan_dft_r2c_3d(
-        n, real, rho_k, pm->priv->comm_cart_2d, PFFT_FORWARD,
-        PFFT_TRANSPOSED_OUT | PFFT_ESTIMATE | PFFT_TUNE | PFFT_DESTROY_INPUT);
-    pm->priv->plan_back = pfft_plan_dft_c2r_3d(
-        n, complx, real, pm->priv->comm_cart_2d, PFFT_BACKWARD,
-        PFFT_TRANSPOSED_IN | PFFT_ESTIMATE | PFFT_TUNE | PFFT_DESTROY_INPUT);
-
-    myfree(complx);
-    myfree(rho_k);
-    myfree(real);
-
+    MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods, pm->ThisTask2d);
+    message(1, "Task = %d ThisTask2d = (%d, %d) Ntask2d = (%d, %d) \n", 
+        ThisTask, pm->ThisTask2d[0], pm->ThisTask2d[1], pm->NTask2d[0], pm->NTask2d[1]);
+
+
+    // compute offset, size and strides
+    auto displacement = [](int64 length, int rank, int size) {
+        int ranks_cutoff = length % size;
+        int chunk_size = length / size;
+        return (rank < ranks_cutoff ? rank * (chunk_size + 1) : ranks_cutoff * (chunk_size + 1) + (rank - ranks_cutoff) * chunk_size);
+    };
+    
+    // update region properties, also have a redundant box struct for now to use box_iterator, will merge it to region
+    auto update_region_and_box = [](int64 lower[3], int64 upper[3], int64 strides[3], PetaPMRegion &region, Box3D &box) {
+        region.totalsize = 1;
+        for (int i = 0; i < 3; i++) {
+            region.offset[i]  = lower[i];
+            region.upper[i]   = upper[i];
+            region.size[i]    = upper[i] - lower[i];
+            region.strides[i] = strides[i];
+            region.totalsize *= region.size[i];
+            // init box3d
+            box.lower[i] = lower[i];
+            box.upper[i] = upper[i];
+            box.strides[i] = strides[i];
+        }
+        region.buffer = NULL;
+    };
+    
+    int i = ThisTask / nranks1d;
+    int j = ThisTask % nranks1d;
+    
+    // real region setup
+    // note the petapm->region has non-padded strides, while cufft takes in padded strides
+    int64 lower_real[3]   = {displacement(nx, i, nranks1d), displacement(ny, j, nranks1d), 0};
+    int64 upper_real[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
+    int64 strides_real[3] = {(upper_real[1] - lower_real[1]) * nz_real_padded, nz_real_padded, 1};
+    int64 strides_real_nopad[3] = {(upper_real[1] - lower_real[1]) * nz_real, nz_real, 1};
+
+    update_region_and_box(lower_real, upper_real, strides_real_nopad, pm->real_space_region, pm->box_real);
+    
+    // complex region setup
+    int64 lower_fourier[3]   = {displacement(nx, i, nranks1d), 0, displacement(nz_complex, j, nranks1d)};
+    int64 upper_fourier[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
+    int64 strides_fourier[3] = {(upper_fourier[1] - lower_fourier[1]) * (upper_fourier[2] - lower_fourier[2]), (upper_fourier[2] - lower_fourier[2]), 1};
+    update_region_and_box(lower_fourier, upper_fourier, strides_fourier, pm->fourier_space_region, pm->box_complex);
+
+    //===============================================================================================
+    CUDA_CHECK(cudaStreamCreate(&pm->priv->stream));
+    CUFFT_CHECK(cufftCreate(&pm->priv->plan_forw));
+    CUFFT_CHECK(cufftCreate(&pm->priv->plan_back));
+    
+    // Attach the MPI communicator to the plans; pass &pm->comm (set above) rather than the address of the by-value parameter, which dangles after return
+    CUFFT_CHECK(cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &pm->comm));
+    CUFFT_CHECK(cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &pm->comm));
+    // Describe the data distribution (only for customized data decomposition, not needed for default slab decomposition)
+    // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
+    // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT and always perform a CUFFT_INVERSE transform
+    // So, in both, the "input" box should be the real box and the "output" box should be the complex box
+    CUFFT_CHECK(cufftXtSetDistribution(pm->priv->plan_forw, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier));
+    CUFFT_CHECK(cufftXtSetDistribution(pm->priv->plan_back, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier));
+
+    // Set the stream
+    CUFFT_CHECK(cufftSetStream(pm->priv->plan_forw, pm->priv->stream));
+    CUFFT_CHECK(cufftSetStream(pm->priv->plan_back, pm->priv->stream));
+
+    // Make the plan
+    size_t workspace;
+    CUFFT_CHECK(cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_D2Z, &workspace));
+    CUFFT_CHECK(cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_Z2D, &workspace));
+
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_DISTRIBUTED_INPUT, i.e., box_real
+//    cudaLibXtDesc *desc;
+    CUFFT_CHECK(cufftXtMalloc(pm->priv->plan_forw, &pm->priv->desc, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT));
+    
+    pm->priv->fftsize = (upper_real[0] - lower_real[0]) * strides_real[0];
+    pm->priv->fftsize_complex = (upper_fourier[0] - lower_fourier[0]) * strides_fourier[0];
+    //===============================================================================================
+    message(1, "Task %d NGPUs=%d, local real size (fftsize)=%d, local fourier size=%d\n", ThisTask, pm->priv->desc->descriptor->nGPUs, pm->priv->fftsize, pm->priv->fftsize_complex);
     /* now lets fill up the mesh2task arrays */
-
-#if 0
-    message(1, "ThisTask = %d (%td %td %td) - (%td %td %td)\n", ThisTask,
-            pm->real_space_region.offset[0],
-            pm->real_space_region.offset[1],
-            pm->real_space_region.offset[2],
-            pm->real_space_region.size[0],
-            pm->real_space_region.size[1],
-            pm->real_space_region.size[2]);
-#endif
-
+    #if 0
+        message(1, "Complex Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
+                pm->fourier_space_region.offset[0],
+                pm->fourier_space_region.offset[1],
+                pm->fourier_space_region.offset[2],
+                pm->fourier_space_region.upper[0],
+                pm->fourier_space_region.upper[1],
+                pm->fourier_space_region.upper[2],
+                pm->fourier_space_region.strides[0],
+                pm->fourier_space_region.strides[1],
+                pm->fourier_space_region.strides[2]);
+    #endif
     int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
+    int k;
     for(k = 0; k < 2; k ++) {
         for(i = 0; i < Nmesh; i ++) {
             tmp[i] = 0;
@@ -211,11 +256,9 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
         /* which column / row hosts this tile? */
         /* FIXME: this is very inefficient */
         MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
-        /*
-        for(i = 0; i < Nmesh; i ++) {
-            message(0, "Mesh2Task[%d][%d] == %d\n", k, i, Mesh2Task[k][i]);
-        }
-        */
+        // for(i = 0; i < Nmesh; i ++) {
+        //     message(0, "Mesh2Task[%d][%d] == %d\n", k, i, pm->Mesh2Task[k][i]);
+        // }
     }
     myfree(tmp);
 }
@@ -223,8 +266,9 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 void
 petapm_destroy(PetaPM * pm)
 {
-    pfft_destroy_plan(pm->priv->plan_forw);
-    pfft_destroy_plan(pm->priv->plan_back);
+    CUFFT_CHECK(cufftDestroy(pm->priv->plan_forw));
+    CUFFT_CHECK(cufftDestroy(pm->priv->plan_back));
+    CUDA_CHECK(cudaStreamDestroy(pm->priv->stream));
     MPI_Comm_free(&pm->priv->comm_cart_2d);
     myfree(pm->Mesh2Task[0]);
 }
@@ -235,15 +279,8 @@ petapm_destroy(PetaPM * pm)
  * */
 typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
 static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
-/* apply transfer function to value, kpos array is in x, y, z order */
-static void pm_apply_transfer_function(PetaPM * pm,
-        pfft_complex * src,
-        pfft_complex * dst, petapm_transfer_func H);
 
 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-static void put_star_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-static void put_sfr_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-
 /*
  * 1. calls prepare to build the Regions covering particles
  * 2. CIC the particles
@@ -270,7 +307,6 @@ petapm_force_init(
     *Nregions = 0;
     PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
     pm_init_regions(pm, regions, *Nregions);
-
     pm_iterate(pm, put_particle_to_mesh, regions, *Nregions);
 
     layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
@@ -279,79 +315,159 @@ petapm_force_init(
     return regions;
 }
 
-pfft_complex * petapm_force_r2c(PetaPM * pm,
+
+cufftDoubleComplex *
+petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ) {
-    /* call pfft rho_k is CFT of rho */
-
-    /* this is because
-     *
-     * CFT = DFT * dx **3
-     * CFT[rho] = DFT [rho * dx **3] = DFT[CIC]
-     * */
+     // CUDA TODO: figureout how to properly get fftsize
     double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
     memset(real, 0, sizeof(double) * pm->priv->fftsize);
-    layout_build_and_exchange_cells_to_pfft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+    layout_build_and_exchange_cells_to_fft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+    
     walltime_measure("/PMgrav/comm2");
-
 #ifdef DEBUG
     verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
     walltime_measure("/PMgrav/Verify");
 #endif
-
-    pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-    pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
+    // pm->priv->desc allocated in init
+    // copy real array to gpu
+    
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_forw, pm->priv->desc, real, CUFFT_COPY_HOST_TO_DEVICE));
+    message(1, "Real array first element %f\n", real[0]);
+    
+    // execute the plan
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_forw, pm->priv->desc, pm->priv->desc, CUFFT_FORWARD));
+    // message(1, "complex array first element %f\n", ((cufftDoubleComplex*)pm->priv->desc->descriptor->data[0])[0].x);
     myfree(real);
 
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
-
+     // CUDA TODO: need to check if the output complex array is transpose
+     // need to verify
+    //=============================== End of R2C =============================================
+    //========================== Begin Transfer Function =====================================
+    int ThisTask;
+    int NTask;
+    MPI_Comm_rank(pm->comm, &ThisTask);
+    MPI_Comm_size(pm->comm, &NTask);
     /*Do any analysis that may be required before the transfer function is applied*/
-    petapm_transfer_func global_readout = global_functions->global_readout;
-    if(global_readout)
-        pm_apply_transfer_function(pm, complx, rho_k, global_readout);
-    if(global_functions->global_analysis)
-        global_functions->global_analysis(pm);
+    /* CUDA Note: global readout and analysis is NULL unless CP->MassiveNuLinRespOn*/
+    /* CUDA TODO: add back the CP->MassiveNuLinRespOn function later*/
+    
     /*Apply the transfer function*/
-    petapm_transfer_func global_transfer = global_functions->global_transfer;
-    pm_apply_transfer_function(pm, complx, rho_k, global_transfer);
+    /* global transfer is potential transfer in gravpm*/
+    // petapm_transfer_func global_transfer = global_functions->global_transfer;
+    // pm_apply_transfer_function(pm, complex_data, rho_k, global_transfer);
+    
+    launch_potential_transfer(pm->box_complex, (cufftDoubleComplex *) pm->priv->desc->descriptor->data[0], ThisTask, NTask, pm, pm->priv->stream);
+    message(1, "Simple kernel suceeded \n");
     walltime_measure("/PMgrav/r2c");
-
-    myfree(complx);
+    cufftDoubleComplex * rho_k = (cufftDoubleComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
     return rho_k;
 }
 
 void
 petapm_force_c2r(PetaPM * pm,
-        pfft_complex * rho_k,
+        cufftDoubleComplex * rho_k,
         PetaPMRegion * regions,
         const int Nregions,
         PetaPMFunctions * functions)
 {
-
-    PetaPMFunctions * f = functions;
-    for (f = functions; f->name; f ++) {
-        petapm_transfer_func transfer = f->transfer;
-        petapm_readout_func readout = f->readout;
-
-        pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-        /* apply the greens function turn rho_k into potential in fourier space */
-        pm_apply_transfer_function(pm, rho_k, complx, transfer);
-        walltime_measure("/PMgrav/calc");
-
-        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-        pfft_execute_dft_c2r(pm->priv->plan_back, complx, real);
-
-        walltime_measure("/PMgrav/c2r");
-        if(f == functions) // Once
-            report_memory_usage("PetaPM");
-        myfree(complx);
-        /* read out the potential: this will copy and free real.*/
-        layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real);
-        walltime_measure("/PMgrav/comm");
-
-        pm_iterate(pm, readout, regions, Nregions);
-        walltime_measure("/PMgrav/readout");
-    }
+    // For grav the functions are: potential, forcex, forcey, forcez, 
+    // where the potential has no transfer function, only readout
+    // as the potential transfer is applied in r2c
+    PetaPMFunctions f;
+    petapm_readout_func readout;
+
+    // c2r on rhok and apply potential readout function
+    // transfer function for x,y,z, c2r, then readout
+    int ThisTask;
+    int NTask;
+    MPI_Comm_rank(pm->comm, &ThisTask);
+    MPI_Comm_size(pm->comm, &NTask);
+    cufftResult res;
+    cudaError_t CudaError;
+    size_t size_cpy = pm->priv->desc->descriptor->size[0];
+    cufftDoubleComplex* complex;
+    
+    // -------------------- force x --------------------------------
+    double * real_fx = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    CUDA_CHECK(cudaMalloc(&complex, size_cpy));
+    CUDA_CHECK(cudaMemcpy(complex, pm->priv->desc->descriptor->data[0], size_cpy, cudaMemcpyDeviceToDevice));
+    launch_force_x_transfer(pm->box_complex, complex, 
+                            ThisTask, NTask, pm, pm->priv->stream);
+
+    cudaLibXtDesc *desc_fx;
+    CUFFT_CHECK(cufftXtMalloc(pm->priv->plan_back, &desc_fx, CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT));
+    CUDA_CHECK(cudaMemcpy(desc_fx->descriptor->data[0], complex, size_cpy, cudaMemcpyDeviceToDevice));
+    // fft back
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_back, desc_fx, desc_fx, CUFFT_INVERSE));
+    cudaStreamSynchronize(pm->priv->stream);
+    // copy back
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_back, real_fx, desc_fx, CUFFT_COPY_DEVICE_TO_HOST));
+    walltime_measure("/PMgrav/c2r");
+    layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real_fx);
+    f = functions[1];
+    readout = f.readout;
+    pm_iterate(pm, readout, regions, Nregions);
+    walltime_measure("/PMgrav/readout");
+    CUFFT_CHECK(cufftXtFree(desc_fx));
+
+    // -------------------- force y--------------------------------
+    double * real_fy = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    CUDA_CHECK(cudaMemcpy(complex, pm->priv->desc->descriptor->data[0], size_cpy, cudaMemcpyDeviceToDevice));
+    launch_force_y_transfer(pm->box_complex, complex, 
+                            ThisTask, NTask, pm, pm->priv->stream);
+
+    cudaLibXtDesc *desc_fy;
+    CUFFT_CHECK(cufftXtMalloc(pm->priv->plan_back, &desc_fy, CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT));
+    CUDA_CHECK(cudaMemcpy(desc_fy->descriptor->data[0], complex, size_cpy, cudaMemcpyDeviceToDevice));
+    // fft back
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_back, desc_fy, desc_fy, CUFFT_INVERSE));
+    cudaStreamSynchronize(pm->priv->stream);
+    // copy back
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_back, real_fy, desc_fy, CUFFT_COPY_DEVICE_TO_HOST));
+    walltime_measure("/PMgrav/c2r");
+    layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real_fy);
+    f = functions[2];
+    readout = f.readout;
+    pm_iterate(pm, readout, regions, Nregions);
+    walltime_measure("/PMgrav/readout");
+    CUFFT_CHECK(cufftXtFree(desc_fy));
+    // -------------------- force z (must use the z transfer kernel, not the y one) --------------------------------
+    double * real_fz = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    CUDA_CHECK(cudaMemcpy(complex, pm->priv->desc->descriptor->data[0], size_cpy, cudaMemcpyDeviceToDevice));
+    launch_force_z_transfer(pm->box_complex, complex, 
+                            ThisTask, NTask, pm, pm->priv->stream);
+
+    cudaLibXtDesc *desc_fz;
+    CUFFT_CHECK(cufftXtMalloc(pm->priv->plan_back, &desc_fz, CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT));
+    CUDA_CHECK(cudaMemcpy(desc_fz->descriptor->data[0], complex, size_cpy, cudaMemcpyDeviceToDevice));
+    // fft back
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_back, desc_fz, desc_fz, CUFFT_INVERSE));
+    cudaStreamSynchronize(pm->priv->stream);
+    // copy back
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_back, real_fz, desc_fz, CUFFT_COPY_DEVICE_TO_HOST));
+    walltime_measure("/PMgrav/c2r");
+    layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real_fz);
+    f = functions[3];
+    readout = f.readout;
+    pm_iterate(pm, readout, regions, Nregions);
+    walltime_measure("/PMgrav/readout");
+    CUFFT_CHECK(cufftXtFree(desc_fz));
+    // -------------------- potential --------------------------------
+    double * real_pot = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    /* get potential out last: the inverse transform runs in place on pm->priv->desc and overwrites the Fourier data */
+    // fft back
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_back, pm->priv->desc, pm->priv->desc, CUFFT_INVERSE));
+    // copy data back to cpu
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_back, real_pot, pm->priv->desc, CUFFT_COPY_DEVICE_TO_HOST));
+    layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real_pot);
+    walltime_measure("/PMgrav/comm");
+    f = functions[0];
+    readout = f.readout;
+    pm_iterate(pm, readout, regions, Nregions);
+    /* Do not free pm->priv->desc here: petapm_init allocates it once and petapm_force_r2c reuses it on every call. TODO: release it in petapm_destroy. */
+    CUDA_CHECK(cudaFree(complex));
 }
 
 void petapm_force_finish(PetaPM * pm) {
@@ -366,7 +482,7 @@ void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
         void * userdata) {
     int Nregions;
     PetaPMRegion * regions = petapm_force_init(pm, prepare, pstruct, &Nregions, userdata);
-    pfft_complex * rho_k = petapm_force_r2c(pm, global_functions);
+    cufftDoubleComplex * rho_k = petapm_force_r2c(pm, global_functions);
     if(functions)
         petapm_force_c2r(pm, rho_k, regions, Nregions, functions);
     myfree(rho_k);
@@ -376,205 +492,7 @@ void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
     petapm_force_finish(pm);
 }
 
-/* These functions are for the excursion set reionization module*/
-
-/* initialise one set of regions with custom iterator
- * this is the same as petapm_force_init with a custom iterator
- * (and no CPS definition since it's called multiple times)*/
-PetaPMRegion *
-petapm_reion_init(
-        PetaPM * pm,
-        petapm_prepare_func prepare,
-        pm_iterator iterator,
-        PetaPMParticleStruct * pstruct,
-        int * Nregions,
-        void * userdata) {
-
-    *Nregions = 0;
-    PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
-    pm_init_regions(pm, regions, *Nregions);
-
-    walltime_measure("/PMreion/Misc");
-    pm_iterate(pm, iterator, regions, *Nregions);
-    walltime_measure("/PMreion/cic");
-
-    layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
-
-    walltime_measure("/PMreion/comm");
-    return regions;
-}
-
-/* 30Mpc to 0.5 Mpc with a delta of 1.1 is ~50 iterations, this should be more than enough*/
-#define MAX_R_ITERATIONS 10000
-
-/* differences from force c2r (why I think I need this separate)
- * radius loop (could do this with long list of same function + global R)
- * I'm pretty sure I need a third function type (reion loop) with all three grids
- * ,after c2r but iteration over the grid, instead of particles */
-void
-petapm_reion_c2r(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        pfft_complex * mass_unfiltered, pfft_complex * star_unfiltered, pfft_complex * sfr_unfiltered,
-        PetaPMRegion * regions,
-        const int Nregions,
-        PetaPMFunctions * functions,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr)
-{
-    PetaPMFunctions * f = functions;
-    double R = fmin(R_max,pm_mass->BoxSize);
-    int last_step = 0;
-    int f_count = 0;
-    petapm_readout_func readout = f->readout;
-
-    /* TODO: seriously re-think the allocation ordering in this function */
-    double * mass_real = (double * ) mymalloc2("mass_real", pm_mass->priv->fftsize * sizeof(double));
-
-    //TODO: add CellLengthFactor for lowres (>1Mpc, see old find_HII_bubbles function)
-    while(!last_step) {
-        f_count++;
-        //The last step will be unfiltered
-        if(R/R_delta < R_min || R/R_delta < (pm_mass->CellSize) || f_count > MAX_R_ITERATIONS)
-        {
-            last_step = 1;
-            R = pm_mass->CellSize;
-        }
-
-        //NOTE: The PetaPM structs for reionisation use the G variable for filter radius in order to use
-        //the transfer functions correctly
-        pm_mass->G = R;
-        pm_star->G = R;
-        if(use_sfr)pm_sfr->G = R;
-
-        //TODO: maybe allocate and free these outside the loop
-        pfft_complex * mass_filtered = (pfft_complex *) mymalloc("mass_filtered", pm_mass->priv->fftsize * sizeof(double));
-        pfft_complex * star_filtered = (pfft_complex *) mymalloc("star_filtered", pm_star->priv->fftsize * sizeof(double));
-        pfft_complex * sfr_filtered;
-        if(use_sfr){
-            sfr_filtered = (pfft_complex *) mymalloc("sfr_filtered", pm_sfr->priv->fftsize * sizeof(double));
-        }
-
-        /* apply the filtering at this radius */
-        /*We want the last step to be unfiltered,
-         *  calling apply transfer with NULL should just copy the grids */
-
-        petapm_transfer_func transfer = last_step ? NULL : f->transfer;
-
-        pm_apply_transfer_function(pm_mass, mass_unfiltered, mass_filtered, transfer);
-        pm_apply_transfer_function(pm_star, star_unfiltered, star_filtered, transfer);
-        if(use_sfr){
-            pm_apply_transfer_function(pm_sfr, sfr_unfiltered, sfr_filtered, transfer);
-        }
-        walltime_measure("/PMreion/calc");
-
-        double * star_real = (double * ) mymalloc2("star_real", pm_star->priv->fftsize * sizeof(double));
-        /* back to real space */
-        pfft_execute_dft_c2r(pm_mass->priv->plan_back, mass_filtered, mass_real);
-        pfft_execute_dft_c2r(pm_star->priv->plan_back, star_filtered, star_real);
-        double * sfr_real = NULL;
-        if(use_sfr){
-            sfr_real = (double * ) mymalloc2("sfr_real", pm_sfr->priv->fftsize * sizeof(double));
-            pfft_execute_dft_c2r(pm_sfr->priv->plan_back, sfr_filtered, sfr_real);
-            myfree(sfr_filtered);
-        }
-        walltime_measure("/PMreion/c2r");
-
-        myfree(star_filtered);
-        myfree(mass_filtered);
-
-        /* the reion loop calculates the J21 and stores it,
-         * for now the mass_real grid will be reused to hold J21
-         * on the last filtering step*/
-        reion_loop(pm_mass,pm_star,pm_sfr,mass_real,star_real,sfr_real,last_step);
-
-        /* since we don't need to readout star and sfr grids...*/
-        /* on the last step, the mass grid is populated with J21 and read out*/
-        if(sfr_real){
-            myfree(sfr_real);
-        }
-        myfree(star_real);
-
-        R = R / R_delta;
-    }
-    //J21 grid is exchanged to pm_mass buffer and freed
-    layout_build_and_exchange_cells_to_local(pm_mass, &pm_mass->priv->layout, pm_mass->priv->meshbuf, mass_real);
-    walltime_measure("/PMreion/comm");
-    //J21 read out to particles
-    pm_iterate(pm_mass, readout, regions, Nregions);
-    walltime_measure("/PMreion/readout");
-}
-
-/* We need a slightly different flow for reionisation, so I
- * will define these here instead of messing with the force functions.
- * The c2r function is the same, however we need a new function, reion_loop
- * to run over all three filtered grids, after the inverse transform.
- * The c2r function itself is also different since we need to apply the
- * transfer (filter) function on all three grids and run reion_loop before any readout.*/
-void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        petapm_prepare_func prepare,
-        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
-        PetaPMFunctions * functions,
-        PetaPMParticleStruct * pstruct,
-        PetaPMReionPartStruct * rstruct,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr,
-        void * userdata) {
-
-    //assigning CPS here due to three sets of regions
-    CPS = pstruct;
-    CPS_R = rstruct;
-
-    /* initialise regions for each grid
-     * NOTE: these regions should be identical except for the grid buffer */
-    int Nregions_mass, Nregions_star, Nregions_sfr;
-    PetaPMRegion * regions_mass = petapm_reion_init(pm_mass, prepare, put_particle_to_mesh, pstruct, &Nregions_mass, userdata);
-    PetaPMRegion * regions_star = petapm_reion_init(pm_star, prepare, put_star_to_mesh, pstruct, &Nregions_star, userdata);
-    PetaPMRegion * regions_sfr;
-    if(use_sfr){
-        regions_sfr = petapm_reion_init(pm_sfr, prepare, put_sfr_to_mesh, pstruct, &Nregions_sfr, userdata);
-    }
-
-    walltime_measure("/PMreion/comm2");
-
-    //using force r2c since this part can be done independently
-    pfft_complex * mass_unfiltered = petapm_force_r2c(pm_mass, global_functions);
-    pfft_complex * star_unfiltered = petapm_force_r2c(pm_star, global_functions);
-    pfft_complex * sfr_unfiltered = NULL;
-    if(use_sfr){
-        sfr_unfiltered = petapm_force_r2c(pm_sfr, global_functions);
-    }
-
-    //need custom reion_c2r to implement the 3 grid c2r and readout
-    //the readout is only performed on the mass grid so for now I only pass in regions/Nregions for mass
-    if(functions)
-        petapm_reion_c2r(pm_mass, pm_star, pm_sfr,
-               mass_unfiltered, star_unfiltered, sfr_unfiltered,
-               regions_mass, Nregions_mass, functions, reion_loop,
-               R_max, R_min, R_delta, use_sfr);
-
-    //free everything in the correct order
-    if(sfr_unfiltered){
-        myfree(sfr_unfiltered);
-    }
-    myfree(star_unfiltered);
-    myfree(mass_unfiltered);
-
-    if(CPS->RegionInd)
-        myfree(CPS->RegionInd);
-
-    if(use_sfr){
-        myfree(regions_sfr);
-    }
-    myfree(regions_star);
-    myfree(regions_mass);
-
-    if(use_sfr){
-        petapm_force_finish(pm_sfr);
-    }
-    petapm_force_finish(pm_star);
-    petapm_force_finish(pm_mass);
-}
-/* End excursion set reionization module*/
-
+/******************************************************************************************************************************************** */
 /* build a communication layout */
 
 static void layout_build_pencils(PetaPM * pm, struct Layout * L, double * meshbuf, PetaPMRegion * regions, const int Nregions);
@@ -780,15 +698,15 @@ static void layout_finish(struct Layout * L) {
     myfree(L->ibuffer);
 }
 
-/* exchange cells to their pfft host, then reduce the cells to the pfft
+/* exchange cells to their fft host, then reduce the cells to the fft
  * array */
-static void to_pfft(double * cell, double * buf) {
+static void to_fft(double * cell, double * buf) {
 #pragma omp atomic update
             cell[0] += buf[0];
 }
 
 static void
-layout_build_and_exchange_cells_to_pfft(
+layout_build_and_exchange_cells_to_fft(
         PetaPM * pm,
         struct Layout * L,
         double * meshbuf,
@@ -815,7 +733,7 @@ layout_build_and_exchange_cells_to_pfft(
             L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
             L->comm);
 
-#if 0
+#if 1
     double massExport = 0;
     for(i = 0; i < L->NcExport; i ++) {
         massExport += L->BufSend[i];
@@ -832,12 +750,12 @@ layout_build_and_exchange_cells_to_pfft(
     message(0, "totmassExport = %g totmassImport = %g\n", totmassExport, totmassImport);
 #endif
 
-    layout_iterate_cells(pm, L, to_pfft, real);
+    layout_iterate_cells(pm, L, to_fft, real);
     myfree(L->BufRecv);
     myfree(L->BufSend);
 }
 
-/* readout cells on their pfft host, then exchange the cells to the domain
+/* readout cells on their fft host, then exchange the cells to the domain
  * host */
 static void to_region(double * cell, double * region) {
     *region = *cell;
@@ -863,7 +781,7 @@ layout_build_and_exchange_cells_to_local(
     L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
 
     /* exchange cells */
-    /* notice the order is reversed from to_pfft */
+    /* notice the order is reversed from to_fft */
     MPI_Alltoallv(
             L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
             L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
@@ -893,6 +811,7 @@ layout_iterate_cells(PetaPM * pm,
                      double * real)
 {
     int i;
+    message(1, "******** NpImport %d \n", L->NpImport);
 #pragma omp parallel for
     for(i = 0; i < L->NpImport; i ++) {
         struct Pencil * p = &L->PencilRecv[i];
@@ -904,19 +823,21 @@ layout_iterate_cells(PetaPM * pm,
             while(ix >= pm->Nmesh) ix -= pm->Nmesh;
             ix -= pm->real_space_region.offset[k];
             if(ix >= pm->real_space_region.size[k]) {
-                /* serious problem assumption about pfft layout was wrong*/
-                endrun(1, "bad pfft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
+                /* serious problem assumption about fft layout was wrong*/
+                endrun(1, "bad fft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
             }
             linear0 += ix * pm->real_space_region.strides[k];
         }
+        
+        
         int j;
         for(j = 0; j < p->len; j ++) {
             int iz = p->offset[2] + j;
             while(iz < 0) iz += pm->Nmesh;
             while(iz >= pm->Nmesh) iz -= pm->Nmesh;
             if(iz >= pm->real_space_region.size[2]) {
-                /* serious problem assmpution about pfft layout was wrong*/
-                endrun(1, "bad pfft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
+                /* serious problem: assumption about fft layout was wrong */
+                endrun(1, "bad fft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
             }
             ptrdiff_t linear = iz * pm->real_space_region.strides[2] + linear0;
             /*
@@ -1028,6 +949,7 @@ void petapm_region_init_strides(PetaPMRegion * region) {
     region->buffer = NULL;
 }
 
+
 static int pos_get_target(PetaPM * pm, const int pos[2]) {
     int k;
     int task2d[2];
@@ -1053,6 +975,11 @@ static int pencil_cmp_target(const void * v1, const void * v2) {
         ((p2->meshbuf_first < p1->meshbuf_first) - (p1->meshbuf_first < p2->meshbuf_first));
 }
 
+
+
+
+/********************************************************************************************/
+
 #ifdef DEBUG
 static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize) {
     /* verify the density field */
@@ -1087,52 +1014,17 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 }
 #endif
 
-static void pm_apply_transfer_function(PetaPM * pm,
-        pfft_complex * src,
-        pfft_complex * dst, petapm_transfer_func H
-        ){
-    size_t ip = 0;
 
-    PetaPMRegion * region = &pm->fourier_space_region;
 
-#pragma omp parallel for
-    for(ip = 0; ip < region->totalsize; ip ++) {
-        ptrdiff_t tmp = ip;
-        int pos[3];
-        int kpos[3];
-        int64_t k2 = 0.0;
-        int k;
-        for(k = 0; k < 3; k ++) {
-            pos[k] = tmp / region->strides[k];
-            tmp -= pos[k] * region->strides[k];
-            /* lets get the abs pos on the grid*/
-            pos[k] += region->offset[k];
-            /* check */
-            if(pos[k] >= pm->Nmesh) {
-                endrun(1, "position didn't make sense\n");
-            }
-            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
-            /* Watch out the cast */
-            k2 += ((int64_t)kpos[k]) * kpos[k];
-        }
-        /* swap 0 and 1 because fourier space was transposed */
-        /* kpos is y, z, x */
-        pos[0] = kpos[2];
-        pos[1] = kpos[0];
-        pos[2] = kpos[1];
-        dst[ip][0] = src[ip][0];
-        dst[ip][1] = src[ip][1];
-        if(H) {
-            H(pm, k2, pos, &dst[ip]);
-        }
-    }
 
-}
+
 
 
 /**************
  * functions iterating over particle / mesh pairs
  ***************/
+ // can write to some other place and add up later
+ // look for numpy reduce at/bin count
 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
     double Mass = *MASS(i);
     if(INACTIVE(i))
@@ -1140,24 +1032,6 @@ static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weigh
 #pragma omp atomic update
     mesh[0] += weight * Mass;
 }
-//escape fraction scaled GSM
-static void put_star_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
-    if(INACTIVE(i) || *TYPE(i) != 4)
-        return;
-    double Mass = *MASS(i);
-    double fesc = *FESC(i);
-#pragma omp atomic update
-    mesh[0] += weight * Mass * fesc;
-}
-//escape fraciton scaled SFR
-static void put_sfr_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
-    if(INACTIVE(i) || *TYPE(i) != 0)
-        return;
-    double Sfr = *SFR(i);
-    double fesc = *FESCSPH(i);
-#pragma omp atomic update
-    mesh[0] += weight * Sfr * fesc;
-}
 static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
     int64_t result = 0;
     MPI_Allreduce(&input, &result, 1, MPI_INT64, MPI_SUM, comm);
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 2a3a45c8..8d057c62 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -1,14 +1,41 @@
 #ifndef __PETAPM_H__
 #define __PETAPM_H__
-#include <pfft.h>
+#include <cufftMp.h>
 
 #include "powerspectrum.h"
+#include "box_iterator.hpp"
+
+using int64 = long long int;
+
+/* Wrap a CUDA runtime call: any status other than cudaSuccess is reported on stderr with file and line. */
+#define CUDA_CHECK(ans) { gpu_checkAssert((ans), __FILE__, __LINE__); }
+inline void gpu_checkAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+    /* Success is the common case: nothing to report. */
+    if (code == cudaSuccess) return;
+    fprintf(stderr,"CUDA_CHECK: %s %s %d\n", cudaGetErrorString(code), file, line);
+    if (abort) exit(code); /* the error code doubles as the exit status */
+}
+
+/* Wrap a cuFFT call: any status other than CUFFT_SUCCESS is reported on stderr with file and line. */
+#define CUFFT_CHECK(ans) { cufft_check((ans), __FILE__, __LINE__); }
+inline void cufft_check(int code, const char *file, int line, bool abort=true)
+{
+    /* Success is the common case: nothing to report. */
+    if (code == CUFFT_SUCCESS) return;
+    fprintf(stderr,"CUFFT_CHECK: %d %s %d\n", code, file, line);
+    if (abort) exit(code); /* the status code doubles as the exit status */
+}
+
+
 
 typedef struct Region {
     /* represents a region in the FFT Mesh */
-    ptrdiff_t offset[3];
-    ptrdiff_t size[3];
-    ptrdiff_t strides[3];
+    int64 offset[3];
+    int64 size[3];
+    int64 upper[3];
+    int64 strides[3];
+
     size_t totalsize;
     double * buffer;
     /* below are used mostly for investigation */
@@ -21,6 +48,8 @@ typedef struct Region {
 /* a layout is the communication object, represent
  * pencil / cells exchanged  */
 
+
+// Layout determines which cells are sent to which task.
 struct Layout {
     MPI_Comm comm;
     int NpExport;
@@ -49,14 +78,17 @@ typedef struct PetaPMPriv {
     /* These varibles are initialized by petapm_init*/
 
     int fftsize;
-    pfft_plan plan_forw;
-    pfft_plan plan_back;
+    int fftsize_complex;
+    cufftHandle plan_forw; // NC:change plan function call
+    cufftHandle plan_back;
+    cudaStream_t stream;
     MPI_Comm comm_cart_2d;
 
     /* these variables are allocated every force calculation */
     double * meshbuf;
     size_t meshbufsize;
     struct Layout layout;
+    cudaLibXtDesc *desc;
 } PetaPMPriv;
 
 typedef struct PetaPM {
@@ -64,6 +96,8 @@ typedef struct PetaPM {
     MPI_Comm comm;
     PetaPMRegion real_space_region;
     PetaPMRegion fourier_space_region;
+    Box3D box_real;
+    Box3D box_complex;
     double CellSize;
     int Nmesh;
     double Asmth;
@@ -86,20 +120,7 @@ typedef struct {
     int64_t NumPart;
 } PetaPMParticleStruct;
 
-/* extra particle info used in reionisation*/
-typedef struct {
-    size_t offset_type; //offset in particle data to type
-    size_t offset_pi; //offset in particle data to property index
-    void * Sphslot; //pointer to SPH slot
-    size_t sph_elsize; //element size of SPH slot
-    size_t offset_sfr; //offset in SPH slot to star formation rate
-    size_t offset_fesc_sph; //offset in SPH slot to escape fraction
-    void* Starslot; //pointer to fof groups
-    size_t star_elsize; //element size of fof group
-    size_t offset_fesc; //offset in fof groups to fof mass
-} PetaPMReionPartStruct;
-
-typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftDoubleComplex * value); //NC:change to complex type
 typedef void (*petapm_readout_func)(PetaPM * pm, int i, double * mesh, double weight);
 typedef PetaPMRegion * (*petapm_prepare_func)(PetaPM * pm, PetaPMParticleStruct * pstruct, void * data, int *Nregions);
 
@@ -109,9 +130,6 @@ typedef struct {
     petapm_readout_func readout;
 } PetaPMFunctions;
 
-/* Reion Loop function, applied after c2r, doesn't iterate over all particles*/
-typedef void (*petapm_reion_func)(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr, double * mass_real, double * star_real, double * sfr_real, int last_step);
-
 /* this mixes up fourier space analysis; with transfer. Shall split them. */
 typedef struct {
     /* this is a fourier space readout; need a better name */
@@ -142,13 +160,14 @@ PetaPMRegion * petapm_force_init(PetaPM * pm,
         PetaPMParticleStruct * pstruct,
         int * Nregions,
         void * userdata);
-pfft_complex * petapm_force_r2c(PetaPM * pm,
+cufftDoubleComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
-        );
+        ); // NC: changed returned complex type
 void petapm_force_c2r(PetaPM * pm,
-        pfft_complex * rho_k, PetaPMRegion * regions,
+            cufftDoubleComplex * rho_k,
+               PetaPMRegion * regions,
         const int Nregions,
-        PetaPMFunctions * functions);
+        PetaPMFunctions * functions); // NC: changed input complex type
 void petapm_force_finish(PetaPM * pm);
 
 PetaPMRegion * petapm_get_fourier_region(PetaPM * pm);
@@ -156,16 +175,5 @@ PetaPMRegion * petapm_get_real_region(PetaPM * pm);
 int petapm_mesh_to_k(PetaPM * pm, int i);
 int *petapm_get_thistask2d(PetaPM * pm);
 int *petapm_get_ntask2d(PetaPM * pm);
-pfft_complex * petapm_alloc_rhok(PetaPM * pm);
-
-void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        petapm_prepare_func prepare,
-        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
-        PetaPMFunctions * functions,
-        PetaPMParticleStruct * pstruct,
-        PetaPMReionPartStruct * rstruct,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr,
-        void * userdata);
-
+cufftDoubleComplex * petapm_alloc_rhok(PetaPM * pm); // NC: changed returned complex type
 #endif
diff --git a/libgadget/pm_kernel.cu b/libgadget/pm_kernel.cu
new file mode 100644
index 00000000..1b58fbdb
--- /dev/null
+++ b/libgadget/pm_kernel.cu
@@ -0,0 +1,166 @@
+// pm_kernel.cu
+#include <cuda_runtime.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include "box_iterator.hpp"
+#include "petapm.h"
+
+
+/* unnormalized sinc function sin(x) / x; for |x| < 1e-5 the quotient is
+ * replaced by the polynomial 1 - x^2/6 + x^4/120 */
+__device__ double sinc_unnormed(double x) {
+    const int near_zero = (x < 1e-5 && x > -1e-5);
+    if(!near_zero)
+        return sin(x) / x;
+    const double xsq = x * x;
+    return 1.0 - xsq / 6. + xsq  * xsq / 120.;
+}
+
+
+/* Differentiation filter for the force transfer functions, which are applied
+ * to the potential in fourier space.
+ * Super Lanczos, order N = 1: CH6 P 122, Digital Filters by Richard W. Hamming.
+ * This is the same as GADGET-2 but in fourier space (see the gadget-2 paper
+ * and Hamming's book): c1 = 2 / 3, c2 = 1 / 12. */
+__device__ double diff_kernel(double w) {
+    const double s1 = sin (w);
+    const double s2 = sin (2 * w);
+    const double weighted = 8 * s1 - s2;
+    return 1 / 6.0 * weighted;
+}
+
+
+/* Multiply each complex mode in [begin, end) by fac = -G / (pi * BoxSize) * exp(-k2 * asmth2) / k2 * f * f,
+ * with f = prod_i 1 / sinc^2(pi * k_i / Nmesh); the k2 == 0 mode is set to zero instead.
+ * NOTE(review): pm is dereferenced on the device; it must be device-accessible memory -- confirm at call site. */
+__global__
+void potential_transfer_kernel(BoxIterator<cufftDoubleComplex> begin, BoxIterator<cufftDoubleComplex> end, PetaPM *pm) {
+    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    begin += tid;
+
+    if (begin < end) {
+        // Global 3D coordinates of the current element
+        int x = begin.x();
+        int y = begin.y();
+        int z = begin.z();
+
+        // Compute the corresponding wave numbers (kx, ky, kz), in grid unit
+        int kx = x<=pm->Nmesh/2 ? x : (x-pm->Nmesh);
+        int ky = y<=pm->Nmesh/2 ? y : (y-pm->Nmesh);
+        int kz = z<=pm->Nmesh/2 ? z : (z-pm->Nmesh);
+        int64_t k2 = 0;
+        k2 += ((int64_t)kx) * kx;
+        k2 += ((int64_t)ky) * ky;
+        k2 += ((int64_t)kz) * kz;
+
+        // Handle the zero mode first so that nothing below divides by k2 == 0
+        if (k2 == 0) {
+            begin->x = 0.0;
+            begin->y = 0.0;
+            return;
+        }
+
+        const double asmth2 = pow((2 * M_PI) * pm->Asmth / pm->Nmesh, 2);
+        double f = 1.0;
+        const double smth = exp(-k2 * asmth2) / k2;
+        const double pot_factor = -pm->G / (M_PI * pm->BoxSize);
+
+        int kpos[3] = {kx, ky, kz};
+        // Apply CIC deconvolution
+        for (int k = 0; k < 3; k++) {
+            double tmp = (kpos[k] * M_PI) / pm->Nmesh;
+            tmp = sinc_unnormed(tmp);
+            f *= 1.0 / (tmp * tmp);
+        }
+        const double fac = pot_factor * smth * f * f;
+        //CUDA TODO: add massive neutrino back
+
+        // Apply scaling factor
+        begin->x *= fac;
+        begin->y *= fac;
+    }
+}
+
+
+/* Multiply each complex mode in [begin, end) by i * fac, where
+ * fac = -diff_kernel(2 * pi * k / Nmesh) * Nmesh / BoxSize and k is the wave number along axis ik (0 = x, 1 = y, 2 = z).
+ * NOTE(review): pm is dereferenced on the device; it must be device-accessible memory -- confirm at call site. */
+__global__
+void force_transfer_kernel(BoxIterator<cufftDoubleComplex> begin, BoxIterator<cufftDoubleComplex> end, PetaPM *pm, int ik) {
+    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    begin += tid;
+
+    if (begin < end) {
+        // Pick the global coordinate of the current element along the requested axis
+        int pos;
+        switch (ik) {
+            case 0:
+                pos = begin.x();
+                break;
+            case 1:
+                pos = begin.y();
+                break;
+            case 2:
+                pos = begin.z();
+                break;
+            default: // unknown axis: leave the element untouched rather than read an uninitialized pos
+                return;
+        }
+        // Wave number along this axis, in grid unit
+        int kpos = pos<=pm->Nmesh/2 ? pos : (pos-pm->Nmesh);
+        /* negative sign is from force_x = - Del_x pot; filter is i K(w) */
+        double fac = -1 * diff_kernel (kpos * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
+        // (x + i y) * (i fac) = -y fac + i x fac; read both parts before overwriting either
+        const double re = - begin->y * fac;
+        const double im = begin->x * fac;
+        begin->x = re;
+        begin->y = im;
+    }
+}
+
+
+/* Launch potential_transfer_kernel on stream with enough 256-thread blocks to cover box_complex; rank and size are unused. */
+extern "C" void launch_potential_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) {
+    auto [begin_d, end_d] = BoxIterators(box_complex, data);
+    const size_t num_elements = std::distance(begin_d, end_d), num_threads = 256;
+    if (num_elements == 0) return; // an empty local box would make this a zero-block (invalid) launch
+    potential_transfer_kernel<<<(num_elements + num_threads - 1) / num_threads, num_threads, 0, stream>>>(begin_d, end_d, pm);
+}
+
+
+/* Launch force_transfer_kernel for axis 0 (x) on stream with enough 256-thread blocks to cover box_complex; rank and size are unused. */
+extern "C" void launch_force_x_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) {
+    auto [begin_d, end_d] = BoxIterators(box_complex, data);
+    const size_t num_elements = std::distance(begin_d, end_d), num_threads = 256;
+    if (num_elements == 0) return; // an empty local box would make this a zero-block (invalid) launch
+    force_transfer_kernel<<<(num_elements + num_threads - 1) / num_threads, num_threads, 0, stream>>>(begin_d, end_d, pm, 0);
+}
+
+/* Launch force_transfer_kernel for axis 1 (y) on stream with enough 256-thread blocks to cover box_complex; rank and size are unused. */
+extern "C" void launch_force_y_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) {
+    auto [begin_d, end_d] = BoxIterators(box_complex, data);
+    const size_t num_elements = std::distance(begin_d, end_d), num_threads = 256;
+    if (num_elements == 0) return; // an empty local box would make this a zero-block (invalid) launch
+    force_transfer_kernel<<<(num_elements + num_threads - 1) / num_threads, num_threads, 0, stream>>>(begin_d, end_d, pm, 1);
+}
+
+/* Launch force_transfer_kernel for axis 2 (z) on stream with enough 256-thread blocks to cover box_complex; rank and size are unused. */
+extern "C" void launch_force_z_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) {
+    auto [begin_d, end_d] = BoxIterators(box_complex, data);
+    const size_t num_elements = std::distance(begin_d, end_d), num_threads = 256;
+    if (num_elements == 0) return; // an empty local box would make this a zero-block (invalid) launch
+    force_transfer_kernel<<<(num_elements + num_threads - 1) / num_threads, num_threads, 0, stream>>>(begin_d, end_d, pm, 2);
+}
+
+
+
+
+
+
+
+
+
+
+
diff --git a/libgadget/pm_kernel.cuh b/libgadget/pm_kernel.cuh
new file mode 100644
index 00000000..4a36a999
--- /dev/null
+++ b/libgadget/pm_kernel.cuh
@@ -0,0 +1,15 @@
+// pm_kernel.cuh
+#ifndef PM_KERNEL_H
+#define PM_KERNEL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void launch_potential_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream); /* data is double precision, matching the definition in pm_kernel.cu */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // PM_KERNEL_H
\ No newline at end of file
diff --git a/libgadget/powerspectrum.h b/libgadget/powerspectrum.h
index b9320cbc..315add84 100644
--- a/libgadget/powerspectrum.h
+++ b/libgadget/powerspectrum.h
@@ -3,7 +3,14 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include <gsl/gsl_interp.h>
+
+// Undefine P before including Boost
+#ifdef P
+#undef P
+#endif
+#include <boost/math/interpolators/barycentric_rational.hpp>
+
+#define P PartManager->Base
 
 typedef struct _powerspectrum {
     double * kk;
@@ -20,8 +27,7 @@ typedef struct _powerspectrum {
     double * logknu;
     double * delta_nu_ratio;
     double nu_prefac;
-    gsl_interp *nu_spline;
-    gsl_interp_accel * nu_acc;
+    boost::math::interpolators::barycentric_rational<double>* nu_spline;
 
 } Power;
 
diff --git a/libgadget/run.c b/libgadget/run.c
index f001a265..6b750db0 100644
--- a/libgadget/run.c
+++ b/libgadget/run.c
@@ -25,14 +25,11 @@
 #include "blackhole.h"
 #include "hydra.h"
 #include "sfr_eff.h"
-#include "metal_return.h"
 #include "slotsmanager.h"
 #include "hci.h"
 #include "fof.h"
 #include "cooling_qso_lightup.h"
-#include "lightcone.h"
 #include "timefac.h"
-#include "uvbg.h"
 #include "neutrinos_lra.h"
 #include "stats.h"
 #include "veldisp.h"
@@ -249,9 +246,6 @@ begrun(const int RestartSnapNum, struct header_data * head)
 
     gravshort_fill_ntab(All.ShortRangeForceWindowType, All.Asmth);
 
-    if(All.LightconeOn)
-        lightcone_init(&All.CP, head->TimeSnapshot, head->UnitLength_in_cm, All.OutputDir);
-
     /* Ensure that the timeline runs at least to the current time*/
     if(head->TimeSnapshot > All.TimeMax)
         All.TimeMax = head->TimeSnapshot;
@@ -395,8 +389,8 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
 
         /* We need to re-seed the random number table each timestep.
          * The seed needs to be the same on all processors, and a different
-         * value each timestep. Only the lowest 32 bits are used in the GSL
-         * random number generator. The populated part of the timestep hierarchy
+         * value each timestep. Only the lowest 32 bits are used in some
+         * random number generators. The populated part of the timestep hierarchy
          * is added to the random seed. The current snapshot is folded into
          * bits 32 - 23 so that the random tables do not cycle after every snapshot.
          * We may still cycle after 512 snapshots but that should be far enough apart. */
@@ -599,12 +593,6 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
             if(!gasTree.tree_allocated_flag)
                 force_tree_rebuild_mask(&gasTree, ddecomp, GASMASK | BHMASK, All.OutputDir);
 
-            /* Do this before sfr and bh so the gas hsml always contains DesNumNgb neighbours.*/
-            if(All.MetalReturnOn) {
-                double AvgGasMass = All.CP.OmegaBaryon * 3 * All.CP.Hubble * All.CP.Hubble / (8 * M_PI * All.CP.GravInternal) * pow(PartManager->BoxSize, 3) / header->NTotalInit[0];
-                metal_return(&Act, &gasTree, &All.CP, atime, AvgGasMass);
-            }
-
             /* this will find new black hole seed halos.
              * Note: the FOF code does not know about garbage particles,
              * so ensure we do not have garbage present when we call this.
@@ -625,13 +613,6 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
                     /* Helium reionization by switching on quasar bubbles*/
                     do_heiii_reionization(atime, &fof, &gasTree, &All.CP, &rnd, units.UnitInternalEnergy_in_cgs, fds.FdHelium);
                 }
-#ifdef EXCUR_REION
-                //excursion set reionisation
-                if(CalcUVBG && All.ExcursionSetReionOn) {
-                    calculate_uvbg(&pm_mass, &pm_star, &pm_sfr, WriteSnapshot, SnapshotFileCount, All.OutputDir, atime, &All.CP, units);
-                    message(0,"uvbg calculated\n");
-                }
-#endif // ifdef EXCUR_REION
                 fof_finish(&fof);
             }
 
@@ -658,11 +639,6 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
         /* We don't need this timestep's tree anymore.*/
         force_tree_free(&gasTree);
 
-        /* Compute the list of particles that cross a lightcone and write it to disc.
-         * This should happen when kick and drift times are synchronised.*/
-        if(All.LightconeOn)
-            lightcone_compute(atime, PartManager->BoxSize, &All.CP, Ti_Last, Ti_Next, &rnd);
-
         /* Now done with random numbers*/
         if(rnd.Table)
             free_random_numbers(&rnd);
diff --git a/libgadget/tests/test_cosmology.c b/libgadget/tests/test_cosmology.c
index b7594cd5..35ccdfc8 100644
--- a/libgadget/tests/test_cosmology.c
+++ b/libgadget/tests/test_cosmology.c
@@ -8,7 +8,7 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
-#include <gsl/gsl_sf_hyperg.h>
+#include <boost/math/special_functions/hypergeometric_2F1.hpp>
 #include <libgadget/physconst.h>
 #include <libgadget/cosmology.h>
 #include "stub.h"
@@ -51,8 +51,8 @@ static inline double radgrow(double aa, double omegar) {
 
 //Omega_L + Omega_M = 1 => D+ ~ Gauss hypergeometric function
 static inline double growth(double aa, double omegam) {
-    double omegal = 1-omegam;
-    return aa * gsl_sf_hyperg_2F1(1./3, 1, 11./6, -omegal/omegam*pow(aa,3));
+    double omegal = 1 - omegam;
+    return aa * boost::math::hypergeometric_2F1(1./3, 1, 11./6, -omegal/omegam * pow(aa, 3));
 }
 
 static void test_cosmology(void ** state)
@@ -82,7 +82,6 @@ static void test_cosmology(void ** state)
     assert_true(fabs(GrowthFactor(&CP, 0.01,0.001) - radgrow(0.01, CP.OmegaG)/radgrow(0.001, CP.OmegaG))< 1e-3);
 
     //Check against exact solutions from gr-qc/0504089: No radiation!
-    //Note that the GSL hyperg needs the last argument to be < 1
     double omegam = 0.5;
     setup_cosmology(&CP, omegam, 0.0455, 0.7);
     CP.RadiationOn = 0;
diff --git a/libgadget/tests/test_density.c b/libgadget/tests/test_density.c
index 176d4de0..a716ec36 100644
--- a/libgadget/tests/test_density.c
+++ b/libgadget/tests/test_density.c
@@ -8,7 +8,8 @@
 #include <mpi.h>
 #include <stdio.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 
 #include <libgadget/partmanager.h>
 #include <libgadget/walltime.h>
@@ -28,7 +29,7 @@ struct density_testdata
     struct sph_pred_data sph_pred;
     DomainDecomp ddecomp;
     struct density_params dp;
-    gsl_rng * r;
+    boost::random::mt19937 r;
 };
 
 /* Perform some simple checks on the densities*/
@@ -204,8 +205,9 @@ static void test_density_close(void ** state) {
     do_density_test(state, numpart, 0.131726, 1e-4);
 }
 
-void do_random_test(void **state, gsl_rng * r, const int numpart)
+void do_random_test(void **state, boost::random::mt19937 &r, const int numpart)
 {
+    boost::random::uniform_real_distribution<double> dist(0.0, 1.0);
     /* Create a randomly space set of particles, 8x8x8, all of type 0. */
     int i;
     for(i=0; i<numpart/4; i++) {
@@ -215,7 +217,7 @@ void do_random_test(void **state, gsl_rng * r, const int numpart)
 
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize * gsl_rng_uniform(r);
+            P[i].Pos[j] = PartManager->BoxSize * dist(r);
     }
     for(i=numpart/4; i<3*numpart/4; i++) {
         P[i].Type = 0;
@@ -223,7 +225,7 @@ void do_random_test(void **state, gsl_rng * r, const int numpart)
         P[i].Hsml = PartManager->BoxSize/cbrt(numpart);
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(dist(r)-0.5,2));
     }
     for(i=3*numpart/4; i<numpart; i++) {
         P[i].Type = 0;
@@ -231,7 +233,7 @@ void do_random_test(void **state, gsl_rng * r, const int numpart)
         P[i].Hsml = PartManager->BoxSize/cbrt(numpart);
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(dist(r)-0.5,2));
     }
     do_density_test(state, numpart, 0.187515, 1e-3);
 }
@@ -240,7 +242,7 @@ static void test_density_random(void ** state) {
     /*Set up the particle data*/
     int ncbrt = 32;
     struct density_testdata * data = * (struct density_testdata **) state;
-    gsl_rng * r = (gsl_rng *) data->r;
+    boost::random::mt19937 &r = data->r;
     int numpart = ncbrt*ncbrt*ncbrt;
     /*Allocate tree*/
     /*Base pointer*/
@@ -280,7 +282,6 @@ static int teardown_density(void **state) {
     myfree(data->ddecomp.Tasks);
     myfree(data->ddecomp.TopLeaves);
     myfree(data->ddecomp.TopNodes);
-    free(data->r);
     myfree(data);
     return 0;
 }
@@ -324,8 +325,7 @@ static int setup_density(void **state) {
     data->dp.BlackHoleMaxAccretionRadius = 99999.;
 
     set_densitypar(data->dp);
-    data->r = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(data->r, 0);
+    data->r = boost::random::mt19937(0);
     *state = (void *) data;
     return 0;
 }
diff --git a/libgadget/tests/test_exchange.c b/libgadget/tests/test_exchange.c
index dc6de144..5cdc4616 100644
--- a/libgadget/tests/test_exchange.c
+++ b/libgadget/tests/test_exchange.c
@@ -9,7 +9,6 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
 
 #define qsort_openmp qsort
 
diff --git a/libgadget/tests/test_fof.c b/libgadget/tests/test_fof.c
index 1a5f5852..45f1e72d 100644
--- a/libgadget/tests/test_fof.c
+++ b/libgadget/tests/test_fof.c
@@ -9,7 +9,8 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 
 #define qsort_openmp qsort
 
@@ -30,8 +31,8 @@ setup_particles(int NumPart, double BoxSize)
     int ThisTask;
     MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
 
-    gsl_rng * r = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(r, 0);
+    boost::random::mt19937 r(0);
+    boost::random::uniform_real_distribution<double> dist(0, 1);
 
     particle_alloc_memory(PartManager, BoxSize, 1.5 * NumPart);
     PartManager->NumPart = NumPart;
@@ -46,10 +47,9 @@ setup_particles(int NumPart, double BoxSize)
         P[i].IsGarbage = 0;
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = BoxSize * gsl_rng_uniform(r);
+            P[i].Pos[j] = BoxSize * dist(r);
     }
 
-    gsl_rng_free(r);
     /* TODO: Here create particles in some halo-like configuration*/
 
     return 0;
diff --git a/libgadget/tests/test_forcetree.c b/libgadget/tests/test_forcetree.c
index 6ae7190d..d13f2163 100644
--- a/libgadget/tests/test_forcetree.c
+++ b/libgadget/tests/test_forcetree.c
@@ -10,7 +10,8 @@
 #include <stdio.h>
 #include <time.h>
 #include <omp.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 
 #include <libgadget/forcetree.h>
 #include <libgadget/partmanager.h>
@@ -23,7 +24,7 @@
 struct forcetree_testdata
 {
     DomainDecomp ddecomp;
-    gsl_rng * r;
+    boost::random::mt19937 r;
 };
 
 #define NODECACHE_SIZE 100
@@ -355,8 +356,9 @@ static void test_rebuild_close(void ** state) {
     myfree(PartManager->Base);
 }
 
-void do_random_test(gsl_rng * r, const int numpart, const ForceTree tb, DomainDecomp * ddecomp)
+void do_random_test(boost::random::mt19937 & r, const int numpart, const ForceTree tb, DomainDecomp * ddecomp)
 {
+    boost::random::uniform_real_distribution<double> dist(0, 1);
     /* Create a regular grid of particles, 8x8x8, all of type 1,
      * in a box 8 kpc across.*/
     int i;
@@ -364,19 +366,19 @@ void do_random_test(gsl_rng * r, const int numpart, const ForceTree tb, DomainDe
         P[i].Type = 1;
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize * gsl_rng_uniform(r);
+            P[i].Pos[j] = PartManager->BoxSize * dist(r);
     }
     for(i=numpart/4; i<3*numpart/4; i++) {
         P[i].Type = 1;
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(dist(r)-0.5,2));
     }
     for(i=3*numpart/4; i<numpart; i++) {
         P[i].Type = 1;
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(dist(r)-0.5,2));
     }
     PartManager->NumPart = numpart;
     do_tree_test(numpart, tb, ddecomp);
@@ -387,7 +389,7 @@ static void test_rebuild_random(void ** state) {
     int ncbrt = 64;
     struct forcetree_testdata * data = * (struct forcetree_testdata **) state;
     DomainDecomp ddecomp = data->ddecomp;
-    gsl_rng * r = (gsl_rng *) data->r;
+    boost::random::mt19937 & r = data->r;
     int numpart = ncbrt*ncbrt*ncbrt;
     particle_alloc_memory(PartManager, 8, numpart);
     /*Allocate tree*/
@@ -442,8 +444,7 @@ static int setup_tree(void **state) {
     /*Set up the top-level domain grid*/
     struct forcetree_testdata *data = malloc(sizeof(struct forcetree_testdata));
     trivial_domain(&data->ddecomp);
-    data->r = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(data->r, 0);
+    data->r = boost::random::mt19937(0);
     *state = (void *) data;
     walltime_init(&Clocks);
     return 0;
diff --git a/libgadget/tests/test_gravity.c b/libgadget/tests/test_gravity.c
index d38e9bbb..bb0e40da 100644
--- a/libgadget/tests/test_gravity.c
+++ b/libgadget/tests/test_gravity.c
@@ -9,7 +9,8 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 #include <omp.h>
 
 #include "stub.h"
@@ -30,7 +31,7 @@ static struct ClockTable CT;
 /* The true struct for the state variable*/
 struct forcetree_testdata
 {
-    gsl_rng * r;
+    boost::random::mt19937 r;
 };
 static const double G = 43.0071;
 
@@ -280,25 +281,26 @@ static void test_force_close(void ** state) {
     myfree(P);
 }
 
-void do_random_test(gsl_rng * r, const int numpart)
+void do_random_test(boost::random::mt19937 & r, const int numpart)
 {
+    boost::random::uniform_real_distribution<double> dist(0, 1);
     /* Create a regular grid of particles, 8x8x8, all of type 1,
      * in a box 8 kpc across.*/
     int i;
     for(i=0; i<numpart/4; i++) {
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize * gsl_rng_uniform(r);
+            P[i].Pos[j] = PartManager->BoxSize * dist(r);
     }
     for(i=numpart/4; i<3*numpart/4; i++) {
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(dist(r)-0.5,2));
     }
     for(i=3*numpart/4; i<numpart; i++) {
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(dist(r)-0.5,2));
     }
     PartManager->NumPart = numpart;
     do_force_test(48, 1.5, 0.002, 1);
@@ -308,7 +310,7 @@ static void test_force_random(void ** state) {
     /*Set up the particle data*/
     int numpart = PartManager->NumPart;
     struct forcetree_testdata * data = * (struct forcetree_testdata **) state;
-    gsl_rng * r = data->r;
+    boost::random::mt19937 & r = data->r;
     particle_alloc_memory(PartManager, 8, numpart);
     int i;
     for(i=0; i<2; i++) {
@@ -334,8 +336,7 @@ static int setup_tree(void **state) {
     init_forcetree_params(0.7);
     /*Set up the top-level domain grid*/
     struct forcetree_testdata *data = malloc(sizeof(struct forcetree_testdata));
-    data->r = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(data->r, 0);
+    data->r = boost::random::mt19937(0);
     *state = (void *) data;
     return 0;
 }
diff --git a/libgadget/tests/test_metal_return.c b/libgadget/tests/test_metal_return.c
deleted file mode 100644
index 055dfc2e..00000000
--- a/libgadget/tests/test_metal_return.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*Tests for the drift factor module.*/
-#include <stdarg.h>
-#include <stddef.h>
-#include <setjmp.h>
-#include <cmocka.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <gsl/gsl_integration.h>
-#include <gsl/gsl_interp2d.h>
-#include <stdint.h>
-
-#include "stub.h"
-#include "libgadget/utils/endrun.h"
-#include "libgadget/metal_return.h"
-#include "libgadget/slotsmanager.h"
-#include "libgadget/metal_tables.h"
-
-void test_yields(void ** state)
-{
-    gsl_integration_workspace * gsl_work = gsl_integration_workspace_alloc(GSL_WORKSPACE);
-    set_metal_params(1.3e-3);
-
-    struct interps interp;
-    setup_metal_table_interp(&interp);
-    /* Compute factor to normalise the total mass in the IMF to unity.*/
-    double imf_norm = compute_imf_norm(gsl_work);
-    assert_true(fabs(imf_norm - 0.624632) <  0.01);
-
-    double agbyield = compute_agb_yield(interp.agb_mass_interp, agb_total_mass, 0.01, 1, 40, gsl_work);
-    double agbyield2 = compute_agb_yield(interp.agb_mass_interp, agb_total_mass, 0.01, 1, SNAGBSWITCH, gsl_work);
-    assert_true(fabs(agbyield / agbyield2 - 1) < 1e-3);
-    /* Lifetime is about 200 Myr*/
-    double agbyield3 = compute_agb_yield(interp.agb_mass_interp, agb_total_mass, 0.01, 5, 40, gsl_work);
-
-    /* Integrate the region of the IMF which contains SNII and AGB stars. The yields should never be larger than this*/
-    gsl_function ff = {chabrier_mass, NULL};
-    double agbmax, sniimax, abserr;
-    gsl_integration_qag(&ff, agb_total_mass[0], SNAGBSWITCH, 1e-4, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &agbmax, &abserr);
-    gsl_integration_qag(&ff, SNAGBSWITCH, snii_masses[SNII_NMASS-1], 1e-4, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &sniimax, &abserr);
-
-    double sniiyield = compute_snii_yield(interp.snii_mass_interp, snii_total_mass, 0.01, 1, 40, gsl_work);
-
-    double sn1a = sn1a_number(0, 1500, 0.679)*sn1a_total_metals;
-    assert_true(sn1a < 1.3e-3);
-
-    message(0, "agbyield %g max %g (in 200 Myr: %g)\n", agbyield, agbmax, agbyield3);
-    message(0, "sniiyield %g max %g sn1a %g\n", sniiyield, sniimax, sn1a);
-    message(0, "Total fraction of mass returned %g\n", (sniiyield + sn1a + agbyield)/imf_norm);
-    assert_true(agbyield < agbmax);
-    assert_true(sniiyield < sniimax);
-    assert_true((sniiyield + sn1a + agbyield)/imf_norm < 1.);
-
-    double masslow1, masshigh1;
-    double masslow2, masshigh2;
-    double masslowsum, masshighsum;
-    find_mass_bin_limits(&masslow1, &masshigh1, 0, 30, 0.02, interp.lifetime_interp);
-    find_mass_bin_limits(&masslow2, &masshigh2, 30, 60, 0.02, interp.lifetime_interp);
-    find_mass_bin_limits(&masslowsum, &masshighsum, 0, 60, 0.02, interp.lifetime_interp);
-    message(0, "0 - 30: %g %g 30 - 60 %g %g 0 - 60 %g %g\n", masslow1, masshigh1, masslow2, masshigh2, masslowsum, masshighsum);
-    assert_true(fabs(masslow1 - masshigh2) < 0.01);
-    assert_true(fabs(masslowsum - masslow2) < 0.01);
-}
-
-int main(void) {
-    const struct CMUnitTest tests[] = {
-        cmocka_unit_test(test_yields),
-    };
-    return cmocka_run_group_tests_mpi(tests, NULL, NULL);
-}
diff --git a/libgadget/tests/test_omega_nu_single.c b/libgadget/tests/test_omega_nu_single.c
index 7357cf5a..683d6f2b 100644
--- a/libgadget/tests/test_omega_nu_single.c
+++ b/libgadget/tests/test_omega_nu_single.c
@@ -4,10 +4,10 @@
 #include <cmocka.h>
 #include <stdio.h>
 #include <math.h>
-#include <gsl/gsl_integration.h>
 #include "stub.h"
 #include "../omega_nu_single.h"
 #include "../physconst.h"
+#include "../timefac.h"
 
 #define  T_CMB0      2.7255	/* present-day CMB temperature, from Fixsen 2009 */
 
@@ -33,7 +33,6 @@ static void test_rho_nu_init(void **state) {
 /*Check massless neutrinos work*/
 #define STEFAN_BOLTZMANN 5.670373e-5
 #define OMEGAR (4*STEFAN_BOLTZMANN*8*M_PI*GRAVITY/(3*LIGHTCGS*LIGHTCGS*LIGHTCGS*HUBBLE*HUBBLE*HubbleParam*HubbleParam)*pow(T_CMB0,4))
-#define GSL_VAL 200
 
 
 /* Check that the table gives the right answer. */
@@ -76,17 +75,15 @@ double rho_nu_int(double q, void * params);
 
 double do_exact_rho_nu_integration(double a, double mnu, double rhocrit)
 {
-    gsl_function F;
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (GSL_VAL);
     double abserr;
-    F.function = &rho_nu_int;
     double kTnu = BOLEVK*TNUCMB*T_CMB0;
     double param[2] = {mnu * a, kTnu};
-    F.params = &param;
+    auto integrand = [&param](double q) {
+        return rho_nu_int(q, (void*) &param);
+    };
     double result;
-    gsl_integration_qag (&F, 0, 500*kTnu,0 , 1e-9,GSL_VAL,6,w,&result, &abserr);
+    result = tanh_sinh_integrate_adaptive(integrand, 0, 500*kTnu, &abserr, 1e-9);
     result*=get_rho_nu_conversion()/pow(a,4)/rhocrit;
-    gsl_integration_workspace_free (w);
     return result;
 }
 
diff --git a/libgadget/tests/test_peano.c b/libgadget/tests/test_peano.c
index 049009ce..8f78fd00 100644
--- a/libgadget/tests/test_peano.c
+++ b/libgadget/tests/test_peano.c
@@ -7,7 +7,6 @@
 #include <math.h>
 #include <mpi.h>
 #include <stdio.h>
-#include <gsl/gsl_rng.h>
 
 #include <libgadget/utils/peano.h>
 #include "stub.h"
diff --git a/libgadget/tests/test_slotsmanager.c b/libgadget/tests/test_slotsmanager.c
index 21f54e9f..f6d806e4 100644
--- a/libgadget/tests/test_slotsmanager.c
+++ b/libgadget/tests/test_slotsmanager.c
@@ -7,7 +7,6 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
 
 #include "stub.h"
 
diff --git a/libgadget/tests/test_timefac.c b/libgadget/tests/test_timefac.c
index a983b6fb..7ab23b9e 100644
--- a/libgadget/tests/test_timefac.c
+++ b/libgadget/tests/test_timefac.c
@@ -7,7 +7,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
-#include <gsl/gsl_integration.h>
+#include "../timefac.h"
 #include <stdint.h>
 
 #include "stub.h"
@@ -64,14 +64,12 @@ static inline inttime_t get_ti(double aa)
 double exact_drift_factor(Cosmology * CP, double a1, double a2, int exp)
 {
     double result, abserr;
-    gsl_function F;
-    gsl_integration_workspace *workspace;
-    workspace = gsl_integration_workspace_alloc(10000);
-    F.function = &fac_integ;
+    
     struct fac_params ff = {CP, exp};
-    F.params = &ff;
-    gsl_integration_qag(&F, a1,a2, 0, 1.0e-8, 10000, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
-    gsl_integration_workspace_free(workspace);
+    auto integrand = [&ff](double a) {
+        return fac_integ(a, (void*)&ff);
+    };
+    result = tanh_sinh_integrate_adaptive(integrand, a1, a2, &abserr, 1e-8);
     return result;
 }
 
diff --git a/libgadget/timebinmgr.c b/libgadget/timebinmgr.c
index 4e1e30ce..792dd96e 100644
--- a/libgadget/timebinmgr.c
+++ b/libgadget/timebinmgr.c
@@ -1,13 +1,13 @@
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
-#include <gsl/gsl_integration.h>
 
 #include "timebinmgr.h"
 #include "utils.h"
 #include "cosmology.h"
 #include "physconst.h"
 #include "plane.h"
+#include "timefac.h"
 
 #define MAXTIMES 1024
 /*! table with desired sync points. All forces and phase space variables are synchonized to the same order. */
@@ -116,10 +116,7 @@ static double integrand_time_to_present(double a, void *param)
 //time_to_present in Myr for excursion set syncpoints
 static double time_to_present(double a, Cosmology * CP)
 {
-#define WORKSIZE 1000
 #define SEC_PER_MEGAYEAR 3.155e13
-    gsl_function F;
-    gsl_integration_workspace* workspace;
     double time;
     double result;
     double abserr;
@@ -127,18 +124,17 @@ static double time_to_present(double a, Cosmology * CP)
     double hubble;
     hubble = CP->Hubble / CP->UnitTime_in_s * SEC_PER_MEGAYEAR * CP->HubbleParam;
 
-    workspace = gsl_integration_workspace_alloc(WORKSIZE);
-    F.function = &integrand_time_to_present;
-    F.params = CP;
+    // Define the integrand as a lambda function
+    auto integrand = [CP](double a) {
+        return integrand_time_to_present(a, (void *)CP);
+    };
 
-    gsl_integration_qag(&F, a, 1.0, 1.0 / hubble,
-        1.0e-8, WORKSIZE, GSL_INTEG_GAUSS21, workspace, &result, &abserr);
+    // Perform the Tanh-Sinh adaptive integration
+    result = tanh_sinh_integrate_adaptive(integrand, a, 1.0, &abserr, 1.0e-8, 1.0 / hubble);
 
     //convert to Myr and multiply by h
     time = result / (hubble/CP->Hubble);
 
-    gsl_integration_workspace_free(workspace);
-
     // return time to present as a function of redshift
     return time;
 }
diff --git a/libgadget/timefac.c b/libgadget/timefac.c
index 76d2c84d..ba269cbf 100644
--- a/libgadget/timefac.c
+++ b/libgadget/timefac.c
@@ -3,14 +3,65 @@
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
-#include <gsl/gsl_integration.h>
 
 #include "physconst.h"
 #include "timefac.h"
 #include "timebinmgr.h"
 #include "utils.h"
 
-#define WORKSIZE 10000
+#include <stdio.h>
+#include <math.h>
+#include <boost/math/quadrature/gauss.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>  // For isnan and isinf
+#include <functional>
+#include <boost/math/quadrature/tanh_sinh.hpp>
+
+// Integrate func over [a, b] with Boost tanh-sinh quadrature, raising the
+// max_refinements setting from init_refine in increments of step (up to
+// max_refinements_limit) until two successive estimates agree to within
+// rel_tol or abs_tol. *estimated_error receives the last successive
+// difference, or INFINITY if fewer than two estimates were computed.
+double tanh_sinh_integrate_adaptive(
+    std::function<double(double)> func, double a, double b,
+    double* estimated_error, double rel_tol, double abs_tol,
+    int max_refinements_limit, int init_refine, int step)
+{
+    double result_prev = 0.0;
+    double result_current = 0.0;
+    int converged = 0;
+    int max_refine;
+
+    // Callers pass an uninitialised variable, so define it before any read.
+    *estimated_error = INFINITY;
+
+    for (max_refine = init_refine; max_refine <= max_refinements_limit; max_refine += step) {
+        // Create a Tanh-Sinh integrator with the current max_refinements
+        boost::math::quadrature::tanh_sinh<double> integrator(max_refine);
+        result_current = integrator.integrate(func, a, b);
+        // From the second pass on, compare against the previous estimate.
+        if (max_refine > init_refine) {
+            *estimated_error = fabs(result_current - result_prev);
+            // Multiply instead of dividing so that an integral of exactly
+            // zero converges rather than yielding a NaN relative error.
+            if (*estimated_error <= abs_tol ||
+                *estimated_error <= rel_tol * fabs(result_current)) {
+                converged = 1;
+                break;
+            }
+        }
+        result_prev = result_current;
+    }
+
+    // Warn when the loop ended without meeting either tolerance.
+    if (!converged) {
+        message(1,
+            "Warning: Tanh-Sinh integration reached neither the desired relative tolerance of %g nor absolute tolerance of %g. "
+            "Final absolute error: %g, relative error: %g\n",
+            rel_tol, abs_tol, *estimated_error, (*estimated_error / fabs(result_current)));
+    }
+
+    return result_current;
+}
 
 /* Integrand for the drift table*/
 static double drift_integ(double a, void *param)
@@ -41,21 +92,26 @@ static double hydrokick_integ(double a, void *param)
   return 1 / (h * pow(a, 3 * GAMMA_MINUS1) * a);
 }
 
-/*Do the integral required to get a factor.*/
-static double get_exact_factor(Cosmology * CP, inttime_t t0, inttime_t t1, double (*factor) (double, void *))
+// Function to compute a factor using Tanh-Sinh adaptive integration
+static double get_exact_factor(Cosmology *CP, inttime_t t0, inttime_t t1, double (*factor)(double, void *))
 {
-    double result, abserr;
-    if(t0 == t1)
+    if (t0 == t1) {
         return 0;
+    }
+
+    // Calculate the scale factors
     double a0 = exp(loga_from_ti(t0));
     double a1 = exp(loga_from_ti(t1));
-    gsl_function F;
-    gsl_integration_workspace *workspace;
-    workspace = gsl_integration_workspace_alloc(WORKSIZE);
-    F.function = factor;
-    F.params = CP;
-    gsl_integration_qag(&F, a0, a1, 0, 1.0e-8, WORKSIZE, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
-    gsl_integration_workspace_free(workspace);
+    double abserr;
+
+    // Define the integrand as a lambda function, wrapping the existing factor function
+    auto integrand = [CP, factor](double a) {
+        return factor(a, (void*)CP);
+    };
+
+    // Call the adaptive Tanh-Sinh integrator
+    double result = tanh_sinh_integrate_adaptive(integrand, a0, a1, &abserr);
+
     return result;
 }
 
@@ -79,25 +135,27 @@ double get_exact_hydrokick_factor(Cosmology * CP, inttime_t ti0, inttime_t ti1)
 /* Integrand for comoving distance */
 static double comoving_distance_integ(double a, void *param)
 {
-    Cosmology *CP = (Cosmology *) param;
-    double h = hubble_function(CP, a);
-    return 1. / (h * a * a); 
+    // Delegates to gravkick_integ, which is expected to evaluate the same
+    // 1 / (H(a) * a^2) expression this function previously computed inline
+    // (NOTE(review): gravkick_integ is defined outside this hunk; confirm).
+    return gravkick_integ(a, param);
 }
 
-/* Function to compute the comoving distance between two scale factors */
+/* Function to compute comoving distance using the adaptive integrator */
 double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s)
-{
+{   
+    // The integration below relies on the default tolerances declared in
+    // timefac.h (rel_tol = 1e-8, abs_tol = 0), as the previous GSL call did.
     double result, abserr;
-    gsl_function F;
-    gsl_integration_workspace *workspace = gsl_integration_workspace_alloc(WORKSIZE);
-    
-    F.function = comoving_distance_integ;
-    F.params = CP;
+    // Define the integrand as a lambda function, wrapping comoving_distance_integ
+    auto integrand = [CP](double a) {
+        return comoving_distance_integ(a, (void*)CP);
+    };
 
-    // Using GSL to perform the integration
-    gsl_integration_qag(&F, a0, a1, 0, 1.0e-8, WORKSIZE, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
-    gsl_integration_workspace_free(workspace);
+    // Integrate from a0 to a1 with the adaptive Tanh-Sinh integrator;
+    // abserr receives its error estimate and is not otherwise used here.
+    result = tanh_sinh_integrate_adaptive(integrand, a0, a1, &abserr);
 
-    return (LIGHTCGS/UnitVelocity_in_cm_per_s) * result;
+    // Convert the result using the provided units
+    return (LIGHTCGS / UnitVelocity_in_cm_per_s) * result;
 }
-
diff --git a/libgadget/timefac.h b/libgadget/timefac.h
index 10e85522..c3720811 100644
--- a/libgadget/timefac.h
+++ b/libgadget/timefac.h
@@ -4,10 +4,23 @@
 #include "types.h"
 #include "cosmology.h"
 #include "timebinmgr.h"
+#include <functional>  // For std::function
 
 /* Get the exact drift and kick factors at given time by integrating. */
 double get_exact_drift_factor(Cosmology * CP, inttime_t ti0, inttime_t ti1);
 double get_exact_hydrokick_factor(Cosmology * CP, inttime_t ti0, inttime_t ti1);
 double get_exact_gravkick_factor(Cosmology * CP, inttime_t ti0, inttime_t ti1);
 double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s);
+double tanh_sinh_integrate_adaptive(
+    std::function<double(double)> func, 
+    double a, 
+    double b, 
+    double* estimated_error, 
+    double rel_tol = 1e-8, 
+    double abs_tol = 0,
+    int max_refinements_limit = 30, 
+    int init_refine = 5, 
+    int step = 5
+);
+
 #endif
diff --git a/libgadget/treewalk.c b/libgadget/treewalk.c
index 62acbd4a..959d5b1a 100644
--- a/libgadget/treewalk.c
+++ b/libgadget/treewalk.c
@@ -206,7 +206,7 @@ treewalk_build_queue(TreeWalk * tw, int * active_set, const size_t size, int may
     /* Explicitly deal with the case where the queue is zero and there is nothing to do.
      * Some OpenMP compilers (nvcc) seem to still execute the below loop in that case*/
     if(size == 0) {
-        tw->WorkSet = mymalloc("ActiveQueue", sizeof(int));
+        tw->WorkSet = (int *) mymalloc("ActiveQueue", sizeof(int));
         tw->WorkSetSize = size;
         return;
     }
diff --git a/libgadget/utils/memory.c b/libgadget/utils/memory.c
index 23289363..f0f3df13 100644
--- a/libgadget/utils/memory.c
+++ b/libgadget/utils/memory.c
@@ -5,6 +5,10 @@
 #include "memory.h"
 #include "endrun.h"
 
+#ifdef USE_CUDA
+#include <cuda_runtime.h>
+#endif
+
 #define MAGIC "DEADBEEF"
 #define ALIGNMENT 4096
 
@@ -151,7 +155,11 @@ allocator_alloc_va(Allocator * alloc, const char * name, const size_t request_si
     char * cptr;
     if(alloc->use_malloc) {
         /* prepend a copy of the header to the malloc block; allocator_free will use it*/
+    #ifdef USE_CUDA
+        if (cudaMallocManaged((void **) &cptr, request_size + ALIGNMENT, cudaMemAttachGlobal) != cudaSuccess)
+    #else
         if(posix_memalign((void **) &cptr, ALIGNMENT, request_size + ALIGNMENT))
+    #endif
             endrun(1, "Failed malloc: %lu bytes for %s\n", request_size, header->name);
         header->ptr = cptr + ALIGNMENT;
         memcpy(cptr, header, ALIGNMENT);
diff --git a/libgadget/utils/mymalloc.c b/libgadget/utils/mymalloc.c
index e8f31af7..ae56d410 100644
--- a/libgadget/utils/mymalloc.c
+++ b/libgadget/utils/mymalloc.c
@@ -16,15 +16,15 @@
 /* The main allocator is used to store large objects, e.g. tree, toptree */
 Allocator A_MAIN[1];
 
+#ifdef VALGRIND
+#define allocator_init allocator_malloc_init
+#endif
+
 /* The temp allocator is used to store objects that lives on the stack;
  * replacing alloca and similar cases to avoid stack induced memory fragmentation
  * */
 Allocator A_TEMP[1];
 
-#ifdef VALGRIND
-#define allocator_init allocator_malloc_init
-#endif
-
 void
 tamalloc_init(void)
 {
diff --git a/libgadget/utils/spinlocks.c b/libgadget/utils/spinlocks.c
index d160db10..d3f76d27 100644
--- a/libgadget/utils/spinlocks.c
+++ b/libgadget/utils/spinlocks.c
@@ -54,7 +54,7 @@ struct SpinLocks * init_spinlocks(int NumLock)
     spin.SpinLocks = (pthread_spinlock_t *) mymalloc("SpinLocks", NumLock * sizeof(pthread_spinlock_t));
     #pragma omp parallel for
 #else
-    spin.SpinLocks = mymalloc("SpinLocks", NumLock * sizeof(omp_lock_t));
+    spin.SpinLocks = (omp_lock_t*)mymalloc("SpinLocks", NumLock * sizeof(omp_lock_t));
 #endif
     for(i = 0; i < NumLock; i ++) {
 #ifndef NO_OPENMP_SPINLOCK
diff --git a/libgadget/utils/system.c b/libgadget/utils/system.c
index 9fde1555..42fdf93a 100644
--- a/libgadget/utils/system.c
+++ b/libgadget/utils/system.c
@@ -11,7 +11,8 @@
 #include <sys/resource.h>
 #include <unistd.h>
 #include <signal.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 #include <omp.h>
 
 #define __UTILS_SYSTEM_C
@@ -66,15 +67,14 @@ RandTable set_random_numbers(uint64_t seed, const size_t rndtablesize)
     rnd.Table = (double *) mymalloc2("Random", rndtablesize * sizeof(double));
     rnd.tablesize = rndtablesize;
     /* start-up seed */
-    gsl_rng * random_generator = gsl_rng_alloc(gsl_rng_ranlxd1);
-    gsl_rng_set(random_generator, seed);
 
+    boost::random::mt19937 random_generator(seed);
+    boost::random::uniform_real_distribution<double> dist(0, 1);
     /* Populate a table with uniform random numbers between 0 and 1*/
     size_t i;
     for(i = 0; i < rndtablesize; i++)
-        rnd.Table[i] = gsl_rng_uniform(random_generator);
+        rnd.Table[i] = dist(random_generator);
 
-    gsl_rng_free(random_generator);
     return rnd;
 }
 
diff --git a/libgadget/utils/system.h b/libgadget/utils/system.h
index 7c1dca95..215e76f7 100644
--- a/libgadget/utils/system.h
+++ b/libgadget/utils/system.h
@@ -31,7 +31,7 @@ double get_physmem_bytes(void);
  * independent of processor.*/
 double get_random_number(const uint64_t id, const RandTable * const rnd);
 /* Generate the random number table. The seed should be the same on each processor so the output is invariant to
- * To quote the GSL documentation: 'Note that the most generators only accept 32-bit seeds, with higher values being reduced modulo 2^32.'
+ * the processor. The boost mt19937 generator is seeded with a 32-bit value, so larger seeds are reduced modulo 2^32.
  * It is important that each timestep uses a new seed value, so the seed should change by less than 2^32 each timestep.
  * The random number table is heap-allocated high, and random numbers are uniform doubles between 0 and 1.*/
 RandTable set_random_numbers(uint64_t seed, const size_t rndtablesize);
diff --git a/libgadget/uvbg.c b/libgadget/uvbg.c
deleted file mode 100644
index 17070af6..00000000
--- a/libgadget/uvbg.c
+++ /dev/null
@@ -1,596 +0,0 @@
-/*=============================================================================
- * An implementation of a patchy UV ionising background
- * calculation. This code utilises the decomposition and communication
- * in the long-range force code in petapm.c, some new functions have been
- * written in petapm.c to accomodate the order of operations and multiple grids
- * present in the reionisation model
-============================================================================*/
-
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <bigfile.h>
-#include <bigfile-mpi.h>
-#include <stdbool.h>
-#include <assert.h>
-
-#include "uvbg.h"
-#include "cosmology.h"
-#include "utils.h"
-#include "partmanager.h"
-#include "slotsmanager.h"
-#include "petapm.h"
-#include "physconst.h"
-#include "walltime.h"
-#include "petaio.h"
-
-// TODO(smutch): See if something equivalent is defined anywhere else
-#define FLOAT_REL_TOL (float)1e-5
-
-static struct UVBGParams {
-    /*filter scale parameters*/
-    double ReionRBubbleMax;
-    double ReionRBubbleMin;
-    double ReionDeltaRFactor;
-    int ReionFilterType;
-    int RtoMFilterType;
-
-    /*J21 calculation parameters*/
-    double ReionGammaHaloBias;
-    double ReionNionPhotPerBary;
-    double AlphaUV;
-    double EscapeFractionNorm;
-    double EscapeFractionScaling;
-    int ReionUseParticleSFR;
-    double ReionSFRTimescale;
-    int UVBGdim;
-
-    double Time;
-    Cosmology *CP;
-    double UnitLength_in_cm;
-    double UnitMass_in_g;
-    double UnitTime_in_s;
-
-} uvbg_params;
-
-struct UVBGgrids_type UVBGgrids;
-
-/*set uvbg parameters*/
-void set_uvbg_params(ParameterSet * ps) {
-
-    int ThisTask;
-    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-    if(ThisTask==0)
-    {
-        uvbg_params.ReionFilterType = param_get_int(ps, "ReionFilterType");
-        uvbg_params.RtoMFilterType = param_get_int(ps, "RtoMFilterType");
-        uvbg_params.ReionRBubbleMax = param_get_double(ps, "ReionRBubbleMax");
-        uvbg_params.ReionRBubbleMin = param_get_double(ps, "ReionRBubbleMin");
-        uvbg_params.ReionDeltaRFactor = param_get_double(ps, "ReionDeltaRFactor");
-        uvbg_params.ReionGammaHaloBias = param_get_double(ps, "ReionGammaHaloBias");
-        uvbg_params.ReionNionPhotPerBary = param_get_double(ps, "ReionNionPhotPerBary");
-        uvbg_params.AlphaUV = param_get_double(ps, "AlphaUV");
-        uvbg_params.EscapeFractionNorm = param_get_double(ps, "EscapeFractionNorm");
-        uvbg_params.EscapeFractionScaling = param_get_double(ps, "EscapeFractionScaling");
-        uvbg_params.ReionUseParticleSFR = param_get_int(ps, "ReionUseParticleSFR");
-        uvbg_params.ReionSFRTimescale = param_get_double(ps, "ReionSFRTimescale");
-        uvbg_params.UVBGdim = param_get_int(ps,"UVBGdim");
-    }
-
-    MPI_Bcast(&uvbg_params, sizeof(struct UVBGParams), MPI_BYTE, 0, MPI_COMM_WORLD);
-}
-
-int grid_index(int i, int j, int k, ptrdiff_t strides[3])
-{
-    return k*strides[2] + j*strides[1] + i*strides[0];
-}
-
-
-void save_uvbg_grids(int SnapshotFileCount, char * OutputDir, PetaPM * pm)
-{
-    int n_ranks;
-    int this_rank=-1;
-    int grid_n = pm->real_space_region.size[0] * pm->real_space_region.size[1] * pm->real_space_region.size[2];
-    MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
-    MPI_Comm_rank(MPI_COMM_WORLD, &this_rank);
-
-    //TODO(jdavies): finish this grid writing function, it outputs fine but in the wrong rank order
-    BigFile fout;
-    char fname[256];
-    sprintf(fname, "%s/UVgrids_%03d", OutputDir,SnapshotFileCount);
-    message(0, "saving uv grids to %s \n", fname);
-
-    if(0 != big_file_mpi_create(&fout, fname, MPI_COMM_WORLD)) {
-        endrun(0, "Failed to create snapshot at %s:%s\n", fname,
-                    big_file_get_error_message());
-    }
-
-    BigBlock bh;
-    if(0 != big_file_mpi_create_block(&fout, &bh, "Header", NULL, 0, 0, 0, MPI_COMM_WORLD)) {
-        endrun(0, "Failed to create block at %s:%s\n", "Header",
-                big_file_get_error_message());
-    }
-
-    if(
-    (0 != big_block_set_attr(&bh, "volume_weighted_global_xHI", &(UVBGgrids.volume_weighted_global_xHI), "f8", 1)) ||
-    (0 != big_block_set_attr(&bh, "mass_weighted_global_xHI", &(UVBGgrids.mass_weighted_global_xHI), "f8", 1)) ||
-    (0 != big_block_set_attr(&bh, "scale_factor", &uvbg_params.Time, "f8", 1)) ) {
-        endrun(0, "Failed to write attributes %s\n",
-                    big_file_get_error_message());
-    }
-
-    if(0 != big_block_mpi_close(&bh, MPI_COMM_WORLD)) {
-        endrun(0, "Failed to close block %s\n",
-                    big_file_get_error_message());
-    }
-
-    //TODO: think about the cartesian communicator in the PetaPM struct
-    //and the mapping between ranks, indices and positions
-
-    size_t dims[2] = {grid_n, 1};
-    //J21 block
-    BigArray arr = {0};
-    big_array_init(&arr, UVBGgrids.J21, "=f4", 2, dims, NULL);
-    petaio_save_block(&fout,"J21",&arr,1);
-
-    message(0,"saved J21\n");
-
-    //xHI block
-    BigArray arr2 = {0};
-    big_array_init(&arr2, UVBGgrids.xHI, "=f4", 2, dims, NULL);
-    petaio_save_block(&fout,"XHI",&arr2,1);
-
-    message(0,"saved XHI\n");
-
-    if(0 != big_file_mpi_close(&fout, MPI_COMM_WORLD)){
-        endrun(0, "Failed to close snapshot at %s:%s\n", fname,
-                    big_file_get_error_message());
-    }
-}
-
-#ifdef EXCUR_REION
-
-static double RtoM(double R)
-{
-    // All in internal units
-    const int filter = uvbg_params.RtoMFilterType;
-    double OmegaM = uvbg_params.CP->Omega0;
-    double RhoCrit = uvbg_params.CP->RhoCrit;
-
-    switch (filter) {
-    case 0: //top hat M = (4/3) PI <rho> R^3
-        return (4.0 / 3.0) * M_PI * pow(R, 3) * (OmegaM * RhoCrit);
-    case 1: //gaussian: M = (2PI)^1.5 <rho> R^3
-        return pow(2 * M_PI, 1.5) * OmegaM * RhoCrit * pow(R, 3);
-    default: // filter not defined
-        endrun(1, "Unrecognised RtoM filter (%d).\n", filter);
-        break;
-    }
-
-    return -1;
-}
-
-//Simple region initialization (taken from zeldovich.c)
-//TODO: look into _prepare (gravpm.c) and see if its worth implementing anything there
-static PetaPMRegion * makeregion(PetaPM * pm, PetaPMParticleStruct * pstruct, void * userdata, int * Nregions) {
-    PetaPMRegion * regions = mymalloc2("Regions", sizeof(PetaPMRegion));
-    int NumPart = PartManager->NumPart;
-    int k;
-    int r = 0;
-    int i;
-    double min[3] = {pm->BoxSize, pm->BoxSize, pm->BoxSize};
-    double max[3] = {0, 0, 0.};
-
-    for(i = 0; i < NumPart; i ++) {
-        for(k = 0; k < 3; k ++) {
-            if(min[k] > P[i].Pos[k])
-            min[k] = P[i].Pos[k];
-            if(max[k] < P[i].Pos[k])
-            max[k] = P[i].Pos[k];
-        }
-    }
-
-    for(k = 0; k < 3; k ++) {
-        regions[r].offset[k] = floor(min[k] / pm->BoxSize * pm->Nmesh - 1);
-        regions[r].size[k] = ceil(max[k] / pm->BoxSize * pm->Nmesh + 2);
-        regions[r].size[k] -= regions[r].offset[k];
-    }
-
-    /* setup the internal data structure of the region */
-    petapm_region_init_strides(&regions[r]);
-    *Nregions = 1;
-    return regions;
-}
-
-//this is applied as global_transfer, dividing by n_cells due to the forward-reverse FFT
-static void divide_by_ncell(PetaPM * pm, int64_t k2, int k[3], pfft_complex * value){
-        int total_n_cells = (double)(uvbg_params.UVBGdim * uvbg_params.UVBGdim * uvbg_params.UVBGdim);
-        value[0][0] /= total_n_cells;
-        value[0][1] /= total_n_cells;
-}
-
-//transfer functions that applies a certain filter (top-hat or gaussian)
-static void filter_pm(PetaPM * pm, int64_t k2, int k[3], pfft_complex * value)
-{
-    const int filter_type = uvbg_params.ReionFilterType;
-    double k_mag = sqrt(k2) * (2 * M_PI / pm->Nmesh) * (pm->Nmesh / pm->BoxSize);
-
-    double kR = k_mag * pm->G; // Radius is stored in the G variable
-
-    switch (filter_type) {
-    case 0: // Real space top-hat
-        if (kR > 1e-4){
-            value[0][0] *= (3.0 * (sinf(kR) / powf(kR, 3) - cosf(kR) / powf(kR, 2)));
-            value[0][1] *= (3.0 * (sinf(kR) / powf(kR, 3) - cosf(kR) / powf(kR, 2)));
-        }
-        break;
-
-    case 1: // k-space top hat
-        kR *= 0.413566994; // Equates integrated volume to the real space top-hat (9pi/2)^(-1/3)
-        if (kR > 1){
-            value[0][0] = 0.0;
-            value[0][1] = 0.0;
-        }
-        break;
-
-    case 2: // Gaussian
-        kR *= 0.643; // Equates integrated volume to the real space top-hat
-        value[0][0] *= (pow(M_E,(-kR * kR / 2.0)));
-        value[0][1] *= (pow(M_E,(-kR * kR / 2.0)));
-        break;
-
-    default:
-        endrun(1, "ReionFilterType type %d is undefined!\n", filter_type);
-        break;
-    }
-}
-
-#ifdef DEBUG
-//print some statistics of the reion grids for debugging
-static void print_reion_debug_info(PetaPM * pm_mass, float * J21, float * xHI, double * mass_real, double * star_real, double * sfr_real)
-{
-    double min_J21 = 1e30;
-    double max_J21 = 0;
-    double min_mass = 1e30;
-    double max_mass = 0;
-    double min_star = 1e30;
-    double max_star = 0;
-    double min_sfr = 1e30;
-    double max_sfr = 0;
-    double total_star = 0;
-    double total_mass = 0;
-    int neutral_count = 0;
-    int ion_count = 0;
-    int pm_idx;
-    int uvbg_dim = uvbg_params.UVBGdim;
-    int grid_n_real = uvbg_dim * uvbg_dim * uvbg_dim;
-#pragma omp parallel for collapse(3) reduction(+:neutral_count,ion_count,total_mass,total_star) reduction(min:min_J21,min_mass,min_star,min_sfr) reduction(max:max_J21,max_mass,max_star,max_sfr) private(pm_idx)
-    for (int ix = 0; ix < pm_mass->real_space_region.size[0]; ix++)
-        for (int iy = 0; iy < pm_mass->real_space_region.size[1]; iy++)
-            for (int iz = 0; iz < pm_mass->real_space_region.size[2]; iz++) {
-                pm_idx = grid_index(ix, iy, iz, pm_mass->real_space_region.strides);
-
-                total_mass += mass_real[pm_idx];
-                total_star += star_real[pm_idx];
-                if(xHI[pm_idx] > 1 - FLOAT_REL_TOL)
-                    neutral_count += 1;
-                if(xHI[pm_idx] < FLOAT_REL_TOL)
-                    ion_count += 1;
-                min_J21 = min_J21 < J21[pm_idx] ? min_J21 : J21[pm_idx];
-                max_J21 = max_J21 > J21[pm_idx] ? max_J21 : J21[pm_idx];
-                min_mass = min_mass < mass_real[pm_idx] ? min_mass : mass_real[pm_idx];
-                max_mass = max_mass > mass_real[pm_idx] ? max_mass : mass_real[pm_idx];
-                min_star = min_star < star_real[pm_idx] ? min_star : star_real[pm_idx];
-                max_star = max_star > star_real[pm_idx] ? max_star : star_real[pm_idx];
-                if(uvbg_params.ReionUseParticleSFR){
-                    min_sfr = min_sfr < sfr_real[pm_idx] ? min_sfr : sfr_real[pm_idx];
-                    max_sfr = max_sfr > sfr_real[pm_idx] ? max_sfr : sfr_real[pm_idx];
-                }
-            }
-
-    message(1,"rank total mass : %e | rank total star : %e\n",total_mass,total_star);
-    MPI_Allreduce(MPI_IN_PLACE, &neutral_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &ion_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &min_J21, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &max_J21, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &min_mass, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &max_mass, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &min_star, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &max_star, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &min_sfr, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &max_sfr, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &total_mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &total_star, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double n_ratio = (double)neutral_count / (double)grid_n_real;
-    double i_ratio = (double)ion_count / (double)grid_n_real;
-
-    message(0,"neutral cells : %d, ion cells %d, ratio(%d) N %f ion %f\n",neutral_count, ion_count, grid_n_real, n_ratio, i_ratio);
-    message(0,"min J21 : %e | max J21 %e\n",min_J21,max_J21);
-    message(0,"min mass : %e | max mass : %e | total mass %e\n",min_mass,max_mass,total_mass);
-    message(0,"min star : %e | max star %e | total star : %e\n",min_star,max_star,total_star);
-    message(0,"min sfr : %e | max sfr %e\n",min_sfr,max_sfr);
-}
-#endif
-
-//takes filtered mass, star, sfr grids and calculates J21 and neutral fractions onto a grid
-//which is placed in the mass grid out on the last call of this function.
-static void reion_loop_pm(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        double * mass_real, double * star_real, double * sfr_real, int last_step)
-{
-    //MAKE SURE THESE ARE PRIVATE IN THREADED LOOPS
-    double density_over_mean = 0;
-    double sfr_density = 0;
-    double f_coll_stars = 0;
-    int pm_idx = 0;
-
-    double R = pm_mass->G; //radius is stored here
-
-    const double redshift = 1.0 / (uvbg_params.Time) - 1.;
-
-    // Loop through filter radii
-    //(jdavies): get the parameters
-    //double ReionGammaHaloBias = uvbg_params.ReionGammaHaloBias;
-    const double ReionNionPhotPerBary = uvbg_params.ReionNionPhotPerBary;
-    int use_sfr = uvbg_params.ReionUseParticleSFR;
-    double alpha_uv = uvbg_params.AlphaUV;
-
-    // TODO(smutch): tidy this up!
-    // The following is based on Sobacchi & Messinger (2013) eqn 7
-    // with f_* removed and f_b added since we define f_coll as M_*/M_tot rather than M_vir/M_tot,
-    // and also with the inclusion of the effects of the Helium fraction.
-    const double Y_He = 1.0 - HYDROGEN_MASSFRAC;
-    const double BaryonFrac = uvbg_params.CP->OmegaBaryon / uvbg_params.CP->Omega0;
-    double ReionEfficiency = 1.0 / BaryonFrac * ReionNionPhotPerBary / (1.0 - 0.75 * Y_He);
-
-    const double tot_n_cells = pm_mass->Nmesh * pm_mass->Nmesh * pm_mass->Nmesh;
-    const double pixel_volume = pm_mass->CellSize * pm_mass->CellSize * pm_mass->CellSize;
-    const double deltax_conv_factor = tot_n_cells / (uvbg_params.CP->RhoCrit * uvbg_params.CP->Omega0 * pm_mass->BoxSize * pm_mass->BoxSize * pm_mass->BoxSize);
-
-    float* J21 = UVBGgrids.J21;
-    float* xHI = UVBGgrids.xHI;
-
-    // Perform sanity checks to account for aliasing effects
-#pragma omp parallel for collapse(3) private(pm_idx)
-    for (int ix = 0; ix < pm_mass->real_space_region.size[0]; ix++)
-        for (int iy = 0; iy < pm_mass->real_space_region.size[1]; iy++)
-            for (int iz = 0; iz < pm_mass->real_space_region.size[2]; iz++) {
-                pm_idx = grid_index(ix, iy, iz, pm_mass->real_space_region.strides);
-                mass_real[pm_idx] = fmax(mass_real[pm_idx], 0.0);
-                star_real[pm_idx] = fmax(star_real[pm_idx], 0.0);
-                if(use_sfr)
-                    sfr_real[pm_idx] = fmax(sfr_real[pm_idx], 0.0);
-            }
-
-    const double J21_aux_constant = (1.0 + redshift) * (1.0 + redshift) / (4.0 * M_PI)
-        * alpha_uv * PLANCK * 1e21
-        * R * uvbg_params.UnitLength_in_cm * ReionNionPhotPerBary / PROTONMASS
-        * uvbg_params.UnitMass_in_g / pow(uvbg_params.UnitLength_in_cm, 3) / uvbg_params.UnitTime_in_s;
-
-    const double hubble_time = 1 / (hubble_function(uvbg_params.CP,uvbg_params.Time) * uvbg_params.CP->HubbleParam);
-
-    // Main loop through the box
-#pragma omp parallel for collapse(3) private(pm_idx,density_over_mean,f_coll_stars,sfr_density)
-    for (int ix = 0; ix < pm_mass->real_space_region.size[0]; ix++)
-        for (int iy = 0; iy < pm_mass->real_space_region.size[1]; iy++)
-            for (int iz = 0; iz < pm_mass->real_space_region.size[2]; iz++) {
-                pm_idx = grid_index(ix, iy, iz, pm_mass->real_space_region.strides);
-
-                //convert mass to delta
-                density_over_mean = mass_real[pm_idx] * deltax_conv_factor;
-
-                f_coll_stars = star_real[pm_idx] / (RtoM(R) * density_over_mean)
-                    * (4.0 / 3.0) * M_PI * R * R * R / pixel_volume;
-
-                if(use_sfr)
-                    sfr_density = sfr_real[pm_idx] / pixel_volume / (uvbg_params.UnitMass_in_g / SOLAR_MASS) * (uvbg_params.UnitTime_in_s / SEC_PER_YEAR); // In internal units
-                else
-                    sfr_density = star_real[pm_idx] / (uvbg_params.ReionSFRTimescale * hubble_time) / pixel_volume; // In internal units
-                const float J21_aux = (float)(sfr_density * J21_aux_constant);
-
-                // Check if ionised!
-                if (f_coll_stars > (1.0 / ReionEfficiency)) // IONISED!!!!
-                {
-                    // If it is the first crossing of the ionisation barrier for this cell (largest R), let's record J21
-                    if (xHI[pm_idx] > FLOAT_REL_TOL) {
-                        J21[pm_idx] = J21_aux;
-                    }
-
-                    // Mark as ionised
-                    xHI[pm_idx] = 0.0f;
-
-                    // TODO(smutch): Do we want to implement this?
-                    // r_bubble[i_real] = (float)R;
-                }
-                //TODO: implement CellSizeFactor for low-res
-                else if (last_step && (xHI[pm_idx] > FLOAT_REL_TOL)) {
-                    // Check if this is the last filtering step.
-                    // If so, assign partial ionisations to those cells which aren't fully ionised
-                     xHI[pm_idx] = (float)(1.0 - f_coll_stars * ReionEfficiency);
-                }
-
-            } // iz
-    // Find the volume and mass weighted neutral fractions
-    // TODO: The deltax grid will have rounding errors from forward and reverse
-    //       FFT. Should cache deltax slabs prior to ffts and reuse here.
-    if(last_step){
-
-#ifdef DEBUG
-        print_reion_debug_info(pm_mass,J21,xHI,mass_real,star_real,sfr_real);
-#endif
-
-        double volume_weighted_global_xHI = 0.0;
-        double mass_weighted_global_xHI = 0.0;
-        double mass_weight = 0.0;
-        int uvbg_dim = uvbg_params.UVBGdim;
-        int grid_n_real = uvbg_dim * uvbg_dim * uvbg_dim;
-#pragma omp parallel for collapse(3) reduction(+:volume_weighted_global_xHI,mass_weighted_global_xHI,mass_weight) private(pm_idx,density_over_mean)
-        for (int ix = 0; ix < pm_mass->real_space_region.size[0]; ix++)
-            for (int iy = 0; iy < pm_mass->real_space_region.size[1]; iy++)
-                for (int iz = 0; iz < pm_mass->real_space_region.size[2]; iz++) {
-                    pm_idx = grid_index(ix, iy, iz, pm_mass->real_space_region.strides);
-                    volume_weighted_global_xHI += (double)(xHI[pm_idx]);
-
-                    density_over_mean = deltax_conv_factor * mass_real[pm_idx];
-                    mass_weighted_global_xHI += (double)(xHI[pm_idx]) * density_over_mean;
-                    mass_weight += density_over_mean;
-
-                    //if we are on the last step, we re_use the mass grid to store J21 so it can be read out
-                    mass_real[pm_idx] = (double)(J21[pm_idx]);
-                }
-
-        MPI_Allreduce(MPI_IN_PLACE, &volume_weighted_global_xHI, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        MPI_Allreduce(MPI_IN_PLACE, &mass_weighted_global_xHI, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        MPI_Allreduce(MPI_IN_PLACE, &mass_weight, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-        volume_weighted_global_xHI /= grid_n_real;
-        mass_weighted_global_xHI /= mass_weight;
-        UVBGgrids.volume_weighted_global_xHI = volume_weighted_global_xHI;
-        UVBGgrids.mass_weighted_global_xHI = mass_weighted_global_xHI;
-        message(0,"vol weighted xhi : %f\n",volume_weighted_global_xHI);
-        message(0,"mass weighted xhi : %f\n",mass_weighted_global_xHI);
-    }
-
-}
-
-//readout J21 from grid to particle
-static void readout_J21(PetaPM * pm, int i, double * mesh, double weight) {
-    // Since we need to decide whether particles on the boundary are ionised or not,
-    // We choose to take the maximum J21 (of 8 cells) here.
-    //TODO: change the iterator in petapm for reionisation to use NGP to avoid the (minor) resolution effects
-    if (P[i].Type == 0 && mesh[0] > SPHP(i).local_J21){
-        SPHP(i).local_J21 = mesh[0];
-        //if particle has not been ionised yet, set its zreion
-        //the above conditional makes sure the particle is (partially) in an ionsied cell
-        if(SPHP(i).zreion == -1)
-            SPHP(i).zreion = 1/uvbg_params.Time - 1;
-    }
-}
-/* sets particle properties needed for the Excursion Set */
-static void init_particle_uvbg(){
-    /* need to convert halo mass to 1e10 solar */
-    double fesc_unit_conv = uvbg_params.UnitMass_in_g / SOLAR_MASS / 1e10 / uvbg_params.CP->HubbleParam;
-    double fesc_temp;
-
-    /* Reset local J21 */
-#pragma omp parallel for private(fesc_temp)
-    for(int ii = 0; ii < PartManager->NumPart; ii++) {
-        /* Init J21 and set escape fracitons for sph particles */
-        if(P[ii].Type == 0) {
-            SPHP(ii).local_J21 = 0.;
-            //P[i].EscapeFraction is currently halo mass (from fof.c)
-            if(!uvbg_params.ReionUseParticleSFR || SPHP(ii).EscapeFraction == 0) continue;
-
-            fesc_temp = uvbg_params.EscapeFractionNorm * pow(SPHP(ii).EscapeFraction
-                    * fesc_unit_conv, uvbg_params.EscapeFractionScaling);
-
-            if(fesc_temp > 1) fesc_temp = 1;
-            if(fesc_temp < 0) endrun(1,"negative escape fraction?\n");
-            SPHP(ii).EscapeFraction = fesc_temp;
-        }
-        /* Assign escape fractions to star particles */
-        else if(P[ii].Type == 4) {
-            if(STARP(ii).EscapeFraction == 0) continue;
-
-            fesc_temp = uvbg_params.EscapeFractionNorm * pow(STARP(ii).EscapeFraction
-                    * fesc_unit_conv, uvbg_params.EscapeFractionScaling);
-
-            if(fesc_temp > 1) fesc_temp = 1;
-            if(fesc_temp < 0) endrun(1,"negative escape fraction?\n");
-            STARP(ii).EscapeFraction = fesc_temp;
-        }
-    }
-}
-
-void calculate_uvbg(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr, int WriteSnapshot, int SnapshotFileCount, char * OutputDir, double Time, Cosmology * CP, const struct UnitSystem units){
-    //setup filter radius range
-    double Rmax = uvbg_params.ReionRBubbleMax;
-    double Rmin = uvbg_params.ReionRBubbleMin;
-    double Rdelta = uvbg_params.ReionDeltaRFactor;
-
-    //define particle structure with the info petapm needs
-    PetaPMParticleStruct pstruct = {
-        P,
-        sizeof(P[0]),
-        (char*) &P[0].Pos[0]  - (char*) P,
-        (char*) &P[0].Mass  - (char*) P,
-        /* Regions allocated inside _prepare*/
-        NULL,
-        /* By default all particles are active. For hybrid neutrinos set below.*/
-        NULL,
-        PartManager->NumPart,
-    };
-    PetaPMReionPartStruct rstruct = {
-        (char*) &P[0].Type  - (char*) P,
-        (char*) &P[0].PI  - (char*) P,
-        SphP,
-        sizeof(SphP[0]),
-        (char*) &SphP[0].Sfr  - (char*) SphP,
-        (char*) &SphP[0].EscapeFraction  - (char*) SphP,
-        StarP,
-        sizeof(StarP[0]),
-        (char*) &StarP[0].EscapeFraction - (char*) StarP,
-    };
-
-    uvbg_params.Time = Time;
-    uvbg_params.CP = CP;
-    uvbg_params.UnitMass_in_g = units.UnitMass_in_g;
-    uvbg_params.UnitLength_in_cm = units.UnitLength_in_cm;
-    uvbg_params.UnitTime_in_s = units.UnitTime_in_s;
-
-    PetaPMGlobalFunctions global_functions = {NULL, NULL, divide_by_ncell};
-
-    //TODO: set this up with all the filtering/reion loops
-    static PetaPMFunctions functions [] =
-    {
-        {"Reionisation", filter_pm, readout_J21},
-        {NULL, NULL, NULL},
-    };
-
-    //set local J21 = 0 and set escape fractions for all particles
-    init_particle_uvbg();
-    uvbg_params.Time = Time;
-    uvbg_params.CP = CP;
-
-    /* initialize grids */
-    int grid_n = pm_mass->real_space_region.size[0]
-        * pm_mass->real_space_region.size[1]
-        * pm_mass->real_space_region.size[2];
-
-    UVBGgrids.J21 = mymalloc("J21", sizeof(float) * grid_n);
-    float * J21 = UVBGgrids.J21;
-    UVBGgrids.xHI = mymalloc("xHI", sizeof(float) * grid_n);
-    float * xHI = UVBGgrids.xHI;
-
-    for (int ii = 0; ii < grid_n; ii++) {
-        J21[ii] = 0.0f;
-        xHI[ii] = 1.0f;
-    }
-
-    message(0, "Away to call find_HII_bubbles...\n");
-    petapm_reion(pm_mass,pm_star,pm_sfr,makeregion,&global_functions
-            ,functions,&pstruct,&rstruct,reion_loop_pm,Rmax,Rmin,Rdelta
-            ,uvbg_params.ReionUseParticleSFR,NULL);
-
-    //TODO: In line with Meraxes, should we multiply J21 with a halo bias parameter for particles in a group??
-
-    walltime_measure("/UVBG/find_HII_bubbles");
-
-    //since J21 is output to particles, we should only need to write these grids for debugging
-#ifdef DEBUG
-    if(WriteSnapshot) {
-        save_uvbg_grids(SnapshotFileCount, OutputDir, pm_mass);
-        message(0,"uvbg saved\n");
-    }
-    walltime_measure("/UVBG/save");
-#endif
-
-    myfree(UVBGgrids.xHI);
-    myfree(UVBGgrids.J21);
-
-    walltime_measure("/UVBG");
-}
-#endif // ifdef EXCUR_REION
diff --git a/libgadget/uvbg.h b/libgadget/uvbg.h
deleted file mode 100644
index 4520a583..00000000
--- a/libgadget/uvbg.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef UVBG_H
-#define UVBG_H
-
-#include <pfft.h>
-#include "petapm.h"
-#include "utils/paramset.h"
-#include "fof.h"
-
-struct UVBGgrids_type {
-    float *J21;
-    float *xHI;
-
-    double volume_weighted_global_xHI;
-    double mass_weighted_global_xHI;
-};
-
-//extern struct UVBGgrids_type UVBGgrids; 
-
-void calculate_uvbg(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr, int WriteSnapshot, int SnapshotFileCount, char * Outputdir, double Time, Cosmology * CP, const struct UnitSystem units);
-void set_uvbg_params(ParameterSet * ps);
-
-#endif
diff --git a/libgenic/Makefile b/libgenic/Makefile
index 722120f2..26e507dc 100644
--- a/libgenic/Makefile
+++ b/libgenic/Makefile
@@ -4,14 +4,14 @@ CONFIG ?= ../Options.mk
 
 include $(CONFIG)
 
-TESTED = power thermal
+TESTED = power
 TESTBIN := $(TESTED:%=.objs/test_%) $(MPI_TESTED:%=.objs/test_%)
 SUITE?= $(TESTED:%=test_%)
 MPISUITE = $(MPI_TESTED:%=test_%)
 
 include ../Makefile.rules
 
-OBJS = power.o zeldovich.o glass.o save.o thermal.o
+OBJS = power.o zeldovich.o glass.o save.o
 
 OBJS := $(OBJS:%.o=.objs/%.o)
 
diff --git a/libgenic/glass.c b/libgenic/glass.c
index 6bd6ff34..5b24ee0b 100644
--- a/libgenic/glass.c
+++ b/libgenic/glass.c
@@ -4,7 +4,8 @@
 #include <string.h>
 #include <omp.h>
 
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 
 #include "allvars.h"
 #include "proto.h"
@@ -15,10 +16,10 @@
 #include <libgadget/powerspectrum.h>
 #include <libgadget/gravity.h>
 
-static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_force_x(PetaPM *pm, int i, double * mesh, double weight);
 static void readout_force_y(PetaPM *pm, int i, double * mesh, double weight);
 static void readout_force_z(PetaPM *pm, int i, double * mesh, double weight);
@@ -40,29 +41,27 @@ static void glass_stats(struct ic_part_data * ICP, int NumPart);
 int
 setup_glass(IDGenerator * idgen, PetaPM * pm, double shift, int seed, double mass, struct ic_part_data * ICP, const double UnitLength_in_cm, const char * OutputDir)
 {
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_ranlxd1);
     int ThisTask;
     MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-    gsl_rng_set(rng, seed + ThisTask);
+    boost::random::mt19937 rng(seed + ThisTask);
     memset(ICP, 0, idgen->NumPart*sizeof(struct ic_part_data));
+    boost::random::uniform_real_distribution<double> dist(0, 1);
 
     int i;
-    /* Note: this loop should nto be omp because
-     * of the call to gsl_rng_uniform*/
+    /* Note: this loop should not be omp because
+     * of the call to dist(rng), which advances the shared generator state*/
     for(i = 0; i < idgen->NumPart; i ++) {
         int k;
         idgen_create_pos_from_index(idgen, i, &ICP[i].Pos[0]);
         /* a spread of 3 will kill most of the grid anisotropy structure;
          * and still being local */
         for(k = 0; k < 3; k++) {
-            double rand = idgen->BoxSize / idgen->Ngrid * 3 * (gsl_rng_uniform(rng) - 0.5);
+            double rand = idgen->BoxSize / idgen->Ngrid * 3 * (dist(rng) - 0.5);
             ICP[i].Pos[k] += shift + rand;
         }
         ICP[i].Mass = mass;
     }
 
-    gsl_rng_free(rng);
-
     char * fn = fastpm_strdup_printf("powerspectrum-glass-%08X", seed);
     glass_evolve(pm, 14, fn, ICP, idgen->NumPart, UnitLength_in_cm, OutputDir);
     myfree(fn);
@@ -183,8 +182,8 @@ static void glass_force(PetaPM * pm, double t_f, struct ic_part_data * ICP, cons
     PetaPMParticleStruct pstruct = {
         ICP,
         sizeof(ICP[0]),
-        (char*) &ICP[0].Pos[0]  - (char*) ICP,
-        (char*) &ICP[0].Mass  - (char*) ICP,
+        (size_t)((char*) &ICP[0].Pos[0]  - (char*) ICP),
+        (size_t)((char*) &ICP[0].Mass  - (char*) ICP),
         NULL,
         NULL,
         NumPart,
@@ -279,7 +278,7 @@ _prepare(PetaPM * pm, PetaPMParticleStruct * pstruct, void * userdata, int * Nre
  *
  *********************/
 
-static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex *value) {
+static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex *value) {
 
     double f = 1.0;
     const double smth = 1.0 / k2;
@@ -302,13 +301,13 @@ static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex
 
     if(k2 == 0) {
         /* Remove zero mode corresponding to the mean.*/
-        value[0][0] = 0.0;
-        value[0][1] = 0.0;
+        value[0].x = 0.0;
+        value[0].y = 0.0;
         return;
     }
 
-    value[0][0] *= fac;
-    value[0][1] *= fac;
+    value[0].x *= fac;
+    value[0].y *= fac;
 }
 
 /* the transfer functions for force in fourier space applied to potential */
@@ -323,7 +322,7 @@ static double diff_kernel(double w) {
     return 1 / 6.0 * (8 * sin (w) - sin (2 * w));
 }
 
-static void force_transfer(PetaPM *pm, int k, pfft_complex * value) {
+static void force_transfer(PetaPM *pm, int k, cufftComplex * value) {
     double tmp0;
     double tmp1;
     /*
@@ -332,18 +331,18 @@ static void force_transfer(PetaPM *pm, int k, pfft_complex * value) {
      * filter is   i K(w)
      * */
     double fac = -1 * diff_kernel (k * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
-    tmp0 = - value[0][1] * fac;
-    tmp1 = value[0][0] * fac;
-    value[0][0] = tmp0;
-    value[0][1] = tmp1;
+    tmp0 = - value[0].y * fac;
+    tmp1 = value[0].x * fac;
+    value[0].x = tmp0;
+    value[0].y = tmp1;
 }
-static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) { /* x axis: forwards kpos[0] to force_transfer; k2 is unused */
     force_transfer(pm, kpos[0], value);
 }
-static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) { /* y axis: forwards kpos[1] to force_transfer; k2 is unused */
     force_transfer(pm, kpos[1], value);
 }
-static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) { /* z axis: forwards kpos[2] to force_transfer; k2 is unused */
     force_transfer(pm, kpos[2], value);
 }
 static void readout_force_x(PetaPM *pm, int i, double * mesh, double weight) {
diff --git a/libgenic/pmesh.h b/libgenic/pmesh.h
index f73f533b..53c3498f 100644
--- a/libgenic/pmesh.h
+++ b/libgenic/pmesh.h
@@ -1,13 +1,10 @@
 #ifndef PMESH_H
 #define PMESH_H
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 #include <libgadget/petapm.h>
 #include <libgadget/utils.h>
 
-/*
- * The following functions are from fastpm/libfastpm/initialcondition.c.
- * Agrees with nbodykit's pmesh/whitenoise.c, which agrees with n-genic.
- * */
 typedef struct {
     struct {
         ptrdiff_t start[3];
@@ -19,9 +16,10 @@ typedef struct {
 } PMDesc;
 
 static inline void
-SETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, gsl_rng * rng)
+SETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, boost::random::mt19937 & rng)
 {
-    unsigned int seed = 0x7fffffff * gsl_rng_uniform(rng);
+    boost::random::uniform_real_distribution<double> dist(0, 1);
+    unsigned int seed = static_cast<unsigned int>(0x7fffffff * dist(rng));
 
     int ii[2] = {i, (pm->Nmesh[0] - i) % pm->Nmesh[0]};
     int jj[2] = {j, (pm->Nmesh[1] - j) % pm->Nmesh[1]};
@@ -41,6 +39,7 @@ SETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, gsl_rng * rng)
         }
     }
 }
+
 static inline unsigned int
 GETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, int d1, int d2)
 {
@@ -54,11 +53,12 @@ GETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, int d1, int d2)
 }
 
 static void
-SAMPLE(gsl_rng * rng, double * ampl, double * phase)
+SAMPLE(boost::random::mt19937 & rng, double * ampl, double * phase) /* draws one phase and one nonzero amplitude from rng, advancing its state */
 {
-    *phase = gsl_rng_uniform(rng) * 2 * M_PI;
+    boost::random::uniform_real_distribution<double> dist(0, 1); /* uniform deviates on [0, 1) */
+    *phase = dist(rng) * 2 * M_PI; /* first draw: phase scaled to [0, 2*pi) */
     *ampl = 0;
-    do *ampl = gsl_rng_uniform(rng); while(*ampl == 0);
+    do *ampl = dist(rng); while(*ampl == 0); /* redraw on exact zero: the caller later evaluates log(ampl) */
 }
 
 static void
@@ -68,8 +68,8 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
     int d;
     int i, j, k;
 
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-    gsl_rng_set(rng, seed);
+    // Initialize the Boost RNG
+    boost::random::mt19937 rng(seed);
 
     unsigned int * seedtable[2][2];
     for(i = 0; i < 2; i ++)
@@ -88,15 +88,13 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
         for(j = 0; j < i; j++) SETSEED(pm, seedtable, pm->Nmesh[0] - 1 - i, pm->Nmesh[1] - 1 - j, rng);
         for(j = 0; j < i + 1; j++) SETSEED(pm, seedtable, pm->Nmesh[1] - 1 - j, pm->Nmesh[0] - 1 - i, rng);
     }
-    gsl_rng_free(rng);
 
     ptrdiff_t irel[3];
     for(i = pm->ORegion.start[0];
         i < pm->ORegion.start[0] + pm->ORegion.size[0];
         i ++) {
 
-        gsl_rng * lower_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-        gsl_rng * this_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
+        boost::random::mt19937 lower_rng, this_rng;
 
         int ci = pm->Nmesh[0] - i;
         if(ci >= pm->Nmesh[0]) ci -= pm->Nmesh[0];
@@ -104,15 +102,10 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
         for(j = pm->ORegion.start[1];
             j < pm->ORegion.start[1] + pm->ORegion.size[1];
             j ++) {
-            /* always pull the gaussian from the lower quadrant plane for k = 0
-             * plane*/
-            /* always pull the whitenoise from the lower quadrant plane for k = 0
-             * plane and k == All.Nmesh / 2 plane*/
             int d1 = 0, d2 = 0;
             int cj = pm->Nmesh[1] - j;
             if(cj >= pm->Nmesh[1]) cj -= pm->Nmesh[1];
 
-            /* d1, d2 points to the conjugate quandrant */
             if( (ci == i && cj < j)
              || (ci < i && cj != j)
              || (ci < i && cj == j)) {
@@ -121,20 +114,17 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
             }
 
             unsigned int seed_conj, seed_this;
-            /* the lower quadrant generator */
             seed_conj = GETSEED(pm, seedtable, i, j, d1, d2);
-            gsl_rng_set(lower_rng, seed_conj);
+            lower_rng.seed(seed_conj);
 
             seed_this = GETSEED(pm, seedtable, i, j, 0, 0);
-            gsl_rng_set(this_rng, seed_this);
+            this_rng.seed(seed_this);
 
             for(k = 0; k <= pm->Nmesh[2] / 2; k ++) {
                 int use_conj = (d1 != 0 || d2 != 0) && (k == 0 || k == pm->Nmesh[2] / 2);
 
                 double ampl, phase;
                 if(use_conj) {
-                    /* on k = 0 and All.Nmesh/2 plane, we use the lower quadrant generator,
-                     * then hermit transform the result if it is nessessary */
                     SAMPLE(this_rng, &ampl, &phase);
                     SAMPLE(lower_rng, &ampl, &phase);
                 } else {
@@ -152,14 +142,12 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
                 if(irel[2] < 0) continue;
                 if(irel[2] >= pm->ORegion.size[2]) continue;
 
-                /* we want two numbers that are of std ~ 1/sqrt(2) */
                 ampl = sqrt(- log(ampl));
 
-                if (setUnitaryAmplitude) ampl = 1.0; /* cos and sin gives 1/sqrt(2)*/
-
+                if (setUnitaryAmplitude) ampl = 1.0;
 
                 if (setInvertPhase){
-                  phase += M_PI; /*invert phase*/
+                  phase += M_PI;
                 }
 
                 (delta_k + 2 * ip)[0] = ampl * cos(phase);
@@ -172,29 +160,21 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
                 if((pm->Nmesh[0] - iabs[0]) % pm->Nmesh[0] == iabs[0] &&
                    (pm->Nmesh[1] - iabs[1]) % pm->Nmesh[1] == iabs[1] &&
                    (pm->Nmesh[2] - iabs[2]) % pm->Nmesh[2] == iabs[2]) {
-                    /* The mode is self conjuguate, thus imaginary mode must be zero */
                     (delta_k + 2 * ip)[1] = 0;
                     (delta_k + 2 * ip)[0] = ampl * cos(phase);
                 }
 
                 if(iabs[0] == 0 && iabs[1] == 0 && iabs[2] == 0) {
-                    /* the mean is zero */
                     (delta_k + 2 * ip)[0] = 0;
                     (delta_k + 2 * ip)[1] = 0;
                 }
             }
         }
-        gsl_rng_free(lower_rng);
-        gsl_rng_free(this_rng);
     }
     for(i = 1; i >= 0; i --)
     for(j = 1; j >= 0; j --) {
         myfree(seedtable[i][j]);
     }
-/*
-    char * fn[1000];
-    sprintf(fn, "canvas.dump.f4.%d", pm->ThisTask);
-    fwrite(pm->canvas, sizeof(pm->canvas[0]), pm->ORegion.total * 2, fopen(fn, "w"));
-*/
 }
-#endif
+
+#endif
\ No newline at end of file
diff --git a/libgenic/power.c b/libgenic/power.c
index fafa8013..e959970b 100644
--- a/libgenic/power.c
+++ b/libgenic/power.c
@@ -3,8 +3,7 @@
 #include <math.h>
 #include <stddef.h>
 #include <mpi.h>
-#include <gsl/gsl_integration.h>
-#include <gsl/gsl_interp.h>
+#include <boost/math/interpolators/barycentric_rational.hpp>
 #include <bigfile-mpi.h>
 
 #include <libgadget/cosmology.h>
@@ -13,6 +12,8 @@
 #include <libgadget/physconst.h>
 #include "power.h"
 #include "proto.h"
+#include <libgadget/timefac.h>
+
 static double Delta_EH(double k);
 static double Delta_Tabulated(double k, enum TransferType Type);
 static double sigma2_int(double k, void * params);
@@ -36,7 +37,7 @@ struct table
     int Nentry;
     double * logk;
     double * logD[MAXCOLS];
-    gsl_interp * mat_intp[MAXCOLS];
+    boost::math::interpolators::barycentric_rational<double>* mat_intp[MAXCOLS];
 };
 
 /*Typedef for a function that parses the table from text*/
@@ -75,12 +76,12 @@ static double get_Tabulated(double k, enum TransferType Type, double oobval)
     if(logk < power_table.logk[0] || logk > power_table.logk[power_table.Nentry - 1])
       return oobval;
 
-    double logD = gsl_interp_eval(power_table.mat_intp[0], power_table.logk, power_table.logD[0], logk, NULL);
+    double logD = (*power_table.mat_intp[0])(logk);
     double trans = 1;
     /*Transfer table stores (T_type(k) / T_tot(k))*/
     if(transfer_table.Nentry > 0)
        if(Type >= DELTA_BAR && Type < DELTA_TOT)
-          trans = gsl_interp_eval(transfer_table.mat_intp[Type], transfer_table.logk, transfer_table.logD[Type], logk, NULL);
+          trans = (*transfer_table.mat_intp[Type])(logk);
 
     /*Convert delta from (Mpc/h)^3/2 to kpc/h^3/2*/
     logD += 1.5 * log10(scale);
@@ -321,9 +322,6 @@ void read_power_table(int ThisTask, const char * inputfile, const int ncols, str
     }
 
     MPI_Bcast(out_tab->logk, (ncols+1)*out_tab->Nentry, MPI_DOUBLE, 0, MPI_COMM_WORLD);
-    for(j=0; j<ncols; j++) {
-        out_tab->mat_intp[j] = gsl_interp_alloc(gsl_interp_cspline,out_tab->Nentry);
-    }
 }
 
 int
@@ -392,7 +390,7 @@ init_transfer_table(int ThisTask, double InitTime, const struct power_params * c
     }
     /*Initialise the interpolation*/
     for(t = 0; t < MAXCOLS; t++)
-        gsl_interp_init(transfer_table.mat_intp[t],transfer_table.logk, transfer_table.logD[t],transfer_table.Nentry);
+        transfer_table.mat_intp[t] = new boost::math::interpolators::barycentric_rational<double>(transfer_table.logk, transfer_table.logD[t], transfer_table.Nentry);
 
     message(0,"Scale-dependent growth calculated. Mean = %g %g %g %g %g\n",meangrowth[0], meangrowth[1], meangrowth[2], meangrowth[3], meangrowth[4]);
     message(0, "Power spectrum rows: %d, Transfer: %d (%g -> %g)\n", power_table.Nentry, transfer_table.Nentry, transfer_table.logD[DELTA_BAR][0],transfer_table.logD[DELTA_BAR][transfer_table.Nentry-1]);
@@ -410,7 +408,7 @@ int init_powerspectrum(int ThisTask, double InitTime, double UnitLength_in_cm_in
     if(ppar->WhichSpectrum == 2) {
         read_power_table(ThisTask, ppar->FileWithInputSpectrum, 1, &power_table, InitTime, parse_power);
         /*Initialise the interpolation*/
-        gsl_interp_init(power_table.mat_intp[0],power_table.logk, power_table.logD[0],power_table.Nentry);
+        power_table.mat_intp[0] = new boost::math::interpolators::barycentric_rational<double>(power_table.logk, power_table.logD[0], power_table.Nentry);
         transfer_table.Nentry = 0;
         if(ppar->DifferentTransferFunctions || ppar->ScaleDepVelocity) {
             init_transfer_table(ThisTask, InitTime, ppar);
@@ -477,20 +475,19 @@ double tk_eh(double k)		/* from Martin White */
 
 double TopHatSigma2(double R)
 {
-  gsl_integration_workspace * w = gsl_integration_workspace_alloc (1000);
-  double result,abserr;
-  gsl_function F;
-  F.function = &sigma2_int;
-  F.params = &R;
+  double result, abserr;
+
+  /* Adapt sigma2_int(k, params) to a one-argument callable. R is captured by reference so &R is a plain double* and no const is cast away. */
+  auto integrand = [&R](double k) {
+      return sigma2_int(k, &R);
+  };
 
   /* note: 500/R is here chosen as integration boundary (infinity) */
-  gsl_integration_qags (&F, 0, 500. / R, 0, 1e-4,1000,w,&result, &abserr);
-/*   printf("gsl_integration_qng in TopHatSigma2. Result %g, error: %g, intervals: %lu\n",result, abserr,w->size); */
-  gsl_integration_workspace_free (w);
+  result = tanh_sinh_integrate_adaptive(integrand, 0, 500. / R, &abserr, 1e-4, 0.);
+  /* abserr is filled in by the integrator (presumably its error estimate) and is otherwise unused here. */
   return result;
 }
 
-
 double sigma2_int(double k, void * params)
 {
   double w, x;
diff --git a/libgenic/tests/test_thermal.c b/libgenic/tests/test_thermal.c
deleted file mode 100644
index 41dec626..00000000
--- a/libgenic/tests/test_thermal.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*Tests for the thermal velocity module, ported from S-GenIC.*/
-
-#include <stdarg.h>
-#include <stddef.h>
-#include <setjmp.h>
-#include <cmocka.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "stub.h"
-#include <libgadget/config.h>
-#include <libgenic/thermal.h>
-
-/*Check that the neutrino velocity NU_V0 is sensible*/
-static void
-test_mean_velocity(void ** state)
-{
-    /*Check has units of velocity*/
-    assert_true(fabs(NU_V0(1, 1, 1e3) - 100*NU_V0(1, 1, 1e5)) < 1e-6);
-    /*Check scales linearly with neutrino mass*/
-    assert_true(fabs(10*NU_V0(1, 0.1, 1e5) - NU_V0(1, 1, 1e5)) < 1e-6);
-    /*Check scales as z (gadget's cosmological velocity unit is accounted for outside)*/
-    assert_true(fabs(0.5*NU_V0(0.5, 1, 1e5) -  NU_V0(1, 1, 1e5)) < 1e-6);
-}
-
-static void
-test_thermal_vel(void ** state)
-{
-    /*Seed table with velocity of 100 km/s*/
-    struct thermalvel nu_vels;
-    init_thermalvel(&nu_vels, 100, 5000/100, 0);
-
-    /*Test getting the distribution*/
-    assert_true(fabs(nu_vels.fermi_dirac_vel[0]) < 1e-6);
-    assert_true(fabs(nu_vels.fermi_dirac_vel[LENGTH_FERMI_DIRAC_TABLE - 1] -  MAX_FERMI_DIRAC) < 1e-3);
-
-    /*Number verified by mathematica*/
-    int ii = 0;
-    while(nu_vels.fermi_dirac_cumprob[ii] < 0.5) {
-        ii++;
-    }
-    assert_true(fabs(nu_vels.fermi_dirac_vel[ii] - 2.839075) < 0.002);
-    /*Check some statistical properties (max, min, mean)*/
-    double mean = 0;
-    double max = 0;
-    double min = 1e10;
-    int nsample;
-    float Vel[3] = {0};
-    int64_t MaxID = 100000;
-    gsl_rng * g_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-    for (nsample=0; nsample < MaxID; nsample++)
-    {
-        add_thermal_speeds(&nu_vels, g_rng, Vel);
-        double v2 = sqrt(Vel[0]*Vel[0]+Vel[1]*Vel[1]+Vel[2]*Vel[2]);
-        if(v2 > max)
-            max = v2;
-        if(v2 < min)
-            min = v2;
-        mean+=v2;
-        memset(Vel, 0, 3*sizeof(float));
-    }
-    gsl_rng_free(g_rng);
-    mean/=nsample;
-    /*Mean should be roughly 3*zeta(4)/zeta(3)*7/8/(3/4)* m_vamp*/
-    assert_true(fabs(mean - 3*pow(M_PI,4)/90./1.202057*(7./8)/(3/4.)*100) < 1);
-    assert_true(min > 0);
-    assert_true( max < MAX_FERMI_DIRAC*100);
-}
-
-int main(void) {
-    const struct CMUnitTest tests[] = {
-        cmocka_unit_test(test_mean_velocity),
-        cmocka_unit_test(test_thermal_vel)
-    };
-    return cmocka_run_group_tests_mpi(tests, NULL, NULL);
-}
diff --git a/libgenic/thermal.c b/libgenic/thermal.c
deleted file mode 100644
index 5635897c..00000000
--- a/libgenic/thermal.c
+++ /dev/null
@@ -1,120 +0,0 @@
-#include <gsl/gsl_integration.h>
-#include <assert.h>
-#include "thermal.h"
-/*For speed of light*/
-#include <libgadget/physconst.h>
-#include <libgadget/utils.h>
-
-/*The Boltzmann constant in units of eV/K*/
-#define BOLEVK 8.61734e-5
-
-/* This function converts the dimensionless units used in the integral to dimensionful units.
- * Unit scaling velocity for neutrinos:
- * This is an arbitrary rescaling of the unit system in the Fermi-Dirac kernel so we can integrate dimensionless quantities.
- * The true thing to integrate is:
- * q^2 /(e^(q c / kT) + 1 ) dq between 0 and q.
- * So we choose x = (q c / kT_0) and integrate between 0 and x_0.
- * The units are restored by multiplying the resulting x by kT/c for q
- * To get a v we then use q = a m v/c^2
- * to get:   v/c =x kT/(m a)*/
-/*NOTE: this m is the mass of a SINGLE neutrino species, not the sum of neutrinos!*/
-double
-NU_V0(const double Time, const double kBTNubyMNu, const double UnitVelocity_in_cm_per_s)
-{
-    return kBTNubyMNu / Time * (LIGHTCGS / UnitVelocity_in_cm_per_s);
-}
-
-//Amplitude of the random velocity for WDM
-double WDM_V0(const double Time, const double WDM_therm_mass, const double Omega_CDM, const double HubbleParam, const double UnitVelocity_in_cm_per_s)
-{
-        //Not actually sure where this equation comes from: the fiducial values are from Bode, Ostriker & Turok 2001.
-        double WDM_V0 = 0.012 / Time * pow(Omega_CDM / 0.3, 1.0 / 3) * pow(HubbleParam / 0.65, 2.0 / 3) * pow(1.0 /WDM_therm_mass,4.0 / 3);
-        WDM_V0 *= 1.0e5 / UnitVelocity_in_cm_per_s;
-        return WDM_V0;
-}
-
-/*Fermi-Dirac kernel for below*/
-static double
-fermi_dirac_kernel(double x, void * params)
-{
-  return x * x / (exp(x) + 1);
-}
-
-/*Initialise the probability tables*/
-double
-init_thermalvel(struct thermalvel* thermals, const double v_amp, double max_fd,const double min_fd)
-{
-    int i;
-    if(max_fd <= min_fd)
-        endrun(1,"Thermal vel called with negative interval: %g <= %g\n", max_fd, min_fd);
-
-    if(max_fd > MAX_FERMI_DIRAC)
-        max_fd = MAX_FERMI_DIRAC;
-    thermals->m_vamp = v_amp;
-
-    /*These functions are so smooth that we don't need much space*/
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (100);
-    double abserr;
-    gsl_function F;
-    F.function = &fermi_dirac_kernel;
-    F.params = NULL;
-    for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++) {
-        thermals->fermi_dirac_vel[i] = min_fd+(max_fd-min_fd)* i / (LENGTH_FERMI_DIRAC_TABLE - 1.0);
-        gsl_integration_qag (&F, min_fd, thermals->fermi_dirac_vel[i], 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(thermals->fermi_dirac_cumprob[i]), &abserr);
-    //       printf("gsl_integration_qng in fermi_dirac_init_nu. Result %g, error: %g, intervals: %lu\n",fermi_dirac_cumprob[i], abserr,w->size);
-    }
-    /*Save the largest cum. probability, pre-normalisation,
-     * divided by the total F-D probability (which is 3 Zeta(3)/2 ~ 1.8 if MAX_FERMI_DIRAC is large enough*/
-    double total_fd;
-    gsl_integration_qag (&F, 0, MAX_FERMI_DIRAC, 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(total_fd), &abserr);
-    assert(total_fd > 1.8);
-
-    gsl_integration_workspace_free (w);
-
-    double total_frac = thermals->fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE-1]/total_fd;
-    //Normalise total integral to unity
-    for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++)
-        thermals->fermi_dirac_cumprob[i] /= thermals->fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE - 1];
-
-    /*Initialise the GSL table*/
-    thermals->fd_intp = gsl_interp_alloc(gsl_interp_cspline,LENGTH_FERMI_DIRAC_TABLE);
-    thermals->fd_intp_acc = gsl_interp_accel_alloc();
-    gsl_interp_init(thermals->fd_intp,thermals->fermi_dirac_cumprob, thermals->fermi_dirac_vel,LENGTH_FERMI_DIRAC_TABLE);
-    return total_frac;
-}
-
-/*Generate a table of random seeds, one for each pencil.*/
-unsigned int *
-init_rng(int Seed, int Nmesh)
-{
-    unsigned int * seedtable = (unsigned int *) mymalloc("randseeds", Nmesh*Nmesh*sizeof(unsigned int));
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-    gsl_rng_set(rng, Seed);
-
-    int i, j;
-    for(i = 0; i < Nmesh; i++)
-        for(j=0; j < Nmesh; j++)
-        {
-            seedtable[i+Nmesh*j] = gsl_rng_get(rng);
-        }
-    gsl_rng_free(rng);
-    return seedtable;
-}
-
-/* Add a randomly generated thermal speed in v_amp*(min_fd, max_fd) to a 3-velocity.
- * The particle Id is used as a seed for the RNG.*/
-void
-add_thermal_speeds(struct thermalvel * thermals, gsl_rng *g_rng, float Vel[])
-{
-    const double p = gsl_rng_uniform (g_rng);
-    /*m_vamp multiples by the dimensional factor to get a velocity again.*/
-    const double v = thermals->m_vamp * gsl_interp_eval(thermals->fd_intp,thermals->fermi_dirac_cumprob, thermals->fermi_dirac_vel, p, thermals->fd_intp_acc);
-
-    /*Random phase*/
-    const double phi = 2 * M_PI * gsl_rng_uniform (g_rng);
-    const double theta = acos(2 * gsl_rng_uniform (g_rng) - 1);
-
-    Vel[0] += v * sin(theta) * cos(phi);
-    Vel[1] += v * sin(theta) * sin(phi);
-    Vel[2] += v * cos(theta);
-}
diff --git a/libgenic/thermal.h b/libgenic/thermal.h
deleted file mode 100644
index a134ed4d..00000000
--- a/libgenic/thermal.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef THERMALVEL_H
-#define THERMALVEL_H
-
-#include <gsl/gsl_interp.h>
-#include <gsl/gsl_rng.h>
-/*Length of the table*/
-#define MAX_FERMI_DIRAC          17.0
-#define LENGTH_FERMI_DIRAC_TABLE 2000
-
-struct thermalvel
-{
-    double fermi_dirac_vel[LENGTH_FERMI_DIRAC_TABLE];
-    double fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE];
-    double m_vamp;
-    gsl_interp * fd_intp;
-    gsl_interp_accel * fd_intp_acc;
-};
-
-/*Single parameter is the amplitude of the random velocities. All the physics is in here.
- * max_fd and min_fd give the maximum and minimum velocities to integrate over.
- * Note these values are dimensionless*/
-/*Returns total fraction of the Fermi-Dirac distribution between max_fd and min_fd*/
-double
-init_thermalvel(struct thermalvel * thermals, const double v_amp, double max_fd, const double min_fd);
-
-/*Add a randomly generated thermal speed in v_amp*(min_fd, max_fd) to a 3-velocity.*/
-void
-add_thermal_speeds(struct thermalvel * thermals, gsl_rng *g_rng, float Vel[]);
-
-/*Amplitude of the random velocity for neutrinos*/
-double
-NU_V0(const double Time, const double kBTNubyMNu, const double UnitVelocity_in_cm_per_s);
-
-/*Amplitude of the random velocity for WDM*/
-double
-WDM_V0(const double Time, const double WDM_therm_mass, const double Omega_CDM, const double HubbleParam, const double UnitVelocity_in_cm_per_s);
-
-unsigned int *
-init_rng(int Seed, int Nmesh);
-
-
-#endif
diff --git a/libgenic/zeldovich.c b/libgenic/zeldovich.c
index a60f3a8a..c10e7641 100644
--- a/libgenic/zeldovich.c
+++ b/libgenic/zeldovich.c
@@ -4,7 +4,6 @@
 #include <string.h>
 #include <math.h>
 /* do NOT use complex.h it breaks the code */
-#include <pfft.h>
 #include "allvars.h"
 #include "proto.h"
 #include "power.h"
@@ -16,13 +15,13 @@
 #include <libgadget/utils.h>
 
 #define MESH2K(i) petapm_mesh_to_k(i)
-static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_density(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_vel_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_vel_y(PetaPM * pm, int i, double * mesh, double weight);
@@ -30,7 +29,7 @@ static void readout_vel_z(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_y(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_z(PetaPM * pm, int i, double * mesh, double weight);
-static void gaussian_fill(int Nmesh, PetaPMRegion * region, pfft_complex * rho_k, int UnitaryAmplitude, int InvertPhase, const int Seed);
+static void gaussian_fill(int Nmesh, PetaPMRegion * region, cufftComplex * rho_k, int UnitaryAmplitude, int InvertPhase, const int Seed);
 
 static inline double periodic_wrap(double x, const double BoxSize)
 {
@@ -155,8 +154,9 @@ void displacement_fields(PetaPM * pm, enum TransferType Type, struct ic_part_dat
     PetaPMParticleStruct pstruct = {
         curICP,
         sizeof(curICP[0]),
-        ((char*) &curICP[0].Pos[0]) - (char*) curICP,
-        ((char*) &curICP[0].Mass) - (char*) curICP,
+        (size_t)(((char*) &curICP[0].Pos[0]) - (char*) curICP),
+        (size_t)(((char*) &curICP[0].Mass) - (char*) curICP),
+
         NULL,
         NULL,
         NumPart,
@@ -217,7 +217,7 @@ void displacement_fields(PetaPM * pm, enum TransferType Type, struct ic_part_dat
            &icprep);
 
     /*This allocates the memory*/
-    pfft_complex * rho_k = petapm_alloc_rhok(pm);
+    cufftComplex * rho_k = petapm_alloc_rhok(pm);
 
     gaussian_fill(pm->Nmesh, petapm_get_fourier_region(pm),
 		  rho_k, GenicConfig.UnitaryAmplitude, GenicConfig.InvertPhase, GenicConfig.Seed);
@@ -273,7 +273,7 @@ void displacement_fields(PetaPM * pm, enum TransferType Type, struct ic_part_dat
  *
  *********************/
 
-static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     if(k2) {
         /* density is smoothed in k space by a gaussian kernel of 1 mesh grid */
         double r2 = 1.0 / pm->Nmesh;
@@ -283,12 +283,12 @@ static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex
         double kmag = sqrt(k2) * 2 * M_PI / pm->BoxSize;
         fac *= DeltaSpec(kmag, ptype) / sqrt(pm->BoxSize * pm->BoxSize * pm->BoxSize);
 
-        value[0][0] *= fac;
-        value[0][1] *= fac;
+        value[0].x *= fac;
+        value[0].y *= fac;
     }
 }
 
-static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, pfft_complex * value, int include_growth) {
+static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, cufftComplex * value, int include_growth) {
     if(k2) {
         double fac = 1./ (2 * M_PI) / sqrt(pm->BoxSize) * kaxis / k2;
         /*
@@ -306,29 +306,29 @@ static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, pfft_complex * val
             fac *= dlogGrowth(kmag, ptype);
         else
             fac *= DeltaSpec(kmag, ptype);
-        double tmp = value[0][0];
-        value[0][0] = - value[0][1] * fac;
-        value[0][1] = tmp * fac;
+        double tmp = value[0].x;
+        value[0].x = - value[0].y * fac;
+        value[0].y = tmp * fac;
     }
 }
 
-static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) { /* velocity, x axis: disp_transfer on kpos[0] with include_growth = 1 */
     disp_transfer(pm, k2, kpos[0], value, 1);
 }
-static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) { /* velocity, y axis: disp_transfer on kpos[1] with include_growth = 1 */
     disp_transfer(pm, k2, kpos[1], value, 1);
 }
-static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) { /* velocity, z axis: disp_transfer on kpos[2] with include_growth = 1 */
     disp_transfer(pm, k2, kpos[2], value, 1);
 }
 
-static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) { /* displacement, x axis: disp_transfer on kpos[0] with include_growth = 0 */
     disp_transfer(pm, k2, kpos[0], value, 0);
 }
-static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) { /* displacement, y axis: disp_transfer on kpos[1] with include_growth = 0 */
     disp_transfer(pm, k2, kpos[1], value, 0);
 }
-static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) { /* displacement, z axis: disp_transfer on kpos[2] with include_growth = 0 */
     disp_transfer(pm, k2, kpos[2], value, 0);
 }
 
@@ -359,7 +359,7 @@ static void readout_disp_z(PetaPM * pm, int i, double * mesh, double weight) {
 }
 
 static void
-gaussian_fill(int Nmesh, PetaPMRegion * region, pfft_complex * rho_k, int setUnitaryAmplitude, int setInvertPhase, const int Seed)
+gaussian_fill(int Nmesh, PetaPMRegion * region, cufftComplex * rho_k, int setUnitaryAmplitude, int setInvertPhase, const int Seed)
 {
     /* fastpm deals with strides properly; petapm not. So we translate it here. */
     PMDesc pm[1];
diff --git a/maintainer/build-MPGadget.sh b/maintainer/build-MPGadget.sh
index 6daec9b0..dfee5fe9 100644
--- a/maintainer/build-MPGadget.sh
+++ b/maintainer/build-MPGadget.sh
@@ -59,7 +59,6 @@ function build {
         cp platform-options/Options.mk.$host Options.mk
         ./bootstrap.sh
     fi
-    module load gsl
     make
 
     popd
diff --git a/maintainer/conda-env.yaml b/maintainer/conda-env.yaml
index 7e1267c2..40627ccb 100644
--- a/maintainer/conda-env.yaml
+++ b/maintainer/conda-env.yaml
@@ -4,13 +4,16 @@ channels:
   - bccp
 
 dependencies:
+  # Below are for building MP-Gadget
+  # The gcc/gxx packages below are pinned to the latest version compatible with mpich 3.3.2: https://github.com/AnacondaRecipes/mpich-feedstock/issues/5
+  - mpich
+  - gxx_linux-64=9.3.0
+  - gcc_linux-64=9.3.0
+  - boost
+  # For the tests
   - python
+  - numpy
+  - matplotlib
   - mpi4py
   - nbodykit
-  - matplotlib
-  - numpy
   - configobj
-  # Below are for building MP-Gadget
-  - mpich
-  - gcc_linux-64=9.3.0   # Pin to latest version compatible with mpich 3.3.2: https://github.com/AnacondaRecipes/mpich-feedstock/issues/5
-  - gsl
diff --git a/platform-options/Options.mk.BlueTides b/platform-options/Options.mk.BlueTides
deleted file mode 100644
index d998d040..00000000
--- a/platform-options/Options.mk.BlueTides
+++ /dev/null
@@ -1,15 +0,0 @@
-# This is the example for building a MP-Gadget
-# that runs the BlueTides simulation
-# on BlueWaters
-# the silly compiler is 
-
-#CC       = cc -h gnu -h omp
-MPICC       = cc
-#
-# For GCC add -mpc64 if IEEE 64bit FP is desired.
-OPTIMIZE =  -static -fopenmp -O3 -Ofast -g
-#OPTIMIZE =  -g -static -h aggress -h flex_mp=default -h negmsgs -O3
-
-GSL_INCL = -I$(GSL_DIR)/include
-GSL_LIBS = -L$(GSL_DIR)/lib -lgsl -lgslcblas
-#OPT += VALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.cori b/platform-options/Options.mk.cori
deleted file mode 100644
index c4f813cc..00000000
--- a/platform-options/Options.mk.cori
+++ /dev/null
@@ -1,10 +0,0 @@
-MPICC       =   cc
-
-# For GCC add -mpc64 if IEEE 64bit FP is desired.
-OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math -march=native
-
-GSL_INCL = -I$(GSL_DIR)/include
-GSL_LIBS = $(GSL) -lmvec -lmvec_nonshared
-
-#OPT += -DDEBUG
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.coriknl b/platform-options/Options.mk.coriknl
deleted file mode 100644
index dc580324..00000000
--- a/platform-options/Options.mk.coriknl
+++ /dev/null
@@ -1,9 +0,0 @@
-MPICC       =   cc
-
-OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math -march=knl
-
-GSL_INCL = -I$(GSL_DIR)/include
-GSL_LIBS = $(GSL) -lmvec -lmvec_nonshared
-
-#OPT += -DDEBUG
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.edison b/platform-options/Options.mk.edison
deleted file mode 100644
index c4f813cc..00000000
--- a/platform-options/Options.mk.edison
+++ /dev/null
@@ -1,10 +0,0 @@
-MPICC       =   cc
-
-# For GCC add -mpc64 if IEEE 64bit FP is desired.
-OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math -march=native
-
-GSL_INCL = -I$(GSL_DIR)/include
-GSL_LIBS = $(GSL) -lmvec -lmvec_nonshared
-
-#OPT += -DDEBUG
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.example.coma b/platform-options/Options.mk.example.coma
deleted file mode 100644
index f3132eac..00000000
--- a/platform-options/Options.mk.example.coma
+++ /dev/null
@@ -1,8 +0,0 @@
-MPICC       =   mpiicc
-
-OPTIMIZE =  -openmp -O0 -g
-GSL_INCL = -I/opt/gsl/impi/include/gsl
-GSL_LIBS = -L/opt/gsl/impi/lib64 -lgsl -lgslcblas
-
-#OPT += -DDEBUG
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.macos b/platform-options/Options.mk.macos
index 4e646ae7..5b076bf5 100644
--- a/platform-options/Options.mk.macos
+++ b/platform-options/Options.mk.macos
@@ -7,9 +7,6 @@ OPTIMIZE += -fno-common -fopenmp
 # Find the sdk path on Mac
 OPT += -isysroot $(shell xcrun -sdk macosx --show-sdk-path)
 
-GSL_INCL = $(shell pkg-config --cflags gsl)
-GSL_LIBS = $(shell pkg-config --libs gsl)
-
 OPT += -DVALGRIND     # allow debugging with valgrind, disable the GADGET memory allocator.
 #OPT += -DDEBUG      # print a lot of debugging messages
 #Use alternative OpenMP locks, instead of the pthread spinlocks. Required on mac.
diff --git a/platform-options/Options.mk.pfe b/platform-options/Options.mk.pfe
deleted file mode 100644
index 4f3b38d7..00000000
--- a/platform-options/Options.mk.pfe
+++ /dev/null
@@ -1,6 +0,0 @@
-OPTIMIZE =  -fopenmp -O3 -g
-GSL_INCL = -I$(HOME)/anaconda3/envs/3.5/include
-GSL_LIBS = $(HOME)/anaconda3/envs/3.5/lib/libgsl.a $(HOME)/anaconda3/envs/3.5/lib/libgslcblas.a
-
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
-#OPT += -DDEBUG      # print a lot of debugging messages
diff --git a/platform-options/Options.mk.scanbuild b/platform-options/Options.mk.scanbuild
index c794faeb..43e07ca9 100644
--- a/platform-options/Options.mk.scanbuild
+++ b/platform-options/Options.mk.scanbuild
@@ -5,8 +5,6 @@ MPICC=$(CC) -I/usr/include/openmpi-x86_64 -L/usr/lib64/openmpi/lib -lmpi
 OPTIMIZE =  -fopenmp -O0 -std=gnu99 -g -Wall -Wextra -Wno-unused-parameter -Wvla
 
 #--------------------------------------- Basic operation mode of code
-#OPT += -DLIGHTCONE                       # write a lightcone on the fly; in development
 #OPT += -DNO_OPENMP_SPINLOCK
 OPT += -DVALGRIND     # allow debugging with valgrind, disable the GADGET memory allocator.
 OPT += -DDEBUG      # print a lot of debugging messages
-OPT += -DEXCUR_REION  # reionization with excursion set
diff --git a/platform-options/Options.mk.stampede2 b/platform-options/Options.mk.stampede2
deleted file mode 100644
index cc54e5b2..00000000
--- a/platform-options/Options.mk.stampede2
+++ /dev/null
@@ -1,10 +0,0 @@
-# Stampede 2 KNL nodes use icc.
-# TACC_VEC_FLAGS define: -xCORE-AVX2 -axMIC-AVX512,CORE-AVX512 
-# which means the base instruction set is CORE-AVX2, and 
-# alternate versions of some routines are generated for KNL and SKX nodes.
-#
-# -simd is important: it aligns various structures and without it intel's MPI crashes.
-OPTIMIZE =  -fopenmp -O3 -g -Wall ${TACC_VEC_FLAGS} -Zp16 -fp-model fast=1 -simd
-
-#OPT += VALGRIND     # allow debugging with valgrind, disable the GADGET memory allocator.
-#OPT += -DDEBUG      # print a lot of debugging messages
diff --git a/platform-options/Options.mk.travis b/platform-options/Options.mk.travis
index 0e0235cf..9366c785 100644
--- a/platform-options/Options.mk.travis
+++ b/platform-options/Options.mk.travis
@@ -1,6 +1,4 @@
-OPTIMIZE =  -fopenmp -O2 -g -std=gnu99
-GSL_INCL = 
-GSL_LIBS = -lgsl -lgslcblas
+OPTIMIZE =  -fopenmp -O2 -g 
 AR=ar
 SHELL = /bin/bash
 
diff --git a/platform-options/Options.mk.vista b/platform-options/Options.mk.vista
index ce9934bd..df699d6f 100644
--- a/platform-options/Options.mk.vista
+++ b/platform-options/Options.mk.vista
@@ -1,18 +1,41 @@
 #These variables are set to useful defaults, but may be overriden if needed
-#MPICC=mpicc
-GSL_LIBS=-L$(TACC_GSL_LIB) -lgsl -lgslcblas
-GSL_INCL=-I$(TACC_GSL_INC)
-#This is a good optimized build default for nvc
-OPTIMIZE =  -mp -g -Wall -fast
+MPICC=mpic++
+MPICCDEP=mpicc
+MPI_INCL=-I$(TACC_MPI_DIR)/include
+
+CUDA_LIBS=-L$(TACC_CUDA_LIB) -lcudart
+CUDA_INCL=-I$(TACC_CUDA_INC)
+
+BOOST_INCL=-I$(TACC_BOOST_INC)
+BOOST_LIBS=-L$(TACC_BOOST_LIB) -lboost_system -lboost_math_c99
+
+NVMATH_INCL=-I$(TACC_NVMATH_INC)
+NVMATH_LIBS=-L$(TACC_NVMATH_LIB)
+
+CUFFTMP_INCL=-I/home1/apps/nvidia/Linux_aarch64/24.7/math_libs/include/cufftmp
+CUFFTMP_LIBS=-L/home1/apps/nvidia/Linux_aarch64/24.7/math_libs/lib64 -lcufftMp
+
+NVSHMEM_INCL=-I$(TACC_NVSHMEM_INC)
+NVSHMEM_LIBS=-L$(TACC_NVSHMEM_LIB) -lnvshmem_host
+
+NVCC=nvcc
+NVOPTIMIZE = -O3 -arch=sm_61 # specify the architecture according to your GPU model; sm_90 should be used for Vista's H100
+
+#GSL_LIBS=
+#GSL_INCL=
+#This is a good optimized build default for gcc
+#OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math
+#nvc++ does not have the -ffast-math flag
+#OPTIMIZE =  -fopenmp -O3 -g -Wall -use_fast_math
 #This is a good non-optimized default for debugging
-#OPTIMIZE =  -mp -O0 -g -Wall
+OPTIMIZE =  -fopenmp -O0 -g -Wall
 
 #--------------------------------------- Basic operation mode of code
 #OPT += VALGRIND     # allow debugging with valgrind, disable the GADGET memory allocator.
-#OPT += -DDEBUG      # print a lot of debugging messages
+OPT += -DDEBUG      # print a lot of debugging messages
 #Disable openmp locking. This means no threading.
 #OPT += -DNO_OPENMP_SPINLOCK
-
+OPT += -DUSE_CUDA  #Enable GPU-specific CUDA code
 #-----------
 #OPT += -DEXCUR_REION  # reionization with excursion set