From 6b53073ee28f81f9a86ee3c874e324f67d47becc Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 20 Sep 2024 11:30:48 -0700
Subject: [PATCH 001/120] Add a flag to use CUDA code

---
 Options.mk.example | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Options.mk.example b/Options.mk.example
index 3a91d5e2..09d9818f 100644
--- a/Options.mk.example
+++ b/Options.mk.example
@@ -12,7 +12,7 @@ OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math
 #OPT += -DDEBUG      # print a lot of debugging messages
 #Disable openmp locking. This means no threading.
 #OPT += -DNO_OPENMP_SPINLOCK
-
+#OPT += -DUSE_CUDA  #Enable GPU-specific CUDA code
 #-----------
 #OPT += -DEXCUR_REION  # reionization with excursion set
 

From 43c8fa57b570d69da4067e719f6643cb777f71d5 Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Tue, 24 Sep 2024 22:36:26 -0700
Subject: [PATCH 002/120] memory allocation cudaMallocManaged

---
 Makefile.rules                          | 15 +++++++++++++--
 Options.mk.example                      |  4 ++++
 libgadget/utils/{memory.c => memory.cu} |  8 ++++++++
 3 files changed, 25 insertions(+), 2 deletions(-)
 rename libgadget/utils/{memory.c => memory.cu} (98%)

diff --git a/Makefile.rules b/Makefile.rules
index 1e76ccb1..324e4b5b 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -13,10 +13,17 @@ ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
     FITSIO_LIBS ?= $(shell pkg-config --libs cfitsio)
 endif
 
+ifneq ($(findstring -DUSE_CUDA, $(OPT)),)
+    CUDA_INCL ?= 
+    CUDA_LIBS ?= -lcudart
+    NVCC ?= nvcc
+    NVOPTIMIZE ?= -O3
+endif
+
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(GSL_INCL) $(FITSIO_INCL)
+CFLAGS = $(OPTIONS) $(GSL_INCL) $(FITSIO_INCL) $(CUDA_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
@@ -24,7 +31,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
 BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
-LIBS  = -lm $(GSL_LIBS) $(FITSIO_LIBS)
+LIBS  = -lm $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 
@@ -33,3 +40,7 @@ V ?= 0
 	if test "x$(V)" = "x1" ; then echo $$cmd; fi; \
 	mkdir -p `dirname $@`; \
 	echo Compiling $<; $$cmd
+
+# Rule to compile .cu files (using nvcc)
+.objs/%.o: %.cu
+	$(NVCC) $(NVOPTIMIZE) -c $< -o $@
\ No newline at end of file
diff --git a/Options.mk.example b/Options.mk.example
index 09d9818f..160a0a95 100644
--- a/Options.mk.example
+++ b/Options.mk.example
@@ -1,5 +1,9 @@
 #These variables are set to useful defaults, but may be overriden if needed
 #MPICC=mpicc
+
+#NVCC=nvcc
+#NVOPTIMIZE = -O3 -arch=sm_61 # specify architecture according to you GPU model, sm_90 shall be used for Vista's H100
+
 #GSL_LIBS=
 #GSL_INCL=
 #This is a good optimized build default for gcc
diff --git a/libgadget/utils/memory.c b/libgadget/utils/memory.cu
similarity index 98%
rename from libgadget/utils/memory.c
rename to libgadget/utils/memory.cu
index c4552770..28f4a3ec 100644
--- a/libgadget/utils/memory.c
+++ b/libgadget/utils/memory.cu
@@ -5,6 +5,10 @@
 #include "memory.h"
 #include "endrun.h"
 
+#ifdef USE_CUDA
+#include <cuda_runtime.h>
+#endif
+
 #define MAGIC "DEADBEEF"
 #define ALIGNMENT 4096
 
@@ -151,7 +155,11 @@ allocator_alloc_va(Allocator * alloc, const char * name, const size_t request_si
     char * cptr;
     if(alloc->use_malloc) {
         /* prepend a copy of the header to the malloc block; allocator_free will use it*/
+    #ifdef USE_CUDA
+        if (cudaMallocManaged((void **) &cptr, request_size + ALIGNMENT, cudaMemAttachGlobal) != cudaSuccess)
+    #else
         if(posix_memalign((void **) &cptr, ALIGNMENT, request_size + ALIGNMENT))
+    #endif
             endrun(1, "Failed malloc: %lu bytes for %s\n", request_size, header->name);
         header->ptr = cptr + ALIGNMENT;
         memcpy(cptr, header, ALIGNMENT);

From 768dd66ff9ce3aa7f4fcc80ffc77d0cefb5369ab Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Tue, 24 Sep 2024 23:35:30 -0700
Subject: [PATCH 003/120] enforce mpicc for bigfile and pfft

---
 Options.mk.example | 3 ++-
 depends/Makefile   | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/Options.mk.example b/Options.mk.example
index 160a0a95..a5d3eefe 100644
--- a/Options.mk.example
+++ b/Options.mk.example
@@ -1,5 +1,6 @@
 #These variables are set to useful defaults, but may be overriden if needed
-#MPICC=mpicc
+#MPICC=mpic++
+#MPICCDEP=mpicc
 
 #NVCC=nvcc
 #NVOPTIMIZE = -O3 -arch=sm_61 # specify architecture according to you GPU model, sm_90 shall be used for Vista's H100
diff --git a/depends/Makefile b/depends/Makefile
index d18b2f16..0c9e6144 100644
--- a/depends/Makefile
+++ b/depends/Makefile
@@ -3,7 +3,8 @@ include $(CONFIG)
 
 .PHONY: depends
 .INTERMEDIATE: pfft
-MPICC ?= mpicc
+# MPICC ?= mpicc
+MPICCDEP ?= mpicc
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
 LIBRARIES=lib/libbigfile-mpi.a
 FFTLIBRARIES=lib/libpfft_omp.a lib/libfftw3_mpi.a lib/libfftw3_omp.a
@@ -14,13 +15,13 @@ lib/libbigfile-mpi.a: bigfile/src/bigfile-mpi.c
 	mkdir -p lib; \
 	mkdir -p include; \
 	cd bigfile/src; \
-	make install PREFIX=$(PWD) CC="$(MPICC)" MPICC="$(MPICC)" CFLAGS="$(OPTIMIZE)" AR="$(AR)"
+	make install PREFIX=$(PWD) CC="$(MPICCDEP)" MPICC="$(MPICCDEP)" CFLAGS="$(OPTIMIZE)" AR="$(AR)"
 
 pfft: install_pfft.sh
 	mkdir -p lib; \
 	mkdir -p include; \
 	#Using -ipo causes icc to crash.
-	MPICC="$(MPICC)" CC="$(MPICC)" CFLAGS="$(filter-out -ipo,$(OPTIMIZE)) -I $(PWD)/include -L$(PWD)/lib" AR="$(AR)" RANLIB=$(RANLIB) \
+	MPICC="$(MPICCDEP)" CC="$(MPICCDEP)" CFLAGS="$(filter-out -ipo,$(OPTIMIZE)) -I $(PWD)/include -L$(PWD)/lib" AR="$(AR)" RANLIB=$(RANLIB) \
         sh $(PWD)/install_pfft.sh $(PWD)/
 
 clean: clean-fast clean-fft

From 9678b1d6bbec6e5ca8a3dc084709f56ad09c2aac Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Thu, 26 Sep 2024 22:30:06 -0700
Subject: [PATCH 004/120] replace gsl integration with boost: first try on
 comoving distance

---
 Makefile.rules                          |   2 +-
 libgadget/lenstools.c                   |   3 +-
 libgadget/timefac.c                     | 108 +++++++++++++++++++++---
 libgadget/utils/{memory.cu => memory.c} |   0
 4 files changed, 101 insertions(+), 12 deletions(-)
 rename libgadget/utils/{memory.cu => memory.c} (100%)

diff --git a/Makefile.rules b/Makefile.rules
index 324e4b5b..22c9b4c9 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -31,7 +31,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
 BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
-LIBS  = -lm $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
+LIBS  = -lm -lboost_system -lboost_math_c99 $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 
diff --git a/libgadget/lenstools.c b/libgadget/lenstools.c
index 5cc2b735..a58cac7f 100644
--- a/libgadget/lenstools.c
+++ b/libgadget/lenstools.c
@@ -298,6 +298,7 @@ void savePotentialPlane(double *data, int rows, int cols, const char * const fil
     double Lbox_Mpc = Lbox * UnitLength_in_cm / CM_PER_MPC;  // Box size in Mpc/h
     double comoving_distance_Mpc = comoving_distance * UnitLength_in_cm / CM_PER_MPC;
     double Ode0 = CP->OmegaLambda > 0 ? CP->OmegaLambda : CP->Omega_fld;
+    char unit[] = "rad2    ";  // Mutable string for the UNIT keyword
     // Insert a blank line as a separator
     fits_write_record(fptr, "        ", &status);
     // Add headers to the FITS file
@@ -313,7 +314,7 @@ void savePotentialPlane(double *data, int rows, int cols, const char * const fil
     fits_update_key(fptr, TDOUBLE, "CHI", (&comoving_distance_Mpc), "Comoving distance in Mpc/h", &status);
     fits_update_key(fptr, TDOUBLE, "SIDE", &(Lbox_Mpc), "Side length in Mpc/h", &status);
     fits_update_key(fptr, TLONGLONG, "NPART", &num_particles, "Number of particles on the plane", &status);
-    fits_update_key(fptr, TSTRING, "UNIT", "rad2    ", "Pixel value unit", &status);
+    fits_update_key(fptr, TSTRING, "UNIT", unit, "Pixel value unit", &status);
 
     // Write the 2D array of doubles to the image
     long fpixel[2] = {1, 1};  // first pixel to write (1-based indexing)
diff --git a/libgadget/timefac.c b/libgadget/timefac.c
index 76d2c84d..4f351abe 100644
--- a/libgadget/timefac.c
+++ b/libgadget/timefac.c
@@ -10,6 +10,12 @@
 #include "timebinmgr.h"
 #include "utils.h"
 
+#include <stdio.h>
+#include <math.h>
+#include <boost/math/quadrature/gauss.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>  // For isnan and isinf
+#include <functional>
+
 #define WORKSIZE 10000
 
 /* Integrand for the drift table*/
@@ -85,19 +91,101 @@ static double comoving_distance_integ(double a, void *param)
 }
 
 /* Function to compute the comoving distance between two scale factors */
-double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s)
+// double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s)
+// {
+//     double result, abserr;
+//     gsl_function F;
+//     gsl_integration_workspace *workspace = gsl_integration_workspace_alloc(WORKSIZE);
+    
+//     F.function = comoving_distance_integ;
+//     F.params = CP;
+
+//     // Using GSL to perform the integration
+//     gsl_integration_qag(&F, a0, a1, 0, 1.0e-8, WORKSIZE, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
+//     gsl_integration_workspace_free(workspace);
+
+//     return (LIGHTCGS/UnitVelocity_in_cm_per_s) * result;
+// }
+
+/* Adaptive integration function with error control */
+double adaptive_integrate(std::function<double(double)> integrand, double a0, double a1, double *abserr, double epsrel = 1e-8, size_t max_points = 1024)
 {
+    double result_prev = 0.0;
+    double result_current = 0.0;
+    size_t points = 15;  // Start with 15-point Gauss-Legendre quadrature
+
+    while (true) {
+        result_prev = result_current;
+
+        // Use switch-case to handle different compile-time fixed point values
+        switch (points) {
+            case 15:
+                result_current = boost::math::quadrature::gauss<double, 15>::integrate(integrand, a0, a1);
+                break;
+            case 31:
+                result_current = boost::math::quadrature::gauss<double, 31>::integrate(integrand, a0, a1);
+                break;
+            case 63:
+                result_current = boost::math::quadrature::gauss<double, 63>::integrate(integrand, a0, a1);
+                break;
+            case 127:
+                result_current = boost::math::quadrature::gauss<double, 127>::integrate(integrand, a0, a1);
+                break;
+            case 255:
+                result_current = boost::math::quadrature::gauss<double, 255>::integrate(integrand, a0, a1);
+                break;
+            case 511:
+                result_current = boost::math::quadrature::gauss<double, 511>::integrate(integrand, a0, a1);
+                break;
+            case 1024:
+                result_current = boost::math::quadrature::gauss<double, 1024>::integrate(integrand, a0, a1);
+                break;
+            default:
+                printf("Unsupported number of points: %zu\n", points);
+                return result_current;
+        }
+
+        // Estimate the absolute error as the difference between successive results
+        *abserr = fabs(result_current - result_prev);
+
+        // Check if the relative error is within the tolerance
+        if (fabs(result_current) > 0 && (*abserr / fabs(result_current)) < epsrel) {
+            break;
+        }
+
+        // If we've reached the max allowed points without satisfying error tolerance, stop
+        if (points == max_points) {
+            printf("Warning: Maximum points reached. Desired relative error not achieved.\n");
+            break;
+        }
+
+        // Double the number of quadrature points for the next iteration
+        size_t next_points = points * 2;
+        if (next_points > max_points) {
+            points = max_points;
+        } else {
+            points = next_points;
+        }
+    }
+
+    return result_current;
+}
+
+/* Function to compute comoving distance using the adaptive integrator */
+double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s)
+{   
+    // relative error tolerance
+    // double epsrel = 1e-8;
     double result, abserr;
-    gsl_function F;
-    gsl_integration_workspace *workspace = gsl_integration_workspace_alloc(WORKSIZE);
-    
-    F.function = comoving_distance_integ;
-    F.params = CP;
+    // Define the integrand as a lambda function, wrapping comoving_distance_integ
+    auto integrand = [CP](double a) {
+        return comoving_distance_integ(a, (void*)CP);
+    };
 
-    // Using GSL to perform the integration
-    gsl_integration_qag(&F, a0, a1, 0, 1.0e-8, WORKSIZE, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
-    gsl_integration_workspace_free(workspace);
+    // Call the generic adaptive integration function
+    result = adaptive_integrate(integrand, a0, a1, &abserr);
 
-    return (LIGHTCGS/UnitVelocity_in_cm_per_s) * result;
+    // Convert the result using the provided units
+    return (LIGHTCGS / UnitVelocity_in_cm_per_s) * result;
 }
 
diff --git a/libgadget/utils/memory.cu b/libgadget/utils/memory.c
similarity index 100%
rename from libgadget/utils/memory.cu
rename to libgadget/utils/memory.c

From 757e5b8b7bea0fef88fcf1e3a3c45cca045f00a2 Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Fri, 27 Sep 2024 22:00:43 -0700
Subject: [PATCH 005/120] tanh_sinh integrator

---
 libgadget/timefac.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/libgadget/timefac.c b/libgadget/timefac.c
index 4f351abe..22d6ac9c 100644
--- a/libgadget/timefac.c
+++ b/libgadget/timefac.c
@@ -15,6 +15,7 @@
 #include <boost/math/quadrature/gauss.hpp>
 #include <boost/math/special_functions/fpclassify.hpp>  // For isnan and isinf
 #include <functional>
+#include <boost/math/quadrature/tanh_sinh.hpp>
 
 #define WORKSIZE 10000
 
@@ -171,6 +172,41 @@ double adaptive_integrate(std::function<double(double)> integrand, double a0, do
     return result_current;
 }
 
+
+// Function to perform tanh-sinh integration with adaptive max_refinements
+double tanh_sinh_integrate_adaptive(auto func, double a, double b, double* estimated_error, double rel_tol = 1e-8, int max_refinements_limit = 30, int init_refine = 5, int step = 5) {
+    double result_prev = 0.0;
+    double result_current = 0.0;
+    *estimated_error = 1.0;  // Start with a large relative error
+    int max_refine = init_refine;
+
+    // Loop until reaching the max refinements limit or satisfying the tolerance
+    for (; max_refine <= max_refinements_limit; max_refine += step) {
+        // Create a Tanh-Sinh integrator with the current max_refinements
+        boost::math::quadrature::tanh_sinh<double> integrator(max_refine);
+
+        // Perform the integration
+        result_current = integrator.integrate(func, a, b);
+
+        // If this is not the first iteration, compute the relative error
+        if (max_refine > init_refine) {
+            *estimated_error = fabs(result_current - result_prev) / fabs(result_current);
+
+            // Check if the relative error is within the target tolerance
+            if (*estimated_error < rel_tol) {
+                break;  // Stop refining if the result is within the tolerance
+            }
+        }
+
+        // Update the previous result for the next iteration
+        result_prev = result_current;
+    }
+
+    // Return the final result
+    return result_current;
+}
+
+
 /* Function to compute comoving distance using the adaptive integrator */
 double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s)
 {   
@@ -183,9 +219,9 @@ double compute_comoving_distance(Cosmology *CP, double a0, double a1, const doub
     };
 
     // Call the generic adaptive integration function
-    result = adaptive_integrate(integrand, a0, a1, &abserr);
+    // result = adaptive_integrate(integrand, a0, a1, &abserr);
+    result = tanh_sinh_integrate_adaptive(integrand, a0, a1, &abserr);
 
     // Convert the result using the provided units
     return (LIGHTCGS / UnitVelocity_in_cm_per_s) * result;
 }
-

From e2d26591e6c75d3d536e830dd4d9c091ac36019c Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Fri, 27 Sep 2024 22:47:49 -0700
Subject: [PATCH 006/120] cpp compatibility correction

---
 libgadget/gravpm.c  | 4 ++--
 libgadget/timefac.c | 2 +-
 libgadget/uvbg.c    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libgadget/gravpm.c b/libgadget/gravpm.c
index 3130d117..829443c7 100644
--- a/libgadget/gravpm.c
+++ b/libgadget/gravpm.c
@@ -62,8 +62,8 @@ gravpm_force(PetaPM * pm, DomainDecomp * ddecomp, Cosmology * CP, double Time, d
     PetaPMParticleStruct pstruct = {
         P,
         sizeof(P[0]),
-        (char*) &P[0].Pos[0]  - (char*) P,
-        (char*) &P[0].Mass  - (char*) P,
+        static_cast<size_t>((char*) &P[0].Pos[0]  - (char*) P),
+        static_cast<size_t>((char*) &P[0].Mass  - (char*) P),
         /* Regions allocated inside _prepare*/
         NULL,
         /* By default all particles are active. For hybrid neutrinos set below.*/
diff --git a/libgadget/timefac.c b/libgadget/timefac.c
index 22d6ac9c..f29ca6ce 100644
--- a/libgadget/timefac.c
+++ b/libgadget/timefac.c
@@ -174,7 +174,7 @@ double adaptive_integrate(std::function<double(double)> integrand, double a0, do
 
 
 // Function to perform tanh-sinh integration with adaptive max_refinements
-double tanh_sinh_integrate_adaptive(auto func, double a, double b, double* estimated_error, double rel_tol = 1e-8, int max_refinements_limit = 30, int init_refine = 5, int step = 5) {
+double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a, double b, double* estimated_error, double rel_tol = 1e-8, int max_refinements_limit = 30, int init_refine = 5, int step = 5) {
     double result_prev = 0.0;
     double result_current = 0.0;
     *estimated_error = 1.0;  // Start with a large relative error
diff --git a/libgadget/uvbg.c b/libgadget/uvbg.c
index 17070af6..97a379ae 100644
--- a/libgadget/uvbg.c
+++ b/libgadget/uvbg.c
@@ -131,7 +131,7 @@ void save_uvbg_grids(int SnapshotFileCount, char * OutputDir, PetaPM * pm)
     //TODO: think about the cartesian communicator in the PetaPM struct
     //and the mapping between ranks, indices and positions
 
-    size_t dims[2] = {grid_n, 1};
+    size_t dims[2] = {(size_t)grid_n, 1};
     //J21 block
     BigArray arr = {0};
     big_array_init(&arr, UVBGgrids.J21, "=f4", 2, dims, NULL);

From d20fb2ad5a68e209ee0da4b759b58ec17b8a3af2 Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Fri, 27 Sep 2024 22:51:00 -0700
Subject: [PATCH 007/120] cpp compat

---
 libgadget/utils/spinlocks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgadget/utils/spinlocks.c b/libgadget/utils/spinlocks.c
index d160db10..d3f76d27 100644
--- a/libgadget/utils/spinlocks.c
+++ b/libgadget/utils/spinlocks.c
@@ -54,7 +54,7 @@ struct SpinLocks * init_spinlocks(int NumLock)
     spin.SpinLocks = (pthread_spinlock_t *) mymalloc("SpinLocks", NumLock * sizeof(pthread_spinlock_t));
     #pragma omp parallel for
 #else
-    spin.SpinLocks = mymalloc("SpinLocks", NumLock * sizeof(omp_lock_t));
+    spin.SpinLocks = (omp_lock_t*)mymalloc("SpinLocks", NumLock * sizeof(omp_lock_t));
 #endif
     for(i = 0; i < NumLock; i ++) {
 #ifndef NO_OPENMP_SPINLOCK

From d6b2daffe2565ff7c59b37bee668fff041dad8e1 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Sat, 28 Sep 2024 01:57:15 -0700
Subject: [PATCH 008/120] cpp correction

---
 libgenic/glass.c     | 4 ++--
 libgenic/zeldovich.c | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/libgenic/glass.c b/libgenic/glass.c
index 6bd6ff34..5d8e6bbd 100644
--- a/libgenic/glass.c
+++ b/libgenic/glass.c
@@ -183,8 +183,8 @@ static void glass_force(PetaPM * pm, double t_f, struct ic_part_data * ICP, cons
     PetaPMParticleStruct pstruct = {
         ICP,
         sizeof(ICP[0]),
-        (char*) &ICP[0].Pos[0]  - (char*) ICP,
-        (char*) &ICP[0].Mass  - (char*) ICP,
+        (size_t)((char*) &ICP[0].Pos[0]  - (char*) ICP),
+        (size_t)((char*) &ICP[0].Mass  - (char*) ICP),
         NULL,
         NULL,
         NumPart,
diff --git a/libgenic/zeldovich.c b/libgenic/zeldovich.c
index a60f3a8a..ffc2bfee 100644
--- a/libgenic/zeldovich.c
+++ b/libgenic/zeldovich.c
@@ -155,8 +155,9 @@ void displacement_fields(PetaPM * pm, enum TransferType Type, struct ic_part_dat
     PetaPMParticleStruct pstruct = {
         curICP,
         sizeof(curICP[0]),
-        ((char*) &curICP[0].Pos[0]) - (char*) curICP,
-        ((char*) &curICP[0].Mass) - (char*) curICP,
+        (size_t)(((char*) &curICP[0].Pos[0]) - (char*) curICP),
+        (size_t)(((char*) &curICP[0].Mass) - (char*) curICP),
+
         NULL,
         NULL,
         NumPart,

From 3c911e63a7f3aee52f268f20d30888d4089d6c7a Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Sat, 28 Sep 2024 21:35:14 -0700
Subject: [PATCH 009/120] tanh_sinh adaptive exact factor

---
 libgadget/timefac.c | 180 +++++++++++++++++---------------------------
 1 file changed, 70 insertions(+), 110 deletions(-)

diff --git a/libgadget/timefac.c b/libgadget/timefac.c
index f29ca6ce..701da050 100644
--- a/libgadget/timefac.c
+++ b/libgadget/timefac.c
@@ -19,6 +19,39 @@
 
 #define WORKSIZE 10000
 
+// Function to perform tanh-sinh integration with adaptive max_refinements
+double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a, double b, double* estimated_error, double rel_tol = 1e-8, int max_refinements_limit = 30, int init_refine = 5, int step = 5) {
+    double result_prev = 0.0;
+    double result_current = 0.0;
+    *estimated_error = 1.0;  // Start with a large relative error
+    int max_refine = init_refine;
+
+    // Loop until reaching the max refinements limit or satisfying the tolerance
+    for (; max_refine <= max_refinements_limit; max_refine += step) {
+        // Create a Tanh-Sinh integrator with the current max_refinements
+        boost::math::quadrature::tanh_sinh<double> integrator(max_refine);
+
+        // Perform the integration
+        result_current = integrator.integrate(func, a, b);
+
+        // If this is not the first iteration, compute the relative error
+        if (max_refine > init_refine) {
+            *estimated_error = fabs(result_current - result_prev) / fabs(result_current);
+
+            // Check if the relative error is within the target tolerance
+            if (*estimated_error < rel_tol) {
+                break;  // Stop refining if the result is within the tolerance
+            }
+        }
+
+        // Update the previous result for the next iteration
+        result_prev = result_current;
+    }
+
+    // Return the final result
+    return result_current;
+}
+
 /* Integrand for the drift table*/
 static double drift_integ(double a, void *param)
 {
@@ -49,20 +82,43 @@ static double hydrokick_integ(double a, void *param)
 }
 
 /*Do the integral required to get a factor.*/
-static double get_exact_factor(Cosmology * CP, inttime_t t0, inttime_t t1, double (*factor) (double, void *))
+// static double get_exact_factor(Cosmology * CP, inttime_t t0, inttime_t t1, double (*factor) (double, void *))
+// {
+//     double result, abserr;
+//     if(t0 == t1)
+//         return 0;
+//     double a0 = exp(loga_from_ti(t0));
+//     double a1 = exp(loga_from_ti(t1));
+//     gsl_function F;
+//     gsl_integration_workspace *workspace;
+//     workspace = gsl_integration_workspace_alloc(WORKSIZE);
+//     F.function = factor;
+//     F.params = CP;
+//     gsl_integration_qag(&F, a0, a1, 0, 1.0e-8, WORKSIZE, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
+//     gsl_integration_workspace_free(workspace);
+//     return result;
+// }
+
+// Function to compute a factor using Tanh-Sinh adaptive integration
+static double get_exact_factor(Cosmology *CP, inttime_t t0, inttime_t t1, double (*factor)(double, void *))
 {
-    double result, abserr;
-    if(t0 == t1)
+    if (t0 == t1) {
         return 0;
+    }
+
+    // Calculate the scale factors
     double a0 = exp(loga_from_ti(t0));
     double a1 = exp(loga_from_ti(t1));
-    gsl_function F;
-    gsl_integration_workspace *workspace;
-    workspace = gsl_integration_workspace_alloc(WORKSIZE);
-    F.function = factor;
-    F.params = CP;
-    gsl_integration_qag(&F, a0, a1, 0, 1.0e-8, WORKSIZE, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
-    gsl_integration_workspace_free(workspace);
+    double abserr;
+
+    // Define the integrand as a lambda function, wrapping the existing factor function
+    auto integrand = [CP, factor](double a) {
+        return factor(a, (void*)CP);
+    };
+
+    // Call the adaptive Tanh-Sinh integrator
+    double result = tanh_sinh_integrate_adaptive(integrand, a0, a1, &abserr);
+
     return result;
 }
 
@@ -86,9 +142,10 @@ double get_exact_hydrokick_factor(Cosmology * CP, inttime_t ti0, inttime_t ti1)
 /* Integrand for comoving distance */
 static double comoving_distance_integ(double a, void *param)
 {
-    Cosmology *CP = (Cosmology *) param;
-    double h = hubble_function(CP, a);
-    return 1. / (h * a * a); 
+    // Cosmology *CP = (Cosmology *) param;
+    // double h = hubble_function(CP, a);
+    // return 1. / (h * a * a); 
+    return gravkick_integ(a, param);
 }
 
 /* Function to compute the comoving distance between two scale factors */
@@ -108,103 +165,6 @@ static double comoving_distance_integ(double a, void *param)
 //     return (LIGHTCGS/UnitVelocity_in_cm_per_s) * result;
 // }
 
-/* Adaptive integration function with error control */
-double adaptive_integrate(std::function<double(double)> integrand, double a0, double a1, double *abserr, double epsrel = 1e-8, size_t max_points = 1024)
-{
-    double result_prev = 0.0;
-    double result_current = 0.0;
-    size_t points = 15;  // Start with 15-point Gauss-Legendre quadrature
-
-    while (true) {
-        result_prev = result_current;
-
-        // Use switch-case to handle different compile-time fixed point values
-        switch (points) {
-            case 15:
-                result_current = boost::math::quadrature::gauss<double, 15>::integrate(integrand, a0, a1);
-                break;
-            case 31:
-                result_current = boost::math::quadrature::gauss<double, 31>::integrate(integrand, a0, a1);
-                break;
-            case 63:
-                result_current = boost::math::quadrature::gauss<double, 63>::integrate(integrand, a0, a1);
-                break;
-            case 127:
-                result_current = boost::math::quadrature::gauss<double, 127>::integrate(integrand, a0, a1);
-                break;
-            case 255:
-                result_current = boost::math::quadrature::gauss<double, 255>::integrate(integrand, a0, a1);
-                break;
-            case 511:
-                result_current = boost::math::quadrature::gauss<double, 511>::integrate(integrand, a0, a1);
-                break;
-            case 1024:
-                result_current = boost::math::quadrature::gauss<double, 1024>::integrate(integrand, a0, a1);
-                break;
-            default:
-                printf("Unsupported number of points: %zu\n", points);
-                return result_current;
-        }
-
-        // Estimate the absolute error as the difference between successive results
-        *abserr = fabs(result_current - result_prev);
-
-        // Check if the relative error is within the tolerance
-        if (fabs(result_current) > 0 && (*abserr / fabs(result_current)) < epsrel) {
-            break;
-        }
-
-        // If we've reached the max allowed points without satisfying error tolerance, stop
-        if (points == max_points) {
-            printf("Warning: Maximum points reached. Desired relative error not achieved.\n");
-            break;
-        }
-
-        // Double the number of quadrature points for the next iteration
-        size_t next_points = points * 2;
-        if (next_points > max_points) {
-            points = max_points;
-        } else {
-            points = next_points;
-        }
-    }
-
-    return result_current;
-}
-
-
-// Function to perform tanh-sinh integration with adaptive max_refinements
-double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a, double b, double* estimated_error, double rel_tol = 1e-8, int max_refinements_limit = 30, int init_refine = 5, int step = 5) {
-    double result_prev = 0.0;
-    double result_current = 0.0;
-    *estimated_error = 1.0;  // Start with a large relative error
-    int max_refine = init_refine;
-
-    // Loop until reaching the max refinements limit or satisfying the tolerance
-    for (; max_refine <= max_refinements_limit; max_refine += step) {
-        // Create a Tanh-Sinh integrator with the current max_refinements
-        boost::math::quadrature::tanh_sinh<double> integrator(max_refine);
-
-        // Perform the integration
-        result_current = integrator.integrate(func, a, b);
-
-        // If this is not the first iteration, compute the relative error
-        if (max_refine > init_refine) {
-            *estimated_error = fabs(result_current - result_prev) / fabs(result_current);
-
-            // Check if the relative error is within the target tolerance
-            if (*estimated_error < rel_tol) {
-                break;  // Stop refining if the result is within the tolerance
-            }
-        }
-
-        // Update the previous result for the next iteration
-        result_prev = result_current;
-    }
-
-    // Return the final result
-    return result_current;
-}
 
 
 /* Function to compute comoving distance using the adaptive integrator */

From 5a2bee5659da2823a6afa4f4dc5dd04dc723de05 Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Sat, 28 Sep 2024 21:45:43 -0700
Subject: [PATCH 010/120] cleanup

---
 libgadget/timefac.c | 40 ----------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/libgadget/timefac.c b/libgadget/timefac.c
index 701da050..e850ecac 100644
--- a/libgadget/timefac.c
+++ b/libgadget/timefac.c
@@ -3,7 +3,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
-#include <gsl/gsl_integration.h>
 
 #include "physconst.h"
 #include "timefac.h"
@@ -17,8 +16,6 @@
 #include <functional>
 #include <boost/math/quadrature/tanh_sinh.hpp>
 
-#define WORKSIZE 10000
-
 // Function to perform tanh-sinh integration with adaptive max_refinements
 double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a, double b, double* estimated_error, double rel_tol = 1e-8, int max_refinements_limit = 30, int init_refine = 5, int step = 5) {
     double result_prev = 0.0;
@@ -81,24 +78,6 @@ static double hydrokick_integ(double a, void *param)
   return 1 / (h * pow(a, 3 * GAMMA_MINUS1) * a);
 }
 
-/*Do the integral required to get a factor.*/
-// static double get_exact_factor(Cosmology * CP, inttime_t t0, inttime_t t1, double (*factor) (double, void *))
-// {
-//     double result, abserr;
-//     if(t0 == t1)
-//         return 0;
-//     double a0 = exp(loga_from_ti(t0));
-//     double a1 = exp(loga_from_ti(t1));
-//     gsl_function F;
-//     gsl_integration_workspace *workspace;
-//     workspace = gsl_integration_workspace_alloc(WORKSIZE);
-//     F.function = factor;
-//     F.params = CP;
-//     gsl_integration_qag(&F, a0, a1, 0, 1.0e-8, WORKSIZE, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
-//     gsl_integration_workspace_free(workspace);
-//     return result;
-// }
-
 // Function to compute a factor using Tanh-Sinh adaptive integration
 static double get_exact_factor(Cosmology *CP, inttime_t t0, inttime_t t1, double (*factor)(double, void *))
 {
@@ -148,25 +127,6 @@ static double comoving_distance_integ(double a, void *param)
     return gravkick_integ(a, param);
 }
 
-/* Function to compute the comoving distance between two scale factors */
-// double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s)
-// {
-//     double result, abserr;
-//     gsl_function F;
-//     gsl_integration_workspace *workspace = gsl_integration_workspace_alloc(WORKSIZE);
-    
-//     F.function = comoving_distance_integ;
-//     F.params = CP;
-
-//     // Using GSL to perform the integration
-//     gsl_integration_qag(&F, a0, a1, 0, 1.0e-8, WORKSIZE, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
-//     gsl_integration_workspace_free(workspace);
-
-//     return (LIGHTCGS/UnitVelocity_in_cm_per_s) * result;
-// }
-
-
-
 /* Function to compute comoving distance using the adaptive integrator */
 double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s)
 {   

From c9cd36dc2baf8d25d1585620f7fd3612fecd2d59 Mon Sep 17 00:00:00 2001
From: astro-YYH <yyang440@ucr.edu>
Date: Sat, 28 Sep 2024 22:25:08 -0700
Subject: [PATCH 011/120] cosmology: use tanh-sinh integration for tophat sigma

---
 libgadget/cosmology.c | 27 +++++++++++++++------------
 libgadget/timefac.c   |  7 ++++++-
 libgadget/timefac.h   | 12 ++++++++++++
 3 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/libgadget/cosmology.c b/libgadget/cosmology.c
index b4abd76e..61a8840d 100644
--- a/libgadget/cosmology.c
+++ b/libgadget/cosmology.c
@@ -1,11 +1,11 @@
 #include <math.h>
-#include <gsl/gsl_integration.h>
 #include <gsl/gsl_errno.h>
 #include <gsl/gsl_odeiv2.h>
 
 #include "cosmology.h"
 #include "physconst.h"
 #include "utils.h"
+#include "timefac.h"
 
 /*Stefan-Boltzmann constant in cgs units*/
 #define  STEFAN_BOLTZMANN 5.670373e-5
@@ -236,19 +236,22 @@ double function_of_k_eval(FunctionOfK * fk, double k)
     }
 }
 
-double function_of_k_tophat_sigma(FunctionOfK * fk, double R)
+// Adapted function to use Tanh-Sinh adaptive integration
+double function_of_k_tophat_sigma(FunctionOfK *fk, double R)
 {
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (1000);
+    // Create the parameter structure
     struct sigma2_params params = {fk, R};
-    double result,abserr;
-    gsl_function F;
-    F.function = &sigma2_int;
-    F.params = &params;
-
-    /* note: 500/R is here chosen as integration boundary (infinity) */
-    gsl_integration_qags (&F, 0, 500. / R, 0, 1e-4,1000,w,&result, &abserr);
-    //   printf("gsl_integration_qng in TopHatSigma2. Result %g, error: %g, intervals: %lu\n",result, abserr,w->size);
-    gsl_integration_workspace_free (w);
+    double abserr;  // To hold the estimated error
+
+    // Define the integrand as a lambda function wrapping the original `sigma2_int`
+    auto integrand = [&params](double k) -> double {
+        return sigma2_int(k, (void*)&params);
+    };
+
+    // Perform the Tanh-Sinh adaptive integration
+    double result = tanh_sinh_integrate_adaptive(integrand, 0, 500.0 / R, &abserr, 1e-4);
+
+    // Return the square root of the result
     return sqrt(result);
 }
 
diff --git a/libgadget/timefac.c b/libgadget/timefac.c
index e850ecac..d8984de8 100644
--- a/libgadget/timefac.c
+++ b/libgadget/timefac.c
@@ -17,7 +17,7 @@
 #include <boost/math/quadrature/tanh_sinh.hpp>
 
 // Function to perform tanh-sinh integration with adaptive max_refinements
-double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a, double b, double* estimated_error, double rel_tol = 1e-8, int max_refinements_limit = 30, int init_refine = 5, int step = 5) {
+double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a, double b, double* estimated_error, double rel_tol, int max_refinements_limit, int init_refine, int step) {
     double result_prev = 0.0;
     double result_current = 0.0;
     *estimated_error = 1.0;  // Start with a large relative error
@@ -45,6 +45,11 @@ double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a
         result_prev = result_current;
     }
 
+    // If we exited the loop without achieving the desired tolerance, print a warning
+    if (*estimated_error > rel_tol) {
+        message(1, "Warning: Tanh-Sinh integration did not reach the desired tolerance of %g. Final relative error: %g\n", rel_tol, *estimated_error);
+    }
+
     // Return the final result
     return result_current;
 }
diff --git a/libgadget/timefac.h b/libgadget/timefac.h
index 10e85522..a6a26672 100644
--- a/libgadget/timefac.h
+++ b/libgadget/timefac.h
@@ -4,10 +4,22 @@
 #include "types.h"
 #include "cosmology.h"
 #include "timebinmgr.h"
+#include <functional>  // For std::function
 
 /* Get the exact drift and kick factors at given time by integrating. */
 double get_exact_drift_factor(Cosmology * CP, inttime_t ti0, inttime_t ti1);
 double get_exact_hydrokick_factor(Cosmology * CP, inttime_t ti0, inttime_t ti1);
 double get_exact_gravkick_factor(Cosmology * CP, inttime_t ti0, inttime_t ti1);
 double compute_comoving_distance(Cosmology *CP, double a0, double a1, const double UnitVelocity_in_cm_per_s);
+double tanh_sinh_integrate_adaptive(
+    std::function<double(double)> func, 
+    double a, 
+    double b, 
+    double* estimated_error, 
+    double rel_tol = 1e-8, 
+    int max_refinements_limit = 30, 
+    int init_refine = 5, 
+    int step = 5
+);
+
 #endif

From 4bc9970d91278c7683a8cef72a43185835dfc528 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 15:22:48 -0500
Subject: [PATCH 012/120] Makefile: add BOOST_INCL/BOOST_LIBS, default MPICC to mpic++

---
 Makefile       |  3 ++-
 Makefile.rules | 13 +++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 322699bb..802532f8 100644
--- a/Makefile
+++ b/Makefile
@@ -8,12 +8,13 @@ include Makefile.version
 FILES = $(shell git ls-files)
 
 all: $(CONFIG)
+	@echo "=================$(BOOST_LIBS)======================="
+	@echo "=================$(GSL_LIBS)======================="
 	cd depends; $(MAKE)
 	cd libgadget; $(MAKE)
 	cd libgenic; $(MAKE)
 	cd gadget; $(MAKE)
 	cd genic; $(MAKE)
-
 clean :
 	cd libgadget; $(MAKE) clean
 	cd libgenic; $(MAKE) clean
diff --git a/Makefile.rules b/Makefile.rules
index 22c9b4c9..db579939 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -1,12 +1,17 @@
 # vim: set ft=make:
 #
 AR ?= ar
-MPICC ?= mpicc
+MPICC ?= mpic++
 LOW_PRECISION ?= double
 
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
 GSL_INCL ?= $(shell pkg-config --cflags gsl)
 GSL_LIBS ?= $(shell pkg-config --libs gsl)
+#BOOST_INCL ?= $(shell pkg-config --cflags boost)
+#BOOST_LIBS ?= $(shell pkg-config --libs boost)
+all:
+	@echo "=================$(BOOST_LIBS)======================="
+
 ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
     # If found, set FITSIO_INCL with the cfitsio flags
     FITSIO_INCL ?= $(shell pkg-config --cflags cfitsio)
@@ -23,7 +28,7 @@ endif
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(GSL_INCL) $(FITSIO_INCL) $(CUDA_INCL)
+CFLAGS = $(OPTIONS) $(GSL_INCL) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
@@ -31,7 +36,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
 BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
-LIBS  = -lm -lboost_system -lboost_math_c99 $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
+LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 
@@ -43,4 +48,4 @@ V ?= 0
 
 # Rule to compile .cu files (using nvcc)
 .objs/%.o: %.cu
-	$(NVCC) $(NVOPTIMIZE) -c $< -o $@
\ No newline at end of file
+	$(NVCC) $(NVOPTIMIZE) -c $< -o $@

From 5672994a3a300ceb26495111c6efa28269045c9e Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 16:30:39 -0400
Subject: [PATCH 013/120] changed pfft calls/vars to cufft

---
 libgadget/petapm.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 2a3a45c8..97a47dd9 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -1,6 +1,6 @@
 #ifndef __PETAPM_H__
 #define __PETAPM_H__
-#include <pfft.h>
+#include <cufftmp.h>   // NC:library change
 
 #include "powerspectrum.h"
 
@@ -49,8 +49,8 @@ typedef struct PetaPMPriv {
     /* These varibles are initialized by petapm_init*/
 
     int fftsize;
-    pfft_plan plan_forw;
-    pfft_plan plan_back;
+    cufftmpHandle_t plan_forw; // NC:change plan function call
+    cufftmpHandle_t plan_back;
     MPI_Comm comm_cart_2d;
 
     /* these variables are allocated every force calculation */
@@ -99,7 +99,7 @@ typedef struct {
     size_t offset_fesc; //offset in fof groups to fof mass
 } PetaPMReionPartStruct;
 
-typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value); //NC:change to complex type
 typedef void (*petapm_readout_func)(PetaPM * pm, int i, double * mesh, double weight);
 typedef PetaPMRegion * (*petapm_prepare_func)(PetaPM * pm, PetaPMParticleStruct * pstruct, void * data, int *Nregions);
 
@@ -142,13 +142,13 @@ PetaPMRegion * petapm_force_init(PetaPM * pm,
         PetaPMParticleStruct * pstruct,
         int * Nregions,
         void * userdata);
-pfft_complex * petapm_force_r2c(PetaPM * pm,
+cufftComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
-        );
+        ); // NC: changed returned complex type
 void petapm_force_c2r(PetaPM * pm,
-        pfft_complex * rho_k, PetaPMRegion * regions,
+        cufftComplex * rho_k, PetaPMRegion * regions,
         const int Nregions,
-        PetaPMFunctions * functions);
+        PetaPMFunctions * functions); // NC: changed input complex type
 void petapm_force_finish(PetaPM * pm);
 
 PetaPMRegion * petapm_get_fourier_region(PetaPM * pm);
@@ -156,7 +156,7 @@ PetaPMRegion * petapm_get_real_region(PetaPM * pm);
 int petapm_mesh_to_k(PetaPM * pm, int i);
 int *petapm_get_thistask2d(PetaPM * pm);
 int *petapm_get_ntask2d(PetaPM * pm);
-pfft_complex * petapm_alloc_rhok(PetaPM * pm);
+cufftComplex * petapm_alloc_rhok(PetaPM * pm); // NC: changed returned complex type
 
 void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
         petapm_prepare_func prepare,

From 9b0d2ca467339a6086ccbaa39c2c3d410388b99c Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 16:32:03 -0400
Subject: [PATCH 014/120] changed pfft calls/vars to cufft in petapm.c

---
 libgadget/petapm.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index aeda7bb3..c1b8e12d 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -46,10 +46,10 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 static MPI_Datatype MPI_PENCIL;
 
 /*Used only in MP-GenIC*/
-pfft_complex *
+cufftComplex *
 petapm_alloc_rhok(PetaPM * pm)
 {
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
     memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
     return rho_k;
 }
@@ -174,8 +174,8 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     /* planning the fft; need temporary arrays */
 
     double * real = (double * ) mymalloc("PMreal", pm->priv->fftsize * sizeof(double));
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
-    pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
+    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
 
     pm->priv->plan_forw = pfft_plan_dft_r2c_3d(
         n, real, rho_k, pm->priv->comm_cart_2d, PFFT_FORWARD,
@@ -237,8 +237,8 @@ typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
 static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
 /* apply transfer function to value, kpos array is in x, y, z order */
 static void pm_apply_transfer_function(PetaPM * pm,
-        pfft_complex * src,
-        pfft_complex * dst, petapm_transfer_func H);
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H);
 
 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
 static void put_star_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
@@ -279,7 +279,7 @@ petapm_force_init(
     return regions;
 }
 
-pfft_complex * petapm_force_r2c(PetaPM * pm,
+cufftComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ) {
     /* call pfft rho_k is CFT of rho */
@@ -299,11 +299,11 @@ pfft_complex * petapm_force_r2c(PetaPM * pm,
     walltime_measure("/PMgrav/Verify");
 #endif
 
-    pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
     pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
     myfree(real);
 
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
 
     /*Do any analysis that may be required before the transfer function is applied*/
     petapm_transfer_func global_readout = global_functions->global_readout;
@@ -322,7 +322,7 @@ pfft_complex * petapm_force_r2c(PetaPM * pm,
 
 void
 petapm_force_c2r(PetaPM * pm,
-        pfft_complex * rho_k,
+        cufftComplex * rho_k,
         PetaPMRegion * regions,
         const int Nregions,
         PetaPMFunctions * functions)
@@ -333,7 +333,7 @@ petapm_force_c2r(PetaPM * pm,
         petapm_transfer_func transfer = f->transfer;
         petapm_readout_func readout = f->readout;
 
-        pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+        cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
         /* apply the greens function turn rho_k into potential in fourier space */
         pm_apply_transfer_function(pm, rho_k, complx, transfer);
         walltime_measure("/PMgrav/calc");
@@ -366,7 +366,7 @@ void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
         void * userdata) {
     int Nregions;
     PetaPMRegion * regions = petapm_force_init(pm, prepare, pstruct, &Nregions, userdata);
-    pfft_complex * rho_k = petapm_force_r2c(pm, global_functions);
+    cufftComplex * rho_k = petapm_force_r2c(pm, global_functions);
     if(functions)
         petapm_force_c2r(pm, rho_k, regions, Nregions, functions);
     myfree(rho_k);
@@ -413,7 +413,7 @@ petapm_reion_init(
  * ,after c2r but iteration over the grid, instead of particles */
 void
 petapm_reion_c2r(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        pfft_complex * mass_unfiltered, pfft_complex * star_unfiltered, pfft_complex * sfr_unfiltered,
+        cufftComplex * mass_unfiltered, cufftComplex * star_unfiltered, cufftComplex * sfr_unfiltered,
         PetaPMRegion * regions,
         const int Nregions,
         PetaPMFunctions * functions,
@@ -446,11 +446,11 @@ petapm_reion_c2r(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
         if(use_sfr)pm_sfr->G = R;
 
         //TODO: maybe allocate and free these outside the loop
-        pfft_complex * mass_filtered = (pfft_complex *) mymalloc("mass_filtered", pm_mass->priv->fftsize * sizeof(double));
-        pfft_complex * star_filtered = (pfft_complex *) mymalloc("star_filtered", pm_star->priv->fftsize * sizeof(double));
-        pfft_complex * sfr_filtered;
+        cufftComplex * mass_filtered = (cufftComplex *) mymalloc("mass_filtered", pm_mass->priv->fftsize * sizeof(double));
+        cufftComplex * star_filtered = (cufftComplex *) mymalloc("star_filtered", pm_star->priv->fftsize * sizeof(double));
+        cufftComplex * sfr_filtered;
         if(use_sfr){
-            sfr_filtered = (pfft_complex *) mymalloc("sfr_filtered", pm_sfr->priv->fftsize * sizeof(double));
+            sfr_filtered = (cufftComplex *) mymalloc("sfr_filtered", pm_sfr->priv->fftsize * sizeof(double));
         }
 
         /* apply the filtering at this radius */
@@ -536,9 +536,9 @@ void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
     walltime_measure("/PMreion/comm2");
 
     //using force r2c since this part can be done independently
-    pfft_complex * mass_unfiltered = petapm_force_r2c(pm_mass, global_functions);
-    pfft_complex * star_unfiltered = petapm_force_r2c(pm_star, global_functions);
-    pfft_complex * sfr_unfiltered = NULL;
+    cufftComplex * mass_unfiltered = petapm_force_r2c(pm_mass, global_functions);
+    cufftComplex * star_unfiltered = petapm_force_r2c(pm_star, global_functions);
+    cufftComplex * sfr_unfiltered = NULL;
     if(use_sfr){
         sfr_unfiltered = petapm_force_r2c(pm_sfr, global_functions);
     }
@@ -1088,8 +1088,8 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 #endif
 
 static void pm_apply_transfer_function(PetaPM * pm,
-        pfft_complex * src,
-        pfft_complex * dst, petapm_transfer_func H
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H
         ){
     size_t ip = 0;
 

From 7c5353ce1ca41db1503b6d3f289983d6d3107a01 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 16:40:27 -0400
Subject: [PATCH 015/120] fixed typo in lib include

---
 libgadget/petapm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 97a47dd9..1fa1070c 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -1,6 +1,6 @@
 #ifndef __PETAPM_H__
 #define __PETAPM_H__
-#include <cufftmp.h>   // NC:library change
+#include <cufftMp.h>   // NC:library change
 
 #include "powerspectrum.h"
 

From c6d0b8d40977f7d85c99964af1fdfcc01a557f2c Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 21:55:49 -0500
Subject: [PATCH 016/120] fixed cufft complex indexing

---
 libgadget/gravpm.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/libgadget/gravpm.c b/libgadget/gravpm.c
index 829443c7..b2ab750b 100644
--- a/libgadget/gravpm.c
+++ b/libgadget/gravpm.c
@@ -20,11 +20,11 @@ static int pm_mark_region_for_node(int startno, int rid, int * RegionInd, const
 static void convert_node_to_region(PetaPM * pm, PetaPMRegion * r, struct NODE * Nodes);
 
 static int hybrid_nu_gravpm_is_active(int i);
-static void potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void compute_neutrino_power(PetaPM * pm);
-static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_potential(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_force_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_force_y(PetaPM * pm, int i, double * mesh, double weight);
@@ -328,11 +328,11 @@ static void compute_neutrino_power(PetaPM * pm) {
 /* Compute the power spectrum of the fourier transformed grid in value.
  * Store it in the PowerSpectrum structure */
 void
-powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], pfft_complex * const value, const double invwindow, double Nmesh)
+powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], cufftComplex * const value, const double invwindow, double Nmesh)
 {
     if(k2 == 0) {
         /* Save zero mode corresponding to the mean as the normalisation factor.*/
-        PowerSpectrum->Norm = (value[0][0] * value[0][0] + value[0][1] * value[0][1]);
+        PowerSpectrum->Norm = (value[0].x * value[0].x + value[0].y * value[0].y);
         return;
     }
     /* Measure power spectrum: we don't want the zero mode.
@@ -344,7 +344,7 @@ powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3
         int kint=floor(binsperunit*log(k2)/2.);
         int w;
         const double keff = sqrt(kpos[0]*kpos[0]+kpos[1]*kpos[1]+kpos[2]*kpos[2]);
-        const double m = (value[0][0] * value[0][0] + value[0][1] * value[0][1]);
+        const double m = (value[0].x * value[0].x + value[0].y * value[0].y);
         /*Make sure we do not overflow (although this should never happen)*/
         if(kint >= PowerSpectrum->size)
             return;
@@ -362,7 +362,7 @@ powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3
 
 /*Just read the power spectrum, without changing the input value.*/
 void
-measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value) {
+measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value) {
     double f = 1.0;
     /* the CIC deconvolution kernel is
      *
@@ -381,7 +381,7 @@ measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value
 }
 
 static void
-potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
+potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value)
 {
     const double asmth2 = pow((2 * M_PI) * pm->Asmth / pm->Nmesh,2);
     double f = 1.0;
@@ -432,8 +432,8 @@ potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
          * once we multiply PowerSpectrum.Norm by (Omega0 / (Omega0 - OmegaNu))**2 */
         const double nufac = 1 + ps->nu_prefac * gsl_interp_eval(ps->nu_spline,ps->logknu,
                                                                        ps->delta_nu_ratio,logk2,ps->nu_acc);
-        value[0][0] *= nufac;
-        value[0][1] *= nufac;
+        value[0].x *= nufac;
+        value[0].y *= nufac;
     }
 
     /*Compute the power spectrum*/
@@ -444,13 +444,13 @@ potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
             ps->Norm *= MtotbyMcdm*MtotbyMcdm;
         }
         /* Remove zero mode corresponding to the mean.*/
-        value[0][0] = 0.0;
-        value[0][1] = 0.0;
+        value[0].x = 0.0;
+        value[0].y = 0.0;
         return;
     }
 
-    value[0][0] *= fac;
-    value[0][1] *= fac;
+    value[0].x *= fac;
+    value[0].y *= fac;
 }
 
 /* the transfer functions for force in fourier space applied to potential */
@@ -473,7 +473,7 @@ static int hybrid_nu_gravpm_is_active(int i) {
         return 1;
 }
 
-static void force_transfer(PetaPM * pm, int k, pfft_complex * value) {
+static void force_transfer(PetaPM * pm, int k, cufftComplex * value) {
     double tmp0;
     double tmp1;
     /*
@@ -482,18 +482,18 @@ static void force_transfer(PetaPM * pm, int k, pfft_complex * value) {
      * filter is   i K(w)
      * */
     double fac = -1 * diff_kernel (k * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
-    tmp0 = - value[0][1] * fac;
-    tmp1 = value[0][0] * fac;
-    value[0][0] = tmp0;
-    value[0][1] = tmp1;
+    tmp0 = - value[0].y * fac;
+    tmp1 = value[0].x * fac;
+    value[0].x = tmp0;
+    value[0].y = tmp1;
 }
-static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[0], value);
 }
-static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[1], value);
 }
-static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[2], value);
 }
 static void readout_potential(PetaPM * pm, int i, double * mesh, double weight) {

From deb46ce1a69138f2bf4dc2ba15255dd6d0a0934f Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 30 Sep 2024 08:43:37 -0500
Subject: [PATCH 017/120] added cufft/cuda libs to Makefile.rules

---
 Makefile.rules | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Makefile.rules b/Makefile.rules
index db579939..56101cbe 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -9,8 +9,6 @@ GSL_INCL ?= $(shell pkg-config --cflags gsl)
 GSL_LIBS ?= $(shell pkg-config --libs gsl)
 #BOOST_INCL ?= $(shell pkg-config --cflags boost)
 #BOOST_LIBS ?= $(shell pkg-config --libs boost)
-all:
-	@echo "=================$(BOOST_LIBS)======================="
 
 ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
     # If found, set FITSIO_INCL with the cfitsio flags
@@ -21,6 +19,10 @@ endif
 ifneq ($(findstring -DUSE_CUDA, $(OPT)),)
     CUDA_INCL ?= 
     CUDA_LIBS ?= -lcudart
+    CUFFTMP_INCL ?= 
+    CUFFTMP_LIBS ?= -lcufftMp
+    NVSHMEM_INCL ?= 
+    NVSHMEM_LIBS ?= -lnvshmem_host
     NVCC ?= nvcc
     NVOPTIMIZE ?= -O3
 endif
@@ -28,7 +30,7 @@ endif
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(GSL_INCL) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL)
+CFLAGS = $(OPTIONS) $(GSL_INCL) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
@@ -36,7 +38,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
 BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
-LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
+LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS) $(CUFFTMP_LIBS) $(NVSHMEM_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 

From 89fe1fb73651aec3b7fc1c96ec18ecd42cae1f5c Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 30 Sep 2024 08:44:12 -0500
Subject: [PATCH 018/120] petapm: start replacing pfft init and procmesh setup with CUDA/MPI calls

---
 libgadget/gravity.h |  4 ++--
 libgadget/petapm.c  | 51 ++++++++++++++++++++++++++++++++-------------
 libgadget/petapm.h  |  4 ++--
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/libgadget/gravity.h b/libgadget/gravity.h
index e5d2dcf1..adeb04e8 100644
--- a/libgadget/gravity.h
+++ b/libgadget/gravity.h
@@ -58,9 +58,9 @@ void grav_short_pair(const ActiveParticles * act, PetaPM * pm, ForceTree * tree,
 void grav_short_tree(const ActiveParticles * act, PetaPM * pm, ForceTree * tree, MyFloat (* AccelStore)[3], double rho0, inttime_t Ti_Current);
 
 /*Read the power spectrum, without changing the input value.*/
-void measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value);
+void measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value);
 
 /* Compute the power spectrum of the Fourier transformed grid in value.*/
-void powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], pfft_complex * const value, const double invwindow, double Nmesh);
+void powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], cufftComplex * const value, const double invwindow, double Nmesh);
 
 #endif
diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index c1b8e12d..fbd6865e 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -90,11 +90,18 @@ int *petapm_get_ntask2d(PetaPM * pm) {
 void
 petapm_module_init(int Nthreads)
 {
-    pfft_init();
+    // CUDA Device Initialization if necessary (optional if only one GPU is used)
+    int device_id = 0;
+    cudaSetDevice(device_id);  // Set the active GPU device
 
-    pfft_plan_with_nthreads(Nthreads);
+    // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
+    #ifdef _OPENMP
+    omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
+    #endif
 
-    /* initialize the MPI Datatype of pencil */
+    // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
+
+    // Initialize the MPI Datatype for the Pencil structure
     MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
     MPI_Type_commit(&MPI_PENCIL);
 }
@@ -131,21 +138,35 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     np[0] = i;
     np[1] = NTask / i;
 
-    message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
-    if( pfft_create_procmesh_2d(comm, np[0], np[1], &pm->priv->comm_cart_2d) ){
-        endrun(0, "Error: This test file only works with %td processes.\n", np[0]*np[1]);
-    }
+message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
 
-    int periods_unused[2];
-    MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
+// Step 1: Create 2D Cartesian grid for the processes
+int dims[2] = {np[0], np[1]};
+int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
+
+// Create 2D Cartesian communicator
+if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
+    endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
+}
+
+// Step 2: Get the Cartesian coordinates of the process in the grid
+int periods_unused[2];
+MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
+
+// Ensure that the task grid matches the expected number of processes
+if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
+    endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
+}
 
-    if(pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1])
-        endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
+// Step 3: Determine local FFT size (adapt this for cuFFTMp if necessary)
+// cuFFTMp might require manual management of the local data size
+// Example: You may need to calculate how much data each process holds based on grid decomposition
 
-    pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
-           PFFT_TRANSPOSED_OUT,
-           pm->real_space_region.size, pm->real_space_region.offset,
-           pm->fourier_space_region.size, pm->fourier_space_region.offset);
+pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d, 
+                                                pm->real_space_region.size, 
+                                                pm->real_space_region.offset, 
+                                                pm->fourier_space_region.size, 
+                                                pm->fourier_space_region.offset);
 
     /*
      * In fourier space, the transposed array is ordered in
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 1fa1070c..e1f93700 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -49,8 +49,8 @@ typedef struct PetaPMPriv {
     /* These varibles are initialized by petapm_init*/
 
     int fftsize;
-    cufftmpHandle_t plan_forw; // NC:change plan function call
-    cufftmpHandle_t plan_back;
+    cufftHandle plan_forw; // NC:change plan function call
+    cufftHandle plan_back;
     MPI_Comm comm_cart_2d;
 
     /* these variables are allocated every force calculation */

From 7e238e1ecbbe27526b2c17bfecf443a197b561dc Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 30 Sep 2024 23:37:21 -0400
Subject: [PATCH 019/120] add cuda stream to petapm struct

---
 libgadget/petapm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index e1f93700..3db41533 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -51,6 +51,7 @@ typedef struct PetaPMPriv {
     int fftsize;
     cufftHandle plan_forw; // NC:change plan function call
     cufftHandle plan_back;
+    cudaStream_t stream;
     MPI_Comm comm_cart_2d;
 
     /* these variables are allocated every force calculation */

From 9890ea1e9254696ff84671a4e65b0ead0b57a7f0 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 30 Sep 2024 23:39:47 -0400
Subject: [PATCH 020/120] remove reion stuff; main changes to petapm_init,
 destroy, transfer function

---
 libgadget/petapm-cufft.c | 1029 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 1029 insertions(+)
 create mode 100644 libgadget/petapm-cufft.c

diff --git a/libgadget/petapm-cufft.c b/libgadget/petapm-cufft.c
new file mode 100644
index 00000000..fedf7515
--- /dev/null
+++ b/libgadget/petapm-cufft.c
@@ -0,0 +1,1029 @@
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+/* do NOT use complex.h it breaks the code */
+
+#include "types.h"
+#include "petapm.h"
+
+#include "utils.h"
+#include "walltime.h"
+
+static void
+layout_prepare(PetaPM * pm,
+               struct Layout * L,
+               double * meshbuf,
+               PetaPMRegion * regions,
+               const int Nregions,
+               MPI_Comm comm);
+static void layout_finish(struct Layout * L);
+static void layout_build_and_exchange_cells_to_pfft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
+static void layout_build_and_exchange_cells_to_local(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
+
+/* cell_iterator needs to be thread safe !*/
+typedef void (* cell_iterator)(double * cell_value, double * comm_buffer);
+static void layout_iterate_cells(PetaPM * pm, struct Layout * L, cell_iterator iter, double * real);
+
+struct Pencil { /* a pencil starting at offset, with length len */
+    int offset[3];
+    int len;
+    int first;
+    int meshbuf_first; /* first pixel in meshbuf */
+    int task;
+};
+static int pencil_cmp_target(const void * v1, const void * v2);
+static int pos_get_target(PetaPM * pm, const int pos[2]);
+
+/* FIXME: move this to MPIU_. */
+static int64_t reduce_int64(int64_t input, MPI_Comm comm);
+#ifdef DEBUG
+/* for debugging */
+static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize);
+#endif
+
+static MPI_Datatype MPI_PENCIL;
+
+/*Used only in MP-GenIC*/
+cufftComplex *
+petapm_alloc_rhok(PetaPM * pm)
+{
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
+    memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
+    return rho_k;
+}
+
+static void pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions);
+
+static PetaPMParticleStruct * CPS; /* stored by petapm_force, how to access the P array */
+static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access other properties in P, SphP, and Fof */
+#define POS(i) ((double*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_pos]))
+#define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
+#define INACTIVE(i) (CPS->active && !CPS->active(i))
+
+
+PetaPMRegion * petapm_get_fourier_region(PetaPM * pm) {
+    return &pm->fourier_space_region;
+}
+PetaPMRegion * petapm_get_real_region(PetaPM * pm) {
+    return &pm->real_space_region;
+}
+int petapm_mesh_to_k(PetaPM * pm, int i) {
+    /*Return the position of this point on the Fourier mesh*/
+    return i<=pm->Nmesh/2 ? i : (i-pm->Nmesh);
+}
+int *petapm_get_thistask2d(PetaPM * pm) {
+    return pm->ThisTask2d;
+}
+int *petapm_get_ntask2d(PetaPM * pm) {
+    return pm->NTask2d;
+}
+
+void
+petapm_module_init(int Nthreads)
+{
+    // CUDA Device Initialization if necessary (optional if only one GPU is used)
+    int device_id = 0;
+    cudaSetDevice(device_id);  // Set the active GPU device
+
+    // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
+    #ifdef _OPENMP
+    omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
+    #endif
+    // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
+
+    // get rid of pencil type
+    //MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
+    //MPI_Type_commit(&MPI_PENCIL);
+}
+
+void
+petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_Comm comm)
+{
+    /* define the global long / short range force cut */
+    pm->BoxSize = BoxSize;
+    pm->Asmth = Asmth;
+    pm->Nmesh = Nmesh;
+    pm->G = G;
+    pm->CellSize = BoxSize / Nmesh;
+    pm->comm = comm;
+
+
+    int ThisTask;
+    int NTask;
+    MPI_Comm_rank(comm, &ThisTask);
+    MPI_Comm_size(comm, &NTask);
+
+
+    int ndevices;
+    cudaGetDeviceCount(&ndevices);
+    cudaSetDevice(ThisTask % ndevices);
+    printf("Hello from rank %d/%d using GPU %d\n", ThisTask, NTask, ThisTask % ndevices);
+
+    // Logical transform size
+    size_t nx = NTask;      // any value >= NTask is OK
+    size_t ny = NTask;      // any value >= NTask is OK
+    size_t nz = 2 * NTask;  // need to be even and >= NTask
+
+    // We start with Slabs distributed along X (X-Slabs)
+    // Ranks 0 ... (nx % size - 1) own 1 more element in the X dimension
+    // All ranks own all element in the Y and Z dimension
+    // The Z dimension has to be padded to accommodate the (nz / 2 + 1)
+    // complex numbers assuming an in-place data layout.
+    int ranks_with_onemore = nx % size;
+    size_t my_nx = (nx / size) + (rank < ranks_with_onemore ? 1 : 0);
+    size_t padded_nz = 2 * (nz / 2 + 1);
+
+    // // Local, distributed, data
+    // std::vector<float> data(my_nx * ny * padded_nz, 1.0);
+    // generate_random(data, rank);
+    // std::vector<float> ref = data;
+
+
+
+/********************************not sure if these are useful or not**************************************** */
+    ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
+    ptrdiff_t np[2];
+
+    int ThisTask;
+    int NTask;
+
+    pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
+    pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
+
+    MPI_Comm_rank(comm, &ThisTask);
+    MPI_Comm_size(comm, &NTask);
+
+    /* try to find a square 2d decomposition */
+    int i;
+    int k;
+    for(i = sqrt(NTask) + 1; i >= 0; i --) {
+        if(NTask % i == 0) break;
+    }
+    np[0] = i;
+    np[1] = NTask / i;
+
+message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
+
+// Step 1: Create 2D Cartesian grid for the processes
+int dims[2] = {np[0], np[1]};
+int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
+
+// Create 2D Cartesian communicator
+if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
+    endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
+}
+
+// Step 2: Get the Cartesian coordinates of the process in the grid
+int periods_unused[2];
+MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
+
+// Ensure that the task grid matches the expected number of processes
+if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
+    endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
+}
+
+// Step 3: Determine local FFT size (adapt this for cuFFTMp if necessary)
+// cuFFTMp might require manual management of the local data size
+// Example: You may need to calculate how much data each process holds based on grid decomposition
+
+pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d, 
+                                                pm->real_space_region.size, 
+                                                pm->real_space_region.offset, 
+                                                pm->fourier_space_region.size, 
+                                                pm->fourier_space_region.offset);
+
+    /*
+     * In fourier space, the transposed array is ordered
+     * in (y, z, x). The strides and sizes returned
+     * from local size is in (Nx, Ny, Nz), hence we roll them once
+     * so that the strides will give correct linear indexing for
+     * integer coordinates given in order of (y, z, x).
+     * */
+
+#define ROLL(a, N, j) { \
+    typeof(a[0]) tmp[N]; \
+    ptrdiff_t k; \
+    for(k = 0; k < N; k ++) tmp[k] = a[k]; \
+    for(k = 0; k < N; k ++) a[k] = tmp[(k + j)% N]; \
+    }
+
+    ROLL(pm->fourier_space_region.offset, 3, 1);
+    ROLL(pm->fourier_space_region.size, 3, 1);
+
+#undef ROLL
+
+    /* calculate the strides */
+    petapm_region_init_strides(&pm->real_space_region);
+    petapm_region_init_strides(&pm->fourier_space_region);
+
+
+/******************************** end unsure block **************************************** */
+
+    cudaStreamCreate(&pm->priv->stream);
+    cufftCreate(&pm->priv->plan_forw);
+    cufftCreate(&pm->priv->plan_back);
+
+    // Attach the MPI communicator to the plans
+    cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
+    cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
+
+    // Describe the data distribution (only for customized data decomposition, not needed for default slab decomposition)
+    // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
+    // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT and always perform a CUFFT_INVERSE transform
+    // So, in both, the "input" box should be the real box and the "output" box should be the complex box
+
+    // cufftXtSetDistribution(plan_r2c, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
+    // cufftXtSetDistribution(plan_c2r, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
+
+    // Set the stream
+    cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
+    cufftSetStream(pm->priv->plan_back, pm->priv->stream);
+
+    // Make the plan
+    size_t workspace;
+    cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
+    cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
+
+
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
+    cudaLibXtDesc *desc;
+    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_INPLACE);
+    // TODO: what to make of the cpu_data here?
+    cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
+
+    /* now lets fill up the mesh2task arrays */
+
+#if 0
+    message(1, "ThisTask = %d (%td %td %td) - (%td %td %td)\n", ThisTask,
+            pm->real_space_region.offset[0],
+            pm->real_space_region.offset[1],
+            pm->real_space_region.offset[2],
+            pm->real_space_region.size[0],
+            pm->real_space_region.size[1],
+            pm->real_space_region.size[2]);
+#endif
+
+    int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
+    for(k = 0; k < 2; k ++) {
+        for(i = 0; i < Nmesh; i ++) {
+            tmp[i] = 0;
+        }
+        for(i = 0; i < pm->real_space_region.size[k]; i ++) {
+            tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
+        }
+        /* which column / row hosts this tile? */
+        /* FIXME: this is very inefficient */
+        MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
+        /*
+        for(i = 0; i < Nmesh; i ++) {
+            message(0, "Mesh2Task[%d][%d] == %d\n", k, i, Mesh2Task[k][i]);
+        }
+        */
+    }
+    myfree(tmp);
+}
+
+void
+petapm_destroy(PetaPM * pm)
+{
+    cufftDestroy(pm->priv->plan_forw);
+    cufftDestroy(pm->priv->plan_back);
+    MPI_Comm_free(&pm->priv->comm_cart_2d);
+    myfree(pm->Mesh2Task[0]);
+}
+
+/*
+ * read out field to particle i, with value no need to be thread safe
+ * (particle i is never done by same thread)
+ * */
+typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
+static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
+/* apply transfer function to value, kpos array is in x, y, z order */
+static void pm_apply_transfer_function(PetaPM * pm,
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H);
+
+static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
+/*
+ * 1. calls prepare to build the Regions covering particles
+ * 2. CIC the particles
+ * 3. Transform to rho_k
+ * 4. apply global_transfer (if not NULL --
+ *       this is the place to fill in gaussian seeds,
+ *       the transfer is stacked onto all following transfers.
+ * 5. for each transfer, readout in functions
+ * 6.    apply transfer from global_transfer -> complex
+ * 7.    transform to real
+ * 8.    readout
+ * 9. free regions
+ * */
+
+PetaPMRegion *
+petapm_force_init(
+        PetaPM * pm,
+        petapm_prepare_func prepare,
+        PetaPMParticleStruct * pstruct,
+        int * Nregions,
+        void * userdata) {
+    CPS = pstruct;
+
+    *Nregions = 0;
+    PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
+    pm_init_regions(pm, regions, *Nregions);
+
+    pm_iterate(pm, put_particle_to_mesh, regions, *Nregions);
+
+    layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
+
+    walltime_measure("/PMgrav/init");
+    return regions;
+}
+
+static void pm_apply_transfer_function(PetaPM * pm,
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H
+        ){
+    size_t ip = 0;
+
+    PetaPMRegion * region = &pm->fourier_space_region;
+
+#pragma omp parallel for
+    for(ip = 0; ip < region->totalsize; ip ++) {
+        ptrdiff_t tmp = ip;
+        int pos[3];
+        int kpos[3];
+        int64_t k2 = 0.0;
+        int k;
+        for(k = 0; k < 3; k ++) {
+            pos[k] = tmp / region->strides[k];
+            tmp -= pos[k] * region->strides[k];
+            /* lets get the abs pos on the grid*/
+            pos[k] += region->offset[k];
+            /* check */
+            if(pos[k] >= pm->Nmesh) {
+                endrun(1, "position didn't make sense\n");
+            }
+            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
+            /* Watch out the cast */
+            k2 += ((int64_t)kpos[k]) * kpos[k];
+        }
+        /* swap 0 and 1 because fourier space was transposed */
+        /* kpos is y, z, x */
+        pos[0] = kpos[2];
+        pos[1] = kpos[0];
+        pos[2] = kpos[1];
+        dst[ip][0] = src[ip][0];
+        dst[ip][1] = src[ip][1];
+        if(H) {
+            H(pm, k2, pos, &dst[ip]);
+        }
+    }
+
+}
+
+cufftComplex * petapm_force_r2c(PetaPM * pm,
+        PetaPMGlobalFunctions * global_functions
+        ) {
+    /* call pfft rho_k is CFT of rho */
+
+    /* this is because
+     *
+     * CFT = DFT * dx **3
+     * CFT[rho] = DFT [rho * dx **3] = DFT[CIC]
+     * */
+    double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    memset(real, 0, sizeof(double) * pm->priv->fftsize);
+    layout_build_and_exchange_cells_to_pfft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+    walltime_measure("/PMgrav/comm2");
+
+#ifdef DEBUG
+    verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
+    walltime_measure("/PMgrav/Verify");
+#endif
+
+    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+    pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
+    myfree(real);
+
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
+
+    /*Do any analysis that may be required before the transfer function is applied*/
+    petapm_transfer_func global_readout = global_functions->global_readout;
+    if(global_readout)
+        pm_apply_transfer_function(pm, complx, rho_k, global_readout);
+    if(global_functions->global_analysis)
+        global_functions->global_analysis(pm);
+    /*Apply the transfer function*/
+    petapm_transfer_func global_transfer = global_functions->global_transfer;
+    pm_apply_transfer_function(pm, complx, rho_k, global_transfer);
+    walltime_measure("/PMgrav/r2c");
+
+    myfree(complx);
+    return rho_k;
+}
+
+void
+petapm_force_c2r(PetaPM * pm,
+        cufftComplex * rho_k,
+        PetaPMRegion * regions,
+        const int Nregions,
+        PetaPMFunctions * functions)
+{
+
+    PetaPMFunctions * f = functions;
+    for (f = functions; f->name; f ++) {
+        petapm_transfer_func transfer = f->transfer;
+        petapm_readout_func readout = f->readout;
+
+        cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+        /* apply the greens function turn rho_k into potential in fourier space */
+        pm_apply_transfer_function(pm, rho_k, complx, transfer);
+        walltime_measure("/PMgrav/calc");
+
+        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+        pfft_execute_dft_c2r(pm->priv->plan_back, complx, real);
+
+        walltime_measure("/PMgrav/c2r");
+        if(f == functions) // Once
+            report_memory_usage("PetaPM");
+        myfree(complx);
+        /* read out the potential: this will copy and free real.*/
+        layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+        walltime_measure("/PMgrav/comm");
+
+        pm_iterate(pm, readout, regions, Nregions);
+        walltime_measure("/PMgrav/readout");
+    }
+}
+
+void petapm_force_finish(PetaPM * pm) {
+    layout_finish(&pm->priv->layout);
+    myfree(pm->priv->meshbuf);
+}
+
+void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
+        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
+        PetaPMFunctions * functions,
+        PetaPMParticleStruct * pstruct,
+        void * userdata) {
+    int Nregions;
+    PetaPMRegion * regions = petapm_force_init(pm, prepare, pstruct, &Nregions, userdata);
+    cufftComplex * rho_k = petapm_force_r2c(pm, global_functions);
+    if(functions)
+        petapm_force_c2r(pm, rho_k, regions, Nregions, functions);
+    myfree(rho_k);
+    if(CPS->RegionInd)
+        myfree(CPS->RegionInd);
+    myfree(regions);
+    petapm_force_finish(pm);
+}
+
+/******************************************************************************************************************************************** */
+/* build a communication layout */
+
+static void layout_build_pencils(PetaPM * pm, struct Layout * L, double * meshbuf, PetaPMRegion * regions, const int Nregions);
+static void layout_exchange_pencils(struct Layout * L);
+static void
+layout_prepare (PetaPM * pm,
+                struct Layout * L,
+                double * meshbuf,
+                PetaPMRegion * regions,
+                const int Nregions,
+                MPI_Comm comm)
+{
+    int r;
+    int i;
+    int NTask;
+    L->comm = comm;
+
+    MPI_Comm_size(L->comm, &NTask);
+
+    L->ibuffer = (int *) mymalloc("PMlayout", sizeof(int) * NTask * 8);
+
+    memset(L->ibuffer, 0, sizeof(int) * NTask * 8);
+    L->NpSend = &L->ibuffer[NTask * 0];
+    L->NpRecv = &L->ibuffer[NTask * 1];
+    L->NcSend = &L->ibuffer[NTask * 2];
+    L->NcRecv = &L->ibuffer[NTask * 3];
+    L->DcSend = &L->ibuffer[NTask * 4];
+    L->DcRecv = &L->ibuffer[NTask * 5];
+    L->DpSend = &L->ibuffer[NTask * 6];
+    L->DpRecv = &L->ibuffer[NTask * 7];
+
+    L->NpExport = 0;
+    L->NcExport = 0;
+    L->NpImport = 0;
+    L->NcImport = 0;
+
+    int NpAlloc = 0;
+    /* count pencils until buffer would run out */
+    for (r = 0; r < Nregions; r ++) {
+        NpAlloc += regions[r].size[0] * regions[r].size[1];
+    }
+
+    L->PencilSend = (struct Pencil *) mymalloc("PencilSend", NpAlloc * sizeof(struct Pencil));
+
+    layout_build_pencils(pm, L, meshbuf, regions, Nregions);
+
+    /* sort the pencils by the target rank for ease of next step */
+    qsort_openmp(L->PencilSend, NpAlloc, sizeof(struct Pencil), pencil_cmp_target);
+    /* zero length pixels are moved to the tail */
+
+    /* now shrink NpExport*/
+    L->NpExport = NpAlloc;
+    while(L->NpExport > 0 && L->PencilSend[L->NpExport - 1].len == 0) {
+        L->NpExport --;
+    }
+
+    /* count total number of cells to be exported */
+    int NcExport = 0;
+    for(i = 0; i < L->NpExport; i++) {
+        int task = L->PencilSend[i].task;
+        L->NcSend[task] += L->PencilSend[i].len;
+        NcExport += L->PencilSend[i].len;
+        L->NpSend[task] ++;
+    }
+    L->NcExport = NcExport;
+
+    MPI_Alltoall(L->NpSend, 1, MPI_INT, L->NpRecv, 1, MPI_INT, L->comm);
+    MPI_Alltoall(L->NcSend, 1, MPI_INT, L->NcRecv, 1, MPI_INT, L->comm);
+
+    /* build the displacement array; why doesn't MPI build these automatically? */
+    L->DpSend[0] = 0; L->DpRecv[0] = 0;
+    L->DcSend[0] = 0; L->DcRecv[0] = 0;
+    for(i = 1; i < NTask; i ++) {
+        L->DpSend[i] = L->NpSend[i - 1] + L->DpSend[i - 1];
+        L->DpRecv[i] = L->NpRecv[i - 1] + L->DpRecv[i - 1];
+        L->DcSend[i] = L->NcSend[i - 1] + L->DcSend[i - 1];
+        L->DcRecv[i] = L->NcRecv[i - 1] + L->DcRecv[i - 1];
+    }
+    L->NpImport = L->DpRecv[NTask -1] + L->NpRecv[NTask -1];
+    L->NcImport = L->DcRecv[NTask -1] + L->NcRecv[NTask -1];
+
+    /* some checks */
+    if(L->DpSend[NTask - 1] + L->NpSend[NTask -1] != L->NpExport) {
+        endrun(1, "NpExport = %d NpSend=%d DpSend=%d\n", L->NpExport, L->NpSend[NTask -1], L->DpSend[NTask - 1]);
+    }
+    if(L->DcSend[NTask - 1] + L->NcSend[NTask -1] != L->NcExport) {
+        endrun(1, "NcExport = %d NcSend=%d DcSend=%d\n", L->NcExport, L->NcSend[NTask -1], L->DcSend[NTask - 1]);
+    }
+    int64_t totNpAlloc = reduce_int64(NpAlloc, L->comm);
+    int64_t totNpExport = reduce_int64(L->NpExport, L->comm);
+    int64_t totNcExport = reduce_int64(L->NcExport, L->comm);
+    int64_t totNpImport = reduce_int64(L->NpImport, L->comm);
+    int64_t totNcImport = reduce_int64(L->NcImport, L->comm);
+
+    if(totNpExport != totNpImport) {
+        endrun(1, "totNpExport = %ld\n", totNpExport);
+    }
+    if(totNcExport != totNcImport) {
+        endrun(1, "totNcExport = %ld\n", totNcExport);
+    }
+
+    /* exchange the pencils */
+    message(0, "PetaPM:  %010ld/%010ld Pencils and %010ld Cells\n", totNpExport, totNpAlloc, totNcExport);
+    L->PencilRecv = (struct Pencil *) mymalloc("PencilRecv", L->NpImport * sizeof(struct Pencil));
+    memset(L->PencilRecv, 0xfc, L->NpImport * sizeof(struct Pencil));
+    layout_exchange_pencils(L);
+}
+
+static void
+layout_build_pencils(PetaPM * pm,
+                     struct Layout * L,
+                     double * meshbuf,
+                     PetaPMRegion * regions,
+                     const int Nregions)
+{
+    /* now build pencils to be exported */
+    int p0 = 0;
+    int r;
+    for (r = 0; r < Nregions; r++) {
+        int ix;
+#pragma omp parallel for private(ix)
+        for(ix = 0; ix < regions[r].size[0]; ix++) {
+            int iy;
+            for(iy = 0; iy < regions[r].size[1]; iy++) {
+                int poffset = ix * regions[r].size[1] + iy;
+                struct Pencil * p = &L->PencilSend[p0 + poffset];
+
+                p->offset[0] = ix + regions[r].offset[0];
+                p->offset[1] = iy + regions[r].offset[1];
+                p->offset[2] = regions[r].offset[2];
+                p->len = regions[r].size[2];
+                p->meshbuf_first = (regions[r].buffer - meshbuf) +
+                    regions[r].strides[0] * ix +
+                    regions[r].strides[1] * iy;
+                /* now lets compress the pencil */
+                while((p->len > 0) && (meshbuf[p->meshbuf_first + p->len - 1] == 0.0)) {
+                    p->len --;
+                }
+                while((p->len > 0) && (meshbuf[p->meshbuf_first] == 0.0)) {
+                    p->len --;
+                    p->meshbuf_first++;
+                    p->offset[2] ++;
+                }
+
+                p->task = pos_get_target(pm, p->offset);
+            }
+        }
+        p0 += regions[r].size[0] * regions[r].size[1];
+    }
+
+}
+
+static void layout_exchange_pencils(struct Layout * L) {
+    int i;
+    int offset;
+    int NTask;
+    MPI_Comm_size(L->comm, &NTask);
+    /* build the first pointers to refer to the correct relative buffer locations */
+    /* note that the buffer hasn't been assembled yet */
+    offset = 0;
+    for(i = 0; i < NTask; i ++) {
+        int j;
+        struct Pencil * p = &L->PencilSend[offset];
+        if(L->NpSend[i] == 0) continue;
+        p->first = 0;
+        for(j = 1; j < L->NpSend[i]; j++) {
+            p[j].first = p[j - 1].first + p[j - 1].len;
+        }
+        offset += L->NpSend[i];
+    }
+
+    MPI_Alltoallv(
+            L->PencilSend, L->NpSend, L->DpSend, MPI_PENCIL,
+            L->PencilRecv, L->NpRecv, L->DpRecv, MPI_PENCIL,
+            L->comm);
+
+    /* set first to point to absolute position in the full import cell buffer */
+    offset = 0;
+    for(i = 0; i < NTask; i ++) {
+        struct Pencil * p = &L->PencilRecv[offset];
+        int j;
+        for(j = 0; j < L->NpRecv[i]; j++) {
+            p[j].first += L->DcRecv[i];
+        }
+        offset += L->NpRecv[i];
+    }
+
+    /* set first to point to absolute position in the full export cell buffer */
+    offset = 0;
+    for(i = 0; i < NTask; i ++) {
+        struct Pencil * p = &L->PencilSend[offset];
+        int j;
+        for(j = 0; j < L->NpSend[i]; j++) {
+            p[j].first += L->DcSend[i];
+        }
+        offset += L->NpSend[i];
+    }
+}
+
+static void layout_finish(struct Layout * L) {
+    myfree(L->PencilRecv);
+    myfree(L->PencilSend);
+    myfree(L->ibuffer);
+}
+
+/* exchange cells to their pfft host, then reduce the cells to the pfft
+ * array */
+static void to_pfft(double * cell, double * buf) {
+#pragma omp atomic update
+            cell[0] += buf[0];
+}
+
+static void
+layout_build_and_exchange_cells_to_pfft(
+        PetaPM * pm,
+        struct Layout * L,
+        double * meshbuf,
+        double * real)
+{
+    L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
+    L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
+
+    int i;
+    int offset;
+
+    /* collect all cells into the send buffer */
+    offset = 0;
+    for(i = 0; i < L->NpExport; i ++) {
+        struct Pencil * p = &L->PencilSend[i];
+        memcpy(L->BufSend + offset, &meshbuf[p->meshbuf_first],
+                sizeof(double) * p->len);
+        offset += p->len;
+    }
+
+    /* receive cells */
+    MPI_Alltoallv(
+            L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
+            L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
+            L->comm);
+
+#if 0
+    double massExport = 0;
+    for(i = 0; i < L->NcExport; i ++) {
+        massExport += L->BufSend[i];
+    }
+
+    double massImport = 0;
+    for(i = 0; i < L->NcImport; i ++) {
+        massImport += L->BufRecv[i];
+    }
+    double totmassExport;
+    double totmassImport;
+    MPI_Allreduce(&massExport, &totmassExport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
+    MPI_Allreduce(&massImport, &totmassImport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
+    message(0, "totmassExport = %g totmassImport = %g\n", totmassExport, totmassImport);
+#endif
+
+    layout_iterate_cells(pm, L, to_pfft, real);
+    myfree(L->BufRecv);
+    myfree(L->BufSend);
+}
+
+/* readout cells on their pfft host, then exchange the cells to the domain
+ * host */
+static void to_region(double * cell, double * region) {
+    *region = *cell;
+}
+
+static void
+layout_build_and_exchange_cells_to_local(
+        PetaPM * pm,
+        struct Layout * L,
+        double * meshbuf,
+        double * real)
+{
+    L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
+    int i;
+    int offset;
+
+    /*layout_iterate_cells transfers real to L->BufRecv*/
+    layout_iterate_cells(pm, L, to_region, real);
+
+    /*Real is done now: reuse the memory for BufSend*/
+    myfree(real);
+    /*Now allocate BufSend, which is confusingly used to receive data*/
+    L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
+
+    /* exchange cells */
+    /* notice the order is reversed from to_pfft */
+    MPI_Alltoallv(
+            L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
+            L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
+            L->comm);
+
+    /* distribute BufSend to meshbuf */
+    offset = 0;
+    for(i = 0; i < L->NpExport; i ++) {
+        struct Pencil * p = &L->PencilSend[i];
+        memcpy(&meshbuf[p->meshbuf_first],
+                L->BufSend + offset,
+                sizeof(double) * p->len);
+        offset += p->len;
+    }
+    myfree(L->BufSend);
+    myfree(L->BufRecv);
+}
+
+/* iterate over the pairs of real field cells and RecvBuf cells
+ *
+ * !!! iter has to be thread safe. !!!
+ * */
+static void
+layout_iterate_cells(PetaPM * pm,
+                     struct Layout * L,
+                     cell_iterator iter,
+                     double * real)
+{
+    int i;
+#pragma omp parallel for
+    for(i = 0; i < L->NpImport; i ++) {
+        struct Pencil * p = &L->PencilRecv[i];
+        int k;
+        ptrdiff_t linear0 = 0;
+        for(k = 0; k < 2; k ++) {
+            int ix = p->offset[k];
+            while(ix < 0) ix += pm->Nmesh;
+            while(ix >= pm->Nmesh) ix -= pm->Nmesh;
+            ix -= pm->real_space_region.offset[k];
+            if(ix >= pm->real_space_region.size[k]) {
+                /* serious problem assumption about pfft layout was wrong*/
+                endrun(1, "bad pfft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
+            }
+            linear0 += ix * pm->real_space_region.strides[k];
+        }
+        int j;
+        for(j = 0; j < p->len; j ++) {
+            int iz = p->offset[2] + j;
+            while(iz < 0) iz += pm->Nmesh;
+            while(iz >= pm->Nmesh) iz -= pm->Nmesh;
+            if(iz >= pm->real_space_region.size[2]) {
+                /* serious problem assumption about pfft layout was wrong*/
+                endrun(1, "bad pfft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
+            }
+            ptrdiff_t linear = iz * pm->real_space_region.strides[2] + linear0;
+            /*
+             * operate on the pencil, either modifying real or BufRecv
+             * */
+            iter(&real[linear], &L->BufRecv[p->first + j]);
+        }
+    }
+}
+
+static void
+pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions)
+{
+    if(regions) {
+        int i;
+        size_t size = 0;
+        for(i = 0 ; i < Nregions; i ++) {
+            size += regions[i].totalsize;
+        }
+        pm->priv->meshbufsize = size;
+        if ( size == 0 ) return;
+        pm->priv->meshbuf = (double *) mymalloc("PMmesh", size * sizeof(double));
+        /* this takes care of the padding */
+        memset(pm->priv->meshbuf, 0, size * sizeof(double));
+        size = 0;
+        for(i = 0 ; i < Nregions; i ++) {
+            regions[i].buffer = pm->priv->meshbuf + size;
+            size += regions[i].totalsize;
+        }
+    }
+}
+
+
+static void
+pm_iterate_one(PetaPM * pm,
+               int i,
+               pm_iterator iterator,
+               PetaPMRegion * regions,
+               const int Nregions)
+{
+    int k;
+    int iCell[3];  /* integer coordinate on the regional mesh */
+    double Res[3]; /* residual*/
+    double * Pos = POS(i);
+    const int RegionInd = CPS->RegionInd ? CPS->RegionInd[i] : 0;
+
+    /* Asserts that the swallowed particles are not considered (region -2).*/
+    if(RegionInd < 0)
+        return;
+    /* This should never happen: it is pure paranoia and to avoid icc being crazy*/
+    if(RegionInd >= Nregions)
+        endrun(1, "Particle %d has region %d out of bounds %d\n", i, RegionInd, Nregions);
+
+    PetaPMRegion * region = &regions[RegionInd];
+    for(k = 0; k < 3; k++) {
+        double tmp = Pos[k] / pm->CellSize;
+        iCell[k] = floor(tmp);
+        Res[k] = tmp - iCell[k];
+        iCell[k] -= region->offset[k];
+        /* seriously?! particles are supposed to be contained in cells */
+        if(iCell[k] >= region->size[k] - 1 || iCell[k] < 0) {
+            endrun(1, "particle out of cell better stop %d (k=%d) %g %g %g region: %td %td\n", iCell[k],k,
+                Pos[0], Pos[1], Pos[2],
+                region->offset[k], region->size[k]);
+        }
+    }
+
+    int connection;
+    for(connection = 0; connection < 8; connection++) {
+        double weight = 1.0;
+        size_t linear = 0;
+        for(k = 0; k < 3; k++) {
+            int offset = (connection >> k) & 1;
+            int tmp = iCell[k] + offset;
+            linear += tmp * region->strides[k];
+            weight *= offset?
+                /* offset == 1*/ (Res[k])    :
+                /* offset == 0*/ (1 - Res[k]);
+        }
+        if(linear >= region->totalsize) {
+            endrun(1, "particle linear index out of cell better stop\n");
+        }
+        iterator(pm, i, &region->buffer[linear], weight);
+    }
+}
+
+/*
+ * iterate over all particle / mesh pairs, call iterator
+ * function . iterator function shall be aware of thread safety.
+ * no threads run on same particle same time but may
+ * access one mesh points same time.
+ * */
+static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions) {
+    int i;
+#pragma omp parallel for
+    for(i = 0; i < CPS->NumPart; i ++) {
+        pm_iterate_one(pm, i, iterator, regions, Nregions);
+    }
+}
+
+void petapm_region_init_strides(PetaPMRegion * region) {
+    int k;
+    size_t rt = 1;
+    for(k = 2; k >= 0; k --) {
+        region->strides[k] = rt;
+        rt = region->size[k] * rt;
+    }
+    region->totalsize = rt;
+    region->buffer = NULL;
+}
+
+static int pos_get_target(PetaPM * pm, const int pos[2]) {
+    int k;
+    int task2d[2];
+    int rank;
+    for(k = 0; k < 2; k ++) {
+        int ix = pos[k];
+        while(ix < 0) ix += pm->Nmesh;
+        while(ix >= pm->Nmesh) ix -= pm->Nmesh;
+        task2d[k] = pm->Mesh2Task[k][ix];
+    }
+    MPI_Cart_rank(pm->priv->comm_cart_2d, task2d, &rank);
+    return rank;
+}
+static int pencil_cmp_target(const void * v1, const void * v2) {
+    const struct Pencil * p1 = (const struct Pencil *) v1;
+    const struct Pencil * p2 = (const struct Pencil *) v2;
+    /* move zero length pixels to the end */
+    if(p2->len == 0) return -1;
+    if(p1->len == 0) return 1;
+    int t1 = p1->task;
+    int t2 = p2->task;
+    return ((t2 < t1) - (t1 < t2)) * 2 +
+        ((p2->meshbuf_first < p1->meshbuf_first) - (p1->meshbuf_first < p2->meshbuf_first));
+}
+
+#ifdef DEBUG
+static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize) {
+    /* verify the density field */
+    double mass_Part = 0;
+    int j;
+#pragma omp parallel for reduction(+: mass_Part)
+    for(j = 0; j < CPS->NumPart; j ++) {
+        double Mass = *MASS(j);
+        mass_Part += Mass;
+    }
+    double totmass_Part = 0;
+    MPI_Allreduce(&mass_Part, &totmass_Part, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
+
+    double mass_Region = 0;
+    size_t i;
+
+#pragma omp parallel for reduction(+: mass_Region)
+    for(i = 0; i < meshsize; i ++) {
+        mass_Region += meshbuf[i];
+    }
+    double totmass_Region = 0;
+    MPI_Allreduce(&mass_Region, &totmass_Region, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
+    double mass_CIC = 0;
+#pragma omp parallel for reduction(+: mass_CIC)
+    for(i = 0; i < pm->real_space_region.totalsize; i ++) {
+        mass_CIC += real[i];
+    }
+    double totmass_CIC = 0;
+    MPI_Allreduce(&mass_CIC, &totmass_CIC, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
+
+    message(0, "total Region mass err = %g CIC mass err = %g Particle mass = %g\n", totmass_Region / totmass_Part - 1, totmass_CIC / totmass_Part - 1, totmass_Part);
+}
+#endif
+
+
+
+
+/**************
+ * functions iterating over particle / mesh pairs
+ ***************/
+static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
+    double Mass = *MASS(i);
+    if(INACTIVE(i))
+        return;
+#pragma omp atomic update
+    mesh[0] += weight * Mass;
+}
+static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
+    int64_t result = 0;
+    MPI_Allreduce(&input, &result, 1, MPI_INT64, MPI_SUM, comm);
+    return result;
+}
+
+/** Some FFT notes
+ *
+ *
+ * CFT = dx * iDFT (thus CFT has no 2pi factors and iCFT has,
+ *           same as wikipedia.)
+ *
+ * iCFT = dk * DFT
+ * iCFT(CFG) = dx * dk * DFT(iDFT)
+ *           = L / N * (2pi / L) * N
+ *           = 2 pi
+ * agreed with the usual def that
+ * iCFT(CFT) = 2pi
+ *
+ * **************************8*/

From fcaa79426e8ee6a49fbcfb223970a285b196b0e5 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Mon, 30 Sep 2024 21:43:31 -0700
Subject: [PATCH 021/120] errno system header macro conflict

---
 libgadget/domain.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libgadget/domain.c b/libgadget/domain.c
index e544dbd6..6eb6f391 100644
--- a/libgadget/domain.c
+++ b/libgadget/domain.c
@@ -321,8 +321,8 @@ int domain_maintain(DomainDecomp * ddecomp, struct DriftData * drift)
     walltime_measure("/Domain/drift");
 
     /* Try a domain exchange. Note ExchangeList is freed inside.*/
-    int errno = domain_exchange(domain_layoutfunc, ddecomp, ExchangeData, PartManager, SlotsManager, 10000, ddecomp->DomainComm);
-    return errno;
+    int exchange_status = domain_exchange(domain_layoutfunc, ddecomp, ExchangeData, PartManager, SlotsManager, 10000, ddecomp->DomainComm);
+    return exchange_status;
 }
 
 /* this function generates several domain decomposition policies for attempting

From a7a2a8d607863f26dae9f7d1bd316259873fb67f Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Mon, 30 Sep 2024 22:20:20 -0700
Subject: [PATCH 022/120] neutrinos lra integ

---
 libgadget/neutrinos_lra.c | 43 +++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/libgadget/neutrinos_lra.c b/libgadget/neutrinos_lra.c
index 166e1b6b..50e60f1c 100644
--- a/libgadget/neutrinos_lra.c
+++ b/libgadget/neutrinos_lra.c
@@ -9,7 +9,6 @@
 #include <math.h>
 #include <string.h>
 #include <bigfile-mpi.h>
-#include <gsl/gsl_integration.h>
 #include <gsl/gsl_errno.h>
 #include <gsl/gsl_interp.h>
 #include <gsl/gsl_sf_bessel.h>
@@ -23,6 +22,7 @@
 #include "cosmology.h"
 #include "powerspectrum.h"
 #include "physconst.h"
+#include "timefac.h"
 
 /** Floating point accuracy*/
 #define FLOAT_ACC   1e-6
@@ -552,17 +552,19 @@ Result is in Unit_Length/Unit_Time.
 ******************************************************************************************************/
 double fslength(Cosmology * CP, const double logai, const double logaf, const double light)
 {
-  double abserr;
-  double fslength_val;
-  gsl_function F;
-  gsl_integration_workspace * w = gsl_integration_workspace_alloc (GSL_VAL);
-  F.function = &fslength_int;
-  F.params = CP;
-  if(logai >= logaf)
-      return 0;
-  gsl_integration_qag (&F, logai, logaf, 0, 1e-6,GSL_VAL,6,w,&(fslength_val), &abserr);
-  gsl_integration_workspace_free (w);
-  return light*fslength_val;
+    double abserr;
+    if (logai >= logaf)
+        return 0;
+
+    // Define the integrand as a lambda function wrapping fslength_int
+    auto integrand = [CP](double loga) {
+        return fslength_int(loga, (void *)CP);
+    };
+
+    // Use Tanh-Sinh adaptive integration
+    double fslength_val = tanh_sinh_integrate_adaptive(integrand, logai, logaf, &abserr, 1e-6);
+
+    return light * fslength_val;
 }
 
 /**************************************************************************************************
@@ -710,10 +712,6 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
   if(Na > 1 && mnubykT > 0){
         delta_nu_int_params params;
         params.acc = gsl_interp_accel_alloc();
-        gsl_integration_workspace * w = gsl_integration_workspace_alloc (GSL_VAL);
-        gsl_function F;
-        F.function = &get_delta_nu_int;
-        F.params=&params;
         /*Use cubic interpolation*/
         if(Na > 2) {
                 params.spline=gsl_interp_alloc(gsl_interp_cspline,Na);
@@ -744,8 +742,9 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
         params.fslengths = fslengths;
         params.fsscales = fsscales;
 
-        if(!params.spline || !params.acc || !w || !params.fs_spline || !params.fs_acc || !fslengths || !fsscales)
-              endrun(2016,"Error initialising and allocating memory for gsl interpolator and integrator.\n");
+        if (!params.spline || !params.acc || !params.fs_spline || !params.fs_acc || !fslengths || !fsscales) {
+            endrun(2016, "Error initializing and allocating memory for interpolators.\n");
+        }
 
         gsl_interp_init(params.fs_spline,params.fsscales,params.fslengths,Nfs);
         for (ik = 0; ik < d_tot->nk; ik++) {
@@ -753,10 +752,14 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
             params.k=d_tot->wavenum[ik];
             params.delta_tot=d_tot->delta_tot[ik];
             gsl_interp_init(params.spline,params.scale,params.delta_tot,Na);
-            gsl_integration_qag (&F, log(d_tot->TimeTransfer), log(a), 0, relerr,GSL_VAL,6,w,&d_nu_tmp, &abserr);
+
+            // Define the integrand as a lambda function wrapping get_delta_nu_int
+            auto integrand = [&params](double logai) {
+                return get_delta_nu_int(logai, (void *)&params);
+            };
+            d_nu_tmp = tanh_sinh_integrate_adaptive(integrand, log(d_tot->TimeTransfer), log(a), &abserr, relerr);
             delta_nu_curr[ik] += d_tot->delta_nu_prefac * d_nu_tmp;
          }
-         gsl_integration_workspace_free (w);
          gsl_interp_free(params.spline);
          gsl_interp_accel_free(params.acc);
          myfree(fsscales);

From 589bfa8d5e65362a38577538da0124a51cae5c53 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Mon, 30 Sep 2024 22:53:34 -0700
Subject: [PATCH 023/120] omega nu integ

---
 libgadget/omega_nu_single.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/libgadget/omega_nu_single.c b/libgadget/omega_nu_single.c
index 8cfc849a..998a7ca6 100644
--- a/libgadget/omega_nu_single.c
+++ b/libgadget/omega_nu_single.c
@@ -1,7 +1,6 @@
 #include "omega_nu_single.h"
 
 #include <math.h>
-#include <gsl/gsl_integration.h>
 #include <string.h>
 #include "physconst.h"
 #include "utils/mymalloc.h"
@@ -129,8 +128,7 @@ void rho_nu_init(_rho_nu_single * const rho_nu_tab, double a0, const double mnu,
      /*Make the table over a slightly wider range than requested, in case there is roundoff error*/
      const double logA0=log(a0)-log(1.2);
      const double logaf=log(NU_SW*kBtnu/mnu)+log(1.2);
-     gsl_function F;
-     F.function = &rho_nu_int;
+
      /*Initialise constants*/
      rho_nu_tab->mnu = mnu;
      /*Shortcircuit if we don't need to do the integration*/
@@ -145,17 +143,23 @@ void rho_nu_init(_rho_nu_single * const rho_nu_tab, double a0, const double mnu,
      if(!rho_nu_tab->interp || !rho_nu_tab->acc || !rho_nu_tab->loga)
          endrun(2035,"Could not initialise tables for neutrino matter density\n");
 
-     gsl_integration_workspace * w = gsl_integration_workspace_alloc (GSL_VAL);
      for(i=0; i< NRHOTAB; i++){
         double param[2];
         rho_nu_tab->loga[i]=logA0+i*(logaf-logA0)/(NRHOTAB-1);
         param[0]=mnu*exp(rho_nu_tab->loga[i]);
         param[1] = kBtnu;
-        F.params = &param;
-        gsl_integration_qag (&F, 0, 500*kBtnu,0 , 1e-9,GSL_VAL,6,w,&(rho_nu_tab->rhonu[i]), &abserr);
-        rho_nu_tab->rhonu[i]=rho_nu_tab->rhonu[i]/pow(exp(rho_nu_tab->loga[i]),4)*get_rho_nu_conversion();
+
+        // Define the integrand for rho_nu_int
+        auto integrand = [param](double q) {
+            return rho_nu_int(q, (void *)param);
+        };
+
+        // Perform the Tanh-Sinh adaptive integration
+        double result = tanh_sinh_integrate_adaptive(integrand, 0, 500 * kBtnu, &abserr, 1e-9);
+
+        rho_nu_tab->rhonu[i] = result / pow(exp(rho_nu_tab->loga[i]), 4) * get_rho_nu_conversion();
      }
-     gsl_integration_workspace_free (w);
+
      gsl_interp_init(rho_nu_tab->interp,rho_nu_tab->loga,rho_nu_tab->rhonu,NRHOTAB);
      return;
 }
@@ -217,17 +221,19 @@ double fermi_dirac_kernel(double x, void * params)
  * This is integral f_0(q) q^2 dq between 0 and qc to compute the fraction of OmegaNu which is in particles.*/
 double nufrac_low(const double qc)
 {
-    /*These functions are so smooth that we don't need much space*/
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (100);
     double abserr;
-    gsl_function F;
-    F.function = &fermi_dirac_kernel;
-    F.params = NULL;
+    // Define the integrand for Fermi-Dirac kernel
+    auto integrand = [](double x) {
+        return fermi_dirac_kernel(x, NULL);
+    };
+
     double total_fd;
-    gsl_integration_qag (&F, 0, qc, 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(total_fd), &abserr);
+
+    // Use Tanh-Sinh adaptive integration for the Fermi-Dirac kernel
+    double total_fd = tanh_sinh_integrate_adaptive(integrand, 0, qc, &abserr, 1e-6);
     /*divided by the total F-D probability (which is 3 Zeta(3)/2 ~ 1.8 if MAX_FERMI_DIRAC is large enough*/
     total_fd /= 1.5*1.202056903159594;
-    gsl_integration_workspace_free (w);
+
     return total_fd;
 }
 

From da3efc39f3214aa1f45008ca374aa2e5c6d574d9 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Mon, 30 Sep 2024 23:28:26 -0700
Subject: [PATCH 024/120] integrator abs err tolerance, Omnu timebin integ
 updated

---
 libgadget/omega_nu_single.c |  3 +--
 libgadget/timebinmgr.c      | 17 +++++++----------
 libgadget/timefac.c         | 27 ++++++++++++++++++---------
 libgadget/timefac.h         |  1 +
 4 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/libgadget/omega_nu_single.c b/libgadget/omega_nu_single.c
index 998a7ca6..322f84d2 100644
--- a/libgadget/omega_nu_single.c
+++ b/libgadget/omega_nu_single.c
@@ -5,6 +5,7 @@
 #include "physconst.h"
 #include "utils/mymalloc.h"
 #include "utils/endrun.h"
+#include "timefac.h"
 
 #define HBAR    6.582119e-16  /*hbar in units of eV s*/
 #define STEFAN_BOLTZMANN 5.670373e-5
@@ -227,8 +228,6 @@ double nufrac_low(const double qc)
         return fermi_dirac_kernel(x, NULL);
     };
 
-    double total_fd;
-
     // Use Tanh-Sinh adaptive integration for the Fermi-Dirac kernel
     double total_fd = tanh_sinh_integrate_adaptive(integrand, 0, qc, &abserr, 1e-6);
     /*divided by the total F-D probability (which is 3 Zeta(3)/2 ~ 1.8 if MAX_FERMI_DIRAC is large enough*/
diff --git a/libgadget/timebinmgr.c b/libgadget/timebinmgr.c
index 4e1e30ce..d38e689e 100644
--- a/libgadget/timebinmgr.c
+++ b/libgadget/timebinmgr.c
@@ -8,6 +8,7 @@
 #include "cosmology.h"
 #include "physconst.h"
 #include "plane.h"
+#include "timefac.h"
 
 #define MAXTIMES 1024
 /*! table with desired sync points. All forces and phase space variables are synchonized to the same order. */
@@ -116,10 +117,7 @@ static double integrand_time_to_present(double a, void *param)
 //time_to_present in Myr for excursion set syncpoints
 static double time_to_present(double a, Cosmology * CP)
 {
-#define WORKSIZE 1000
 #define SEC_PER_MEGAYEAR 3.155e13
-    gsl_function F;
-    gsl_integration_workspace* workspace;
     double time;
     double result;
     double abserr;
@@ -127,18 +125,17 @@ static double time_to_present(double a, Cosmology * CP)
     double hubble;
     hubble = CP->Hubble / CP->UnitTime_in_s * SEC_PER_MEGAYEAR * CP->HubbleParam;
 
-    workspace = gsl_integration_workspace_alloc(WORKSIZE);
-    F.function = &integrand_time_to_present;
-    F.params = CP;
+    // Define the integrand as a lambda function
+    auto integrand = [CP](double a) {
+        return integrand_time_to_present(a, (void *)CP);
+    };
 
-    gsl_integration_qag(&F, a, 1.0, 1.0 / hubble,
-        1.0e-8, WORKSIZE, GSL_INTEG_GAUSS21, workspace, &result, &abserr);
+    // Perform the Tanh-Sinh adaptive integration
+    result = tanh_sinh_integrate_adaptive(integrand, a, 1.0, &abserr, 1.0e-8, 1.0 / hubble);
 
     //convert to Myr and multiply by h
     time = result / (hubble/CP->Hubble);
 
-    gsl_integration_workspace_free(workspace);
-
     // return time to present as a function of redshift
     return time;
 }
diff --git a/libgadget/timefac.c b/libgadget/timefac.c
index d8984de8..ba269cbf 100644
--- a/libgadget/timefac.c
+++ b/libgadget/timefac.c
@@ -17,10 +17,13 @@
 #include <boost/math/quadrature/tanh_sinh.hpp>
 
 // Function to perform tanh-sinh integration with adaptive max_refinements
-double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a, double b, double* estimated_error, double rel_tol, int max_refinements_limit, int init_refine, int step) {
+double tanh_sinh_integrate_adaptive(
+    std::function<double(double)> func, double a, double b, 
+    double* estimated_error, double rel_tol, double abs_tol, 
+    int max_refinements_limit, int init_refine, int step) 
+{
     double result_prev = 0.0;
     double result_current = 0.0;
-    *estimated_error = 1.0;  // Start with a large relative error
     int max_refine = init_refine;
 
     // Loop until reaching the max refinements limit or satisfying the tolerance
@@ -31,13 +34,16 @@ double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a
         // Perform the integration
         result_current = integrator.integrate(func, a, b);
 
-        // If this is not the first iteration, compute the relative error
+        // If this is not the first iteration, compute the absolute and relative errors
         if (max_refine > init_refine) {
-            *estimated_error = fabs(result_current - result_prev) / fabs(result_current);
+            double abs_error = fabs(result_current - result_prev);  // Absolute error
+            double rel_error = abs_error / fabs(result_current);    // Relative error
+
+            *estimated_error = abs_error;  // Store the absolute error
 
-            // Check if the relative error is within the target tolerance
-            if (*estimated_error < rel_tol) {
-                break;  // Stop refining if the result is within the tolerance
+            // Check if either the relative or absolute error is within the target tolerance
+            if (rel_error < rel_tol || abs_error < abs_tol) {
+                break;  // Stop refining if either error is within the tolerance
             }
         }
 
@@ -46,8 +52,11 @@ double tanh_sinh_integrate_adaptive(std::function<double(double)> func, double a
     }
 
     // If we exited the loop without achieving the desired tolerance, print a warning
-    if (*estimated_error > rel_tol) {
-        message(1, "Warning: Tanh-Sinh integration did not reach the desired tolerance of %g. Final relative error: %g\n", rel_tol, *estimated_error);
+    if (*estimated_error > abs_tol && (*estimated_error / fabs(result_current)) > rel_tol) {
+        message(1, 
+            "Warning: Tanh-Sinh integration reached neither the desired relative tolerance of %g nor absolute tolerance of %g. "
+            "Final absolute error: %g, relative error: %g\n", 
+            rel_tol, abs_tol, *estimated_error, (*estimated_error / fabs(result_current)));
     }
 
     // Return the final result
diff --git a/libgadget/timefac.h b/libgadget/timefac.h
index a6a26672..c3720811 100644
--- a/libgadget/timefac.h
+++ b/libgadget/timefac.h
@@ -17,6 +17,7 @@ double tanh_sinh_integrate_adaptive(
     double b, 
     double* estimated_error, 
     double rel_tol = 1e-8, 
+    double abs_tol = 0,
     int max_refinements_limit = 30, 
     int init_refine = 5, 
     int step = 5

From e44449930476a13e629d41425fd11bd73efa8c8c Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Mon, 30 Sep 2024 23:35:45 -0700
Subject: [PATCH 025/120] power integ

---
 libgadget/timebinmgr.c |  1 -
 libgenic/power.c       | 17 +++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/libgadget/timebinmgr.c b/libgadget/timebinmgr.c
index d38e689e..792dd96e 100644
--- a/libgadget/timebinmgr.c
+++ b/libgadget/timebinmgr.c
@@ -1,7 +1,6 @@
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
-#include <gsl/gsl_integration.h>
 
 #include "timebinmgr.h"
 #include "utils.h"
diff --git a/libgenic/power.c b/libgenic/power.c
index fafa8013..ac10a109 100644
--- a/libgenic/power.c
+++ b/libgenic/power.c
@@ -3,7 +3,6 @@
 #include <math.h>
 #include <stddef.h>
 #include <mpi.h>
-#include <gsl/gsl_integration.h>
 #include <gsl/gsl_interp.h>
 #include <bigfile-mpi.h>
 
@@ -13,6 +12,8 @@
 #include <libgadget/physconst.h>
 #include "power.h"
 #include "proto.h"
+#include "timefac.h"
+
 static double Delta_EH(double k);
 static double Delta_Tabulated(double k, enum TransferType Type);
 static double sigma2_int(double k, void * params);
@@ -477,20 +478,20 @@ double tk_eh(double k)		/* from Martin White */
 
 double TopHatSigma2(double R)
 {
-  gsl_integration_workspace * w = gsl_integration_workspace_alloc (1000);
-  double result,abserr;
-  gsl_function F;
-  F.function = &sigma2_int;
-  F.params = &R;
+    double result,abserr;
+  
+  // Define the integrand as a lambda function, wrapping sigma2_int
+    auto integrand = [R](double k) {
+        return sigma2_int(k, (void*)&R);
+    };
 
   /* note: 500/R is here chosen as integration boundary (infinity) */
   gsl_integration_qags (&F, 0, 500. / R, 0, 1e-4,1000,w,&result, &abserr);
+  result = tanh_sinh_integrate_adaptive(integrand, 0, 500. / R, &abserr, 1e-4, 0.);
 /*   printf("gsl_integration_qng in TopHatSigma2. Result %g, error: %g, intervals: %lu\n",result, abserr,w->size); */
-  gsl_integration_workspace_free (w);
   return result;
 }
 
-
 double sigma2_int(double k, void * params)
 {
   double w, x;

From 7d5c5a9580b0b067bacd8eefaed20e21455b2b75 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Tue, 1 Oct 2024 14:28:10 -0400
Subject: [PATCH 026/120] migrate petapm-cufft.c to petapm.c

---
 libgadget/petapm-cufft.c | 1029 --------------------------------------
 libgadget/petapm.c       |  424 +++++-----------
 2 files changed, 126 insertions(+), 1327 deletions(-)
 delete mode 100644 libgadget/petapm-cufft.c

diff --git a/libgadget/petapm-cufft.c b/libgadget/petapm-cufft.c
deleted file mode 100644
index fedf7515..00000000
--- a/libgadget/petapm-cufft.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-/* do NOT use complex.h it breaks the code */
-
-#include "types.h"
-#include "petapm.h"
-
-#include "utils.h"
-#include "walltime.h"
-
-static void
-layout_prepare(PetaPM * pm,
-               struct Layout * L,
-               double * meshbuf,
-               PetaPMRegion * regions,
-               const int Nregions,
-               MPI_Comm comm);
-static void layout_finish(struct Layout * L);
-static void layout_build_and_exchange_cells_to_pfft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
-static void layout_build_and_exchange_cells_to_local(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
-
-/* cell_iterator needs to be thread safe !*/
-typedef void (* cell_iterator)(double * cell_value, double * comm_buffer);
-static void layout_iterate_cells(PetaPM * pm, struct Layout * L, cell_iterator iter, double * real);
-
-struct Pencil { /* a pencil starting at offset, with lenght len */
-    int offset[3];
-    int len;
-    int first;
-    int meshbuf_first; /* first pixel in meshbuf */
-    int task;
-};
-static int pencil_cmp_target(const void * v1, const void * v2);
-static int pos_get_target(PetaPM * pm, const int pos[2]);
-
-/* FIXME: move this to MPIU_. */
-static int64_t reduce_int64(int64_t input, MPI_Comm comm);
-#ifdef DEBUG
-/* for debugging */
-static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize);
-#endif
-
-static MPI_Datatype MPI_PENCIL;
-
-/*Used only in MP-GenIC*/
-cufftComplex *
-petapm_alloc_rhok(PetaPM * pm)
-{
-    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
-    memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
-    return rho_k;
-}
-
-static void pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions);
-
-static PetaPMParticleStruct * CPS; /* stored by petapm_force, how to access the P array */
-static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access other properties in P, SphP, and Fof */
-#define POS(i) ((double*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_pos]))
-#define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
-#define INACTIVE(i) (CPS->active && !CPS->active(i))
-
-
-PetaPMRegion * petapm_get_fourier_region(PetaPM * pm) {
-    return &pm->fourier_space_region;
-}
-PetaPMRegion * petapm_get_real_region(PetaPM * pm) {
-    return &pm->real_space_region;
-}
-int petapm_mesh_to_k(PetaPM * pm, int i) {
-    /*Return the position of this point on the Fourier mesh*/
-    return i<=pm->Nmesh/2 ? i : (i-pm->Nmesh);
-}
-int *petapm_get_thistask2d(PetaPM * pm) {
-    return pm->ThisTask2d;
-}
-int *petapm_get_ntask2d(PetaPM * pm) {
-    return pm->NTask2d;
-}
-
-void
-petapm_module_init(int Nthreads)
-{
-    // CUDA Device Initialization if necessary (optional if only one GPU is used)
-    int device_id = 0;
-    cudaSetDevice(device_id);  // Set the active GPU device
-
-    // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
-    #ifdef _OPENMP
-    omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
-    #endif
-    // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
-
-    // get rid of pencil type
-    //MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
-    //MPI_Type_commit(&MPI_PENCIL);
-}
-
-void
-petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_Comm comm)
-{
-    /* define the global long / short range force cut */
-    pm->BoxSize = BoxSize;
-    pm->Asmth = Asmth;
-    pm->Nmesh = Nmesh;
-    pm->G = G;
-    pm->CellSize = BoxSize / Nmesh;
-    pm->comm = comm;
-
-
-    int ThisTask;
-    int NTask;
-    MPI_Comm_rank(comm, &ThisTask);
-    MPI_Comm_size(comm, &NTask);
-
-
-    int ndevices;
-    cudaGetDeviceCount(&ndevices);
-    cudaSetDevice(ThisTask % ndevices);
-    printf("Hello from rank %d/%d using GPU %d\n", ThisTask, NTask, ThisTask % ndevices);
-
-    // Logical transform size
-    size_t nx = NTask;      // any value >= NTask is OK
-    size_t ny = NTask;      // any value >= NTask is OK
-    size_t nz = 2 * NTask;  // need to be even and >= NTask
-
-    // We start with Slabs distributed along X (X-Slabs)
-    // Ranks 0 ... (nx % size - 1) own 1 more element in the X dimension
-    // All ranks own all element in the Y and Z dimension
-    // The Z dimension has to be padded to accomodate the (nz / 2 + 1) 
-    // complex numbers assuming an in-place data layout.
-    int ranks_with_onemore = nx % size;
-    size_t my_nx = (nx / size) + (rank < ranks_with_onemore ? 1 : 0);
-    size_t padded_nz = 2 * (nz / 2 + 1);
-
-    // // Local, distributed, data
-    // std::vector<float> data(my_nx * ny * padded_nz, 1.0);
-    // generate_random(data, rank);
-    // std::vector<float> ref = data;
-
-
-
-/********************************not sure if these are useful or not**************************************** */
-    ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
-    ptrdiff_t np[2];
-
-    int ThisTask;
-    int NTask;
-
-    pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
-    pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
-
-    MPI_Comm_rank(comm, &ThisTask);
-    MPI_Comm_size(comm, &NTask);
-
-    /* try to find a square 2d decomposition */
-    int i;
-    int k;
-    for(i = sqrt(NTask) + 1; i >= 0; i --) {
-        if(NTask % i == 0) break;
-    }
-    np[0] = i;
-    np[1] = NTask / i;
-
-message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
-
-// Step 1: Create 2D Cartesian grid for the processes
-int dims[2] = {np[0], np[1]};
-int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
-
-// Create 2D Cartesian communicator
-if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
-    endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
-}
-
-// Step 2: Get the Cartesian coordinates of the process in the grid
-int periods_unused[2];
-MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
-
-// Ensure that the task grid matches the expected number of processes
-if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
-    endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
-}
-
-// Step 3: Determine local FFT size (adapt this for cuFFTMp if necessary)
-// cuFFTMp might require manual management of the local data size
-// Example: You may need to calculate how much data each process holds based on grid decomposition
-
-pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d, 
-                                                pm->real_space_region.size, 
-                                                pm->real_space_region.offset, 
-                                                pm->fourier_space_region.size, 
-                                                pm->fourier_space_region.offset);
-
-    /*
-     * In fourier space, the transposed array is ordered in
-     * are in (y, z, x). The strides and sizes returned
-     * from local size is in (Nx, Ny, Nz), hence we roll them once
-     * so that the strides will give correct linear indexing for
-     * integer coordinates given in order of (y, z, x).
-     * */
-
-#define ROLL(a, N, j) { \
-    typeof(a[0]) tmp[N]; \
-    ptrdiff_t k; \
-    for(k = 0; k < N; k ++) tmp[k] = a[k]; \
-    for(k = 0; k < N; k ++) a[k] = tmp[(k + j)% N]; \
-    }
-
-    ROLL(pm->fourier_space_region.offset, 3, 1);
-    ROLL(pm->fourier_space_region.size, 3, 1);
-
-#undef ROLL
-
-    /* calculate the strides */
-    petapm_region_init_strides(&pm->real_space_region);
-    petapm_region_init_strides(&pm->fourier_space_region);
-
-
-/******************************** end unsure block **************************************** */
-
-    cudaStreamCreate(&pm->priv->stream);
-    cufftCreate(&pm->priv->plan_forw);
-    cufftCreate(&pm->priv->plan_back);
-
-    // Attach the MPI communicator to the plans
-    cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
-    cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
-
-    // Describe the data distribution (only for custumized data decomposition, not needed for default slab decomposition)
-    // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
-    // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
-    // So, in both, the "input" box should be the real box and the "output" box should be the complex box
-
-    // cufftXtSetDistribution(plan_r2c, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
-    // cufftXtSetDistribution(plan_c2r, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
-
-    // Set the stream
-    cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
-    cufftSetStream(pm->priv->plan_back, pm->priv->stream);
-
-    // Make the plan
-    size_t workspace;
-    cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
-    cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
-
-
-    // Allocate GPU memory, copy CPU data to GPU
-    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
-    cudaLibXtDesc *desc;
-    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_INPLACE);
-    // TODO: what to make of the cpu_data here?
-    cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
-
-    /* now lets fill up the mesh2task arrays */
-
-#if 0
-    message(1, "ThisTask = %d (%td %td %td) - (%td %td %td)\n", ThisTask,
-            pm->real_space_region.offset[0],
-            pm->real_space_region.offset[1],
-            pm->real_space_region.offset[2],
-            pm->real_space_region.size[0],
-            pm->real_space_region.size[1],
-            pm->real_space_region.size[2]);
-#endif
-
-    int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
-    for(k = 0; k < 2; k ++) {
-        for(i = 0; i < Nmesh; i ++) {
-            tmp[i] = 0;
-        }
-        for(i = 0; i < pm->real_space_region.size[k]; i ++) {
-            tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
-        }
-        /* which column / row hosts this tile? */
-        /* FIXME: this is very inefficient */
-        MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
-        /*
-        for(i = 0; i < Nmesh; i ++) {
-            message(0, "Mesh2Task[%d][%d] == %d\n", k, i, Mesh2Task[k][i]);
-        }
-        */
-    }
-    myfree(tmp);
-}
-
-void
-petapm_destroy(PetaPM * pm)
-{
-    cufftDestroy(pm->priv->plan_forw);
-    cufftDestroy(pm->priv->plan_back);
-    MPI_Comm_free(&pm->priv->comm_cart_2d);
-    myfree(pm->Mesh2Task[0]);
-}
-
-/*
- * read out field to particle i, with value no need to be thread safe
- * (particle i is never done by same thread)
- * */
-typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
-static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
-/* apply transfer function to value, kpos array is in x, y, z order */
-static void pm_apply_transfer_function(PetaPM * pm,
-        cufftComplex * src,
-        cufftComplex * dst, petapm_transfer_func H);
-
-static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-/*
- * 1. calls prepare to build the Regions covering particles
- * 2. CIC the particles
- * 3. Transform to rho_k
- * 4. apply global_transfer (if not NULL --
- *       this is the place to fill in gaussian seeds,
- *       the transfer is stacked onto all following transfers.
- * 5. for each transfer, readout in functions
- * 6.    apply transfer from global_transfer -> complex
- * 7.    transform to real
- * 8.    readout
- * 9. free regions
- * */
-
-PetaPMRegion *
-petapm_force_init(
-        PetaPM * pm,
-        petapm_prepare_func prepare,
-        PetaPMParticleStruct * pstruct,
-        int * Nregions,
-        void * userdata) {
-    CPS = pstruct;
-
-    *Nregions = 0;
-    PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
-    pm_init_regions(pm, regions, *Nregions);
-
-    pm_iterate(pm, put_particle_to_mesh, regions, *Nregions);
-
-    layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
-
-    walltime_measure("/PMgrav/init");
-    return regions;
-}
-
-static void pm_apply_transfer_function(PetaPM * pm,
-        cufftComplex * src,
-        cufftComplex * dst, petapm_transfer_func H
-        ){
-    size_t ip = 0;
-
-    PetaPMRegion * region = &pm->fourier_space_region;
-
-#pragma omp parallel for
-    for(ip = 0; ip < region->totalsize; ip ++) {
-        ptrdiff_t tmp = ip;
-        int pos[3];
-        int kpos[3];
-        int64_t k2 = 0.0;
-        int k;
-        for(k = 0; k < 3; k ++) {
-            pos[k] = tmp / region->strides[k];
-            tmp -= pos[k] * region->strides[k];
-            /* lets get the abs pos on the grid*/
-            pos[k] += region->offset[k];
-            /* check */
-            if(pos[k] >= pm->Nmesh) {
-                endrun(1, "position didn't make sense\n");
-            }
-            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
-            /* Watch out the cast */
-            k2 += ((int64_t)kpos[k]) * kpos[k];
-        }
-        /* swap 0 and 1 because fourier space was transposed */
-        /* kpos is y, z, x */
-        pos[0] = kpos[2];
-        pos[1] = kpos[0];
-        pos[2] = kpos[1];
-        dst[ip][0] = src[ip][0];
-        dst[ip][1] = src[ip][1];
-        if(H) {
-            H(pm, k2, pos, &dst[ip]);
-        }
-    }
-
-}
-
-cufftComplex * petapm_force_r2c(PetaPM * pm,
-        PetaPMGlobalFunctions * global_functions
-        ) {
-    /* call pfft rho_k is CFT of rho */
-
-    /* this is because
-     *
-     * CFT = DFT * dx **3
-     * CFT[rho] = DFT [rho * dx **3] = DFT[CIC]
-     * */
-    double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-    memset(real, 0, sizeof(double) * pm->priv->fftsize);
-    layout_build_and_exchange_cells_to_pfft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
-    walltime_measure("/PMgrav/comm2");
-
-#ifdef DEBUG
-    verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
-    walltime_measure("/PMgrav/Verify");
-#endif
-
-    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-    pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
-    myfree(real);
-
-    cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
-
-    /*Do any analysis that may be required before the transfer function is applied*/
-    petapm_transfer_func global_readout = global_functions->global_readout;
-    if(global_readout)
-        pm_apply_transfer_function(pm, complx, rho_k, global_readout);
-    if(global_functions->global_analysis)
-        global_functions->global_analysis(pm);
-    /*Apply the transfer function*/
-    petapm_transfer_func global_transfer = global_functions->global_transfer;
-    pm_apply_transfer_function(pm, complx, rho_k, global_transfer);
-    walltime_measure("/PMgrav/r2c");
-
-    myfree(complx);
-    return rho_k;
-}
-
-void
-petapm_force_c2r(PetaPM * pm,
-        cufftComplex * rho_k,
-        PetaPMRegion * regions,
-        const int Nregions,
-        PetaPMFunctions * functions)
-{
-
-    PetaPMFunctions * f = functions;
-    for (f = functions; f->name; f ++) {
-        petapm_transfer_func transfer = f->transfer;
-        petapm_readout_func readout = f->readout;
-
-        cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-        /* apply the greens function turn rho_k into potential in fourier space */
-        pm_apply_transfer_function(pm, rho_k, complx, transfer);
-        walltime_measure("/PMgrav/calc");
-
-        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-        pfft_execute_dft_c2r(pm->priv->plan_back, complx, real);
-
-        walltime_measure("/PMgrav/c2r");
-        if(f == functions) // Once
-            report_memory_usage("PetaPM");
-        myfree(complx);
-        /* read out the potential: this will copy and free real.*/
-        layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real);
-        walltime_measure("/PMgrav/comm");
-
-        pm_iterate(pm, readout, regions, Nregions);
-        walltime_measure("/PMgrav/readout");
-    }
-}
-
-void petapm_force_finish(PetaPM * pm) {
-    layout_finish(&pm->priv->layout);
-    myfree(pm->priv->meshbuf);
-}
-
-void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
-        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
-        PetaPMFunctions * functions,
-        PetaPMParticleStruct * pstruct,
-        void * userdata) {
-    int Nregions;
-    PetaPMRegion * regions = petapm_force_init(pm, prepare, pstruct, &Nregions, userdata);
-    cufftComplex * rho_k = petapm_force_r2c(pm, global_functions);
-    if(functions)
-        petapm_force_c2r(pm, rho_k, regions, Nregions, functions);
-    myfree(rho_k);
-    if(CPS->RegionInd)
-        myfree(CPS->RegionInd);
-    myfree(regions);
-    petapm_force_finish(pm);
-}
-
-/******************************************************************************************************************************************** */
-/* build a communication layout */
-
-static void layout_build_pencils(PetaPM * pm, struct Layout * L, double * meshbuf, PetaPMRegion * regions, const int Nregions);
-static void layout_exchange_pencils(struct Layout * L);
-static void
-layout_prepare (PetaPM * pm,
-                struct Layout * L,
-                double * meshbuf,
-                PetaPMRegion * regions,
-                const int Nregions,
-                MPI_Comm comm)
-{
-    int r;
-    int i;
-    int NTask;
-    L->comm = comm;
-
-    MPI_Comm_size(L->comm, &NTask);
-
-    L->ibuffer = (int *) mymalloc("PMlayout", sizeof(int) * NTask * 8);
-
-    memset(L->ibuffer, 0, sizeof(int) * NTask * 8);
-    L->NpSend = &L->ibuffer[NTask * 0];
-    L->NpRecv = &L->ibuffer[NTask * 1];
-    L->NcSend = &L->ibuffer[NTask * 2];
-    L->NcRecv = &L->ibuffer[NTask * 3];
-    L->DcSend = &L->ibuffer[NTask * 4];
-    L->DcRecv = &L->ibuffer[NTask * 5];
-    L->DpSend = &L->ibuffer[NTask * 6];
-    L->DpRecv = &L->ibuffer[NTask * 7];
-
-    L->NpExport = 0;
-    L->NcExport = 0;
-    L->NpImport = 0;
-    L->NcImport = 0;
-
-    int NpAlloc = 0;
-    /* count pencils until buffer would run out */
-    for (r = 0; r < Nregions; r ++) {
-        NpAlloc += regions[r].size[0] * regions[r].size[1];
-    }
-
-    L->PencilSend = (struct Pencil *) mymalloc("PencilSend", NpAlloc * sizeof(struct Pencil));
-
-    layout_build_pencils(pm, L, meshbuf, regions, Nregions);
-
-    /* sort the pencils by the target rank for ease of next step */
-    qsort_openmp(L->PencilSend, NpAlloc, sizeof(struct Pencil), pencil_cmp_target);
-    /* zero length pixels are moved to the tail */
-
-    /* now shrink NpExport*/
-    L->NpExport = NpAlloc;
-    while(L->NpExport > 0 && L->PencilSend[L->NpExport - 1].len == 0) {
-        L->NpExport --;
-    }
-
-    /* count total number of cells to be exported */
-    int NcExport = 0;
-    for(i = 0; i < L->NpExport; i++) {
-        int task = L->PencilSend[i].task;
-        L->NcSend[task] += L->PencilSend[i].len;
-        NcExport += L->PencilSend[i].len;
-        L->NpSend[task] ++;
-    }
-    L->NcExport = NcExport;
-
-    MPI_Alltoall(L->NpSend, 1, MPI_INT, L->NpRecv, 1, MPI_INT, L->comm);
-    MPI_Alltoall(L->NcSend, 1, MPI_INT, L->NcRecv, 1, MPI_INT, L->comm);
-
-    /* build the displacement array; why doesn't MPI build these automatically? */
-    L->DpSend[0] = 0; L->DpRecv[0] = 0;
-    L->DcSend[0] = 0; L->DcRecv[0] = 0;
-    for(i = 1; i < NTask; i ++) {
-        L->DpSend[i] = L->NpSend[i - 1] + L->DpSend[i - 1];
-        L->DpRecv[i] = L->NpRecv[i - 1] + L->DpRecv[i - 1];
-        L->DcSend[i] = L->NcSend[i - 1] + L->DcSend[i - 1];
-        L->DcRecv[i] = L->NcRecv[i - 1] + L->DcRecv[i - 1];
-    }
-    L->NpImport = L->DpRecv[NTask -1] + L->NpRecv[NTask -1];
-    L->NcImport = L->DcRecv[NTask -1] + L->NcRecv[NTask -1];
-
-    /* some checks */
-    if(L->DpSend[NTask - 1] + L->NpSend[NTask -1] != L->NpExport) {
-        endrun(1, "NpExport = %d NpSend=%d DpSend=%d\n", L->NpExport, L->NpSend[NTask -1], L->DpSend[NTask - 1]);
-    }
-    if(L->DcSend[NTask - 1] + L->NcSend[NTask -1] != L->NcExport) {
-        endrun(1, "NcExport = %d NcSend=%d DcSend=%d\n", L->NcExport, L->NcSend[NTask -1], L->DcSend[NTask - 1]);
-    }
-    int64_t totNpAlloc = reduce_int64(NpAlloc, L->comm);
-    int64_t totNpExport = reduce_int64(L->NpExport, L->comm);
-    int64_t totNcExport = reduce_int64(L->NcExport, L->comm);
-    int64_t totNpImport = reduce_int64(L->NpImport, L->comm);
-    int64_t totNcImport = reduce_int64(L->NcImport, L->comm);
-
-    if(totNpExport != totNpImport) {
-        endrun(1, "totNpExport = %ld\n", totNpExport);
-    }
-    if(totNcExport != totNcImport) {
-        endrun(1, "totNcExport = %ld\n", totNcExport);
-    }
-
-    /* exchange the pencils */
-    message(0, "PetaPM:  %010ld/%010ld Pencils and %010ld Cells\n", totNpExport, totNpAlloc, totNcExport);
-    L->PencilRecv = (struct Pencil *) mymalloc("PencilRecv", L->NpImport * sizeof(struct Pencil));
-    memset(L->PencilRecv, 0xfc, L->NpImport * sizeof(struct Pencil));
-    layout_exchange_pencils(L);
-}
-
-static void
-layout_build_pencils(PetaPM * pm,
-                     struct Layout * L,
-                     double * meshbuf,
-                     PetaPMRegion * regions,
-                     const int Nregions)
-{
-    /* now build pencils to be exported */
-    int p0 = 0;
-    int r;
-    for (r = 0; r < Nregions; r++) {
-        int ix;
-#pragma omp parallel for private(ix)
-        for(ix = 0; ix < regions[r].size[0]; ix++) {
-            int iy;
-            for(iy = 0; iy < regions[r].size[1]; iy++) {
-                int poffset = ix * regions[r].size[1] + iy;
-                struct Pencil * p = &L->PencilSend[p0 + poffset];
-
-                p->offset[0] = ix + regions[r].offset[0];
-                p->offset[1] = iy + regions[r].offset[1];
-                p->offset[2] = regions[r].offset[2];
-                p->len = regions[r].size[2];
-                p->meshbuf_first = (regions[r].buffer - meshbuf) +
-                    regions[r].strides[0] * ix +
-                    regions[r].strides[1] * iy;
-                /* now lets compress the pencil */
-                while((p->len > 0) && (meshbuf[p->meshbuf_first + p->len - 1] == 0.0)) {
-                    p->len --;
-                }
-                while((p->len > 0) && (meshbuf[p->meshbuf_first] == 0.0)) {
-                    p->len --;
-                    p->meshbuf_first++;
-                    p->offset[2] ++;
-                }
-
-                p->task = pos_get_target(pm, p->offset);
-            }
-        }
-        p0 += regions[r].size[0] * regions[r].size[1];
-    }
-
-}
-
-static void layout_exchange_pencils(struct Layout * L) {
-    int i;
-    int offset;
-    int NTask;
-    MPI_Comm_size(L->comm, &NTask);
-    /* build the first pointers to refer to the correct relative buffer locations */
-    /* note that the buffer hasn't bee assembled yet */
-    offset = 0;
-    for(i = 0; i < NTask; i ++) {
-        int j;
-        struct Pencil * p = &L->PencilSend[offset];
-        if(L->NpSend[i] == 0) continue;
-        p->first = 0;
-        for(j = 1; j < L->NpSend[i]; j++) {
-            p[j].first = p[j - 1].first + p[j - 1].len;
-        }
-        offset += L->NpSend[i];
-    }
-
-    MPI_Alltoallv(
-            L->PencilSend, L->NpSend, L->DpSend, MPI_PENCIL,
-            L->PencilRecv, L->NpRecv, L->DpRecv, MPI_PENCIL,
-            L->comm);
-
-    /* set first to point to absolute position in the full import cell buffer */
-    offset = 0;
-    for(i = 0; i < NTask; i ++) {
-        struct Pencil * p = &L->PencilRecv[offset];
-        int j;
-        for(j = 0; j < L->NpRecv[i]; j++) {
-            p[j].first += L->DcRecv[i];
-        }
-        offset += L->NpRecv[i];
-    }
-
-    /* set first to point to absolute position in the full export cell buffer */
-    offset = 0;
-    for(i = 0; i < NTask; i ++) {
-        struct Pencil * p = &L->PencilSend[offset];
-        int j;
-        for(j = 0; j < L->NpSend[i]; j++) {
-            p[j].first += L->DcSend[i];
-        }
-        offset += L->NpSend[i];
-    }
-}
-
-static void layout_finish(struct Layout * L) {
-    myfree(L->PencilRecv);
-    myfree(L->PencilSend);
-    myfree(L->ibuffer);
-}
-
-/* exchange cells to their pfft host, then reduce the cells to the pfft
- * array */
-static void to_pfft(double * cell, double * buf) {
-#pragma omp atomic update
-            cell[0] += buf[0];
-}
-
-static void
-layout_build_and_exchange_cells_to_pfft(
-        PetaPM * pm,
-        struct Layout * L,
-        double * meshbuf,
-        double * real)
-{
-    L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
-    L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
-
-    int i;
-    int offset;
-
-    /* collect all cells into the send buffer */
-    offset = 0;
-    for(i = 0; i < L->NpExport; i ++) {
-        struct Pencil * p = &L->PencilSend[i];
-        memcpy(L->BufSend + offset, &meshbuf[p->meshbuf_first],
-                sizeof(double) * p->len);
-        offset += p->len;
-    }
-
-    /* receive cells */
-    MPI_Alltoallv(
-            L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
-            L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
-            L->comm);
-
-#if 0
-    double massExport = 0;
-    for(i = 0; i < L->NcExport; i ++) {
-        massExport += L->BufSend[i];
-    }
-
-    double massImport = 0;
-    for(i = 0; i < L->NcImport; i ++) {
-        massImport += L->BufRecv[i];
-    }
-    double totmassExport;
-    double totmassImport;
-    MPI_Allreduce(&massExport, &totmassExport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
-    MPI_Allreduce(&massImport, &totmassImport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
-    message(0, "totmassExport = %g totmassImport = %g\n", totmassExport, totmassImport);
-#endif
-
-    layout_iterate_cells(pm, L, to_pfft, real);
-    myfree(L->BufRecv);
-    myfree(L->BufSend);
-}
-
-/* readout cells on their pfft host, then exchange the cells to the domain
- * host */
-static void to_region(double * cell, double * region) {
-    *region = *cell;
-}
-
-static void
-layout_build_and_exchange_cells_to_local(
-        PetaPM * pm,
-        struct Layout * L,
-        double * meshbuf,
-        double * real)
-{
-    L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
-    int i;
-    int offset;
-
-    /*layout_iterate_cells transfers real to L->BufRecv*/
-    layout_iterate_cells(pm, L, to_region, real);
-
-    /*Real is done now: reuse the memory for BufSend*/
-    myfree(real);
-    /*Now allocate BufSend, which is confusingly used to receive data*/
-    L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
-
-    /* exchange cells */
-    /* notice the order is reversed from to_pfft */
-    MPI_Alltoallv(
-            L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
-            L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
-            L->comm);
-
-    /* distribute BufSend to meshbuf */
-    offset = 0;
-    for(i = 0; i < L->NpExport; i ++) {
-        struct Pencil * p = &L->PencilSend[i];
-        memcpy(&meshbuf[p->meshbuf_first],
-                L->BufSend + offset,
-                sizeof(double) * p->len);
-        offset += p->len;
-    }
-    myfree(L->BufSend);
-    myfree(L->BufRecv);
-}
-
-/* iterate over the pairs of real field cells and RecvBuf cells
- *
- * !!! iter has to be thread safe. !!!
- * */
-static void
-layout_iterate_cells(PetaPM * pm,
-                     struct Layout * L,
-                     cell_iterator iter,
-                     double * real)
-{
-    int i;
-#pragma omp parallel for
-    for(i = 0; i < L->NpImport; i ++) {
-        struct Pencil * p = &L->PencilRecv[i];
-        int k;
-        ptrdiff_t linear0 = 0;
-        for(k = 0; k < 2; k ++) {
-            int ix = p->offset[k];
-            while(ix < 0) ix += pm->Nmesh;
-            while(ix >= pm->Nmesh) ix -= pm->Nmesh;
-            ix -= pm->real_space_region.offset[k];
-            if(ix >= pm->real_space_region.size[k]) {
-                /* serious problem assumption about pfft layout was wrong*/
-                endrun(1, "bad pfft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
-            }
-            linear0 += ix * pm->real_space_region.strides[k];
-        }
-        int j;
-        for(j = 0; j < p->len; j ++) {
-            int iz = p->offset[2] + j;
-            while(iz < 0) iz += pm->Nmesh;
-            while(iz >= pm->Nmesh) iz -= pm->Nmesh;
-            if(iz >= pm->real_space_region.size[2]) {
-                /* serious problem assmpution about pfft layout was wrong*/
-                endrun(1, "bad pfft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
-            }
-            ptrdiff_t linear = iz * pm->real_space_region.strides[2] + linear0;
-            /*
-             * operate on the pencil, either modifying real or BufRecv
-             * */
-            iter(&real[linear], &L->BufRecv[p->first + j]);
-        }
-    }
-}
-
-static void
-pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions)
-{
-    if(regions) {
-        int i;
-        size_t size = 0;
-        for(i = 0 ; i < Nregions; i ++) {
-            size += regions[i].totalsize;
-        }
-        pm->priv->meshbufsize = size;
-        if ( size == 0 ) return;
-        pm->priv->meshbuf = (double *) mymalloc("PMmesh", size * sizeof(double));
-        /* this takes care of the padding */
-        memset(pm->priv->meshbuf, 0, size * sizeof(double));
-        size = 0;
-        for(i = 0 ; i < Nregions; i ++) {
-            regions[i].buffer = pm->priv->meshbuf + size;
-            size += regions[i].totalsize;
-        }
-    }
-}
-
-
-static void
-pm_iterate_one(PetaPM * pm,
-               int i,
-               pm_iterator iterator,
-               PetaPMRegion * regions,
-               const int Nregions)
-{
-    int k;
-    int iCell[3];  /* integer coordinate on the regional mesh */
-    double Res[3]; /* residual*/
-    double * Pos = POS(i);
-    const int RegionInd = CPS->RegionInd ? CPS->RegionInd[i] : 0;
-
-    /* Asserts that the swallowed particles are not considered (region -2).*/
-    if(RegionInd < 0)
-        return;
-    /* This should never happen: it is pure paranoia and to avoid icc being crazy*/
-    if(RegionInd >= Nregions)
-        endrun(1, "Particle %d has region %d out of bounds %d\n", i, RegionInd, Nregions);
-
-    PetaPMRegion * region = &regions[RegionInd];
-    for(k = 0; k < 3; k++) {
-        double tmp = Pos[k] / pm->CellSize;
-        iCell[k] = floor(tmp);
-        Res[k] = tmp - iCell[k];
-        iCell[k] -= region->offset[k];
-        /* seriously?! particles are supposed to be contained in cells */
-        if(iCell[k] >= region->size[k] - 1 || iCell[k] < 0) {
-            endrun(1, "particle out of cell better stop %d (k=%d) %g %g %g region: %td %td\n", iCell[k],k,
-                Pos[0], Pos[1], Pos[2],
-                region->offset[k], region->size[k]);
-        }
-    }
-
-    int connection;
-    for(connection = 0; connection < 8; connection++) {
-        double weight = 1.0;
-        size_t linear = 0;
-        for(k = 0; k < 3; k++) {
-            int offset = (connection >> k) & 1;
-            int tmp = iCell[k] + offset;
-            linear += tmp * region->strides[k];
-            weight *= offset?
-                /* offset == 1*/ (Res[k])    :
-                /* offset == 0*/ (1 - Res[k]);
-        }
-        if(linear >= region->totalsize) {
-            endrun(1, "particle linear index out of cell better stop\n");
-        }
-        iterator(pm, i, &region->buffer[linear], weight);
-    }
-}
-
-/*
- * iterate over all particle / mesh pairs, call iterator
- * function . iterator function shall be aware of thread safety.
- * no threads run on same particle same time but may
- * access one mesh points same time.
- * */
-static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions) {
-    int i;
-#pragma omp parallel for
-    for(i = 0; i < CPS->NumPart; i ++) {
-        pm_iterate_one(pm, i, iterator, regions, Nregions);
-    }
-}
-
-void petapm_region_init_strides(PetaPMRegion * region) {
-    int k;
-    size_t rt = 1;
-    for(k = 2; k >= 0; k --) {
-        region->strides[k] = rt;
-        rt = region->size[k] * rt;
-    }
-    region->totalsize = rt;
-    region->buffer = NULL;
-}
-
-static int pos_get_target(PetaPM * pm, const int pos[2]) {
-    int k;
-    int task2d[2];
-    int rank;
-    for(k = 0; k < 2; k ++) {
-        int ix = pos[k];
-        while(ix < 0) ix += pm->Nmesh;
-        while(ix >= pm->Nmesh) ix -= pm->Nmesh;
-        task2d[k] = pm->Mesh2Task[k][ix];
-    }
-    MPI_Cart_rank(pm->priv->comm_cart_2d, task2d, &rank);
-    return rank;
-}
-static int pencil_cmp_target(const void * v1, const void * v2) {
-    const struct Pencil * p1 = (const struct Pencil *) v1;
-    const struct Pencil * p2 = (const struct Pencil *) v2;
-    /* move zero length pixels to the end */
-    if(p2->len == 0) return -1;
-    if(p1->len == 0) return 1;
-    int t1 = p1->task;
-    int t2 = p2->task;
-    return ((t2 < t1) - (t1 < t2)) * 2 +
-        ((p2->meshbuf_first < p1->meshbuf_first) - (p1->meshbuf_first < p2->meshbuf_first));
-}
-
-#ifdef DEBUG
-static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize) {
-    /* verify the density field */
-    double mass_Part = 0;
-    int j;
-#pragma omp parallel for reduction(+: mass_Part)
-    for(j = 0; j < CPS->NumPart; j ++) {
-        double Mass = *MASS(j);
-        mass_Part += Mass;
-    }
-    double totmass_Part = 0;
-    MPI_Allreduce(&mass_Part, &totmass_Part, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
-
-    double mass_Region = 0;
-    size_t i;
-
-#pragma omp parallel for reduction(+: mass_Region)
-    for(i = 0; i < meshsize; i ++) {
-        mass_Region += meshbuf[i];
-    }
-    double totmass_Region = 0;
-    MPI_Allreduce(&mass_Region, &totmass_Region, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
-    double mass_CIC = 0;
-#pragma omp parallel for reduction(+: mass_CIC)
-    for(i = 0; i < pm->real_space_region.totalsize; i ++) {
-        mass_CIC += real[i];
-    }
-    double totmass_CIC = 0;
-    MPI_Allreduce(&mass_CIC, &totmass_CIC, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
-
-    message(0, "total Region mass err = %g CIC mass err = %g Particle mass = %g\n", totmass_Region / totmass_Part - 1, totmass_CIC / totmass_Part - 1, totmass_Part);
-}
-#endif
-
-
-
-
-/**************
- * functions iterating over particle / mesh pairs
- ***************/
-static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
-    double Mass = *MASS(i);
-    if(INACTIVE(i))
-        return;
-#pragma omp atomic update
-    mesh[0] += weight * Mass;
-}
-static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
-    int64_t result = 0;
-    MPI_Allreduce(&input, &result, 1, MPI_INT64, MPI_SUM, comm);
-    return result;
-}
-
-/** Some FFT notes
- *
- *
- * CFT = dx * iDFT (thus CFT has no 2pi factors and iCFT has,
- *           same as wikipedia.)
- *
- * iCFT = dk * DFT
- * iCFT(CFG) = dx * dk * DFT(iDFT)
- *           = L / N * (2pi / L) * N
- *           = 2 pi
- * agreed with the usual def that
- * iCFT(CFT) = 2pi
- *
- * **************************8*/
diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index fbd6865e..90235718 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -19,7 +19,7 @@ layout_prepare(PetaPM * pm,
                const int Nregions,
                MPI_Comm comm);
 static void layout_finish(struct Layout * L);
-static void layout_build_and_exchange_cells_to_pfft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
+static void layout_build_and_exchange_cells_to_fft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
 static void layout_build_and_exchange_cells_to_local(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
 
 /* cell_iterator needs to be thread safe !*/
@@ -62,13 +62,6 @@ static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access
 #define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
 #define INACTIVE(i) (CPS->active && !CPS->active(i))
 
-/* (jdavies) reion defs */
-#define TYPE(i) ((int*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS_R->offset_type]))
-#define PI(i) ((int*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS_R->offset_pi]))
-/* NOTE: These are 'myfloat' types */
-#define FESC(i) ((double*) (&((char*)CPS_R->Starslot)[CPS_R->star_elsize * *PI(i) + CPS_R->offset_fesc]))
-#define FESCSPH(i) ((double*) (&((char*)CPS_R->Sphslot)[CPS_R->sph_elsize * *PI(i) + CPS_R->offset_fesc_sph]))
-#define SFR(i) ((double*)  (&((char*)CPS_R->Sphslot)[CPS_R->sph_elsize * *PI(i) + CPS_R->offset_sfr]))
 
 PetaPMRegion * petapm_get_fourier_region(PetaPM * pm) {
     return &pm->fourier_space_region;
@@ -98,12 +91,11 @@ petapm_module_init(int Nthreads)
     #ifdef _OPENMP
     omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
     #endif
-
     // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
 
-    // Initialize the MPI Datatype for the Pencil structure
-    MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
-    MPI_Type_commit(&MPI_PENCIL);
+    // get rid of pencil type
+    //MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
+    //MPI_Type_commit(&MPI_PENCIL);
 }
 
 void
@@ -117,6 +109,40 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     pm->CellSize = BoxSize / Nmesh;
     pm->comm = comm;
 
+
+    int ThisTask;
+    int NTask;
+    MPI_Comm_rank(comm, &ThisTask);
+    MPI_Comm_size(comm, &NTask);
+
+
+    int ndevices;
+    cudaGetDeviceCount(&ndevices);
+    cudaSetDevice(ThisTask % ndevices);
+    printf("Hello from rank %d/%d using GPU %d\n", ThisTask, NTask, ThisTask % ndevices);
+
+    // Logical transform size
+    size_t nx = NTask;      // any value >= NTask is OK
+    size_t ny = NTask;      // any value >= NTask is OK
+    size_t nz = 2 * NTask;  // need to be even and >= NTask
+
+    // We start with Slabs distributed along X (X-Slabs)
+    // Ranks 0 ... (nx % size - 1) own 1 more element in the X dimension
+    // All ranks own all elements in the Y and Z dimensions
+    // The Z dimension has to be padded to accommodate the (nz / 2 + 1)
+    // complex numbers assuming an in-place data layout.
+    int ranks_with_onemore = nx % size;
+    size_t my_nx = (nx / size) + (rank < ranks_with_onemore ? 1 : 0);
+    size_t padded_nz = 2 * (nz / 2 + 1);
+
+    // // Local, distributed, data
+    // std::vector<float> data(my_nx * ny * padded_nz, 1.0);
+    // generate_random(data, rank);
+    // std::vector<float> ref = data;
+
+
+
+/********************************not sure if these are useful or not**************************************** */
     ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
     ptrdiff_t np[2];
 
@@ -192,22 +218,41 @@ pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d,
     petapm_region_init_strides(&pm->real_space_region);
     petapm_region_init_strides(&pm->fourier_space_region);
 
-    /* planning the fft; need temporary arrays */
 
-    double * real = (double * ) mymalloc("PMreal", pm->priv->fftsize * sizeof(double));
-    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
-    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+/******************************** end unsure block **************************************** */
 
-    pm->priv->plan_forw = pfft_plan_dft_r2c_3d(
-        n, real, rho_k, pm->priv->comm_cart_2d, PFFT_FORWARD,
-        PFFT_TRANSPOSED_OUT | PFFT_ESTIMATE | PFFT_TUNE | PFFT_DESTROY_INPUT);
-    pm->priv->plan_back = pfft_plan_dft_c2r_3d(
-        n, complx, real, pm->priv->comm_cart_2d, PFFT_BACKWARD,
-        PFFT_TRANSPOSED_IN | PFFT_ESTIMATE | PFFT_TUNE | PFFT_DESTROY_INPUT);
+    cudaStreamCreate(&pm->priv->stream);
+    cufftCreate(&pm->priv->plan_forw);
+    cufftCreate(&pm->priv->plan_back);
 
-    myfree(complx);
-    myfree(rho_k);
-    myfree(real);
+    // Attach the MPI communicator to the plans
+    cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
+    cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
+
+    // Describe the data distribution (only for customized data decomposition, not needed for default slab decomposition)
+    // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
+    // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT and always perform a CUFFT_INVERSE transform
+    // So, in both, the "input" box should be the real box and the "output" box should be the complex box
+
+    // cufftXtSetDistribution(plan_r2c, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
+    // cufftXtSetDistribution(plan_c2r, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
+
+    // Set the stream
+    cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
+    cufftSetStream(pm->priv->plan_back, pm->priv->stream);
+
+    // Make the plan
+    size_t workspace;
+    cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
+    cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
+
+
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
+    cudaLibXtDesc *desc;
+    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_INPLACE);
+    // TODO: what to make of the cpu_data here?
+    cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
 
     /* now lets fill up the mesh2task arrays */
 
@@ -244,8 +289,8 @@ pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d,
 void
 petapm_destroy(PetaPM * pm)
 {
-    pfft_destroy_plan(pm->priv->plan_forw);
-    pfft_destroy_plan(pm->priv->plan_back);
+    cufftDestroy(pm->priv->plan_forw);
+    cufftDestroy(pm->priv->plan_back);
     MPI_Comm_free(&pm->priv->comm_cart_2d);
     myfree(pm->Mesh2Task[0]);
 }
@@ -262,9 +307,6 @@ static void pm_apply_transfer_function(PetaPM * pm,
         cufftComplex * dst, petapm_transfer_func H);
 
 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-static void put_star_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-static void put_sfr_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-
 /*
  * 1. calls prepare to build the Regions covering particles
  * 2. CIC the particles
@@ -300,6 +342,48 @@ petapm_force_init(
     return regions;
 }
 
+static void pm_apply_transfer_function(PetaPM * pm,
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H
+        ){
+    size_t ip = 0;
+
+    PetaPMRegion * region = &pm->fourier_space_region;
+
+#pragma omp parallel for
+    for(ip = 0; ip < region->totalsize; ip ++) {
+        ptrdiff_t tmp = ip;
+        int pos[3];
+        int kpos[3];
+        int64_t k2 = 0.0;
+        int k;
+        for(k = 0; k < 3; k ++) {
+            pos[k] = tmp / region->strides[k];
+            tmp -= pos[k] * region->strides[k];
+            /* lets get the abs pos on the grid*/
+            pos[k] += region->offset[k];
+            /* check */
+            if(pos[k] >= pm->Nmesh) {
+                endrun(1, "position didn't make sense\n");
+            }
+            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
+            /* Watch out the cast */
+            k2 += ((int64_t)kpos[k]) * kpos[k];
+        }
+        /* swap 0 and 1 because fourier space was transposed */
+        /* kpos is y, z, x */
+        pos[0] = kpos[2];
+        pos[1] = kpos[0];
+        pos[2] = kpos[1];
+        dst[ip][0] = src[ip][0];
+        dst[ip][1] = src[ip][1];
+        if(H) {
+            H(pm, k2, pos, &dst[ip]);
+        }
+    }
+
+}
+
 cufftComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ) {
@@ -312,7 +396,7 @@ cufftComplex * petapm_force_r2c(PetaPM * pm,
      * */
     double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
     memset(real, 0, sizeof(double) * pm->priv->fftsize);
-    layout_build_and_exchange_cells_to_pfft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+    layout_build_and_exchange_cells_to_fft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
     walltime_measure("/PMgrav/comm2");
 
 #ifdef DEBUG
@@ -397,205 +481,7 @@ void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
     petapm_force_finish(pm);
 }
 
-/* These functions are for the excursion set reionization module*/
-
-/* initialise one set of regions with custom iterator
- * this is the same as petapm_force_init with a custom iterator
- * (and no CPS definition since it's called multiple times)*/
-PetaPMRegion *
-petapm_reion_init(
-        PetaPM * pm,
-        petapm_prepare_func prepare,
-        pm_iterator iterator,
-        PetaPMParticleStruct * pstruct,
-        int * Nregions,
-        void * userdata) {
-
-    *Nregions = 0;
-    PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
-    pm_init_regions(pm, regions, *Nregions);
-
-    walltime_measure("/PMreion/Misc");
-    pm_iterate(pm, iterator, regions, *Nregions);
-    walltime_measure("/PMreion/cic");
-
-    layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
-
-    walltime_measure("/PMreion/comm");
-    return regions;
-}
-
-/* 30Mpc to 0.5 Mpc with a delta of 1.1 is ~50 iterations, this should be more than enough*/
-#define MAX_R_ITERATIONS 10000
-
-/* differences from force c2r (why I think I need this separate)
- * radius loop (could do this with long list of same function + global R)
- * I'm pretty sure I need a third function type (reion loop) with all three grids
- * ,after c2r but iteration over the grid, instead of particles */
-void
-petapm_reion_c2r(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        cufftComplex * mass_unfiltered, cufftComplex * star_unfiltered, cufftComplex * sfr_unfiltered,
-        PetaPMRegion * regions,
-        const int Nregions,
-        PetaPMFunctions * functions,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr)
-{
-    PetaPMFunctions * f = functions;
-    double R = fmin(R_max,pm_mass->BoxSize);
-    int last_step = 0;
-    int f_count = 0;
-    petapm_readout_func readout = f->readout;
-
-    /* TODO: seriously re-think the allocation ordering in this function */
-    double * mass_real = (double * ) mymalloc2("mass_real", pm_mass->priv->fftsize * sizeof(double));
-
-    //TODO: add CellLengthFactor for lowres (>1Mpc, see old find_HII_bubbles function)
-    while(!last_step) {
-        f_count++;
-        //The last step will be unfiltered
-        if(R/R_delta < R_min || R/R_delta < (pm_mass->CellSize) || f_count > MAX_R_ITERATIONS)
-        {
-            last_step = 1;
-            R = pm_mass->CellSize;
-        }
-
-        //NOTE: The PetaPM structs for reionisation use the G variable for filter radius in order to use
-        //the transfer functions correctly
-        pm_mass->G = R;
-        pm_star->G = R;
-        if(use_sfr)pm_sfr->G = R;
-
-        //TODO: maybe allocate and free these outside the loop
-        cufftComplex * mass_filtered = (cufftComplex *) mymalloc("mass_filtered", pm_mass->priv->fftsize * sizeof(double));
-        cufftComplex * star_filtered = (cufftComplex *) mymalloc("star_filtered", pm_star->priv->fftsize * sizeof(double));
-        cufftComplex * sfr_filtered;
-        if(use_sfr){
-            sfr_filtered = (cufftComplex *) mymalloc("sfr_filtered", pm_sfr->priv->fftsize * sizeof(double));
-        }
-
-        /* apply the filtering at this radius */
-        /*We want the last step to be unfiltered,
-         *  calling apply transfer with NULL should just copy the grids */
-
-        petapm_transfer_func transfer = last_step ? NULL : f->transfer;
-
-        pm_apply_transfer_function(pm_mass, mass_unfiltered, mass_filtered, transfer);
-        pm_apply_transfer_function(pm_star, star_unfiltered, star_filtered, transfer);
-        if(use_sfr){
-            pm_apply_transfer_function(pm_sfr, sfr_unfiltered, sfr_filtered, transfer);
-        }
-        walltime_measure("/PMreion/calc");
-
-        double * star_real = (double * ) mymalloc2("star_real", pm_star->priv->fftsize * sizeof(double));
-        /* back to real space */
-        pfft_execute_dft_c2r(pm_mass->priv->plan_back, mass_filtered, mass_real);
-        pfft_execute_dft_c2r(pm_star->priv->plan_back, star_filtered, star_real);
-        double * sfr_real = NULL;
-        if(use_sfr){
-            sfr_real = (double * ) mymalloc2("sfr_real", pm_sfr->priv->fftsize * sizeof(double));
-            pfft_execute_dft_c2r(pm_sfr->priv->plan_back, sfr_filtered, sfr_real);
-            myfree(sfr_filtered);
-        }
-        walltime_measure("/PMreion/c2r");
-
-        myfree(star_filtered);
-        myfree(mass_filtered);
-
-        /* the reion loop calculates the J21 and stores it,
-         * for now the mass_real grid will be reused to hold J21
-         * on the last filtering step*/
-        reion_loop(pm_mass,pm_star,pm_sfr,mass_real,star_real,sfr_real,last_step);
-
-        /* since we don't need to readout star and sfr grids...*/
-        /* on the last step, the mass grid is populated with J21 and read out*/
-        if(sfr_real){
-            myfree(sfr_real);
-        }
-        myfree(star_real);
-
-        R = R / R_delta;
-    }
-    //J21 grid is exchanged to pm_mass buffer and freed
-    layout_build_and_exchange_cells_to_local(pm_mass, &pm_mass->priv->layout, pm_mass->priv->meshbuf, mass_real);
-    walltime_measure("/PMreion/comm");
-    //J21 read out to particles
-    pm_iterate(pm_mass, readout, regions, Nregions);
-    walltime_measure("/PMreion/readout");
-}
-
-/* We need a slightly different flow for reionisation, so I
- * will define these here instead of messing with the force functions.
- * The c2r function is the same, however we need a new function, reion_loop
- * to run over all three filtered grids, after the inverse transform.
- * The c2r function itself is also different since we need to apply the
- * transfer (filter) function on all three grids and run reion_loop before any readout.*/
-void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        petapm_prepare_func prepare,
-        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
-        PetaPMFunctions * functions,
-        PetaPMParticleStruct * pstruct,
-        PetaPMReionPartStruct * rstruct,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr,
-        void * userdata) {
-
-    //assigning CPS here due to three sets of regions
-    CPS = pstruct;
-    CPS_R = rstruct;
-
-    /* initialise regions for each grid
-     * NOTE: these regions should be identical except for the grid buffer */
-    int Nregions_mass, Nregions_star, Nregions_sfr;
-    PetaPMRegion * regions_mass = petapm_reion_init(pm_mass, prepare, put_particle_to_mesh, pstruct, &Nregions_mass, userdata);
-    PetaPMRegion * regions_star = petapm_reion_init(pm_star, prepare, put_star_to_mesh, pstruct, &Nregions_star, userdata);
-    PetaPMRegion * regions_sfr;
-    if(use_sfr){
-        regions_sfr = petapm_reion_init(pm_sfr, prepare, put_sfr_to_mesh, pstruct, &Nregions_sfr, userdata);
-    }
-
-    walltime_measure("/PMreion/comm2");
-
-    //using force r2c since this part can be done independently
-    cufftComplex * mass_unfiltered = petapm_force_r2c(pm_mass, global_functions);
-    cufftComplex * star_unfiltered = petapm_force_r2c(pm_star, global_functions);
-    cufftComplex * sfr_unfiltered = NULL;
-    if(use_sfr){
-        sfr_unfiltered = petapm_force_r2c(pm_sfr, global_functions);
-    }
-
-    //need custom reion_c2r to implement the 3 grid c2r and readout
-    //the readout is only performed on the mass grid so for now I only pass in regions/Nregions for mass
-    if(functions)
-        petapm_reion_c2r(pm_mass, pm_star, pm_sfr,
-               mass_unfiltered, star_unfiltered, sfr_unfiltered,
-               regions_mass, Nregions_mass, functions, reion_loop,
-               R_max, R_min, R_delta, use_sfr);
-
-    //free everything in the correct order
-    if(sfr_unfiltered){
-        myfree(sfr_unfiltered);
-    }
-    myfree(star_unfiltered);
-    myfree(mass_unfiltered);
-
-    if(CPS->RegionInd)
-        myfree(CPS->RegionInd);
-
-    if(use_sfr){
-        myfree(regions_sfr);
-    }
-    myfree(regions_star);
-    myfree(regions_mass);
-
-    if(use_sfr){
-        petapm_force_finish(pm_sfr);
-    }
-    petapm_force_finish(pm_star);
-    petapm_force_finish(pm_mass);
-}
-/* End excursion set reionization module*/
-
+/******************************************************************************************************************************************** */
 /* build a communication layout */
 
 static void layout_build_pencils(PetaPM * pm, struct Layout * L, double * meshbuf, PetaPMRegion * regions, const int Nregions);
@@ -801,15 +687,15 @@ static void layout_finish(struct Layout * L) {
     myfree(L->ibuffer);
 }
 
-/* exchange cells to their pfft host, then reduce the cells to the pfft
+/* exchange cells to their fft host, then reduce the cells to the fft
  * array */
-static void to_pfft(double * cell, double * buf) {
+static void to_fft(double * cell, double * buf) {
 #pragma omp atomic update
             cell[0] += buf[0];
 }
 
 static void
-layout_build_and_exchange_cells_to_pfft(
+layout_build_and_exchange_cells_to_fft(
         PetaPM * pm,
         struct Layout * L,
         double * meshbuf,
@@ -853,12 +739,12 @@ layout_build_and_exchange_cells_to_pfft(
     message(0, "totmassExport = %g totmassImport = %g\n", totmassExport, totmassImport);
 #endif
 
-    layout_iterate_cells(pm, L, to_pfft, real);
+    layout_iterate_cells(pm, L, to_fft, real);
     myfree(L->BufRecv);
     myfree(L->BufSend);
 }
 
-/* readout cells on their pfft host, then exchange the cells to the domain
+/* readout cells on their fft host, then exchange the cells to the domain
  * host */
 static void to_region(double * cell, double * region) {
     *region = *cell;
@@ -884,7 +770,7 @@ layout_build_and_exchange_cells_to_local(
     L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
 
     /* exchange cells */
-    /* notice the order is reversed from to_pfft */
+    /* notice the order is reversed from to_fft */
     MPI_Alltoallv(
             L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
             L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
@@ -925,8 +811,8 @@ layout_iterate_cells(PetaPM * pm,
             while(ix >= pm->Nmesh) ix -= pm->Nmesh;
             ix -= pm->real_space_region.offset[k];
             if(ix >= pm->real_space_region.size[k]) {
-                /* serious problem assumption about pfft layout was wrong*/
-                endrun(1, "bad pfft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
+                /* serious problem assumption about fft layout was wrong*/
+                endrun(1, "bad fft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
             }
             linear0 += ix * pm->real_space_region.strides[k];
         }
@@ -936,8 +822,8 @@ layout_iterate_cells(PetaPM * pm,
             while(iz < 0) iz += pm->Nmesh;
             while(iz >= pm->Nmesh) iz -= pm->Nmesh;
             if(iz >= pm->real_space_region.size[2]) {
-                /* serious problem assmpution about pfft layout was wrong*/
-                endrun(1, "bad pfft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
+                /* serious problem assumption about fft layout was wrong*/
+                endrun(1, "bad fft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
             }
             ptrdiff_t linear = iz * pm->real_space_region.strides[2] + linear0;
             /*
@@ -1108,47 +994,7 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 }
 #endif
 
-static void pm_apply_transfer_function(PetaPM * pm,
-        cufftComplex * src,
-        cufftComplex * dst, petapm_transfer_func H
-        ){
-    size_t ip = 0;
-
-    PetaPMRegion * region = &pm->fourier_space_region;
-
-#pragma omp parallel for
-    for(ip = 0; ip < region->totalsize; ip ++) {
-        ptrdiff_t tmp = ip;
-        int pos[3];
-        int kpos[3];
-        int64_t k2 = 0.0;
-        int k;
-        for(k = 0; k < 3; k ++) {
-            pos[k] = tmp / region->strides[k];
-            tmp -= pos[k] * region->strides[k];
-            /* lets get the abs pos on the grid*/
-            pos[k] += region->offset[k];
-            /* check */
-            if(pos[k] >= pm->Nmesh) {
-                endrun(1, "position didn't make sense\n");
-            }
-            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
-            /* Watch out the cast */
-            k2 += ((int64_t)kpos[k]) * kpos[k];
-        }
-        /* swap 0 and 1 because fourier space was transposed */
-        /* kpos is y, z, x */
-        pos[0] = kpos[2];
-        pos[1] = kpos[0];
-        pos[2] = kpos[1];
-        dst[ip][0] = src[ip][0];
-        dst[ip][1] = src[ip][1];
-        if(H) {
-            H(pm, k2, pos, &dst[ip]);
-        }
-    }
 
-}
 
 
 /**************
@@ -1161,24 +1007,6 @@ static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weigh
 #pragma omp atomic update
     mesh[0] += weight * Mass;
 }
-//escape fraction scaled GSM
-static void put_star_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
-    if(INACTIVE(i) || *TYPE(i) != 4)
-        return;
-    double Mass = *MASS(i);
-    double fesc = *FESC(i);
-#pragma omp atomic update
-    mesh[0] += weight * Mass * fesc;
-}
-//escape fraciton scaled SFR
-static void put_sfr_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
-    if(INACTIVE(i) || *TYPE(i) != 0)
-        return;
-    double Sfr = *SFR(i);
-    double fesc = *FESCSPH(i);
-#pragma omp atomic update
-    mesh[0] += weight * Sfr * fesc;
-}
 static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
     int64_t result = 0;
     MPI_Allreduce(&input, &result, 1, MPI_INT64, MPI_SUM, comm);

From fb55adf2879009ded57de207ccb325864ece48e0 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Tue, 1 Oct 2024 21:53:58 -0700
Subject: [PATCH 027/120] thermal init integ

---
 libgenic/thermal.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/libgenic/thermal.c b/libgenic/thermal.c
index 5635897c..18b60c7f 100644
--- a/libgenic/thermal.c
+++ b/libgenic/thermal.c
@@ -52,25 +52,24 @@ init_thermalvel(struct thermalvel* thermals, const double v_amp, double max_fd,c
         max_fd = MAX_FERMI_DIRAC;
     thermals->m_vamp = v_amp;
 
-    /*These functions are so smooth that we don't need much space*/
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (100);
     double abserr;
-    gsl_function F;
-    F.function = &fermi_dirac_kernel;
-    F.params = NULL;
+
+    // Lambda function wrapping the Fermi-Dirac kernel
+    auto integrand = [](double x) {
+        return fermi_dirac_kernel(x, nullptr);
+    };
+
     for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++) {
         thermals->fermi_dirac_vel[i] = min_fd+(max_fd-min_fd)* i / (LENGTH_FERMI_DIRAC_TABLE - 1.0);
-        gsl_integration_qag (&F, min_fd, thermals->fermi_dirac_vel[i], 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(thermals->fermi_dirac_cumprob[i]), &abserr);
+        thermals->fermi_dirac_cumprob[i] = tanh_sinh_integrate_adaptive(integrand, min_fd, thermals->fermi_dirac_vel[i], &abserr, 1e-6, 0.);
     //       printf("gsl_integration_qng in fermi_dirac_init_nu. Result %g, error: %g, intervals: %lu\n",fermi_dirac_cumprob[i], abserr,w->size);
     }
     /*Save the largest cum. probability, pre-normalisation,
      * divided by the total F-D probability (which is 3 Zeta(3)/2 ~ 1.8 if MAX_FERMI_DIRAC is large enough*/
     double total_fd;
-    gsl_integration_qag (&F, 0, MAX_FERMI_DIRAC, 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(total_fd), &abserr);
+    total_fd = tanh_sinh_integrate_adaptive(integrand, 0, MAX_FERMI_DIRAC, &abserr, 1e-6, 0.);
     assert(total_fd > 1.8);
 
-    gsl_integration_workspace_free (w);
-
     double total_frac = thermals->fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE-1]/total_fd;
     //Normalise total integral to unity
     for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++)
@@ -117,4 +116,4 @@ add_thermal_speeds(struct thermalvel * thermals, gsl_rng *g_rng, float Vel[])
     Vel[0] += v * sin(theta) * cos(phi);
     Vel[1] += v * sin(theta) * sin(phi);
     Vel[2] += v * cos(theta);
-}
+}
\ No newline at end of file

From 7e602239addacda2a37a2689c4894ec6244d1742 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Tue, 1 Oct 2024 23:53:34 -0700
Subject: [PATCH 028/120] some integ and interp (barycentric rational) cooling
 qso

---
 libgadget/cooling_qso_lightup.c | 24 +++++++++++++-----------
 libgenic/power.c                |  3 +--
 libgenic/thermal.c              |  1 +
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/libgadget/cooling_qso_lightup.c b/libgadget/cooling_qso_lightup.c
index a2e7a7ca..6b674a94 100644
--- a/libgadget/cooling_qso_lightup.c
+++ b/libgadget/cooling_qso_lightup.c
@@ -30,7 +30,7 @@
 #include <mpi.h>
 #include <string.h>
 #include <omp.h>
-#include <gsl/gsl_interp.h>
+#include <boost/math/interpolators/barycentric_rational.hpp>
 #include "physconst.h"
 #include "slotsmanager.h"
 #include "partmanager.h"
@@ -47,6 +47,9 @@
 #define E0_HeII 54.4 /* HeII ionization potential in eV*/
 #define HEMASS 4.002602 /* Helium mass in amu*/
 
+boost::math::interpolators::barycentric_rational<double>* HeIII_intp;
+boost::math::interpolators::barycentric_rational<double>* LMFP_intp;
+
 typedef struct
 {
     TreeWalkQueryBase base;
@@ -83,8 +86,6 @@ static int Nreionhist;
 static double * He_zz;
 static double * XHeIII;
 static double * LMFP;
-static gsl_interp * HeIII_intp;
-static gsl_interp * LMFP_intp;
 
 /*This is a helper for the tests*/
 void set_qso_lightup_par(struct qso_lightup_params qso)
@@ -226,11 +227,11 @@ load_heii_reion_hist(const char * reion_hist_file)
     /*Broadcast data to other processors*/
     MPI_Bcast(He_zz, 3 * Nreionhist, MPI_DOUBLE, 0, MPI_COMM_WORLD);
     MPI_Bcast(&qso_inst_heating, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
-    /* Initialize the interpolators*/
-    HeIII_intp = gsl_interp_alloc(gsl_interp_linear,Nreionhist);
-    LMFP_intp = gsl_interp_alloc(gsl_interp_linear,Nreionhist);
-    gsl_interp_init(HeIII_intp, He_zz, XHeIII, Nreionhist);
-    gsl_interp_init(LMFP_intp, He_zz, LMFP, Nreionhist);
+
+    // Initialize HeIII interpolation using barycentric rational interpolation
+    HeIII_intp = new boost::math::interpolators::barycentric_rational<double>(He_zz, XHeIII, Nreionhist);
+    // Initialize LMFP interpolation
+    LMFP_intp = new boost::math::interpolators::barycentric_rational<double>(He_zz, LMFP, Nreionhist);
 
     QSOLightupParams.heIIIreion_start = 1/He_zz[0]-1;
 
@@ -271,7 +272,7 @@ get_long_mean_free_path_heating(double redshift)
     if(atime > He_zz[Nreionhist-1])
         return 0;
 
-    double long_mfp_heating = gsl_interp_eval(LMFP_intp, He_zz, LMFP, atime, NULL);
+    double long_mfp_heating = (*LMFP_intp)(atime);
 
     last_zz = redshift;
     last_long_mfp_heating = long_mfp_heating;
@@ -529,7 +530,8 @@ turn_on_quasars(double atime, FOFGroups * fof, ForceTree * gasTree, Cosmology *
     int * qso_cand = NULL;
     int64_t n_gas_tot=0, tot_n_ionized=0, ncand_tot=0;
     MPI_Allreduce(&SlotsManager->info[0].size, &n_gas_tot, 1, MPI_INT64, MPI_SUM, MPI_COMM_WORLD);
-    double desired_ion_frac = gsl_interp_eval(HeIII_intp, He_zz, XHeIII, atime, NULL);
+    // Evaluate the interpolators
+    double desired_ion_frac = (*HeIII_intp)(atime);
     struct QSOPriv priv;
     priv.fof = fof;
     priv.uu_in_cgs = uu_in_cgs;
@@ -663,7 +665,7 @@ do_heiii_reionization(double atime, FOFGroups * fof, ForceTree * gasTree, Cosmol
 int
 need_change_helium_ionization_fraction(double atime)
 {
-    double desired_ion_frac = gsl_interp_eval(HeIII_intp, He_zz, XHeIII, atime, NULL);
+    double desired_ion_frac = (*HeIII_intp)(atime);
     double curionfrac = gas_ionization_fraction();
     if(curionfrac < desired_ion_frac)
         return 1;
diff --git a/libgenic/power.c b/libgenic/power.c
index ac10a109..e21ac9d0 100644
--- a/libgenic/power.c
+++ b/libgenic/power.c
@@ -12,7 +12,7 @@
 #include <libgadget/physconst.h>
 #include "power.h"
 #include "proto.h"
-#include "timefac.h"
+#include <libgadget/timefac.h>
 
 static double Delta_EH(double k);
 static double Delta_Tabulated(double k, enum TransferType Type);
@@ -486,7 +486,6 @@ double TopHatSigma2(double R)
     };
 
   /* note: 500/R is here chosen as integration boundary (infinity) */
-  gsl_integration_qags (&F, 0, 500. / R, 0, 1e-4,1000,w,&result, &abserr);
   result = tanh_sinh_integrate_adaptive(integrand, 0, 500. / R, &abserr, 1e-4, 0.);
 /*   printf("gsl_integration_qng in TopHatSigma2. Result %g, error: %g, intervals: %lu\n",result, abserr,w->size); */
   return result;
diff --git a/libgenic/thermal.c b/libgenic/thermal.c
index 18b60c7f..950819cc 100644
--- a/libgenic/thermal.c
+++ b/libgenic/thermal.c
@@ -4,6 +4,7 @@
 /*For speed of light*/
 #include <libgadget/physconst.h>
 #include <libgadget/utils.h>
+#include <libgadget/timefac.h>
 
 /*The Boltzmann constant in units of eV/K*/
 #define BOLEVK 8.61734e-5

From db4e491d045ff66dd427404aa0dd8e6ed81fb13b Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Wed, 2 Oct 2024 11:09:48 -0700
Subject: [PATCH 029/120] Delete gsl using script from bigfile

---
 depends/bigfile/.github/workflows/main.yaml |   2 +-
 depends/bigfile/utils/Makefile              |   3 -
 depends/bigfile/utils/bigfile-sample-mpi.c  | 280 --------------------
 3 files changed, 1 insertion(+), 284 deletions(-)
 delete mode 100644 depends/bigfile/utils/bigfile-sample-mpi.c

diff --git a/depends/bigfile/.github/workflows/main.yaml b/depends/bigfile/.github/workflows/main.yaml
index b0c7c170..366351ab 100644
--- a/depends/bigfile/.github/workflows/main.yaml
+++ b/depends/bigfile/.github/workflows/main.yaml
@@ -65,7 +65,7 @@ jobs:
                numpy=${{ matrix.numpy-version }} \
                nose cython mpi4py \
                compilers
-        conda install -q -y cmake gsl
+        conda install -q -y cmake
         conda install -q -y runtests
 
     - name: Build C
diff --git a/depends/bigfile/utils/Makefile b/depends/bigfile/utils/Makefile
index 4a5771a4..6b09206d 100644
--- a/depends/bigfile/utils/Makefile
+++ b/depends/bigfile/utils/Makefile
@@ -4,7 +4,6 @@ all: \
 	bigfile-set-attr \
 	bigfile-copy \
 	bigfile-copy-mpi \
-	bigfile-sample-mpi \
 	bigfile-cat \
 	bigfile-create \
 	bigfile-ls \
@@ -19,8 +18,6 @@ bigfile-copy: bigfile-copy.c ../src/libbigfile.a
 	$(CC) -o $@ $< ../src/libbigfile.a -I../src
 bigfile-copy-mpi: bigfile-copy-mpi.c ../src/libbigfile.a ../src/libbigfile-mpi.a
 	$(CC) -o $@ $< ../src/libbigfile-mpi.a ../src/libbigfile.a -I../src
-bigfile-sample-mpi: bigfile-sample-mpi.c ../src/libbigfile.a ../src/libbigfile-mpi.a
-	$(CC) -o $@ $< ../src/libbigfile-mpi.a ../src/libbigfile.a -I../src -lgsl -lgslcblas -lm
 bigfile-cat: bigfile-cat.c ../src/libbigfile.a
 	$(CC) -o $@ $< ../src/libbigfile.a -I../src
 bigfile-create: bigfile-create.c ../src/libbigfile.a
diff --git a/depends/bigfile/utils/bigfile-sample-mpi.c b/depends/bigfile/utils/bigfile-sample-mpi.c
deleted file mode 100644
index 3dcf31b6..00000000
--- a/depends/bigfile/utils/bigfile-sample-mpi.c
+++ /dev/null
@@ -1,280 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-#include <stddef.h>
-#include <unistd.h>
-#include <math.h>
-#include "bigfile-mpi.h"
-#include <gsl/gsl_rng.h>
-#include <gsl/gsl_randist.h>
-
-void usage() {
-    fprintf(stderr, "usage: bigfile-sample-mpi [-r ratio] [-N Nfile] [-f newfilepath] filepath block newblock\n");
-    exit(1);
-
-}
-#define DONE_TAG 1293
-#define ERROR_TAG 1295
-#define DIE_TAG 1290
-#define WORK_TAG 1291
-
-MPI_Datatype MPI_TYPE_WORK;
-BigFile bf = {0};
-BigFile bfnew = {0};
-BigBlock bb = {0};
-BigBlock bbnew = {0};
-int verbose = 0;
-int Nfile = -1;
-size_t CHUNKSIZE = 1 * 1024 * 1024;
-int ThisTask, NTask;
-char * newfilepath = NULL;
-void slave(void);
-void server(void);
-
-double ratio = 1.0;
-struct work {
-    int64_t offset;
-    int64_t seed;
-    int64_t chunksize;
-    int64_t offsetnew;
-    int64_t nsel;
-};
-
-static size_t filesize();
-
-int main(int argc, char * argv[]) {
-
-    MPI_Init(&argc, &argv);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-    MPI_Comm_size(MPI_COMM_WORLD, &NTask);
-
-    MPI_Type_contiguous(sizeof(struct work), MPI_BYTE, &MPI_TYPE_WORK);
-    MPI_Type_commit(&MPI_TYPE_WORK);
-
-    int ch;
-    while(-1 != (ch = getopt(argc, argv, "n:N:vf:r:"))) {
-        switch(ch) {
-            case 'r':
-                ratio = atof(optarg);
-                break;
-            case 'N':
-            case 'n':
-                Nfile = atoi(optarg);
-                break;
-            case 'f':
-                newfilepath = optarg;
-                break;
-            case 'v':
-                verbose = 1;
-                break;
-            default:
-                usage();
-        }
-    }
-    if(argc - optind + 1 != 4) {
-        usage();
-    }
-    argv += optind - 1;
-    if(0 != big_file_mpi_open(&bf, argv[1], MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to open: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-    if(0 != big_file_mpi_open_block(&bf, &bb, argv[2], MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to open: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-    if(Nfile == -1 || bb.Nfile == 0) {
-        Nfile = bb.Nfile;
-    }
-    if(newfilepath == NULL) {
-        newfilepath = argv[1];
-    }
-    if(0 != big_file_mpi_create(&bfnew, newfilepath, MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to open: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-    size_t newsize = filesize();
-    if(0 != big_file_mpi_create_block(&bfnew, &bbnew, argv[3], bb.dtype, bb.nmemb, Nfile, newsize, MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to create temp: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-
-    /* copy attrs */
-    size_t nattr;
-    BigAttr * attrs = big_block_list_attrs(&bb, &nattr);
-    int i;
-    for(i = 0; i < nattr; i ++) {
-        BigAttr * attr = &attrs[i];
-        big_block_set_attr(&bbnew, attr->name, attr->data, attr->dtype, attr->nmemb);
-    }
-
-    if(bb.nmemb > 0 && bb.size > 0) {
-    /* copy data */
-        if(ThisTask == 0) {
-            server();
-        } else {
-            slave();
-        }
-    }
-    if(0 != big_block_mpi_close(&bbnew, MPI_COMM_WORLD)) {
-        fprintf(stderr, "failed to close new: %s\n", big_file_get_error_message());
-        exit(1);
-    }
-    big_block_mpi_close(&bb, MPI_COMM_WORLD);
-    big_file_mpi_close(&bf, MPI_COMM_WORLD);
-    big_file_mpi_close(&bfnew, MPI_COMM_WORLD);
-    return 0;
-}
-static size_t filesize() {
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(rng, 1984);
-    int64_t offset = 0;
-    int64_t offsetnew = 0;
-    struct work work;
-    for(offset = 0; offset < bb.size; ) {
-        int64_t chunksize = CHUNKSIZE;
-
-        /* never read beyond my end (read_simple caps at EOF) */
-        if(offset + chunksize >= bb.size) {
-            /* this is the last chunk */
-            chunksize = bb.size - offset;
-        }
-        work.offset = offset;
-        work.chunksize = chunksize;
-        work.seed = gsl_rng_get(rng);
-        work.offsetnew = offsetnew;
-        if(ratio == 1.0) {
-            work.nsel = chunksize;
-        } else {
-            work.nsel = gsl_ran_poisson(rng, chunksize * ratio);
-        }
-
-        offset += chunksize;
-        offsetnew += work.nsel;
-    }
-    return offsetnew;
-}
-void server() {
-    int64_t offset = 0;
-    int64_t offsetnew = 0;
-    struct work work;
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(rng, 1984);
-    for(offset = 0; offset < bb.size; ) {
-        int64_t chunksize = CHUNKSIZE;
-        MPI_Status status;
-        int result = 0;
-        MPI_Recv(&result, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
-                &status);
-        if(status.MPI_TAG == ERROR_TAG) {
-            break;
-        }
-
-        /* never read beyond my end (read_simple caps at EOF) */
-        if(offset + chunksize >= bb.size) {
-            /* this is the last chunk */
-            chunksize = bb.size - offset;
-        }
-        work.offset = offset;
-        work.chunksize = chunksize;
-        work.seed = gsl_rng_get(rng);
-        work.offsetnew = offsetnew;
-        if(ratio == 1.0) {
-            work.nsel = chunksize;
-        } else {
-            work.nsel = gsl_ran_poisson(rng, chunksize * ratio);
-        }
-        MPI_Send(&work, 1, MPI_TYPE_WORK, status.MPI_SOURCE, WORK_TAG, MPI_COMM_WORLD);
-
-        offset += chunksize;
-        offsetnew += work.nsel;
-        if(verbose) {
-            fprintf(stderr, "%td / %td done (%0.4g%%)\r", offset, bb.size, (100. / bb.size) * offset);
-        }
-    }
-    int i;
-    for(i = 1; i < NTask; i ++) {
-        struct work work;
-        MPI_Send(&work, 1, MPI_TYPE_WORK, i, DIE_TAG, MPI_COMM_WORLD);
-    }
-
-}
-void slave() {
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937);
-    int result = 0;
-    MPI_Send(&result, 1, MPI_INT, 0, DONE_TAG, MPI_COMM_WORLD);
-    while(1) {
-        struct work work;
-        MPI_Status status;
-        MPI_Recv(&work, 1, MPI_TYPE_WORK, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
-
-        if(status.MPI_TAG == DIE_TAG) {
-            break;
-        }
-        gsl_rng_set(rng, work.seed);
-
-        int64_t offset = work.offset;
-        int64_t chunksize = work.chunksize;
-        int64_t offsetnew = work.offsetnew;
-        int64_t nsel = work.nsel;
-        BigArray array;
-        BigBlockPtr ptrnew;
-        BigArray arraynew;
-
-        size_t dims[2];
-        void * buffer = malloc(dtype_itemsize(bb.dtype) * bb.nmemb * nsel);
-        dims[0] = nsel;
-        dims[1] = bb.nmemb;
-        big_array_init(&arraynew, buffer, bb.dtype, 2, dims, NULL);
-
-        ptrdiff_t i;
-        size_t step = dtype_itemsize(bb.dtype) * bb.nmemb;
-        size_t leftover = chunksize;
-        char * p = buffer;
-        char * q;
-        if(0 != big_block_read_simple(&bb, offset, chunksize, &array, NULL)) {
-            fprintf(stderr, "failed to read original: %s\n", big_file_get_error_message());
-            result = -1;
-            goto bad;
-        }
-        q = array.data;
-
-//        printf("%ld %ld\n", nsel, leftover);
-        for(i = 0; i < chunksize; i ++) {
-            int64_t r = gsl_rng_uniform_int(rng, leftover);
-            if(r < nsel) {
-                memcpy(p, q, step);
-                p += step;
-                nsel --;
-            }
-            if(nsel == 0) break;
-            leftover --;
-            q += step;
-        }
-        if(nsel != 0) abort();
-        free(array.data);
-        if(0 != big_block_seek(&bbnew, &ptrnew, offsetnew)) {
-            fprintf(stderr, "failed to seek new: %s\n", big_file_get_error_message());
-            result = -1;
-            free(arraynew.data);
-            goto bad;
-        }
-
-        if(0 != big_block_write(&bbnew, &ptrnew, &arraynew)) {
-            fprintf(stderr, "failed to write new: %s\n", big_file_get_error_message());
-            result = -1;
-            free(arraynew.data);
-            goto bad;
-        }
-
-        free(arraynew.data);
-        MPI_Send(&result, 1, MPI_INT, 0, DONE_TAG, MPI_COMM_WORLD);
-        continue;
-    bad:
-        MPI_Send(&result, 1, MPI_INT, 0, ERROR_TAG, MPI_COMM_WORLD);
-        continue;
-    }
-    return;
-}

From 48ad61f340290683813ee5a1d90f34e93e2cc968 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Wed, 2 Oct 2024 11:17:30 -0700
Subject: [PATCH 030/120] Remove metal return code which is complex to port
 away from GSL and likely hard to run on GPU

---
 gadget/params.c                     |    2 -
 libgadget/Makefile                  |    7 +-
 libgadget/metal_return.c            | 1005 ---------------------------
 libgadget/metal_return.h            |   56 --
 libgadget/run.c                     |    7 -
 libgadget/tests/test_metal_return.c |   71 --
 6 files changed, 1 insertion(+), 1147 deletions(-)
 delete mode 100644 libgadget/metal_return.c
 delete mode 100644 libgadget/metal_return.h
 delete mode 100644 libgadget/tests/test_metal_return.c

diff --git a/gadget/params.c b/gadget/params.c
index 69fe15d0..608189bf 100644
--- a/gadget/params.c
+++ b/gadget/params.c
@@ -21,7 +21,6 @@
 #include <libgadget/timebinmgr.h>
 #include <libgadget/petaio.h>
 #include <libgadget/cooling_qso_lightup.h>
-#include <libgadget/metal_return.h>
 #include <libgadget/uvbg.h>
 #include <libgadget/stats.h>
 #include <libgadget/plane.h>
@@ -438,7 +437,6 @@ void read_parameter_file(char *fname, int * ShowBacktrace, double * MaxMemSizePe
     set_winds_params(ps);
     set_fof_params(ps);
     set_blackhole_params(ps);
-    set_metal_return_params(ps);
     set_stats_params(ps);
     parameter_set_free(ps);
 }
diff --git a/libgadget/Makefile b/libgadget/Makefile
index 51cc4258..03d13418 100644
--- a/libgadget/Makefile
+++ b/libgadget/Makefile
@@ -23,7 +23,6 @@ INCL = densitykernel.h \
 	physconst.h   \
 	sfr_eff.h \
 	stats.h \
-	metal_return.h \
 	winds.h \
 	timefac.h \
 	blackhole.h bhdynfric.h bhinfo.h \
@@ -68,7 +67,6 @@ TESTED = hci \
 	timebinmgr \
 	neutrinos_lra \
 	omega_nu_single \
-	metal_return \
 	cooling_rates \
 	density \
 	gravity \
@@ -101,7 +99,7 @@ GADGET_OBJS =  \
 	 run.o drift.o stats.o \
 	 timestep.o init.o checkpoint.o \
 	 sfr_eff.o cooling.o cooling_rates.o cooling_uvfluc.o cooling_qso_lightup.o \
-	 winds.o veldisp.o density.o metal_return.o \
+	 winds.o veldisp.o density.o \
 	 treewalk.o cosmology.o \
 	 gravshort-tree.o gravshort-pair.o hydra.o  timefac.o \
 	 gravpm.o powerspectrum.o \
@@ -159,9 +157,6 @@ all: libgadget.a libgadget-utils.a
 .objs/test_density: tests/test_density.c .objs/density.o libgadget.a ../tests/stub.c ../tests/cmocka.c libgadget-utils.a
 	$(MPICC) $(TCFLAGS) -I../tests/ $^ $(LIBS) -o $@
 
-.objs/test_metal_return: tests/test_metal_return.c .objs/metal_return.o libgadget.a ../tests/stub.c ../tests/cmocka.c libgadget-utils.a
-	$(MPICC) $(TCFLAGS) -I../tests/ $^ $(LIBS) -o $@
-
 .objs/test_cooling: tests/test_cooling.c .objs/cooling.o .objs/cooling_rates.o .objs/cooling_uvfluc.o ../tests/stub.c ../tests/cmocka.c libgadget-utils.a
 	$(MPICC) $(TCFLAGS) -I../tests/ $^ $(LIBS) -o $@
 
diff --git a/libgadget/metal_return.c b/libgadget/metal_return.c
deleted file mode 100644
index 39af0088..00000000
--- a/libgadget/metal_return.c
+++ /dev/null
@@ -1,1005 +0,0 @@
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <gsl/gsl_roots.h>
-#include <gsl/gsl_errno.h>
-#include <omp.h>
-
-#include "physconst.h"
-#include "walltime.h"
-#include "slotsmanager.h"
-#include "treewalk.h"
-#include "metal_return.h"
-#include "densitykernel.h"
-#include "density.h"
-#include "cosmology.h"
-#include "winds.h"
-#include "utils/spinlocks.h"
-#include "metal_tables.h"
-
-/*! \file metal_return.c
- *  \brief Compute the mass return rate of metals from stellar evolution.
- *
- *  This file returns metals from stars with some delay.
- *  Delayed sources followed are AGB stars, SNII and Sn1a.
- *  9 Species specific yields are stored in the stars and the gas particles.
- *  Gas enrichment is not run every timestep, but only for stars that have
- *  significant enrichment, or are young.
- *  The model closely follows Illustris-TNG, https://arxiv.org/abs/1703.02970
- *  However the tables used are slightly different: we consider SNII between 8 and 40 Msun
- *  following Kobayashi 2006, where they use a hybrid of Kobayashi and Portinari.
- *  AGB yields are from Karakas 2010, like TNG, but stars with mass > 6.5 are
- *  from Doherty 2014, not Fishlock 2014. More details of the model can be found in
- *  the Illustris model Vogelsberger 2013: https://arxiv.org/abs/1305.2913
- *  As the Kobayashi table only goes to 13 Msun, stars with masses 8-13 Msun
- *  are assumed to yield like a 13 Msun star, but scaled by a factor of (M/13).
- */
-
-#if NMETALS != NSPECIES
-    #pragma error " Inconsistency in metal number between slots and metals"
-#endif
-
-static struct metal_return_params
-{
-    double Sn1aN0;
-    int SPHWeighting;
-    double MaxNgbDeviation;
-} MetalParams;
-
-/* For tests*/
-void set_metal_params(double Sn1aN0)
-{
-    MetalParams.Sn1aN0 = Sn1aN0;
-}
-
-/*Set the parameters of the hydro module*/
-void
-set_metal_return_params(ParameterSet * ps)
-{
-    int ThisTask;
-    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-    if(ThisTask == 0) {
-        MetalParams.Sn1aN0 = param_get_double(ps, "MetalsSn1aN0");
-        MetalParams.SPHWeighting = param_get_int(ps, "MetalsSPHWeighting");
-        MetalParams.MaxNgbDeviation = param_get_double(ps, "MetalsMaxNgbDeviation");
-    }
-    MPI_Bcast(&MetalParams, sizeof(struct metal_return_params), MPI_BYTE, 0, MPI_COMM_WORLD);
-}
-
-/* Build the interpolators for each yield table. We use bilinear interpolation
- * so there is no extra memory allocation and we never free the tables*/
-void setup_metal_table_interp(struct interps * interp)
-{
-    interp->lifetime_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, LIFE_NMET, LIFE_NMASS);
-    gsl_interp2d_init(interp->lifetime_interp, lifetime_metallicity, lifetime_masses, lifetime, LIFE_NMET, LIFE_NMASS);
-    interp->agb_mass_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, AGB_NMET, AGB_NMASS);
-    gsl_interp2d_init(interp->agb_mass_interp, agb_metallicities, agb_masses, agb_total_mass, AGB_NMET, AGB_NMASS);
-    interp->agb_metallicity_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, AGB_NMET, AGB_NMASS);
-    gsl_interp2d_init(interp->agb_metallicity_interp, agb_metallicities, agb_masses, agb_total_metals, AGB_NMET, AGB_NMASS);
-    int i;
-    for(i=0; i<NMETALS; i++) {
-        interp->agb_metals_interp[i] = gsl_interp2d_alloc(gsl_interp2d_bilinear, AGB_NMET, AGB_NMASS);
-        gsl_interp2d_init(interp->agb_metals_interp[i], agb_metallicities, agb_masses, agb_yield[i], AGB_NMET, AGB_NMASS);
-    }
-    interp->snii_mass_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, SNII_NMET, SNII_NMASS);
-    gsl_interp2d_init(interp->snii_mass_interp, snii_metallicities, snii_masses, snii_total_mass, SNII_NMET, SNII_NMASS);
-    interp->snii_metallicity_interp = gsl_interp2d_alloc(gsl_interp2d_bilinear, SNII_NMET, SNII_NMASS);
-    gsl_interp2d_init(interp->snii_metallicity_interp, snii_metallicities, snii_masses, snii_total_metals, SNII_NMET, SNII_NMASS);
-    for(i=0; i<NMETALS; i++) {
-        interp->snii_metals_interp[i] = gsl_interp2d_alloc(gsl_interp2d_bilinear, SNII_NMET, SNII_NMASS);
-        gsl_interp2d_init(interp->snii_metals_interp[i], snii_metallicities, snii_masses, snii_yield[i], SNII_NMET, SNII_NMASS);
-    }
-}
-
-#define METALS_GET_PRIV(tw) ((struct MetalReturnPriv*) ((tw)->priv))
-
-typedef struct {
-    TreeWalkQueryBase base;
-    MyFloat Metallicity;
-    MyFloat Mass;
-    MyFloat Hsml;
-    MyFloat StarVolumeSPH;
-    /* This is the metal/mass generated this timestep.*/
-    MyFloat MetalSpeciesGenerated[NMETALS];
-    MyFloat MassGenerated;
-    MyFloat MetalGenerated;
-} TreeWalkQueryMetals;
-
-typedef struct {
-    TreeWalkResultBase base;
-    /* This is the total mass returned to
-     * the surrounding gas particles, for mass conservation.*/
-    MyFloat MassReturn;
-} TreeWalkResultMetals;
-
-typedef struct {
-    TreeWalkNgbIterBase base;
-    DensityKernel kernel;
-} TreeWalkNgbIterMetals;
-
-static int
-metal_return_haswork(int n, TreeWalk * tw);
-
-static void
-metal_return_ngbiter(
-    TreeWalkQueryMetals * I,
-    TreeWalkResultMetals * O,
-    TreeWalkNgbIterMetals * iter,
-    LocalTreeWalk * lv
-   );
-
-static void
-metal_return_copy(int place, TreeWalkQueryMetals * input, TreeWalk * tw);
-
-static void
-metal_return_postprocess(int place, TreeWalk * tw);
-
-static void
-metal_return_reduce(const int place, TreeWalkResultMetals * remote, const enum TreeWalkReduceMode mode, TreeWalk * tw);
-
-/* The Chabrier IMF used for computing SnII and AGB yields.
- * See 1305.2913 eq 3*/
-static double chabrier_imf(double mass)
-{
-    if(mass <= 1) {
-        return 0.852464 / mass * exp(- pow(log(mass / 0.079)/ 0.69, 2)/2);
-    }
-    else {
-        return 0.237912 * pow(mass, -2.3);
-    }
-}
-
-double atime_integ(double atime, void * params)
-{
-    Cosmology * CP = (Cosmology *) params;
-    return 1/(hubble_function(CP, atime) * atime);
-}
-
-/* Compute the difference in internal time units between two scale factors.*/
-static double atime_to_myr(Cosmology *CP, double atime1, double atime2, gsl_integration_workspace * gsl_work)
-{
-    /* t = dt/da da = 1/(Ha) da*/
-    /* Approximate hubble function as constant here: we only care
-     * about metal return over a single timestep*/
-    gsl_function ff = {atime_integ, CP};
-    double tmyr, abserr;
-    gsl_integration_qag(&ff, atime1, atime2, 1e-4, 0, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &tmyr, &abserr);
-    return tmyr * CP->UnitTime_in_s / SEC_PER_MEGAYEAR;
-}
-
-/* Functions for the root finder*/
-struct massbin_find_params
-{
-    double dtfind;
-    double stellarmetal;
-    gsl_interp2d * lifetime_tables;
-    gsl_interp_accel * metalacc;
-    gsl_interp_accel * massacc;
-};
-
-/* This is the inverse of the lifetime function from the tables.
- * Need to find the stars with a given lifetime*/
-double
-massendlife (double mass, void *params)
-{
-  struct massbin_find_params *p = (struct massbin_find_params *) params;
-  double tlife = gsl_interp2d_eval(p->lifetime_tables, lifetime_metallicity, lifetime_masses, lifetime, p->stellarmetal, mass, p->metalacc, p->massacc);
-  double tlifemyr = tlife/1e6;
-  return tlifemyr - p->dtfind;
-}
-
-/* Solve the lifetime function to find the lowest and highest mass bin that dies this timestep*/
-double do_rootfinding(struct massbin_find_params *p, double mass_low, double mass_high)
-{
-    int iter = 0;
-    gsl_function F;
-
-    F.function = &massendlife;
-    F.params = p;
-
-    const gsl_root_fsolver_type *T = gsl_root_fsolver_falsepos;
-    gsl_root_fsolver * s = gsl_root_fsolver_alloc (T);
-    gsl_root_fsolver_set (s, &F, mass_low, mass_high);
-
-    /* Iterate until we have an idea of the mass bins dying this timestep.
-     * No check is done for success, but it should always be close enough.*/
-    for(iter = 0; iter < MAXITER; iter++)
-    {
-      gsl_root_fsolver_iterate (s);
-      mass_low = gsl_root_fsolver_x_lower (s);
-      mass_high = gsl_root_fsolver_x_upper (s);
-      int status = gsl_root_test_interval (mass_low, mass_high,
-                                       0, 0.005);
-      //message(4, "lo %g hi %g root %g val %g\n", mass_low, mass_high, gsl_root_fsolver_root(s), massendlife(gsl_root_fsolver_root(s), p));
-      if (status == GSL_SUCCESS)
-        break;
-  }
-  double root = gsl_root_fsolver_root(s);
-  gsl_root_fsolver_free (s);
-  return root;
-}
-
-/* Find the mass bins which die in this timestep using the lifetime table.
- * dtstart, dtend - time at start and end of timestep in Myr.
- * stellarmetal - metallicity of the star.
- * lifetime_tables - 2D interpolation table of the lifetime.
- * masshigh, masslow - pointers in which to store the high and low lifetime limits
- */
-void find_mass_bin_limits(double * masslow, double * masshigh, const double dtstart, const double dtend, double stellarmetal, gsl_interp2d * lifetime_tables)
-{
-    /* Clamp metallicities to the table values.*/
-    if(stellarmetal < lifetime_metallicity[0])
-        stellarmetal = lifetime_metallicity[0];
-    if(stellarmetal > lifetime_metallicity[LIFE_NMET-1])
-        stellarmetal = lifetime_metallicity[LIFE_NMET-1];
-
-    /* Find the root with GSL routines. */
-    struct massbin_find_params p = {0};
-    p.metalacc = gsl_interp_accel_alloc();
-    p.massacc = gsl_interp_accel_alloc();
-    p.lifetime_tables = lifetime_tables;
-    p.stellarmetal = stellarmetal;
-    /* First find stars that died before the end of this timebin*/
-    p.dtfind = dtend;
-    /* If no stars have died yet*/
-    if(massendlife (MAXMASS, &p) >= 0)
-    {
-        *masslow = MAXMASS;
-        *masshigh = MAXMASS;
-        return;
-    }
-    /* All stars die before the end of this timestep*/
-    if(massendlife (agb_masses[0], &p) <= 0)
-        *masslow = lifetime_masses[0];
-    else
-        *masslow = do_rootfinding(&p, agb_masses[0], MAXMASS);
-
-    /* Now find stars that died before the start of this timebin*/
-    p.dtfind = dtstart;
-    /* Now we know that life(masslow) = dtend, so life(masslow) > dtstart, so life(masslow) - dtstart > 0
-     * This is when no stars have died at the beginning of this timestep.*/
-    if(massendlife (MAXMASS, &p) >= 0)
-        *masshigh = MAXMASS;
-    /* This can sometimes happen due to root finding inaccuracy.
-     * Just do this star next timestep.*/
-    else if(massendlife (*masslow, &p) <= 0)
-        *masshigh = *masslow;
-    else
-        *masshigh = do_rootfinding(&p, *masslow, MAXMASS);
-    gsl_interp_accel_free(p.metalacc);
-    gsl_interp_accel_free(p.massacc);
-}
-
-/* Parameters of the interpolator
- * to hand to the imf integral.
- * Use different interpolation structures
- * for mass return, metal return and yield.*/
-struct imf_integ_params
-{
-    gsl_interp2d * interp;
-    const double * masses;
-    const double * metallicities;
-    const double * weights;
-    double metallicity;
-};
-
-/* Integrand for a function which computes a Chabrier IMF weighted quantity.*/
-double chabrier_imf_integ (double mass, void * params)
-{
-    struct imf_integ_params * para = (struct imf_integ_params * ) params;
-    /* This is needed so that the yield for SNII with masses between 8 and 13 Msun
-     * are the same as the smallest mass in the table, 13 Msun,
-     * but they still contribute their number density to the IMF.*/
-    double intpmass = mass;
-    if(mass < para->masses[0])
-        intpmass = para->masses[0];
-    if(mass > para->masses[para->interp->ysize-1])
-        intpmass = para->masses[para->interp->ysize-1];
-    double weight = gsl_interp2d_eval(para->interp, para->metallicities, para->masses, para->weights, para->metallicity, intpmass, NULL, NULL);
-    /* This rescales the return by the original mass of the star, if it was outside the table.
-     * It means that, for example, an 8 Msun star does not return more than 8 Msun. */
-    weight *= (mass/intpmass);
-    return weight * chabrier_imf(mass);
-}
-
-/* Helper for the IMF normalisation*/
-double chabrier_mass(double mass, void * params)
-{
-    return mass * chabrier_imf(mass);
-}
-
-/* Compute factor to normalise the total mass in the IMF to unity.*/
-double compute_imf_norm(gsl_integration_workspace * gsl_work)
-{
-    double norm, abserr;
-    gsl_function ff = {chabrier_mass, NULL};
-    gsl_integration_qag(&ff, MINMASS, MAXMASS, 1e-4, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &norm, &abserr);
-    return norm;
-}
-
-/* Compute number of Sn1a: has units of N0 = 1.3e-3, which is SN1A/(unit initial mass in M_sun).
- * Zero for age < 40 Myr. */
-double sn1a_number(double dtmyrstart, double dtmyrend, double hub)
-{
-    /* Number of Sn1a events follows a delay time distribution (1305.2913, eq. 10) */
-    const double sn1aindex = 1.12;
-    const double tau8msun = 40;
-    if(dtmyrend < tau8msun)
-        return 0;
-    /* Lower integration limit modelling formation time of WDs*/
-    if(dtmyrstart < tau8msun)
-        dtmyrstart  = tau8msun;
-    /* Total number of Sn1a events from this star: integral evaluated from t=tau8msun to t=hubble time.*/
-    const double totalSN1a = 1- pow(1/(hub*HUBBLE * SEC_PER_MEGAYEAR)/tau8msun, 1-sn1aindex);
-    /* This is the integral of the DTD, normalised to the N0 rate which is in SN/M_sun.*/
-    double Nsn1a = MetalParams.Sn1aN0 /totalSN1a * (pow(dtmyrstart / tau8msun, 1-sn1aindex) - pow(dtmyrend / tau8msun, 1-sn1aindex));
-    return Nsn1a;
-}
-
-/* Compute yield of AGB stars: this is normalised to the yield which has units of Msun / (unit Msun in the initial SSP and so is really dimensionless.)*/
-double compute_agb_yield(gsl_interp2d * agb_interp, const double * agb_weights, double stellarmetal, double masslow, double masshigh, gsl_integration_workspace * gsl_work )
-{
-    struct imf_integ_params para;
-    gsl_function ff = {chabrier_imf_integ, &para};
-    double agbyield = 0, abserr;
-    /* Only return AGB metals for the range of AGB stars*/
-    if (masshigh > SNAGBSWITCH)
-        masshigh = SNAGBSWITCH;
-    if (masslow < agb_masses[0])
-        masslow = agb_masses[0];
-    if (stellarmetal > agb_metallicities[AGB_NMET-1])
-        stellarmetal = agb_metallicities[AGB_NMET-1];
-    if (stellarmetal < agb_metallicities[0])
-        stellarmetal = agb_metallicities[0];
-    /* This happens if no bins in range had dying stars this timestep*/
-    if(masslow >= masshigh)
-        return 0;
-    para.interp = agb_interp;
-    para.masses = agb_masses;
-    para.metallicities = agb_metallicities;
-    para.metallicity = stellarmetal;
-    para.weights = agb_weights;
-    gsl_integration_qag(&ff, masslow, masshigh, 1e-7, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &agbyield, &abserr);
-    return agbyield;
-}
-
-double compute_snii_yield(gsl_interp2d * snii_interp, const double * snii_weights, double stellarmetal, double masslow, double masshigh, gsl_integration_workspace * gsl_work )
-{
-    struct imf_integ_params para;
-    gsl_function ff = {chabrier_imf_integ, &para};
-    double yield = 0, abserr;
-    /* Only return metals for the range of SNII stars.*/
-    if (masshigh > snii_masses[SNII_NMASS-1])
-        masshigh = snii_masses[SNII_NMASS-1];
-    if (masslow < SNAGBSWITCH)
-        masslow = SNAGBSWITCH;
-    if (stellarmetal > snii_metallicities[SNII_NMET-1])
-        stellarmetal = snii_metallicities[SNII_NMET-1];
-    if (stellarmetal < snii_metallicities[0])
-        stellarmetal = snii_metallicities[0];
-    para.interp = snii_interp;
-    para.masses = snii_masses;
-    para.metallicities = snii_metallicities;
-    para.metallicity = stellarmetal;
-    para.weights = snii_weights;
-    /* This happens if no bins in range had dying stars this timestep*/
-    if(masslow >= masshigh)
-        return 0;
-    gsl_integration_qag(&ff, masslow, masshigh, 1e-7, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &yield, &abserr);
-    return yield;
-}
-
-/* Compute the total mass yield for this star in this timestep*/
-static double mass_yield(double dtmyrstart, double dtmyrend, double stellarmetal, double hub, struct interps * interp, double imf_norm, gsl_integration_workspace * gsl_work, double masslow, double masshigh)
-{
-    /* Number of AGB stars/SnII by integrating the IMF*/
-    double agbyield = compute_agb_yield(interp->agb_mass_interp, agb_total_mass, stellarmetal, masslow, masshigh, gsl_work);
-    double sniiyield = compute_snii_yield(interp->snii_mass_interp, snii_total_mass, stellarmetal, masslow, masshigh, gsl_work);
-    /* Fraction of the IMF which goes off this timestep. Normalised by the total IMF so we get a fraction of the SSP.*/
-    double massyield = (agbyield + sniiyield)/imf_norm;
-    /* Mass yield from Sn1a*/
-    double Nsn1a = sn1a_number(dtmyrstart, dtmyrend, hub);
-    massyield += Nsn1a * sn1a_total_metals;
-    //message(3, "masslow %g masshigh %g stellarmetal %g dystart %g dtend %g agb %g snii %g sn1a %g imf_norm %g\n",
-    //        masslow, masshigh, stellarmetal, dtmyrstart, dtmyrend, agbyield, sniiyield, Nsn1a * sn1a_total_metals, imf_norm);
-    return massyield;
-}
-
-/* Compute the total metal yield for this star in this timestep*/
-static double metal_yield(double dtmyrstart, double dtmyrend, double stellarmetal, double hub, struct interps * interp, MyFloat * MetalYields, double imf_norm, gsl_integration_workspace * gsl_work, double masslow, double masshigh)
-{
-    double MetalGenerated = 0;
-    /* Number of AGB stars/SnII by integrating the IMF*/
-    MetalGenerated += compute_agb_yield(interp->agb_metallicity_interp, agb_total_metals, stellarmetal, masslow, masshigh, gsl_work);
-    MetalGenerated += compute_snii_yield(interp->snii_metallicity_interp, snii_total_metals, stellarmetal, masslow, masshigh, gsl_work);
-    MetalGenerated /= imf_norm;
-
-    int i;
-    for(i = 0; i < NMETALS; i++)
-    {
-        MetalYields[i] = 0;
-        MetalYields[i] += compute_agb_yield(interp->agb_metals_interp[i], agb_yield[i], stellarmetal, masslow, masshigh, gsl_work);
-        MetalYields[i] += compute_snii_yield(interp->snii_metals_interp[i], snii_yield[i], stellarmetal, masslow, masshigh, gsl_work);
-        MetalYields[i] /= imf_norm;
-    }
-    double Nsn1a = sn1a_number(dtmyrstart, dtmyrend, hub);
-    for(i = 0; i < NMETALS; i++)
-        MetalYields[i] += Nsn1a * sn1a_yields[i];
-    MetalGenerated += Nsn1a * sn1a_total_metals;
-
-    return MetalGenerated;
-}
-
-/* Initialise the private structure, finding stellar mass return and ages*/
-int64_t
-metal_return_init(const ActiveParticles * act, Cosmology * CP, struct MetalReturnPriv * priv, const double atime)
-{
-    int nthread = omp_get_max_threads();
-    priv->gsl_work = ta_malloc("gsl_work", gsl_integration_workspace *, nthread);
-    int i;
-    /* Allocate a workspace for each thread*/
-    for(i=0; i < nthread; i++)
-        priv->gsl_work[i] = gsl_integration_workspace_alloc(GSL_WORKSPACE);
-    priv->hub = CP->HubbleParam;
-
-    /* Initialize*/
-    setup_metal_table_interp(&priv->interp);
-    priv->StellarAges = (MyFloat *) mymalloc("StellarAges", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->MassReturn = (MyFloat *) mymalloc("MassReturn", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->LowDyingMass = (MyFloat *) mymalloc("LowDyingMass", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->HighDyingMass = (MyFloat *) mymalloc("HighDyingMass", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->StarVolumeSPH = (MyFloat *) mymalloc("StarVolumeSPH", SlotsManager->info[4].size * sizeof(MyFloat));
-
-    priv->imf_norm = compute_imf_norm(priv->gsl_work[0]);
-    /* Maximum possible mass return for below*/
-    double maxmassfrac = mass_yield(0, 1/(CP->HubbleParam*HUBBLE * SEC_PER_MEGAYEAR), snii_metallicities[SNII_NMET-1], CP->HubbleParam, &priv->interp, priv->imf_norm, priv->gsl_work[0],agb_masses[0], MAXMASS);
-
-    int64_t haswork = 0;
-    /* First find the mass return as a fraction of the total mass and the age of the star.
-     * This is done first so we can skip density computation for not active stars*/
-    #pragma omp parallel for reduction(+: haswork)
-    for(i=0; i < act->NumActiveParticle;i++)
-    {
-        int p_i = act->ActiveParticle ? act->ActiveParticle[i] : i;
-        if(P[p_i].Type != 4)
-            continue;
-        int tid = omp_get_thread_num();
-        const int slot = P[p_i].PI;
-        priv->StellarAges[slot] = atime_to_myr(CP, STARP(p_i).FormationTime, atime, priv->gsl_work[tid]);
-        /* Note this takes care of units*/
-        double initialmass = P[p_i].Mass + STARP(p_i).TotalMassReturned;
-        find_mass_bin_limits(&priv->LowDyingMass[slot], &priv->HighDyingMass[slot], STARP(p_i).LastEnrichmentMyr, priv->StellarAges[P[p_i].PI], STARP(p_i).Metallicity, priv->interp.lifetime_interp);
-
-        priv->MassReturn[slot] = initialmass * mass_yield(STARP(p_i).LastEnrichmentMyr, priv->StellarAges[P[p_i].PI], STARP(p_i).Metallicity, CP->HubbleParam, &priv->interp, priv->imf_norm, priv->gsl_work[tid],priv->LowDyingMass[slot], priv->HighDyingMass[slot]);
-        //message(3, "Particle %d PI %d massgen %g mass %g initmass %g\n", p_i, P[p_i].PI, priv->MassReturn[P[p_i].PI], P[p_i].Mass, initialmass);
-        /* Guard against making a zero mass particle and warn since this should not happen.*/
-        if(STARP(p_i).TotalMassReturned + priv->MassReturn[slot] > initialmass * maxmassfrac) {
-            if(priv->MassReturn[slot] / STARP(p_i).TotalMassReturned > 0.01)
-                message(1, "Large mass return id %ld %g from %d mass %g initial %g (maxfrac %g) age %g lastenrich %g metal %g dymass %g %g\n",
-                    P[p_i].ID, priv->MassReturn[slot], p_i, STARP(p_i).TotalMassReturned, initialmass, maxmassfrac, priv->StellarAges[P[p_i].PI], STARP(p_i).LastEnrichmentMyr, STARP(p_i).Metallicity, priv->LowDyingMass[slot], priv->HighDyingMass[slot]);
-            priv->MassReturn[slot] = initialmass * maxmassfrac - STARP(p_i).TotalMassReturned;
-            if(priv->MassReturn[slot] < 0) {
-                priv->MassReturn[slot] = 0;
-            }
-            /* Ensure that we skip this step*/
-            if(!metals_haswork(p_i, priv->MassReturn))
-                STARP(p_i).LastEnrichmentMyr = priv->StellarAges[P[p_i].PI];
-
-        }
-        /* Keep count of how much work we need to do*/
-        if(metals_haswork(p_i, priv->MassReturn))
-            haswork++;
-    }
-    return haswork;
-}
-
-/* Free memory allocated by metal_return_init */
-void
-metal_return_priv_free(struct MetalReturnPriv * priv)
-{
-    myfree(priv->StarVolumeSPH);
-    myfree(priv->HighDyingMass);
-    myfree(priv->LowDyingMass);
-    myfree(priv->MassReturn);
-    myfree(priv->StellarAges);
-
-    int i;
-    for(i=0; i < omp_get_max_threads(); i++)
-        gsl_integration_workspace_free(priv->gsl_work[i]);
-
-    ta_free(priv->gsl_work);
-}
-
-/*! This function is the driver routine for the calculation of metal return. */
-void
-metal_return(const ActiveParticles * act, ForceTree * gasTree, Cosmology * CP, const double atime, const double AvgGasMass)
-{
-    /* Do nothing if no stars yet*/
-    int64_t totstar;
-    MPI_Allreduce(&SlotsManager->info[4].size, &totstar, 1, MPI_INT64, MPI_SUM, MPI_COMM_WORLD);
-    if(totstar == 0)
-        return;
-
-    struct MetalReturnPriv priv[1];
-
-    int64_t nwork = metal_return_init(act, CP, priv, atime);
-
-    /* Maximum mass of a gas particle after enrichment: cap it at a few times the initial mass.
-     * FIXME: Ideally we should here fork a new particle with a smaller gas mass. We should
-     * figure out then how set the gas entropy. A possibly better idea is to add
-     * a generic routine to split gas particles into the density code.*/
-    priv->MaxGasMass = 4* AvgGasMass;
-
-    int64_t totwork;
-    MPI_Allreduce(&nwork, &totwork, 1, MPI_INT64, MPI_SUM, MPI_COMM_WORLD);
-
-    walltime_measure("/SPH/Metals/Init");
-
-    if(totwork == 0) {
-        metal_return_priv_free(priv);
-        return;
-    }
-
-    if(!gasTree->tree_allocated_flag || !(gasTree->mask & GASMASK))
-        endrun(5, "metal_return called with bad tree allocated %d mask %d\n", gasTree->tree_allocated_flag, gasTree->mask);
-    /* Compute total number of weights around each star for actively returning stars*/
-    stellar_density(act, priv->StarVolumeSPH, priv->MassReturn, gasTree);
-
-    /* Do the metal return*/
-    TreeWalk tw[1] = {{0}};
-
-    tw->ev_label = "METALS";
-    tw->visit = (TreeWalkVisitFunction) treewalk_visit_ngbiter;
-    tw->ngbiter = (TreeWalkNgbIterFunction) metal_return_ngbiter;
-    tw->ngbiter_type_elsize = sizeof(TreeWalkNgbIterMetals);
-    tw->haswork = metal_return_haswork;
-    tw->fill = (TreeWalkFillQueryFunction) metal_return_copy;
-    tw->reduce = (TreeWalkReduceResultFunction) metal_return_reduce;
-    tw->postprocess = (TreeWalkProcessFunction) metal_return_postprocess;
-    tw->query_type_elsize = sizeof(TreeWalkQueryMetals);
-    tw->result_type_elsize = sizeof(TreeWalkResultMetals);
-    tw->tree = gasTree;
-    tw->priv = priv;
-
-    priv->spin = init_spinlocks(SlotsManager->info[0].size);
-    treewalk_run(tw, act->ActiveParticle, act->NumActiveParticle);
-    free_spinlocks(priv->spin);
-
-    metal_return_priv_free(priv);
-
-    /* collect some timing information */
-    walltime_measure("/SPH/Metals/Yield");
-}
-
-/* This function is unusually important:
- * it computes the total amount of metals to be returned in this timestep.*/
-static void
-metal_return_copy(int place, TreeWalkQueryMetals * input, TreeWalk * tw)
-{
-    input->Metallicity = STARP(place).Metallicity;
-    input->Mass = P[place].Mass;
-    input->Hsml = P[place].Hsml;
-    int pi = P[place].PI;
-    input->StarVolumeSPH = METALS_GET_PRIV(tw)->StarVolumeSPH[pi];
-    double InitialMass = P[place].Mass + STARP(place).TotalMassReturned;
-    double dtmyrend = METALS_GET_PRIV(tw)->StellarAges[pi];
-    double dtmyrstart = STARP(place).LastEnrichmentMyr;
-    int tid = omp_get_thread_num();
-    /* This is the total mass returned from this stellar population this timestep. Note this is already in the desired units.*/
-    input->MassGenerated = METALS_GET_PRIV(tw)->MassReturn[pi];
-    /* This returns the total amount of metal produced this timestep, and also fills out MetalSpeciesGenerated, which is an
-     * element by element table of the metal produced by dying stars this timestep.*/
-    double total_z_yield = metal_yield(dtmyrstart, dtmyrend, input->Metallicity, METALS_GET_PRIV(tw)->hub, &METALS_GET_PRIV(tw)->interp, input->MetalSpeciesGenerated, METALS_GET_PRIV(tw)->imf_norm, METALS_GET_PRIV(tw)->gsl_work[tid], METALS_GET_PRIV(tw)->LowDyingMass[pi], METALS_GET_PRIV(tw)->HighDyingMass[pi]);
-    /* The total metal returned is the metal ejected into the ISM this timestep. total_z_yield is given as a fraction of the initial SSP.*/
-    input->MetalGenerated = InitialMass * total_z_yield;
-    //message(3, "Particle %d PI %d z %g massgen %g metallicity %g\n", pi, P[pi].PI, total_z_yield, METALS_GET_PRIV(tw)->MassReturn[pi], STARP(place).Metallicity);
-    /* It should be positive! If it is not, this is some integration error
-     * in the yield table as we cannot destroy metal which is not present.*/
-    if(input->MetalGenerated < 0)
-        input->MetalGenerated = 0;
-    /* Similarly for all the other metal species*/
-    int i;
-    for(i = 0; i < NMETALS; i++) {
-        input->MetalSpeciesGenerated[i] *= InitialMass;
-        if(input->MetalSpeciesGenerated[i] < 0)
-            input->MetalSpeciesGenerated[i] = 0;
-    }
-}
-
-/* Update the mass return variable to contain the amount of mass actually returned.*/
-static void
-metal_return_reduce(int place, TreeWalkResultMetals * remote, enum TreeWalkReduceMode mode, TreeWalk * tw)
-{
-    TREEWALK_REDUCE(METALS_GET_PRIV(tw)->MassReturn[P[place].PI], remote->MassReturn);
-}
-
-/* Update the mass and enrichment variables for the star.
- * Note that the stellar metallicity is not updated, as the
- * metal-forming stars are now dead and their metals in the gas.*/
-static void
-metal_return_postprocess(int place, TreeWalk * tw)
-{
-    /* Conserve mass returned*/
-    P[place].Mass -= METALS_GET_PRIV(tw)->MassReturn[P[place].PI];
-    STARP(place).TotalMassReturned += METALS_GET_PRIV(tw)->MassReturn[P[place].PI];
-    /* Update the last enrichment time*/
-    STARP(place).LastEnrichmentMyr = METALS_GET_PRIV(tw)->StellarAges[P[place].PI];
-}
-
-/*! For all gas particles within the density radius of this star,
- * add a fraction of the total mass and metals generated,
- * weighted by the SPH kernel distance from the star.
- */
-static void
-metal_return_ngbiter(
-    TreeWalkQueryMetals * I,
-    TreeWalkResultMetals * O,
-    TreeWalkNgbIterMetals * iter,
-    LocalTreeWalk * lv
-   )
-{
-    if(iter->base.other == -1) {
-        /* Only return metals to gas*/
-        iter->base.mask = GASMASK;
-        iter->base.Hsml = I->Hsml;
-        iter->base.symmetric = NGB_TREEFIND_ASYMMETRIC;
-        /* Initialise the mass lost by this star in this timestep*/
-        O->MassReturn = 0;
-        density_kernel_init(&iter->kernel, I->Hsml, GetDensityKernelType());
-        return;
-    }
-
-    const int other = iter->base.other;
-    const double r2 = iter->base.r2;
-    const double r = iter->base.r;
-
-    if(r2 > 0 && r2 < iter->kernel.HH)
-    {
-        double wk = 1;
-        const double u = r * iter->kernel.Hinv;
-
-        if(MetalParams.SPHWeighting)
-            wk = density_kernel_wk(&iter->kernel, u);
-        double ThisMetals[NMETALS];
-        if(I->StarVolumeSPH ==0)
-            endrun(3, "StarVolumeSPH %g hsml %g\n", I->StarVolumeSPH, I->Hsml);
-        double newmass;
-        int pi = P[other].PI;
-        lock_spinlock(pi, METALS_GET_PRIV(lv->tw)->spin);
-        /* Volume of particle weighted by the SPH kernel*/
-        double volume = P[other].Mass / SPHP(other).Density;
-        double returnfraction = wk * volume / I->StarVolumeSPH;
-        double thismass = returnfraction * I->MassGenerated;
-        /* Ensure that the gas particles don't become overweight.
-         * If there are few gas particles around, the star clusters
-         * will hold onto their metals.*/
-        if(P[other].Mass + thismass > METALS_GET_PRIV(lv->tw)->MaxGasMass) {
-            unlock_spinlock(pi, METALS_GET_PRIV(lv->tw)->spin);
-            return;
-        }
-        /* Add metals weighted by SPH kernel*/
-        int i;
-        for(i = 0; i < NMETALS; i++)
-            ThisMetals[i] = returnfraction * I->MetalSpeciesGenerated[i];
-        double thismetal = returnfraction * I->MetalGenerated;
-        /* Add the metals to the particle.*/
-        for(i = 0; i < NMETALS; i++)
-            SPHP(other).Metals[i] = (SPHP(other).Metals[i] * P[other].Mass + ThisMetals[i])/(P[other].Mass + thismass);
-        /* Update total metallicity*/
-        SPHP(other).Metallicity = (SPHP(other).Metallicity * P[other].Mass + thismetal)/(P[other].Mass + thismass);
-        /* Update mass*/
-        double massfrac = (P[other].Mass + thismass) / P[other].Mass;
-        P[other].Mass *= massfrac;
-        /* Density also needs a correction so the volume fraction is unchanged.
-         * This ensures that volume = Mass/Density is unchanged for the next particle
-         * and thus the weighting still sums to unity.*/
-        SPHP(other).Density *= massfrac;
-        /* Keep track of how much was returned for conservation purposes*/
-        O->MassReturn += thismass;
-        newmass = P[other].Mass;
-        unlock_spinlock(pi, METALS_GET_PRIV(lv->tw)->spin);
-        if(newmass <= 0)
-            endrun(3, "New mass %g new metal %g in particle %d id %ld from star mass %g metallicity %g\n",
-                   newmass, SPHP(other).Metallicity, other, P[other].ID, I->Mass, I->Metallicity);
-    }
-}
-
-/* Find stars returning enough metals to the gas.
- * This is a wrapper function to allow for
- * different private structs in different treewalks*/
-int
-metals_haswork(int i, MyFloat * MassReturn)
-{
-    if(P[i].Type != 4)
-        return 0;
-    int pi = P[i].PI;
-    /* Don't do enrichment from all stars, just those with significant enrichment*/
-    if(MassReturn[pi] < 1e-3 * (P[i].Mass + STARP(i).TotalMassReturned))
-        return 0;
-    return 1;
-}
-
-static int
-metal_return_haswork(int i, TreeWalk * tw)
-{
-    return metals_haswork(i, METALS_GET_PRIV(tw)->MassReturn);
-}
-
-/* Number of densities to evaluate simultaneously*/
-#define NHSML 10
-
-typedef struct {
-    TreeWalkNgbIterBase base;
-    DensityKernel kernel[NHSML];
-    double kernel_volume[NHSML];
-} TreeWalkNgbIterStellarDensity;
-
-typedef struct
-{
-    TreeWalkQueryBase base;
-    MyFloat Hsml[NHSML];
-} TreeWalkQueryStellarDensity;
-
-typedef struct {
-    TreeWalkResultBase base;
-    MyFloat VolumeSPH[NHSML];
-    MyFloat Ngb[NHSML];
-    int maxcmpte;
-    int _alignment;
-} TreeWalkResultStellarDensity;
-
-struct StellarDensityPriv {
-    /* Current number of neighbours*/
-    MyFloat (*NumNgb)[NHSML];
-    /* Lower and upper bounds on smoothing length*/
-    MyFloat *Left, *Right;
-    MyFloat (*VolumeSPH)[NHSML];
-    /* For haswork*/
-    MyFloat *MassReturn;
-    /*!< Desired number of SPH neighbours */
-    double DesNumNgb;
-    /* Maximum index where NumNgb is valid. */
-    int * maxcmpte;
-};
-
-#define STELLAR_DENSITY_GET_PRIV(tw) ((struct StellarDensityPriv*) ((tw)->priv))
-
-static int
-stellar_density_haswork(int i, TreeWalk * tw)
-{
-    return metals_haswork(i, STELLAR_DENSITY_GET_PRIV(tw)->MassReturn);
-}
-
-/* Get Hsml for one of the evaluations*/
-static inline double
-effhsml(int place, int i, TreeWalk * tw)
-{
-    int pi = P[place].PI;
-    double left = STELLAR_DENSITY_GET_PRIV(tw)->Left[pi];
-    double right = STELLAR_DENSITY_GET_PRIV(tw)->Right[pi];
-    /* If somehow Hsml has become zero through underflow, use something non-zero
-     * to make sure we converge. */
-    if(left == 0 && right > 0.99*tw->tree->BoxSize && P[place].Hsml == 0) {
-        int fat = force_get_father(place, tw->tree);
-        P[place].Hsml = tw->tree->Nodes[fat].len;
-        if(P[place].Hsml == 0)
-            P[place].Hsml = tw->tree->BoxSize / pow(PartManager->NumPart, 1./3)/4.;
-    }
-    /* Use slightly past the current Hsml as the right most boundary*/
-    if(right > 0.99*tw->tree->BoxSize)
-        right = P[place].Hsml * ((1.+NHSML)/NHSML);
-    /* Use 1/2 of current Hsml for left. The asymmetry is because it is free
-     * to compute extra densities for h < Hsml, but not for h > Hsml.*/
-    if(left == 0)
-        left = 0.1 * P[place].Hsml;
-    /* From left + 1/N  to right - 1/N, evenly spaced in volume,
-     * since NumNgb ~ h^3.*/
-    double rvol = pow(right, 3);
-    double lvol = pow(left, 3);
-    return pow((1.*i+1)/(1.*NHSML+1) * (rvol - lvol) + lvol, 1./3);
-}
-
-static void
-stellar_density_copy(int place, TreeWalkQueryStellarDensity * I, TreeWalk * tw)
-{
-    int i;
-    for(i = 0; i < NHSML; i++)
-        I->Hsml[i] = effhsml(place, i, tw);
-}
-
-static void
-stellar_density_reduce(int place, TreeWalkResultStellarDensity * remote, enum TreeWalkReduceMode mode, TreeWalk * tw)
-{
-    int pi = P[place].PI;
-    int i;
-    if(mode == TREEWALK_PRIMARY || STELLAR_DENSITY_GET_PRIV(tw)->maxcmpte[pi] > remote->maxcmpte)
-        STELLAR_DENSITY_GET_PRIV(tw)->maxcmpte[pi] = remote->maxcmpte;
-    for(i = 0; i < remote->maxcmpte; i++) {
-        TREEWALK_REDUCE(STELLAR_DENSITY_GET_PRIV(tw)->NumNgb[pi][i], remote->Ngb[i]);
-        TREEWALK_REDUCE(STELLAR_DENSITY_GET_PRIV(tw)->VolumeSPH[pi][i], remote->VolumeSPH[i]);
-    }
-}
-
-void stellar_density_check_neighbours (int i, TreeWalk * tw)
-{
-    MyFloat * Left = STELLAR_DENSITY_GET_PRIV(tw)->Left;
-    MyFloat * Right = STELLAR_DENSITY_GET_PRIV(tw)->Right;
-
-    int pi = P[i].PI;
-    int tid = omp_get_thread_num();
-    double desnumngb = STELLAR_DENSITY_GET_PRIV(tw)->DesNumNgb;
-
-    const int maxcmpt = STELLAR_DENSITY_GET_PRIV(tw)->maxcmpte[pi];
-    int j;
-    double evalhsml[NHSML];
-    evalhsml[0] = effhsml(i, 0, tw);
-    for(j = 1; j < maxcmpt; j++)
-        evalhsml[j] = effhsml(i, j, tw);
-
-    int close = 0;
-    P[i].Hsml = ngb_narrow_down(&Right[pi],&Left[pi],evalhsml,STELLAR_DENSITY_GET_PRIV(tw)->NumNgb[pi],maxcmpt,desnumngb,&close,tw->tree->BoxSize);
-    double numngb = STELLAR_DENSITY_GET_PRIV(tw)->NumNgb[pi][close];
-
-    /* Save VolumeSPH*/
-    STELLAR_DENSITY_GET_PRIV(tw)->VolumeSPH[pi][0] = STELLAR_DENSITY_GET_PRIV(tw)->VolumeSPH[pi][close];
-
-    /* now check whether we had enough neighbours */
-    if(numngb < (desnumngb - MetalParams.MaxNgbDeviation) ||
-            (numngb > (desnumngb + MetalParams.MaxNgbDeviation)))
-    {
-        /* This condition is here to prevent the density code looping forever if it encounters
-         * multiple particles at the same position. If this happens you likely have worse
-         * problems anyway, so warn also. */
-        if((Right[pi] - Left[pi]) < 1.0e-4 * Left[pi])
-        {
-            /* If this happens probably the exchange is screwed up and all your particles have moved to (0,0,0)*/
-            message(1, "Very tight Hsml bounds for i=%d ID=%lu type %d Hsml=%g Left=%g Right=%g Ngbs=%g des = %g Right-Left=%g pos=(%g|%g|%g)\n",
-             i, P[i].ID, P[i].Type, evalhsml[0], Left[pi], Right[pi], numngb, desnumngb, Right[pi] - Left[pi], P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]);
-            return;
-        }
-        /* More work needed: add this particle to the redo queue*/
-        tw->NPRedo[tid][tw->NPLeft[tid]] = i;
-        tw->NPLeft[tid] ++;
-        if(tw->Niteration >= 10)
-            message(1, "i=%d ID=%lu Hsml=%g lastdhsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g pos=(%g|%g|%g)\n",
-             i, P[i].ID, P[i].Hsml, evalhsml[close], Left[pi], Right[pi], numngb, Right[pi] - Left[pi], P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]);
-
-    }
-    if(tw->maxnumngb[tid] < numngb)
-        tw->maxnumngb[tid] = numngb;
-    if(tw->minnumngb[tid] > numngb)
-        tw->minnumngb[tid] = numngb;
-
-}
-
-static void
-stellar_density_ngbiter(
-        TreeWalkQueryStellarDensity * I,
-        TreeWalkResultStellarDensity * O,
-        TreeWalkNgbIterStellarDensity * iter,
-        LocalTreeWalk * lv)
-{
-    if(iter->base.other == -1) {
-        int i;
-        for(i = 0; i < NHSML; i++) {
-            density_kernel_init(&iter->kernel[i], I->Hsml[i], GetDensityKernelType());
-            iter->kernel_volume[i] = density_kernel_volume(&iter->kernel[i]);
-        }
-        iter->base.Hsml = I->Hsml[NHSML-1];
-        iter->base.mask = GASMASK; /* gas only */
-        iter->base.symmetric = NGB_TREEFIND_ASYMMETRIC;
-        O->maxcmpte = NHSML;
-        return;
-    }
-    const int other = iter->base.other;
-    const double r = iter->base.r;
-    const double r2 = iter->base.r2;
-
-    int i;
-    for(i = 0; i < O->maxcmpte; i++) {
-        if(r2 < iter->kernel[i].HH)
-        {
-            const double u = r * iter->kernel[i].Hinv;
-            double wk = density_kernel_wk(&iter->kernel[i], u);
-            O->Ngb[i] += wk * iter->kernel_volume[i];
-            /* For stars we need the total weighting, sum(w_k m_k / rho_k).*/
-            double thisvol = P[other].Mass / SPHP(other).Density;
-            if(MetalParams.SPHWeighting)
-                thisvol *= wk;
-            O->VolumeSPH[i] += thisvol;
-        }
-    }
-    double desnumngb = STELLAR_DENSITY_GET_PRIV(lv->tw)->DesNumNgb;
-    /* If there is an entry which is above desired DesNumNgb,
-     * we don't need to search past it. After this point
-     * all entries in the Ngb table above O->Ngb are invalid.*/
-    for(i = 0; i < NHSML; i++) {
-        if(O->Ngb[i] > desnumngb) {
-            O->maxcmpte = i+1;
-            iter->base.Hsml = I->Hsml[i];
-            break;
-        }
-    }
-
-}
-
-void
-stellar_density(const ActiveParticles * act, MyFloat * StarVolumeSPH, MyFloat * MassReturn, const ForceTree * const tree)
-{
-    TreeWalk tw[1] = {{0}};
-    struct StellarDensityPriv priv[1];
-
-    tw->ev_label = "STELLAR_DENSITY";
-    tw->visit = treewalk_visit_nolist_ngbiter;
-    tw->NoNgblist = 1;
-    tw->ngbiter_type_elsize = sizeof(TreeWalkNgbIterStellarDensity);
-    tw->ngbiter = (TreeWalkNgbIterFunction) stellar_density_ngbiter;
-    tw->haswork = stellar_density_haswork;
-    tw->fill = (TreeWalkFillQueryFunction) stellar_density_copy;
-    tw->reduce = (TreeWalkReduceResultFunction) stellar_density_reduce;
-    tw->postprocess = (TreeWalkProcessFunction) stellar_density_check_neighbours;
-    tw->query_type_elsize = sizeof(TreeWalkQueryStellarDensity);
-    tw->result_type_elsize = sizeof(TreeWalkResultStellarDensity);
-    tw->priv = priv;
-    tw->tree = tree;
-
-    int i;
-
-    priv->MassReturn = MassReturn;
-
-    priv->Left = (MyFloat *) mymalloc("DENS_PRIV->Left", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->Right = (MyFloat *) mymalloc("DENS_PRIV->Right", SlotsManager->info[4].size * sizeof(MyFloat));
-    priv->NumNgb = (MyFloat (*) [NHSML]) mymalloc("DENS_PRIV->NumNgb", SlotsManager->info[4].size * sizeof(priv->NumNgb[0]));
-    priv->VolumeSPH = (MyFloat (*) [NHSML]) mymalloc("DENS_PRIV->VolumeSPH", SlotsManager->info[4].size * sizeof(priv->VolumeSPH[0]));
-    priv->maxcmpte = (int *) mymalloc("maxcmpte", SlotsManager->info[4].size * sizeof(int));
-
-    priv->DesNumNgb = GetNumNgb(GetDensityKernelType());
-
-    #pragma omp parallel for
-    for(i = 0; i < act->NumActiveParticle; i++) {
-        int a = act->ActiveParticle ? act->ActiveParticle[i] : i;
-        /* Skip the garbage particles */
-        if(P[a].IsGarbage)
-            continue;
-        if(!stellar_density_haswork(a, tw))
-            continue;
-        int pi = P[a].PI;
-        priv->Left[pi] = 0;
-        priv->Right[pi] = tree->BoxSize;
-    }
-
-    /* allocate buffers to arrange communication */
-
-    treewalk_do_hsml_loop(tw, act->ActiveParticle, act->NumActiveParticle, 1);
-    #pragma omp parallel for
-    for(i = 0; i < act->NumActiveParticle; i++) {
-        int a = act->ActiveParticle ? act->ActiveParticle[i] : i;
-        /* Skip the garbage particles */
-        if(P[a].IsGarbage)
-            continue;
-        if(!stellar_density_haswork(a, tw))
-            continue;
-        /* Copy the Star Volume SPH*/
-        StarVolumeSPH[P[a].PI] = priv->VolumeSPH[P[a].PI][0];
-        if(priv->VolumeSPH[P[a].PI][0] == 0)
-            endrun(3, "i = %d pi = %d StarVolumeSPH %g hsml %g\n", a, P[a].PI, priv->VolumeSPH[P[a].PI][0], P[a].Hsml);
-    }
-
-    myfree(priv->maxcmpte);
-    myfree(priv->VolumeSPH);
-    myfree(priv->NumNgb);
-    myfree(priv->Right);
-    myfree(priv->Left);
-
-    double timeall = walltime_measure(WALLTIME_IGNORE);
-
-    double timecomp = tw->timecomp0 + tw->timecomp3 + tw->timecomp1 + tw->timecomp2;
-    walltime_add("/SPH/Metals/Density/Compute", timecomp);
-    walltime_add("/SPH/Metals/Density/Wait", tw->timewait1);
-    walltime_add("/SPH/Metals/Density/Reduce", tw->timecommsumm);
-    walltime_add("/SPH/Metals/Density/Misc", timeall - (timecomp + tw->timewait1 + tw->timecommsumm));
-
-    return;
-}
diff --git a/libgadget/metal_return.h b/libgadget/metal_return.h
deleted file mode 100644
index f5acd9c8..00000000
--- a/libgadget/metal_return.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef METAL_RETURN_H
-#define METAL_RETURN_H
-
-#include "forcetree.h"
-#include "timestep.h"
-#include "utils/paramset.h"
-#include <gsl/gsl_interp2d.h>
-#include <gsl/gsl_integration.h>
-#include "slotsmanager.h"
-
-struct interps
-{
-    gsl_interp2d * lifetime_interp;
-    gsl_interp2d * agb_mass_interp;
-    gsl_interp2d * agb_metallicity_interp;
-    gsl_interp2d * agb_metals_interp[NMETALS];
-    gsl_interp2d * snii_mass_interp;
-    gsl_interp2d * snii_metallicity_interp;
-    gsl_interp2d * snii_metals_interp[NMETALS];
-};
-
-/* Build the interpolators for each yield table. We use bilinear interpolation
- * so there is no extra memory allocation and we never free the tables*/
-void setup_metal_table_interp(struct interps * interp);
-
-struct MetalReturnPriv {
-    gsl_integration_workspace ** gsl_work;
-    MyFloat * StellarAges;
-    MyFloat * MassReturn;
-    MyFloat * LowDyingMass;
-    MyFloat * HighDyingMass;
-    double imf_norm;
-    double hub;
-    /* Maximum of the new gas mass*/
-    double MaxGasMass;
-    Cosmology *CP;
-    MyFloat * StarVolumeSPH;
-    struct interps interp;
-    struct SpinLocks * spin;
-};
-
-void metal_return(const ActiveParticles * act, ForceTree * gasTree, Cosmology * CP, const double atime, const double AvgGasMass);
-
-void set_metal_return_params(ParameterSet * ps);
-
-/* Initialise the metal private structure, finding mass return.*/
-int64_t metal_return_init(const ActiveParticles * act, Cosmology * CP, struct MetalReturnPriv * priv, const double atime);
-/* Free memory allocated in metal_return_init*/
-void metal_return_priv_free(struct MetalReturnPriv * priv);
-
-/* Find stellar density, returning the total SPH Volume weights for each particle.*/
-void stellar_density(const ActiveParticles * act, MyFloat * StarVolumeSPH, MyFloat * MassReturn, const ForceTree * const tree);
-
-/* Determines whether metal return runs for this star this timestep*/
-int metals_haswork(int i, MyFloat * MassReturn);
-#endif
diff --git a/libgadget/run.c b/libgadget/run.c
index f001a265..5bf2e0a0 100644
--- a/libgadget/run.c
+++ b/libgadget/run.c
@@ -25,7 +25,6 @@
 #include "blackhole.h"
 #include "hydra.h"
 #include "sfr_eff.h"
-#include "metal_return.h"
 #include "slotsmanager.h"
 #include "hci.h"
 #include "fof.h"
@@ -599,12 +598,6 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
             if(!gasTree.tree_allocated_flag)
                 force_tree_rebuild_mask(&gasTree, ddecomp, GASMASK | BHMASK, All.OutputDir);
 
-            /* Do this before sfr and bh so the gas hsml always contains DesNumNgb neighbours.*/
-            if(All.MetalReturnOn) {
-                double AvgGasMass = All.CP.OmegaBaryon * 3 * All.CP.Hubble * All.CP.Hubble / (8 * M_PI * All.CP.GravInternal) * pow(PartManager->BoxSize, 3) / header->NTotalInit[0];
-                metal_return(&Act, &gasTree, &All.CP, atime, AvgGasMass);
-            }
-
             /* this will find new black hole seed halos.
              * Note: the FOF code does not know about garbage particles,
              * so ensure we do not have garbage present when we call this.
diff --git a/libgadget/tests/test_metal_return.c b/libgadget/tests/test_metal_return.c
deleted file mode 100644
index 055dfc2e..00000000
--- a/libgadget/tests/test_metal_return.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*Tests for the drift factor module.*/
-#include <stdarg.h>
-#include <stddef.h>
-#include <setjmp.h>
-#include <cmocka.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <gsl/gsl_integration.h>
-#include <gsl/gsl_interp2d.h>
-#include <stdint.h>
-
-#include "stub.h"
-#include "libgadget/utils/endrun.h"
-#include "libgadget/metal_return.h"
-#include "libgadget/slotsmanager.h"
-#include "libgadget/metal_tables.h"
-
-void test_yields(void ** state)
-{
-    gsl_integration_workspace * gsl_work = gsl_integration_workspace_alloc(GSL_WORKSPACE);
-    set_metal_params(1.3e-3);
-
-    struct interps interp;
-    setup_metal_table_interp(&interp);
-    /* Compute factor to normalise the total mass in the IMF to unity.*/
-    double imf_norm = compute_imf_norm(gsl_work);
-    assert_true(fabs(imf_norm - 0.624632) <  0.01);
-
-    double agbyield = compute_agb_yield(interp.agb_mass_interp, agb_total_mass, 0.01, 1, 40, gsl_work);
-    double agbyield2 = compute_agb_yield(interp.agb_mass_interp, agb_total_mass, 0.01, 1, SNAGBSWITCH, gsl_work);
-    assert_true(fabs(agbyield / agbyield2 - 1) < 1e-3);
-    /* Lifetime is about 200 Myr*/
-    double agbyield3 = compute_agb_yield(interp.agb_mass_interp, agb_total_mass, 0.01, 5, 40, gsl_work);
-
-    /* Integrate the region of the IMF which contains SNII and AGB stars. The yields should never be larger than this*/
-    gsl_function ff = {chabrier_mass, NULL};
-    double agbmax, sniimax, abserr;
-    gsl_integration_qag(&ff, agb_total_mass[0], SNAGBSWITCH, 1e-4, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &agbmax, &abserr);
-    gsl_integration_qag(&ff, SNAGBSWITCH, snii_masses[SNII_NMASS-1], 1e-4, 1e-3, GSL_WORKSPACE, GSL_INTEG_GAUSS61, gsl_work, &sniimax, &abserr);
-
-    double sniiyield = compute_snii_yield(interp.snii_mass_interp, snii_total_mass, 0.01, 1, 40, gsl_work);
-
-    double sn1a = sn1a_number(0, 1500, 0.679)*sn1a_total_metals;
-    assert_true(sn1a < 1.3e-3);
-
-    message(0, "agbyield %g max %g (in 200 Myr: %g)\n", agbyield, agbmax, agbyield3);
-    message(0, "sniiyield %g max %g sn1a %g\n", sniiyield, sniimax, sn1a);
-    message(0, "Total fraction of mass returned %g\n", (sniiyield + sn1a + agbyield)/imf_norm);
-    assert_true(agbyield < agbmax);
-    assert_true(sniiyield < sniimax);
-    assert_true((sniiyield + sn1a + agbyield)/imf_norm < 1.);
-
-    double masslow1, masshigh1;
-    double masslow2, masshigh2;
-    double masslowsum, masshighsum;
-    find_mass_bin_limits(&masslow1, &masshigh1, 0, 30, 0.02, interp.lifetime_interp);
-    find_mass_bin_limits(&masslow2, &masshigh2, 30, 60, 0.02, interp.lifetime_interp);
-    find_mass_bin_limits(&masslowsum, &masshighsum, 0, 60, 0.02, interp.lifetime_interp);
-    message(0, "0 - 30: %g %g 30 - 60 %g %g 0 - 60 %g %g\n", masslow1, masshigh1, masslow2, masshigh2, masslowsum, masshighsum);
-    assert_true(fabs(masslow1 - masshigh2) < 0.01);
-    assert_true(fabs(masslowsum - masslow2) < 0.01);
-}
-
-int main(void) {
-    const struct CMUnitTest tests[] = {
-        cmocka_unit_test(test_yields),
-    };
-    return cmocka_run_group_tests_mpi(tests, NULL, NULL);
-}

From fdff6df08b5b408ff71bd8d8e8dffcb4154b3aec Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Wed, 2 Oct 2024 14:37:15 -0700
Subject: [PATCH 031/120] cooling rates: switch interpolation from GSL to
 Boost; undef and redefine the P macro (PartManager->Base) around the Boost
 include due to macro conflicts, may rename P in future

---
 libgadget/cooling_rates.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/libgadget/cooling_rates.c b/libgadget/cooling_rates.c
index 9e41bbe0..b9399a04 100644
--- a/libgadget/cooling_rates.c
+++ b/libgadget/cooling_rates.c
@@ -59,7 +59,15 @@
 #include <mpi.h>
 #include <stdio.h>
 #include <string.h>
-#include <gsl/gsl_interp.h>
+// Undefine P before including Boost
+#ifdef P
+#undef P
+#endif
+
+#include <boost/math/interpolators/barycentric_rational.hpp>
+
+// Optionally, redefine P afterward if you still need it
+#define P PartManager->Base
 #include "physconst.h"
 #include "utils/endrun.h"
 #include "utils/paramset.h"
@@ -67,7 +75,7 @@
 
 static struct cooling_params CoolingParams;
 
-static gsl_interp * GrayOpac;
+boost::math::interpolators::barycentric_rational<double>* GrayOpac;
 
 /*Tables for the self-shielding correction. Note these are not well-measured for z > 5!*/
 #define NGRAY 6
@@ -75,11 +83,11 @@ static gsl_interp * GrayOpac;
 static const double GrayOpac_ydata[NGRAY] = { 2.59e-18, 2.37e-18, 2.27e-18, 2.15e-18, 2.02e-18, 1.94e-18};
 static const double GrayOpac_zz[NGRAY] = {0, 1, 2, 3, 4, 5};
 
-/*Convenience structure bundling together the gsl interpolation routines.*/
+/*Convenience structure bundling together the interpolation routines.*/
 struct itp_type
 {
     double * ydata;
-    gsl_interp * intp;
+    boost::math::interpolators::barycentric_rational<double>* intp;
 };
 /*Interpolation objects for the redshift evolution of the UVB.*/
 /*Number of entries in the table*/
@@ -119,8 +127,7 @@ static double * cool_freefree1;
 static void
 init_itp_type(double * xarr, struct itp_type * Gamma, int Nelem)
 {
-    Gamma->intp = gsl_interp_alloc(gsl_interp_linear,Nelem);
-    gsl_interp_init(Gamma->intp, xarr, Gamma->ydata, Nelem);
+    Gamma->intp = new boost::math::interpolators::barycentric_rational<double>(xarr, Gamma->ydata, Nelem);
 }
 
 /* Helper function to correctly load a value in the TREECOOL file*/
@@ -325,7 +332,7 @@ get_photo_rate(double redshift, struct itp_type * Gamma_tab)
     else if (log1z < Gamma_log1z[0])
         photo_rate = Gamma_tab->ydata[0];
     else {
-        photo_rate = gsl_interp_eval(Gamma_tab->intp, Gamma_log1z, Gamma_tab->ydata, log1z, NULL);
+        photo_rate = (*Gamma_tab->intp)(log1z);
     }
     return pow(10, photo_rate) * CoolingParams.PhotoIonizeFactor;
 }
@@ -355,7 +362,7 @@ get_self_shield_dens(double redshift, const struct UVBG * uvbg)
     else if (redshift >= GrayOpac_zz[NGRAY-1])
         greyopac = GrayOpac_ydata[NGRAY-1];
     else {
-        greyopac = gsl_interp_eval(GrayOpac, GrayOpac_zz, GrayOpac_ydata,redshift, NULL);
+        greyopac = (*GrayOpac)(redshift);
     }
     return 6.73e-3 * pow(greyopac / 2.49e-18, -2./3)*pow(G12, 2./3)*pow(CoolingParams.fBar/0.17,-1./3);
 }
@@ -408,7 +415,7 @@ get_photorate_coeff(double alpha, struct itp_type * Gamma_tab)
     else if (alpha < Gamma_alpha[0])
         photo_rate = Gamma_tab->ydata[0];
     else {
-        photo_rate = gsl_interp_eval(Gamma_tab->intp, Gamma_alpha, Gamma_tab->ydata, alpha, NULL);
+        photo_rate = (*Gamma_tab->intp)(alpha);
     }
     //pow 10 here because the treecool load does log10
     return pow(10,photo_rate) * CoolingParams.PhotoIonizeFactor;
@@ -1107,10 +1114,8 @@ init_cooling_rates(const char * TreeCoolFile, const char * J21CoeffFile, const c
     CoolingParams.fBar = CP->OmegaBaryon / CP->OmegaCDM;
     CoolingParams.rho_crit_baryon = CP->OmegaBaryon * 3.0 * pow(CP->HubbleParam*HUBBLE,2.0) /(8.0*M_PI*GRAVITY);
 
-    /* Initialize the interpolation for the self-shielding module as a function of redshift.
-     * A crash has been observed in GSL with a cspline interpolator. */
-    GrayOpac = gsl_interp_alloc(gsl_interp_linear,NGRAY);
-    gsl_interp_init(GrayOpac,GrayOpac_zz,GrayOpac_ydata, NGRAY);
+    /* Initialize the interpolation for the self-shielding module as a function of redshift.*/
+    GrayOpac = new boost::math::interpolators::barycentric_rational<double>(GrayOpac_zz,GrayOpac_ydata, NGRAY);
 
     if(!TreeCoolFile || strnlen(TreeCoolFile,100) == 0) {
         CoolingParams.PhotoIonizationOn = 0;

From a40f49092eb3d45fc7ce1f0def40afb522927f77 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Wed, 2 Oct 2024 15:43:55 -0700
Subject: [PATCH 032/120] neutrino interp

---
 libgadget/neutrinos_lra.c | 88 ++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 57 deletions(-)

diff --git a/libgadget/neutrinos_lra.c b/libgadget/neutrinos_lra.c
index 50e60f1c..6eaaff0a 100644
--- a/libgadget/neutrinos_lra.c
+++ b/libgadget/neutrinos_lra.c
@@ -10,7 +10,7 @@
 #include <string.h>
 #include <bigfile-mpi.h>
 #include <gsl/gsl_errno.h>
-#include <gsl/gsl_interp.h>
+#include <boost/math/interpolators/barycentric_rational.hpp>
 #include <gsl/gsl_sf_bessel.h>
 
 #include "neutrinos_lra.h"
@@ -26,8 +26,6 @@
 
 /** Floating point accuracy*/
 #define FLOAT_ACC   1e-6
-/** Number of bins in integrations*/
-#define GSL_VAL 400
 
 /** Update the last value of delta_tot in the table with a new value computed
  from the given delta_cdm_curr and delta_nu_curr.
@@ -100,19 +98,16 @@ static void delta_tot_first_init(_delta_tot_table * const d_tot, const int nk_in
     d_tot->nk=nk_in;
     const double OmegaNua3=get_omega_nu_nopart(d_tot->omnu, d_tot->TimeTransfer)*pow(d_tot->TimeTransfer,3);
     const double OmegaNu1 = get_omega_nu(d_tot->omnu, 1);
-    gsl_interp_accel *acc = gsl_interp_accel_alloc();
-    gsl_interp * spline;
-    if(t_init->NPowerTable > 2)
-        spline = gsl_interp_alloc(gsl_interp_cspline,t_init->NPowerTable);
-    else
-        spline = gsl_interp_alloc(gsl_interp_linear,t_init->NPowerTable);
-    gsl_interp_init(spline,t_init->logk,t_init->T_nu,t_init->NPowerTable);
+    /* Automatic storage: the interpolator is destroyed on return, so no explicit free/delete is needed. */
+    boost::math::interpolators::barycentric_rational<double> spline_obj(t_init->logk,t_init->T_nu,t_init->NPowerTable);
+    boost::math::interpolators::barycentric_rational<double>* spline = &spline_obj; /* pointer kept so the (*spline)(x) call below is unchanged */
+
     /*Check we have a long enough power table: power tables are in log_10*/
     if(log10(wavenum[d_tot->nk-1]) > t_init->logk[t_init->NPowerTable-1])
         endrun(2,"Want k = %g but maximum in CLASS table is %g\n",wavenum[d_tot->nk-1], pow(10, t_init->logk[t_init->NPowerTable-1]));
     for(ik=0;ik<d_tot->nk;ik++) {
             /* T_nu contains T_nu / T_cdm.*/
-            double T_nubyT_nonu = gsl_interp_eval(spline,t_init->logk,t_init->T_nu,log10(wavenum[ik]),acc);
+            double T_nubyT_nonu = (*spline)(log10(wavenum[ik]));
             /*Initialise delta_nu_init to use the first timestep's delta_cdm_curr
              * so that it includes potential Rayleigh scattering. */
             d_tot->delta_nu_init[ik] = delta_cdm_curr[ik]*T_nubyT_nonu;
@@ -122,8 +117,6 @@ static void delta_tot_first_init(_delta_tot_table * const d_tot, const int nk_in
             /*Set up the wavenumber array*/
             d_tot->wavenum[ik] = wavenum[ik];
     }
-    gsl_interp_accel_free(acc);
-    gsl_interp_free(spline);
 
     /*If we are not restarting, make sure we set the scale factor*/
     d_tot->scalefact[0]=log(TimeIC);
@@ -155,19 +148,20 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
         double * logPower = (double *) mymalloc("logpk", PowerSpectrum->nonzero * sizeof(double));
         for(i = 0; i < PowerSpectrum->nonzero; i++)
             logPower[i] = log(PowerSpectrum->Power[i]);
-        gsl_interp * pkint = gsl_interp_alloc(gsl_interp_linear, PowerSpectrum->nonzero);
-        gsl_interp_init(pkint, PowerSpectrum->logknu, logPower, PowerSpectrum->nonzero);
-        gsl_interp_accel * pkacc = gsl_interp_accel_alloc();
+
+        boost::math::interpolators::barycentric_rational<double> pkint(PowerSpectrum->logknu, logPower, PowerSpectrum->nonzero);
+        double xmin = PowerSpectrum->logknu[0];
+        double xmax = PowerSpectrum->logknu[PowerSpectrum->nonzero-1];
+
         for(i = 0; i < delta_tot_table.nk; i++) {
             double logk = log(delta_tot_table.wavenum[i]);
-            if(pkint->xmax < logk || pkint->xmin > logk)
+            if(xmax < logk || xmin > logk)
                 Power_in[i] = delta_tot_table.delta_tot[i][delta_tot_table.ia-1];
             else
-                Power_in[i] = exp(gsl_interp_eval(pkint, PowerSpectrum->logknu, logPower, logk, pkacc));
+                Power_in[i] = exp(pkint(logk));
+
         }
         myfree(logPower);
-        gsl_interp_accel_free(pkacc);
-        gsl_interp_free(pkint);
     }
 
     const double partnu = particle_nu_fraction(&CP->ONu.hybnu, Time, 0);
@@ -202,8 +196,7 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
     }
     double * delta_nu_ratio = (double *) mymalloc2("dnu_rat", delta_tot_table.nk * sizeof(double));
     double * logwavenum = (double *) mymalloc2("logwavenum", delta_tot_table.nk * sizeof(double));
-    gsl_interp * pkint = gsl_interp_alloc(gsl_interp_linear, delta_tot_table.nk);
-    gsl_interp_accel * pkacc = gsl_interp_accel_alloc();
+
     /*We want to interpolate in log space*/
     for(i=0; i < delta_tot_table.nk; i++) {
         if(isnan(delta_tot_table.delta_nu_last[i]))
@@ -216,7 +209,10 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
     }
     if(delta_tot_table.nk != PowerSpectrum->nonzero)
         myfree(Power_in);
-    gsl_interp_init(pkint, logwavenum, delta_nu_ratio, delta_tot_table.nk);
+    boost::math::interpolators::barycentric_rational<double> pkint(logwavenum, delta_nu_ratio, delta_tot_table.nk);
+
+    double xmin = logwavenum[0];
+    double xmax = logwavenum[delta_tot_table.nk-1];
 
     /*We want to interpolate in log space*/
     for(i=0; i < PowerSpectrum->nonzero; i++) {
@@ -224,14 +220,12 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
             PowerSpectrum->delta_nu_ratio[i] = delta_nu_ratio[i];
         else {
             double logk = PowerSpectrum->logknu[i];
-            if(logk > pkint->xmax)
-                logk = pkint->xmax;
-            PowerSpectrum->delta_nu_ratio[i] = gsl_interp_eval(pkint, logwavenum, delta_nu_ratio, logk, pkacc);
+            if(logk > xmax)
+                logk = xmax;
+            PowerSpectrum->delta_nu_ratio[i] = pkint(logk);
         }
     }
 
-    gsl_interp_accel_free(pkacc);
-    gsl_interp_free(pkint);
     myfree(logwavenum);
     myfree(delta_nu_ratio);
 }
@@ -257,9 +251,6 @@ void powerspectrum_nu_save(struct _powerspectrum * PowerSpectrum, const char * O
     }
     fclose(fp);
     myfree(fname);
-    /*Clean up the neutrino memory now we saved the power spectrum.*/
-    gsl_interp_free(PowerSpectrum->nu_spline);
-    gsl_interp_accel_free(PowerSpectrum->nu_acc);
 }
 
 void petaio_save_neutrinos(BigFile * bf, int ThisTask)
@@ -630,12 +621,10 @@ struct _delta_nu_int_params
     double k;
     /**Neutrino mass divided by k_B T_nu*/
     double mnubykT;
-    gsl_interp_accel *acc;
-    gsl_interp *spline;
+    boost::math::interpolators::barycentric_rational<double>* spline;
     Cosmology * CP;
     /**Precomputed free-streaming lengths*/
-    gsl_interp_accel *fs_acc;
-    gsl_interp *fs_spline;
+    boost::math::interpolators::barycentric_rational<double>* fs_spline;
     double * fslengths;
     double * fsscales;
     /**Make sure this is at the same k as above*/
@@ -650,12 +639,12 @@ struct _delta_nu_int_params
 };
 typedef struct _delta_nu_int_params delta_nu_int_params;
 
-/**GSL integration kernel for get_delta_nu*/
+/**integration kernel for get_delta_nu*/
 double get_delta_nu_int(double logai, void * params)
 {
     delta_nu_int_params * p = (delta_nu_int_params *) params;
-    double fsl_aia = gsl_interp_eval(p->fs_spline,p->fsscales,p->fslengths,logai,p->fs_acc);
-    double delta_tot_at_a = gsl_interp_eval(p->spline,p->scale,p->delta_tot,logai,p->acc);
+    double fsl_aia = (*p->fs_spline)(logai);
+    double delta_tot_at_a = (*p->spline)(logai);
     double specJ = specialJ(p->k*fsl_aia/p->mnubykT, p->qc, p->nufrac_low);
     double ai = exp(logai);
     return fsl_aia/(ai*hubble_function(p->CP, ai)) * specJ * delta_tot_at_a;
@@ -711,15 +700,7 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
   /*If neutrino mass is zero, we are not accurate, just use the initial conditions piece*/
   if(Na > 1 && mnubykT > 0){
         delta_nu_int_params params;
-        params.acc = gsl_interp_accel_alloc();
-        /*Use cubic interpolation*/
-        if(Na > 2) {
-                params.spline=gsl_interp_alloc(gsl_interp_cspline,Na);
-        }
-        /*Unless we have only two points*/
-        else {
-                params.spline=gsl_interp_alloc(gsl_interp_linear,Na);
-        }
+
         params.scale=d_tot->scalefact;
         params.mnubykT=mnubykT;
         params.qc = qc;
@@ -729,8 +710,7 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
          * which is exactly where it doesn't matter, but
          * we still want to be safe. */
         int Nfs = Na*16;
-        params.fs_acc = gsl_interp_accel_alloc();
-        params.fs_spline=gsl_interp_alloc(gsl_interp_cspline,Nfs);
+
         params.CP = CP;
         /*Pre-compute the free-streaming lengths, which are scale-independent*/
         double * fslengths = (double *) mymalloc("fslengths", Nfs* sizeof(double));
@@ -742,17 +722,13 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
         params.fslengths = fslengths;
         params.fsscales = fsscales;
 
-        if (!params.spline || !params.acc || !params.fs_spline || !params.fs_acc || !fslengths || !fsscales) {
-            endrun(2016, "Error initializing and allocating memory for interpolators.\n");
-        }
+        boost::math::interpolators::barycentric_rational<double> fs_interp(params.fsscales,params.fslengths,Nfs); params.fs_spline = &fs_interp; /* automatic storage, so there is nothing to delete */
 
-        gsl_interp_init(params.fs_spline,params.fsscales,params.fslengths,Nfs);
         for (ik = 0; ik < d_tot->nk; ik++) {
             double abserr,d_nu_tmp;
             params.k=d_tot->wavenum[ik];
             params.delta_tot=d_tot->delta_tot[ik];
-            gsl_interp_init(params.spline,params.scale,params.delta_tot,Na);
-
+            boost::math::interpolators::barycentric_rational<double> dtot_interp(params.scale,params.delta_tot,Na); params.spline = &dtot_interp; /* rebuilt for each k; automatic storage avoids leaking one interpolator per iteration */
             // Define the integrand as a lambda function wrapping get_delta_nu_int
             auto integrand = [&params](double logai) {
                 return get_delta_nu_int(logai, (void *)&params);
@@ -760,8 +736,6 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
             d_nu_tmp = tanh_sinh_integrate_adaptive(integrand, log(d_tot->TimeTransfer), log(a), &abserr, relerr);
             delta_nu_curr[ik] += d_tot->delta_nu_prefac * d_nu_tmp;
          }
-         gsl_interp_free(params.spline);
-         gsl_interp_accel_free(params.acc);
          myfree(fsscales);
          myfree(fslengths);
    }

From 339d6a18f349f6268223b842dc35f3424fbc0f27 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 10:52:43 -0400
Subject: [PATCH 033/120] cleanup

---
 libgadget/petapm.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 90235718..35853ca9 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -141,10 +141,8 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     // std::vector<float> ref = data;
 
 
-
-/********************************not sure if these are useful or not**************************************** */
     ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
-    ptrdiff_t np[2];
+    ptrdiff_t np[2]; // 2D arrangement of ranks
 
     int ThisTask;
     int NTask;
@@ -184,11 +182,8 @@ if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
     endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
 }
 
-// Step 3: Determine local FFT size (adapt this for cuFFTMp if necessary)
-// cuFFTMp might require manual management of the local data size
-// Example: You may need to calculate how much data each process holds based on grid decomposition
-
-pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d, 
+//local_fft_size_cufftmp
+pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d, 
                                                 pm->real_space_region.size, 
                                                 pm->real_space_region.offset, 
                                                 pm->fourier_space_region.size, 

From 7abd5ce37714cd7328b58724fdd0659c20e8c8b8 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 10:58:17 -0400
Subject: [PATCH 034/120] modified task 2d decomp in petapm

---
 libgadget/petapm.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 35853ca9..671123b9 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -142,6 +142,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
 
     ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
+    //CUDA NOTE: keep np[2] to be two numbers for now, but np[0] = np[1]
     ptrdiff_t np[2]; // 2D arrangement of ranks
 
     int ThisTask;
@@ -154,13 +155,15 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     MPI_Comm_size(comm, &NTask);
 
     /* try to find a square 2d decomposition */
+    /* CUDA NOTE: CufftMp only supports square decomposition, 
+    so Ntask has to be a perfect square*/
     int i;
     int k;
-    for(i = sqrt(NTask) + 1; i >= 0; i --) {
-        if(NTask % i == 0) break;
+    np[0] = sqrt(NTask);
+    np[1] = Ntask / np[0];
+    if (np[0] * np[1] != NTask) {
+        endrun(0, "Error: The number of MPI ranks has to be a perfect square for CufftMp\n");
     }
-    np[0] = i;
-    np[1] = NTask / i;
 
 message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
 

From ac7c211b4c235192c2ee72d313acb4d9440652eb Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 13:55:28 -0400
Subject: [PATCH 035/120] rm previous default slab code->use pencil

---
 libgadget/petapm.c | 44 ++++++++------------------------------------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 671123b9..40b5a6e7 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -115,32 +115,6 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     MPI_Comm_rank(comm, &ThisTask);
     MPI_Comm_size(comm, &NTask);
 
-
-    int ndevices;
-    cudaGetDeviceCount(&ndevices);
-    cudaSetDevice(ThisTask % ndevices);
-    printf("Hello from rank %d/%d using GPU %d\n", ThisTask, NTask, ThisTask % ndevices);
-
-    // Logical transform size
-    size_t nx = NTask;      // any value >= NTask is OK
-    size_t ny = NTask;      // any value >= NTask is OK
-    size_t nz = 2 * NTask;  // need to be even and >= NTask
-
-    // We start with Slabs distributed along X (X-Slabs)
-    // Ranks 0 ... (nx % size - 1) own 1 more element in the X dimension
-    // All ranks own all element in the Y and Z dimension
-    // The Z dimension has to be padded to accomodate the (nz / 2 + 1) 
-    // complex numbers assuming an in-place data layout.
-    int ranks_with_onemore = nx % size;
-    size_t my_nx = (nx / size) + (rank < ranks_with_onemore ? 1 : 0);
-    size_t padded_nz = 2 * (nz / 2 + 1);
-
-    // // Local, distributed, data
-    // std::vector<float> data(my_nx * ny * padded_nz, 1.0);
-    // generate_random(data, rank);
-    // std::vector<float> ref = data;
-
-
     ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
     //CUDA NOTE: keep np[2] to be two numbers for now, but np[0] = np[1]
     ptrdiff_t np[2]; // 2D arrangement of ranks
@@ -157,24 +131,22 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     /* try to find a square 2d decomposition */
     /* CUDA NOTE: CufftMp only supports square decomposition, 
     so Ntask has to be a perfect square*/
-    int i;
-    int k;
     np[0] = sqrt(NTask);
     np[1] = Ntask / np[0];
     if (np[0] * np[1] != NTask) {
         endrun(0, "Error: The number of MPI ranks has to be a perfect square for CufftMp\n");
     }
 
-message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
+    message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
 
-// Step 1: Create 2D Cartesian grid for the processes
-int dims[2] = {np[0], np[1]};
-int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
+    // Step 1: Create 2D Cartesian grid for the processes
+    int dims[2] = {np[0], np[1]};
+    int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
 
-// Create 2D Cartesian communicator
-if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
-    endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
-}
+    // Create 2D Cartesian communicator
+    if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
+        endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
+    }
 
 // Step 2: Get the Cartesian coordinates of the process in the grid
 int periods_unused[2];

From 3687d0e840dadddd503764f533c1cde88effbabc Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 14:05:02 -0400
Subject: [PATCH 036/120] clean up some reion stuff in petapm.h

---
 libgadget/petapm.h | 28 +---------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 3db41533..b3eb580b 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -86,19 +86,7 @@ typedef struct {
     int (*active) (int i);
     int64_t NumPart;
 } PetaPMParticleStruct;
-
-/* extra particle info used in reionisation*/
-typedef struct {
-    size_t offset_type; //offset in particle data to type
-    size_t offset_pi; //offset in particle data to property index
-    void * Sphslot; //pointer to SPH slot
-    size_t sph_elsize; //element size of SPH slot
-    size_t offset_sfr; //offset in SPH slot to star formation rate
-    size_t offset_fesc_sph; //offset in SPH slot to escape fraction
-    void* Starslot; //pointer to fof groups
-    size_t star_elsize; //element size of fof group
-    size_t offset_fesc; //offset in fof groups to fof mass
-} PetaPMReionPartStruct;
+
 
 typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value); //NC:change to complex type
 typedef void (*petapm_readout_func)(PetaPM * pm, int i, double * mesh, double weight);
@@ -110,9 +98,6 @@ typedef struct {
     petapm_readout_func readout;
 } PetaPMFunctions;
 
-/* Reion Loop function, applied after c2r, doesn't iterate over all particles*/
-typedef void (*petapm_reion_func)(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr, double * mass_real, double * star_real, double * sfr_real, int last_step);
-
 /* this mixes up fourier space analysis; with transfer. Shall split them. */
 typedef struct {
     /* this is a fourier space readout; need a better name */
@@ -158,15 +143,4 @@ int petapm_mesh_to_k(PetaPM * pm, int i);
 int *petapm_get_thistask2d(PetaPM * pm);
 int *petapm_get_ntask2d(PetaPM * pm);
 cufftComplex * petapm_alloc_rhok(PetaPM * pm); // NC: changed returned complex type
-
-void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        petapm_prepare_func prepare,
-        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
-        PetaPMFunctions * functions,
-        PetaPMParticleStruct * pstruct,
-        PetaPMReionPartStruct * rstruct,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr,
-        void * userdata);
-
 #endif

From 28ebe2d52827677dc5dee112bd5e4ff94574d6da Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 14:42:17 -0400
Subject: [PATCH 037/120] rewrite petapm_init

---
 libgadget/petapm.c | 160 ++++++++++++++++-----------------------------
 1 file changed, 58 insertions(+), 102 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 40b5a6e7..284f0351 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -7,6 +7,7 @@
 
 #include "types.h"
 #include "petapm.h"
+#include "box_iterator.hpp"
 
 #include "utils.h"
 #include "walltime.h"
@@ -93,9 +94,9 @@ petapm_module_init(int Nthreads)
     #endif
     // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
 
-    // get rid of pencil type
-    //MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
-    //MPI_Type_commit(&MPI_PENCIL);
+    // TODO: get rid of pencil type (MPI_PENCIL is still registered below for now)
+    MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
+    MPI_Type_commit(&MPI_PENCIL);
 }
 
 void
@@ -114,19 +115,9 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     int NTask;
     MPI_Comm_rank(comm, &ThisTask);
     MPI_Comm_size(comm, &NTask);
-
-    ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
-    //CUDA NOTE: keep np[2] to be two numbers for now, but np[0] = np[1]
-    ptrdiff_t np[2]; // 2D arrangement of ranks
-
-    int ThisTask;
-    int NTask;
-
-    pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
-    pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
-
-    MPI_Comm_rank(comm, &ThisTask);
-    MPI_Comm_size(comm, &NTask);
+    int ndevices = 0; /* number of CUDA devices visible to this process */
+    if(cudaGetDeviceCount(&ndevices) != cudaSuccess || ndevices < 1) endrun(1, "Error: no usable CUDA device found for rank %d\n", ThisTask); /* also keeps the modulo below from dividing by zero */
+    cudaSetDevice(ThisTask % ndevices); /* ThisTask is the rank filled in by MPI_Comm_rank above; ranks are spread round-robin over the devices */
 
     /* try to find a square 2d decomposition */
     /* CUDA NOTE: CufftMp only supports square decomposition, 
@@ -138,63 +129,53 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     }
 
     message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
-
-    // Step 1: Create 2D Cartesian grid for the processes
-    int dims[2] = {np[0], np[1]};
-    int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
-
-    // Create 2D Cartesian communicator
-    if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
-        endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
-    }
-
-// Step 2: Get the Cartesian coordinates of the process in the grid
-int periods_unused[2];
-MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
-
-// Ensure that the task grid matches the expected number of processes
-if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
-    endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
-}
-
-//local_fft_size_cufftmp
-pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d, 
-                                                pm->real_space_region.size, 
-                                                pm->real_space_region.offset, 
-                                                pm->fourier_space_region.size, 
-                                                pm->fourier_space_region.offset);
-
-    /*
-     * In fourier space, the transposed array is ordered in
-     * are in (y, z, x). The strides and sizes returned
-     * from local size is in (Nx, Ny, Nz), hence we roll them once
-     * so that the strides will give correct linear indexing for
-     * integer coordinates given in order of (y, z, x).
-     * */
-
-#define ROLL(a, N, j) { \
-    typeof(a[0]) tmp[N]; \
-    ptrdiff_t k; \
-    for(k = 0; k < N; k ++) tmp[k] = a[k]; \
-    for(k = 0; k < N; k ++) a[k] = tmp[(k + j)% N]; \
-    }
-
-    ROLL(pm->fourier_space_region.offset, 3, 1);
-    ROLL(pm->fourier_space_region.size, 3, 1);
-
-#undef ROLL
-
-    /* calculate the strides */
-    petapm_region_init_strides(&pm->real_space_region);
-    petapm_region_init_strides(&pm->fourier_space_region);
-
-
-/******************************** end unsure block **************************************** */
-
+    // Define custom data distribution
+    int64 nx               = Nmesh;
+    int64 ny               = Nmesh;
+    int64 nz               = Nmesh;
+    int64 nz_real          = nz;
+    int64 nz_complex       = (nz/2+1);
+    int64 nz_real_padded   = 2*nz_complex;
+
+    // Describe the data distribution using boxes
+    auto make_box = [](int64 lower[3], int64 upper[3], int64 strides[3]) {
+        Box3D box;
+        for(int i = 0; i < 3; i++) {
+            box.lower[i] = lower[i];
+            box.upper[i] = upper[i];
+            box.strides[i] = strides[i];
+        }
+        return box;
+    };
+
+    auto displacement = [](int64 length, int rank, int size) {
+        int ranks_cutoff = length % size;
+        return (rank < ranks_cutoff ? rank * (length / size + 1) : ranks_cutoff * (length / size + 1) + (rank - ranks_cutoff) * (length / size));
+    };
+
+    // Input data are real pencils in X & Y, along Z
+    // Strides are packed and in-place (i.e., real is padded)
+    int64 lower[3]   = {displacement(nx, i,   nranks1d), displacement(ny, j,   nranks1d), 0};
+    int64 upper[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
+    int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
+    box_real = make_box(lower, upper, strides);
+    boxes_real.push_back(make_box(lower, upper, strides));
+
+    // Output data are complex pencils in X & Z, along Y (picked arbitrarily)
+    // Strides are packed
+    // For best performances, the local dimension in the input (Z, here) and output (Y, here) should be different
+    // to ensure cuFFTMp will only perform two communication phases.
+    // If Z was also local in the output, cuFFTMp would perform three communication phases, decreasing performances.
+    int64 lower[3]   = {displacement(nx, i,   nranks1d), 0,  displacement(nz_complex, j,   nranks1d)};
+    int64 upper[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
+    int64 strides[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
+    box_complex = make_box(lower, upper, strides);
+
+
+    //===============================================================================================
     cudaStreamCreate(&pm->priv->stream);
     cufftCreate(&pm->priv->plan_forw);
     cufftCreate(&pm->priv->plan_back);
-
     // Attach the MPI communicator to the plans
     cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
     cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
@@ -204,8 +185,8 @@ pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
     // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
     // So, in both, the "input" box should be the real box and the "output" box should be the complex box
 
-    // cufftXtSetDistribution(plan_r2c, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
-    // cufftXtSetDistribution(plan_c2r, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
+    cufftXtSetDistribution(pm->priv->plan_forw, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
+    cufftXtSetDistribution(pm->priv->plan_back, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
 
     // Set the stream
     cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
@@ -217,6 +198,11 @@ pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
     cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
 
 
+
+
+    //===============================================================================================
+
+
     // Allocate GPU memory, copy CPU data to GPU
     // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
     cudaLibXtDesc *desc;
@@ -224,36 +210,6 @@ pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
     // TODO: what to make of the cpu_data here?
     cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
 
-    /* now lets fill up the mesh2task arrays */
-
-#if 0
-    message(1, "ThisTask = %d (%td %td %td) - (%td %td %td)\n", ThisTask,
-            pm->real_space_region.offset[0],
-            pm->real_space_region.offset[1],
-            pm->real_space_region.offset[2],
-            pm->real_space_region.size[0],
-            pm->real_space_region.size[1],
-            pm->real_space_region.size[2]);
-#endif
-
-    int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
-    for(k = 0; k < 2; k ++) {
-        for(i = 0; i < Nmesh; i ++) {
-            tmp[i] = 0;
-        }
-        for(i = 0; i < pm->real_space_region.size[k]; i ++) {
-            tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
-        }
-        /* which column / row hosts this tile? */
-        /* FIXME: this is very inefficient */
-        MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
-        /*
-        for(i = 0; i < Nmesh; i ++) {
-            message(0, "Mesh2Task[%d][%d] == %d\n", k, i, Mesh2Task[k][i]);
-        }
-        */
-    }
-    myfree(tmp);
 }
 
 void

From d99e58516d2648533b27b3b9187b9526f3ffa5c0 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 15:07:19 -0400
Subject: [PATCH 038/120] rewrite r2c in petapm_force_r2c

---
 libgadget/petapm.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 284f0351..038f3aaa 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -197,19 +197,9 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
     cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
 
-
-
-
     //===============================================================================================
 
 
-    // Allocate GPU memory, copy CPU data to GPU
-    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
-    cudaLibXtDesc *desc;
-    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_INPLACE);
-    // TODO: what to make of the cpu_data here?
-    cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
-
 }
 
 void
@@ -313,27 +303,32 @@ static void pm_apply_transfer_function(PetaPM * pm,
 cufftComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ) {
-    /* call pfft rho_k is CFT of rho */
-
-    /* this is because
-     *
-     * CFT = DFT * dx **3
-     * CFT[rho] = DFT [rho * dx **3] = DFT[CIC]
-     * */
+     // CUDA TODO: figureout how to properly get fftsize
     double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
     memset(real, 0, sizeof(double) * pm->priv->fftsize);
     layout_build_and_exchange_cells_to_fft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
     walltime_measure("/PMgrav/comm2");
-
 #ifdef DEBUG
     verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
     walltime_measure("/PMgrav/Verify");
 #endif
 
     cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-    pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
+
+    // CUDA TODO: figure out if this is needed
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
+    cufftXtMalloc(pm->priv->plan_forw, &pm->priv->desc, CUFFT_XT_FORMAT_INPLACE);
+    // copy real array to gpu
+    cufftXtMemcpy(pm->priv->plan_back, (void*)pm->priv->desc, (void*)real, CUFFT_COPY_HOST_TO_DEVICE);
+    // execute the plan
+    cufftXtExecDescriptor(pm->priv->plan_forw, pm->priv->desc, pm->priv->desc, CUFFT_FORWARD);
     myfree(real);
 
+
+    //=============================== End of R2C =============================================
+
+    //========================== Begin Transfer Function =====================================
     cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
 
     /*Do any analysis that may be required before the transfer function is applied*/

From ab8d4a98d3a39367533783c0fd97beb1dd38c1b6 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 15:19:40 -0400
Subject: [PATCH 039/120] modified c2r, almost done with petapm changes

---
 libgadget/petapm.c | 8 ++++----
 libgadget/petapm.h | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 038f3aaa..78c2c29b 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -327,7 +327,6 @@ cufftComplex * petapm_force_r2c(PetaPM * pm,
 
 
     //=============================== End of R2C =============================================
-
     //========================== Begin Transfer Function =====================================
     cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
 
@@ -363,9 +362,10 @@ petapm_force_c2r(PetaPM * pm,
         /* apply the greens function turn rho_k into potential in fourier space */
         pm_apply_transfer_function(pm, rho_k, complx, transfer);
         walltime_measure("/PMgrav/calc");
-
-        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-        pfft_execute_dft_c2r(pm->priv->plan_back, complx, real);
+        // double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+        /* CUDA TODO: BUT WHERE DO I INPUT THE ACTUAL ARRAY? */
+        cufftXtExecDescriptor(pm->priv->plan_back, pm->priv->desc, pm->priv->desc, CUFFT_INVERSE);
+        double * real = (double * ) pm->priv->desc->descriptor->data[0];
 
         walltime_measure("/PMgrav/c2r");
         if(f == functions) // Once
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index b3eb580b..3b6c68f8 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -4,11 +4,14 @@
 
 #include "powerspectrum.h"
 
+
 typedef struct Region {
     /* represents a region in the FFT Mesh */
     ptrdiff_t offset[3];
     ptrdiff_t size[3];
     ptrdiff_t strides[3];
+
+
     size_t totalsize;
     double * buffer;
     /* below are used mostly for investigation */
@@ -58,6 +61,7 @@ typedef struct PetaPMPriv {
     double * meshbuf;
     size_t meshbufsize;
     struct Layout layout;
+    cudaLibXtDesc *desc;
 } PetaPMPriv;
 
 typedef struct PetaPM {

From 19321facec16f6f22b439a900eb7fd9508f7e78a Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 18:03:46 -0400
Subject: [PATCH 040/120] added box structure to petapm

---
 libgadget/petapm.c | 5 +----
 libgadget/petapm.h | 7 +++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 78c2c29b..aa0587a9 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -7,7 +7,6 @@
 
 #include "types.h"
 #include "petapm.h"
-#include "box_iterator.hpp"
 
 #include "utils.h"
 #include "walltime.h"
@@ -155,6 +154,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
     // Input data are real pencils in X & Y, along Z
     // Strides are packed and in-place (i.e., real is padded)
+    
     int64 lower[3]   = {displacement(nx, i,   nranks1d), displacement(ny, j,   nranks1d), 0};
     int64 upper[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
     int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
@@ -196,10 +196,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     size_t workspace;
     cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
     cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
-
     //===============================================================================================
-
-
 }
 
 void
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 3b6c68f8..0f704dd0 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -4,6 +4,13 @@
 
 #include "powerspectrum.h"
 
+using int64 = long long int;
+
+struct Box3D {
+    int64 lower[3];
+    int64 upper[3];
+    int64 strides[3];
+};
 
 typedef struct Region {
     /* represents a region in the FFT Mesh */

From 41ea4ece9fd5f701b3b1783576aeb5d48f9a2db5 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 17:32:13 -0500
Subject: [PATCH 041/120] fixed compiler errors in petapm

---
 libgadget/petapm.c | 38 ++++++++++++++++++++------------------
 libgadget/petapm.h |  1 -
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index aa0587a9..81c33990 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -57,7 +57,6 @@ petapm_alloc_rhok(PetaPM * pm)
 static void pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions);
 
 static PetaPMParticleStruct * CPS; /* stored by petapm_force, how to access the P array */
-static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access other properties in P, SphP, and Fof */
 #define POS(i) ((double*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_pos]))
 #define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
 #define INACTIVE(i) (CPS->active && !CPS->active(i))
@@ -88,12 +87,11 @@ petapm_module_init(int Nthreads)
     cudaSetDevice(device_id);  // Set the active GPU device
 
     // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
-    #ifdef _OPENMP
-    omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
-    #endif
+    // #ifdef _OPENMP
+    // omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
+    // #endif
     // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
 
-    get rid of pencil type
     MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
     MPI_Type_commit(&MPI_PENCIL);
 }
@@ -116,18 +114,18 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     MPI_Comm_size(comm, &NTask);
     int ndevices;
     cudaGetDeviceCount(&ndevices);
-    cudaSetDevice(rank % ndevices);
+    cudaSetDevice(ThisTask % ndevices);
 
     /* try to find a square 2d decomposition */
     /* CUDA NOTE: CufftMp only supports square decomposition, 
     so Ntask has to be a perfect square*/
-    np[0] = sqrt(NTask);
-    np[1] = Ntask / np[0];
-    if (np[0] * np[1] != NTask) {
+    int nranks1d;
+    nranks1d = sqrt(NTask);
+    if (nranks1d != NTask/nranks1d) {
         endrun(0, "Error: The number of MPI ranks has to be a perfect square for CufftMp\n");
     }
 
-    message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
+    message(0, "Using 2D Task mesh %td x %td \n", nranks1d, nranks1d);
     // Define custom data distribution
     int64 nx               = Nmesh;
     int64 ny               = Nmesh;
@@ -154,22 +152,26 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
     // Input data are real pencils in X & Y, along Z
     // Strides are packed and in-place (i.e., real is padded)
-    
+    Box3D box_real;
+    Box3D box_complex;
+    int i,j;
+    i = ThisTask / nranks1d;
+    j = ThisTask % nranks1d;
+
     int64 lower[3]   = {displacement(nx, i,   nranks1d), displacement(ny, j,   nranks1d), 0};
     int64 upper[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
     int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
     box_real = make_box(lower, upper, strides);
-    boxes_real.push_back(make_box(lower, upper, strides));
 
     // Output data are complex pencils in X & Z, along Y (picked arbitrarily)
     // Strides are packed
     // For best performances, the local dimension in the input (Z, here) and output (Y, here) should be different
     // to ensure cuFFTMp will only perform two communication phases.
     // If Z was also local in the output, cuFFTMp would perform three communication phases, decreasing performances.
-    int64 lower[3]   = {displacement(nx, i,   nranks1d), 0,  displacement(nz_complex, j,   nranks1d)};
-    int64 upper[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
-    int64 strides[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
-    box_complex = make_box(lower, upper, strides);
+    int64 lower_c[3]   = {displacement(nx, i,   nranks1d), 0,  displacement(nz_complex, j,   nranks1d)};
+    int64 upper_c[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
+    int64 strides_c[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
+    box_complex = make_box(lower_c, upper_c, strides_c);
 
 
     //===============================================================================================
@@ -288,8 +290,8 @@ static void pm_apply_transfer_function(PetaPM * pm,
         pos[0] = kpos[2];
         pos[1] = kpos[0];
         pos[2] = kpos[1];
-        dst[ip][0] = src[ip][0];
-        dst[ip][1] = src[ip][1];
+        dst[ip].x = src[ip].x;
+        dst[ip].y = src[ip].y;
         if(H) {
             H(pm, k2, pos, &dst[ip]);
         }
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 0f704dd0..c6e14469 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -97,7 +97,6 @@ typedef struct {
     int (*active) (int i);
     int64_t NumPart;
 } PetaPMParticleStruct;
-zq
 
 typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value); //NC:change to complex type
 typedef void (*petapm_readout_func)(PetaPM * pm, int i, double * mesh, double weight);

From b6f6b79158660703e7b32cdfcb32d06de4b17d2e Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 18:36:54 -0400
Subject: [PATCH 042/120] pfft->cufft type change in zeldovich

---
 libgenic/zeldovich.c | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/libgenic/zeldovich.c b/libgenic/zeldovich.c
index ffc2bfee..6c606f6f 100644
--- a/libgenic/zeldovich.c
+++ b/libgenic/zeldovich.c
@@ -4,7 +4,6 @@
 #include <string.h>
 #include <math.h>
 /* do NOT use complex.h it breaks the code */
-#include <pfft.h>
 #include "allvars.h"
 #include "proto.h"
 #include "power.h"
@@ -16,13 +15,13 @@
 #include <libgadget/utils.h>
 
 #define MESH2K(i) petapm_mesh_to_k(i)
-static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_density(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_vel_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_vel_y(PetaPM * pm, int i, double * mesh, double weight);
@@ -30,7 +29,7 @@ static void readout_vel_z(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_y(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_z(PetaPM * pm, int i, double * mesh, double weight);
-static void gaussian_fill(int Nmesh, PetaPMRegion * region, pfft_complex * rho_k, int UnitaryAmplitude, int InvertPhase, const int Seed);
+static void gaussian_fill(int Nmesh, PetaPMRegion * region, cufftComplex * rho_k, int UnitaryAmplitude, int InvertPhase, const int Seed);
 
 static inline double periodic_wrap(double x, const double BoxSize)
 {
@@ -218,7 +217,7 @@ void displacement_fields(PetaPM * pm, enum TransferType Type, struct ic_part_dat
            &icprep);
 
     /*This allocates the memory*/
-    pfft_complex * rho_k = petapm_alloc_rhok(pm);
+    cufftComplex * rho_k = petapm_alloc_rhok(pm);
 
     gaussian_fill(pm->Nmesh, petapm_get_fourier_region(pm),
 		  rho_k, GenicConfig.UnitaryAmplitude, GenicConfig.InvertPhase, GenicConfig.Seed);
@@ -274,7 +273,7 @@ void displacement_fields(PetaPM * pm, enum TransferType Type, struct ic_part_dat
  *
  *********************/
 
-static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     if(k2) {
         /* density is smoothed in k space by a gaussian kernel of 1 mesh grid */
         double r2 = 1.0 / pm->Nmesh;
@@ -289,7 +288,7 @@ static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex
     }
 }
 
-static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, pfft_complex * value, int include_growth) {
+static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, cufftComplex * value, int include_growth) {
     if(k2) {
         double fac = 1./ (2 * M_PI) / sqrt(pm->BoxSize) * kaxis / k2;
         /*
@@ -313,23 +312,23 @@ static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, pfft_complex * val
     }
 }
 
-static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[0], value, 1);
 }
-static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[1], value, 1);
 }
-static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[2], value, 1);
 }
 
-static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[0], value, 0);
 }
-static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[1], value, 0);
 }
-static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[2], value, 0);
 }
 
@@ -360,7 +359,7 @@ static void readout_disp_z(PetaPM * pm, int i, double * mesh, double weight) {
 }
 
 static void
-gaussian_fill(int Nmesh, PetaPMRegion * region, pfft_complex * rho_k, int setUnitaryAmplitude, int setInvertPhase, const int Seed)
+gaussian_fill(int Nmesh, PetaPMRegion * region, cufftComplex * rho_k, int setUnitaryAmplitude, int setInvertPhase, const int Seed)
 {
     /* fastpm deals with strides properly; petapm not. So we translate it here. */
     PMDesc pm[1];

From 4238964cdfa1fc9ff87377c924df7a45a49226d3 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 18:40:03 -0400
Subject: [PATCH 043/120] fix cufft complex number indexing

---
 libgenic/zeldovich.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libgenic/zeldovich.c b/libgenic/zeldovich.c
index 6c606f6f..c10e7641 100644
--- a/libgenic/zeldovich.c
+++ b/libgenic/zeldovich.c
@@ -283,8 +283,8 @@ static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex
         double kmag = sqrt(k2) * 2 * M_PI / pm->BoxSize;
         fac *= DeltaSpec(kmag, ptype) / sqrt(pm->BoxSize * pm->BoxSize * pm->BoxSize);
 
-        value[0][0] *= fac;
-        value[0][1] *= fac;
+        value[0].x *= fac;
+        value[0].y *= fac;
     }
 }
 
@@ -306,9 +306,9 @@ static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, cufftComplex * val
             fac *= dlogGrowth(kmag, ptype);
         else
             fac *= DeltaSpec(kmag, ptype);
-        double tmp = value[0][0];
-        value[0][0] = - value[0][1] * fac;
-        value[0][1] = tmp * fac;
+        double tmp = value[0].x;
+        value[0].x = - value[0].y * fac;
+        value[0].y = tmp * fac;
     }
 }
 

From 940f056a6ec777ad27ffdcce00734a21085e8a7e Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 18:44:14 -0400
Subject: [PATCH 044/120] pfft->cufft type in glass.c

---
 libgenic/glass.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/libgenic/glass.c b/libgenic/glass.c
index 5d8e6bbd..36259c9a 100644
--- a/libgenic/glass.c
+++ b/libgenic/glass.c
@@ -15,10 +15,10 @@
 #include <libgadget/powerspectrum.h>
 #include <libgadget/gravity.h>
 
-static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_force_x(PetaPM *pm, int i, double * mesh, double weight);
 static void readout_force_y(PetaPM *pm, int i, double * mesh, double weight);
 static void readout_force_z(PetaPM *pm, int i, double * mesh, double weight);
@@ -279,7 +279,7 @@ _prepare(PetaPM * pm, PetaPMParticleStruct * pstruct, void * userdata, int * Nre
  *
  *********************/
 
-static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex *value) {
+static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex *value) {
 
     double f = 1.0;
     const double smth = 1.0 / k2;
@@ -302,13 +302,13 @@ static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex
 
     if(k2 == 0) {
         /* Remove zero mode corresponding to the mean.*/
-        value[0][0] = 0.0;
-        value[0][1] = 0.0;
+        value[0].x = 0.0;
+        value[0].y = 0.0;
         return;
     }
 
-    value[0][0] *= fac;
-    value[0][1] *= fac;
+    value[0].x *= fac;
+    value[0].y *= fac;
 }
 
 /* the transfer functions for force in fourier space applied to potential */
@@ -323,7 +323,7 @@ static double diff_kernel(double w) {
     return 1 / 6.0 * (8 * sin (w) - sin (2 * w));
 }
 
-static void force_transfer(PetaPM *pm, int k, pfft_complex * value) {
+static void force_transfer(PetaPM *pm, int k, cufftComplex * value) {
     double tmp0;
     double tmp1;
     /*
@@ -332,18 +332,18 @@ static void force_transfer(PetaPM *pm, int k, pfft_complex * value) {
      * filter is   i K(w)
      * */
     double fac = -1 * diff_kernel (k * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
-    tmp0 = - value[0][1] * fac;
-    tmp1 = value[0][0] * fac;
-    value[0][0] = tmp0;
-    value[0][1] = tmp1;
+    tmp0 = - value[0].y * fac;
+    tmp1 = value[0].x * fac;
+    value[0].x = tmp0;
+    value[0].y = tmp1;
 }
-static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[0], value);
 }
-static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[1], value);
 }
-static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[2], value);
 }
 static void readout_force_x(PetaPM *pm, int i, double * mesh, double weight) {

From 49a03c2e7c9d8e397f83695987b5f0d21f7ddbdf Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Thu, 3 Oct 2024 15:58:46 -0700
Subject: [PATCH 045/120] bug fix: avoid floating-point precision issues and
 reduce the approximation order when there are too few data points

---
 libgadget/neutrinos_lra.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/libgadget/neutrinos_lra.c b/libgadget/neutrinos_lra.c
index 6eaaff0a..8969ccb0 100644
--- a/libgadget/neutrinos_lra.c
+++ b/libgadget/neutrinos_lra.c
@@ -209,6 +209,7 @@ void delta_nu_from_power(struct _powerspectrum * PowerSpectrum, Cosmology * CP,
     }
     if(delta_tot_table.nk != PowerSpectrum->nonzero)
         myfree(Power_in);
+
     boost::math::interpolators::barycentric_rational<double> pkint(logwavenum, delta_nu_ratio, delta_tot_table.nk);
 
     double xmin = logwavenum[0];
@@ -717,6 +718,8 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
         double * fsscales = (double *) mymalloc("fsscales", Nfs* sizeof(double));
         for(ik=0; ik < Nfs; ik++) {
             fsscales[ik] = log(d_tot->TimeTransfer) + ik*(log(a) - log(d_tot->TimeTransfer))/(Nfs-1.);
+            if (ik == Nfs-1)
+                fsscales[ik] = log(a); // Make sure the last point is exactly a without precision loss
             fslengths[ik] = fslength(CP, fsscales[ik], log(a),d_tot->light);
         }
         params.fslengths = fslengths;
@@ -724,11 +727,19 @@ void get_delta_nu(Cosmology * CP, const _delta_tot_table * const d_tot, const do
 
         params.fs_spline = new boost::math::interpolators::barycentric_rational<double>(params.fsscales,params.fslengths,Nfs);
 
+        // if Na is less than 4, the approximation order for interpolation should be adjusted
+        size_t approx_order = 3;
+        if (Na < 4) {
+            approx_order = Na - 1;
+        }
+
         for (ik = 0; ik < d_tot->nk; ik++) {
             double abserr,d_nu_tmp;
             params.k=d_tot->wavenum[ik];
             params.delta_tot=d_tot->delta_tot[ik];
-            params.spline = new boost::math::interpolators::barycentric_rational<double>(params.scale,params.delta_tot,Na);
+            // print the number of data points
+            
+            params.spline = new boost::math::interpolators::barycentric_rational<double>(params.scale,params.delta_tot,Na,approx_order);
             // Define the integrand as a lambda function wrapping get_delta_nu_int
             auto integrand = [&params](double logai) {
                 return get_delta_nu_int(logai, (void *)&params);

From 72b320d53152d61df73e50aaf3c20605a82af9a0 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Thu, 3 Oct 2024 17:46:33 -0700
Subject: [PATCH 046/120] power: replace gsl_interp with boost barycentric_rational interpolation

---
 libgenic/power.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/libgenic/power.c b/libgenic/power.c
index e21ac9d0..a6d2aa9c 100644
--- a/libgenic/power.c
+++ b/libgenic/power.c
@@ -3,7 +3,7 @@
 #include <math.h>
 #include <stddef.h>
 #include <mpi.h>
-#include <gsl/gsl_interp.h>
+#include <boost/math/interpolators/barycentric_rational.hpp>
 #include <bigfile-mpi.h>
 
 #include <libgadget/cosmology.h>
@@ -37,7 +37,7 @@ struct table
     int Nentry;
     double * logk;
     double * logD[MAXCOLS];
-    gsl_interp * mat_intp[MAXCOLS];
+    boost::math::interpolators::barycentric_rational<double>* mat_intp[MAXCOLS];
 };
 
 /*Typedef for a function that parses the table from text*/
@@ -76,12 +76,12 @@ static double get_Tabulated(double k, enum TransferType Type, double oobval)
     if(logk < power_table.logk[0] || logk > power_table.logk[power_table.Nentry - 1])
       return oobval;
 
-    double logD = gsl_interp_eval(power_table.mat_intp[0], power_table.logk, power_table.logD[0], logk, NULL);
+    double logD = (*power_table.mat_intp[0])(logk);
     double trans = 1;
     /*Transfer table stores (T_type(k) / T_tot(k))*/
     if(transfer_table.Nentry > 0)
        if(Type >= DELTA_BAR && Type < DELTA_TOT)
-          trans = gsl_interp_eval(transfer_table.mat_intp[Type], transfer_table.logk, transfer_table.logD[Type], logk, NULL);
+          trans = (*transfer_table.mat_intp[Type])(logk);
 
     /*Convert delta from (Mpc/h)^3/2 to kpc/h^3/2*/
     logD += 1.5 * log10(scale);
@@ -322,9 +322,6 @@ void read_power_table(int ThisTask, const char * inputfile, const int ncols, str
     }
 
     MPI_Bcast(out_tab->logk, (ncols+1)*out_tab->Nentry, MPI_DOUBLE, 0, MPI_COMM_WORLD);
-    for(j=0; j<ncols; j++) {
-        out_tab->mat_intp[j] = gsl_interp_alloc(gsl_interp_cspline,out_tab->Nentry);
-    }
 }
 
 int
@@ -393,7 +390,7 @@ init_transfer_table(int ThisTask, double InitTime, const struct power_params * c
     }
     /*Initialise the interpolation*/
     for(t = 0; t < MAXCOLS; t++)
-        gsl_interp_init(transfer_table.mat_intp[t],transfer_table.logk, transfer_table.logD[t],transfer_table.Nentry);
+        transfer_table.mat_intp[t] = new boost::math::interpolators::barycentric_rational<double>(transfer_table.logk, transfer_table.logD[t], transfer_table.Nentry);
 
     message(0,"Scale-dependent growth calculated. Mean = %g %g %g %g %g\n",meangrowth[0], meangrowth[1], meangrowth[2], meangrowth[3], meangrowth[4]);
     message(0, "Power spectrum rows: %d, Transfer: %d (%g -> %g)\n", power_table.Nentry, transfer_table.Nentry, transfer_table.logD[DELTA_BAR][0],transfer_table.logD[DELTA_BAR][transfer_table.Nentry-1]);
@@ -411,7 +408,7 @@ int init_powerspectrum(int ThisTask, double InitTime, double UnitLength_in_cm_in
     if(ppar->WhichSpectrum == 2) {
         read_power_table(ThisTask, ppar->FileWithInputSpectrum, 1, &power_table, InitTime, parse_power);
         /*Initialise the interpolation*/
-        gsl_interp_init(power_table.mat_intp[0],power_table.logk, power_table.logD[0],power_table.Nentry);
+        power_table.mat_intp[0] = new boost::math::interpolators::barycentric_rational<double>(power_table.logk, power_table.logD[0], power_table.Nentry);
         transfer_table.Nentry = 0;
         if(ppar->DifferentTransferFunctions || ppar->ScaleDepVelocity) {
             init_transfer_table(ThisTask, InitTime, ppar);

From 3eadc51f24b926616dcd61becf6cdb538a41d9ce Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Thu, 3 Oct 2024 17:58:38 -0700
Subject: [PATCH 047/120] Remove random thermal velocity support

---
 genic/main.c                  |  84 +-----------------------
 genic/params.c                |   4 +-
 libgenic/Makefile             |   7 +-
 libgenic/tests/test_thermal.c |  78 ----------------------
 libgenic/thermal.c            | 120 ----------------------------------
 libgenic/thermal.h            |  42 ------------
 6 files changed, 5 insertions(+), 330 deletions(-)
 delete mode 100644 libgenic/tests/test_thermal.c
 delete mode 100644 libgenic/thermal.c
 delete mode 100644 libgenic/thermal.h

diff --git a/genic/main.c b/genic/main.c
index 61b800d5..3efd1bb5 100644
--- a/genic/main.c
+++ b/genic/main.c
@@ -8,7 +8,6 @@
 #include <bigfile-mpi.h>
 #include <libgenic/allvars.h>
 #include <libgenic/proto.h>
-#include <libgenic/thermal.h>
 #include <libgadget/walltime.h>
 #include <libgadget/physconst.h>
 #include <libgadget/petapm.h>
@@ -63,18 +62,10 @@ int main(int argc, char **argv)
   const double meanspacing = All2.BoxSize / DMAX(All2.Ngrid, All2.NgridGas);
   double shift_gas = -All2.ProduceGas * 0.5 * (CP.Omega0 - CP.OmegaBaryon) / CP.Omega0 * meanspacing;
   double shift_dm = All2.ProduceGas * 0.5 * CP.OmegaBaryon / CP.Omega0 * meanspacing;
-  
-  double shift_nu = 0;
-  if(!All2.ProduceGas && All2.NGridNu > 0) {
-      double OmegaNu = get_omega_nu(&CP.ONu, 1);
-      shift_nu = -0.5 * (CP.Omega0 - OmegaNu) / CP.Omega0 * meanspacing;
-      shift_dm = 0.5 * OmegaNu / CP.Omega0 * meanspacing;
-  }
-    
+
   if(All2.PrePosGridCenter){
       shift_dm += 0.5 * meanspacing;
       shift_gas += 0.5 * meanspacing;
-      shift_nu += 0.5 * meanspacing;
   }
 
   /*Write the header*/
@@ -88,15 +79,6 @@ int main(int argc, char **argv)
 
   const int64_t TotNu = (int64_t) All2.NGridNu*All2.NGridNu*All2.NGridNu;
   double total_nufrac = 0;
-  struct thermalvel nu_therm;
-  if(TotNu > 0) {
-    const double kBMNu = 3*CP.ONu.kBtnu / (CP.MNu[0]+CP.MNu[1]+CP.MNu[2]);
-    double v_th = NU_V0(All2.TimeIC, kBMNu, All2.units.UnitVelocity_in_cm_per_s);
-    if(!All2.UsePeculiarVelocity)
-        v_th /= sqrt(All2.TimeIC);
-    total_nufrac = init_thermalvel(&nu_therm, v_th, All2.Max_nuvel/v_th, 0);
-    message(0,"F-D velocity scale: %g. Max particle vel: %g. Fraction of mass in particles: %g\n",v_th*sqrt(All2.TimeIC), All2.Max_nuvel*sqrt(All2.TimeIC), total_nufrac);
-  }
   saveheader(&bf, TotNumPart, TotNumPartGas, TotNu, total_nufrac, All2.BoxSize, &CP, All2);
 
   /*Save the transfer functions*/
@@ -163,33 +145,6 @@ int main(int argc, char **argv)
 
   if(NumPartCDM > 0) {
     displacement_fields(pm, DMType, ICP, NumPartCDM, &CP, All2);
-
-    /*Add a thermal velocity to WDM particles*/
-    if(All2.WDM_therm_mass > 0){
-        int i;
-        double v_th = WDM_V0(All2.TimeIC, All2.WDM_therm_mass, CP.Omega0 - CP.OmegaBaryon - get_omega_nu(&CP.ONu, 1), CP.HubbleParam, All2.units.UnitVelocity_in_cm_per_s);
-        if(!All2.UsePeculiarVelocity)
-           v_th /= sqrt(All2.TimeIC);
-        struct thermalvel WDM;
-        init_thermalvel(&WDM, v_th, 10000/v_th, 0);
-        unsigned int * seedtable = init_rng(All2.Seed+1,All2.Ngrid);
-        gsl_rng * g_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-        /*Seed the random number table with the Id.*/
-        gsl_rng_set(g_rng, seedtable[0]);
-
-        for(i = 0; i < NumPartCDM; i++) {
-             /*Find the slab, and reseed if it has zero z rank*/
-             if(i % All2.Ngrid == 0) {
-                  uint64_t id = idgen_create_id_from_index(idgen_cdm, i);
-                  /*Seed the random number table with x,y index.*/
-                  gsl_rng_set(g_rng, seedtable[id / All2.Ngrid]);
-             }
-             add_thermal_speeds(&WDM, g_rng, ICP[i].Vel);
-        }
-        gsl_rng_free(g_rng);
-        myfree(seedtable);
-    }
-
     write_particle_data(idgen_cdm, 1, &bf, 0, All2.SavePrePos, All2.NumFiles, All2.NumWriters, ICP);
   }
 
@@ -200,43 +155,6 @@ int main(int argc, char **argv)
   }
   myfree(ICP);
 
-  /*Now add random velocity neutrino particles*/
-  if(All2.NGridNu > 0) {
-      int i;
-      IDGenerator idgen_nu[1];
-      idgen_init(idgen_nu, pm, All2.NGridNu, All2.BoxSize);
-
-      int NumPartNu = idgen_nu->NumPart;
-      ICP = (struct ic_part_data *) mymalloc("PartTable", NumPartNu*sizeof(struct ic_part_data));
-
-      NumPartNu = setup_grid(idgen_nu, shift_nu, mass[2], ICP);
-
-	  /*Write initial positions into ICP struct (for neutrinos)*/
-	  for(j=0; j<NumPartNu; j++)
-		  for(k=0; k<3; k++)
-		      ICP[j].PrePos[k] = ICP[j].Pos[k];
-
-      displacement_fields(pm, NuType, ICP, NumPartNu, &CP, All2);
-      unsigned int * seedtable = init_rng(All2.Seed+2,All2.NGridNu);
-      gsl_rng * g_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-      /*Just in case*/
-      gsl_rng_set(g_rng, seedtable[0]);
-      for(i = 0; i < NumPartNu; i++) {
-           /*Find the slab, and reseed if it has zero z rank*/
-           if(i % All2.NGridNu == 0) {
-                uint64_t id = idgen_create_id_from_index(idgen_nu, i);
-                /*Seed the random number table with x,y index.*/
-                gsl_rng_set(g_rng, seedtable[id / All2.NGridNu]);
-           }
-           add_thermal_speeds(&nu_therm, g_rng, ICP[i].Vel);
-      }
-      gsl_rng_free(g_rng);
-      myfree(seedtable);
-
-      write_particle_data(idgen_nu, 2, &bf, TotNumPart+TotNumPartGas, All2.SavePrePos, All2.NumFiles, All2.NumWriters, ICP);
-      myfree(ICP);
-  }
-
   petapm_destroy(pm);
   big_file_mpi_close(&bf, MPI_COMM_WORLD);
 
diff --git a/genic/params.c b/genic/params.c
index 8fa2fa07..25b23c88 100644
--- a/genic/params.c
+++ b/genic/params.c
@@ -28,7 +28,7 @@ create_parameters(void)
     param_declare_int(ps, "Nmesh", OPTIONAL, 0, "Size of the FFT grid used to estimate displacements. Should be > Ngrid.");
     param_declare_int(ps, "Ngrid", REQUIRED, 0, "Size of regular grid on which the undisplaced CDM particles are created.");
     param_declare_int(ps, "NgridGas", OPTIONAL, -1, "Size of regular grid on which the undisplaced gas particles are created.");
-    param_declare_int(ps, "NgridNu", OPTIONAL, 0, "Number of neutrino particles created for hybrid neutrinos.");
+    param_declare_int(ps, "NgridNu", OPTIONAL, 0, "Number of neutrino particles created for hybrid neutrinos. Not supported in this version.");
     param_declare_int(ps, "Seed", REQUIRED, 0, "Random number generator seed used for the phases of the Gaussian random field.");
     param_declare_int(ps, "MakeGlassGas", OPTIONAL, -1, "Generate Glass IC for gas instead of Grid IC.");
     param_declare_int(ps, "MakeGlassCDM", OPTIONAL, 0, "Generate Glass IC for CDM instead of Grid IC.");
@@ -43,7 +43,7 @@ create_parameters(void)
     param_declare_double(ps, "MNue", OPTIONAL, 0, "First neutrino mass in eV.");
     param_declare_double(ps, "MNum", OPTIONAL, 0, "Second neutrino mass in eV.");
     param_declare_double(ps, "MNut", OPTIONAL, 0, "Third neutrino mass in eV.");
-    param_declare_double(ps, "MWDM_therm", OPTIONAL, 0, "Assign a thermal velocity to the DM. Specifies WDM particle mass in keV.");
+    param_declare_double(ps, "MWDM_therm", OPTIONAL, 0, "Not supported in this version.");
     param_declare_double(ps, "Max_nuvel", OPTIONAL, 5000, "Maximum neutrino velocity sampled from the F-D distribution.");
 
     param_declare_int(ps, "DifferentTransferFunctions", OPTIONAL, 1, "Use species specific transfer functions for baryon and CDM.");
diff --git a/libgenic/Makefile b/libgenic/Makefile
index 7e1ef08c..281999e0 100644
--- a/libgenic/Makefile
+++ b/libgenic/Makefile
@@ -4,17 +4,14 @@ CONFIG ?= ../Options.mk
 
 include $(CONFIG)
 
-INCL=../libgadget/config.h \
-    power.h allvars.h thermal.h proto.h pmesh.h
-
-TESTED = power thermal
+TESTED = power
 TESTBIN := $(TESTED:%=.objs/test_%) $(MPI_TESTED:%=.objs/test_%)
 SUITE?= $(TESTED:%=test_%)
 MPISUITE = $(MPI_TESTED:%=test_%)
 
 include ../Makefile.rules
 
-OBJS = power.o zeldovich.o glass.o save.o thermal.o
+OBJS = power.o zeldovich.o glass.o save.o
 
 OBJS := $(OBJS:%.o=.objs/%.o)
 
diff --git a/libgenic/tests/test_thermal.c b/libgenic/tests/test_thermal.c
deleted file mode 100644
index 41dec626..00000000
--- a/libgenic/tests/test_thermal.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*Tests for the thermal velocity module, ported from S-GenIC.*/
-
-#include <stdarg.h>
-#include <stddef.h>
-#include <setjmp.h>
-#include <cmocka.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "stub.h"
-#include <libgadget/config.h>
-#include <libgenic/thermal.h>
-
-/*Check that the neutrino velocity NU_V0 is sensible*/
-static void
-test_mean_velocity(void ** state)
-{
-    /*Check has units of velocity*/
-    assert_true(fabs(NU_V0(1, 1, 1e3) - 100*NU_V0(1, 1, 1e5)) < 1e-6);
-    /*Check scales linearly with neutrino mass*/
-    assert_true(fabs(10*NU_V0(1, 0.1, 1e5) - NU_V0(1, 1, 1e5)) < 1e-6);
-    /*Check scales as z (gadget's cosmological velocity unit is accounted for outside)*/
-    assert_true(fabs(0.5*NU_V0(0.5, 1, 1e5) -  NU_V0(1, 1, 1e5)) < 1e-6);
-}
-
-static void
-test_thermal_vel(void ** state)
-{
-    /*Seed table with velocity of 100 km/s*/
-    struct thermalvel nu_vels;
-    init_thermalvel(&nu_vels, 100, 5000/100, 0);
-
-    /*Test getting the distribution*/
-    assert_true(fabs(nu_vels.fermi_dirac_vel[0]) < 1e-6);
-    assert_true(fabs(nu_vels.fermi_dirac_vel[LENGTH_FERMI_DIRAC_TABLE - 1] -  MAX_FERMI_DIRAC) < 1e-3);
-
-    /*Number verified by mathematica*/
-    int ii = 0;
-    while(nu_vels.fermi_dirac_cumprob[ii] < 0.5) {
-        ii++;
-    }
-    assert_true(fabs(nu_vels.fermi_dirac_vel[ii] - 2.839075) < 0.002);
-    /*Check some statistical properties (max, min, mean)*/
-    double mean = 0;
-    double max = 0;
-    double min = 1e10;
-    int nsample;
-    float Vel[3] = {0};
-    int64_t MaxID = 100000;
-    gsl_rng * g_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-    for (nsample=0; nsample < MaxID; nsample++)
-    {
-        add_thermal_speeds(&nu_vels, g_rng, Vel);
-        double v2 = sqrt(Vel[0]*Vel[0]+Vel[1]*Vel[1]+Vel[2]*Vel[2]);
-        if(v2 > max)
-            max = v2;
-        if(v2 < min)
-            min = v2;
-        mean+=v2;
-        memset(Vel, 0, 3*sizeof(float));
-    }
-    gsl_rng_free(g_rng);
-    mean/=nsample;
-    /*Mean should be roughly 3*zeta(4)/zeta(3)*7/8/(3/4)* m_vamp*/
-    assert_true(fabs(mean - 3*pow(M_PI,4)/90./1.202057*(7./8)/(3/4.)*100) < 1);
-    assert_true(min > 0);
-    assert_true( max < MAX_FERMI_DIRAC*100);
-}
-
-int main(void) {
-    const struct CMUnitTest tests[] = {
-        cmocka_unit_test(test_mean_velocity),
-        cmocka_unit_test(test_thermal_vel)
-    };
-    return cmocka_run_group_tests_mpi(tests, NULL, NULL);
-}
diff --git a/libgenic/thermal.c b/libgenic/thermal.c
deleted file mode 100644
index 5635897c..00000000
--- a/libgenic/thermal.c
+++ /dev/null
@@ -1,120 +0,0 @@
-#include <gsl/gsl_integration.h>
-#include <assert.h>
-#include "thermal.h"
-/*For speed of light*/
-#include <libgadget/physconst.h>
-#include <libgadget/utils.h>
-
-/*The Boltzmann constant in units of eV/K*/
-#define BOLEVK 8.61734e-5
-
-/* This function converts the dimensionless units used in the integral to dimensionful units.
- * Unit scaling velocity for neutrinos:
- * This is an arbitrary rescaling of the unit system in the Fermi-Dirac kernel so we can integrate dimensionless quantities.
- * The true thing to integrate is:
- * q^2 /(e^(q c / kT) + 1 ) dq between 0 and q.
- * So we choose x = (q c / kT_0) and integrate between 0 and x_0.
- * The units are restored by multiplying the resulting x by kT/c for q
- * To get a v we then use q = a m v/c^2
- * to get:   v/c =x kT/(m a)*/
-/*NOTE: this m is the mass of a SINGLE neutrino species, not the sum of neutrinos!*/
-double
-NU_V0(const double Time, const double kBTNubyMNu, const double UnitVelocity_in_cm_per_s)
-{
-    return kBTNubyMNu / Time * (LIGHTCGS / UnitVelocity_in_cm_per_s);
-}
-
-//Amplitude of the random velocity for WDM
-double WDM_V0(const double Time, const double WDM_therm_mass, const double Omega_CDM, const double HubbleParam, const double UnitVelocity_in_cm_per_s)
-{
-        //Not actually sure where this equation comes from: the fiducial values are from Bode, Ostriker & Turok 2001.
-        double WDM_V0 = 0.012 / Time * pow(Omega_CDM / 0.3, 1.0 / 3) * pow(HubbleParam / 0.65, 2.0 / 3) * pow(1.0 /WDM_therm_mass,4.0 / 3);
-        WDM_V0 *= 1.0e5 / UnitVelocity_in_cm_per_s;
-        return WDM_V0;
-}
-
-/*Fermi-Dirac kernel for below*/
-static double
-fermi_dirac_kernel(double x, void * params)
-{
-  return x * x / (exp(x) + 1);
-}
-
-/*Initialise the probability tables*/
-double
-init_thermalvel(struct thermalvel* thermals, const double v_amp, double max_fd,const double min_fd)
-{
-    int i;
-    if(max_fd <= min_fd)
-        endrun(1,"Thermal vel called with negative interval: %g <= %g\n", max_fd, min_fd);
-
-    if(max_fd > MAX_FERMI_DIRAC)
-        max_fd = MAX_FERMI_DIRAC;
-    thermals->m_vamp = v_amp;
-
-    /*These functions are so smooth that we don't need much space*/
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (100);
-    double abserr;
-    gsl_function F;
-    F.function = &fermi_dirac_kernel;
-    F.params = NULL;
-    for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++) {
-        thermals->fermi_dirac_vel[i] = min_fd+(max_fd-min_fd)* i / (LENGTH_FERMI_DIRAC_TABLE - 1.0);
-        gsl_integration_qag (&F, min_fd, thermals->fermi_dirac_vel[i], 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(thermals->fermi_dirac_cumprob[i]), &abserr);
-    //       printf("gsl_integration_qng in fermi_dirac_init_nu. Result %g, error: %g, intervals: %lu\n",fermi_dirac_cumprob[i], abserr,w->size);
-    }
-    /*Save the largest cum. probability, pre-normalisation,
-     * divided by the total F-D probability (which is 3 Zeta(3)/2 ~ 1.8 if MAX_FERMI_DIRAC is large enough*/
-    double total_fd;
-    gsl_integration_qag (&F, 0, MAX_FERMI_DIRAC, 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(total_fd), &abserr);
-    assert(total_fd > 1.8);
-
-    gsl_integration_workspace_free (w);
-
-    double total_frac = thermals->fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE-1]/total_fd;
-    //Normalise total integral to unity
-    for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++)
-        thermals->fermi_dirac_cumprob[i] /= thermals->fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE - 1];
-
-    /*Initialise the GSL table*/
-    thermals->fd_intp = gsl_interp_alloc(gsl_interp_cspline,LENGTH_FERMI_DIRAC_TABLE);
-    thermals->fd_intp_acc = gsl_interp_accel_alloc();
-    gsl_interp_init(thermals->fd_intp,thermals->fermi_dirac_cumprob, thermals->fermi_dirac_vel,LENGTH_FERMI_DIRAC_TABLE);
-    return total_frac;
-}
-
-/*Generate a table of random seeds, one for each pencil.*/
-unsigned int *
-init_rng(int Seed, int Nmesh)
-{
-    unsigned int * seedtable = (unsigned int *) mymalloc("randseeds", Nmesh*Nmesh*sizeof(unsigned int));
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-    gsl_rng_set(rng, Seed);
-
-    int i, j;
-    for(i = 0; i < Nmesh; i++)
-        for(j=0; j < Nmesh; j++)
-        {
-            seedtable[i+Nmesh*j] = gsl_rng_get(rng);
-        }
-    gsl_rng_free(rng);
-    return seedtable;
-}
-
-/* Add a randomly generated thermal speed in v_amp*(min_fd, max_fd) to a 3-velocity.
- * The particle Id is used as a seed for the RNG.*/
-void
-add_thermal_speeds(struct thermalvel * thermals, gsl_rng *g_rng, float Vel[])
-{
-    const double p = gsl_rng_uniform (g_rng);
-    /*m_vamp multiples by the dimensional factor to get a velocity again.*/
-    const double v = thermals->m_vamp * gsl_interp_eval(thermals->fd_intp,thermals->fermi_dirac_cumprob, thermals->fermi_dirac_vel, p, thermals->fd_intp_acc);
-
-    /*Random phase*/
-    const double phi = 2 * M_PI * gsl_rng_uniform (g_rng);
-    const double theta = acos(2 * gsl_rng_uniform (g_rng) - 1);
-
-    Vel[0] += v * sin(theta) * cos(phi);
-    Vel[1] += v * sin(theta) * sin(phi);
-    Vel[2] += v * cos(theta);
-}
diff --git a/libgenic/thermal.h b/libgenic/thermal.h
deleted file mode 100644
index a134ed4d..00000000
--- a/libgenic/thermal.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef THERMALVEL_H
-#define THERMALVEL_H
-
-#include <gsl/gsl_interp.h>
-#include <gsl/gsl_rng.h>
-/*Length of the table*/
-#define MAX_FERMI_DIRAC          17.0
-#define LENGTH_FERMI_DIRAC_TABLE 2000
-
-struct thermalvel
-{
-    double fermi_dirac_vel[LENGTH_FERMI_DIRAC_TABLE];
-    double fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE];
-    double m_vamp;
-    gsl_interp * fd_intp;
-    gsl_interp_accel * fd_intp_acc;
-};
-
-/*Single parameter is the amplitude of the random velocities. All the physics is in here.
- * max_fd and min_fd give the maximum and minimum velocities to integrate over.
- * Note these values are dimensionless*/
-/*Returns total fraction of the Fermi-Dirac distribution between max_fd and min_fd*/
-double
-init_thermalvel(struct thermalvel * thermals, const double v_amp, double max_fd, const double min_fd);
-
-/*Add a randomly generated thermal speed in v_amp*(min_fd, max_fd) to a 3-velocity.*/
-void
-add_thermal_speeds(struct thermalvel * thermals, gsl_rng *g_rng, float Vel[]);
-
-/*Amplitude of the random velocity for neutrinos*/
-double
-NU_V0(const double Time, const double kBTNubyMNu, const double UnitVelocity_in_cm_per_s);
-
-/*Amplitude of the random velocity for WDM*/
-double
-WDM_V0(const double Time, const double WDM_therm_mass, const double Omega_CDM, const double HubbleParam, const double UnitVelocity_in_cm_per_s);
-
-unsigned int *
-init_rng(int Seed, int Nmesh);
-
-
-#endif

From 60870bf8f26ac110efde83798a6c7d40b2e5e2d9 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Thu, 3 Oct 2024 18:03:22 -0700
Subject: [PATCH 048/120] gravpm: use Boost barycentric_rational for neutrino power interpolation

---
 libgadget/gravpm.c        |  7 ++-----
 libgadget/powerspectrum.h | 12 +++++++++---
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/libgadget/gravpm.c b/libgadget/gravpm.c
index 829443c7..e4ad9f2c 100644
--- a/libgadget/gravpm.c
+++ b/libgadget/gravpm.c
@@ -318,9 +318,7 @@ static void compute_neutrino_power(PetaPM * pm) {
     delta_nu_from_power(ps, GravPM.CP, GravPM.Time, GravPM.TimeIC);
 
     /*Initialize the interpolation for the neutrinos*/
-    ps->nu_spline = gsl_interp_alloc(gsl_interp_linear,ps->nonzero);
-    ps->nu_acc = gsl_interp_accel_alloc();
-    gsl_interp_init(ps->nu_spline,ps->logknu,ps->delta_nu_ratio,ps->nonzero);
+    ps->nu_spline = new boost::math::interpolators::barycentric_rational<double>(ps->logknu, ps->delta_nu_ratio, ps->nonzero);
     /*Zero power spectrum, which is stored with the neutrinos*/
     powerspectrum_zero(ps);
 }
@@ -430,8 +428,7 @@ potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
          *            = (M_cdm + M_nu) * delta_t
          * This is correct for the forces, and gives the right power spectrum,
          * once we multiply PowerSpectrum.Norm by (Omega0 / (Omega0 - OmegaNu))**2 */
-        const double nufac = 1 + ps->nu_prefac * gsl_interp_eval(ps->nu_spline,ps->logknu,
-                                                                       ps->delta_nu_ratio,logk2,ps->nu_acc);
+        const double nufac = 1 + ps->nu_prefac * (*ps->nu_spline)(logk2);
         value[0][0] *= nufac;
         value[0][1] *= nufac;
     }
diff --git a/libgadget/powerspectrum.h b/libgadget/powerspectrum.h
index b9320cbc..315add84 100644
--- a/libgadget/powerspectrum.h
+++ b/libgadget/powerspectrum.h
@@ -3,7 +3,14 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include <gsl/gsl_interp.h>
+
+// Undefine the P macro before including Boost (presumably it clashes with identifiers in the Boost headers); it is redefined below.
+#ifdef P
+#undef P
+#endif
+#include <boost/math/interpolators/barycentric_rational.hpp>
+
+#define P PartManager->Base
 
 typedef struct _powerspectrum {
     double * kk;
@@ -20,8 +27,7 @@ typedef struct _powerspectrum {
     double * logknu;
     double * delta_nu_ratio;
     double nu_prefac;
-    gsl_interp *nu_spline;
-    gsl_interp_accel * nu_acc;
+    boost::math::interpolators::barycentric_rational<double>* nu_spline;
 
 } Power;
 

From 552d5a3af34cf37468ea51222e7a525b7338bd66 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Thu, 3 Oct 2024 18:21:01 -0700
Subject: [PATCH 049/120] omega_nu_single: use Boost barycentric_rational for rho_nu interpolation

---
 libgadget/omega_nu_single.c | 10 +++-------
 libgadget/omega_nu_single.h |  9 ++++++---
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/libgadget/omega_nu_single.c b/libgadget/omega_nu_single.c
index 322f84d2..083cd2ed 100644
--- a/libgadget/omega_nu_single.c
+++ b/libgadget/omega_nu_single.c
@@ -13,8 +13,6 @@
 #define NRHOTAB 200
 /** Floating point accuracy*/
 #define FLOAT_ACC   1e-6
-/** Number of bins in integrations*/
-#define GSL_VAL 200
 
 void init_omega_nu(_omega_nu * omnu, const double MNu[], const double a0, const double HubbleParam, const double tcmb0)
 {
@@ -139,9 +137,7 @@ void rho_nu_init(_rho_nu_single * const rho_nu_tab, double a0, const double mnu,
      /*Allocate memory for arrays*/
      rho_nu_tab->loga = (double *) mymalloc("rho_nu_table",2*NRHOTAB*sizeof(double));
      rho_nu_tab->rhonu = rho_nu_tab->loga+NRHOTAB;
-     rho_nu_tab->acc = gsl_interp_accel_alloc();
-     rho_nu_tab->interp=gsl_interp_alloc(gsl_interp_cspline,NRHOTAB);
-     if(!rho_nu_tab->interp || !rho_nu_tab->acc || !rho_nu_tab->loga)
+     if(!rho_nu_tab->loga)
          endrun(2035,"Could not initialise tables for neutrino matter density\n");
 
      for(i=0; i< NRHOTAB; i++){
@@ -161,7 +157,7 @@ void rho_nu_init(_rho_nu_single * const rho_nu_tab, double a0, const double mnu,
         rho_nu_tab->rhonu[i] = result / pow(exp(rho_nu_tab->loga[i]), 4) * get_rho_nu_conversion();
      }
 
-     gsl_interp_init(rho_nu_tab->interp,rho_nu_tab->loga,rho_nu_tab->rhonu,NRHOTAB);
+     rho_nu_tab->interp = new boost::math::interpolators::barycentric_rational<double>(rho_nu_tab->loga, rho_nu_tab->rhonu, NRHOTAB);
      return;
 }
 
@@ -205,7 +201,7 @@ double rho_nu(const _rho_nu_single * rho_nu_tab, const double a, const double kT
             if (!rho_nu_tab->loga || loga < rho_nu_tab->loga[0])
                 rho_nu_val = rel_rho_nu(a,kT);
             else
-                rho_nu_val=gsl_interp_eval(rho_nu_tab->interp,rho_nu_tab->loga,rho_nu_tab->rhonu,loga,rho_nu_tab->acc);
+                rho_nu_val=(*rho_nu_tab->interp)(loga);
         }
         return rho_nu_val;
 }
diff --git a/libgadget/omega_nu_single.h b/libgadget/omega_nu_single.h
index 9cbdbd34..ff24a896 100644
--- a/libgadget/omega_nu_single.h
+++ b/libgadget/omega_nu_single.h
@@ -3,7 +3,11 @@
 /** \file
  * Routines for computing the matter density in a single neutrino species*/
 
-#include <gsl/gsl_interp.h>
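+// Undefine the P macro before including Boost. NOTE(review): unlike powerspectrum.h, P is not redefined in this header - confirm that is intended.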
+#ifdef P
+#undef P
+#endif
+#include <boost/math/interpolators/barycentric_rational.hpp>
 
 /** Ratio between the massless neutrino temperature and the CMB temperature.
  * Note there is a slight correction from 4/11
@@ -24,8 +28,7 @@
 struct _rho_nu_single {
     double * loga;
     double * rhonu;
-    gsl_interp * interp;
-    gsl_interp_accel * acc;
+    boost::math::interpolators::barycentric_rational<double>* interp;
     /*Neutrino mass for this structure*/
     double mnu;
 };

From 8a338409ebb0a1691f4808cb661585715a1bfcfd Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Thu, 3 Oct 2024 20:55:47 -0700
Subject: [PATCH 050/120] pmesh: replace GSL RNG with Boost mt19937

---
 libgenic/pmesh.h | 58 ++++++++++++++++--------------------------------
 1 file changed, 19 insertions(+), 39 deletions(-)

diff --git a/libgenic/pmesh.h b/libgenic/pmesh.h
index f73f533b..53c3498f 100644
--- a/libgenic/pmesh.h
+++ b/libgenic/pmesh.h
@@ -1,13 +1,10 @@
 #ifndef PMESH_H
 #define PMESH_H
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 #include <libgadget/petapm.h>
 #include <libgadget/utils.h>
 
-/*
- * The following functions are from fastpm/libfastpm/initialcondition.c.
- * Agrees with nbodykit's pmesh/whitenoise.c, which agrees with n-genic.
- * */
 typedef struct {
     struct {
         ptrdiff_t start[3];
@@ -19,9 +16,10 @@ typedef struct {
 } PMDesc;
 
 static inline void
-SETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, gsl_rng * rng)
+SETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, boost::random::mt19937 & rng)
 {
-    unsigned int seed = 0x7fffffff * gsl_rng_uniform(rng);
+    boost::random::uniform_real_distribution<double> dist(0, 1);
+    unsigned int seed = static_cast<unsigned int>(0x7fffffff * dist(rng));
 
     int ii[2] = {i, (pm->Nmesh[0] - i) % pm->Nmesh[0]};
     int jj[2] = {j, (pm->Nmesh[1] - j) % pm->Nmesh[1]};
@@ -41,6 +39,7 @@ SETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, gsl_rng * rng)
         }
     }
 }
+
 static inline unsigned int
 GETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, int d1, int d2)
 {
@@ -54,11 +53,12 @@ GETSEED(PMDesc * pm, unsigned int * table[2][2], int i, int j, int d1, int d2)
 }
 
 static void
-SAMPLE(gsl_rng * rng, double * ampl, double * phase)
+SAMPLE(boost::random::mt19937 & rng, double * ampl, double * phase)
 {
-    *phase = gsl_rng_uniform(rng) * 2 * M_PI;
+    boost::random::uniform_real_distribution<double> dist(0, 1);
+    *phase = dist(rng) * 2 * M_PI;
     *ampl = 0;
-    do *ampl = gsl_rng_uniform(rng); while(*ampl == 0);
+    do *ampl = dist(rng); while(*ampl == 0);
 }
 
 static void
@@ -68,8 +68,8 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
     int d;
     int i, j, k;
 
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-    gsl_rng_set(rng, seed);
+    // Initialize the Boost RNG
+    boost::random::mt19937 rng(seed);
 
     unsigned int * seedtable[2][2];
     for(i = 0; i < 2; i ++)
@@ -88,15 +88,13 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
         for(j = 0; j < i; j++) SETSEED(pm, seedtable, pm->Nmesh[0] - 1 - i, pm->Nmesh[1] - 1 - j, rng);
         for(j = 0; j < i + 1; j++) SETSEED(pm, seedtable, pm->Nmesh[1] - 1 - j, pm->Nmesh[0] - 1 - i, rng);
     }
-    gsl_rng_free(rng);
 
     ptrdiff_t irel[3];
     for(i = pm->ORegion.start[0];
         i < pm->ORegion.start[0] + pm->ORegion.size[0];
         i ++) {
 
-        gsl_rng * lower_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-        gsl_rng * this_rng = gsl_rng_alloc(gsl_rng_ranlxd1);
+        boost::random::mt19937 lower_rng, this_rng;
 
         int ci = pm->Nmesh[0] - i;
         if(ci >= pm->Nmesh[0]) ci -= pm->Nmesh[0];
@@ -104,15 +102,10 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
         for(j = pm->ORegion.start[1];
             j < pm->ORegion.start[1] + pm->ORegion.size[1];
             j ++) {
-            /* always pull the gaussian from the lower quadrant plane for k = 0
-             * plane*/
-            /* always pull the whitenoise from the lower quadrant plane for k = 0
-             * plane and k == All.Nmesh / 2 plane*/
             int d1 = 0, d2 = 0;
             int cj = pm->Nmesh[1] - j;
             if(cj >= pm->Nmesh[1]) cj -= pm->Nmesh[1];
 
-            /* d1, d2 points to the conjugate quandrant */
             if( (ci == i && cj < j)
              || (ci < i && cj != j)
              || (ci < i && cj == j)) {
@@ -121,20 +114,17 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
             }
 
             unsigned int seed_conj, seed_this;
-            /* the lower quadrant generator */
             seed_conj = GETSEED(pm, seedtable, i, j, d1, d2);
-            gsl_rng_set(lower_rng, seed_conj);
+            lower_rng.seed(seed_conj);
 
             seed_this = GETSEED(pm, seedtable, i, j, 0, 0);
-            gsl_rng_set(this_rng, seed_this);
+            this_rng.seed(seed_this);
 
             for(k = 0; k <= pm->Nmesh[2] / 2; k ++) {
                 int use_conj = (d1 != 0 || d2 != 0) && (k == 0 || k == pm->Nmesh[2] / 2);
 
                 double ampl, phase;
                 if(use_conj) {
-                    /* on k = 0 and All.Nmesh/2 plane, we use the lower quadrant generator,
-                     * then hermit transform the result if it is nessessary */
                     SAMPLE(this_rng, &ampl, &phase);
                     SAMPLE(lower_rng, &ampl, &phase);
                 } else {
@@ -152,14 +142,12 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
                 if(irel[2] < 0) continue;
                 if(irel[2] >= pm->ORegion.size[2]) continue;
 
-                /* we want two numbers that are of std ~ 1/sqrt(2) */
                 ampl = sqrt(- log(ampl));
 
-                if (setUnitaryAmplitude) ampl = 1.0; /* cos and sin gives 1/sqrt(2)*/
-
+                if (setUnitaryAmplitude) ampl = 1.0;
 
                 if (setInvertPhase){
-                  phase += M_PI; /*invert phase*/
+                  phase += M_PI;
                 }
 
                 (delta_k + 2 * ip)[0] = ampl * cos(phase);
@@ -172,29 +160,21 @@ pmic_fill_gaussian_gadget(PMDesc * pm, double * delta_k, int seed, int setUnitar
                 if((pm->Nmesh[0] - iabs[0]) % pm->Nmesh[0] == iabs[0] &&
                    (pm->Nmesh[1] - iabs[1]) % pm->Nmesh[1] == iabs[1] &&
                    (pm->Nmesh[2] - iabs[2]) % pm->Nmesh[2] == iabs[2]) {
-                    /* The mode is self conjuguate, thus imaginary mode must be zero */
                     (delta_k + 2 * ip)[1] = 0;
                     (delta_k + 2 * ip)[0] = ampl * cos(phase);
                 }
 
                 if(iabs[0] == 0 && iabs[1] == 0 && iabs[2] == 0) {
-                    /* the mean is zero */
                     (delta_k + 2 * ip)[0] = 0;
                     (delta_k + 2 * ip)[1] = 0;
                 }
             }
         }
-        gsl_rng_free(lower_rng);
-        gsl_rng_free(this_rng);
     }
     for(i = 1; i >= 0; i --)
     for(j = 1; j >= 0; j --) {
         myfree(seedtable[i][j]);
     }
-/*
-    char * fn[1000];
-    sprintf(fn, "canvas.dump.f4.%d", pm->ThisTask);
-    fwrite(pm->canvas, sizeof(pm->canvas[0]), pm->ORegion.total * 2, fopen(fn, "w"));
-*/
 }
-#endif
+
+#endif
\ No newline at end of file

From 4ff91743fb743e9cad2ba7cad7d70a593ec2cb3c Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Thu, 3 Oct 2024 22:42:33 -0700
Subject: [PATCH 051/120] system: fill random table with Boost mt19937 instead of GSL RNG

---
 libgadget/utils/system.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libgadget/utils/system.c b/libgadget/utils/system.c
index 359b5672..49a58100 100644
--- a/libgadget/utils/system.c
+++ b/libgadget/utils/system.c
@@ -11,7 +11,8 @@
 #include <sys/resource.h>
 #include <unistd.h>
 #include <signal.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 #include <omp.h>
 
 #define __UTILS_SYSTEM_C
@@ -66,15 +67,14 @@ RandTable set_random_numbers(uint64_t seed, const size_t rndtablesize)
     rnd.Table = (double *) mymalloc2("Random", rndtablesize * sizeof(double));
     rnd.tablesize = rndtablesize;
     /* start-up seed */
-    gsl_rng * random_generator = gsl_rng_alloc(gsl_rng_ranlxd1);
-    gsl_rng_set(random_generator, seed);
 
+    boost::random::mt19937 random_generator(seed);
+    boost::random::uniform_real_distribution<double> dist(0, 1);
     /* Populate a table with uniform random numbers between 0 and 1*/
     size_t i;
     for(i = 0; i < rndtablesize; i++)
-        rnd.Table[i] = gsl_rng_uniform(random_generator);
+        rnd.Table[i] = dist(random_generator);
 
-    gsl_rng_free(random_generator);
     return rnd;
 }
 

From 243dd9a2854c76b806f2a4a3103902e0ed3971bc Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Fri, 4 Oct 2024 14:33:33 -0700
Subject: [PATCH 052/120] gsl err ode

---
 gadget/main.c         |   9 ----
 libgadget/cosmology.c | 104 ++++++++++++++++++++++++++++--------------
 libgenic/glass.c      |  13 +++---
 3 files changed, 76 insertions(+), 50 deletions(-)

diff --git a/gadget/main.c b/gadget/main.c
index f3e84eee..1cf28fa0 100644
--- a/gadget/main.c
+++ b/gadget/main.c
@@ -6,7 +6,6 @@
 #include <sys/resource.h>
 #include <unistd.h>
 #include <math.h>
-#include <gsl/gsl_errno.h>
 #include <omp.h>
 
 #include <libgadget/slotsmanager.h>
@@ -21,11 +20,6 @@
 
 #include "params.h"
 
-void gsl_handler (const char * reason, const char * file, int line, int gsl_errno)
-{
-    endrun(2001,"GSL_ERROR in file: %s, line %d, errno:%d, error: %s\n",file, line, gsl_errno, reason);
-}
-
 /*! \file main.c
  *  \brief start of the program
  */
@@ -107,9 +101,6 @@ int main(int argc, char **argv)
         endrun(0, "Need to give the snapshot number if FOF is selected for output\n");
     }
 
-    /*Set up GSL so it gives a proper MPI termination*/
-    gsl_set_error_handler(gsl_handler);
-
     /*Initialize the memory manager*/
     mymalloc_init(MaxMemSizePerNode);
 
diff --git a/libgadget/cosmology.c b/libgadget/cosmology.c
index 61a8840d..8eb4c8d9 100644
--- a/libgadget/cosmology.c
+++ b/libgadget/cosmology.c
@@ -1,7 +1,5 @@
 #include <math.h>
-#include <gsl/gsl_errno.h>
-#include <gsl/gsl_odeiv2.h>
-
+#include <boost/numeric/odeint.hpp>
 #include "cosmology.h"
 #include "physconst.h"
 #include "utils.h"
@@ -107,6 +105,24 @@ int growth_ode(double a, const double yy[], double dyda[], void * params)
     return GSL_SUCCESS;
 }
 
+// Define the ODE system for the growth factor
+void growth_ode(const std::vector<double> &yy, std::vector<double> &dyda, double a, void * params)
+{
+    Cosmology * CP = (Cosmology *) params;
+    const double hub = hubble_function(CP, a) / CP->Hubble;
+
+    dyda[0] = yy[1] / pow(a, 3) / hub;
+    /*Only use gravitating part*/
+    /* Note: we do not include neutrinos
+     * here as they are free-streaming at the initial time.
+     * This is not right if our box is very large and thus overlaps
+     * with their free-streaming scale. In that case the growth factor will be scale-dependent
+     * and we need to numerically differentiate. In practice the box will either be larger
+     * than the horizon, and so need radiation perturbations, or the neutrino
+     * mass will be larger than current constraints allow, so we just warn for now.*/
+    dyda[1] = yy[0] * 1.5 * a * (CP->OmegaCDM + CP->OmegaBaryon) / (a * a * a) / hub;
+}
+
 /** The growth function is given as a 2nd order DE in Peacock 1999, Cosmological Physics.
  * D'' + a'/a D' - 1.5 * (a'/a)^2 D = 0
  * 1/a (a D')' - 1.5 (a'/a)^2 D
@@ -114,39 +130,59 @@ int growth_ode(double a, const double yy[], double dyda[], void * params)
  * Define F = a^3 H dD/da
  * and we have: dF/da = 1.5 a H D
  */
-double growth(Cosmology * CP, double a, double * dDda)
+
+double growth(Cosmology *CP, double a, double *dDda)
 {
-  gsl_odeiv2_system FF;
-  FF.function = &growth_ode;
-  FF.jacobian = NULL;
-  FF.params = CP;
-  FF.dimension = 2;
-  gsl_odeiv2_driver * drive = gsl_odeiv2_driver_alloc_standard_new(&FF,gsl_odeiv2_step_rkf45, 1e-5, 1e-8,1e-8,1,1);
-   /* We start early to avoid lambda.*/
-  double curtime = 1e-5;
-  /* Handle even earlier times*/
-  if(a < curtime)
-      curtime = a / 10;
-  /* Initial velocity chosen so that D = Omegar + 3/2 Omega_m a,
-   * the solution for a matter/radiation universe.*
-   * Note the normalisation of D is arbitrary
-   * and never seen outside this function.*/
-  double yinit[2] = {1.5 * (CP->OmegaCDM + CP->OmegaBaryon)/(curtime*curtime), pow(curtime,3)*hubble_function(CP, curtime)/CP->Hubble * 1.5 * (CP->OmegaCDM + CP->OmegaBaryon)/(curtime*curtime*curtime)};
-  if(CP->RadiationOn)
-      yinit[0] += CP->OmegaG/pow(curtime, 4)+get_omega_nu(&CP->ONu, curtime);
-
-  int stat = gsl_odeiv2_driver_apply(drive, &curtime,a, yinit);
-  if (stat != GSL_SUCCESS) {
-      endrun(1,"gsl_odeiv in growth: %d. Result at %g is %g %g\n",stat, curtime, yinit[0], yinit[1]);
-  }
-  gsl_odeiv2_driver_free(drive);
-  /*Store derivative of D if needed.*/
-  if(dDda) {
-      *dDda = yinit[1]/pow(a,3)/(hubble_function(CP, a)/CP->Hubble);
-  }
-  return yinit[0];
-}
+    using namespace boost::numeric::odeint;
+
+    // Define a default start time (scale factor)
+    double curtime = 1e-5;
+
+    // Adjust `curtime` if `a` is smaller than the default
+    if (a < curtime) {
+        curtime = a / 10.0;  // Ensure `curtime` is smaller than the target `a`
+    }
+
+    // Initial conditions for the growth factor
+    std::vector<double> yinit(2);
+
+    // Initial conditions at curtime: [D(curtime), F(curtime)], where F = a^3 (H / CP->Hubble) dD/da
+    yinit[0] = 1.5 * (CP->OmegaCDM + CP->OmegaBaryon) / (curtime * curtime);  
+    yinit[1] = pow(curtime, 3) * hubble_function(CP, curtime) / CP->Hubble *
+               1.5 * (CP->OmegaCDM + CP->OmegaBaryon) / (curtime * curtime * curtime); 
+
+    // Include radiation if enabled
+    if (CP->RadiationOn) {
+        yinit[0] += CP->OmegaG / pow(curtime, 4) + get_omega_nu(&CP->ONu, curtime);
+    }
 
+    // Define the ODE system (as a lambda function)
+    auto growth_system = [&CP](const std::vector<double> &yy, std::vector<double> &dyda, double a) {
+        growth_ode(yy, dyda, a, CP);
+    };
+
+    // Use Boost's adaptive Cash-Karp RK5(4) stepper (replaces the GSL rkf45 stepper used previously)
+    runge_kutta_cash_karp54<std::vector<double>> stepper;
+    double abs_error = 1e-8;
+    double rel_error = 1e-8;
+    double step_size = 1e-5;
+
+    try {
+        // Integrate the ODE from the start time `curtime` to the target scale factor `a`
+        integrate_adaptive(make_controlled(abs_error, rel_error, stepper),
+                           growth_system, yinit, curtime, a, step_size);
+    } catch (...) {
+        endrun(1, "Boost ODE solver failed during integration\n");
+    }
+
+    // If the derivative is needed, store it in dDda
+    if (dDda) {
+        *dDda = yinit[1] / pow(a, 3) / (hubble_function(CP, a) / CP->Hubble);
+    }
+
+    // Return the growth factor D(a)
+    return yinit[0];
+}
 /*
  * This is the Zeldovich approximation prefactor,
  * f1 = d ln D1 / dlna = a / D (dD/da)
diff --git a/libgenic/glass.c b/libgenic/glass.c
index 5d8e6bbd..e9925016 100644
--- a/libgenic/glass.c
+++ b/libgenic/glass.c
@@ -4,7 +4,8 @@
 #include <string.h>
 #include <omp.h>
 
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 
 #include "allvars.h"
 #include "proto.h"
@@ -40,29 +41,27 @@ static void glass_stats(struct ic_part_data * ICP, int NumPart);
 int
 setup_glass(IDGenerator * idgen, PetaPM * pm, double shift, int seed, double mass, struct ic_part_data * ICP, const double UnitLength_in_cm, const char * OutputDir)
 {
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_ranlxd1);
     int ThisTask;
     MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-    gsl_rng_set(rng, seed + ThisTask);
+    boost::random::mt19937 rng(seed + ThisTask);
     memset(ICP, 0, idgen->NumPart*sizeof(struct ic_part_data));
+    boost::random::uniform_real_distribution<double> dist(0, 1);
 
     int i;
     /* Note: this loop should nto be omp because
-     * of the call to gsl_rng_uniform*/
+     * of the calls to dist(rng), which advance the single shared generator*/
     for(i = 0; i < idgen->NumPart; i ++) {
         int k;
         idgen_create_pos_from_index(idgen, i, &ICP[i].Pos[0]);
         /* a spread of 3 will kill most of the grid anisotropy structure;
          * and still being local */
         for(k = 0; k < 3; k++) {
-            double rand = idgen->BoxSize / idgen->Ngrid * 3 * (gsl_rng_uniform(rng) - 0.5);
+            double rand = idgen->BoxSize / idgen->Ngrid * 3 * (dist(rng) - 0.5);
             ICP[i].Pos[k] += shift + rand;
         }
         ICP[i].Mass = mass;
     }
 
-    gsl_rng_free(rng);
-
     char * fn = fastpm_strdup_printf("powerspectrum-glass-%08X", seed);
     glass_evolve(pm, 14, fn, ICP, idgen->NumPart, UnitLength_in_cm, OutputDir);
     myfree(fn);

From 2f926b65d16cb611649d639141d88eadf9e90c78 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Fri, 4 Oct 2024 14:44:46 -0700
Subject: [PATCH 053/120] boost bessel

---
 libgadget/cosmology.c     | 17 -----------------
 libgadget/neutrinos_lra.c |  7 ++++---
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/libgadget/cosmology.c b/libgadget/cosmology.c
index 8eb4c8d9..04c9223a 100644
--- a/libgadget/cosmology.c
+++ b/libgadget/cosmology.c
@@ -88,23 +88,6 @@ double GrowthFactor(Cosmology * CP, double astart, double aend)
     return growth(CP, astart, NULL) / growth(CP, aend, NULL);
 }
 
-int growth_ode(double a, const double yy[], double dyda[], void * params)
-{
-    Cosmology * CP = (Cosmology *) params;
-    const double hub = hubble_function(CP, a)/CP->Hubble;
-    dyda[0] = yy[1]/pow(a,3)/hub;
-    /*Only use gravitating part*/
-    /* Note: we do not include neutrinos
-     * here as they are free-streaming at the initial time.
-     * This is not right if our box is very large and thus overlaps
-     * with their free-streaming scale. In that case the growth factor will be scale-dependent
-     * and we need to numerically differentiate. In practice the box will either be larger
-     * than the horizon, and so need radiation perturbations, or the neutrino
-     * mass will be larger than current constraints allow, so we just warn for now.*/
-    dyda[1] = yy[0] * 1.5 * a * (CP->OmegaCDM + CP->OmegaBaryon)/(a*a*a) / hub;
-    return GSL_SUCCESS;
-}
-
 // Define the ODE system for the growth factor
 void growth_ode(const std::vector<double> &yy, std::vector<double> &dyda, double a, void * params)
 {
diff --git a/libgadget/neutrinos_lra.c b/libgadget/neutrinos_lra.c
index 8969ccb0..5af9b3e3 100644
--- a/libgadget/neutrinos_lra.c
+++ b/libgadget/neutrinos_lra.c
@@ -9,9 +9,8 @@
 #include <math.h>
 #include <string.h>
 #include <bigfile-mpi.h>
-#include <gsl/gsl_errno.h>
 #include <boost/math/interpolators/barycentric_rational.hpp>
-#include <gsl/gsl_sf_bessel.h>
+#include <boost/math/special_functions/bessel.hpp>
 
 #include "neutrinos_lra.h"
 
@@ -583,7 +582,9 @@ static inline double specialJ_fit(const double x)
 /*Asymptotic series expansion from YAH. Not good when qc * x is small, but fine otherwise.*/
 static inline double II(const double x, const double qc, const int n)
 {
-    return (n*n+n*n*n*qc+n*qc*x*x - x*x)* qc*gsl_sf_bessel_j0(qc*x) + (2*n+n*n*qc+qc*x*x)*cos(qc*x);
+    /* gsl_sf_bessel_j0 is the spherical j0(z) = sin(z)/z, so the Boost equivalent is sph_bessel, not cyl_bessel_j */
+    return (n*n+n*n*n*qc+n*qc*x*x - x*x) * qc * boost::math::sph_bessel(0, qc * x)
+           + (2 * n + n * n * qc + qc * x * x) * cos(qc * x);
 }
 
 /* Fourier transform of truncated Fermi Dirac distribution, with support on q > qc only.

From bb53dfa9af1df674a6be3cbc3a550f93d2b04f2d Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Fri, 4 Oct 2024 20:44:33 -0700
Subject: [PATCH 054/120] test to boost

---
 libgadget/tests/test_cosmology.c |  6 +++---
 libgadget/tests/test_density.c   | 19 ++++++++++---------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/libgadget/tests/test_cosmology.c b/libgadget/tests/test_cosmology.c
index b7594cd5..a6d05f96 100644
--- a/libgadget/tests/test_cosmology.c
+++ b/libgadget/tests/test_cosmology.c
@@ -8,7 +8,7 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
-#include <gsl/gsl_sf_hyperg.h>
+#include <boost/math/special_functions/hypergeometric_2f1.hpp>
 #include <libgadget/physconst.h>
 #include <libgadget/cosmology.h>
 #include "stub.h"
@@ -51,8 +51,8 @@ static inline double radgrow(double aa, double omegar) {
 
 //Omega_L + Omega_M = 1 => D+ ~ Gauss hypergeometric function
 static inline double growth(double aa, double omegam) {
-    double omegal = 1-omegam;
-    return aa * gsl_sf_hyperg_2F1(1./3, 1, 11./6, -omegal/omegam*pow(aa,3));
+    double omegal = 1 - omegam;
+    return aa * boost::math::hypergeometric_2f1(1./3, 1, 11./6, -omegal/omegam * pow(aa, 3));
 }
 
 static void test_cosmology(void ** state)
diff --git a/libgadget/tests/test_density.c b/libgadget/tests/test_density.c
index 176d4de0..e33436e3 100644
--- a/libgadget/tests/test_density.c
+++ b/libgadget/tests/test_density.c
@@ -8,7 +8,8 @@
 #include <mpi.h>
 #include <stdio.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 
 #include <libgadget/partmanager.h>
 #include <libgadget/walltime.h>
@@ -28,7 +29,7 @@ struct density_testdata
     struct sph_pred_data sph_pred;
     DomainDecomp ddecomp;
     struct density_params dp;
-    gsl_rng * r;
+    boost::random::mt19937 r;
 };
 
 /* Perform some simple checks on the densities*/
@@ -204,8 +205,9 @@ static void test_density_close(void ** state) {
     do_density_test(state, numpart, 0.131726, 1e-4);
 }
 
-void do_random_test(void **state, gsl_rng * r, const int numpart)
+void do_random_test(void **state, boost::random::mt19937 &r, const int numpart)
 {
+    boost::random::uniform_real_distribution<double> dist(0.0, 1.0);
     /* Create a randomly space set of particles, 8x8x8, all of type 0. */
     int i;
     for(i=0; i<numpart/4; i++) {
@@ -215,7 +217,7 @@ void do_random_test(void **state, gsl_rng * r, const int numpart)
 
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize * gsl_rng_uniform(r);
+            P[i].Pos[j] = PartManager->BoxSize * dist(r);
     }
     for(i=numpart/4; i<3*numpart/4; i++) {
         P[i].Type = 0;
@@ -223,7 +225,7 @@ void do_random_test(void **state, gsl_rng * r, const int numpart)
         P[i].Hsml = PartManager->BoxSize/cbrt(numpart);
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(dist(r)-0.5,2));
     }
     for(i=3*numpart/4; i<numpart; i++) {
         P[i].Type = 0;
@@ -231,7 +233,7 @@ void do_random_test(void **state, gsl_rng * r, const int numpart)
         P[i].Hsml = PartManager->BoxSize/cbrt(numpart);
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(dist(r)-0.5,2));
     }
     do_density_test(state, numpart, 0.187515, 1e-3);
 }
@@ -240,7 +242,7 @@ static void test_density_random(void ** state) {
     /*Set up the particle data*/
     int ncbrt = 32;
     struct density_testdata * data = * (struct density_testdata **) state;
-    gsl_rng * r = (gsl_rng *) data->r;
+    boost::random::mt19937 &r = data->r; /* bind to the stored generator; a cast here would yield a temporary */
     int numpart = ncbrt*ncbrt*ncbrt;
     /*Allocate tree*/
     /*Base pointer*/
@@ -324,8 +326,7 @@ static int setup_density(void **state) {
     data->dp.BlackHoleMaxAccretionRadius = 99999.;
 
     set_densitypar(data->dp);
-    data->r = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(data->r, 0);
+    data->r = boost::random::mt19937(0);
     *state = (void *) data;
     return 0;
 }

From 10483b5f8a2180b6907571f19571ff452026fbf9 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Fri, 4 Oct 2024 21:23:41 -0700
Subject: [PATCH 055/120] test all to boost

---
 libgadget/tests/test_density.c         |  1 -
 libgadget/tests/test_exchange.c        |  1 -
 libgadget/tests/test_fof.c             | 10 +++++-----
 libgadget/tests/test_forcetree.c       | 19 ++++++++++---------
 libgadget/tests/test_gravity.c         | 19 ++++++++++---------
 libgadget/tests/test_omega_nu_single.c | 13 +++++--------
 libgadget/tests/test_peano.c           |  1 -
 libgadget/tests/test_slotsmanager.c    |  1 -
 libgadget/tests/test_timefac.c         | 14 ++++++--------
 9 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/libgadget/tests/test_density.c b/libgadget/tests/test_density.c
index e33436e3..a716ec36 100644
--- a/libgadget/tests/test_density.c
+++ b/libgadget/tests/test_density.c
@@ -282,7 +282,6 @@ static int teardown_density(void **state) {
     myfree(data->ddecomp.Tasks);
     myfree(data->ddecomp.TopLeaves);
     myfree(data->ddecomp.TopNodes);
-    free(data->r);
     myfree(data);
     return 0;
 }
diff --git a/libgadget/tests/test_exchange.c b/libgadget/tests/test_exchange.c
index dc6de144..5cdc4616 100644
--- a/libgadget/tests/test_exchange.c
+++ b/libgadget/tests/test_exchange.c
@@ -9,7 +9,6 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
 
 #define qsort_openmp qsort
 
diff --git a/libgadget/tests/test_fof.c b/libgadget/tests/test_fof.c
index 1a5f5852..45f1e72d 100644
--- a/libgadget/tests/test_fof.c
+++ b/libgadget/tests/test_fof.c
@@ -9,7 +9,8 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 
 #define qsort_openmp qsort
 
@@ -30,8 +31,8 @@ setup_particles(int NumPart, double BoxSize)
     int ThisTask;
     MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
 
-    gsl_rng * r = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(r, 0);
+    boost::random::mt19937 r(0);
+    boost::random::uniform_real_distribution<double> dist(0, 1);
 
     particle_alloc_memory(PartManager, BoxSize, 1.5 * NumPart);
     PartManager->NumPart = NumPart;
@@ -46,10 +47,9 @@ setup_particles(int NumPart, double BoxSize)
         P[i].IsGarbage = 0;
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = BoxSize * gsl_rng_uniform(r);
+            P[i].Pos[j] = BoxSize * dist(r);
     }
 
-    gsl_rng_free(r);
     /* TODO: Here create particles in some halo-like configuration*/
 
     return 0;
diff --git a/libgadget/tests/test_forcetree.c b/libgadget/tests/test_forcetree.c
index 6ae7190d..d13f2163 100644
--- a/libgadget/tests/test_forcetree.c
+++ b/libgadget/tests/test_forcetree.c
@@ -10,7 +10,8 @@
 #include <stdio.h>
 #include <time.h>
 #include <omp.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 
 #include <libgadget/forcetree.h>
 #include <libgadget/partmanager.h>
@@ -23,7 +24,7 @@
 struct forcetree_testdata
 {
     DomainDecomp ddecomp;
-    gsl_rng * r;
+    boost::random::mt19937 r;
 };
 
 #define NODECACHE_SIZE 100
@@ -355,8 +356,9 @@ static void test_rebuild_close(void ** state) {
     myfree(PartManager->Base);
 }
 
-void do_random_test(gsl_rng * r, const int numpart, const ForceTree tb, DomainDecomp * ddecomp)
+void do_random_test(boost::random::mt19937 & r, const int numpart, const ForceTree tb, DomainDecomp * ddecomp)
 {
+    boost::random::uniform_real_distribution<double> dist(0, 1);
     /* Create a regular grid of particles, 8x8x8, all of type 1,
      * in a box 8 kpc across.*/
     int i;
@@ -364,19 +366,19 @@ void do_random_test(gsl_rng * r, const int numpart, const ForceTree tb, DomainDe
         P[i].Type = 1;
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize * gsl_rng_uniform(r);
+            P[i].Pos[j] = PartManager->BoxSize * dist(r);
     }
     for(i=numpart/4; i<3*numpart/4; i++) {
         P[i].Type = 1;
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(dist(r)-0.5,2));
     }
     for(i=3*numpart/4; i<numpart; i++) {
         P[i].Type = 1;
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(dist(r)-0.5,2));
     }
     PartManager->NumPart = numpart;
     do_tree_test(numpart, tb, ddecomp);
@@ -387,7 +389,7 @@ static void test_rebuild_random(void ** state) {
     int ncbrt = 64;
     struct forcetree_testdata * data = * (struct forcetree_testdata **) state;
     DomainDecomp ddecomp = data->ddecomp;
-    gsl_rng * r = (gsl_rng *) data->r;
+    boost::random::mt19937 & r = data->r;
     int numpart = ncbrt*ncbrt*ncbrt;
     particle_alloc_memory(PartManager, 8, numpart);
     /*Allocate tree*/
@@ -442,8 +444,7 @@ static int setup_tree(void **state) {
     /*Set up the top-level domain grid*/
     struct forcetree_testdata *data = malloc(sizeof(struct forcetree_testdata));
     trivial_domain(&data->ddecomp);
-    data->r = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(data->r, 0);
+    data->r = boost::random::mt19937(0);
     *state = (void *) data;
     walltime_init(&Clocks);
     return 0;
diff --git a/libgadget/tests/test_gravity.c b/libgadget/tests/test_gravity.c
index d38e9bbb..bb0e40da 100644
--- a/libgadget/tests/test_gravity.c
+++ b/libgadget/tests/test_gravity.c
@@ -9,7 +9,8 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 #include <omp.h>
 
 #include "stub.h"
@@ -30,7 +31,7 @@ static struct ClockTable CT;
 /* The true struct for the state variable*/
 struct forcetree_testdata
 {
-    gsl_rng * r;
+    boost::random::mt19937 r;
 };
 static const double G = 43.0071;
 
@@ -280,25 +281,26 @@ static void test_force_close(void ** state) {
     myfree(P);
 }
 
-void do_random_test(gsl_rng * r, const int numpart)
+void do_random_test(boost::random::mt19937 & r, const int numpart)
 {
+    boost::random::uniform_real_distribution<double> dist(0, 1);
     /* Create a regular grid of particles, 8x8x8, all of type 1,
      * in a box 8 kpc across.*/
     int i;
     for(i=0; i<numpart/4; i++) {
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize * gsl_rng_uniform(r);
+            P[i].Pos[j] = PartManager->BoxSize * dist(r);
     }
     for(i=numpart/4; i<3*numpart/4; i++) {
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize/2 + PartManager->BoxSize/8 * exp(pow(dist(r)-0.5,2));
     }
     for(i=3*numpart/4; i<numpart; i++) {
         int j;
         for(j=0; j<3; j++)
-            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(gsl_rng_uniform(r)-0.5,2));
+            P[i].Pos[j] = PartManager->BoxSize*0.1 + PartManager->BoxSize/32 * exp(pow(dist(r)-0.5,2));
     }
     PartManager->NumPart = numpart;
     do_force_test(48, 1.5, 0.002, 1);
@@ -308,7 +310,7 @@ static void test_force_random(void ** state) {
     /*Set up the particle data*/
     int numpart = PartManager->NumPart;
     struct forcetree_testdata * data = * (struct forcetree_testdata **) state;
-    gsl_rng * r = data->r;
+    boost::random::mt19937 & r = data->r;
     particle_alloc_memory(PartManager, 8, numpart);
     int i;
     for(i=0; i<2; i++) {
@@ -334,8 +336,7 @@ static int setup_tree(void **state) {
     init_forcetree_params(0.7);
     /*Set up the top-level domain grid*/
     struct forcetree_testdata *data = malloc(sizeof(struct forcetree_testdata));
-    data->r = gsl_rng_alloc(gsl_rng_mt19937);
-    gsl_rng_set(data->r, 0);
+    data->r = boost::random::mt19937(0);
     *state = (void *) data;
     return 0;
 }
diff --git a/libgadget/tests/test_omega_nu_single.c b/libgadget/tests/test_omega_nu_single.c
index 7357cf5a..683d6f2b 100644
--- a/libgadget/tests/test_omega_nu_single.c
+++ b/libgadget/tests/test_omega_nu_single.c
@@ -4,10 +4,10 @@
 #include <cmocka.h>
 #include <stdio.h>
 #include <math.h>
-#include <gsl/gsl_integration.h>
 #include "stub.h"
 #include "../omega_nu_single.h"
 #include "../physconst.h"
+#include "../timefac.h"
 
 #define  T_CMB0      2.7255	/* present-day CMB temperature, from Fixsen 2009 */
 
@@ -33,7 +33,6 @@ static void test_rho_nu_init(void **state) {
 /*Check massless neutrinos work*/
 #define STEFAN_BOLTZMANN 5.670373e-5
 #define OMEGAR (4*STEFAN_BOLTZMANN*8*M_PI*GRAVITY/(3*LIGHTCGS*LIGHTCGS*LIGHTCGS*HUBBLE*HUBBLE*HubbleParam*HubbleParam)*pow(T_CMB0,4))
-#define GSL_VAL 200
 
 
 /* Check that the table gives the right answer. */
@@ -76,17 +75,15 @@ double rho_nu_int(double q, void * params);
 
 double do_exact_rho_nu_integration(double a, double mnu, double rhocrit)
 {
-    gsl_function F;
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (GSL_VAL);
     double abserr;
-    F.function = &rho_nu_int;
     double kTnu = BOLEVK*TNUCMB*T_CMB0;
     double param[2] = {mnu * a, kTnu};
-    F.params = &param;
+    auto integrand = [&param](double q) { /* defined after param so the capture names a declared variable */
+        return rho_nu_int(q, (void*) &param);
+    };
     double result;
-    gsl_integration_qag (&F, 0, 500*kTnu,0 , 1e-9,GSL_VAL,6,w,&result, &abserr);
+    result = tanh_sinh_integrate_adaptive(integrand, 0, 500*kTnu, &abserr, 1e-9);
     result*=get_rho_nu_conversion()/pow(a,4)/rhocrit;
-    gsl_integration_workspace_free (w);
     return result;
 }
 
diff --git a/libgadget/tests/test_peano.c b/libgadget/tests/test_peano.c
index 049009ce..8f78fd00 100644
--- a/libgadget/tests/test_peano.c
+++ b/libgadget/tests/test_peano.c
@@ -7,7 +7,6 @@
 #include <math.h>
 #include <mpi.h>
 #include <stdio.h>
-#include <gsl/gsl_rng.h>
 
 #include <libgadget/utils/peano.h>
 #include "stub.h"
diff --git a/libgadget/tests/test_slotsmanager.c b/libgadget/tests/test_slotsmanager.c
index 21f54e9f..f6d806e4 100644
--- a/libgadget/tests/test_slotsmanager.c
+++ b/libgadget/tests/test_slotsmanager.c
@@ -7,7 +7,6 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <gsl/gsl_rng.h>
 
 #include "stub.h"
 
diff --git a/libgadget/tests/test_timefac.c b/libgadget/tests/test_timefac.c
index a983b6fb..7ab23b9e 100644
--- a/libgadget/tests/test_timefac.c
+++ b/libgadget/tests/test_timefac.c
@@ -7,7 +7,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
-#include <gsl/gsl_integration.h>
+#include "../timefac.h"
 #include <stdint.h>
 
 #include "stub.h"
@@ -64,14 +64,12 @@ static inline inttime_t get_ti(double aa)
 double exact_drift_factor(Cosmology * CP, double a1, double a2, int exp)
 {
     double result, abserr;
-    gsl_function F;
-    gsl_integration_workspace *workspace;
-    workspace = gsl_integration_workspace_alloc(10000);
-    F.function = &fac_integ;
+    
     struct fac_params ff = {CP, exp};
-    F.params = &ff;
-    gsl_integration_qag(&F, a1,a2, 0, 1.0e-8, 10000, GSL_INTEG_GAUSS61, workspace, &result, &abserr);
-    gsl_integration_workspace_free(workspace);
+    auto integrand = [&ff](double a) {
+        return fac_integ(a, (void*)&ff);
+    };
+    result = tanh_sinh_integrate_adaptive(integrand, a1, a2, &abserr, 1e-8);
     return result;
 }
 

From de78732ddd7ae078106266813ed9176fb0807d49 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Fri, 4 Oct 2024 21:57:05 -0700
Subject: [PATCH 056/120] back

---
 libgenic/thermal.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libgenic/thermal.c b/libgenic/thermal.c
index 950819cc..8f548d63 100644
--- a/libgenic/thermal.c
+++ b/libgenic/thermal.c
@@ -4,7 +4,6 @@
 /*For speed of light*/
 #include <libgadget/physconst.h>
 #include <libgadget/utils.h>
-#include <libgadget/timefac.h>
 
 /*The Boltzmann constant in units of eV/K*/
 #define BOLEVK 8.61734e-5
@@ -53,24 +52,25 @@ init_thermalvel(struct thermalvel* thermals, const double v_amp, double max_fd,c
         max_fd = MAX_FERMI_DIRAC;
     thermals->m_vamp = v_amp;
 
+    /*These functions are so smooth that we don't need much space*/
+    gsl_integration_workspace * w = gsl_integration_workspace_alloc (100);
     double abserr;
-
-    // Lambda function wrapping the Fermi-Dirac kernel
-    auto integrand = [](double x) {
-        return fermi_dirac_kernel(x, nullptr);
-    };
-
+    gsl_function F;
+    F.function = &fermi_dirac_kernel;
+    F.params = NULL;
     for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++) {
         thermals->fermi_dirac_vel[i] = min_fd+(max_fd-min_fd)* i / (LENGTH_FERMI_DIRAC_TABLE - 1.0);
-        thermals->fermi_dirac_cumprob[i] = tanh_sinh_integrate_adaptive(integrand, min_fd, thermals->fermi_dirac_vel[i], &abserr, 1e-6, 0.);
+        gsl_integration_qag (&F, min_fd, thermals->fermi_dirac_vel[i], 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(thermals->fermi_dirac_cumprob[i]), &abserr);
     //       printf("gsl_integration_qng in fermi_dirac_init_nu. Result %g, error: %g, intervals: %lu\n",fermi_dirac_cumprob[i], abserr,w->size);
     }
     /*Save the largest cum. probability, pre-normalisation,
      * divided by the total F-D probability (which is 3 Zeta(3)/2 ~ 1.8 if MAX_FERMI_DIRAC is large enough*/
     double total_fd;
-    total_fd = tanh_sinh_integrate_adaptive(integrand, 0, MAX_FERMI_DIRAC, &abserr, 1e-6, 0.);
+    gsl_integration_qag (&F, 0, MAX_FERMI_DIRAC, 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(total_fd), &abserr);
     assert(total_fd > 1.8);
 
+    gsl_integration_workspace_free (w);
+
     double total_frac = thermals->fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE-1]/total_fd;
     //Normalise total integral to unity
     for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++)

From efc0a6d9d23069bd2fb5ef5099c4768172b230c2 Mon Sep 17 00:00:00 2001
From: Yanhui Yang <yanhui.yang@email.ucr.edu>
Date: Fri, 4 Oct 2024 22:03:59 -0700
Subject: [PATCH 057/120] for conflicts

---
 libgenic/thermal.c | 120 ---------------------------------------------
 libgenic/thermal.h |  42 ----------------
 2 files changed, 162 deletions(-)
 delete mode 100644 libgenic/thermal.c
 delete mode 100644 libgenic/thermal.h

diff --git a/libgenic/thermal.c b/libgenic/thermal.c
deleted file mode 100644
index 8f548d63..00000000
--- a/libgenic/thermal.c
+++ /dev/null
@@ -1,120 +0,0 @@
-#include <gsl/gsl_integration.h>
-#include <assert.h>
-#include "thermal.h"
-/*For speed of light*/
-#include <libgadget/physconst.h>
-#include <libgadget/utils.h>
-
-/*The Boltzmann constant in units of eV/K*/
-#define BOLEVK 8.61734e-5
-
-/* This function converts the dimensionless units used in the integral to dimensionful units.
- * Unit scaling velocity for neutrinos:
- * This is an arbitrary rescaling of the unit system in the Fermi-Dirac kernel so we can integrate dimensionless quantities.
- * The true thing to integrate is:
- * q^2 /(e^(q c / kT) + 1 ) dq between 0 and q.
- * So we choose x = (q c / kT_0) and integrate between 0 and x_0.
- * The units are restored by multiplying the resulting x by kT/c for q
- * To get a v we then use q = a m v/c^2
- * to get:   v/c =x kT/(m a)*/
-/*NOTE: this m is the mass of a SINGLE neutrino species, not the sum of neutrinos!*/
-double
-NU_V0(const double Time, const double kBTNubyMNu, const double UnitVelocity_in_cm_per_s)
-{
-    return kBTNubyMNu / Time * (LIGHTCGS / UnitVelocity_in_cm_per_s);
-}
-
-//Amplitude of the random velocity for WDM
-double WDM_V0(const double Time, const double WDM_therm_mass, const double Omega_CDM, const double HubbleParam, const double UnitVelocity_in_cm_per_s)
-{
-        //Not actually sure where this equation comes from: the fiducial values are from Bode, Ostriker & Turok 2001.
-        double WDM_V0 = 0.012 / Time * pow(Omega_CDM / 0.3, 1.0 / 3) * pow(HubbleParam / 0.65, 2.0 / 3) * pow(1.0 /WDM_therm_mass,4.0 / 3);
-        WDM_V0 *= 1.0e5 / UnitVelocity_in_cm_per_s;
-        return WDM_V0;
-}
-
-/*Fermi-Dirac kernel for below*/
-static double
-fermi_dirac_kernel(double x, void * params)
-{
-  return x * x / (exp(x) + 1);
-}
-
-/*Initialise the probability tables*/
-double
-init_thermalvel(struct thermalvel* thermals, const double v_amp, double max_fd,const double min_fd)
-{
-    int i;
-    if(max_fd <= min_fd)
-        endrun(1,"Thermal vel called with negative interval: %g <= %g\n", max_fd, min_fd);
-
-    if(max_fd > MAX_FERMI_DIRAC)
-        max_fd = MAX_FERMI_DIRAC;
-    thermals->m_vamp = v_amp;
-
-    /*These functions are so smooth that we don't need much space*/
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (100);
-    double abserr;
-    gsl_function F;
-    F.function = &fermi_dirac_kernel;
-    F.params = NULL;
-    for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++) {
-        thermals->fermi_dirac_vel[i] = min_fd+(max_fd-min_fd)* i / (LENGTH_FERMI_DIRAC_TABLE - 1.0);
-        gsl_integration_qag (&F, min_fd, thermals->fermi_dirac_vel[i], 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(thermals->fermi_dirac_cumprob[i]), &abserr);
-    //       printf("gsl_integration_qng in fermi_dirac_init_nu. Result %g, error: %g, intervals: %lu\n",fermi_dirac_cumprob[i], abserr,w->size);
-    }
-    /*Save the largest cum. probability, pre-normalisation,
-     * divided by the total F-D probability (which is 3 Zeta(3)/2 ~ 1.8 if MAX_FERMI_DIRAC is large enough*/
-    double total_fd;
-    gsl_integration_qag (&F, 0, MAX_FERMI_DIRAC, 0, 1e-6,100,GSL_INTEG_GAUSS61, w,&(total_fd), &abserr);
-    assert(total_fd > 1.8);
-
-    gsl_integration_workspace_free (w);
-
-    double total_frac = thermals->fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE-1]/total_fd;
-    //Normalise total integral to unity
-    for(i = 0; i < LENGTH_FERMI_DIRAC_TABLE; i++)
-        thermals->fermi_dirac_cumprob[i] /= thermals->fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE - 1];
-
-    /*Initialise the GSL table*/
-    thermals->fd_intp = gsl_interp_alloc(gsl_interp_cspline,LENGTH_FERMI_DIRAC_TABLE);
-    thermals->fd_intp_acc = gsl_interp_accel_alloc();
-    gsl_interp_init(thermals->fd_intp,thermals->fermi_dirac_cumprob, thermals->fermi_dirac_vel,LENGTH_FERMI_DIRAC_TABLE);
-    return total_frac;
-}
-
-/*Generate a table of random seeds, one for each pencil.*/
-unsigned int *
-init_rng(int Seed, int Nmesh)
-{
-    unsigned int * seedtable = (unsigned int *) mymalloc("randseeds", Nmesh*Nmesh*sizeof(unsigned int));
-    gsl_rng * rng = gsl_rng_alloc(gsl_rng_ranlxd1);
-    gsl_rng_set(rng, Seed);
-
-    int i, j;
-    for(i = 0; i < Nmesh; i++)
-        for(j=0; j < Nmesh; j++)
-        {
-            seedtable[i+Nmesh*j] = gsl_rng_get(rng);
-        }
-    gsl_rng_free(rng);
-    return seedtable;
-}
-
-/* Add a randomly generated thermal speed in v_amp*(min_fd, max_fd) to a 3-velocity.
- * The particle Id is used as a seed for the RNG.*/
-void
-add_thermal_speeds(struct thermalvel * thermals, gsl_rng *g_rng, float Vel[])
-{
-    const double p = gsl_rng_uniform (g_rng);
-    /*m_vamp multiples by the dimensional factor to get a velocity again.*/
-    const double v = thermals->m_vamp * gsl_interp_eval(thermals->fd_intp,thermals->fermi_dirac_cumprob, thermals->fermi_dirac_vel, p, thermals->fd_intp_acc);
-
-    /*Random phase*/
-    const double phi = 2 * M_PI * gsl_rng_uniform (g_rng);
-    const double theta = acos(2 * gsl_rng_uniform (g_rng) - 1);
-
-    Vel[0] += v * sin(theta) * cos(phi);
-    Vel[1] += v * sin(theta) * sin(phi);
-    Vel[2] += v * cos(theta);
-}
\ No newline at end of file
diff --git a/libgenic/thermal.h b/libgenic/thermal.h
deleted file mode 100644
index a134ed4d..00000000
--- a/libgenic/thermal.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef THERMALVEL_H
-#define THERMALVEL_H
-
-#include <gsl/gsl_interp.h>
-#include <gsl/gsl_rng.h>
-/*Length of the table*/
-#define MAX_FERMI_DIRAC          17.0
-#define LENGTH_FERMI_DIRAC_TABLE 2000
-
-struct thermalvel
-{
-    double fermi_dirac_vel[LENGTH_FERMI_DIRAC_TABLE];
-    double fermi_dirac_cumprob[LENGTH_FERMI_DIRAC_TABLE];
-    double m_vamp;
-    gsl_interp * fd_intp;
-    gsl_interp_accel * fd_intp_acc;
-};
-
-/*Single parameter is the amplitude of the random velocities. All the physics is in here.
- * max_fd and min_fd give the maximum and minimum velocities to integrate over.
- * Note these values are dimensionless*/
-/*Returns total fraction of the Fermi-Dirac distribution between max_fd and min_fd*/
-double
-init_thermalvel(struct thermalvel * thermals, const double v_amp, double max_fd, const double min_fd);
-
-/*Add a randomly generated thermal speed in v_amp*(min_fd, max_fd) to a 3-velocity.*/
-void
-add_thermal_speeds(struct thermalvel * thermals, gsl_rng *g_rng, float Vel[]);
-
-/*Amplitude of the random velocity for neutrinos*/
-double
-NU_V0(const double Time, const double kBTNubyMNu, const double UnitVelocity_in_cm_per_s);
-
-/*Amplitude of the random velocity for WDM*/
-double
-WDM_V0(const double Time, const double WDM_therm_mass, const double Omega_CDM, const double HubbleParam, const double UnitVelocity_in_cm_per_s);
-
-unsigned int *
-init_rng(int Seed, int Nmesh);
-
-
-#endif

From acbadd74f34a9bce70378523d5fc071ccff3fb97 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 21:50:18 -0700
Subject: [PATCH 058/120] Remove header connected with metal return

---
 libgadget/metal_tables.h | 439 ---------------------------------------
 1 file changed, 439 deletions(-)
 delete mode 100644 libgadget/metal_tables.h

diff --git a/libgadget/metal_tables.h b/libgadget/metal_tables.h
deleted file mode 100644
index 9d265753..00000000
--- a/libgadget/metal_tables.h
+++ /dev/null
@@ -1,439 +0,0 @@
-#ifndef METAL_TABLES_H
-#define METAL_TABLES_H
-
-/* Metals followed:
- * H, He, C, N, O, Ne, Mg, Si, Fe (9, following 1703.02970)
- */
-#define NSPECIES 9
-/* Largest mass in the IMF normalisation*/
-#define MAXMASS 40
-/* Only used for IMF normalisation*/
-#define MINMASS 0.1
-/* Mass in solar at which the yield tables switch from AGB stars to SNII*/
-#define SNAGBSWITCH 8
-/* Metallicity values (in terms of metal yield, not solar metallicity)
- * for the stellar lifetime table. Columns of lifetime.*/
-#define LIFE_NMET 5
-#define LIFE_NMASS 30
-static const double lifetime_metallicity[LIFE_NMET] = { 0.0004 , 0.004 , 0.008, 0.02, 0.05 };
-/* Mass values in solar masses for the stellar lifetime table. Rows of lifetime*/
-static const double lifetime_masses[LIFE_NMASS] = {0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5,
-    1.6, 1.7, 1.8, 1.9, 2.0, 2.5, 3 , 4 , 5 , 6 , 7 , 9 , 12 , 15 , 20 , 30 , 40 , 60 , 100, 120};
-/* Stellar lifetimes as a function of mass and metallicity in years.
- * Table 14 of Portinari et al, astro-ph/9711337 */
-static const double lifetime[LIFE_NMASS*LIFE_NMET] = {
-4.28e+10,   5.35E+10,   6.47E+10,   7.92E+10,   7.18E+10,
-2.37E+10,   2.95E+10,   3.54E+10,   4.45E+10,   4.00E+10,
-1.41E+10,   1.73E+10,   2.09E+10,   2.61E+10,   2.33E+10,
-8.97E+09,   1.09E+10,   1.30E+10,   1.59E+10,   1.42E+10,
-6.03E+09,   7.13E+09,   8.46E+09,   1.03E+10,   8.88E+09,
-4.23E+09,   4.93E+09,   5.72E+09,   6.89E+09,   5.95E+09,
-3.08E+09,   3.52E+09,   4.12E+09,   4.73E+09,   4.39E+09,
-2.34E+09,   2.64E+09,   2.92E+09,   3.59E+09,   3.37E+09,
-1.92E+09,   2.39E+09,   2.36E+09,   2.87E+09,   3.10E+09,
-1.66E+09,   1.95E+09,   2.18E+09,   2.64E+09,   2.51E+09,
-1.39E+09,   1.63E+09,   1.82E+09,   2.18E+09,   2.06E+09,
-1.18E+09,   1.28E+09,   1.58E+09,   1.84E+09,   1.76E+09,
-1.11E+09,   1.25E+09,   1.41E+09,   1.59E+09,   1.51E+09,
-9.66E+08,   1.23E+09,   1.25E+09,   1.38E+09,   1.34E+09,
-8.33E+08,   1.08E+09,   1.23E+09,   1.21E+09,   1.24E+09,
-4.64E+08,   5.98E+08,   6.86E+08,   7.64E+08,   6.58E+08,
-3.03E+08,   3.67E+08,   4.12E+08,   4.56E+08,   3.81E+08,
-1.61E+08,   1.82E+08,   1.93E+08,   2.03E+08,   1.64E+08,
-1.01E+08,   1.11E+08,   1.15E+08,   1.15E+08,   8.91E+07,
-7.15E+07,   7.62E+07,   7.71E+07,   7.45E+07,   5.67E+07,
-5.33E+07,   5.61E+07,   5.59E+07,   5.31E+07,   3.97E+07,
-3.42E+07,   3.51E+07,   3.44E+07,   3.17E+07,   2.33E+07,
-2.13E+07,   2.14E+07,   2.10E+07,   1.89E+07,   1.39E+07,
-1.54E+07,   1.52E+07,   1.49E+07,   1.33E+07,   9.95E+06,
-1.06E+07,   1.05E+07,   1.01E+07,   9.15E+06,   6.99E+06,
-6.90E+06,   6.85E+06,   6.65E+06,   6.13E+06,   5.15E+06,
-5.45E+06,   5.44E+06,   5.30E+06,   5.12E+06,   4.34E+06,
-4.20E+06,   4.19E+06,   4.15E+06,   4.12E+06,   3.62E+06,
-3.32E+06,   3.38E+06,   3.44E+06,   3.39E+06,   3.11E+06,
-3.11E+06,   3.23E+06,   3.32E+06,   3.23E+06,   3.11E+06};
-
-/* Sn1a yields from the W7 model of Nomoto et al 1997 https://arxiv.org/abs/astro-ph/9706025
- * I extracted this from the latex source of their table 1 by hand.
- * total_metals is just the sum of all metal masses in the table.
- */
-static const double sn1a_total_metals = 1.3743416565891;
-static const double sn1a_yields[NSPECIES] = {0, 0, 4.83E-02, 1.16E-06 , 1.43E-01 , 4.51E-03, 8.57E-03, 1.53E-01, 7.43e-01};
-
-/* AGB yields from Karakas 2010, 0912.2142 Tables A2 - A5. These have been parsed by the script in tools/extract_yields.py
- * Massive stars are from Doherty 2014, https://doi.org/10.1093/mnras/stt1877 and https://doi.org/10.1093/mnras/stu571
- * Some of the metallicities in Karakas are listed at M = 2 and some at M = 2.1. I have altered them all to be at M = 2,
- * a change which is within the uncertainty of the calculation.
- */
-#define AGB_NMET 4
-#define AGB_NMASS 18
-static const double agb_masses[AGB_NMASS] = { 1.00,1.25,1.50,1.75,1.90,2.00,2.25,2.50,3.00,3.50,4.00,4.50,5.00,5.50,6.00,6.50,7.00,7.50 };
-static const double agb_metallicities[AGB_NMET] = { 0.0001,0.0040,0.0080,0.0200 };
-static const double agb_total_mass[AGB_NMET*AGB_NMASS] = {
-0.280,0.390,0.423,0.436,
-0.582,0.608,0.650,0.676,
-0.839,0.872,0.867,0.900,
-1.086,1.120,1.114,1.135,
-1.219,1.260,1.260,1.270,
-1.315,1.450,1.456,1.360,
-1.537,1.586,1.598,1.590,
-1.768,1.829,1.837,1.837,
-2.187,2.269,2.306,2.318,
-2.646,2.686,2.734,2.782,
-3.126,3.148,3.164,3.208,
-3.603,3.628,3.639,3.648,
-4.071,4.095,4.114,4.121,
-4.534,4.568,4.593,4.600,
-4.994,5.023,5.052,5.071,
-5.401,5.494,5.548,5.537,
-5.827,5.936,6.001,6.033,
-6.269,6.342,6.442,6.489,
-};
-
-static const double agb_total_metals[AGB_NMET*AGB_NMASS] = {
-2.939e-04,1.485e-03,3.221e-03,8.302e-03,
-3.962e-03,2.500e-03,4.963e-03,1.290e-02,
-1.255e-02,5.246e-03,6.885e-03,1.721e-02,
-2.588e-02,1.014e-02,1.164e-02,2.172e-02,
-3.383e-02,1.416e-02,1.501e-02,2.431e-02,
-4.129e-02,1.806e-02,2.186e-02,2.603e-02,
-5.727e-02,3.139e-02,2.788e-02,3.145e-02,
-5.650e-02,4.520e-02,3.684e-02,4.116e-02,
-2.582e-02,5.087e-02,5.969e-02,6.170e-02,
-2.710e-02,3.576e-02,4.916e-02,7.676e-02,
-3.387e-02,4.534e-02,4.494e-02,7.494e-02,
-2.771e-02,6.428e-02,6.819e-02,8.330e-02,
-2.863e-02,8.468e-02,8.391e-02,9.272e-02,
-3.246e-02,6.702e-02,7.944e-02,1.112e-01,
-4.058e-02,6.991e-02,7.896e-02,1.164e-01,
-1.007e-01,4.809e-02,7.900e-02,1.224e-01,
-4.971e-02,3.936e-02,6.945e-02,1.349e-01,
-3.387e-02,3.916e-02,6.827e-02,1.432e-01,
-
-};
-
-static const double agb_yield[NSPECIES][AGB_NMET*AGB_NMASS] = {
-{2.098917e-01,2.837685e-01,3.030716e-01,2.917916e-01,
-4.181523e-01,4.402004e-01,4.647374e-01,4.526932e-01,
-5.871375e-01,6.310413e-01,6.216631e-01,6.062396e-01,
-7.231110e-01,8.003024e-01,7.952502e-01,7.689168e-01,
-7.975993e-01,8.940117e-01,8.979415e-01,8.627354e-01,
-8.418292e-01,1.031207e+00,1.025440e+00,9.253072e-01,
-9.679005e-01,1.086865e+00,1.113040e+00,1.078653e+00,
-1.159112e+00,1.219034e+00,1.258521e+00,1.233581e+00,
-1.581442e+00,1.538121e+00,1.534155e+00,1.524957e+00,
-1.893222e+00,1.915657e+00,1.904093e+00,1.819895e+00,
-2.169540e+00,2.193368e+00,2.242207e+00,2.143635e+00,
-2.481520e+00,2.449144e+00,2.483966e+00,2.400789e+00,
-2.761358e+00,2.636303e+00,2.707931e+00,2.652623e+00,
-3.037755e+00,2.989116e+00,3.035734e+00,2.888854e+00,
-3.240722e+00,3.205465e+00,3.278734e+00,3.136674e+00,
-3.129000e+00,3.531000e+00,3.577000e+00,3.389015e+00,
-3.581000e+00,3.824000e+00,3.870000e+00,3.770000e+00,
-3.959000e+00,4.085000e+00,4.144000e+00,4.017000e+00,
-
-},
-{
-6.976463e-02,1.046962e-01,1.166566e-01,1.358523e-01,
-1.598219e-01,1.652351e-01,1.802360e-01,2.103396e-01,
-2.392354e-01,2.356368e-01,2.383760e-01,2.764771e-01,
-3.369195e-01,3.094681e-01,3.070183e-01,3.442725e-01,
-3.874754e-01,3.517370e-01,3.469518e-01,3.828618e-01,
-4.317810e-01,4.006273e-01,4.085950e-01,4.085601e-01,
-5.117177e-01,4.676308e-01,4.569640e-01,4.797870e-01,
-5.522625e-01,5.646438e-01,5.415177e-01,5.621283e-01,
-5.795870e-01,6.798629e-01,7.120018e-01,7.311886e-01,
-7.255036e-01,7.344115e-01,7.805668e-01,8.851663e-01,
-9.223878e-01,9.090908e-01,8.766484e-01,9.892259e-01,
-1.093530e+00,1.114345e+00,1.086616e+00,1.163685e+00,
-1.280739e+00,1.373748e+00,1.321897e+00,1.375409e+00,
-1.463488e+00,1.511581e+00,1.477543e+00,1.599672e+00,
-1.712394e+00,1.747269e+00,1.693976e+00,1.817578e+00,
-2.171000e+00,1.915004e+00,1.892005e+00,2.025270e+00,
-2.196000e+00,2.073004e+00,2.062005e+00,2.128016e+00,
-2.276000e+00,2.218006e+00,2.230006e+00,2.329018e+00,
-
-},
-{
-2.548307e-04,2.007317e-04,4.499520e-04,1.188860e-03,
-3.733844e-03,4.464190e-04,6.100381e-04,1.652404e-03,
-1.161688e-02,2.206198e-03,9.797027e-04,2.018329e-03,
-2.314672e-02,6.015899e-03,3.854005e-03,2.424878e-03,
-2.968959e-02,9.332017e-03,6.109907e-03,2.706045e-03,
-3.570287e-02,1.239873e-02,1.124944e-02,2.891137e-03,
-4.844249e-02,2.385695e-02,1.585494e-02,4.315774e-03,
-4.921946e-02,3.498655e-02,2.250193e-02,9.550566e-03,
-2.395744e-02,3.907886e-02,3.964889e-02,2.093392e-02,
-7.499905e-03,2.477810e-02,2.842070e-02,2.737855e-02,
-5.496491e-03,1.091623e-02,2.201131e-02,1.921261e-02,
-3.847183e-03,6.090742e-03,7.251039e-03,1.996513e-02,
-3.193289e-03,5.875254e-03,4.790409e-03,1.786269e-02,
-2.979007e-03,6.065955e-03,6.545073e-03,8.263152e-03,
-1.564622e-03,5.016426e-03,5.207464e-03,6.271126e-03,
-3.853600e-03,4.820900e-03,6.313500e-03,6.291284e-03,
-2.563200e-03,3.790400e-03,5.379300e-03,7.561300e-03,
-2.565400e-03,3.831900e-03,5.280200e-03,8.279700e-03,
-
-},
-{
-5.571887e-06,1.361231e-04,2.725037e-04,6.631505e-04,
-2.301341e-05,2.542933e-04,5.158191e-04,1.252953e-03,
-4.452569e-05,3.972116e-04,7.765967e-04,1.880492e-03,
-5.958874e-05,5.543761e-04,1.064622e-03,2.507770e-03,
-7.103837e-05,6.368285e-04,1.191760e-03,2.815089e-03,
-7.813261e-05,6.860504e-04,1.421203e-03,3.024347e-03,
-8.142337e-05,8.018469e-04,1.559451e-03,3.694201e-03,
-5.836106e-05,9.885715e-04,1.904179e-03,4.309833e-03,
-8.782401e-05,1.217263e-03,2.468241e-03,5.661651e-03,
-1.805374e-02,1.648287e-03,3.023767e-03,7.118027e-03,
-2.634043e-02,2.366309e-02,3.487779e-03,8.631895e-03,
-2.241779e-02,4.579481e-02,3.820866e-02,1.046125e-02,
-2.405146e-02,6.564541e-02,5.544794e-02,1.642645e-02,
-2.796230e-02,4.863977e-02,4.800098e-02,3.906971e-02,
-3.745824e-02,5.402872e-02,4.988818e-02,4.182413e-02,
-9.045626e-02,3.020097e-02,4.330114e-02,4.271098e-02,
-4.486655e-02,2.116088e-02,3.290107e-02,4.301120e-02,
-2.942762e-02,1.918132e-02,2.870115e-02,4.533135e-02,
-
-},
-{
-2.490971e-05,7.457671e-04,1.624870e-03,4.196570e-03,
-1.234212e-04,1.169008e-03,2.494431e-03,6.505379e-03,
-3.160840e-04,1.706989e-03,3.330410e-03,8.659561e-03,
-4.793708e-04,2.266871e-03,4.320108e-03,1.092500e-02,
-5.378037e-04,2.564215e-03,4.910615e-03,1.222423e-02,
-5.795720e-04,2.952074e-03,5.648813e-03,1.308835e-02,
-7.052051e-04,3.252561e-03,6.188843e-03,1.514512e-02,
-7.763611e-04,3.616575e-03,6.863281e-03,1.731160e-02,
-5.221822e-04,4.374406e-03,8.296453e-03,2.123638e-02,
-5.783944e-04,5.266665e-03,9.988093e-03,2.512805e-02,
-7.999336e-04,6.077652e-03,1.176262e-02,2.924345e-02,
-6.513333e-04,6.113582e-03,1.287679e-02,3.280990e-02,
-6.109250e-04,5.122539e-03,1.201272e-02,3.599422e-02,
-5.915777e-04,5.593279e-03,1.320244e-02,3.827719e-02,
-6.622771e-04,3.847015e-03,1.164147e-02,4.077411e-02,
-1.920547e-03,5.879672e-03,1.483454e-02,4.364789e-02,
-1.101692e-03,7.012498e-03,1.623960e-02,4.776699e-02,
-1.245540e-03,8.272425e-03,1.848258e-02,5.049371e-02,
-
-},
-{
-3.771378e-06,1.366522e-04,2.967020e-04,7.653311e-04,
-7.088088e-05,2.154086e-04,4.557682e-04,1.186591e-03,
-5.502761e-04,3.397790e-04,6.141479e-04,1.578606e-03,
-2.107102e-03,5.352139e-04,8.800038e-04,1.988661e-03,
-3.370820e-03,7.546288e-04,1.072599e-03,2.223432e-03,
-4.668795e-03,1.022163e-03,1.535058e-03,2.380030e-03,
-7.470195e-03,2.341272e-03,2.068702e-03,2.850746e-03,
-5.582660e-03,4.206462e-03,2.998946e-03,3.696213e-03,
-9.685537e-04,4.331039e-03,5.897057e-03,5.878638e-03,
-6.893982e-04,1.962699e-03,3.747073e-03,7.487394e-03,
-8.870731e-04,2.101035e-03,3.135203e-03,6.760196e-03,
-5.356187e-04,3.045464e-03,4.243832e-03,7.393902e-03,
-4.735030e-04,3.782563e-03,5.084168e-03,8.110790e-03,
-4.972539e-04,2.788993e-03,4.618498e-03,9.353449e-03,
-4.242995e-04,2.676837e-03,4.541172e-03,9.767407e-03,
-1.414044e-03,2.517535e-03,5.249992e-03,1.032522e-02,
-3.300342e-04,2.519584e-03,5.119844e-03,1.234227e-02,
-1.866221e-04,2.662918e-03,5.306701e-03,1.307199e-02,
-
-},
-{
-9.455019e-07,5.151808e-05,1.117829e-04,2.882281e-04,
-2.232505e-06,8.029951e-05,1.717731e-04,4.469390e-04,
-6.836223e-06,1.151794e-04,2.290949e-04,5.949837e-04,
-3.313334e-05,1.483363e-04,2.945120e-04,7.503477e-04,
-6.736667e-05,1.679304e-04,3.335481e-04,8.395954e-04,
-1.247014e-04,1.948988e-04,3.878231e-04,8.990899e-04,
-3.621839e-04,2.331503e-04,4.297748e-04,1.051289e-03,
-7.180272e-04,3.186369e-04,5.078613e-04,1.216990e-03,
-2.311555e-04,5.492023e-04,7.524818e-04,1.556575e-03,
-2.196609e-04,5.836734e-04,9.220928e-04,1.920263e-03,
-2.767304e-04,7.972837e-04,1.015380e-03,2.198472e-03,
-1.887811e-04,1.148456e-03,1.511265e-03,2.545408e-03,
-2.277470e-04,1.842968e-03,1.907678e-03,2.872014e-03,
-3.411340e-04,1.315514e-03,1.897493e-03,3.416546e-03,
-3.504029e-04,1.486829e-03,2.012915e-03,3.675149e-03,
-2.503435e-03,1.097400e-03,2.109200e-03,3.984398e-03,
-6.246600e-04,1.042920e-03,2.058100e-03,4.734800e-03,
-2.920890e-04,1.122820e-03,2.193400e-03,5.149200e-03,
-
-},
-{
-1.003708e-06,5.549328e-05,1.204001e-04,3.104456e-04,
-2.090670e-06,8.651391e-05,1.850165e-04,4.813928e-04,
-3.027505e-06,1.240917e-04,2.468060e-04,6.408511e-04,
-3.979750e-06,1.594153e-04,3.171329e-04,8.081894e-04,
-4.539297e-06,1.793837e-04,3.587247e-04,9.043193e-04,
-5.069056e-06,2.064813e-04,4.146332e-04,9.684054e-04,
-7.413180e-06,2.262163e-04,4.552052e-04,1.132337e-03,
-1.456918e-05,2.615355e-04,5.236178e-04,1.308336e-03,
-1.150049e-05,3.277709e-04,6.597087e-04,1.651721e-03,
-1.354274e-05,3.924938e-04,7.852946e-04,1.983755e-03,
-1.726750e-05,4.656181e-04,9.124538e-04,2.287597e-03,
-1.690565e-05,5.476943e-04,1.065059e-03,2.604582e-03,
-1.957340e-05,6.417297e-04,1.222199e-03,2.943220e-03,
-2.466138e-05,6.879804e-04,1.350983e-03,3.299326e-03,
-2.855312e-05,7.600837e-04,1.487615e-03,3.634983e-03,
-1.379400e-04,9.100400e-04,1.834180e-03,3.968367e-03,
-7.630900e-05,9.734000e-04,1.966520e-03,4.892300e-03,
-4.450300e-05,1.039640e-03,2.101830e-03,5.266300e-03,
-
-},
-{
-1.778093e-06,9.938364e-05,2.156261e-04,5.559812e-04,
-3.689621e-06,1.549358e-04,3.313467e-04,8.621287e-04,
-5.302608e-06,2.222145e-04,4.420010e-04,1.147705e-03,
-6.825926e-06,2.854161e-04,5.678945e-04,1.447389e-03,
-7.663427e-06,3.210919e-04,6.423236e-04,1.619551e-03,
-8.258400e-06,3.695058e-04,7.422465e-04,1.734322e-03,
-9.513126e-06,4.039695e-04,8.146398e-04,2.027870e-03,
-1.076578e-05,4.652282e-04,9.363337e-04,2.342809e-03,
-1.365988e-05,5.732944e-04,1.173206e-03,2.956308e-03,
-1.657160e-05,6.786403e-04,1.387979e-03,3.546689e-03,
-1.955280e-05,7.936696e-04,1.606529e-03,4.088933e-03,
-2.264573e-05,9.098115e-04,1.837633e-03,4.647151e-03,
-2.560047e-05,1.020177e-03,2.071569e-03,5.248905e-03,
-2.846764e-05,1.148467e-03,2.319909e-03,5.850658e-03,
-3.131396e-05,1.263550e-03,2.554443e-03,6.452388e-03,
-3.726125e-05,1.565131e-03,3.160576e-03,7.047671e-03,
-4.105254e-05,1.694741e-03,3.425661e-03,8.622082e-03,
-4.456126e-05,1.811054e-03,3.679863e-03,9.274617e-03,
-}
-};
-
-/* Supernova II yields are from Kobayashi 2006. There is a mass gap from 8 - 13 Msun, between AGB and SNII,
- * for which we extrapolate Kobayashi 2006 yields to lower masses.*/
-#define SNII_NMET 4
-#define SNII_NMASS 7
-static const double snii_masses[SNII_NMASS] = { 13.00,15.00,18.00,20.00,25.00,30.00,40.00 };
-static const double snii_metallicities[SNII_NMET] = { 0.0000,0.0010,0.0040,0.0200 };
-static const double snii_total_mass[SNII_NMET*SNII_NMASS] = {
-11.430,11.280,11.250,11.130,
-13.520,13.390,12.890,12.640,
-16.350,16.140,14.980,15.180,
-18.340,17.870,17.760,16.810,
-23.080,22.510,22.350,19.940,
-27.930,26.990,25.000,22.480,
-37.110,34.640,30.120,19.620,
-
-};
-
-static const double snii_total_metals[SNII_NMET*SNII_NMASS] = {
-8.208e-01,9.820e-01,8.451e-01,6.734e-01,
-1.539e+00,7.860e-01,8.312e-01,6.045e-01,
-2.501e+00,1.135e+00,1.451e+00,1.539e+00,
-3.634e+00,3.489e+00,1.778e+00,2.110e+00,
-4.411e+00,5.733e+00,3.703e+00,4.296e+00,
-6.712e+00,7.570e+00,6.944e+00,5.378e+00,
-1.120e+01,1.088e+01,1.171e+01,1.136e+01,
-
-};
-
-static const double snii_yield[NSPECIES][SNII_NMET*SNII_NMASS] = {
-{6.600000e+00,6.440000e+00,6.370000e+00,6.160000e+00,
-7.580000e+00,7.450000e+00,7.110000e+00,6.790000e+00,
-8.430000e+00,8.460000e+00,7.470000e+00,7.530000e+00,
-8.770000e+00,8.430000e+00,8.950000e+00,7.930000e+00,
-1.060000e+01,9.800000e+00,1.020000e+01,8.410000e+00,
-1.170000e+01,1.110000e+01,1.010000e+01,8.750000e+00,
-1.400000e+01,1.290000e+01,1.030000e+01,3.550000e+00,
-
-},
-{
-4.010041e+00,3.860143e+00,4.040170e+00,4.300196e+00,
-4.400041e+00,5.160153e+00,4.950159e+00,5.250218e+00,
-5.420033e+00,6.540157e+00,6.060224e+00,6.110230e+00,
-5.940048e+00,5.940160e+00,7.030175e+00,6.760238e+00,
-8.030211e+00,6.970126e+00,8.480185e+00,7.240221e+00,
-9.520206e+00,8.380144e+00,7.920184e+00,8.360212e+00,
-1.190003e+01,1.090012e+01,8.120180e+00,4.710051e+00,
-
-},
-{
-7.410008e-02,1.071670e-01,8.798800e-02,1.080000e-01,
-1.720001e-01,8.505380e-02,8.830900e-02,6.625000e-02,
-2.190000e-01,1.300720e-01,1.653000e-01,1.373800e-01,
-2.110000e-01,1.280196e-01,9.769200e-02,2.464500e-01,
-2.940000e-01,2.150981e-01,1.323830e-01,2.186000e-01,
-3.380000e-01,1.210820e-01,1.823390e-01,2.519200e-01,
-4.290000e-01,7.398200e-02,4.583680e-01,5.964310e-01,
-
-},
-{
-1.830064e-03,9.077570e-03,9.086840e-03,4.804090e-02,
-1.860069e-03,3.580859e-03,1.290870e-02,6.155970e-02,
-1.890240e-04,4.470921e-03,1.262000e-01,6.611530e-02,
-5.421130e-05,1.290137e-02,1.842780e-02,7.212400e-02,
-5.911180e-04,9.207240e-03,3.159530e-02,1.306000e-01,
-1.656800e-06,6.190379e-03,2.010498e-02,1.020066e-01,
-1.218000e-06,8.692450e-03,2.600501e-02,5.810572e-02,
-
-},
-{
-4.500017e-01,5.058796e-01,3.870375e-01,2.223680e-01,
-7.730065e-01,2.943916e-01,2.930546e-01,1.653520e-01,
-1.380005e+00,4.223302e-01,5.741100e-01,7.825760e-01,
-2.110000e+00,2.180030e+00,9.953840e-01,1.056171e+00,
-2.790002e+00,3.820098e+00,2.200960e+00,2.435640e+00,
-4.810000e+00,5.330076e+00,4.790164e+00,3.227870e+00,
-8.380000e+00,8.370055e+00,7.960996e+00,7.343272e+00,
-
-},
-{
-1.530074e-02,6.751500e-02,1.332350e-01,3.944500e-02,
-3.270537e-01,1.903415e-01,1.258970e-01,3.575000e-02,
-4.941169e-01,1.775626e-01,2.051700e-01,1.558320e-01,
-9.121122e-01,6.283170e-01,2.794180e-01,4.048500e-01,
-5.330335e-01,1.221979e+00,8.249530e-01,8.713900e-01,
-8.511408e-01,1.452171e+00,9.439010e-01,9.585700e-01,
-3.070175e-01,2.879870e-01,1.884040e+00,2.225870e+00,
-
-},
-{
-8.642770e-02,6.583400e-02,4.642000e-02,2.994000e-02,
-6.889700e-02,6.572000e-02,7.848000e-02,4.110000e-02,
-1.584600e-01,6.117300e-02,8.396000e-02,1.159800e-01,
-1.503540e-01,2.468400e-01,1.005800e-01,9.487000e-02,
-1.200906e-01,1.827300e-01,2.457500e-01,2.766000e-01,
-2.273760e-01,2.938200e-01,2.321700e-01,2.472000e-01,
-4.785540e-01,7.073200e-01,4.043000e-01,4.562000e-01,
-
-},
-{
-8.257000e-02,9.317000e-02,6.229700e-02,7.784000e-02,
-7.358800e-02,4.370700e-02,1.054100e-01,8.875000e-02,
-1.167870e-01,1.541350e-01,1.006900e-01,1.147800e-01,
-9.969200e-02,1.298860e-01,1.268800e-01,6.768000e-02,
-3.513464e-01,1.207150e-01,1.225100e-01,1.412500e-01,
-2.488430e-01,1.667390e-01,4.031900e-01,2.579800e-01,
-1.036660e+00,8.971400e-01,5.340500e-01,2.607500e-01,
-
-},
-{
-7.172600e-02,7.559680e-02,7.493770e-02,8.746100e-02,
-7.238000e-02,7.327010e-02,7.540890e-02,8.976000e-02,
-7.227800e-02,7.444550e-02,9.409600e-02,9.294600e-02,
-7.228700e-02,7.404090e-02,7.768170e-02,9.375600e-02,
-7.377700e-02,7.395780e-02,7.734500e-02,9.664700e-02,
-7.457300e-02,7.518580e-02,8.269400e-02,1.038800e-01,
-8.000101e-02,8.257700e-02,8.525500e-02,8.967500e-02,
-}
-};
-
-/* These definitions are here for the tests*/
-#define GSL_WORKSPACE 1000
-
-double compute_imf_norm(gsl_integration_workspace * gsl_work);
-double compute_agb_yield(gsl_interp2d * agb_interp, const double * agb_weights, double stellarmetal, double masslow, double masshigh, gsl_integration_workspace * gsl_work );
-double compute_snii_yield(gsl_interp2d * snii_interp, const double * snii_weights, double stellarmetal, double masslow, double masshigh, gsl_integration_workspace * gsl_work );
-
-double chabrier_mass(double mass, void * params);
-
-double sn1a_number(double dtmyrstart, double dtmyrend, double hub);
-
-void set_metal_params(double Sn1aN0);
-
-void find_mass_bin_limits(double * masslow, double * masshigh, const double dtstart, const double dtend, double stellarmetal, gsl_interp2d * lifetime_tables);
-
-#endif

From b83542708555d8a810234929c93684b5d33e31d1 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 21:51:45 -0700
Subject: [PATCH 059/120] Remove largely unused lightcone

---
 libgadget/Makefile    |   2 +-
 libgadget/lightcone.c | 268 ------------------------------------------
 libgadget/lightcone.h |   7 --
 3 files changed, 1 insertion(+), 276 deletions(-)
 delete mode 100644 libgadget/lightcone.c
 delete mode 100644 libgadget/lightcone.h

diff --git a/libgadget/Makefile b/libgadget/Makefile
index 03d13418..ab569eb9 100644
--- a/libgadget/Makefile
+++ b/libgadget/Makefile
@@ -105,7 +105,7 @@ GADGET_OBJS =  \
 	 gravpm.o powerspectrum.o \
 	 forcetree.o \
 	 petapm.o gravity.o \
-	 densitykernel.o lightcone.o walltime.o\
+	 densitykernel.o walltime.o\
 	 runtests.o \
 	 neutrinos_lra.o \
      omega_nu_single.o \
diff --git a/libgadget/lightcone.c b/libgadget/lightcone.c
deleted file mode 100644
index c5bc6322..00000000
--- a/libgadget/lightcone.c
+++ /dev/null
@@ -1,268 +0,0 @@
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <gsl/gsl_integration.h>
-/*For mkdir*/
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#include "utils.h"
-
-#include "timefac.h"
-#include "partmanager.h"
-#include "cosmology.h"
-#include "physconst.h"
-
-#define NENTRY 4096
-static double tab_loga[NENTRY];
-static double dloga;
-static double tab_Dc[NENTRY];
-/*
- * light cone on the fly:
- *
- * assuming the origin is at (0, 0, 0)
- *
- * */
-
-/*
- * replicas to consider, function of redshift;
- *
- * */
-static int Nreplica;
-static int BoxBoost = 20;
-static double Reps[8192][3];
-static double HorizonDistance2;
-static double HorizonDistance;
-static double HorizonDistancePrev;
-static double HorizonDistance2Prev;
-static double HorizonDistanceRef;
-static double zmin = 0.1;
-static double zmax = 80.0;
-static double ReferenceRedshift = 2.0; /* write all particles below this redshift; write a fraction above this. */
-static double SampleFraction; /* current fraction of particle gets written */
-static FILE * fd_lightcone;
-
-static double lightcone_get_horizon(double a);
-static void lightcone_cross(int p, double ddrift, const RandTable * const rnd);
-static void lightcone_set_time(double a, const double BoxSize);
-/*
-M, L = self.M, self.L
-  logx = numpy.linspace(log10amin, 0, Np)
-  def kernel(log10a):
-    a = numpy.exp(log10a)
-    return 1 / self.Ea(a) * a ** -1 # dz = - 1 / a dlog10a
-  y = numpy.array( [romberg(kernel, log10a, 0, vec_func=True, divmax=10) for log10a in logx])
-*/
-static double kernel(double loga, void * params) {
-    double a = exp(loga);
-      Cosmology * CP = (Cosmology *) params;
-    return 1 / hubble_function(CP, a) * CP->Hubble / a;
-}
-
-static void lightcone_init_entry(Cosmology * CP, int i, const double UnitLength_in_cm) {
-    tab_loga[i] = - dloga * (NENTRY - i - 1);
-
-    gsl_integration_workspace * w = gsl_integration_workspace_alloc (1000);
-
-    double result, error;
-
-    gsl_function F;
-    F.function = &kernel;
-    F.params = CP;
-    gsl_integration_qags (&F, tab_loga[i], 0, 0, 1e-7, 1000,
-            w, &result, &error);
-
-    /* result is in DH, hubble distance */
-    /* convert to cm / h */
-    result *= LIGHTCGS / HUBBLE;
-    /* convert to Kpc/h or internal units */
-    result /= UnitLength_in_cm;
-
-    gsl_integration_workspace_free (w);
-    tab_Dc[i] = result;
-//    double a = exp(tab_loga[i]);
-//    double z = 1 / a - 1;
-//    printf("a = %g z = %g Dc = %g\n", a, z, result);
-}
-
-void lightcone_init(Cosmology * CP, double timeBegin, const double UnitLength_in_cm, const char * OutputDir)
-{
-    int i;
-    dloga = (0.0 - log(timeBegin)) / (NENTRY - 1);
-    for(i = 0; i < NENTRY; i ++) {
-        lightcone_init_entry(CP, i, UnitLength_in_cm);
-    };
-    char buf[1024];
-    int chunk = 100;
-    int ThisTask;
-    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-
-    sprintf(buf, "%s/lightcone/", OutputDir);
-    mkdir(buf, 02755);
-    sprintf(buf, "%s/lightcone/%03d/", OutputDir, (int)(ThisTask / chunk));
-    mkdir(buf, 02755);
-    sprintf(buf, "%s/lightcone/%03d/lightcone-%05d.raw", OutputDir, (int)(ThisTask / chunk), ThisTask);
-
-    fd_lightcone = fopen(buf, "a+");
-    if(fd_lightcone == NULL) {
-        endrun(1, "failed to open %s\n", buf);
-    }
-    HorizonDistanceRef = lightcone_get_horizon(1 / (1 + ReferenceRedshift));
-    printf("lightcone reference redshift = %g distance = %g\n",
-            ReferenceRedshift, HorizonDistanceRef);
-}
-
-/* returns the horizon distance */
-static double lightcone_get_horizon(double a) {
-    double loga = log(a);
-    int bin = (log(a) -tab_loga[0]) / dloga;
-    if (bin < 0) {
-        return tab_Dc[0];
-    }
-    if (bin >= NENTRY - 1) {
-        return tab_Dc[NENTRY - 1];
-    }
-    double u1 = loga - tab_loga[bin];
-    double u2 = tab_loga[bin + 1] - loga;
-    u1 /= (tab_loga[bin + 1] - tab_loga[bin]);
-    u2 /= (tab_loga[bin + 1] - tab_loga[bin]);
-    return tab_Dc[bin] * u2 + tab_Dc[bin + 1] * u1;
-}
-
-/* fill in the table of box offsets for current time */
-static void update_replicas(double a, double BoxSize) {
-    int Nmax = BoxBoost * BoxBoost * BoxBoost;
-    int i;
-    int rx, ry, rz;
-    rx = ry = rz = 0;
-    Nreplica = 0;
-
-    for(i = 0; i < Nmax; i ++) {
-        double dx = BoxSize * rx;
-        double dy = BoxSize * ry;
-        double dz = BoxSize * rz;
-        double d1, d2;
-        d1 = dx * dx + dy * dy + dz * dz;
-        dx += BoxSize;
-        dy += BoxSize;
-        dz += BoxSize;
-        d2 = dx * dx + dy * dy + dz * dz;
-        if(d1 <= HorizonDistance2 && d2 >= HorizonDistance2) {
-            Reps[Nreplica][0] = rx * BoxSize;
-            Reps[Nreplica][1] = ry * BoxSize;
-            Reps[Nreplica][2] = rz * BoxSize;
-            Nreplica ++;
-            if(Nreplica > 1000) {
-                endrun(951234, "too many replica");
-            }
-        }
-        rz ++;
-        if(rz == BoxBoost) {
-            rz = 0;
-            ry ++;
-        }
-        if(ry == BoxBoost) {
-            ry = 0;
-            rx ++;
-        }
-    }
-}
-
-/* Compute a list of particles which crossed
- * the lightcone boundaries on this timestep and
- * write them to the lightcone file*/
-void lightcone_compute(double a, double BoxSize, Cosmology * CP, inttime_t ti_curr, inttime_t ti_next, const RandTable * const rnd)
-{
-    int i;
-    lightcone_set_time(a, BoxSize);
-    const double ddrift = get_exact_drift_factor(CP, ti_curr, ti_next);
-    #pragma omp parallel for
-    for(i = 0; i < PartManager->NumPart; i++)
-    {
-        lightcone_cross(i, ddrift, rnd);
-    }
-}
-
-void lightcone_set_time(double a, const double BoxSize) {
-    double z = 1 / a - 1;
-    if(z > zmin && z < zmax) {
-        HorizonDistancePrev = HorizonDistance;
-        HorizonDistance2Prev = HorizonDistance2;
-        HorizonDistance = lightcone_get_horizon(a);
-        HorizonDistance2 = HorizonDistance * HorizonDistance;
-        update_replicas(a, BoxSize);
-        fflush(fd_lightcone);
-        if (z < ReferenceRedshift) {
-            SampleFraction = 1.0;
-        } else {
-            /* write a smaller fraction of the points at high redshift
-             */
-            /* This is the angular resolution rule */
-            SampleFraction = HorizonDistanceRef / HorizonDistance;
-            SampleFraction *= SampleFraction;
-            SampleFraction *= SampleFraction;
-            /* This is the luminosity resolution rule */
-#if 0
-            SampleFraction = HorizonDistanceRef / HorizonDistance;
-            SampleFraction *= (1 + ReferenceRedshift) / (1 + z);
-            SampleFraction *= SampleFraction;
-
-#endif
-        }
-        message(0,"RefRedeshit=%g, SampleFraction=%g HorizonDistance=%g\n", ReferenceRedshift, SampleFraction, HorizonDistance);
-    } else {
-        SampleFraction = 0;
-    }
-}
-
-/* check crossing of the horizon, write the particle */
-static void lightcone_cross(int p, double ddrift, const RandTable * const rnd) {
-    if(SampleFraction <= 0.0) return;
-    int i;
-    int k;
-    /* DM only */
-    if(P[p].Type != 1) return;
-
-    for(i = 0; i < Nreplica; i++) {
-        double r = get_random_number(P[p].ID + i, rnd);
-        if(r > SampleFraction) continue;
-
-        double pnew[3];
-        double pold[3];
-        double p3[4];
-        double dnew = 0, dold = 0;
-        for(k = 0; k < 3; k ++) {
-            pold[k] = P[p].Pos[k] + Reps[i][k] - PartManager->CurrentParticleOffset[k];
-            pnew[k] = P[p].Pos[k] + P[i].Vel[k] * ddrift - PartManager->CurrentParticleOffset[k];
-            dnew += pnew[k] * pnew[k];
-            dold += pold[k] * pold[k];
-        }
-        if(
-            (dold <= HorizonDistance2Prev && dnew >= HorizonDistance2)
-         ) {
-            double u1, u2;
-            if(dold != dnew) {
-                double cnew, cold;
-                dnew = sqrt(dnew);
-                dold = sqrt(dold);
-                cnew = dnew - HorizonDistance;
-                cold = dold - HorizonDistancePrev;
-                u1 = -cold / (cnew - cold);
-                u2 = cnew / (cnew - cold);
-            } else {
-                /* really should write all particles along the line:
-                 * this partilce is moving along the horizon! */
-                u1 = u2 = 0.5;
-            }
-
-            /* write particle position */
-            for(k = 0; k < 3; k ++) {
-                p3[k] = pold[k] * u2 + pnew[k] * u1;
-            }
-            p3[3] = SampleFraction;
-            fwrite(p3, sizeof(double), 4, fd_lightcone);
-        }
-    }
-}
diff --git a/libgadget/lightcone.h b/libgadget/lightcone.h
deleted file mode 100644
index 5f254100..00000000
--- a/libgadget/lightcone.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef LIGHTCONE_H
-#define LIGHTCONE_H
-
-/* Initialise the lightcone code module. */
-void lightcone_init(Cosmology * CP, double timeBegin, const double UnitLength_in_cm, const char * OutputDir);
-void lightcone_compute(double a, double BoxSize, Cosmology * CP, inttime_t ti_curr, inttime_t ti_next, const RandTable * const rnd);
-#endif

From be19356926a864939be8616be7eaa550169ec8bc Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 21:54:04 -0700
Subject: [PATCH 060/120] Remove old Options files

---
 platform-options/Options.mk.BlueTides    | 15 ---------------
 platform-options/Options.mk.cori         | 10 ----------
 platform-options/Options.mk.coriknl      |  9 ---------
 platform-options/Options.mk.edison       | 10 ----------
 platform-options/Options.mk.example.coma |  8 --------
 platform-options/Options.mk.pfe          |  6 ------
 platform-options/Options.mk.stampede2    | 10 ----------
 7 files changed, 68 deletions(-)
 delete mode 100644 platform-options/Options.mk.BlueTides
 delete mode 100644 platform-options/Options.mk.cori
 delete mode 100644 platform-options/Options.mk.coriknl
 delete mode 100644 platform-options/Options.mk.edison
 delete mode 100644 platform-options/Options.mk.example.coma
 delete mode 100644 platform-options/Options.mk.pfe
 delete mode 100644 platform-options/Options.mk.stampede2

diff --git a/platform-options/Options.mk.BlueTides b/platform-options/Options.mk.BlueTides
deleted file mode 100644
index d998d040..00000000
--- a/platform-options/Options.mk.BlueTides
+++ /dev/null
@@ -1,15 +0,0 @@
-# This is the example for building a MP-Gadget
-# that runs the BlueTides simulation
-# on BlueWaters
-# the silly compiler is 
-
-#CC       = cc -h gnu -h omp
-MPICC       = cc
-#
-# For GCC add -mpc64 if IEEE 64bit FP is desired.
-OPTIMIZE =  -static -fopenmp -O3 -Ofast -g
-#OPTIMIZE =  -g -static -h aggress -h flex_mp=default -h negmsgs -O3
-
-GSL_INCL = -I$(GSL_DIR)/include
-GSL_LIBS = -L$(GSL_DIR)/lib -lgsl -lgslcblas
-#OPT += VALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.cori b/platform-options/Options.mk.cori
deleted file mode 100644
index c4f813cc..00000000
--- a/platform-options/Options.mk.cori
+++ /dev/null
@@ -1,10 +0,0 @@
-MPICC       =   cc
-
-# For GCC add -mpc64 if IEEE 64bit FP is desired.
-OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math -march=native
-
-GSL_INCL = -I$(GSL_DIR)/include
-GSL_LIBS = $(GSL) -lmvec -lmvec_nonshared
-
-#OPT += -DDEBUG
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.coriknl b/platform-options/Options.mk.coriknl
deleted file mode 100644
index dc580324..00000000
--- a/platform-options/Options.mk.coriknl
+++ /dev/null
@@ -1,9 +0,0 @@
-MPICC       =   cc
-
-OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math -march=knl
-
-GSL_INCL = -I$(GSL_DIR)/include
-GSL_LIBS = $(GSL) -lmvec -lmvec_nonshared
-
-#OPT += -DDEBUG
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.edison b/platform-options/Options.mk.edison
deleted file mode 100644
index c4f813cc..00000000
--- a/platform-options/Options.mk.edison
+++ /dev/null
@@ -1,10 +0,0 @@
-MPICC       =   cc
-
-# For GCC add -mpc64 if IEEE 64bit FP is desired.
-OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math -march=native
-
-GSL_INCL = -I$(GSL_DIR)/include
-GSL_LIBS = $(GSL) -lmvec -lmvec_nonshared
-
-#OPT += -DDEBUG
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.example.coma b/platform-options/Options.mk.example.coma
deleted file mode 100644
index f3132eac..00000000
--- a/platform-options/Options.mk.example.coma
+++ /dev/null
@@ -1,8 +0,0 @@
-MPICC       =   mpiicc
-
-OPTIMIZE =  -openmp -O0 -g
-GSL_INCL = -I/opt/gsl/impi/include/gsl
-GSL_LIBS = -L/opt/gsl/impi/lib64 -lgsl -lgslcblas
-
-#OPT += -DDEBUG
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
diff --git a/platform-options/Options.mk.pfe b/platform-options/Options.mk.pfe
deleted file mode 100644
index 4f3b38d7..00000000
--- a/platform-options/Options.mk.pfe
+++ /dev/null
@@ -1,6 +0,0 @@
-OPTIMIZE =  -fopenmp -O3 -g
-GSL_INCL = -I$(HOME)/anaconda3/envs/3.5/include
-GSL_LIBS = $(HOME)/anaconda3/envs/3.5/lib/libgsl.a $(HOME)/anaconda3/envs/3.5/lib/libgslcblas.a
-
-#OPT += -DVALGRIND  # allow debugging with valgrind, disable the GADGET memory allocator.
-#OPT += -DDEBUG      # print a lot of debugging messages
diff --git a/platform-options/Options.mk.stampede2 b/platform-options/Options.mk.stampede2
deleted file mode 100644
index cc54e5b2..00000000
--- a/platform-options/Options.mk.stampede2
+++ /dev/null
@@ -1,10 +0,0 @@
-# Stampede 2 KNL nodes use icc.
-# TACC_VEC_FLAGS define: -xCORE-AVX2 -axMIC-AVX512,CORE-AVX512 
-# which means the base instruction set is CORE-AVX2, and 
-# alternate versions of some routines are generated for KNL and SKX nodes.
-#
-# -simd is important: it aligns various structures and without it intel's MPI crashes.
-OPTIMIZE =  -fopenmp -O3 -g -Wall ${TACC_VEC_FLAGS} -Zp16 -fp-model fast=1 -simd
-
-#OPT += VALGRIND     # allow debugging with valgrind, disable the GADGET memory allocator.
-#OPT += -DDEBUG      # print a lot of debugging messages

From 224438abbd74f3e01cf8c938aa0b0cb5eeee9868 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 21:55:44 -0700
Subject: [PATCH 061/120] Remove GSL flags from build files

---
 libgadget/run.c                       | 9 ---------
 platform-options/Options.mk.macos     | 3 ---
 platform-options/Options.mk.scanbuild | 2 --
 platform-options/Options.mk.travis    | 2 --
 platform-options/Options.mk.vista     | 8 --------
 5 files changed, 24 deletions(-)

diff --git a/libgadget/run.c b/libgadget/run.c
index 5bf2e0a0..737b18c8 100644
--- a/libgadget/run.c
+++ b/libgadget/run.c
@@ -29,7 +29,6 @@
 #include "hci.h"
 #include "fof.h"
 #include "cooling_qso_lightup.h"
-#include "lightcone.h"
 #include "timefac.h"
 #include "uvbg.h"
 #include "neutrinos_lra.h"
@@ -248,9 +247,6 @@ begrun(const int RestartSnapNum, struct header_data * head)
 
     gravshort_fill_ntab(All.ShortRangeForceWindowType, All.Asmth);
 
-    if(All.LightconeOn)
-        lightcone_init(&All.CP, head->TimeSnapshot, head->UnitLength_in_cm, All.OutputDir);
-
     /* Ensure that the timeline runs at least to the current time*/
     if(head->TimeSnapshot > All.TimeMax)
         All.TimeMax = head->TimeSnapshot;
@@ -651,11 +647,6 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
         /* We don't need this timestep's tree anymore.*/
         force_tree_free(&gasTree);
 
-        /* Compute the list of particles that cross a lightcone and write it to disc.
-         * This should happen when kick and drift times are synchronised.*/
-        if(All.LightconeOn)
-            lightcone_compute(atime, PartManager->BoxSize, &All.CP, Ti_Last, Ti_Next, &rnd);
-
         /* Now done with random numbers*/
         if(rnd.Table)
             free_random_numbers(&rnd);
diff --git a/platform-options/Options.mk.macos b/platform-options/Options.mk.macos
index 4e646ae7..5b076bf5 100644
--- a/platform-options/Options.mk.macos
+++ b/platform-options/Options.mk.macos
@@ -7,9 +7,6 @@ OPTIMIZE += -fno-common -fopenmp
 # Find the sdk path on Mac
 OPT += -isysroot $(shell xcrun -sdk macosx --show-sdk-path)
 
-GSL_INCL = $(shell pkg-config --cflags gsl)
-GSL_LIBS = $(shell pkg-config --libs gsl)
-
 OPT += -DVALGRIND     # allow debugging with valgrind, disable the GADGET memory allocator.
 #OPT += -DDEBUG      # print a lot of debugging messages
 #Use alternative OpenMP locks, instead of the pthread spinlocks. Required on mac.
diff --git a/platform-options/Options.mk.scanbuild b/platform-options/Options.mk.scanbuild
index c794faeb..43e07ca9 100644
--- a/platform-options/Options.mk.scanbuild
+++ b/platform-options/Options.mk.scanbuild
@@ -5,8 +5,6 @@ MPICC=$(CC) -I/usr/include/openmpi-x86_64 -L/usr/lib64/openmpi/lib -lmpi
 OPTIMIZE =  -fopenmp -O0 -std=gnu99 -g -Wall -Wextra -Wno-unused-parameter -Wvla
 
 #--------------------------------------- Basic operation mode of code
-#OPT += -DLIGHTCONE                       # write a lightcone on the fly; in development
 #OPT += -DNO_OPENMP_SPINLOCK
 OPT += -DVALGRIND     # allow debugging with valgrind, disable the GADGET memory allocator.
 OPT += -DDEBUG      # print a lot of debugging messages
-OPT += -DEXCUR_REION  # reionization with excursion set
diff --git a/platform-options/Options.mk.travis b/platform-options/Options.mk.travis
index 0e0235cf..ab4aa2a4 100644
--- a/platform-options/Options.mk.travis
+++ b/platform-options/Options.mk.travis
@@ -1,6 +1,4 @@
 OPTIMIZE =  -fopenmp -O2 -g -std=gnu99
-GSL_INCL = 
-GSL_LIBS = -lgsl -lgslcblas
 AR=ar
 SHELL = /bin/bash
 
diff --git a/platform-options/Options.mk.vista b/platform-options/Options.mk.vista
index ce9934bd..aeabbda7 100644
--- a/platform-options/Options.mk.vista
+++ b/platform-options/Options.mk.vista
@@ -1,7 +1,5 @@
 #These variables are set to useful defaults, but may be overriden if needed
 #MPICC=mpicc
-GSL_LIBS=-L$(TACC_GSL_LIB) -lgsl -lgslcblas
-GSL_INCL=-I$(TACC_GSL_INC)
 #This is a good optimized build default for nvc
 OPTIMIZE =  -mp -g -Wall -fast
 #This is a good non-optimized default for debugging
@@ -12,9 +10,3 @@ OPTIMIZE =  -mp -g -Wall -fast
 #OPT += -DDEBUG      # print a lot of debugging messages
 #Disable openmp locking. This means no threading.
 #OPT += -DNO_OPENMP_SPINLOCK
-
-#-----------
-#OPT += -DEXCUR_REION  # reionization with excursion set
-
-#--------- CFITSIO (required only for saving potential plane files)
-# OPT += -DUSE_CFITSIO

From d84bb4f8528866a808e1c4f47bb4d05c6da95399 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 21:56:47 -0700
Subject: [PATCH 062/120] Remove gsl from build files

---
 maintainer/build-MPGadget.sh | 1 -
 maintainer/conda-env.yaml    | 1 -
 2 files changed, 2 deletions(-)

diff --git a/maintainer/build-MPGadget.sh b/maintainer/build-MPGadget.sh
index 6daec9b0..dfee5fe9 100644
--- a/maintainer/build-MPGadget.sh
+++ b/maintainer/build-MPGadget.sh
@@ -59,7 +59,6 @@ function build {
         cp platform-options/Options.mk.$host Options.mk
         ./bootstrap.sh
     fi
-    module load gsl
     make
 
     popd
diff --git a/maintainer/conda-env.yaml b/maintainer/conda-env.yaml
index 7e1267c2..1d7bd6d2 100644
--- a/maintainer/conda-env.yaml
+++ b/maintainer/conda-env.yaml
@@ -13,4 +13,3 @@ dependencies:
   # Below are for building MP-Gadget
   - mpich
   - gcc_linux-64=9.3.0   # Pin to latest version compatible with mpich 3.3.2: https://github.com/AnacondaRecipes/mpich-feedstock/issues/5
-  - gsl

From a03aa9d02ad826efa9ad841806a607f33eab47b8 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 22:01:35 -0700
Subject: [PATCH 063/120] Some last GSL cleanups in the build system

---
 Makefile.rules                       |  8 +++-----
 Options.mk.example                   |  2 --
 README.rst                           | 26 ++------------------------
 depends/bigfile/CMakeLists.txt       |  1 -
 depends/bigfile/utils/CMakeLists.txt | 11 -----------
 libgadget/run.c                      |  4 ++--
 libgadget/tests/test_cosmology.c     |  1 -
 libgadget/utils/system.h             |  1 -
 8 files changed, 7 insertions(+), 47 deletions(-)

diff --git a/Makefile.rules b/Makefile.rules
index 22c9b4c9..402a635d 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -5,8 +5,6 @@ MPICC ?= mpicc
 LOW_PRECISION ?= double
 
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
-GSL_INCL ?= $(shell pkg-config --cflags gsl)
-GSL_LIBS ?= $(shell pkg-config --libs gsl)
 ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
     # If found, set FITSIO_INCL with the cfitsio flags
     FITSIO_INCL ?= $(shell pkg-config --cflags cfitsio)
@@ -23,7 +21,7 @@ endif
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(GSL_INCL) $(FITSIO_INCL) $(CUDA_INCL)
+CFLAGS = $(OPTIONS) $(FITSIO_INCL) $(CUDA_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
@@ -31,7 +29,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
 BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
-LIBS  = -lm -lboost_system -lboost_math_c99 $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
+LIBS  = -lm -lboost_system -lboost_math_c99 $(FITSIO_LIBS) $(CUDA_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 
@@ -43,4 +41,4 @@ V ?= 0
 
 # Rule to compile .cu files (using nvcc)
 .objs/%.o: %.cu
-	$(NVCC) $(NVOPTIMIZE) -c $< -o $@
\ No newline at end of file
+	$(NVCC) $(NVOPTIMIZE) -c $< -o $@
diff --git a/Options.mk.example b/Options.mk.example
index a5d3eefe..65eea878 100644
--- a/Options.mk.example
+++ b/Options.mk.example
@@ -5,8 +5,6 @@
 #NVCC=nvcc
 #NVOPTIMIZE = -O3 -arch=sm_61 # specify architecture according to you GPU model, sm_90 shall be used for Vista's H100
 
-#GSL_LIBS=
-#GSL_INCL=
 #This is a good optimized build default for gcc
 OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math
 #This is a good non-optimized default for debugging
diff --git a/README.rst b/README.rst
index 17c4af28..5aaf5d37 100644
--- a/README.rst
+++ b/README.rst
@@ -12,7 +12,7 @@ Description
 
 This version of Gadget is derived from main P-Gadget / Gadget-2, with the gravity solver algorithm from Gadget-4.
 It is the source code used to run the BlueTides and ASTRID simulations (http://bluetides-project.org).
-MP-Gadget requires GSL and a C compiler with OpenMP 4.5 support.
+MP-Gadget requires a C++ compiler with OpenMP 4.5 support.
 
 The infrastructure is heavily reworked. As a summary:
 
@@ -50,17 +50,6 @@ First time users:
     make -j
 
 The Makefile will automatically copy Options.mk.example to Options.mk. The default compile flags are appropriate for a linux using gcc, but may not be optimal.
-We will need gsl. On HPC systems with the modules command,
-usually it can be loaded with
-
-.. code:: bash
-
-    module load gsl
-
-    env | grep GSL  # check if GSL path is reasonable
-
-On a common PC/Linux system, refer to your package vendor how to
-install gsl and gsl-devel.
 
 If you wish to perform compile-time customisation (to, eg, change optimizations or use different compilers), you need an Options.mk file. The initial defaults are stored in Options.mk.example.
 
@@ -81,8 +70,6 @@ Compile-time options may be set in Options.mk. The remaining compile time option
 - EXCUR_REION enables the excursion set reionization model.
 - USE_CFITSIO enables the output of lenstools compatible potential planes using cfitsio,
 
-If compilation fails with errors related to the GSL, you may also need to set the GSL_INC or GSL_LIB variables in Options.mk to the filesystem path containing the GSL headers and libraries.
-
 To run a N-Body sim, use IC files with no gas particles.
 
 Now we are ready to build
@@ -140,15 +127,6 @@ Refer to https://github.com/rainwoodman/bigfile for usage.
 Otherwise directly open the blocks with Fortran or C, noting the data-type
 information and attributes in header and attrs files (in plain text)
 
-GLIBC 2.22
-----------
-
-Cray updated their GLIBC to 2.22+ recently.
-A good move but it happens to be a buggy version of GLIBC:
-https://sourceware.org/bugzilla/show_bug.cgi?id=19590
-causing non-existing symbols like `_ZGVcN4v___log_finite`.
-Adding `-lmvec -lmvec_nonshared` to GSL_LIBS works around the issue.
-
 Bigfile
 -------
 
@@ -192,7 +170,7 @@ For usage of the code, here is a DOI for this repository that you can cite
 Licence
 -------
 
-MP-Gadget is distributed under the terms of a 3-clause BSD license or the GNU General Public License v2 or later, at the option of the user. The use of PFFT and GSL libraries usually forces distribution under the terms of the GNU General Public License v3.
+MP-Gadget is distributed under the terms of a 3-clause BSD license or the GNU General Public License v2 or later, at the option of the user.
 
 Status
 ------
diff --git a/depends/bigfile/CMakeLists.txt b/depends/bigfile/CMakeLists.txt
index 48527264..843bc005 100644
--- a/depends/bigfile/CMakeLists.txt
+++ b/depends/bigfile/CMakeLists.txt
@@ -4,7 +4,6 @@ project(bigfile)
 
 # Finding optional dependencies
 find_package(MPI)
-find_package(GSL)
 
 # Add library subdirectoy
 add_subdirectory(src)
diff --git a/depends/bigfile/utils/CMakeLists.txt b/depends/bigfile/utils/CMakeLists.txt
index ce157662..540787e6 100644
--- a/depends/bigfile/utils/CMakeLists.txt
+++ b/depends/bigfile/utils/CMakeLists.txt
@@ -55,17 +55,6 @@ if(${MPI_C_FOUND})
     
     install(TARGETS bigfile-copy-mpi bigfile-iosim
             RUNTIME DESTINATION bin)
-    
-    if(${GSL_FOUND})
-        include_directories(${GSL_INCLUDE_DIRS})
-
-        # bigfile-sample-mpi
-        add_executable(bigfile-sample-mpi bigfile-sample-mpi.c)
-        target_link_libraries(bigfile-sample-mpi bigfile-mpi bigfile ${GSL_LIBRARIES} ${MPI_C_LIBRARIES})
-        
-        install(TARGETS bigfile-sample-mpi
-                RUNTIME DESTINATION bin)
-    endif()
 endif()
 
 # Install bash scripts
diff --git a/libgadget/run.c b/libgadget/run.c
index 737b18c8..d40075df 100644
--- a/libgadget/run.c
+++ b/libgadget/run.c
@@ -390,8 +390,8 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
 
         /* We need to re-seed the random number table each timestep.
          * The seed needs to be the same on all processors, and a different
-         * value each timestep. Only the lowest 32 bits are used in the GSL
-         * random number generator. The populated part of the timestep hierarchy
+         * value each timestep. Only the lowest 32 bits are used in some
+         * random number generators. The populated part of the timestep hierarchy
          * is added to the random seed. The current snapshot is folded into
          * bits 32 - 23 so that the random tables do not cycle after every snapshot.
          * We may still cycle after 512 snapshots but that should be far enough apart. */
diff --git a/libgadget/tests/test_cosmology.c b/libgadget/tests/test_cosmology.c
index a6d05f96..c1913927 100644
--- a/libgadget/tests/test_cosmology.c
+++ b/libgadget/tests/test_cosmology.c
@@ -82,7 +82,6 @@ static void test_cosmology(void ** state)
     assert_true(fabs(GrowthFactor(&CP, 0.01,0.001) - radgrow(0.01, CP.OmegaG)/radgrow(0.001, CP.OmegaG))< 1e-3);
 
     //Check against exact solutions from gr-qc/0504089: No radiation!
-    //Note that the GSL hyperg needs the last argument to be < 1
     double omegam = 0.5;
     setup_cosmology(&CP, omegam, 0.0455, 0.7);
     CP.RadiationOn = 0;
diff --git a/libgadget/utils/system.h b/libgadget/utils/system.h
index 0e5d7432..4c19854c 100644
--- a/libgadget/utils/system.h
+++ b/libgadget/utils/system.h
@@ -31,7 +31,6 @@ double get_physmem_bytes(void);
  * independent of processor.*/
 double get_random_number(const uint64_t id, const RandTable * const rnd);
 /* Generate the random number table. The seed should be the same on each processor so the output is invariant to
- * To quote the GSL documentation: 'Note that the most generators only accept 32-bit seeds, with higher values being reduced modulo 2^32.'
  * It is important that each timestep uses a new seed value, so the seed should change by less than 2^32 each timestep.
  * The random number table is heap-allocated high, and random numbers are uniform doubles between 0 and 1.*/
 RandTable set_random_numbers(uint64_t seed, const size_t rndtablesize);

From d6393039c55c7654178754a06ed08d90f1e90660 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 22:15:27 -0700
Subject: [PATCH 064/120] Change default mpicc to mpic++

---
 Makefile.rules                    | 2 +-
 platform-options/Options.mk.vista | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/Makefile.rules b/Makefile.rules
index 402a635d..8099dea2 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -1,7 +1,7 @@
 # vim: set ft=make:
 #
 AR ?= ar
-MPICC ?= mpicc
+MPICC ?= mpic++
 LOW_PRECISION ?= double
 
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
diff --git a/platform-options/Options.mk.vista b/platform-options/Options.mk.vista
index aeabbda7..28c4d944 100644
--- a/platform-options/Options.mk.vista
+++ b/platform-options/Options.mk.vista
@@ -1,5 +1,4 @@
 #These variables are set to useful defaults, but may be overriden if needed
-#MPICC=mpicc
 #This is a good optimized build default for nvc
 OPTIMIZE =  -mp -g -Wall -fast
 #This is a good non-optimized default for debugging

From a1b2ac64cc323218c0f21511f9781f13325112a0 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 22:16:30 -0700
Subject: [PATCH 065/120] Adjust comment

---
 libgenic/power.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgenic/power.c b/libgenic/power.c
index a6d2aa9c..e959970b 100644
--- a/libgenic/power.c
+++ b/libgenic/power.c
@@ -484,7 +484,7 @@ double TopHatSigma2(double R)
 
   /* note: 500/R is here chosen as integration boundary (infinity) */
   result = tanh_sinh_integrate_adaptive(integrand, 0, 500. / R, &abserr, 1e-4, 0.);
-/*   printf("gsl_integration_qng in TopHatSigma2. Result %g, error: %g, intervals: %lu\n",result, abserr,w->size); */
+/*   printf("integration in TopHatSigma2. Result %g, error: %g, intervals: %lu\n",result, abserr,w->size); */
   return result;
 }
 

From 55a321a6078491f22c6ccce7959281f443b01dad Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 22:27:40 -0700
Subject: [PATCH 066/120] Try to fix travis C++ build

---
 maintainer/conda-env.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/maintainer/conda-env.yaml b/maintainer/conda-env.yaml
index 1d7bd6d2..428e50d1 100644
--- a/maintainer/conda-env.yaml
+++ b/maintainer/conda-env.yaml
@@ -11,5 +11,5 @@ dependencies:
   - numpy
   - configobj
   # Below are for building MP-Gadget
-  - mpich
-  - gcc_linux-64=9.3.0   # Pin to latest version compatible with mpich 3.3.2: https://github.com/AnacondaRecipes/mpich-feedstock/issues/5
+  - openmpi
+  - gxx_linux-64

From ced2fa2815ee467bdee9de548facaaa92c78c26e Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 22:32:05 -0700
Subject: [PATCH 067/120] Still need C compiler for bigfile

---
 maintainer/conda-env.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/maintainer/conda-env.yaml b/maintainer/conda-env.yaml
index 428e50d1..296a0dc1 100644
--- a/maintainer/conda-env.yaml
+++ b/maintainer/conda-env.yaml
@@ -13,3 +13,4 @@ dependencies:
   # Below are for building MP-Gadget
   - openmpi
   - gxx_linux-64
+  - gcc_linux-64

From f6923330978a8411f62c8a1fbd8fb35c53bcfb83 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 22:48:42 -0700
Subject: [PATCH 068/120] Install more c++ packages

---
 .github/workflows/main.yaml | 3 ++-
 maintainer/conda-env.yaml   | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 36145a85..962b0efb 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -20,7 +20,8 @@ jobs:
 
     - name: Checkout source code
       uses: actions/checkout@v4
-
+    - name: Install MPI
+          sudo apt install -y -q build-essential mpich libmpich-dev
     - name: Cache conda
       uses: actions/cache@v4
       env:
diff --git a/maintainer/conda-env.yaml b/maintainer/conda-env.yaml
index 296a0dc1..2d267100 100644
--- a/maintainer/conda-env.yaml
+++ b/maintainer/conda-env.yaml
@@ -12,5 +12,6 @@ dependencies:
   - configobj
   # Below are for building MP-Gadget
   - openmpi
+  - compilers
   - gxx_linux-64
   - gcc_linux-64

From 6638fe1059f32b47b47d569701608b1eb4c80868 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 22:50:57 -0700
Subject: [PATCH 069/120] Fix github workflow

---
 .github/workflows/main.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 962b0efb..5885eb15 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -21,7 +21,7 @@ jobs:
     - name: Checkout source code
       uses: actions/checkout@v4
     - name: Install MPI
-          sudo apt install -y -q build-essential mpich libmpich-dev
+      run: sudo apt install -y -q build-essential mpich libmpich-dev
     - name: Cache conda
       uses: actions/cache@v4
       env:

From a8e958e5018835637cf4cc1ed7b25735c882d882 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 22:58:20 -0700
Subject: [PATCH 070/120] CI: install python via apt; use mpich and reorder conda env

---
 .github/workflows/main.yaml |  2 +-
 maintainer/conda-env.yaml   | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 5885eb15..52af609a 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -21,7 +21,7 @@ jobs:
     - name: Checkout source code
       uses: actions/checkout@v4
     - name: Install MPI
-      run: sudo apt install -y -q build-essential mpich libmpich-dev
+      run: sudo apt install -y -q build-essential mpich libmpich-dev python
     - name: Cache conda
       uses: actions/cache@v4
       env:
diff --git a/maintainer/conda-env.yaml b/maintainer/conda-env.yaml
index 2d267100..6f8c493f 100644
--- a/maintainer/conda-env.yaml
+++ b/maintainer/conda-env.yaml
@@ -4,14 +4,14 @@ channels:
   - bccp
 
 dependencies:
+  # Below are for building MP-Gadget
+  - gxx_linux-64
+  - gcc_linux-64
+  - mpich
+  # For the tests
   - python
+  - numpy
+  - matplotlib
   - mpi4py
   - nbodykit
-  - matplotlib
-  - numpy
   - configobj
-  # Below are for building MP-Gadget
-  - openmpi
-  - compilers
-  - gxx_linux-64
-  - gcc_linux-64

From 57da356a2e381f34106c4fb3cd6d58cc6a423a51 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Fri, 4 Oct 2024 23:02:20 -0700
Subject: [PATCH 071/120] CI: install python3 and python-is-python3 instead of python

---
 .github/workflows/main.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 52af609a..f9622c4c 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -21,7 +21,7 @@ jobs:
     - name: Checkout source code
       uses: actions/checkout@v4
     - name: Install MPI
-      run: sudo apt install -y -q build-essential mpich libmpich-dev python
+      run: sudo apt install -y -q build-essential mpich libmpich-dev python-is-python3 python3
     - name: Cache conda
       uses: actions/cache@v4
       env:

From 7ed64bf60ec0dcff9e2264bf35b7e69738be79ff Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Sat, 5 Oct 2024 08:07:37 -0700
Subject: [PATCH 072/120] Restore the gcc version pin for mpich as it is still
 necessary

---
 maintainer/conda-env.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/maintainer/conda-env.yaml b/maintainer/conda-env.yaml
index 6f8c493f..e36a081e 100644
--- a/maintainer/conda-env.yaml
+++ b/maintainer/conda-env.yaml
@@ -5,9 +5,10 @@ channels:
 
 dependencies:
   # Below are for building MP-Gadget
-  - gxx_linux-64
-  - gcc_linux-64
+  # Pin to latest version compatible with mpich 3.3.2: https://github.com/AnacondaRecipes/mpich-feedstock/issues/5
   - mpich
+  - gxx_linux-64=9.3.0
+  - gcc_linux-64=9.3.0
   # For the tests
   - python
   - numpy

From 5abc6b70029b2fe8e5665f922a5c6cd4e0aca8e7 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Sat, 5 Oct 2024 15:34:11 -0700
Subject: [PATCH 073/120] Add boost

---
 maintainer/conda-env.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/maintainer/conda-env.yaml b/maintainer/conda-env.yaml
index e36a081e..40627ccb 100644
--- a/maintainer/conda-env.yaml
+++ b/maintainer/conda-env.yaml
@@ -9,6 +9,7 @@ dependencies:
   - mpich
   - gxx_linux-64=9.3.0
   - gcc_linux-64=9.3.0
+  - boost
   # For the tests
   - python
   - numpy

From 37428403c26e9bf4526ef388ba9e2bad49c9c3ae Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 15:22:48 -0500
Subject: [PATCH 074/120] modified lib to add Boost path

keep upstream Makefile.rules
---
 Makefile       |  3 ++-
 Makefile.rules | 11 +++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 322699bb..802532f8 100644
--- a/Makefile
+++ b/Makefile
@@ -8,12 +8,13 @@ include Makefile.version
 FILES = $(shell git ls-files)
 
 all: $(CONFIG)
+	@echo "=================$(BOOST_LIBS)======================="
+	@echo "=================$(GSL_LIBS)======================="
 	cd depends; $(MAKE)
 	cd libgadget; $(MAKE)
 	cd libgenic; $(MAKE)
 	cd gadget; $(MAKE)
 	cd genic; $(MAKE)
-
 clean :
 	cd libgadget; $(MAKE) clean
 	cd libgenic; $(MAKE) clean
diff --git a/Makefile.rules b/Makefile.rules
index 8099dea2..db579939 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -5,6 +5,13 @@ MPICC ?= mpic++
 LOW_PRECISION ?= double
 
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
+GSL_INCL ?= $(shell pkg-config --cflags gsl)
+GSL_LIBS ?= $(shell pkg-config --libs gsl)
+#BOOST_INCL ?= $(shell pkg-config --cflags boost)
+#BOOST_LIBS ?= $(shell pkg-config --libs boost)
+all:
+	@echo "=================$(BOOST_LIBS)======================="
+
 ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
     # If found, set FITSIO_INCL with the cfitsio flags
     FITSIO_INCL ?= $(shell pkg-config --cflags cfitsio)
@@ -21,7 +28,7 @@ endif
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(FITSIO_INCL) $(CUDA_INCL)
+CFLAGS = $(OPTIONS) $(GSL_INCL) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
@@ -29,7 +36,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
 BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
-LIBS  = -lm -lboost_system -lboost_math_c99 $(FITSIO_LIBS) $(CUDA_LIBS)
+LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 

From 12dcd6b008f559a9b93eabaa20e3b049c66b4963 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 16:30:39 -0400
Subject: [PATCH 075/120] changed pfft calls/vars to cufft

---
 libgadget/petapm.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 2a3a45c8..97a47dd9 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -1,6 +1,6 @@
 #ifndef __PETAPM_H__
 #define __PETAPM_H__
-#include <pfft.h>
+#include <cufftmp.h>   // NC:library change
 
 #include "powerspectrum.h"
 
@@ -49,8 +49,8 @@ typedef struct PetaPMPriv {
     /* These varibles are initialized by petapm_init*/
 
     int fftsize;
-    pfft_plan plan_forw;
-    pfft_plan plan_back;
+    cufftmpHandle_t plan_forw; // NC:change plan function call
+    cufftmpHandle_t plan_back;
     MPI_Comm comm_cart_2d;
 
     /* these variables are allocated every force calculation */
@@ -99,7 +99,7 @@ typedef struct {
     size_t offset_fesc; //offset in fof groups to fof mass
 } PetaPMReionPartStruct;
 
-typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value); //NC:change to complex type
 typedef void (*petapm_readout_func)(PetaPM * pm, int i, double * mesh, double weight);
 typedef PetaPMRegion * (*petapm_prepare_func)(PetaPM * pm, PetaPMParticleStruct * pstruct, void * data, int *Nregions);
 
@@ -142,13 +142,13 @@ PetaPMRegion * petapm_force_init(PetaPM * pm,
         PetaPMParticleStruct * pstruct,
         int * Nregions,
         void * userdata);
-pfft_complex * petapm_force_r2c(PetaPM * pm,
+cufftComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
-        );
+        ); // NC: changed returned complex type
 void petapm_force_c2r(PetaPM * pm,
-        pfft_complex * rho_k, PetaPMRegion * regions,
+        cufftComplex * rho_k, PetaPMRegion * regions,
         const int Nregions,
-        PetaPMFunctions * functions);
+        PetaPMFunctions * functions); // NC: changed input complex type
 void petapm_force_finish(PetaPM * pm);
 
 PetaPMRegion * petapm_get_fourier_region(PetaPM * pm);
@@ -156,7 +156,7 @@ PetaPMRegion * petapm_get_real_region(PetaPM * pm);
 int petapm_mesh_to_k(PetaPM * pm, int i);
 int *petapm_get_thistask2d(PetaPM * pm);
 int *petapm_get_ntask2d(PetaPM * pm);
-pfft_complex * petapm_alloc_rhok(PetaPM * pm);
+cufftComplex * petapm_alloc_rhok(PetaPM * pm); // NC: changed returned complex type
 
 void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
         petapm_prepare_func prepare,

From 9708be43d186b92d6529a6285f6a79bbabb07925 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 16:32:03 -0400
Subject: [PATCH 076/120] changed pfft calls/vars to cufft in petapm.c

---
 libgadget/petapm.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index aeda7bb3..c1b8e12d 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -46,10 +46,10 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 static MPI_Datatype MPI_PENCIL;
 
 /*Used only in MP-GenIC*/
-pfft_complex *
+cufftComplex *
 petapm_alloc_rhok(PetaPM * pm)
 {
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
     memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
     return rho_k;
 }
@@ -174,8 +174,8 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     /* planning the fft; need temporary arrays */
 
     double * real = (double * ) mymalloc("PMreal", pm->priv->fftsize * sizeof(double));
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
-    pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
+    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
 
     pm->priv->plan_forw = pfft_plan_dft_r2c_3d(
         n, real, rho_k, pm->priv->comm_cart_2d, PFFT_FORWARD,
@@ -237,8 +237,8 @@ typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
 static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
 /* apply transfer function to value, kpos array is in x, y, z order */
 static void pm_apply_transfer_function(PetaPM * pm,
-        pfft_complex * src,
-        pfft_complex * dst, petapm_transfer_func H);
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H);
 
 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
 static void put_star_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
@@ -279,7 +279,7 @@ petapm_force_init(
     return regions;
 }
 
-pfft_complex * petapm_force_r2c(PetaPM * pm,
+cufftComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ) {
     /* call pfft rho_k is CFT of rho */
@@ -299,11 +299,11 @@ pfft_complex * petapm_force_r2c(PetaPM * pm,
     walltime_measure("/PMgrav/Verify");
 #endif
 
-    pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
     pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
     myfree(real);
 
-    pfft_complex * rho_k = (pfft_complex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
 
     /*Do any analysis that may be required before the transfer function is applied*/
     petapm_transfer_func global_readout = global_functions->global_readout;
@@ -322,7 +322,7 @@ pfft_complex * petapm_force_r2c(PetaPM * pm,
 
 void
 petapm_force_c2r(PetaPM * pm,
-        pfft_complex * rho_k,
+        cufftComplex * rho_k,
         PetaPMRegion * regions,
         const int Nregions,
         PetaPMFunctions * functions)
@@ -333,7 +333,7 @@ petapm_force_c2r(PetaPM * pm,
         petapm_transfer_func transfer = f->transfer;
         petapm_readout_func readout = f->readout;
 
-        pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+        cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
         /* apply the greens function turn rho_k into potential in fourier space */
         pm_apply_transfer_function(pm, rho_k, complx, transfer);
         walltime_measure("/PMgrav/calc");
@@ -366,7 +366,7 @@ void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
         void * userdata) {
     int Nregions;
     PetaPMRegion * regions = petapm_force_init(pm, prepare, pstruct, &Nregions, userdata);
-    pfft_complex * rho_k = petapm_force_r2c(pm, global_functions);
+    cufftComplex * rho_k = petapm_force_r2c(pm, global_functions);
     if(functions)
         petapm_force_c2r(pm, rho_k, regions, Nregions, functions);
     myfree(rho_k);
@@ -413,7 +413,7 @@ petapm_reion_init(
  * ,after c2r but iteration over the grid, instead of particles */
 void
 petapm_reion_c2r(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        pfft_complex * mass_unfiltered, pfft_complex * star_unfiltered, pfft_complex * sfr_unfiltered,
+        cufftComplex * mass_unfiltered, cufftComplex * star_unfiltered, cufftComplex * sfr_unfiltered,
         PetaPMRegion * regions,
         const int Nregions,
         PetaPMFunctions * functions,
@@ -446,11 +446,11 @@ petapm_reion_c2r(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
         if(use_sfr)pm_sfr->G = R;
 
         //TODO: maybe allocate and free these outside the loop
-        pfft_complex * mass_filtered = (pfft_complex *) mymalloc("mass_filtered", pm_mass->priv->fftsize * sizeof(double));
-        pfft_complex * star_filtered = (pfft_complex *) mymalloc("star_filtered", pm_star->priv->fftsize * sizeof(double));
-        pfft_complex * sfr_filtered;
+        cufftComplex * mass_filtered = (cufftComplex *) mymalloc("mass_filtered", pm_mass->priv->fftsize * sizeof(double));
+        cufftComplex * star_filtered = (cufftComplex *) mymalloc("star_filtered", pm_star->priv->fftsize * sizeof(double));
+        cufftComplex * sfr_filtered;
         if(use_sfr){
-            sfr_filtered = (pfft_complex *) mymalloc("sfr_filtered", pm_sfr->priv->fftsize * sizeof(double));
+            sfr_filtered = (cufftComplex *) mymalloc("sfr_filtered", pm_sfr->priv->fftsize * sizeof(double));
         }
 
         /* apply the filtering at this radius */
@@ -536,9 +536,9 @@ void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
     walltime_measure("/PMreion/comm2");
 
     //using force r2c since this part can be done independently
-    pfft_complex * mass_unfiltered = petapm_force_r2c(pm_mass, global_functions);
-    pfft_complex * star_unfiltered = petapm_force_r2c(pm_star, global_functions);
-    pfft_complex * sfr_unfiltered = NULL;
+    cufftComplex * mass_unfiltered = petapm_force_r2c(pm_mass, global_functions);
+    cufftComplex * star_unfiltered = petapm_force_r2c(pm_star, global_functions);
+    cufftComplex * sfr_unfiltered = NULL;
     if(use_sfr){
         sfr_unfiltered = petapm_force_r2c(pm_sfr, global_functions);
     }
@@ -1088,8 +1088,8 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 #endif
 
 static void pm_apply_transfer_function(PetaPM * pm,
-        pfft_complex * src,
-        pfft_complex * dst, petapm_transfer_func H
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H
         ){
     size_t ip = 0;
 

From 7a9b859c4a3a02a9f429796d8fd56cf492704e42 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 16:40:27 -0400
Subject: [PATCH 077/120] fixed typo in lib include

---
 libgadget/petapm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 97a47dd9..1fa1070c 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -1,6 +1,6 @@
 #ifndef __PETAPM_H__
 #define __PETAPM_H__
-#include <cufftmp.h>   // NC:library change
+#include <cufftMp.h>   // NC:library change
 
 #include "powerspectrum.h"
 

From c06e66ef2cddcae772f70d1507e3ec0a00872223 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Sun, 29 Sep 2024 21:55:49 -0500
Subject: [PATCH 078/120] fixed cufft complex indexing

adopt remote changes
---
 libgadget/gravpm.c | 53 ++++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/libgadget/gravpm.c b/libgadget/gravpm.c
index e4ad9f2c..b2ab750b 100644
--- a/libgadget/gravpm.c
+++ b/libgadget/gravpm.c
@@ -20,11 +20,11 @@ static int pm_mark_region_for_node(int startno, int rid, int * RegionInd, const
 static void convert_node_to_region(PetaPM * pm, PetaPMRegion * r, struct NODE * Nodes);
 
 static int hybrid_nu_gravpm_is_active(int i);
-static void potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void compute_neutrino_power(PetaPM * pm);
-static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_potential(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_force_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_force_y(PetaPM * pm, int i, double * mesh, double weight);
@@ -318,7 +318,9 @@ static void compute_neutrino_power(PetaPM * pm) {
     delta_nu_from_power(ps, GravPM.CP, GravPM.Time, GravPM.TimeIC);
 
     /*Initialize the interpolation for the neutrinos*/
-    ps->nu_spline = new boost::math::interpolators::barycentric_rational<double>(ps->logknu, ps->delta_nu_ratio, ps->nonzero);
+    ps->nu_spline = gsl_interp_alloc(gsl_interp_linear,ps->nonzero);
+    ps->nu_acc = gsl_interp_accel_alloc();
+    gsl_interp_init(ps->nu_spline,ps->logknu,ps->delta_nu_ratio,ps->nonzero);
     /*Zero power spectrum, which is stored with the neutrinos*/
     powerspectrum_zero(ps);
 }
@@ -326,11 +328,11 @@ static void compute_neutrino_power(PetaPM * pm) {
 /* Compute the power spectrum of the fourier transformed grid in value.
  * Store it in the PowerSpectrum structure */
 void
-powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], pfft_complex * const value, const double invwindow, double Nmesh)
+powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], cufftComplex * const value, const double invwindow, double Nmesh)
 {
     if(k2 == 0) {
         /* Save zero mode corresponding to the mean as the normalisation factor.*/
-        PowerSpectrum->Norm = (value[0][0] * value[0][0] + value[0][1] * value[0][1]);
+        PowerSpectrum->Norm = (value[0].x * value[0].x + value[0].y * value[0].y);
         return;
     }
     /* Measure power spectrum: we don't want the zero mode.
@@ -342,7 +344,7 @@ powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3
         int kint=floor(binsperunit*log(k2)/2.);
         int w;
         const double keff = sqrt(kpos[0]*kpos[0]+kpos[1]*kpos[1]+kpos[2]*kpos[2]);
-        const double m = (value[0][0] * value[0][0] + value[0][1] * value[0][1]);
+        const double m = (value[0].x * value[0].x + value[0].y * value[0].y);
         /*Make sure we do not overflow (although this should never happen)*/
         if(kint >= PowerSpectrum->size)
             return;
@@ -360,7 +362,7 @@ powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3
 
 /*Just read the power spectrum, without changing the input value.*/
 void
-measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value) {
+measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value) {
     double f = 1.0;
     /* the CIC deconvolution kernel is
      *
@@ -379,7 +381,7 @@ measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value
 }
 
 static void
-potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
+potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value)
 {
     const double asmth2 = pow((2 * M_PI) * pm->Asmth / pm->Nmesh,2);
     double f = 1.0;
@@ -428,9 +430,10 @@ potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
          *            = (M_cdm + M_nu) * delta_t
          * This is correct for the forces, and gives the right power spectrum,
          * once we multiply PowerSpectrum.Norm by (Omega0 / (Omega0 - OmegaNu))**2 */
-        const double nufac = 1 + ps->nu_prefac * (*ps->nu_spline)(logk2);
-        value[0][0] *= nufac;
-        value[0][1] *= nufac;
+        const double nufac = 1 + ps->nu_prefac * gsl_interp_eval(ps->nu_spline,ps->logknu,
+                                                                       ps->delta_nu_ratio,logk2,ps->nu_acc);
+        value[0].x *= nufac;
+        value[0].y *= nufac;
     }
 
     /*Compute the power spectrum*/
@@ -441,13 +444,13 @@ potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value)
             ps->Norm *= MtotbyMcdm*MtotbyMcdm;
         }
         /* Remove zero mode corresponding to the mean.*/
-        value[0][0] = 0.0;
-        value[0][1] = 0.0;
+        value[0].x = 0.0;
+        value[0].y = 0.0;
         return;
     }
 
-    value[0][0] *= fac;
-    value[0][1] *= fac;
+    value[0].x *= fac;
+    value[0].y *= fac;
 }
 
 /* the transfer functions for force in fourier space applied to potential */
@@ -470,7 +473,7 @@ static int hybrid_nu_gravpm_is_active(int i) {
         return 1;
 }
 
-static void force_transfer(PetaPM * pm, int k, pfft_complex * value) {
+static void force_transfer(PetaPM * pm, int k, cufftComplex * value) {
     double tmp0;
     double tmp1;
     /*
@@ -479,18 +482,18 @@ static void force_transfer(PetaPM * pm, int k, pfft_complex * value) {
      * filter is   i K(w)
      * */
     double fac = -1 * diff_kernel (k * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
-    tmp0 = - value[0][1] * fac;
-    tmp1 = value[0][0] * fac;
-    value[0][0] = tmp0;
-    value[0][1] = tmp1;
+    tmp0 = - value[0].y * fac;
+    tmp1 = value[0].x * fac;
+    value[0].x = tmp0;
+    value[0].y = tmp1;
 }
-static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[0], value);
 }
-static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[1], value);
 }
-static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[2], value);
 }
 static void readout_potential(PetaPM * pm, int i, double * mesh, double weight) {

From ec678acb025802d4f76ea3eeba0a2b626c56e7fc Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 30 Sep 2024 08:43:37 -0500
Subject: [PATCH 079/120] added cufft/cuda libs to Makefile.rules

---
 Makefile.rules | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Makefile.rules b/Makefile.rules
index db579939..56101cbe 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -9,8 +9,6 @@ GSL_INCL ?= $(shell pkg-config --cflags gsl)
 GSL_LIBS ?= $(shell pkg-config --libs gsl)
 #BOOST_INCL ?= $(shell pkg-config --cflags boost)
 #BOOST_LIBS ?= $(shell pkg-config --libs boost)
-all:
-	@echo "=================$(BOOST_LIBS)======================="
 
 ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
     # If found, set FITSIO_INCL with the cfitsio flags
@@ -21,6 +19,10 @@ endif
 ifneq ($(findstring -DUSE_CUDA, $(OPT)),)
     CUDA_INCL ?= 
     CUDA_LIBS ?= -lcudart
+    CUFFTMP_INCL ?= 
+    CUFFTMP_LIBS ?= -lcufftMp
+    NVSHMEM_INCL ?= 
+    NVSHMEM_LIBS ?= -lnvshmem_host
     NVCC ?= nvcc
     NVOPTIMIZE ?= -O3
 endif
@@ -28,7 +30,7 @@ endif
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(GSL_INCL) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL)
+CFLAGS = $(OPTIONS) $(GSL_INCL) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
@@ -36,7 +38,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
 BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
-LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS)
+LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS) $(CUFFTMP_LIBS) $(NVSHMEM_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 

From a9277c1d47740dace322a8eb889b771c0cf7b5f5 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 30 Sep 2024 08:44:12 -0500
Subject: [PATCH 080/120] some progress on pfft->cufftmp

---
 libgadget/gravity.h |  4 ++--
 libgadget/petapm.c  | 51 ++++++++++++++++++++++++++++++++-------------
 libgadget/petapm.h  |  4 ++--
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/libgadget/gravity.h b/libgadget/gravity.h
index e5d2dcf1..adeb04e8 100644
--- a/libgadget/gravity.h
+++ b/libgadget/gravity.h
@@ -58,9 +58,9 @@ void grav_short_pair(const ActiveParticles * act, PetaPM * pm, ForceTree * tree,
 void grav_short_tree(const ActiveParticles * act, PetaPM * pm, ForceTree * tree, MyFloat (* AccelStore)[3], double rho0, inttime_t Ti_Current);
 
 /*Read the power spectrum, without changing the input value.*/
-void measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex *value);
+void measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value);
 
 /* Compute the power spectrum of the Fourier transformed grid in value.*/
-void powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], pfft_complex * const value, const double invwindow, double Nmesh);
+void powerspectrum_add_mode(Power * PowerSpectrum, const int64_t k2, const int kpos[3], cufftComplex * const value, const double invwindow, double Nmesh);
 
 #endif
diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index c1b8e12d..fbd6865e 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -90,11 +90,18 @@ int *petapm_get_ntask2d(PetaPM * pm) {
 void
 petapm_module_init(int Nthreads)
 {
-    pfft_init();
+    // CUDA Device Initialization if necessary (optional if only one GPU is used)
+    int device_id = 0;
+    cudaSetDevice(device_id);  // Set the active GPU device
 
-    pfft_plan_with_nthreads(Nthreads);
+    // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
+    #ifdef _OPENMP
+    omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
+    #endif
 
-    /* initialize the MPI Datatype of pencil */
+    // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
+
+    // Initialize the MPI Datatype for the Pencil structure
     MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
     MPI_Type_commit(&MPI_PENCIL);
 }
@@ -131,21 +138,35 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     np[0] = i;
     np[1] = NTask / i;
 
-    message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
-    if( pfft_create_procmesh_2d(comm, np[0], np[1], &pm->priv->comm_cart_2d) ){
-        endrun(0, "Error: This test file only works with %td processes.\n", np[0]*np[1]);
-    }
+message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
 
-    int periods_unused[2];
-    MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
+// Step 1: Create 2D Cartesian grid for the processes
+int dims[2] = {np[0], np[1]};
+int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
+
+// Create 2D Cartesian communicator
+if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
+    endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
+}
+
+// Step 2: Get the Cartesian coordinates of the process in the grid
+int periods_unused[2];
+MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
+
+// Ensure that the task grid matches the expected number of processes
+if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
+    endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
+}
 
-    if(pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1])
-        endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
+// Step 3: Determine local FFT size (adapt this for cuFFTMp if necessary)
+// cuFFTMp might require manual management of the local data size
+// Example: You may need to calculate how much data each process holds based on grid decomposition
 
-    pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
-           PFFT_TRANSPOSED_OUT,
-           pm->real_space_region.size, pm->real_space_region.offset,
-           pm->fourier_space_region.size, pm->fourier_space_region.offset);
+pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d, 
+                                                pm->real_space_region.size, 
+                                                pm->real_space_region.offset, 
+                                                pm->fourier_space_region.size, 
+                                                pm->fourier_space_region.offset);
 
     /*
      * In fourier space, the transposed array is ordered in
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 1fa1070c..e1f93700 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -49,8 +49,8 @@ typedef struct PetaPMPriv {
     /* These varibles are initialized by petapm_init*/
 
     int fftsize;
-    cufftmpHandle_t plan_forw; // NC:change plan function call
-    cufftmpHandle_t plan_back;
+    cufftHandle plan_forw; // NC:change plan function call
+    cufftHandle plan_back;
     MPI_Comm comm_cart_2d;
 
     /* these variables are allocated every force calculation */

From 9a6015ca9f24360b65a7a8a7e0b431f93347c363 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 30 Sep 2024 23:37:21 -0400
Subject: [PATCH 081/120] add cuda stream to petapm struct

---
 libgadget/petapm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index e1f93700..3db41533 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -51,6 +51,7 @@ typedef struct PetaPMPriv {
     int fftsize;
     cufftHandle plan_forw; // NC:change plan function call
     cufftHandle plan_back;
+    cudaStream_t stream;
     MPI_Comm comm_cart_2d;
 
     /* these variables are allocated every force calculation */

From cd2b1ffed6dfd29fa01e00baa8963773f5f4e2e1 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 30 Sep 2024 23:39:47 -0400
Subject: [PATCH 082/120] remove reion stuff; main changes to petapm_init,
 destroy, transfer function

---
 libgadget/petapm-cufft.c | 1029 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 1029 insertions(+)
 create mode 100644 libgadget/petapm-cufft.c

diff --git a/libgadget/petapm-cufft.c b/libgadget/petapm-cufft.c
new file mode 100644
index 00000000..fedf7515
--- /dev/null
+++ b/libgadget/petapm-cufft.c
@@ -0,0 +1,1029 @@
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+/* do NOT use complex.h it breaks the code */
+
+#include "types.h"
+#include "petapm.h"
+
+#include "utils.h"
+#include "walltime.h"
+
+static void
+layout_prepare(PetaPM * pm,
+               struct Layout * L,
+               double * meshbuf,
+               PetaPMRegion * regions,
+               const int Nregions,
+               MPI_Comm comm);
+static void layout_finish(struct Layout * L);
+static void layout_build_and_exchange_cells_to_pfft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
+static void layout_build_and_exchange_cells_to_local(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
+
+/* cell_iterator needs to be thread safe !*/
+typedef void (* cell_iterator)(double * cell_value, double * comm_buffer);
+static void layout_iterate_cells(PetaPM * pm, struct Layout * L, cell_iterator iter, double * real);
+
+struct Pencil { /* a pencil starting at offset, with lenght len */
+    int offset[3];
+    int len;
+    int first;
+    int meshbuf_first; /* first pixel in meshbuf */
+    int task;
+};
+static int pencil_cmp_target(const void * v1, const void * v2);
+static int pos_get_target(PetaPM * pm, const int pos[2]);
+
+/* FIXME: move this to MPIU_. */
+static int64_t reduce_int64(int64_t input, MPI_Comm comm);
+#ifdef DEBUG
+/* for debugging */
+static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize);
+#endif
+
+static MPI_Datatype MPI_PENCIL;
+
+/*Used only in MP-GenIC*/
+cufftComplex *
+petapm_alloc_rhok(PetaPM * pm)
+{
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
+    memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
+    return rho_k;
+}
+
+static void pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions);
+
+static PetaPMParticleStruct * CPS; /* stored by petapm_force, how to access the P array */
+static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access other properties in P, SphP, and Fof */
+#define POS(i) ((double*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_pos]))
+#define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
+#define INACTIVE(i) (CPS->active && !CPS->active(i))
+
+
+PetaPMRegion * petapm_get_fourier_region(PetaPM * pm) {
+    return &pm->fourier_space_region;
+}
+PetaPMRegion * petapm_get_real_region(PetaPM * pm) {
+    return &pm->real_space_region;
+}
+int petapm_mesh_to_k(PetaPM * pm, int i) {
+    /*Return the position of this point on the Fourier mesh*/
+    return i<=pm->Nmesh/2 ? i : (i-pm->Nmesh);
+}
+int *petapm_get_thistask2d(PetaPM * pm) {
+    return pm->ThisTask2d;
+}
+int *petapm_get_ntask2d(PetaPM * pm) {
+    return pm->NTask2d;
+}
+
+void
+petapm_module_init(int Nthreads)
+{
+    // CUDA Device Initialization if necessary (optional if only one GPU is used)
+    int device_id = 0;
+    cudaSetDevice(device_id);  // Set the active GPU device
+
+    // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
+    #ifdef _OPENMP
+    omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
+    #endif
+    // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
+
+    // get rid of pencil type
+    //MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
+    //MPI_Type_commit(&MPI_PENCIL);
+}
+
+void
+petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_Comm comm)
+{
+    /* define the global long / short range force cut */
+    pm->BoxSize = BoxSize;
+    pm->Asmth = Asmth;
+    pm->Nmesh = Nmesh;
+    pm->G = G;
+    pm->CellSize = BoxSize / Nmesh;
+    pm->comm = comm;
+
+
+    int ThisTask;
+    int NTask;
+    MPI_Comm_rank(comm, &ThisTask);
+    MPI_Comm_size(comm, &NTask);
+
+
+    int ndevices;
+    cudaGetDeviceCount(&ndevices);
+    cudaSetDevice(ThisTask % ndevices);
+    printf("Hello from rank %d/%d using GPU %d\n", ThisTask, NTask, ThisTask % ndevices);
+
+    // Logical transform size
+    size_t nx = NTask;      // any value >= NTask is OK
+    size_t ny = NTask;      // any value >= NTask is OK
+    size_t nz = 2 * NTask;  // need to be even and >= NTask
+
+    // We start with Slabs distributed along X (X-Slabs)
+    // Ranks 0 ... (nx % size - 1) own 1 more element in the X dimension
+    // All ranks own all element in the Y and Z dimension
+    // The Z dimension has to be padded to accomodate the (nz / 2 + 1) 
+    // complex numbers assuming an in-place data layout.
+    int ranks_with_onemore = nx % size;
+    size_t my_nx = (nx / size) + (rank < ranks_with_onemore ? 1 : 0);
+    size_t padded_nz = 2 * (nz / 2 + 1);
+
+    // // Local, distributed, data
+    // std::vector<float> data(my_nx * ny * padded_nz, 1.0);
+    // generate_random(data, rank);
+    // std::vector<float> ref = data;
+
+
+
+/********************************not sure if these are useful or not**************************************** */
+    ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
+    ptrdiff_t np[2];
+
+    int ThisTask;
+    int NTask;
+
+    pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
+    pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
+
+    MPI_Comm_rank(comm, &ThisTask);
+    MPI_Comm_size(comm, &NTask);
+
+    /* try to find a square 2d decomposition */
+    int i;
+    int k;
+    for(i = sqrt(NTask) + 1; i >= 0; i --) {
+        if(NTask % i == 0) break;
+    }
+    np[0] = i;
+    np[1] = NTask / i;
+
+message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
+
+// Step 1: Create 2D Cartesian grid for the processes
+int dims[2] = {np[0], np[1]};
+int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
+
+// Create 2D Cartesian communicator
+if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
+    endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
+}
+
+// Step 2: Get the Cartesian coordinates of the process in the grid
+int periods_unused[2];
+MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
+
+// Ensure that the task grid matches the expected number of processes
+if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
+    endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
+}
+
+// Step 3: Determine local FFT size (adapt this for cuFFTMp if necessary)
+// cuFFTMp might require manual management of the local data size
+// Example: You may need to calculate how much data each process holds based on grid decomposition
+
+pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d, 
+                                                pm->real_space_region.size, 
+                                                pm->real_space_region.offset, 
+                                                pm->fourier_space_region.size, 
+                                                pm->fourier_space_region.offset);
+
+    /*
+     * In fourier space, the transposed array is ordered
+     * as (y, z, x). The offsets and sizes returned
+     * by the local size query are in (Nx, Ny, Nz) order, hence we roll them once
+     * so that the strides will give correct linear indexing for
+     * integer coordinates given in order of (y, z, x).
+     * */
+
+#define ROLL(a, N, j) { \
+    typeof(a[0]) tmp[N]; \
+    ptrdiff_t k; \
+    for(k = 0; k < N; k ++) tmp[k] = a[k]; \
+    for(k = 0; k < N; k ++) a[k] = tmp[(k + j)% N]; \
+    }
+
+    ROLL(pm->fourier_space_region.offset, 3, 1);
+    ROLL(pm->fourier_space_region.size, 3, 1);
+
+#undef ROLL
+
+    /* calculate the strides */
+    petapm_region_init_strides(&pm->real_space_region);
+    petapm_region_init_strides(&pm->fourier_space_region);
+
+
+/******************************** end unsure block **************************************** */
+
+    cudaStreamCreate(&pm->priv->stream);
+    cufftCreate(&pm->priv->plan_forw);
+    cufftCreate(&pm->priv->plan_back);
+
+    // Attach the MPI communicator to the plans
+    cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
+    cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
+
+    // Describe the data distribution (only for customized data decomposition, not needed for default slab decomposition)
+    // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
+    // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT and always perform a CUFFT_INVERSE transform
+    // So, in both, the "input" box should be the real box and the "output" box should be the complex box
+
+    // cufftXtSetDistribution(plan_r2c, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
+    // cufftXtSetDistribution(plan_c2r, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
+
+    // Set the stream
+    cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
+    cufftSetStream(pm->priv->plan_back, pm->priv->stream);
+
+    // Make the plan
+    size_t workspace;
+    cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
+    cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
+
+
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
+    cudaLibXtDesc *desc;
+    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_INPLACE);
+    // TODO: what to make of the cpu_data here?
+    cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
+
+    /* now lets fill up the mesh2task arrays */
+
+#if 0
+    message(1, "ThisTask = %d (%td %td %td) - (%td %td %td)\n", ThisTask,
+            pm->real_space_region.offset[0],
+            pm->real_space_region.offset[1],
+            pm->real_space_region.offset[2],
+            pm->real_space_region.size[0],
+            pm->real_space_region.size[1],
+            pm->real_space_region.size[2]);
+#endif
+
+    int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
+    for(k = 0; k < 2; k ++) {
+        for(i = 0; i < Nmesh; i ++) {
+            tmp[i] = 0;
+        }
+        for(i = 0; i < pm->real_space_region.size[k]; i ++) {
+            tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
+        }
+        /* which column / row hosts this tile? */
+        /* FIXME: this is very inefficient */
+        MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
+        /*
+        for(i = 0; i < Nmesh; i ++) {
+            message(0, "Mesh2Task[%d][%d] == %d\n", k, i, Mesh2Task[k][i]);
+        }
+        */
+    }
+    myfree(tmp);
+}
+
+/* Release the cuFFT plans, their CUDA stream, the 2D Cartesian communicator and the Mesh2Task table. */
+void petapm_destroy(PetaPM * pm) {
+    cufftDestroy(pm->priv->plan_forw);
+    cufftDestroy(pm->priv->plan_back);
+    cudaStreamDestroy(pm->priv->stream); /* both plans were bound to this stream; without this call it is never released */
+    MPI_Comm_free(&pm->priv->comm_cart_2d);
+    myfree(pm->Mesh2Task[0]); /* Mesh2Task[1] points into this same allocation */
+}
+
+/*
+ * read out field to particle i. Writing the value need not be thread safe
+ * (particle i is never handled by two threads at once)
+ * */
+typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
+static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
+/* apply transfer function to value, kpos array is in x, y, z order */
+static void pm_apply_transfer_function(PetaPM * pm,
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H);
+
+static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
+/*
+ * 1. calls prepare to build the Regions covering particles
+ * 2. CIC the particles
+ * 3. Transform to rho_k
+ * 4. apply global_transfer (if not NULL) --
+ *       this is the place to fill in gaussian seeds;
+ *       the transfer is stacked onto all following transfers.
+ * 5. for each transfer, readout in functions
+ * 6.    apply transfer from global_transfer -> complex
+ * 7.    transform to real
+ * 8.    readout
+ * 9. free regions
+ * */
+
+/* Build the regions with prepare(), paint every particle onto them and set up the pencil exchange.
+ * Stores pstruct in the file-level CPS; returns the regions and writes their count to *Nregions. */
+PetaPMRegion *
+petapm_force_init(PetaPM * pm, petapm_prepare_func prepare,
+        PetaPMParticleStruct * pstruct, int * Nregions, void * userdata)
+{
+    /* remember how to reach the particle data for the iterators below */
+    CPS = pstruct;
+    *Nregions = 0;
+    /* the caller-supplied callback builds the regions covering the particles */
+    PetaPMRegion * tiles = prepare(pm, pstruct, userdata, Nregions);
+    /* one zeroed slice of the mesh buffer per region */
+    pm_init_regions(pm, tiles, *Nregions);
+    /* deposit each particle on the 8 cells around it */
+    pm_iterate(pm, put_particle_to_mesh, tiles, *Nregions);
+    /* work out which pencils go to which rank */
+    layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, tiles, *Nregions, pm->comm);
+    walltime_measure("/PMgrav/init");
+    return tiles;
+}
+
+/* Copy src to dst over the local Fourier-space region and, when H is not NULL,
+ * call H(pm, k2, pos, &dst[ip]) on every mode with pos holding the wavenumbers in (x, y, z) order. */
+static void pm_apply_transfer_function(PetaPM * pm,
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H)
+{
+    size_t ip = 0;
+    PetaPMRegion * region = &pm->fourier_space_region;
+#pragma omp parallel for
+    for(ip = 0; ip < region->totalsize; ip ++) {
+        ptrdiff_t tmp = ip;
+        int pos[3];
+        int kpos[3];
+        int64_t k2 = 0;
+        int k;
+        for(k = 0; k < 3; k ++) {
+            pos[k] = tmp / region->strides[k];
+            tmp -= pos[k] * region->strides[k];
+            /* turn the local index into the absolute position on the grid */
+            pos[k] += region->offset[k];
+            /* sanity check */
+            if(pos[k] >= pm->Nmesh) {
+                endrun(1, "position didn't make sense\n");
+            }
+            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
+            /* cast before multiplying so the square is accumulated in 64 bits */
+            k2 += ((int64_t)kpos[k]) * kpos[k];
+        }
+        /* fourier space was transposed: kpos is in (y, z, x) order, so
+         * rotate it back to (x, y, z) before handing it to H */
+        pos[0] = kpos[2];
+        pos[1] = kpos[0];
+        pos[2] = kpos[1];
+        /* cufftComplex is a struct, not a 2-element array, so copy it whole */
+        dst[ip] = src[ip];
+        if(H) {
+            H(pm, k2, pos, &dst[ip]);
+        }
+    }
+
+}
+
+cufftComplex * petapm_force_r2c(PetaPM * pm,
+        PetaPMGlobalFunctions * global_functions
+        ) {
+    /* Gather the painted mesh onto the FFT layout, transform it, apply the global functions and return rho_k, the CFT of rho. petapm_force releases rho_k with myfree. */
+
+    /* rho_k is the CFT of rho because
+     *
+     * CFT = DFT * dx **3
+     * CFT[rho] = DFT [rho * dx **3] = DFT[CIC]
+     * */
+    double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    memset(real, 0, sizeof(double) * pm->priv->fftsize); /* received cells are added into real, so it has to start at zero */
+    layout_build_and_exchange_cells_to_pfft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+    walltime_measure("/PMgrav/comm2");
+
+#ifdef DEBUG
+    verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
+    walltime_measure("/PMgrav/Verify");
+#endif
+
+    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+    pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx); /* NOTE(review): pfft_* entry point used with a plan this file creates via cufftCreate, and with a double input buffer — confirm the transform call has been ported */
+    myfree(real);
+
+    cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
+
+    /* Optional global_readout pass and analysis hook that run before the transfer function is applied */
+    petapm_transfer_func global_readout = global_functions->global_readout;
+    if(global_readout)
+        pm_apply_transfer_function(pm, complx, rho_k, global_readout); /* rho_k is only scratch here: the pass below rewrites every element */
+    if(global_functions->global_analysis)
+        global_functions->global_analysis(pm);
+    /* Apply the global transfer function (a plain copy when it is NULL): complx -> rho_k */
+    petapm_transfer_func global_transfer = global_functions->global_transfer;
+    pm_apply_transfer_function(pm, complx, rho_k, global_transfer);
+    walltime_measure("/PMgrav/r2c");
+
+    myfree(complx);
+    return rho_k;
+}
+
+void
+petapm_force_c2r(PetaPM * pm,
+        cufftComplex * rho_k,
+        PetaPMRegion * regions,
+        const int Nregions,
+        PetaPMFunctions * functions)
+{ /* For every entry of functions (the list ends at the first entry whose name is NULL): apply its transfer to rho_k, transform back to real space, ship the cells back to the regions and run its readout on the particles. */
+
+    PetaPMFunctions * f = functions;
+    for (f = functions; f->name; f ++) {
+        petapm_transfer_func transfer = f->transfer;
+        petapm_readout_func readout = f->readout;
+
+        cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+        /* apply this entry's transfer function to rho_k (e.g. the Green's function turning rho_k into the potential in fourier space); rho_k itself is left unchanged */
+        pm_apply_transfer_function(pm, rho_k, complx, transfer);
+        walltime_measure("/PMgrav/calc");
+
+        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+        pfft_execute_dft_c2r(pm->priv->plan_back, complx, real); /* NOTE(review): pfft_* entry point used with a plan this file creates via cufftCreate — confirm the transform call has been ported */
+
+        walltime_measure("/PMgrav/c2r");
+        if(f == functions) // report memory usage once, on the first entry only
+            report_memory_usage("PetaPM");
+        myfree(complx);
+        /* read out the result: this copies real into the mesh buffer and frees real. */
+        layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+        walltime_measure("/PMgrav/comm");
+
+        pm_iterate(pm, readout, regions, Nregions);
+        walltime_measure("/PMgrav/readout");
+    }
+}
+
+void petapm_force_finish(PetaPM * pm) { /* end of one PM pass: releases the exchange layout and the mesh buffer */
+    layout_finish(&pm->priv->layout);
+    myfree(pm->priv->meshbuf); /* NOTE(review): pm_init_regions does not set meshbuf when the regions are NULL or all empty — confirm this is never reached in that case */
+}
+
+/* One complete PM pass: paint particles, transform to rho_k, run every readout in functions (if any), free temporaries. */
+void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
+        PetaPMGlobalFunctions * global_functions, PetaPMFunctions * functions,
+        PetaPMParticleStruct * pstruct, void * userdata)
+{
+    int nreg;
+    PetaPMRegion * tiles = petapm_force_init(pm, prepare, pstruct, &nreg, userdata);
+    cufftComplex * density_k = petapm_force_r2c(pm, global_functions);
+    if(functions != NULL)
+        petapm_force_c2r(pm, density_k, tiles, nreg, functions);
+    myfree(density_k);
+    if(CPS->RegionInd != NULL)
+        myfree(CPS->RegionInd);
+    myfree(tiles);
+    petapm_force_finish(pm);
+}
+
+/******************************************************************************************************************************************** */
+/* build a communication layout */
+
+static void layout_build_pencils(PetaPM * pm, struct Layout * L, double * meshbuf, PetaPMRegion * regions, const int Nregions);
+static void layout_exchange_pencils(struct Layout * L);
+static void
+layout_prepare (PetaPM * pm,
+                struct Layout * L,
+                double * meshbuf,
+                PetaPMRegion * regions,
+                const int Nregions,
+                MPI_Comm comm)
+{ /* Build the list of pencils to send, sort it by destination rank, and exchange counts, displacements and pencil headers with every rank of comm. */
+    int r;
+    int i;
+    int NTask;
+    L->comm = comm;
+
+    MPI_Comm_size(L->comm, &NTask);
+
+    L->ibuffer = (int *) mymalloc("PMlayout", sizeof(int) * NTask * 8); /* one allocation backs the eight per-task arrays below */
+
+    memset(L->ibuffer, 0, sizeof(int) * NTask * 8);
+    L->NpSend = &L->ibuffer[NTask * 0]; /* Np*: pencil counts per task */
+    L->NpRecv = &L->ibuffer[NTask * 1];
+    L->NcSend = &L->ibuffer[NTask * 2]; /* Nc*: cell counts per task */
+    L->NcRecv = &L->ibuffer[NTask * 3];
+    L->DcSend = &L->ibuffer[NTask * 4]; /* Dc*: cell displacements per task */
+    L->DcRecv = &L->ibuffer[NTask * 5];
+    L->DpSend = &L->ibuffer[NTask * 6]; /* Dp*: pencil displacements per task */
+    L->DpRecv = &L->ibuffer[NTask * 7];
+
+    L->NpExport = 0;
+    L->NcExport = 0;
+    L->NpImport = 0;
+    L->NcImport = 0;
+
+    int NpAlloc = 0;
+    /* upper bound on the pencil count: one pencil per (x, y) column of every region */
+    for (r = 0; r < Nregions; r ++) {
+        NpAlloc += regions[r].size[0] * regions[r].size[1];
+    }
+
+    L->PencilSend = (struct Pencil *) mymalloc("PencilSend", NpAlloc * sizeof(struct Pencil));
+
+    layout_build_pencils(pm, L, meshbuf, regions, Nregions);
+
+    /* sort the pencils by the target rank so that each rank's pencils are contiguous */
+    qsort_openmp(L->PencilSend, NpAlloc, sizeof(struct Pencil), pencil_cmp_target);
+    /* the comparator moves zero length pencils to the tail */
+
+    /* now shrink NpExport so that the zero length tail is not sent */
+    L->NpExport = NpAlloc;
+    while(L->NpExport > 0 && L->PencilSend[L->NpExport - 1].len == 0) {
+        L->NpExport --;
+    }
+
+    /* count the pencils and cells to be exported, per target task and in total */
+    int NcExport = 0;
+    for(i = 0; i < L->NpExport; i++) {
+        int task = L->PencilSend[i].task;
+        L->NcSend[task] += L->PencilSend[i].len;
+        NcExport += L->PencilSend[i].len;
+        L->NpSend[task] ++;
+    }
+    L->NcExport = NcExport;
+
+    MPI_Alltoall(L->NpSend, 1, MPI_INT, L->NpRecv, 1, MPI_INT, L->comm); /* tell every rank how many pencils it will receive from us */
+    MPI_Alltoall(L->NcSend, 1, MPI_INT, L->NcRecv, 1, MPI_INT, L->comm); /* and how many cells */
+
+    /* build the displacement arrays as running sums of the counts */
+    L->DpSend[0] = 0; L->DpRecv[0] = 0;
+    L->DcSend[0] = 0; L->DcRecv[0] = 0;
+    for(i = 1; i < NTask; i ++) {
+        L->DpSend[i] = L->NpSend[i - 1] + L->DpSend[i - 1];
+        L->DpRecv[i] = L->NpRecv[i - 1] + L->DpRecv[i - 1];
+        L->DcSend[i] = L->NcSend[i - 1] + L->DcSend[i - 1];
+        L->DcRecv[i] = L->NcRecv[i - 1] + L->DcRecv[i - 1];
+    }
+    L->NpImport = L->DpRecv[NTask -1] + L->NpRecv[NTask -1];
+    L->NcImport = L->DcRecv[NTask -1] + L->NcRecv[NTask -1];
+
+    /* consistency checks: the per-task counts must add up to the export totals, and global exports must equal global imports */
+    if(L->DpSend[NTask - 1] + L->NpSend[NTask -1] != L->NpExport) {
+        endrun(1, "NpExport = %d NpSend=%d DpSend=%d\n", L->NpExport, L->NpSend[NTask -1], L->DpSend[NTask - 1]);
+    }
+    if(L->DcSend[NTask - 1] + L->NcSend[NTask -1] != L->NcExport) {
+        endrun(1, "NcExport = %d NcSend=%d DcSend=%d\n", L->NcExport, L->NcSend[NTask -1], L->DcSend[NTask - 1]);
+    }
+    int64_t totNpAlloc = reduce_int64(NpAlloc, L->comm);
+    int64_t totNpExport = reduce_int64(L->NpExport, L->comm);
+    int64_t totNcExport = reduce_int64(L->NcExport, L->comm);
+    int64_t totNpImport = reduce_int64(L->NpImport, L->comm);
+    int64_t totNcImport = reduce_int64(L->NcImport, L->comm);
+
+    if(totNpExport != totNpImport) {
+        endrun(1, "totNpExport = %ld\n", totNpExport); /* NOTE(review): %ld with int64_t is only correct where long is 64 bits */
+    }
+    if(totNcExport != totNcImport) {
+        endrun(1, "totNcExport = %ld\n", totNcExport);
+    }
+
+    /* exchange the pencil headers; the receive buffer is pre-filled with 0xfc bytes before the exchange */
+    message(0, "PetaPM:  %010ld/%010ld Pencils and %010ld Cells\n", totNpExport, totNpAlloc, totNcExport);
+    L->PencilRecv = (struct Pencil *) mymalloc("PencilRecv", L->NpImport * sizeof(struct Pencil));
+    memset(L->PencilRecv, 0xfc, L->NpImport * sizeof(struct Pencil));
+    layout_exchange_pencils(L);
+}
+
+static void
+layout_build_pencils(PetaPM * pm,
+                     struct Layout * L,
+                     double * meshbuf,
+                     PetaPMRegion * regions,
+                     const int Nregions)
+{
+    /* Fill L->PencilSend with one pencil (a run of cells along the last axis) per (ix, iy) column of every region */
+    int p0 = 0; /* index of the current region's first pencil in L->PencilSend */
+    int r;
+    for (r = 0; r < Nregions; r++) {
+        int ix;
+#pragma omp parallel for private(ix)
+        for(ix = 0; ix < regions[r].size[0]; ix++) {
+            int iy;
+            for(iy = 0; iy < regions[r].size[1]; iy++) {
+                int poffset = ix * regions[r].size[1] + iy;
+                struct Pencil * p = &L->PencilSend[p0 + poffset];
+
+                p->offset[0] = ix + regions[r].offset[0];
+                p->offset[1] = iy + regions[r].offset[1];
+                p->offset[2] = regions[r].offset[2];
+                p->len = regions[r].size[2];
+                p->meshbuf_first = (regions[r].buffer - meshbuf) +
+                    regions[r].strides[0] * ix +
+                    regions[r].strides[1] * iy;
+                /* compress the pencil: trim cells that are exactly zero, first from the tail, then from the head */
+                while((p->len > 0) && (meshbuf[p->meshbuf_first + p->len - 1] == 0.0)) {
+                    p->len --;
+                }
+                while((p->len > 0) && (meshbuf[p->meshbuf_first] == 0.0)) {
+                    p->len --;
+                    p->meshbuf_first++;
+                    p->offset[2] ++;
+                }
+
+                p->task = pos_get_target(pm, p->offset); /* destination rank, looked up from the pencil's first two coordinates */
+            }
+        }
+        p0 += regions[r].size[0] * regions[r].size[1];
+    }
+
+}
+
+static void layout_exchange_pencils(struct Layout * L) { /* Send the pencil headers to their target ranks and set each pencil's 'first' to its position in the cell buffers. */
+    int i;
+    int offset;
+    int NTask;
+    MPI_Comm_size(L->comm, &NTask);
+    /* build the first pointers to refer to the correct relative buffer locations */
+    /* note that the buffer hasn't been assembled yet */
+    offset = 0;
+    for(i = 0; i < NTask; i ++) {
+        int j;
+        struct Pencil * p = &L->PencilSend[offset];
+        if(L->NpSend[i] == 0) continue;
+        p->first = 0; /* relative to the start of the cells sent to task i */
+        for(j = 1; j < L->NpSend[i]; j++) {
+            p[j].first = p[j - 1].first + p[j - 1].len;
+        }
+        offset += L->NpSend[i];
+    }
+
+    MPI_Alltoallv(
+            L->PencilSend, L->NpSend, L->DpSend, MPI_PENCIL,
+            L->PencilRecv, L->NpRecv, L->DpRecv, MPI_PENCIL,
+            L->comm);
+
+    /* set first to point to absolute position in the full import cell buffer */
+    offset = 0;
+    for(i = 0; i < NTask; i ++) {
+        struct Pencil * p = &L->PencilRecv[offset];
+        int j;
+        for(j = 0; j < L->NpRecv[i]; j++) {
+            p[j].first += L->DcRecv[i]; /* shift by where task i's cells start in the import buffer */
+        }
+        offset += L->NpRecv[i];
+    }
+
+    /* set first to point to absolute position in the full export cell buffer */
+    offset = 0;
+    for(i = 0; i < NTask; i ++) {
+        struct Pencil * p = &L->PencilSend[offset];
+        int j;
+        for(j = 0; j < L->NpSend[i]; j++) {
+            p[j].first += L->DcSend[i]; /* shift by where task i's cells start in the export buffer */
+        }
+        offset += L->NpSend[i];
+    }
+}
+
+static void layout_finish(struct Layout * L) { /* frees the three buffers allocated by layout_prepare, in reverse order of their allocation there */
+    myfree(L->PencilRecv);
+    myfree(L->PencilSend);
+    myfree(L->ibuffer); /* backs all eight per-task count / displacement arrays */
+}
+
+/* Cell iterator for the mesh -> FFT direction: add the received buffer value into the
+ * FFT-side cell. Atomic because layout_iterate_cells calls it from an OpenMP parallel loop. */
+static void to_pfft(double * cell, double * buf) {
+#pragma omp atomic update
+    *cell += *buf;
+}
+
+static void
+layout_build_and_exchange_cells_to_pfft(
+        PetaPM * pm,
+        struct Layout * L,
+        double * meshbuf,
+        double * real)
+{ /* Pack the cells of every exported pencil, send them to their target ranks, and add the received cells into real. */
+    L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
+    L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
+
+    int i;
+    int offset;
+
+    /* collect all cells into the send buffer, pencil after pencil in PencilSend order */
+    offset = 0;
+    for(i = 0; i < L->NpExport; i ++) {
+        struct Pencil * p = &L->PencilSend[i];
+        memcpy(L->BufSend + offset, &meshbuf[p->meshbuf_first],
+                sizeof(double) * p->len);
+        offset += p->len;
+    }
+
+    /* exchange the cells: counts and displacements were set up by layout_prepare */
+    MPI_Alltoallv(
+            L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
+            L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
+            L->comm);
+
+#if 0
+    double massExport = 0;
+    for(i = 0; i < L->NcExport; i ++) {
+        massExport += L->BufSend[i];
+    }
+
+    double massImport = 0;
+    for(i = 0; i < L->NcImport; i ++) {
+        massImport += L->BufRecv[i];
+    }
+    double totmassExport;
+    double totmassImport;
+    MPI_Allreduce(&massExport, &totmassExport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
+    MPI_Allreduce(&massImport, &totmassImport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
+    message(0, "totmassExport = %g totmassImport = %g\n", totmassExport, totmassImport);
+#endif
+
+    layout_iterate_cells(pm, L, to_pfft, real); /* to_pfft adds each received cell into real */
+    myfree(L->BufRecv);
+    myfree(L->BufSend);
+}
+
+/* Cell iterator for the FFT -> mesh direction: copy the FFT-side value (cell) into the
+ * communication buffer slot (region), which is then sent back to the domain host. */
+static void to_region(double * cell, double * region) {
+    region[0] = cell[0];
+}
+
+static void
+layout_build_and_exchange_cells_to_local(
+        PetaPM * pm,
+        struct Layout * L,
+        double * meshbuf,
+        double * real)
+{ /* Reverse of the to_pfft exchange: read the pencil cells out of real, send them back to the ranks they came from and store them in meshbuf. Frees real. */
+    L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
+    int i;
+    int offset;
+
+    /*layout_iterate_cells transfers real to L->BufRecv*/
+    layout_iterate_cells(pm, L, to_region, real);
+
+    /*Real is done now: free it before BufSend is allocated*/
+    myfree(real);
+    /*Now allocate BufSend, which is confusingly used to receive data*/
+    L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
+
+    /* exchange cells */
+    /* notice the order is reversed from to_pfft: BufRecv is sent, BufSend receives */
+    MPI_Alltoallv(
+            L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
+            L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
+            L->comm);
+
+    /* distribute BufSend to meshbuf, pencil after pencil in PencilSend order */
+    offset = 0;
+    for(i = 0; i < L->NpExport; i ++) {
+        struct Pencil * p = &L->PencilSend[i];
+        memcpy(&meshbuf[p->meshbuf_first],
+                L->BufSend + offset,
+                sizeof(double) * p->len);
+        offset += p->len;
+    }
+    myfree(L->BufSend);
+    myfree(L->BufRecv);
+}
+
+/* Call iter on every pair of (cell of real, matching cell of L->BufRecv) for all imported pencils.
+ * Pencil coordinates are wrapped periodically into [0, Nmesh) and must then fall inside the local real-space region.
+ * !!! iter has to be thread safe. !!!
+ * */
+static void
+layout_iterate_cells(PetaPM * pm,
+                     struct Layout * L,
+                     cell_iterator iter,
+                     double * real)
+{
+    int i;
+#pragma omp parallel for
+    for(i = 0; i < L->NpImport; i ++) {
+        struct Pencil * p = &L->PencilRecv[i];
+        int k;
+        ptrdiff_t linear0 = 0;
+        for(k = 0; k < 2; k ++) {
+            int ix = p->offset[k];
+            while(ix < 0) ix += pm->Nmesh;
+            while(ix >= pm->Nmesh) ix -= pm->Nmesh;
+            ix -= pm->real_space_region.offset[k];
+            if(ix < 0 || ix >= pm->real_space_region.size[k]) {
+                /* serious problem: the pencil lies outside our region on either side, so the assumed fft layout was wrong */
+                endrun(1, "bad pfft: original k: %d ix: %d, cur ix: %d, region: off %td size %td\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
+            }
+            linear0 += ix * pm->real_space_region.strides[k];
+        }
+        int j;
+        for(j = 0; j < p->len; j ++) {
+            int iz = p->offset[2] + j;
+            while(iz < 0) iz += pm->Nmesh;
+            while(iz >= pm->Nmesh) iz -= pm->Nmesh;
+            if(iz >= pm->real_space_region.size[2]) {
+                /* serious problem: the assumption about the fft layout was wrong */
+                endrun(1, "bad pfft: original iz: %d, cur iz: %d, region: off %td size %td\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
+            }
+            ptrdiff_t linear = iz * pm->real_space_region.strides[2] + linear0;
+            /*
+             * operate on the pencil, either modifying real or BufRecv
+             * */
+            iter(&real[linear], &L->BufRecv[p->first + j]);
+        }
+    }
+}
+
+/* Allocate one zeroed mesh buffer for all regions and give each region its slice; no-op for NULL regions. */
+static void
+pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions)
+{
+    if(!regions) return;
+    size_t ncells = 0;
+    int r;
+    for(r = 0; r < Nregions; r ++)
+        ncells += regions[r].totalsize;
+    pm->priv->meshbufsize = ncells;
+    if(ncells == 0) return;
+    double * mesh = (double *) mymalloc("PMmesh", ncells * sizeof(double));
+    /* zero everything, padding cells included */
+    memset(mesh, 0, ncells * sizeof(double));
+    pm->priv->meshbuf = mesh;
+    size_t used = 0;
+    for(r = 0; r < Nregions; r ++) {
+        regions[r].buffer = mesh + used;
+        used += regions[r].totalsize;
+    }
+}
+
+
+static void
+pm_iterate_one(PetaPM * pm,
+               int i,
+               pm_iterator iterator,
+               PetaPMRegion * regions,
+               const int Nregions)
+{ /* Call iterator on each of the 8 mesh cells surrounding particle i, passing the cell and its interpolation weight. */
+    int k;
+    int iCell[3];  /* integer coordinate on the regional mesh */
+    double Res[3]; /* residual: fractional position inside the cell */
+    double * Pos = POS(i);
+    const int RegionInd = CPS->RegionInd ? CPS->RegionInd[i] : 0; /* without a RegionInd array every particle uses region 0 */
+
+    /* Particles with a negative region (e.g. swallowed particles, region -2) are skipped.*/
+    if(RegionInd < 0)
+        return;
+    /* This should never happen: it is pure paranoia and to avoid icc being crazy*/
+    if(RegionInd >= Nregions)
+        endrun(1, "Particle %d has region %d out of bounds %d\n", i, RegionInd, Nregions);
+
+    PetaPMRegion * region = &regions[RegionInd];
+    for(k = 0; k < 3; k++) {
+        double tmp = Pos[k] / pm->CellSize;
+        iCell[k] = floor(tmp);
+        Res[k] = tmp - iCell[k];
+        iCell[k] -= region->offset[k];
+        /* the particle must lie inside the region; the upper bound is size - 1 because cell iCell + 1 is touched below */
+        if(iCell[k] >= region->size[k] - 1 || iCell[k] < 0) {
+            endrun(1, "particle out of cell better stop %d (k=%d) %g %g %g region: %td %td\n", iCell[k],k,
+                Pos[0], Pos[1], Pos[2],
+                region->offset[k], region->size[k]);
+        }
+    }
+
+    int connection;
+    for(connection = 0; connection < 8; connection++) { /* bit k of connection selects cell iCell[k] or iCell[k] + 1 along axis k */
+        double weight = 1.0;
+        size_t linear = 0;
+        for(k = 0; k < 3; k++) {
+            int offset = (connection >> k) & 1;
+            int tmp = iCell[k] + offset;
+            linear += tmp * region->strides[k];
+            weight *= offset?
+                /* offset == 1*/ (Res[k])    :
+                /* offset == 0*/ (1 - Res[k]);
+        }
+        if(linear >= region->totalsize) {
+            endrun(1, "particle linear index out of cell better stop\n");
+        }
+        iterator(pm, i, &region->buffer[linear], weight);
+    }
+}
+
+/*
+ * Visit every particle / mesh-cell pair and call iterator on it.
+ * The loop over particles is OpenMP-parallel: a particle is only ever handled
+ * by one thread, but two threads may touch the same mesh cell at the same
+ * time, so iterator must be thread safe with respect to the mesh.
+ * */
+static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions)
+{
+    int ipart;
+#pragma omp parallel for
+    for(ipart = 0; ipart < CPS->NumPart; ++ipart)
+        pm_iterate_one(pm, ipart, iterator, regions, Nregions);
+}
+
+/* Fill in row-major strides (last axis contiguous) and the total cell count
+ * from region->size; the buffer pointer is reset to NULL. */
+void petapm_region_init_strides(PetaPMRegion * region) {
+    size_t ncells = 1;
+    int axis;
+    for(axis = 2; axis >= 0; ncells *= region->size[axis], axis--)
+        region->strides[axis] = ncells;
+    region->totalsize = ncells;
+    region->buffer = NULL;
+}
+
+/* Rank hosting the pencil whose first two mesh coordinates are pos (wrapped periodically into [0, Nmesh)). */
+static int pos_get_target(PetaPM * pm, const int pos[2])
+{
+    int coords[2], dest, axis;
+    for(axis = 0; axis < 2; ++axis) {
+        int cell = pos[axis];
+        while(cell < 0) { cell += pm->Nmesh; }
+        while(cell >= pm->Nmesh) { cell -= pm->Nmesh; }
+        coords[axis] = pm->Mesh2Task[axis][cell];
+    }
+    MPI_Cart_rank(pm->priv->comm_cart_2d, coords, &dest);
+    return dest;
+}
+/* qsort comparator: order pencils by target task, then by meshbuf_first; zero length pencils sort last. */
+static int pencil_cmp_target(const void * v1, const void * v2) {
+    const struct Pencil * p1 = (const struct Pencil *) v1;
+    const struct Pencil * p2 = (const struct Pencil *) v2;
+    /* zero length pencils go to the end; two of them compare equal so the ordering stays consistent */
+    if(p1->len == 0 || p2->len == 0) return (p1->len == 0) - (p2->len == 0);
+    int t1 = p1->task;
+    int t2 = p2->task;
+    return ((t2 < t1) - (t1 < t2)) * 2 +
+        ((p2->meshbuf_first < p1->meshbuf_first) - (p1->meshbuf_first < p2->meshbuf_first));
+}
+
+#ifdef DEBUG
+static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize) {
+    /* verify the density field: compare the total mass of the particles, of the region mesh buffer and of the FFT-side real array */
+    double mass_Part = 0;
+    int j;
+#pragma omp parallel for reduction(+: mass_Part)
+    for(j = 0; j < CPS->NumPart; j ++) {
+        double Mass = *MASS(j);
+        mass_Part += Mass;
+    }
+    double totmass_Part = 0;
+    MPI_Allreduce(&mass_Part, &totmass_Part, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
+
+    double mass_Region = 0; /* sum over the first meshsize cells of meshbuf */
+    size_t i;
+
+#pragma omp parallel for reduction(+: mass_Region)
+    for(i = 0; i < meshsize; i ++) {
+        mass_Region += meshbuf[i];
+    }
+    double totmass_Region = 0;
+    MPI_Allreduce(&mass_Region, &totmass_Region, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
+    double mass_CIC = 0; /* sum over the local real-space region of the FFT array */
+#pragma omp parallel for reduction(+: mass_CIC)
+    for(i = 0; i < pm->real_space_region.totalsize; i ++) {
+        mass_CIC += real[i];
+    }
+    double totmass_CIC = 0;
+    MPI_Allreduce(&mass_CIC, &totmass_CIC, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
+
+    message(0, "total Region mass err = %g CIC mass err = %g Particle mass = %g\n", totmass_Region / totmass_Part - 1, totmass_CIC / totmass_Part - 1, totmass_Part); /* errors are relative to the particle mass */
+}
+#endif
+
+
+
+
+/**************
+ * functions iterating over particle / mesh pairs
+ ***************/
+static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
+    const double deposit = weight * (*MASS(i)); /* weighted share of the particle mass for this cell */
+    if(!INACTIVE(i)) {
+#pragma omp atomic update
+        *mesh += deposit; /* atomic: pm_iterate runs particles in parallel and cells can be shared */
+    }
+}
+/* Sum one int64 value over all ranks of comm; every rank receives the total. */
+static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
+    MPI_Allreduce(MPI_IN_PLACE, &input, 1, MPI_INT64, MPI_SUM, comm);
+    return input;
+}
+
+/** Some FFT notes
+ *
+ *
+ * CFT = dx * iDFT (thus CFT has no 2pi factors and iCFT has,
+ *           same as wikipedia.)
+ *
+ * iCFT = dk * DFT
+ * iCFT(CFT) = dx * dk * DFT(iDFT)
+ *           = L / N * (2pi / L) * N
+ *           = 2 pi
+ * which agrees with the usual definition that
+ * iCFT(CFT) = 2pi
+ *
+ * ***************************/

From fb9766c0a45aeb2f7e02b205da20f8ea62ece728 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Tue, 1 Oct 2024 14:28:10 -0400
Subject: [PATCH 083/120] migrate petapm-cufft.c to petapm.c

---
 libgadget/petapm-cufft.c | 1029 --------------------------------------
 libgadget/petapm.c       |  424 +++++-----------
 2 files changed, 126 insertions(+), 1327 deletions(-)
 delete mode 100644 libgadget/petapm-cufft.c

diff --git a/libgadget/petapm-cufft.c b/libgadget/petapm-cufft.c
deleted file mode 100644
index fedf7515..00000000
--- a/libgadget/petapm-cufft.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-/* do NOT use complex.h it breaks the code */
-
-#include "types.h"
-#include "petapm.h"
-
-#include "utils.h"
-#include "walltime.h"
-
-static void
-layout_prepare(PetaPM * pm,
-               struct Layout * L,
-               double * meshbuf,
-               PetaPMRegion * regions,
-               const int Nregions,
-               MPI_Comm comm);
-static void layout_finish(struct Layout * L);
-static void layout_build_and_exchange_cells_to_pfft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
-static void layout_build_and_exchange_cells_to_local(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
-
-/* cell_iterator needs to be thread safe !*/
-typedef void (* cell_iterator)(double * cell_value, double * comm_buffer);
-static void layout_iterate_cells(PetaPM * pm, struct Layout * L, cell_iterator iter, double * real);
-
-struct Pencil { /* a pencil starting at offset, with lenght len */
-    int offset[3];
-    int len;
-    int first;
-    int meshbuf_first; /* first pixel in meshbuf */
-    int task;
-};
-static int pencil_cmp_target(const void * v1, const void * v2);
-static int pos_get_target(PetaPM * pm, const int pos[2]);
-
-/* FIXME: move this to MPIU_. */
-static int64_t reduce_int64(int64_t input, MPI_Comm comm);
-#ifdef DEBUG
-/* for debugging */
-static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize);
-#endif
-
-static MPI_Datatype MPI_PENCIL;
-
-/*Used only in MP-GenIC*/
-cufftComplex *
-petapm_alloc_rhok(PetaPM * pm)
-{
-    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
-    memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
-    return rho_k;
-}
-
-static void pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions);
-
-static PetaPMParticleStruct * CPS; /* stored by petapm_force, how to access the P array */
-static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access other properties in P, SphP, and Fof */
-#define POS(i) ((double*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_pos]))
-#define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
-#define INACTIVE(i) (CPS->active && !CPS->active(i))
-
-
-PetaPMRegion * petapm_get_fourier_region(PetaPM * pm) {
-    return &pm->fourier_space_region;
-}
-PetaPMRegion * petapm_get_real_region(PetaPM * pm) {
-    return &pm->real_space_region;
-}
-int petapm_mesh_to_k(PetaPM * pm, int i) {
-    /*Return the position of this point on the Fourier mesh*/
-    return i<=pm->Nmesh/2 ? i : (i-pm->Nmesh);
-}
-int *petapm_get_thistask2d(PetaPM * pm) {
-    return pm->ThisTask2d;
-}
-int *petapm_get_ntask2d(PetaPM * pm) {
-    return pm->NTask2d;
-}
-
-void
-petapm_module_init(int Nthreads)
-{
-    // CUDA Device Initialization if necessary (optional if only one GPU is used)
-    int device_id = 0;
-    cudaSetDevice(device_id);  // Set the active GPU device
-
-    // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
-    #ifdef _OPENMP
-    omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
-    #endif
-    // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
-
-    // get rid of pencil type
-    //MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
-    //MPI_Type_commit(&MPI_PENCIL);
-}
-
-void
-petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_Comm comm)
-{
-    /* define the global long / short range force cut */
-    pm->BoxSize = BoxSize;
-    pm->Asmth = Asmth;
-    pm->Nmesh = Nmesh;
-    pm->G = G;
-    pm->CellSize = BoxSize / Nmesh;
-    pm->comm = comm;
-
-
-    int ThisTask;
-    int NTask;
-    MPI_Comm_rank(comm, &ThisTask);
-    MPI_Comm_size(comm, &NTask);
-
-
-    int ndevices;
-    cudaGetDeviceCount(&ndevices);
-    cudaSetDevice(ThisTask % ndevices);
-    printf("Hello from rank %d/%d using GPU %d\n", ThisTask, NTask, ThisTask % ndevices);
-
-    // Logical transform size
-    size_t nx = NTask;      // any value >= NTask is OK
-    size_t ny = NTask;      // any value >= NTask is OK
-    size_t nz = 2 * NTask;  // need to be even and >= NTask
-
-    // We start with Slabs distributed along X (X-Slabs)
-    // Ranks 0 ... (nx % size - 1) own 1 more element in the X dimension
-    // All ranks own all element in the Y and Z dimension
-    // The Z dimension has to be padded to accomodate the (nz / 2 + 1) 
-    // complex numbers assuming an in-place data layout.
-    int ranks_with_onemore = nx % size;
-    size_t my_nx = (nx / size) + (rank < ranks_with_onemore ? 1 : 0);
-    size_t padded_nz = 2 * (nz / 2 + 1);
-
-    // // Local, distributed, data
-    // std::vector<float> data(my_nx * ny * padded_nz, 1.0);
-    // generate_random(data, rank);
-    // std::vector<float> ref = data;
-
-
-
-/********************************not sure if these are useful or not**************************************** */
-    ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
-    ptrdiff_t np[2];
-
-    int ThisTask;
-    int NTask;
-
-    pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
-    pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
-
-    MPI_Comm_rank(comm, &ThisTask);
-    MPI_Comm_size(comm, &NTask);
-
-    /* try to find a square 2d decomposition */
-    int i;
-    int k;
-    for(i = sqrt(NTask) + 1; i >= 0; i --) {
-        if(NTask % i == 0) break;
-    }
-    np[0] = i;
-    np[1] = NTask / i;
-
-message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
-
-// Step 1: Create 2D Cartesian grid for the processes
-int dims[2] = {np[0], np[1]};
-int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
-
-// Create 2D Cartesian communicator
-if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
-    endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
-}
-
-// Step 2: Get the Cartesian coordinates of the process in the grid
-int periods_unused[2];
-MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
-
-// Ensure that the task grid matches the expected number of processes
-if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
-    endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
-}
-
-// Step 3: Determine local FFT size (adapt this for cuFFTMp if necessary)
-// cuFFTMp might require manual management of the local data size
-// Example: You may need to calculate how much data each process holds based on grid decomposition
-
-pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d, 
-                                                pm->real_space_region.size, 
-                                                pm->real_space_region.offset, 
-                                                pm->fourier_space_region.size, 
-                                                pm->fourier_space_region.offset);
-
-    /*
-     * In fourier space, the transposed array is ordered in
-     * are in (y, z, x). The strides and sizes returned
-     * from local size is in (Nx, Ny, Nz), hence we roll them once
-     * so that the strides will give correct linear indexing for
-     * integer coordinates given in order of (y, z, x).
-     * */
-
-#define ROLL(a, N, j) { \
-    typeof(a[0]) tmp[N]; \
-    ptrdiff_t k; \
-    for(k = 0; k < N; k ++) tmp[k] = a[k]; \
-    for(k = 0; k < N; k ++) a[k] = tmp[(k + j)% N]; \
-    }
-
-    ROLL(pm->fourier_space_region.offset, 3, 1);
-    ROLL(pm->fourier_space_region.size, 3, 1);
-
-#undef ROLL
-
-    /* calculate the strides */
-    petapm_region_init_strides(&pm->real_space_region);
-    petapm_region_init_strides(&pm->fourier_space_region);
-
-
-/******************************** end unsure block **************************************** */
-
-    cudaStreamCreate(&pm->priv->stream);
-    cufftCreate(&pm->priv->plan_forw);
-    cufftCreate(&pm->priv->plan_back);
-
-    // Attach the MPI communicator to the plans
-    cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
-    cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
-
-    // Describe the data distribution (only for custumized data decomposition, not needed for default slab decomposition)
-    // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
-    // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
-    // So, in both, the "input" box should be the real box and the "output" box should be the complex box
-
-    // cufftXtSetDistribution(plan_r2c, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
-    // cufftXtSetDistribution(plan_c2r, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
-
-    // Set the stream
-    cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
-    cufftSetStream(pm->priv->plan_back, pm->priv->stream);
-
-    // Make the plan
-    size_t workspace;
-    cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
-    cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
-
-
-    // Allocate GPU memory, copy CPU data to GPU
-    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
-    cudaLibXtDesc *desc;
-    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_INPLACE);
-    // TODO: what to make of the cpu_data here?
-    cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
-
-    /* now lets fill up the mesh2task arrays */
-
-#if 0
-    message(1, "ThisTask = %d (%td %td %td) - (%td %td %td)\n", ThisTask,
-            pm->real_space_region.offset[0],
-            pm->real_space_region.offset[1],
-            pm->real_space_region.offset[2],
-            pm->real_space_region.size[0],
-            pm->real_space_region.size[1],
-            pm->real_space_region.size[2]);
-#endif
-
-    int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
-    for(k = 0; k < 2; k ++) {
-        for(i = 0; i < Nmesh; i ++) {
-            tmp[i] = 0;
-        }
-        for(i = 0; i < pm->real_space_region.size[k]; i ++) {
-            tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
-        }
-        /* which column / row hosts this tile? */
-        /* FIXME: this is very inefficient */
-        MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
-        /*
-        for(i = 0; i < Nmesh; i ++) {
-            message(0, "Mesh2Task[%d][%d] == %d\n", k, i, Mesh2Task[k][i]);
-        }
-        */
-    }
-    myfree(tmp);
-}
-
-void
-petapm_destroy(PetaPM * pm)
-{
-    cufftDestroy(pm->priv->plan_forw);
-    cufftDestroy(pm->priv->plan_back);
-    MPI_Comm_free(&pm->priv->comm_cart_2d);
-    myfree(pm->Mesh2Task[0]);
-}
-
-/*
- * read out field to particle i, with value no need to be thread safe
- * (particle i is never done by same thread)
- * */
-typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
-static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
-/* apply transfer function to value, kpos array is in x, y, z order */
-static void pm_apply_transfer_function(PetaPM * pm,
-        cufftComplex * src,
-        cufftComplex * dst, petapm_transfer_func H);
-
-static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-/*
- * 1. calls prepare to build the Regions covering particles
- * 2. CIC the particles
- * 3. Transform to rho_k
- * 4. apply global_transfer (if not NULL --
- *       this is the place to fill in gaussian seeds,
- *       the transfer is stacked onto all following transfers.
- * 5. for each transfer, readout in functions
- * 6.    apply transfer from global_transfer -> complex
- * 7.    transform to real
- * 8.    readout
- * 9. free regions
- * */
-
-PetaPMRegion *
-petapm_force_init(
-        PetaPM * pm,
-        petapm_prepare_func prepare,
-        PetaPMParticleStruct * pstruct,
-        int * Nregions,
-        void * userdata) {
-    CPS = pstruct;
-
-    *Nregions = 0;
-    PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
-    pm_init_regions(pm, regions, *Nregions);
-
-    pm_iterate(pm, put_particle_to_mesh, regions, *Nregions);
-
-    layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
-
-    walltime_measure("/PMgrav/init");
-    return regions;
-}
-
-static void pm_apply_transfer_function(PetaPM * pm,
-        cufftComplex * src,
-        cufftComplex * dst, petapm_transfer_func H
-        ){
-    size_t ip = 0;
-
-    PetaPMRegion * region = &pm->fourier_space_region;
-
-#pragma omp parallel for
-    for(ip = 0; ip < region->totalsize; ip ++) {
-        ptrdiff_t tmp = ip;
-        int pos[3];
-        int kpos[3];
-        int64_t k2 = 0.0;
-        int k;
-        for(k = 0; k < 3; k ++) {
-            pos[k] = tmp / region->strides[k];
-            tmp -= pos[k] * region->strides[k];
-            /* lets get the abs pos on the grid*/
-            pos[k] += region->offset[k];
-            /* check */
-            if(pos[k] >= pm->Nmesh) {
-                endrun(1, "position didn't make sense\n");
-            }
-            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
-            /* Watch out the cast */
-            k2 += ((int64_t)kpos[k]) * kpos[k];
-        }
-        /* swap 0 and 1 because fourier space was transposed */
-        /* kpos is y, z, x */
-        pos[0] = kpos[2];
-        pos[1] = kpos[0];
-        pos[2] = kpos[1];
-        dst[ip][0] = src[ip][0];
-        dst[ip][1] = src[ip][1];
-        if(H) {
-            H(pm, k2, pos, &dst[ip]);
-        }
-    }
-
-}
-
-cufftComplex * petapm_force_r2c(PetaPM * pm,
-        PetaPMGlobalFunctions * global_functions
-        ) {
-    /* call pfft rho_k is CFT of rho */
-
-    /* this is because
-     *
-     * CFT = DFT * dx **3
-     * CFT[rho] = DFT [rho * dx **3] = DFT[CIC]
-     * */
-    double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-    memset(real, 0, sizeof(double) * pm->priv->fftsize);
-    layout_build_and_exchange_cells_to_pfft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
-    walltime_measure("/PMgrav/comm2");
-
-#ifdef DEBUG
-    verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
-    walltime_measure("/PMgrav/Verify");
-#endif
-
-    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-    pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
-    myfree(real);
-
-    cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
-
-    /*Do any analysis that may be required before the transfer function is applied*/
-    petapm_transfer_func global_readout = global_functions->global_readout;
-    if(global_readout)
-        pm_apply_transfer_function(pm, complx, rho_k, global_readout);
-    if(global_functions->global_analysis)
-        global_functions->global_analysis(pm);
-    /*Apply the transfer function*/
-    petapm_transfer_func global_transfer = global_functions->global_transfer;
-    pm_apply_transfer_function(pm, complx, rho_k, global_transfer);
-    walltime_measure("/PMgrav/r2c");
-
-    myfree(complx);
-    return rho_k;
-}
-
-void
-petapm_force_c2r(PetaPM * pm,
-        cufftComplex * rho_k,
-        PetaPMRegion * regions,
-        const int Nregions,
-        PetaPMFunctions * functions)
-{
-
-    PetaPMFunctions * f = functions;
-    for (f = functions; f->name; f ++) {
-        petapm_transfer_func transfer = f->transfer;
-        petapm_readout_func readout = f->readout;
-
-        cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-        /* apply the greens function turn rho_k into potential in fourier space */
-        pm_apply_transfer_function(pm, rho_k, complx, transfer);
-        walltime_measure("/PMgrav/calc");
-
-        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-        pfft_execute_dft_c2r(pm->priv->plan_back, complx, real);
-
-        walltime_measure("/PMgrav/c2r");
-        if(f == functions) // Once
-            report_memory_usage("PetaPM");
-        myfree(complx);
-        /* read out the potential: this will copy and free real.*/
-        layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real);
-        walltime_measure("/PMgrav/comm");
-
-        pm_iterate(pm, readout, regions, Nregions);
-        walltime_measure("/PMgrav/readout");
-    }
-}
-
-void petapm_force_finish(PetaPM * pm) {
-    layout_finish(&pm->priv->layout);
-    myfree(pm->priv->meshbuf);
-}
-
-void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
-        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
-        PetaPMFunctions * functions,
-        PetaPMParticleStruct * pstruct,
-        void * userdata) {
-    int Nregions;
-    PetaPMRegion * regions = petapm_force_init(pm, prepare, pstruct, &Nregions, userdata);
-    cufftComplex * rho_k = petapm_force_r2c(pm, global_functions);
-    if(functions)
-        petapm_force_c2r(pm, rho_k, regions, Nregions, functions);
-    myfree(rho_k);
-    if(CPS->RegionInd)
-        myfree(CPS->RegionInd);
-    myfree(regions);
-    petapm_force_finish(pm);
-}
-
-/******************************************************************************************************************************************** */
-/* build a communication layout */
-
-static void layout_build_pencils(PetaPM * pm, struct Layout * L, double * meshbuf, PetaPMRegion * regions, const int Nregions);
-static void layout_exchange_pencils(struct Layout * L);
-static void
-layout_prepare (PetaPM * pm,
-                struct Layout * L,
-                double * meshbuf,
-                PetaPMRegion * regions,
-                const int Nregions,
-                MPI_Comm comm)
-{
-    int r;
-    int i;
-    int NTask;
-    L->comm = comm;
-
-    MPI_Comm_size(L->comm, &NTask);
-
-    L->ibuffer = (int *) mymalloc("PMlayout", sizeof(int) * NTask * 8);
-
-    memset(L->ibuffer, 0, sizeof(int) * NTask * 8);
-    L->NpSend = &L->ibuffer[NTask * 0];
-    L->NpRecv = &L->ibuffer[NTask * 1];
-    L->NcSend = &L->ibuffer[NTask * 2];
-    L->NcRecv = &L->ibuffer[NTask * 3];
-    L->DcSend = &L->ibuffer[NTask * 4];
-    L->DcRecv = &L->ibuffer[NTask * 5];
-    L->DpSend = &L->ibuffer[NTask * 6];
-    L->DpRecv = &L->ibuffer[NTask * 7];
-
-    L->NpExport = 0;
-    L->NcExport = 0;
-    L->NpImport = 0;
-    L->NcImport = 0;
-
-    int NpAlloc = 0;
-    /* count pencils until buffer would run out */
-    for (r = 0; r < Nregions; r ++) {
-        NpAlloc += regions[r].size[0] * regions[r].size[1];
-    }
-
-    L->PencilSend = (struct Pencil *) mymalloc("PencilSend", NpAlloc * sizeof(struct Pencil));
-
-    layout_build_pencils(pm, L, meshbuf, regions, Nregions);
-
-    /* sort the pencils by the target rank for ease of next step */
-    qsort_openmp(L->PencilSend, NpAlloc, sizeof(struct Pencil), pencil_cmp_target);
-    /* zero length pixels are moved to the tail */
-
-    /* now shrink NpExport*/
-    L->NpExport = NpAlloc;
-    while(L->NpExport > 0 && L->PencilSend[L->NpExport - 1].len == 0) {
-        L->NpExport --;
-    }
-
-    /* count total number of cells to be exported */
-    int NcExport = 0;
-    for(i = 0; i < L->NpExport; i++) {
-        int task = L->PencilSend[i].task;
-        L->NcSend[task] += L->PencilSend[i].len;
-        NcExport += L->PencilSend[i].len;
-        L->NpSend[task] ++;
-    }
-    L->NcExport = NcExport;
-
-    MPI_Alltoall(L->NpSend, 1, MPI_INT, L->NpRecv, 1, MPI_INT, L->comm);
-    MPI_Alltoall(L->NcSend, 1, MPI_INT, L->NcRecv, 1, MPI_INT, L->comm);
-
-    /* build the displacement array; why doesn't MPI build these automatically? */
-    L->DpSend[0] = 0; L->DpRecv[0] = 0;
-    L->DcSend[0] = 0; L->DcRecv[0] = 0;
-    for(i = 1; i < NTask; i ++) {
-        L->DpSend[i] = L->NpSend[i - 1] + L->DpSend[i - 1];
-        L->DpRecv[i] = L->NpRecv[i - 1] + L->DpRecv[i - 1];
-        L->DcSend[i] = L->NcSend[i - 1] + L->DcSend[i - 1];
-        L->DcRecv[i] = L->NcRecv[i - 1] + L->DcRecv[i - 1];
-    }
-    L->NpImport = L->DpRecv[NTask -1] + L->NpRecv[NTask -1];
-    L->NcImport = L->DcRecv[NTask -1] + L->NcRecv[NTask -1];
-
-    /* some checks */
-    if(L->DpSend[NTask - 1] + L->NpSend[NTask -1] != L->NpExport) {
-        endrun(1, "NpExport = %d NpSend=%d DpSend=%d\n", L->NpExport, L->NpSend[NTask -1], L->DpSend[NTask - 1]);
-    }
-    if(L->DcSend[NTask - 1] + L->NcSend[NTask -1] != L->NcExport) {
-        endrun(1, "NcExport = %d NcSend=%d DcSend=%d\n", L->NcExport, L->NcSend[NTask -1], L->DcSend[NTask - 1]);
-    }
-    int64_t totNpAlloc = reduce_int64(NpAlloc, L->comm);
-    int64_t totNpExport = reduce_int64(L->NpExport, L->comm);
-    int64_t totNcExport = reduce_int64(L->NcExport, L->comm);
-    int64_t totNpImport = reduce_int64(L->NpImport, L->comm);
-    int64_t totNcImport = reduce_int64(L->NcImport, L->comm);
-
-    if(totNpExport != totNpImport) {
-        endrun(1, "totNpExport = %ld\n", totNpExport);
-    }
-    if(totNcExport != totNcImport) {
-        endrun(1, "totNcExport = %ld\n", totNcExport);
-    }
-
-    /* exchange the pencils */
-    message(0, "PetaPM:  %010ld/%010ld Pencils and %010ld Cells\n", totNpExport, totNpAlloc, totNcExport);
-    L->PencilRecv = (struct Pencil *) mymalloc("PencilRecv", L->NpImport * sizeof(struct Pencil));
-    memset(L->PencilRecv, 0xfc, L->NpImport * sizeof(struct Pencil));
-    layout_exchange_pencils(L);
-}
-
-static void
-layout_build_pencils(PetaPM * pm,
-                     struct Layout * L,
-                     double * meshbuf,
-                     PetaPMRegion * regions,
-                     const int Nregions)
-{
-    /* now build pencils to be exported */
-    int p0 = 0;
-    int r;
-    for (r = 0; r < Nregions; r++) {
-        int ix;
-#pragma omp parallel for private(ix)
-        for(ix = 0; ix < regions[r].size[0]; ix++) {
-            int iy;
-            for(iy = 0; iy < regions[r].size[1]; iy++) {
-                int poffset = ix * regions[r].size[1] + iy;
-                struct Pencil * p = &L->PencilSend[p0 + poffset];
-
-                p->offset[0] = ix + regions[r].offset[0];
-                p->offset[1] = iy + regions[r].offset[1];
-                p->offset[2] = regions[r].offset[2];
-                p->len = regions[r].size[2];
-                p->meshbuf_first = (regions[r].buffer - meshbuf) +
-                    regions[r].strides[0] * ix +
-                    regions[r].strides[1] * iy;
-                /* now lets compress the pencil */
-                while((p->len > 0) && (meshbuf[p->meshbuf_first + p->len - 1] == 0.0)) {
-                    p->len --;
-                }
-                while((p->len > 0) && (meshbuf[p->meshbuf_first] == 0.0)) {
-                    p->len --;
-                    p->meshbuf_first++;
-                    p->offset[2] ++;
-                }
-
-                p->task = pos_get_target(pm, p->offset);
-            }
-        }
-        p0 += regions[r].size[0] * regions[r].size[1];
-    }
-
-}
-
-static void layout_exchange_pencils(struct Layout * L) {
-    int i;
-    int offset;
-    int NTask;
-    MPI_Comm_size(L->comm, &NTask);
-    /* build the first pointers to refer to the correct relative buffer locations */
-    /* note that the buffer hasn't bee assembled yet */
-    offset = 0;
-    for(i = 0; i < NTask; i ++) {
-        int j;
-        struct Pencil * p = &L->PencilSend[offset];
-        if(L->NpSend[i] == 0) continue;
-        p->first = 0;
-        for(j = 1; j < L->NpSend[i]; j++) {
-            p[j].first = p[j - 1].first + p[j - 1].len;
-        }
-        offset += L->NpSend[i];
-    }
-
-    MPI_Alltoallv(
-            L->PencilSend, L->NpSend, L->DpSend, MPI_PENCIL,
-            L->PencilRecv, L->NpRecv, L->DpRecv, MPI_PENCIL,
-            L->comm);
-
-    /* set first to point to absolute position in the full import cell buffer */
-    offset = 0;
-    for(i = 0; i < NTask; i ++) {
-        struct Pencil * p = &L->PencilRecv[offset];
-        int j;
-        for(j = 0; j < L->NpRecv[i]; j++) {
-            p[j].first += L->DcRecv[i];
-        }
-        offset += L->NpRecv[i];
-    }
-
-    /* set first to point to absolute position in the full export cell buffer */
-    offset = 0;
-    for(i = 0; i < NTask; i ++) {
-        struct Pencil * p = &L->PencilSend[offset];
-        int j;
-        for(j = 0; j < L->NpSend[i]; j++) {
-            p[j].first += L->DcSend[i];
-        }
-        offset += L->NpSend[i];
-    }
-}
-
-static void layout_finish(struct Layout * L) {
-    myfree(L->PencilRecv);
-    myfree(L->PencilSend);
-    myfree(L->ibuffer);
-}
-
-/* exchange cells to their pfft host, then reduce the cells to the pfft
- * array */
-static void to_pfft(double * cell, double * buf) {
-#pragma omp atomic update
-            cell[0] += buf[0];
-}
-
-static void
-layout_build_and_exchange_cells_to_pfft(
-        PetaPM * pm,
-        struct Layout * L,
-        double * meshbuf,
-        double * real)
-{
-    L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
-    L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
-
-    int i;
-    int offset;
-
-    /* collect all cells into the send buffer */
-    offset = 0;
-    for(i = 0; i < L->NpExport; i ++) {
-        struct Pencil * p = &L->PencilSend[i];
-        memcpy(L->BufSend + offset, &meshbuf[p->meshbuf_first],
-                sizeof(double) * p->len);
-        offset += p->len;
-    }
-
-    /* receive cells */
-    MPI_Alltoallv(
-            L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
-            L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
-            L->comm);
-
-#if 0
-    double massExport = 0;
-    for(i = 0; i < L->NcExport; i ++) {
-        massExport += L->BufSend[i];
-    }
-
-    double massImport = 0;
-    for(i = 0; i < L->NcImport; i ++) {
-        massImport += L->BufRecv[i];
-    }
-    double totmassExport;
-    double totmassImport;
-    MPI_Allreduce(&massExport, &totmassExport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
-    MPI_Allreduce(&massImport, &totmassImport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
-    message(0, "totmassExport = %g totmassImport = %g\n", totmassExport, totmassImport);
-#endif
-
-    layout_iterate_cells(pm, L, to_pfft, real);
-    myfree(L->BufRecv);
-    myfree(L->BufSend);
-}
-
-/* readout cells on their pfft host, then exchange the cells to the domain
- * host */
-static void to_region(double * cell, double * region) {
-    *region = *cell;
-}
-
-static void
-layout_build_and_exchange_cells_to_local(
-        PetaPM * pm,
-        struct Layout * L,
-        double * meshbuf,
-        double * real)
-{
-    L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
-    int i;
-    int offset;
-
-    /*layout_iterate_cells transfers real to L->BufRecv*/
-    layout_iterate_cells(pm, L, to_region, real);
-
-    /*Real is done now: reuse the memory for BufSend*/
-    myfree(real);
-    /*Now allocate BufSend, which is confusingly used to receive data*/
-    L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
-
-    /* exchange cells */
-    /* notice the order is reversed from to_pfft */
-    MPI_Alltoallv(
-            L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
-            L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
-            L->comm);
-
-    /* distribute BufSend to meshbuf */
-    offset = 0;
-    for(i = 0; i < L->NpExport; i ++) {
-        struct Pencil * p = &L->PencilSend[i];
-        memcpy(&meshbuf[p->meshbuf_first],
-                L->BufSend + offset,
-                sizeof(double) * p->len);
-        offset += p->len;
-    }
-    myfree(L->BufSend);
-    myfree(L->BufRecv);
-}
-
-/* iterate over the pairs of real field cells and RecvBuf cells
- *
- * !!! iter has to be thread safe. !!!
- * */
-static void
-layout_iterate_cells(PetaPM * pm,
-                     struct Layout * L,
-                     cell_iterator iter,
-                     double * real)
-{
-    int i;
-#pragma omp parallel for
-    for(i = 0; i < L->NpImport; i ++) {
-        struct Pencil * p = &L->PencilRecv[i];
-        int k;
-        ptrdiff_t linear0 = 0;
-        for(k = 0; k < 2; k ++) {
-            int ix = p->offset[k];
-            while(ix < 0) ix += pm->Nmesh;
-            while(ix >= pm->Nmesh) ix -= pm->Nmesh;
-            ix -= pm->real_space_region.offset[k];
-            if(ix >= pm->real_space_region.size[k]) {
-                /* serious problem assumption about pfft layout was wrong*/
-                endrun(1, "bad pfft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
-            }
-            linear0 += ix * pm->real_space_region.strides[k];
-        }
-        int j;
-        for(j = 0; j < p->len; j ++) {
-            int iz = p->offset[2] + j;
-            while(iz < 0) iz += pm->Nmesh;
-            while(iz >= pm->Nmesh) iz -= pm->Nmesh;
-            if(iz >= pm->real_space_region.size[2]) {
-                /* serious problem assmpution about pfft layout was wrong*/
-                endrun(1, "bad pfft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
-            }
-            ptrdiff_t linear = iz * pm->real_space_region.strides[2] + linear0;
-            /*
-             * operate on the pencil, either modifying real or BufRecv
-             * */
-            iter(&real[linear], &L->BufRecv[p->first + j]);
-        }
-    }
-}
-
-static void
-pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions)
-{
-    if(regions) {
-        int i;
-        size_t size = 0;
-        for(i = 0 ; i < Nregions; i ++) {
-            size += regions[i].totalsize;
-        }
-        pm->priv->meshbufsize = size;
-        if ( size == 0 ) return;
-        pm->priv->meshbuf = (double *) mymalloc("PMmesh", size * sizeof(double));
-        /* this takes care of the padding */
-        memset(pm->priv->meshbuf, 0, size * sizeof(double));
-        size = 0;
-        for(i = 0 ; i < Nregions; i ++) {
-            regions[i].buffer = pm->priv->meshbuf + size;
-            size += regions[i].totalsize;
-        }
-    }
-}
-
-
-static void
-pm_iterate_one(PetaPM * pm,
-               int i,
-               pm_iterator iterator,
-               PetaPMRegion * regions,
-               const int Nregions)
-{
-    int k;
-    int iCell[3];  /* integer coordinate on the regional mesh */
-    double Res[3]; /* residual*/
-    double * Pos = POS(i);
-    const int RegionInd = CPS->RegionInd ? CPS->RegionInd[i] : 0;
-
-    /* Asserts that the swallowed particles are not considered (region -2).*/
-    if(RegionInd < 0)
-        return;
-    /* This should never happen: it is pure paranoia and to avoid icc being crazy*/
-    if(RegionInd >= Nregions)
-        endrun(1, "Particle %d has region %d out of bounds %d\n", i, RegionInd, Nregions);
-
-    PetaPMRegion * region = &regions[RegionInd];
-    for(k = 0; k < 3; k++) {
-        double tmp = Pos[k] / pm->CellSize;
-        iCell[k] = floor(tmp);
-        Res[k] = tmp - iCell[k];
-        iCell[k] -= region->offset[k];
-        /* seriously?! particles are supposed to be contained in cells */
-        if(iCell[k] >= region->size[k] - 1 || iCell[k] < 0) {
-            endrun(1, "particle out of cell better stop %d (k=%d) %g %g %g region: %td %td\n", iCell[k],k,
-                Pos[0], Pos[1], Pos[2],
-                region->offset[k], region->size[k]);
-        }
-    }
-
-    int connection;
-    for(connection = 0; connection < 8; connection++) {
-        double weight = 1.0;
-        size_t linear = 0;
-        for(k = 0; k < 3; k++) {
-            int offset = (connection >> k) & 1;
-            int tmp = iCell[k] + offset;
-            linear += tmp * region->strides[k];
-            weight *= offset?
-                /* offset == 1*/ (Res[k])    :
-                /* offset == 0*/ (1 - Res[k]);
-        }
-        if(linear >= region->totalsize) {
-            endrun(1, "particle linear index out of cell better stop\n");
-        }
-        iterator(pm, i, &region->buffer[linear], weight);
-    }
-}
-
-/*
- * iterate over all particle / mesh pairs, call iterator
- * function . iterator function shall be aware of thread safety.
- * no threads run on same particle same time but may
- * access one mesh points same time.
- * */
-static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions) {
-    int i;
-#pragma omp parallel for
-    for(i = 0; i < CPS->NumPart; i ++) {
-        pm_iterate_one(pm, i, iterator, regions, Nregions);
-    }
-}
-
-void petapm_region_init_strides(PetaPMRegion * region) {
-    int k;
-    size_t rt = 1;
-    for(k = 2; k >= 0; k --) {
-        region->strides[k] = rt;
-        rt = region->size[k] * rt;
-    }
-    region->totalsize = rt;
-    region->buffer = NULL;
-}
-
-static int pos_get_target(PetaPM * pm, const int pos[2]) {
-    int k;
-    int task2d[2];
-    int rank;
-    for(k = 0; k < 2; k ++) {
-        int ix = pos[k];
-        while(ix < 0) ix += pm->Nmesh;
-        while(ix >= pm->Nmesh) ix -= pm->Nmesh;
-        task2d[k] = pm->Mesh2Task[k][ix];
-    }
-    MPI_Cart_rank(pm->priv->comm_cart_2d, task2d, &rank);
-    return rank;
-}
-static int pencil_cmp_target(const void * v1, const void * v2) {
-    const struct Pencil * p1 = (const struct Pencil *) v1;
-    const struct Pencil * p2 = (const struct Pencil *) v2;
-    /* move zero length pixels to the end */
-    if(p2->len == 0) return -1;
-    if(p1->len == 0) return 1;
-    int t1 = p1->task;
-    int t2 = p2->task;
-    return ((t2 < t1) - (t1 < t2)) * 2 +
-        ((p2->meshbuf_first < p1->meshbuf_first) - (p1->meshbuf_first < p2->meshbuf_first));
-}
-
-#ifdef DEBUG
-static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize) {
-    /* verify the density field */
-    double mass_Part = 0;
-    int j;
-#pragma omp parallel for reduction(+: mass_Part)
-    for(j = 0; j < CPS->NumPart; j ++) {
-        double Mass = *MASS(j);
-        mass_Part += Mass;
-    }
-    double totmass_Part = 0;
-    MPI_Allreduce(&mass_Part, &totmass_Part, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
-
-    double mass_Region = 0;
-    size_t i;
-
-#pragma omp parallel for reduction(+: mass_Region)
-    for(i = 0; i < meshsize; i ++) {
-        mass_Region += meshbuf[i];
-    }
-    double totmass_Region = 0;
-    MPI_Allreduce(&mass_Region, &totmass_Region, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
-    double mass_CIC = 0;
-#pragma omp parallel for reduction(+: mass_CIC)
-    for(i = 0; i < pm->real_space_region.totalsize; i ++) {
-        mass_CIC += real[i];
-    }
-    double totmass_CIC = 0;
-    MPI_Allreduce(&mass_CIC, &totmass_CIC, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
-
-    message(0, "total Region mass err = %g CIC mass err = %g Particle mass = %g\n", totmass_Region / totmass_Part - 1, totmass_CIC / totmass_Part - 1, totmass_Part);
-}
-#endif
-
-
-
-
-/**************
- * functions iterating over particle / mesh pairs
- ***************/
-static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
-    double Mass = *MASS(i);
-    if(INACTIVE(i))
-        return;
-#pragma omp atomic update
-    mesh[0] += weight * Mass;
-}
-static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
-    int64_t result = 0;
-    MPI_Allreduce(&input, &result, 1, MPI_INT64, MPI_SUM, comm);
-    return result;
-}
-
-/** Some FFT notes
- *
- *
- * CFT = dx * iDFT (thus CFT has no 2pi factors and iCFT has,
- *           same as wikipedia.)
- *
- * iCFT = dk * DFT
- * iCFT(CFG) = dx * dk * DFT(iDFT)
- *           = L / N * (2pi / L) * N
- *           = 2 pi
- * agreed with the usual def that
- * iCFT(CFT) = 2pi
- *
- * **************************8*/
diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index fbd6865e..90235718 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -19,7 +19,7 @@ layout_prepare(PetaPM * pm,
                const int Nregions,
                MPI_Comm comm);
 static void layout_finish(struct Layout * L);
-static void layout_build_and_exchange_cells_to_pfft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
+static void layout_build_and_exchange_cells_to_fft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
 static void layout_build_and_exchange_cells_to_local(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
 
 /* cell_iterator needs to be thread safe !*/
@@ -62,13 +62,6 @@ static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access
 #define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
 #define INACTIVE(i) (CPS->active && !CPS->active(i))
 
-/* (jdavies) reion defs */
-#define TYPE(i) ((int*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS_R->offset_type]))
-#define PI(i) ((int*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS_R->offset_pi]))
-/* NOTE: These are 'myfloat' types */
-#define FESC(i) ((double*) (&((char*)CPS_R->Starslot)[CPS_R->star_elsize * *PI(i) + CPS_R->offset_fesc]))
-#define FESCSPH(i) ((double*) (&((char*)CPS_R->Sphslot)[CPS_R->sph_elsize * *PI(i) + CPS_R->offset_fesc_sph]))
-#define SFR(i) ((double*)  (&((char*)CPS_R->Sphslot)[CPS_R->sph_elsize * *PI(i) + CPS_R->offset_sfr]))
 
 PetaPMRegion * petapm_get_fourier_region(PetaPM * pm) {
     return &pm->fourier_space_region;
@@ -98,12 +91,11 @@ petapm_module_init(int Nthreads)
     #ifdef _OPENMP
     omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
     #endif
-
     // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
 
-    // Initialize the MPI Datatype for the Pencil structure
-    MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
-    MPI_Type_commit(&MPI_PENCIL);
+    // get rid of pencil type
+    //MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
+    //MPI_Type_commit(&MPI_PENCIL);
 }
 
 void
@@ -117,6 +109,40 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     pm->CellSize = BoxSize / Nmesh;
     pm->comm = comm;
 
+
+    int ThisTask;
+    int NTask;
+    MPI_Comm_rank(comm, &ThisTask);
+    MPI_Comm_size(comm, &NTask);
+
+
+    int ndevices;
+    cudaGetDeviceCount(&ndevices);
+    cudaSetDevice(ThisTask % ndevices);
+    printf("Hello from rank %d/%d using GPU %d\n", ThisTask, NTask, ThisTask % ndevices);
+
+    // Logical transform size
+    size_t nx = NTask;      // any value >= NTask is OK
+    size_t ny = NTask;      // any value >= NTask is OK
+    size_t nz = 2 * NTask;  // need to be even and >= NTask
+
+    // We start with Slabs distributed along X (X-Slabs)
+    // Ranks 0 ... (nx % size - 1) own 1 more element in the X dimension
+    // All ranks own all element in the Y and Z dimension
+    // The Z dimension has to be padded to accomodate the (nz / 2 + 1) 
+    // complex numbers assuming an in-place data layout.
+    int ranks_with_onemore = nx % size;
+    size_t my_nx = (nx / size) + (rank < ranks_with_onemore ? 1 : 0);
+    size_t padded_nz = 2 * (nz / 2 + 1);
+
+    // // Local, distributed, data
+    // std::vector<float> data(my_nx * ny * padded_nz, 1.0);
+    // generate_random(data, rank);
+    // std::vector<float> ref = data;
+
+
+
+/********************************not sure if these are useful or not**************************************** */
     ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
     ptrdiff_t np[2];
 
@@ -192,22 +218,41 @@ pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d,
     petapm_region_init_strides(&pm->real_space_region);
     petapm_region_init_strides(&pm->fourier_space_region);
 
-    /* planning the fft; need temporary arrays */
 
-    double * real = (double * ) mymalloc("PMreal", pm->priv->fftsize * sizeof(double));
-    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
-    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
+/******************************** end unsure block **************************************** */
 
-    pm->priv->plan_forw = pfft_plan_dft_r2c_3d(
-        n, real, rho_k, pm->priv->comm_cart_2d, PFFT_FORWARD,
-        PFFT_TRANSPOSED_OUT | PFFT_ESTIMATE | PFFT_TUNE | PFFT_DESTROY_INPUT);
-    pm->priv->plan_back = pfft_plan_dft_c2r_3d(
-        n, complx, real, pm->priv->comm_cart_2d, PFFT_BACKWARD,
-        PFFT_TRANSPOSED_IN | PFFT_ESTIMATE | PFFT_TUNE | PFFT_DESTROY_INPUT);
+    cudaStreamCreate(&pm->priv->stream);
+    cufftCreate(&pm->priv->plan_forw);
+    cufftCreate(&pm->priv->plan_back);
 
-    myfree(complx);
-    myfree(rho_k);
-    myfree(real);
+    // Attach the MPI communicator to the plans
+    cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
+    cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
+
+    // Describe the data distribution (only for custumized data decomposition, not needed for default slab decomposition)
+    // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
+    // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
+    // So, in both, the "input" box should be the real box and the "output" box should be the complex box
+
+    // cufftXtSetDistribution(plan_r2c, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
+    // cufftXtSetDistribution(plan_c2r, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
+
+    // Set the stream
+    cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
+    cufftSetStream(pm->priv->plan_back, pm->priv->stream);
+
+    // Make the plan
+    size_t workspace;
+    cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
+    cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
+
+
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
+    cudaLibXtDesc *desc;
+    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_INPLACE);
+    // TODO: what to make of the cpu_data here?
+    cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
 
     /* now lets fill up the mesh2task arrays */
 
@@ -244,8 +289,8 @@ pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d,
 void
 petapm_destroy(PetaPM * pm)
 {
-    pfft_destroy_plan(pm->priv->plan_forw);
-    pfft_destroy_plan(pm->priv->plan_back);
+    cufftDestroy(pm->priv->plan_forw);
+    cufftDestroy(pm->priv->plan_back);
     MPI_Comm_free(&pm->priv->comm_cart_2d);
     myfree(pm->Mesh2Task[0]);
 }
@@ -262,9 +307,6 @@ static void pm_apply_transfer_function(PetaPM * pm,
         cufftComplex * dst, petapm_transfer_func H);
 
 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-static void put_star_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-static void put_sfr_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
-
 /*
  * 1. calls prepare to build the Regions covering particles
  * 2. CIC the particles
@@ -300,6 +342,48 @@ petapm_force_init(
     return regions;
 }
 
+static void pm_apply_transfer_function(PetaPM * pm,
+        cufftComplex * src,
+        cufftComplex * dst, petapm_transfer_func H
+        ){
+    size_t ip = 0;
+
+    PetaPMRegion * region = &pm->fourier_space_region;
+
+#pragma omp parallel for
+    for(ip = 0; ip < region->totalsize; ip ++) {
+        ptrdiff_t tmp = ip;
+        int pos[3];
+        int kpos[3];
+        int64_t k2 = 0.0;
+        int k;
+        for(k = 0; k < 3; k ++) {
+            pos[k] = tmp / region->strides[k];
+            tmp -= pos[k] * region->strides[k];
+            /* lets get the abs pos on the grid*/
+            pos[k] += region->offset[k];
+            /* check */
+            if(pos[k] >= pm->Nmesh) {
+                endrun(1, "position didn't make sense\n");
+            }
+            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
+            /* Watch out the cast */
+            k2 += ((int64_t)kpos[k]) * kpos[k];
+        }
+        /* swap 0 and 1 because fourier space was transposed */
+        /* kpos is y, z, x */
+        pos[0] = kpos[2];
+        pos[1] = kpos[0];
+        pos[2] = kpos[1];
+        dst[ip][0] = src[ip][0];
+        dst[ip][1] = src[ip][1];
+        if(H) {
+            H(pm, k2, pos, &dst[ip]);
+        }
+    }
+
+}
+
 cufftComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ) {
@@ -312,7 +396,7 @@ cufftComplex * petapm_force_r2c(PetaPM * pm,
      * */
     double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
     memset(real, 0, sizeof(double) * pm->priv->fftsize);
-    layout_build_and_exchange_cells_to_pfft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+    layout_build_and_exchange_cells_to_fft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
     walltime_measure("/PMgrav/comm2");
 
 #ifdef DEBUG
@@ -397,205 +481,7 @@ void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
     petapm_force_finish(pm);
 }
 
-/* These functions are for the excursion set reionization module*/
-
-/* initialise one set of regions with custom iterator
- * this is the same as petapm_force_init with a custom iterator
- * (and no CPS definition since it's called multiple times)*/
-PetaPMRegion *
-petapm_reion_init(
-        PetaPM * pm,
-        petapm_prepare_func prepare,
-        pm_iterator iterator,
-        PetaPMParticleStruct * pstruct,
-        int * Nregions,
-        void * userdata) {
-
-    *Nregions = 0;
-    PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
-    pm_init_regions(pm, regions, *Nregions);
-
-    walltime_measure("/PMreion/Misc");
-    pm_iterate(pm, iterator, regions, *Nregions);
-    walltime_measure("/PMreion/cic");
-
-    layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
-
-    walltime_measure("/PMreion/comm");
-    return regions;
-}
-
-/* 30Mpc to 0.5 Mpc with a delta of 1.1 is ~50 iterations, this should be more than enough*/
-#define MAX_R_ITERATIONS 10000
-
-/* differences from force c2r (why I think I need this separate)
- * radius loop (could do this with long list of same function + global R)
- * I'm pretty sure I need a third function type (reion loop) with all three grids
- * ,after c2r but iteration over the grid, instead of particles */
-void
-petapm_reion_c2r(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        cufftComplex * mass_unfiltered, cufftComplex * star_unfiltered, cufftComplex * sfr_unfiltered,
-        PetaPMRegion * regions,
-        const int Nregions,
-        PetaPMFunctions * functions,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr)
-{
-    PetaPMFunctions * f = functions;
-    double R = fmin(R_max,pm_mass->BoxSize);
-    int last_step = 0;
-    int f_count = 0;
-    petapm_readout_func readout = f->readout;
-
-    /* TODO: seriously re-think the allocation ordering in this function */
-    double * mass_real = (double * ) mymalloc2("mass_real", pm_mass->priv->fftsize * sizeof(double));
-
-    //TODO: add CellLengthFactor for lowres (>1Mpc, see old find_HII_bubbles function)
-    while(!last_step) {
-        f_count++;
-        //The last step will be unfiltered
-        if(R/R_delta < R_min || R/R_delta < (pm_mass->CellSize) || f_count > MAX_R_ITERATIONS)
-        {
-            last_step = 1;
-            R = pm_mass->CellSize;
-        }
-
-        //NOTE: The PetaPM structs for reionisation use the G variable for filter radius in order to use
-        //the transfer functions correctly
-        pm_mass->G = R;
-        pm_star->G = R;
-        if(use_sfr)pm_sfr->G = R;
-
-        //TODO: maybe allocate and free these outside the loop
-        cufftComplex * mass_filtered = (cufftComplex *) mymalloc("mass_filtered", pm_mass->priv->fftsize * sizeof(double));
-        cufftComplex * star_filtered = (cufftComplex *) mymalloc("star_filtered", pm_star->priv->fftsize * sizeof(double));
-        cufftComplex * sfr_filtered;
-        if(use_sfr){
-            sfr_filtered = (cufftComplex *) mymalloc("sfr_filtered", pm_sfr->priv->fftsize * sizeof(double));
-        }
-
-        /* apply the filtering at this radius */
-        /*We want the last step to be unfiltered,
-         *  calling apply transfer with NULL should just copy the grids */
-
-        petapm_transfer_func transfer = last_step ? NULL : f->transfer;
-
-        pm_apply_transfer_function(pm_mass, mass_unfiltered, mass_filtered, transfer);
-        pm_apply_transfer_function(pm_star, star_unfiltered, star_filtered, transfer);
-        if(use_sfr){
-            pm_apply_transfer_function(pm_sfr, sfr_unfiltered, sfr_filtered, transfer);
-        }
-        walltime_measure("/PMreion/calc");
-
-        double * star_real = (double * ) mymalloc2("star_real", pm_star->priv->fftsize * sizeof(double));
-        /* back to real space */
-        pfft_execute_dft_c2r(pm_mass->priv->plan_back, mass_filtered, mass_real);
-        pfft_execute_dft_c2r(pm_star->priv->plan_back, star_filtered, star_real);
-        double * sfr_real = NULL;
-        if(use_sfr){
-            sfr_real = (double * ) mymalloc2("sfr_real", pm_sfr->priv->fftsize * sizeof(double));
-            pfft_execute_dft_c2r(pm_sfr->priv->plan_back, sfr_filtered, sfr_real);
-            myfree(sfr_filtered);
-        }
-        walltime_measure("/PMreion/c2r");
-
-        myfree(star_filtered);
-        myfree(mass_filtered);
-
-        /* the reion loop calculates the J21 and stores it,
-         * for now the mass_real grid will be reused to hold J21
-         * on the last filtering step*/
-        reion_loop(pm_mass,pm_star,pm_sfr,mass_real,star_real,sfr_real,last_step);
-
-        /* since we don't need to readout star and sfr grids...*/
-        /* on the last step, the mass grid is populated with J21 and read out*/
-        if(sfr_real){
-            myfree(sfr_real);
-        }
-        myfree(star_real);
-
-        R = R / R_delta;
-    }
-    //J21 grid is exchanged to pm_mass buffer and freed
-    layout_build_and_exchange_cells_to_local(pm_mass, &pm_mass->priv->layout, pm_mass->priv->meshbuf, mass_real);
-    walltime_measure("/PMreion/comm");
-    //J21 read out to particles
-    pm_iterate(pm_mass, readout, regions, Nregions);
-    walltime_measure("/PMreion/readout");
-}
-
-/* We need a slightly different flow for reionisation, so I
- * will define these here instead of messing with the force functions.
- * The c2r function is the same, however we need a new function, reion_loop
- * to run over all three filtered grids, after the inverse transform.
- * The c2r function itself is also different since we need to apply the
- * transfer (filter) function on all three grids and run reion_loop before any readout.*/
-void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        petapm_prepare_func prepare,
-        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
-        PetaPMFunctions * functions,
-        PetaPMParticleStruct * pstruct,
-        PetaPMReionPartStruct * rstruct,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr,
-        void * userdata) {
-
-    //assigning CPS here due to three sets of regions
-    CPS = pstruct;
-    CPS_R = rstruct;
-
-    /* initialise regions for each grid
-     * NOTE: these regions should be identical except for the grid buffer */
-    int Nregions_mass, Nregions_star, Nregions_sfr;
-    PetaPMRegion * regions_mass = petapm_reion_init(pm_mass, prepare, put_particle_to_mesh, pstruct, &Nregions_mass, userdata);
-    PetaPMRegion * regions_star = petapm_reion_init(pm_star, prepare, put_star_to_mesh, pstruct, &Nregions_star, userdata);
-    PetaPMRegion * regions_sfr;
-    if(use_sfr){
-        regions_sfr = petapm_reion_init(pm_sfr, prepare, put_sfr_to_mesh, pstruct, &Nregions_sfr, userdata);
-    }
-
-    walltime_measure("/PMreion/comm2");
-
-    //using force r2c since this part can be done independently
-    cufftComplex * mass_unfiltered = petapm_force_r2c(pm_mass, global_functions);
-    cufftComplex * star_unfiltered = petapm_force_r2c(pm_star, global_functions);
-    cufftComplex * sfr_unfiltered = NULL;
-    if(use_sfr){
-        sfr_unfiltered = petapm_force_r2c(pm_sfr, global_functions);
-    }
-
-    //need custom reion_c2r to implement the 3 grid c2r and readout
-    //the readout is only performed on the mass grid so for now I only pass in regions/Nregions for mass
-    if(functions)
-        petapm_reion_c2r(pm_mass, pm_star, pm_sfr,
-               mass_unfiltered, star_unfiltered, sfr_unfiltered,
-               regions_mass, Nregions_mass, functions, reion_loop,
-               R_max, R_min, R_delta, use_sfr);
-
-    //free everything in the correct order
-    if(sfr_unfiltered){
-        myfree(sfr_unfiltered);
-    }
-    myfree(star_unfiltered);
-    myfree(mass_unfiltered);
-
-    if(CPS->RegionInd)
-        myfree(CPS->RegionInd);
-
-    if(use_sfr){
-        myfree(regions_sfr);
-    }
-    myfree(regions_star);
-    myfree(regions_mass);
-
-    if(use_sfr){
-        petapm_force_finish(pm_sfr);
-    }
-    petapm_force_finish(pm_star);
-    petapm_force_finish(pm_mass);
-}
-/* End excursion set reionization module*/
-
+/******************************************************************************************************************************************** */
 /* build a communication layout */
 
 static void layout_build_pencils(PetaPM * pm, struct Layout * L, double * meshbuf, PetaPMRegion * regions, const int Nregions);
@@ -801,15 +687,15 @@ static void layout_finish(struct Layout * L) {
     myfree(L->ibuffer);
 }
 
-/* exchange cells to their pfft host, then reduce the cells to the pfft
+/* exchange cells to their fft host, then reduce the cells to the fft
  * array */
-static void to_pfft(double * cell, double * buf) {
+static void to_fft(double * cell, double * buf) {
 #pragma omp atomic update
             cell[0] += buf[0];
 }
 
 static void
-layout_build_and_exchange_cells_to_pfft(
+layout_build_and_exchange_cells_to_fft(
         PetaPM * pm,
         struct Layout * L,
         double * meshbuf,
@@ -853,12 +739,12 @@ layout_build_and_exchange_cells_to_pfft(
     message(0, "totmassExport = %g totmassImport = %g\n", totmassExport, totmassImport);
 #endif
 
-    layout_iterate_cells(pm, L, to_pfft, real);
+    layout_iterate_cells(pm, L, to_fft, real);
     myfree(L->BufRecv);
     myfree(L->BufSend);
 }
 
-/* readout cells on their pfft host, then exchange the cells to the domain
+/* readout cells on their fft host, then exchange the cells to the domain
  * host */
 static void to_region(double * cell, double * region) {
     *region = *cell;
@@ -884,7 +770,7 @@ layout_build_and_exchange_cells_to_local(
     L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
 
     /* exchange cells */
-    /* notice the order is reversed from to_pfft */
+    /* notice the order is reversed from to_fft */
     MPI_Alltoallv(
             L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
             L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
@@ -925,8 +811,8 @@ layout_iterate_cells(PetaPM * pm,
             while(ix >= pm->Nmesh) ix -= pm->Nmesh;
             ix -= pm->real_space_region.offset[k];
             if(ix >= pm->real_space_region.size[k]) {
-                /* serious problem assumption about pfft layout was wrong*/
-                endrun(1, "bad pfft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
+                /* serious problem assumption about fft layout was wrong*/
+                endrun(1, "bad fft: original k: %d ix: %d, cur ix: %d, region: off %ld size %ld\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
             }
             linear0 += ix * pm->real_space_region.strides[k];
         }
@@ -936,8 +822,8 @@ layout_iterate_cells(PetaPM * pm,
             while(iz < 0) iz += pm->Nmesh;
             while(iz >= pm->Nmesh) iz -= pm->Nmesh;
             if(iz >= pm->real_space_region.size[2]) {
-                /* serious problem assmpution about pfft layout was wrong*/
-                endrun(1, "bad pfft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
+                /* serious problem assmpution about fft layout was wrong*/
+                endrun(1, "bad fft: original iz: %d, cur iz: %d, region: off %ld size %ld\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
             }
             ptrdiff_t linear = iz * pm->real_space_region.strides[2] + linear0;
             /*
@@ -1108,47 +994,7 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 }
 #endif
 
-static void pm_apply_transfer_function(PetaPM * pm,
-        cufftComplex * src,
-        cufftComplex * dst, petapm_transfer_func H
-        ){
-    size_t ip = 0;
-
-    PetaPMRegion * region = &pm->fourier_space_region;
-
-#pragma omp parallel for
-    for(ip = 0; ip < region->totalsize; ip ++) {
-        ptrdiff_t tmp = ip;
-        int pos[3];
-        int kpos[3];
-        int64_t k2 = 0.0;
-        int k;
-        for(k = 0; k < 3; k ++) {
-            pos[k] = tmp / region->strides[k];
-            tmp -= pos[k] * region->strides[k];
-            /* lets get the abs pos on the grid*/
-            pos[k] += region->offset[k];
-            /* check */
-            if(pos[k] >= pm->Nmesh) {
-                endrun(1, "position didn't make sense\n");
-            }
-            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
-            /* Watch out the cast */
-            k2 += ((int64_t)kpos[k]) * kpos[k];
-        }
-        /* swap 0 and 1 because fourier space was transposed */
-        /* kpos is y, z, x */
-        pos[0] = kpos[2];
-        pos[1] = kpos[0];
-        pos[2] = kpos[1];
-        dst[ip][0] = src[ip][0];
-        dst[ip][1] = src[ip][1];
-        if(H) {
-            H(pm, k2, pos, &dst[ip]);
-        }
-    }
 
-}
 
 
 /**************
@@ -1161,24 +1007,6 @@ static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weigh
 #pragma omp atomic update
     mesh[0] += weight * Mass;
 }
-//escape fraction scaled GSM
-static void put_star_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
-    if(INACTIVE(i) || *TYPE(i) != 4)
-        return;
-    double Mass = *MASS(i);
-    double fesc = *FESC(i);
-#pragma omp atomic update
-    mesh[0] += weight * Mass * fesc;
-}
-//escape fraciton scaled SFR
-static void put_sfr_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
-    if(INACTIVE(i) || *TYPE(i) != 0)
-        return;
-    double Sfr = *SFR(i);
-    double fesc = *FESCSPH(i);
-#pragma omp atomic update
-    mesh[0] += weight * Sfr * fesc;
-}
 static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
     int64_t result = 0;
     MPI_Allreduce(&input, &result, 1, MPI_INT64, MPI_SUM, comm);

From 5f862765cb6e6f12bc794a3893d4d8c7d8fb898c Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 10:52:43 -0400
Subject: [PATCH 084/120] cleanup

---
 libgadget/petapm.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 90235718..35853ca9 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -141,10 +141,8 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     // std::vector<float> ref = data;
 
 
-
-/********************************not sure if these are useful or not**************************************** */
     ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
-    ptrdiff_t np[2];
+    ptrdiff_t np[2]; // 2D arrangement of ranks
 
     int ThisTask;
     int NTask;
@@ -184,11 +182,8 @@ if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
     endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
 }
 
-// Step 3: Determine local FFT size (adapt this for cuFFTMp if necessary)
-// cuFFTMp might require manual management of the local data size
-// Example: You may need to calculate how much data each process holds based on grid decomposition
-
-pm->priv->fftsize = 2 * local_fft_size_cufftmp(n, pm->priv->comm_cart_2d, 
+//local_fft_size_cufftmp
+pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d, 
                                                 pm->real_space_region.size, 
                                                 pm->real_space_region.offset, 
                                                 pm->fourier_space_region.size, 

From d80a2ae604a3210aa7d972410cf2ad2808d864b9 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 10:58:17 -0400
Subject: [PATCH 085/120] modified task 2d decomp in petapm

---
 libgadget/petapm.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 35853ca9..671123b9 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -142,6 +142,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
 
     ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
+    //CUDA NOTE: keep np[2] to be two numbers for now, but np[0] = np[1]
     ptrdiff_t np[2]; // 2D arrangement of ranks
 
     int ThisTask;
@@ -154,13 +155,15 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     MPI_Comm_size(comm, &NTask);
 
     /* try to find a square 2d decomposition */
+    /* CUDA NOTE: CufftMp only supports square decomposition, 
+    so Ntask has to be a perfect square*/
     int i;
     int k;
-    for(i = sqrt(NTask) + 1; i >= 0; i --) {
-        if(NTask % i == 0) break;
+    np[0] = sqrt(NTask);
+    np[1] = Ntask / np[0];
+    if (np[0] * np[1] != NTask) {
+        endrun(0, "Error: The number of MPI ranks has to be a perfect square for CufftMp\n");
     }
-    np[0] = i;
-    np[1] = NTask / i;
 
 message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
 

From 305e04ec062cf0ec1ef1222a4066cd238ea35895 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 13:55:28 -0400
Subject: [PATCH 086/120] rm previous default slab code->use pencil

---
 libgadget/petapm.c | 44 ++++++++------------------------------------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 671123b9..40b5a6e7 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -115,32 +115,6 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     MPI_Comm_rank(comm, &ThisTask);
     MPI_Comm_size(comm, &NTask);
 
-
-    int ndevices;
-    cudaGetDeviceCount(&ndevices);
-    cudaSetDevice(ThisTask % ndevices);
-    printf("Hello from rank %d/%d using GPU %d\n", ThisTask, NTask, ThisTask % ndevices);
-
-    // Logical transform size
-    size_t nx = NTask;      // any value >= NTask is OK
-    size_t ny = NTask;      // any value >= NTask is OK
-    size_t nz = 2 * NTask;  // need to be even and >= NTask
-
-    // We start with Slabs distributed along X (X-Slabs)
-    // Ranks 0 ... (nx % size - 1) own 1 more element in the X dimension
-    // All ranks own all element in the Y and Z dimension
-    // The Z dimension has to be padded to accomodate the (nz / 2 + 1) 
-    // complex numbers assuming an in-place data layout.
-    int ranks_with_onemore = nx % size;
-    size_t my_nx = (nx / size) + (rank < ranks_with_onemore ? 1 : 0);
-    size_t padded_nz = 2 * (nz / 2 + 1);
-
-    // // Local, distributed, data
-    // std::vector<float> data(my_nx * ny * padded_nz, 1.0);
-    // generate_random(data, rank);
-    // std::vector<float> ref = data;
-
-
     ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
     //CUDA NOTE: keep np[2] to be two numbers for now, but np[0] = np[1]
     ptrdiff_t np[2]; // 2D arrangement of ranks
@@ -157,24 +131,22 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     /* try to find a square 2d decomposition */
     /* CUDA NOTE: CufftMp only supports square decomposition, 
     so Ntask has to be a perfect square*/
-    int i;
-    int k;
     np[0] = sqrt(NTask);
     np[1] = Ntask / np[0];
     if (np[0] * np[1] != NTask) {
         endrun(0, "Error: The number of MPI ranks has to be a perfect square for CufftMp\n");
     }
 
-message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
+    message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
 
-// Step 1: Create 2D Cartesian grid for the processes
-int dims[2] = {np[0], np[1]};
-int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
+    // Step 1: Create 2D Cartesian grid for the processes
+    int dims[2] = {np[0], np[1]};
+    int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
 
-// Create 2D Cartesian communicator
-if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
-    endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
-}
+    // Create 2D Cartesian communicator
+    if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
+        endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
+    }
 
 // Step 2: Get the Cartesian coordinates of the process in the grid
 int periods_unused[2];

From 7e58404277a2907fd007d0d0466832887c2650cd Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 14:05:02 -0400
Subject: [PATCH 087/120] clean up some reion stuff in petapm.h

---
 libgadget/petapm.h | 28 +---------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 3db41533..b3eb580b 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -86,19 +86,7 @@ typedef struct {
     int (*active) (int i);
     int64_t NumPart;
 } PetaPMParticleStruct;
-
-/* extra particle info used in reionisation*/
-typedef struct {
-    size_t offset_type; //offset in particle data to type
-    size_t offset_pi; //offset in particle data to property index
-    void * Sphslot; //pointer to SPH slot
-    size_t sph_elsize; //element size of SPH slot
-    size_t offset_sfr; //offset in SPH slot to star formation rate
-    size_t offset_fesc_sph; //offset in SPH slot to escape fraction
-    void* Starslot; //pointer to fof groups
-    size_t star_elsize; //element size of fof group
-    size_t offset_fesc; //offset in fof groups to fof mass
-} PetaPMReionPartStruct;
+zq
 
 typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value); //NC:change to complex type
 typedef void (*petapm_readout_func)(PetaPM * pm, int i, double * mesh, double weight);
@@ -110,9 +98,6 @@ typedef struct {
     petapm_readout_func readout;
 } PetaPMFunctions;
 
-/* Reion Loop function, applied after c2r, doesn't iterate over all particles*/
-typedef void (*petapm_reion_func)(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr, double * mass_real, double * star_real, double * sfr_real, int last_step);
-
 /* this mixes up fourier space analysis; with transfer. Shall split them. */
 typedef struct {
     /* this is a fourier space readout; need a better name */
@@ -158,15 +143,4 @@ int petapm_mesh_to_k(PetaPM * pm, int i);
 int *petapm_get_thistask2d(PetaPM * pm);
 int *petapm_get_ntask2d(PetaPM * pm);
 cufftComplex * petapm_alloc_rhok(PetaPM * pm); // NC: changed returned complex type
-
-void petapm_reion(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        petapm_prepare_func prepare,
-        PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
-        PetaPMFunctions * functions,
-        PetaPMParticleStruct * pstruct,
-        PetaPMReionPartStruct * rstruct,
-        petapm_reion_func reion_loop,
-        double R_max, double R_min, double R_delta, int use_sfr,
-        void * userdata);
-
 #endif

From 3fc532a747395dbe5385e951f856ac0fcfd0802c Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 14:42:17 -0400
Subject: [PATCH 088/120] rewrite petapm_init

---
 libgadget/petapm.c | 160 ++++++++++++++++-----------------------------
 1 file changed, 58 insertions(+), 102 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 40b5a6e7..284f0351 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -7,6 +7,7 @@
 
 #include "types.h"
 #include "petapm.h"
+#include "box_iterator.hpp"
 
 #include "utils.h"
 #include "walltime.h"
@@ -93,9 +94,9 @@ petapm_module_init(int Nthreads)
     #endif
     // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
 
-    // get rid of pencil type
-    //MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
-    //MPI_Type_commit(&MPI_PENCIL);
+    get rid of pencil type
+    MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
+    MPI_Type_commit(&MPI_PENCIL);
 }
 
 void
@@ -114,19 +115,9 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     int NTask;
     MPI_Comm_rank(comm, &ThisTask);
     MPI_Comm_size(comm, &NTask);
-
-    ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
-    //CUDA NOTE: keep np[2] to be two numbers for now, but np[0] = np[1]
-    ptrdiff_t np[2]; // 2D arrangement of ranks
-
-    int ThisTask;
-    int NTask;
-
-    pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
-    pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
-
-    MPI_Comm_rank(comm, &ThisTask);
-    MPI_Comm_size(comm, &NTask);
+    int ndevices;
+    cudaGetDeviceCount(&ndevices);
+    cudaSetDevice(rank % ndevices);
 
     /* try to find a square 2d decomposition */
     /* CUDA NOTE: CufftMp only supports square decomposition, 
@@ -138,63 +129,53 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     }
 
     message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
-
-    // Step 1: Create 2D Cartesian grid for the processes
-    int dims[2] = {np[0], np[1]};
-    int periods[2] = {0, 0};  // No periodic boundaries in the Cartesian grid
-
-    // Create 2D Cartesian communicator
-    if (MPI_Cart_create(comm, 2, dims, periods, 1, &pm->priv->comm_cart_2d) != MPI_SUCCESS) {
-        endrun(0, "Error: This test file only works with %td processes.\n", np[0] * np[1]);
-    }
-
-// Step 2: Get the Cartesian coordinates of the process in the grid
-int periods_unused[2];
-MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
-
-// Ensure that the task grid matches the expected number of processes
-if (pm->NTask2d[0] != np[0] || pm->NTask2d[1] != np[1]) {
-    endrun(6, "Bad PM mesh: Task2D = %d %d np %ld %ld\n", pm->NTask2d[0], pm->NTask2d[1], np[0], np[1]);
-}
-
-//local_fft_size_cufftmp
-pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d, 
-                                                pm->real_space_region.size, 
-                                                pm->real_space_region.offset, 
-                                                pm->fourier_space_region.size, 
-                                                pm->fourier_space_region.offset);
-
-    /*
-     * In fourier space, the transposed array is ordered in
-     * are in (y, z, x). The strides and sizes returned
-     * from local size is in (Nx, Ny, Nz), hence we roll them once
-     * so that the strides will give correct linear indexing for
-     * integer coordinates given in order of (y, z, x).
-     * */
-
-#define ROLL(a, N, j) { \
-    typeof(a[0]) tmp[N]; \
-    ptrdiff_t k; \
-    for(k = 0; k < N; k ++) tmp[k] = a[k]; \
-    for(k = 0; k < N; k ++) a[k] = tmp[(k + j)% N]; \
-    }
-
-    ROLL(pm->fourier_space_region.offset, 3, 1);
-    ROLL(pm->fourier_space_region.size, 3, 1);
-
-#undef ROLL
-
-    /* calculate the strides */
-    petapm_region_init_strides(&pm->real_space_region);
-    petapm_region_init_strides(&pm->fourier_space_region);
-
-
-/******************************** end unsure block **************************************** */
-
+    // Define custom data distribution
+    int64 nx               = Nmesh;
+    int64 ny               = Nmesh;
+    int64 nz               = Nmesh;
+    int64 nz_real          = nz;
+    int64 nz_complex       = (nz/2+1);
+    int64 nz_real_padded   = 2*nz_complex;
+
+    // Describe the data distribution using boxes
+    auto make_box = [](int64 lower[3], int64 upper[3], int64 strides[3]) {
+        Box3D box;
+        for(int i = 0; i < 3; i++) {
+            box.lower[i] = lower[i];
+            box.upper[i] = upper[i];
+            box.strides[i] = strides[i];
+        }
+        return box;
+    };
+
+    auto displacement = [](int64 length, int rank, int size) {
+        int ranks_cutoff = length % size;
+        return (rank < ranks_cutoff ? rank * (length / size + 1) : ranks_cutoff * (length / size + 1) + (rank - ranks_cutoff) * (length / size));
+    };
+
+    // Input data are real pencils in X & Y, along Z
+    // Strides are packed and in-place (i.e., real is padded)
+    int64 lower[3]   = {displacement(nx, i,   nranks1d), displacement(ny, j,   nranks1d), 0};
+    int64 upper[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
+    int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
+    box_real = make_box(lower, upper, strides);
+    boxes_real.push_back(make_box(lower, upper, strides));
+
+    // Output data are complex pencils in X & Z, along Y (picked arbitrarily)
+    // Strides are packed
+    // For best performances, the local dimension in the input (Z, here) and output (Y, here) should be different
+    // to ensure cuFFTMp will only perform two communication phases.
+    // If Z was also local in the output, cuFFTMp would perform three communication phases, decreasing performances.
+    int64 lower[3]   = {displacement(nx, i,   nranks1d), 0,  displacement(nz_complex, j,   nranks1d)};
+    int64 upper[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
+    int64 strides[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
+    box_complex = make_box(lower, upper, strides);
+
+
+    //===============================================================================================
     cudaStreamCreate(&pm->priv->stream);
     cufftCreate(&pm->priv->plan_forw);
     cufftCreate(&pm->priv->plan_back);
-
     // Attach the MPI communicator to the plans
     cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
     cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
@@ -204,8 +185,8 @@ pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
     // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
     // So, in both, the "input" box should be the real box and the "output" box should be the complex box
 
-    // cufftXtSetDistribution(plan_r2c, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
-    // cufftXtSetDistribution(plan_c2r, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
+    cufftXtSetDistribution(pm->priv->plan_forw, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
+    cufftXtSetDistribution(pm->priv->plan_back, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
 
     // Set the stream
     cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
@@ -217,6 +198,11 @@ pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
     cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
 
 
+
+
+    //===============================================================================================
+
+
     // Allocate GPU memory, copy CPU data to GPU
     // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
     cudaLibXtDesc *desc;
@@ -224,36 +210,6 @@ pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
     // TODO: what to make of the cpu_data here?
     cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
 
-    /* now lets fill up the mesh2task arrays */
-
-#if 0
-    message(1, "ThisTask = %d (%td %td %td) - (%td %td %td)\n", ThisTask,
-            pm->real_space_region.offset[0],
-            pm->real_space_region.offset[1],
-            pm->real_space_region.offset[2],
-            pm->real_space_region.size[0],
-            pm->real_space_region.size[1],
-            pm->real_space_region.size[2]);
-#endif
-
-    int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
-    for(k = 0; k < 2; k ++) {
-        for(i = 0; i < Nmesh; i ++) {
-            tmp[i] = 0;
-        }
-        for(i = 0; i < pm->real_space_region.size[k]; i ++) {
-            tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
-        }
-        /* which column / row hosts this tile? */
-        /* FIXME: this is very inefficient */
-        MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
-        /*
-        for(i = 0; i < Nmesh; i ++) {
-            message(0, "Mesh2Task[%d][%d] == %d\n", k, i, Mesh2Task[k][i]);
-        }
-        */
-    }
-    myfree(tmp);
 }
 
 void

From 3b5fbb536f1944553ae1374cdcbc6c18d80d0e41 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 15:07:19 -0400
Subject: [PATCH 089/120] rewrite r2c in petapm_force_r2c

---
 libgadget/petapm.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 284f0351..038f3aaa 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -197,19 +197,9 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
     cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
 
-
-
-
     //===============================================================================================
 
 
-    // Allocate GPU memory, copy CPU data to GPU
-    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
-    cudaLibXtDesc *desc;
-    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_INPLACE);
-    // TODO: what to make of the cpu_data here?
-    cufftXtMemcpy(pm->priv->plan_back, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE);
-
 }
 
 void
@@ -313,27 +303,32 @@ static void pm_apply_transfer_function(PetaPM * pm,
 cufftComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ) {
-    /* call pfft rho_k is CFT of rho */
-
-    /* this is because
-     *
-     * CFT = DFT * dx **3
-     * CFT[rho] = DFT [rho * dx **3] = DFT[CIC]
-     * */
+     // CUDA TODO: figureout how to properly get fftsize
     double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
     memset(real, 0, sizeof(double) * pm->priv->fftsize);
     layout_build_and_exchange_cells_to_fft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
     walltime_measure("/PMgrav/comm2");
-
 #ifdef DEBUG
     verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
     walltime_measure("/PMgrav/Verify");
 #endif
 
     cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-    pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
+
+    // CUDA TODO: figure out if this is needed
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
+    cufftXtMalloc(pm->priv->plan_forw, &pm->priv->desc, CUFFT_XT_FORMAT_INPLACE);
+    // copy real array to gpu
+    cufftXtMemcpy(pm->priv->plan_back, (void*)pm->priv->desc, (void*)real, CUFFT_COPY_HOST_TO_DEVICE);
+    // execute the plan
+    cufftXtExecDescriptor(pm->priv->plan_forw, pm->priv->desc, pm->priv->desc, CUFFT_FORWARD);
     myfree(real);
 
+
+    //=============================== End of R2C =============================================
+
+    //========================== Begin Transfer Function =====================================
     cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
 
     /*Do any analysis that may be required before the transfer function is applied*/

From 4b7f2dbf25297b063cd8198e2cfb1a8046c0459e Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 15:19:40 -0400
Subject: [PATCH 090/120] modified c2r, almost done with petapm changes

---
 libgadget/petapm.c | 8 ++++----
 libgadget/petapm.h | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 038f3aaa..78c2c29b 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -327,7 +327,6 @@ cufftComplex * petapm_force_r2c(PetaPM * pm,
 
 
     //=============================== End of R2C =============================================
-
     //========================== Begin Transfer Function =====================================
     cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
 
@@ -363,9 +362,10 @@ petapm_force_c2r(PetaPM * pm,
         /* apply the greens function turn rho_k into potential in fourier space */
         pm_apply_transfer_function(pm, rho_k, complx, transfer);
         walltime_measure("/PMgrav/calc");
-
-        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-        pfft_execute_dft_c2r(pm->priv->plan_back, complx, real);
+        // double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+        /* CUDA TODO: BUT WHERE DO I INPUT THE ACTUAL ARRAY? */
+        cufftXtExecDescriptor(pm->priv->plan_back, pm->priv->desc, pm->priv->desc, CUFFT_INVERSE);
+        double * real = (double * ) pm->priv->desc->descriptor->data[0];
 
         walltime_measure("/PMgrav/c2r");
         if(f == functions) // Once
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index b3eb580b..3b6c68f8 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -4,11 +4,14 @@
 
 #include "powerspectrum.h"
 
+
 typedef struct Region {
     /* represents a region in the FFT Mesh */
     ptrdiff_t offset[3];
     ptrdiff_t size[3];
     ptrdiff_t strides[3];
+
+
     size_t totalsize;
     double * buffer;
     /* below are used mostly for investigation */
@@ -58,6 +61,7 @@ typedef struct PetaPMPriv {
     double * meshbuf;
     size_t meshbufsize;
     struct Layout layout;
+    cudaLibXtDesc *desc;
 } PetaPMPriv;
 
 typedef struct PetaPM {

From c1b317053a1bf99efa2d91ee082e2d510f2339cf Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 18:03:46 -0400
Subject: [PATCH 091/120] added box structure to petapm

---
 libgadget/petapm.c | 5 +----
 libgadget/petapm.h | 7 +++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 78c2c29b..aa0587a9 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -7,7 +7,6 @@
 
 #include "types.h"
 #include "petapm.h"
-#include "box_iterator.hpp"
 
 #include "utils.h"
 #include "walltime.h"
@@ -155,6 +154,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
     // Input data are real pencils in X & Y, along Z
     // Strides are packed and in-place (i.e., real is padded)
+    
     int64 lower[3]   = {displacement(nx, i,   nranks1d), displacement(ny, j,   nranks1d), 0};
     int64 upper[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
     int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
@@ -196,10 +196,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     size_t workspace;
     cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
     cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
-
     //===============================================================================================
-
-
 }
 
 void
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 3b6c68f8..0f704dd0 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -4,6 +4,13 @@
 
 #include "powerspectrum.h"
 
+using int64 = long long int;
+
+struct Box3D {
+    int64 lower[3];
+    int64 upper[3];
+    int64 strides[3];
+};
 
 typedef struct Region {
     /* represents a region in the FFT Mesh */

From aa935fa4270eab616f4821043775fe12cc0e6e07 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 17:32:13 -0500
Subject: [PATCH 092/120] fixed compiler errors in petapm

---
 libgadget/petapm.c | 38 ++++++++++++++++++++------------------
 libgadget/petapm.h |  1 -
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index aa0587a9..81c33990 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -57,7 +57,6 @@ petapm_alloc_rhok(PetaPM * pm)
 static void pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions);
 
 static PetaPMParticleStruct * CPS; /* stored by petapm_force, how to access the P array */
-static PetaPMReionPartStruct * CPS_R; /* stored by calculate_uvbg, how to access other properties in P, SphP, and Fof */
 #define POS(i) ((double*)  (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_pos]))
 #define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
 #define INACTIVE(i) (CPS->active && !CPS->active(i))
@@ -88,12 +87,11 @@ petapm_module_init(int Nthreads)
     cudaSetDevice(device_id);  // Set the active GPU device
 
     // Handle CPU threads manually, if needed (optional if not using multithreading on the CPU)
-    #ifdef _OPENMP
-    omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
-    #endif
+    // #ifdef _OPENMP
+    // omp_set_num_threads(Nthreads); // Set number of threads for OpenMP parallelism
+    // #endif
     // cuFFT itself is inherently multithreaded; no cuFFT-specific thread setting needed
 
-    get rid of pencil type
     MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
     MPI_Type_commit(&MPI_PENCIL);
 }
@@ -116,18 +114,18 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     MPI_Comm_size(comm, &NTask);
     int ndevices;
     cudaGetDeviceCount(&ndevices);
-    cudaSetDevice(rank % ndevices);
+    cudaSetDevice(ThisTask % ndevices);
 
     /* try to find a square 2d decomposition */
     /* CUDA NOTE: CufftMp only supports square decomposition, 
     so Ntask has to be a perfect square*/
-    np[0] = sqrt(NTask);
-    np[1] = Ntask / np[0];
-    if (np[0] * np[1] != NTask) {
+    int nranks1d;
+    nranks1d = sqrt(NTask);
+    if (nranks1d != NTask/nranks1d) {
         endrun(0, "Error: The number of MPI ranks has to be a perfect square for CufftMp\n");
     }
 
-    message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
+    message(0, "Using 2D Task mesh %td x %td \n", nranks1d, nranks1d);
     // Define custom data distribution
     int64 nx               = Nmesh;
     int64 ny               = Nmesh;
@@ -154,22 +152,26 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
     // Input data are real pencils in X & Y, along Z
     // Strides are packed and in-place (i.e., real is padded)
-    
+    Box3D box_real;
+    Box3D box_complex;
+    int i,j;
+    i = ThisTask / nranks1d;
+    j = ThisTask % nranks1d;
+
     int64 lower[3]   = {displacement(nx, i,   nranks1d), displacement(ny, j,   nranks1d), 0};
     int64 upper[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
     int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
     box_real = make_box(lower, upper, strides);
-    boxes_real.push_back(make_box(lower, upper, strides));
 
     // Output data are complex pencils in X & Z, along Y (picked arbitrarily)
     // Strides are packed
     // For best performances, the local dimension in the input (Z, here) and output (Y, here) should be different
     // to ensure cuFFTMp will only perform two communication phases.
     // If Z was also local in the output, cuFFTMp would perform three communication phases, decreasing performances.
-    int64 lower[3]   = {displacement(nx, i,   nranks1d), 0,  displacement(nz_complex, j,   nranks1d)};
-    int64 upper[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
-    int64 strides[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
-    box_complex = make_box(lower, upper, strides);
+    int64 lower_c[3]   = {displacement(nx, i,   nranks1d), 0,  displacement(nz_complex, j,   nranks1d)};
+    int64 upper_c[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
+    int64 strides_c[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
+    box_complex = make_box(lower_c, upper_c, strides_c);
 
 
     //===============================================================================================
@@ -288,8 +290,8 @@ static void pm_apply_transfer_function(PetaPM * pm,
         pos[0] = kpos[2];
         pos[1] = kpos[0];
         pos[2] = kpos[1];
-        dst[ip][0] = src[ip][0];
-        dst[ip][1] = src[ip][1];
+        dst[ip].x = src[ip].x;
+        dst[ip].y = src[ip].y;
         if(H) {
             H(pm, k2, pos, &dst[ip]);
         }
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 0f704dd0..c6e14469 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -97,7 +97,6 @@ typedef struct {
     int (*active) (int i);
     int64_t NumPart;
 } PetaPMParticleStruct;
-zq
 
 typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value); //NC:change to complex type
 typedef void (*petapm_readout_func)(PetaPM * pm, int i, double * mesh, double weight);

From f2329476e9b4d8b173abeb710575184f7a6340ed Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 18:36:54 -0400
Subject: [PATCH 093/120] pfft->cufft type change in zeldovich

---
 libgenic/zeldovich.c | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/libgenic/zeldovich.c b/libgenic/zeldovich.c
index ffc2bfee..6c606f6f 100644
--- a/libgenic/zeldovich.c
+++ b/libgenic/zeldovich.c
@@ -4,7 +4,6 @@
 #include <string.h>
 #include <math.h>
 /* do NOT use complex.h it breaks the code */
-#include <pfft.h>
 #include "allvars.h"
 #include "proto.h"
 #include "power.h"
@@ -16,13 +15,13 @@
 #include <libgadget/utils.h>
 
 #define MESH2K(i) petapm_mesh_to_k(i)
-static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_density(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_vel_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_vel_y(PetaPM * pm, int i, double * mesh, double weight);
@@ -30,7 +29,7 @@ static void readout_vel_z(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_x(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_y(PetaPM * pm, int i, double * mesh, double weight);
 static void readout_disp_z(PetaPM * pm, int i, double * mesh, double weight);
-static void gaussian_fill(int Nmesh, PetaPMRegion * region, pfft_complex * rho_k, int UnitaryAmplitude, int InvertPhase, const int Seed);
+static void gaussian_fill(int Nmesh, PetaPMRegion * region, cufftComplex * rho_k, int UnitaryAmplitude, int InvertPhase, const int Seed);
 
 static inline double periodic_wrap(double x, const double BoxSize)
 {
@@ -218,7 +217,7 @@ void displacement_fields(PetaPM * pm, enum TransferType Type, struct ic_part_dat
            &icprep);
 
     /*This allocates the memory*/
-    pfft_complex * rho_k = petapm_alloc_rhok(pm);
+    cufftComplex * rho_k = petapm_alloc_rhok(pm);
 
     gaussian_fill(pm->Nmesh, petapm_get_fourier_region(pm),
 		  rho_k, GenicConfig.UnitaryAmplitude, GenicConfig.InvertPhase, GenicConfig.Seed);
@@ -274,7 +273,7 @@ void displacement_fields(PetaPM * pm, enum TransferType Type, struct ic_part_dat
  *
  *********************/
 
-static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     if(k2) {
         /* density is smoothed in k space by a gaussian kernel of 1 mesh grid */
         double r2 = 1.0 / pm->Nmesh;
@@ -289,7 +288,7 @@ static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex
     }
 }
 
-static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, pfft_complex * value, int include_growth) {
+static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, cufftComplex * value, int include_growth) {
     if(k2) {
         double fac = 1./ (2 * M_PI) / sqrt(pm->BoxSize) * kaxis / k2;
         /*
@@ -313,23 +312,23 @@ static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, pfft_complex * val
     }
 }
 
-static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[0], value, 1);
 }
-static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[1], value, 1);
 }
-static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void vel_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[2], value, 1);
 }
 
-static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[0], value, 0);
 }
-static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[1], value, 0);
 }
-static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void disp_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
     disp_transfer(pm, k2, kpos[2], value, 0);
 }
 
@@ -360,7 +359,7 @@ static void readout_disp_z(PetaPM * pm, int i, double * mesh, double weight) {
 }
 
 static void
-gaussian_fill(int Nmesh, PetaPMRegion * region, pfft_complex * rho_k, int setUnitaryAmplitude, int setInvertPhase, const int Seed)
+gaussian_fill(int Nmesh, PetaPMRegion * region, cufftComplex * rho_k, int setUnitaryAmplitude, int setInvertPhase, const int Seed)
 {
     /* fastpm deals with strides properly; petapm not. So we translate it here. */
     PMDesc pm[1];

From 8836c71113adbd17009467d1cda69e760f5d8613 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 18:40:03 -0400
Subject: [PATCH 094/120] fix cufft complex number indexing

---
 libgenic/zeldovich.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libgenic/zeldovich.c b/libgenic/zeldovich.c
index 6c606f6f..c10e7641 100644
--- a/libgenic/zeldovich.c
+++ b/libgenic/zeldovich.c
@@ -283,8 +283,8 @@ static void density_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex
         double kmag = sqrt(k2) * 2 * M_PI / pm->BoxSize;
         fac *= DeltaSpec(kmag, ptype) / sqrt(pm->BoxSize * pm->BoxSize * pm->BoxSize);
 
-        value[0][0] *= fac;
-        value[0][1] *= fac;
+        value[0].x *= fac;
+        value[0].y *= fac;
     }
 }
 
@@ -306,9 +306,9 @@ static void disp_transfer(PetaPM * pm, int64_t k2, int kaxis, cufftComplex * val
             fac *= dlogGrowth(kmag, ptype);
         else
             fac *= DeltaSpec(kmag, ptype);
-        double tmp = value[0][0];
-        value[0][0] = - value[0][1] * fac;
-        value[0][1] = tmp * fac;
+        double tmp = value[0].x;
+        value[0].x = - value[0].y * fac;
+        value[0].y = tmp * fac;
     }
 }
 

From 8d1ec8954a57bae025863132ee34be93d6341064 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 3 Oct 2024 18:44:14 -0400
Subject: [PATCH 095/120] pfft->cufft type in glass.c

---
 libgenic/glass.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/libgenic/glass.c b/libgenic/glass.c
index e9925016..5b24ee0b 100644
--- a/libgenic/glass.c
+++ b/libgenic/glass.c
@@ -16,10 +16,10 @@
 #include <libgadget/powerspectrum.h>
 #include <libgadget/gravity.h>
 
-static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
-static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value);
+static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
+static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value);
 static void readout_force_x(PetaPM *pm, int i, double * mesh, double weight);
 static void readout_force_y(PetaPM *pm, int i, double * mesh, double weight);
 static void readout_force_z(PetaPM *pm, int i, double * mesh, double weight);
@@ -278,7 +278,7 @@ _prepare(PetaPM * pm, PetaPMParticleStruct * pstruct, void * userdata, int * Nre
  *
  *********************/
 
-static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex *value) {
+static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex *value) {
 
     double f = 1.0;
     const double smth = 1.0 / k2;
@@ -301,13 +301,13 @@ static void potential_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex
 
     if(k2 == 0) {
         /* Remove zero mode corresponding to the mean.*/
-        value[0][0] = 0.0;
-        value[0][1] = 0.0;
+        value[0].x = 0.0;
+        value[0].y = 0.0;
         return;
     }
 
-    value[0][0] *= fac;
-    value[0][1] *= fac;
+    value[0].x *= fac;
+    value[0].y *= fac;
 }
 
 /* the transfer functions for force in fourier space applied to potential */
@@ -322,7 +322,7 @@ static double diff_kernel(double w) {
     return 1 / 6.0 * (8 * sin (w) - sin (2 * w));
 }
 
-static void force_transfer(PetaPM *pm, int k, pfft_complex * value) {
+static void force_transfer(PetaPM *pm, int k, cufftComplex * value) {
     double tmp0;
     double tmp1;
     /*
@@ -331,18 +331,18 @@ static void force_transfer(PetaPM *pm, int k, pfft_complex * value) {
      * filter is   i K(w)
      * */
     double fac = -1 * diff_kernel (k * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
-    tmp0 = - value[0][1] * fac;
-    tmp1 = value[0][0] * fac;
-    value[0][0] = tmp0;
-    value[0][1] = tmp1;
+    tmp0 = - value[0].y * fac;
+    tmp1 = value[0].x * fac;
+    value[0].x = tmp0;
+    value[0].y = tmp1;
 }
-static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_x_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[0], value);
 }
-static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_y_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[1], value);
 }
-static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex * value) {
+static void force_z_transfer(PetaPM *pm, int64_t k2, int kpos[3], cufftComplex * value) {
     force_transfer(pm, kpos[2], value);
 }
 static void readout_force_x(PetaPM *pm, int i, double * mesh, double weight) {

From ccddd54c72bd8555184798bca4047fed49ef0758 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 7 Oct 2024 13:53:49 -0400
Subject: [PATCH 096/120] comments/todos after talking to Yu

---
 libgadget/petapm.c | 6 +++++-
 libgadget/petapm.h | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 81c33990..a9738c2b 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -324,7 +324,9 @@ cufftComplex * petapm_force_r2c(PetaPM * pm,
     cufftXtExecDescriptor(pm->priv->plan_forw, pm->priv->desc, pm->priv->desc, CUFFT_FORWARD);
     myfree(real);
 
-
+     // CUDA TODO: need to check if the output complex array is transposed
+     // need to verify
+     // can verify by using both versions of the code
     //=============================== End of R2C =============================================
     //========================== Begin Transfer Function =====================================
     cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
@@ -920,6 +922,8 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 /**************
  * functions iterating over particle / mesh pairs
  ***************/
+ // can write to some other place and add up later
+ // look for numpy reduce at/bin count
 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
     double Mass = *MASS(i);
     if(INACTIVE(i))
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index c6e14469..4a58439c 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -31,6 +31,8 @@ typedef struct Region {
 /* a layout is the communication object, represent
  * pencil / cells exchanged  */
 
+
+// Layout determines which cells are sent to which task.
 struct Layout {
     MPI_Comm comm;
     int NpExport;

From 989fe42f469ddf51630e9d10838fb37314e8ac80 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 7 Oct 2024 14:15:23 -0400
Subject: [PATCH 097/120] manually incorporate gravpm change from upstream

---
 libgadget/gravpm.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libgadget/gravpm.c b/libgadget/gravpm.c
index b2ab750b..f7f496d1 100644
--- a/libgadget/gravpm.c
+++ b/libgadget/gravpm.c
@@ -318,9 +318,7 @@ static void compute_neutrino_power(PetaPM * pm) {
     delta_nu_from_power(ps, GravPM.CP, GravPM.Time, GravPM.TimeIC);
 
     /*Initialize the interpolation for the neutrinos*/
-    ps->nu_spline = gsl_interp_alloc(gsl_interp_linear,ps->nonzero);
-    ps->nu_acc = gsl_interp_accel_alloc();
-    gsl_interp_init(ps->nu_spline,ps->logknu,ps->delta_nu_ratio,ps->nonzero);
+    ps->nu_spline = new boost::math::interpolators::barycentric_rational<double>(ps->logknu, ps->delta_nu_ratio, ps->nonzero);
     /*Zero power spectrum, which is stored with the neutrinos*/
     powerspectrum_zero(ps);
 }
@@ -430,8 +428,7 @@ potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value)
          *            = (M_cdm + M_nu) * delta_t
          * This is correct for the forces, and gives the right power spectrum,
          * once we multiply PowerSpectrum.Norm by (Omega0 / (Omega0 - OmegaNu))**2 */
-        const double nufac = 1 + ps->nu_prefac * gsl_interp_eval(ps->nu_spline,ps->logknu,
-                                                                       ps->delta_nu_ratio,logk2,ps->nu_acc);
+        const double nufac = 1 + ps->nu_prefac * (*ps->nu_spline)(logk2);
         value[0].x *= nufac;
         value[0].y *= nufac;
     }

From b9f15914b50cb9f182af142fb31e2e0ec7c3cbe6 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 7 Oct 2024 15:20:34 -0500
Subject: [PATCH 098/120] remove pfft/fftw3 dependency

---
 depends/Makefile        | 17 ++---------------
 depends/install_pfft.sh | 35 -----------------------------------
 libgadget/uvbg.h        |  1 -
 3 files changed, 2 insertions(+), 51 deletions(-)
 delete mode 100644 depends/install_pfft.sh

diff --git a/depends/Makefile b/depends/Makefile
index 0c9e6144..85b6d56d 100644
--- a/depends/Makefile
+++ b/depends/Makefile
@@ -2,14 +2,11 @@ CONFIG = ../Options.mk
 include $(CONFIG)
 
 .PHONY: depends
-.INTERMEDIATE: pfft
 # MPICC ?= mpicc
 MPICCDEP ?= mpicc
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
 LIBRARIES=lib/libbigfile-mpi.a
-FFTLIBRARIES=lib/libpfft_omp.a lib/libfftw3_mpi.a lib/libfftw3_omp.a
-depends: $(LIBRARIES) $(FFTLIBRARIES)
-$(FFTLIBRARIES): pfft
+depends: $(LIBRARIES)
 
 lib/libbigfile-mpi.a: bigfile/src/bigfile-mpi.c
 	mkdir -p lib; \
@@ -17,20 +14,10 @@ lib/libbigfile-mpi.a: bigfile/src/bigfile-mpi.c
 	cd bigfile/src; \
 	make install PREFIX=$(PWD) CC="$(MPICCDEP)" MPICC="$(MPICCDEP)" CFLAGS="$(OPTIMIZE)" AR="$(AR)"
 
-pfft: install_pfft.sh
-	mkdir -p lib; \
-	mkdir -p include; \
-	#Using -ipo causes icc to crash.
-	MPICC="$(MPICCDEP)" CC="$(MPICCDEP)" CFLAGS="$(filter-out -ipo,$(OPTIMIZE)) -I $(PWD)/include -L$(PWD)/lib" AR="$(AR)" RANLIB=$(RANLIB) \
-        sh $(PWD)/install_pfft.sh $(PWD)/
 
-clean: clean-fast clean-fft
+clean: clean-fast
 
 clean-fast:
 	rm -rf $(LIBRARIES)
 	cd bigfile/src; make clean
 
-clean-fft:
-	rm -rf $(FFTLIBRARIES)
-	rm -rf tmp-pfft-*/double
-	rm -rf tmp-pfft-*/single
diff --git a/depends/install_pfft.sh b/depends/install_pfft.sh
deleted file mode 100644
index 9be12703..00000000
--- a/depends/install_pfft.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh -e
-
-PREFIX="$1"
-shift
-OPTIMIZE="$*"
-OPTIMIZE1="$*"
-echo "Optimization for double" ${OPTIMIZE}
-
-PFFT_VERSION=1.0.8-alpha3-fftw3-2don2d
-TMP="tmp-pfft-$PFFT_VERSION"
-LOGFILE="build.log"
-
-mkdir $TMP 
-ROOT=`dirname $0`/../
-if ! [ -f $ROOT/depends/pfft-$PFFT_VERSION.tar.gz ]; then
-wget https://github.com/rainwoodman/pfft/releases/download/$PFFT_VERSION/pfft-$PFFT_VERSION.tar.gz \
-    -O $ROOT/depends/pfft-$PFFT_VERSION.tar.gz 
-fi
-
-gzip -dc $ROOT/depends/pfft-$PFFT_VERSION.tar.gz | tar xf - -C $TMP
-cd $TMP
-
-(
-mkdir -p double;cd double
-
-../pfft-${PFFT_VERSION}/configure --prefix=$PREFIX --disable-shared --enable-static --enable-openmp \
---disable-fortran --disable-dependency-tracking --disable-doc --enable-mpi ${OPTIMIZE} &&
-make -j 8   &&
-make install && echo "PFFT_DONE"
-) 2>&1 > ${LOGFILE}.double
-
-if ! grep PFFT_DONE ${LOGFILE}.double > /dev/null; then
-    tail ${LOGFILE}.double
-    exit 1
-fi
diff --git a/libgadget/uvbg.h b/libgadget/uvbg.h
index 4520a583..3058e416 100644
--- a/libgadget/uvbg.h
+++ b/libgadget/uvbg.h
@@ -1,7 +1,6 @@
 #ifndef UVBG_H
 #define UVBG_H
 
-#include <pfft.h>
 #include "petapm.h"
 #include "utils/paramset.h"
 #include "fof.h"

From daf4c2f6d6f2c9b02abcc4c34323c03e79d14d36 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 7 Oct 2024 16:24:38 -0500
Subject: [PATCH 099/120] remove plane and lenstool in compilation

---
 Makefile           | 2 --
 gadget/params.c    | 8 --------
 libgadget/Makefile | 6 +-----
 3 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/Makefile b/Makefile
index 802532f8..f7dc0877 100644
--- a/Makefile
+++ b/Makefile
@@ -8,8 +8,6 @@ include Makefile.version
 FILES = $(shell git ls-files)
 
 all: $(CONFIG)
-	@echo "=================$(BOOST_LIBS)======================="
-	@echo "=================$(GSL_LIBS)======================="
 	cd depends; $(MAKE)
 	cd libgadget; $(MAKE)
 	cd libgenic; $(MAKE)
diff --git a/gadget/params.c b/gadget/params.c
index 608189bf..9f384614 100644
--- a/gadget/params.c
+++ b/gadget/params.c
@@ -23,7 +23,6 @@
 #include <libgadget/cooling_qso_lightup.h>
 #include <libgadget/uvbg.h>
 #include <libgadget/stats.h>
-#include <libgadget/plane.h>
 
 static int
 BlackHoleFeedbackMethodAction (ParameterSet * ps, const char * name, void * data)
@@ -73,12 +72,6 @@ create_gadget_parameter_set()
     param_declare_string(ps, "CpuFile", OPTIONAL, "cpu.txt", "File to output cpu usage information");
     param_declare_string(ps, "OutputList", REQUIRED, NULL, "List of output scale factors.");
 
-    /*Potential plane parameters*/
-    param_declare_string(ps, "PlaneOutputList", OPTIONAL, NULL, "List of potential plane output scale factors.");
-    param_declare_int(ps, "PlaneResolution", OPTIONAL, 256, "Number of pixels per dimension in the potential plane (should be an even number).");
-    param_declare_double(ps, "PlaneThickness", OPTIONAL, -1, "Thickness of the potential plane in the normal direction in internal gadget units (kpc/h by default).");
-    param_declare_string(ps, "PlaneCutPoints", OPTIONAL, NULL, "List of potential plane cut points in the normal direction in internal gadget units (kpc/h by default).");
-    param_declare_string(ps, "PlaneNormals", OPTIONAL, "\"0, 1, 2\"", "List of potential plane normal directions (0=x, 1=y, 2=z).");
 
     /*Cosmology parameters*/
     param_declare_double(ps, "Omega0", REQUIRED, 0.2814, "Total matter density at z=0");
@@ -419,7 +412,6 @@ void read_parameter_file(char *fname, int * ShowBacktrace, double * MaxMemSizePe
 
     /*Initialize per-module parameters.*/
     set_all_global_params(ps);
-    set_plane_params(ps);
     set_init_params(ps);
     set_petaio_params(ps);
     set_timestep_params(ps);
diff --git a/libgadget/Makefile b/libgadget/Makefile
index ab569eb9..ef7a279e 100644
--- a/libgadget/Makefile
+++ b/libgadget/Makefile
@@ -38,8 +38,6 @@ INCL = densitykernel.h \
 	neutrinos_lra.h \
 	omega_nu_single.h \
 	uvbg.h \
-	plane.h \
-	lenstools.h \
 utils/unitsystem.h \
 utils/peano.h \
 utils/interp.h \
@@ -110,9 +108,7 @@ GADGET_OBJS =  \
 	 neutrinos_lra.o \
      omega_nu_single.o \
 	 config.o \
-	 uvbg.o \
-	 plane.o\
-	 lenstools.o
+	 uvbg.o 
 
 GADGET_UTILS_OBJS= \
 utils/endrun.o \

From e782b31d255f0d1186b94cdf0c0605d3038a172b Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 7 Oct 2024 17:26:56 -0500
Subject: [PATCH 100/120] add back plane.h just for getting things to run

---
 gadget/params.c    | 6 ++++++
 libgadget/Makefile | 4 +++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/gadget/params.c b/gadget/params.c
index 9f384614..4c5c19e6 100644
--- a/gadget/params.c
+++ b/gadget/params.c
@@ -72,6 +72,12 @@ create_gadget_parameter_set()
     param_declare_string(ps, "CpuFile", OPTIONAL, "cpu.txt", "File to output cpu usage information");
     param_declare_string(ps, "OutputList", REQUIRED, NULL, "List of output scale factors.");
 
+    /*Potential plane parameters*/
+    param_declare_string(ps, "PlaneOutputList", OPTIONAL, NULL, "List of potential plane output scale factors.");
+    param_declare_int(ps, "PlaneResolution", OPTIONAL, 256, "Number of pixels per dimension in the potential plane (should be an even number).");
+    param_declare_double(ps, "PlaneThickness", OPTIONAL, -1, "Thickness of the potential plane in the normal direction in internal gadget units (kpc/h by default).");
+    param_declare_string(ps, "PlaneCutPoints", OPTIONAL, NULL, "List of potential plane cut points in the normal direction in internal gadget units (kpc/h by default).");
+    param_declare_string(ps, "PlaneNormals", OPTIONAL, "\"0, 1, 2\"", "List of potential plane normal directions (0=x, 1=y, 2=z).");
 
     /*Cosmology parameters*/
     param_declare_double(ps, "Omega0", REQUIRED, 0.2814, "Total matter density at z=0");
diff --git a/libgadget/Makefile b/libgadget/Makefile
index ef7a279e..0e7e8a82 100644
--- a/libgadget/Makefile
+++ b/libgadget/Makefile
@@ -38,6 +38,7 @@ INCL = densitykernel.h \
 	neutrinos_lra.h \
 	omega_nu_single.h \
 	uvbg.h \
+	plane.h \
 utils/unitsystem.h \
 utils/peano.h \
 utils/interp.h \
@@ -108,7 +109,8 @@ GADGET_OBJS =  \
 	 neutrinos_lra.o \
      omega_nu_single.o \
 	 config.o \
-	 uvbg.o 
+	 uvbg.o \
+	 plane.o 
 
 GADGET_UTILS_OBJS= \
 utils/endrun.o \

From 6e170909ffb98e0229f3f877103f7ebaf1496bd1 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Mon, 7 Oct 2024 22:42:57 -0700
Subject: [PATCH 101/120] Some fixes to travis tests

---
 libgadget/tests/test_cosmology.c   | 4 ++--
 platform-options/Options.mk.travis | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libgadget/tests/test_cosmology.c b/libgadget/tests/test_cosmology.c
index c1913927..35ccdfc8 100644
--- a/libgadget/tests/test_cosmology.c
+++ b/libgadget/tests/test_cosmology.c
@@ -8,7 +8,7 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
-#include <boost/math/special_functions/hypergeometric_2f1.hpp>
+#include <boost/math/special_functions/hypergeometric_2F1.hpp>
 #include <libgadget/physconst.h>
 #include <libgadget/cosmology.h>
 #include "stub.h"
@@ -52,7 +52,7 @@ static inline double radgrow(double aa, double omegar) {
 //Omega_L + Omega_M = 1 => D+ ~ Gauss hypergeometric function
 static inline double growth(double aa, double omegam) {
     double omegal = 1 - omegam;
-    return aa * boost::math::hypergeometric_2f1(1./3, 1, 11./6, -omegal/omegam * pow(aa, 3));
+    return aa * boost::math::hypergeometric_2F1(1./3, 1, 11./6, -omegal/omegam * pow(aa, 3));
 }
 
 static void test_cosmology(void ** state)
diff --git a/platform-options/Options.mk.travis b/platform-options/Options.mk.travis
index ab4aa2a4..9366c785 100644
--- a/platform-options/Options.mk.travis
+++ b/platform-options/Options.mk.travis
@@ -1,4 +1,4 @@
-OPTIMIZE =  -fopenmp -O2 -g -std=gnu99
+OPTIMIZE =  -fopenmp -O2 -g 
 AR=ar
 SHELL = /bin/bash
 

From be177e0eac1ed2221d323006b96c98bdecc0b000 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Mon, 7 Oct 2024 22:43:25 -0700
Subject: [PATCH 102/120] Remove pfft from makefiles

---
 .github/workflows/main.yaml | 2 +-
 Makefile                    | 2 --
 Makefile.rules              | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index f9622c4c..df7786fd 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -35,7 +35,7 @@ jobs:
     - name: Cache depends/
       uses: actions/cache@v4
       with:
-        key: ${{ runner.os }}-build-${{ hashFiles('depends/Makefile', 'depends/install_pfft.sh') }}
+        key: ${{ runner.os }}-build-${{ hashFiles('depends/Makefile') }}
         path: |
           ~/depends/install
           ~/depends/include
diff --git a/Makefile b/Makefile
index f7dc0877..1c5a2eed 100644
--- a/Makefile
+++ b/Makefile
@@ -58,9 +58,7 @@ tag:
 sdist:
 	(git rev-parse --abbrev-ref HEAD | grep master )|| (echo "Must be on master" && exit 1);
 	git checkout -B "rc-$(VERSION)";
-	git add -f depends/pfft-1.0.8-alpha2-fftw3.tar.gz
 	git commit -m "rc-$(VERSION) packaging"
-	git rm --cached depends/pfft-1.0.8-alpha2-fftw3.tar.gz
 	git commit -m "rc-$(VERSION) cleanup"
 	bash maintainer/git-archive-all.sh --prefix MPGadget-$(VERSION)/ -- - | gzip -c > MPGadget-$(VERSION).tar.gz
 	git checkout master
diff --git a/Makefile.rules b/Makefile.rules
index 56101cbe..13c66dd6 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -37,7 +37,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 #For tests
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
-BUNDLEDLIBS = -lbigfile-mpi -lbigfile -lpfft_omp -lfftw3_mpi -lfftw3_omp -lfftw3
+BUNDLEDLIBS = -lbigfile-mpi -lbigfile
 LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS) $(CUFFTMP_LIBS) $(NVSHMEM_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0

From 1257aa0013b47c2e3eb0d0b0a10ae1c3cd5dbc93 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Mon, 7 Oct 2024 22:46:14 -0700
Subject: [PATCH 103/120] Remove last bits of EXCUR_REION

---
 gadget/params.c    |   2 -
 libgadget/Makefile |   2 -
 libgadget/run.c    |   8 -
 libgadget/uvbg.c   | 596 ---------------------------------------------
 libgadget/uvbg.h   |  21 --
 5 files changed, 629 deletions(-)
 delete mode 100644 libgadget/uvbg.c
 delete mode 100644 libgadget/uvbg.h

diff --git a/gadget/params.c b/gadget/params.c
index 4c5c19e6..dddc17dc 100644
--- a/gadget/params.c
+++ b/gadget/params.c
@@ -21,7 +21,6 @@
 #include <libgadget/timebinmgr.h>
 #include <libgadget/petaio.h>
 #include <libgadget/cooling_qso_lightup.h>
-#include <libgadget/uvbg.h>
 #include <libgadget/stats.h>
 
 static int
@@ -431,7 +430,6 @@ void read_parameter_file(char *fname, int * ShowBacktrace, double * MaxMemSizePe
     set_domain_params(ps);
     set_sfr_params(ps);
     set_sync_params(ps);
-    set_uvbg_params(ps);
     set_winds_params(ps);
     set_fof_params(ps);
     set_blackhole_params(ps);
diff --git a/libgadget/Makefile b/libgadget/Makefile
index 0e7e8a82..7e184058 100644
--- a/libgadget/Makefile
+++ b/libgadget/Makefile
@@ -37,7 +37,6 @@ INCL = densitykernel.h \
 	walltime.h \
 	neutrinos_lra.h \
 	omega_nu_single.h \
-	uvbg.h \
 	plane.h \
 utils/unitsystem.h \
 utils/peano.h \
@@ -109,7 +108,6 @@ GADGET_OBJS =  \
 	 neutrinos_lra.o \
      omega_nu_single.o \
 	 config.o \
-	 uvbg.o \
 	 plane.o 
 
 GADGET_UTILS_OBJS= \
diff --git a/libgadget/run.c b/libgadget/run.c
index d40075df..6b750db0 100644
--- a/libgadget/run.c
+++ b/libgadget/run.c
@@ -30,7 +30,6 @@
 #include "fof.h"
 #include "cooling_qso_lightup.h"
 #include "timefac.h"
-#include "uvbg.h"
 #include "neutrinos_lra.h"
 #include "stats.h"
 #include "veldisp.h"
@@ -614,13 +613,6 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
                     /* Helium reionization by switching on quasar bubbles*/
                     do_heiii_reionization(atime, &fof, &gasTree, &All.CP, &rnd, units.UnitInternalEnergy_in_cgs, fds.FdHelium);
                 }
-#ifdef EXCUR_REION
-                //excursion set reionisation
-                if(CalcUVBG && All.ExcursionSetReionOn) {
-                    calculate_uvbg(&pm_mass, &pm_star, &pm_sfr, WriteSnapshot, SnapshotFileCount, All.OutputDir, atime, &All.CP, units);
-                    message(0,"uvbg calculated\n");
-                }
-#endif // ifdef EXCUR_REION
                 fof_finish(&fof);
             }
 
diff --git a/libgadget/uvbg.c b/libgadget/uvbg.c
deleted file mode 100644
index 97a379ae..00000000
--- a/libgadget/uvbg.c
+++ /dev/null
@@ -1,596 +0,0 @@
-/*=============================================================================
- * An implementation of a patchy UV ionising background
- * calculation. This code utilises the decomposition and communication
- * in the long-range force code in petapm.c, some new functions have been
- * written in petapm.c to accomodate the order of operations and multiple grids
- * present in the reionisation model
-============================================================================*/
-
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <bigfile.h>
-#include <bigfile-mpi.h>
-#include <stdbool.h>
-#include <assert.h>
-
-#include "uvbg.h"
-#include "cosmology.h"
-#include "utils.h"
-#include "partmanager.h"
-#include "slotsmanager.h"
-#include "petapm.h"
-#include "physconst.h"
-#include "walltime.h"
-#include "petaio.h"
-
-// TODO(smutch): See if something equivalent is defined anywhere else
-#define FLOAT_REL_TOL (float)1e-5
-
-static struct UVBGParams {
-    /*filter scale parameters*/
-    double ReionRBubbleMax;
-    double ReionRBubbleMin;
-    double ReionDeltaRFactor;
-    int ReionFilterType;
-    int RtoMFilterType;
-
-    /*J21 calculation parameters*/
-    double ReionGammaHaloBias;
-    double ReionNionPhotPerBary;
-    double AlphaUV;
-    double EscapeFractionNorm;
-    double EscapeFractionScaling;
-    int ReionUseParticleSFR;
-    double ReionSFRTimescale;
-    int UVBGdim;
-
-    double Time;
-    Cosmology *CP;
-    double UnitLength_in_cm;
-    double UnitMass_in_g;
-    double UnitTime_in_s;
-
-} uvbg_params;
-
-struct UVBGgrids_type UVBGgrids;
-
-/*set uvbg parameters*/
-void set_uvbg_params(ParameterSet * ps) {
-
-    int ThisTask;
-    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
-    if(ThisTask==0)
-    {
-        uvbg_params.ReionFilterType = param_get_int(ps, "ReionFilterType");
-        uvbg_params.RtoMFilterType = param_get_int(ps, "RtoMFilterType");
-        uvbg_params.ReionRBubbleMax = param_get_double(ps, "ReionRBubbleMax");
-        uvbg_params.ReionRBubbleMin = param_get_double(ps, "ReionRBubbleMin");
-        uvbg_params.ReionDeltaRFactor = param_get_double(ps, "ReionDeltaRFactor");
-        uvbg_params.ReionGammaHaloBias = param_get_double(ps, "ReionGammaHaloBias");
-        uvbg_params.ReionNionPhotPerBary = param_get_double(ps, "ReionNionPhotPerBary");
-        uvbg_params.AlphaUV = param_get_double(ps, "AlphaUV");
-        uvbg_params.EscapeFractionNorm = param_get_double(ps, "EscapeFractionNorm");
-        uvbg_params.EscapeFractionScaling = param_get_double(ps, "EscapeFractionScaling");
-        uvbg_params.ReionUseParticleSFR = param_get_int(ps, "ReionUseParticleSFR");
-        uvbg_params.ReionSFRTimescale = param_get_double(ps, "ReionSFRTimescale");
-        uvbg_params.UVBGdim = param_get_int(ps,"UVBGdim");
-    }
-
-    MPI_Bcast(&uvbg_params, sizeof(struct UVBGParams), MPI_BYTE, 0, MPI_COMM_WORLD);
-}
-
-int grid_index(int i, int j, int k, ptrdiff_t strides[3])
-{
-    return k*strides[2] + j*strides[1] + i*strides[0];
-}
-
-
-void save_uvbg_grids(int SnapshotFileCount, char * OutputDir, PetaPM * pm)
-{
-    int n_ranks;
-    int this_rank=-1;
-    int grid_n = pm->real_space_region.size[0] * pm->real_space_region.size[1] * pm->real_space_region.size[2];
-    MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
-    MPI_Comm_rank(MPI_COMM_WORLD, &this_rank);
-
-    //TODO(jdavies): finish this grid writing function, it outputs fine but in the wrong rank order
-    BigFile fout;
-    char fname[256];
-    sprintf(fname, "%s/UVgrids_%03d", OutputDir,SnapshotFileCount);
-    message(0, "saving uv grids to %s \n", fname);
-
-    if(0 != big_file_mpi_create(&fout, fname, MPI_COMM_WORLD)) {
-        endrun(0, "Failed to create snapshot at %s:%s\n", fname,
-                    big_file_get_error_message());
-    }
-
-    BigBlock bh;
-    if(0 != big_file_mpi_create_block(&fout, &bh, "Header", NULL, 0, 0, 0, MPI_COMM_WORLD)) {
-        endrun(0, "Failed to create block at %s:%s\n", "Header",
-                big_file_get_error_message());
-    }
-
-    if(
-    (0 != big_block_set_attr(&bh, "volume_weighted_global_xHI", &(UVBGgrids.volume_weighted_global_xHI), "f8", 1)) ||
-    (0 != big_block_set_attr(&bh, "mass_weighted_global_xHI", &(UVBGgrids.mass_weighted_global_xHI), "f8", 1)) ||
-    (0 != big_block_set_attr(&bh, "scale_factor", &uvbg_params.Time, "f8", 1)) ) {
-        endrun(0, "Failed to write attributes %s\n",
-                    big_file_get_error_message());
-    }
-
-    if(0 != big_block_mpi_close(&bh, MPI_COMM_WORLD)) {
-        endrun(0, "Failed to close block %s\n",
-                    big_file_get_error_message());
-    }
-
-    //TODO: think about the cartesian communicator in the PetaPM struct
-    //and the mapping between ranks, indices and positions
-
-    size_t dims[2] = {(size_t)grid_n, 1};
-    //J21 block
-    BigArray arr = {0};
-    big_array_init(&arr, UVBGgrids.J21, "=f4", 2, dims, NULL);
-    petaio_save_block(&fout,"J21",&arr,1);
-
-    message(0,"saved J21\n");
-
-    //xHI block
-    BigArray arr2 = {0};
-    big_array_init(&arr2, UVBGgrids.xHI, "=f4", 2, dims, NULL);
-    petaio_save_block(&fout,"XHI",&arr2,1);
-
-    message(0,"saved XHI\n");
-
-    if(0 != big_file_mpi_close(&fout, MPI_COMM_WORLD)){
-        endrun(0, "Failed to close snapshot at %s:%s\n", fname,
-                    big_file_get_error_message());
-    }
-}
-
-#ifdef EXCUR_REION
-
-static double RtoM(double R)
-{
-    // All in internal units
-    const int filter = uvbg_params.RtoMFilterType;
-    double OmegaM = uvbg_params.CP->Omega0;
-    double RhoCrit = uvbg_params.CP->RhoCrit;
-
-    switch (filter) {
-    case 0: //top hat M = (4/3) PI <rho> R^3
-        return (4.0 / 3.0) * M_PI * pow(R, 3) * (OmegaM * RhoCrit);
-    case 1: //gaussian: M = (2PI)^1.5 <rho> R^3
-        return pow(2 * M_PI, 1.5) * OmegaM * RhoCrit * pow(R, 3);
-    default: // filter not defined
-        endrun(1, "Unrecognised RtoM filter (%d).\n", filter);
-        break;
-    }
-
-    return -1;
-}
-
-//Simple region initialization (taken from zeldovich.c)
-//TODO: look into _prepare (gravpm.c) and see if its worth implementing anything there
-static PetaPMRegion * makeregion(PetaPM * pm, PetaPMParticleStruct * pstruct, void * userdata, int * Nregions) {
-    PetaPMRegion * regions = mymalloc2("Regions", sizeof(PetaPMRegion));
-    int NumPart = PartManager->NumPart;
-    int k;
-    int r = 0;
-    int i;
-    double min[3] = {pm->BoxSize, pm->BoxSize, pm->BoxSize};
-    double max[3] = {0, 0, 0.};
-
-    for(i = 0; i < NumPart; i ++) {
-        for(k = 0; k < 3; k ++) {
-            if(min[k] > P[i].Pos[k])
-            min[k] = P[i].Pos[k];
-            if(max[k] < P[i].Pos[k])
-            max[k] = P[i].Pos[k];
-        }
-    }
-
-    for(k = 0; k < 3; k ++) {
-        regions[r].offset[k] = floor(min[k] / pm->BoxSize * pm->Nmesh - 1);
-        regions[r].size[k] = ceil(max[k] / pm->BoxSize * pm->Nmesh + 2);
-        regions[r].size[k] -= regions[r].offset[k];
-    }
-
-    /* setup the internal data structure of the region */
-    petapm_region_init_strides(&regions[r]);
-    *Nregions = 1;
-    return regions;
-}
-
-//this is applied as global_transfer, dividing by n_cells due to the forward-reverse FFT
-static void divide_by_ncell(PetaPM * pm, int64_t k2, int k[3], pfft_complex * value){
-        int total_n_cells = (double)(uvbg_params.UVBGdim * uvbg_params.UVBGdim * uvbg_params.UVBGdim);
-        value[0][0] /= total_n_cells;
-        value[0][1] /= total_n_cells;
-}
-
-//transfer functions that applies a certain filter (top-hat or gaussian)
-static void filter_pm(PetaPM * pm, int64_t k2, int k[3], pfft_complex * value)
-{
-    const int filter_type = uvbg_params.ReionFilterType;
-    double k_mag = sqrt(k2) * (2 * M_PI / pm->Nmesh) * (pm->Nmesh / pm->BoxSize);
-
-    double kR = k_mag * pm->G; // Radius is stored in the G variable
-
-    switch (filter_type) {
-    case 0: // Real space top-hat
-        if (kR > 1e-4){
-            value[0][0] *= (3.0 * (sinf(kR) / powf(kR, 3) - cosf(kR) / powf(kR, 2)));
-            value[0][1] *= (3.0 * (sinf(kR) / powf(kR, 3) - cosf(kR) / powf(kR, 2)));
-        }
-        break;
-
-    case 1: // k-space top hat
-        kR *= 0.413566994; // Equates integrated volume to the real space top-hat (9pi/2)^(-1/3)
-        if (kR > 1){
-            value[0][0] = 0.0;
-            value[0][1] = 0.0;
-        }
-        break;
-
-    case 2: // Gaussian
-        kR *= 0.643; // Equates integrated volume to the real space top-hat
-        value[0][0] *= (pow(M_E,(-kR * kR / 2.0)));
-        value[0][1] *= (pow(M_E,(-kR * kR / 2.0)));
-        break;
-
-    default:
-        endrun(1, "ReionFilterType type %d is undefined!\n", filter_type);
-        break;
-    }
-}
-
-#ifdef DEBUG
-//print some statistics of the reion grids for debugging
-static void print_reion_debug_info(PetaPM * pm_mass, float * J21, float * xHI, double * mass_real, double * star_real, double * sfr_real)
-{
-    double min_J21 = 1e30;
-    double max_J21 = 0;
-    double min_mass = 1e30;
-    double max_mass = 0;
-    double min_star = 1e30;
-    double max_star = 0;
-    double min_sfr = 1e30;
-    double max_sfr = 0;
-    double total_star = 0;
-    double total_mass = 0;
-    int neutral_count = 0;
-    int ion_count = 0;
-    int pm_idx;
-    int uvbg_dim = uvbg_params.UVBGdim;
-    int grid_n_real = uvbg_dim * uvbg_dim * uvbg_dim;
-#pragma omp parallel for collapse(3) reduction(+:neutral_count,ion_count,total_mass,total_star) reduction(min:min_J21,min_mass,min_star,min_sfr) reduction(max:max_J21,max_mass,max_star,max_sfr) private(pm_idx)
-    for (int ix = 0; ix < pm_mass->real_space_region.size[0]; ix++)
-        for (int iy = 0; iy < pm_mass->real_space_region.size[1]; iy++)
-            for (int iz = 0; iz < pm_mass->real_space_region.size[2]; iz++) {
-                pm_idx = grid_index(ix, iy, iz, pm_mass->real_space_region.strides);
-
-                total_mass += mass_real[pm_idx];
-                total_star += star_real[pm_idx];
-                if(xHI[pm_idx] > 1 - FLOAT_REL_TOL)
-                    neutral_count += 1;
-                if(xHI[pm_idx] < FLOAT_REL_TOL)
-                    ion_count += 1;
-                min_J21 = min_J21 < J21[pm_idx] ? min_J21 : J21[pm_idx];
-                max_J21 = max_J21 > J21[pm_idx] ? max_J21 : J21[pm_idx];
-                min_mass = min_mass < mass_real[pm_idx] ? min_mass : mass_real[pm_idx];
-                max_mass = max_mass > mass_real[pm_idx] ? max_mass : mass_real[pm_idx];
-                min_star = min_star < star_real[pm_idx] ? min_star : star_real[pm_idx];
-                max_star = max_star > star_real[pm_idx] ? max_star : star_real[pm_idx];
-                if(uvbg_params.ReionUseParticleSFR){
-                    min_sfr = min_sfr < sfr_real[pm_idx] ? min_sfr : sfr_real[pm_idx];
-                    max_sfr = max_sfr > sfr_real[pm_idx] ? max_sfr : sfr_real[pm_idx];
-                }
-            }
-
-    message(1,"rank total mass : %e | rank total star : %e\n",total_mass,total_star);
-    MPI_Allreduce(MPI_IN_PLACE, &neutral_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &ion_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &min_J21, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &max_J21, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &min_mass, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &max_mass, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &min_star, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &max_star, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &min_sfr, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &max_sfr, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &total_mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, &total_star, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double n_ratio = (double)neutral_count / (double)grid_n_real;
-    double i_ratio = (double)ion_count / (double)grid_n_real;
-
-    message(0,"neutral cells : %d, ion cells %d, ratio(%d) N %f ion %f\n",neutral_count, ion_count, grid_n_real, n_ratio, i_ratio);
-    message(0,"min J21 : %e | max J21 %e\n",min_J21,max_J21);
-    message(0,"min mass : %e | max mass : %e | total mass %e\n",min_mass,max_mass,total_mass);
-    message(0,"min star : %e | max star %e | total star : %e\n",min_star,max_star,total_star);
-    message(0,"min sfr : %e | max sfr %e\n",min_sfr,max_sfr);
-}
-#endif
-
-//takes filtered mass, star, sfr grids and calculates J21 and neutral fractions onto a grid
-//which is placed in the mass grid out on the last call of this function.
-static void reion_loop_pm(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr,
-        double * mass_real, double * star_real, double * sfr_real, int last_step)
-{
-    //MAKE SURE THESE ARE PRIVATE IN THREADED LOOPS
-    double density_over_mean = 0;
-    double sfr_density = 0;
-    double f_coll_stars = 0;
-    int pm_idx = 0;
-
-    double R = pm_mass->G; //radius is stored here
-
-    const double redshift = 1.0 / (uvbg_params.Time) - 1.;
-
-    // Loop through filter radii
-    //(jdavies): get the parameters
-    //double ReionGammaHaloBias = uvbg_params.ReionGammaHaloBias;
-    const double ReionNionPhotPerBary = uvbg_params.ReionNionPhotPerBary;
-    int use_sfr = uvbg_params.ReionUseParticleSFR;
-    double alpha_uv = uvbg_params.AlphaUV;
-
-    // TODO(smutch): tidy this up!
-    // The following is based on Sobacchi & Messinger (2013) eqn 7
-    // with f_* removed and f_b added since we define f_coll as M_*/M_tot rather than M_vir/M_tot,
-    // and also with the inclusion of the effects of the Helium fraction.
-    const double Y_He = 1.0 - HYDROGEN_MASSFRAC;
-    const double BaryonFrac = uvbg_params.CP->OmegaBaryon / uvbg_params.CP->Omega0;
-    double ReionEfficiency = 1.0 / BaryonFrac * ReionNionPhotPerBary / (1.0 - 0.75 * Y_He);
-
-    const double tot_n_cells = pm_mass->Nmesh * pm_mass->Nmesh * pm_mass->Nmesh;
-    const double pixel_volume = pm_mass->CellSize * pm_mass->CellSize * pm_mass->CellSize;
-    const double deltax_conv_factor = tot_n_cells / (uvbg_params.CP->RhoCrit * uvbg_params.CP->Omega0 * pm_mass->BoxSize * pm_mass->BoxSize * pm_mass->BoxSize);
-
-    float* J21 = UVBGgrids.J21;
-    float* xHI = UVBGgrids.xHI;
-
-    // Perform sanity checks to account for aliasing effects
-#pragma omp parallel for collapse(3) private(pm_idx)
-    for (int ix = 0; ix < pm_mass->real_space_region.size[0]; ix++)
-        for (int iy = 0; iy < pm_mass->real_space_region.size[1]; iy++)
-            for (int iz = 0; iz < pm_mass->real_space_region.size[2]; iz++) {
-                pm_idx = grid_index(ix, iy, iz, pm_mass->real_space_region.strides);
-                mass_real[pm_idx] = fmax(mass_real[pm_idx], 0.0);
-                star_real[pm_idx] = fmax(star_real[pm_idx], 0.0);
-                if(use_sfr)
-                    sfr_real[pm_idx] = fmax(sfr_real[pm_idx], 0.0);
-            }
-
-    const double J21_aux_constant = (1.0 + redshift) * (1.0 + redshift) / (4.0 * M_PI)
-        * alpha_uv * PLANCK * 1e21
-        * R * uvbg_params.UnitLength_in_cm * ReionNionPhotPerBary / PROTONMASS
-        * uvbg_params.UnitMass_in_g / pow(uvbg_params.UnitLength_in_cm, 3) / uvbg_params.UnitTime_in_s;
-
-    const double hubble_time = 1 / (hubble_function(uvbg_params.CP,uvbg_params.Time) * uvbg_params.CP->HubbleParam);
-
-    // Main loop through the box
-#pragma omp parallel for collapse(3) private(pm_idx,density_over_mean,f_coll_stars,sfr_density)
-    for (int ix = 0; ix < pm_mass->real_space_region.size[0]; ix++)
-        for (int iy = 0; iy < pm_mass->real_space_region.size[1]; iy++)
-            for (int iz = 0; iz < pm_mass->real_space_region.size[2]; iz++) {
-                pm_idx = grid_index(ix, iy, iz, pm_mass->real_space_region.strides);
-
-                //convert mass to delta
-                density_over_mean = mass_real[pm_idx] * deltax_conv_factor;
-
-                f_coll_stars = star_real[pm_idx] / (RtoM(R) * density_over_mean)
-                    * (4.0 / 3.0) * M_PI * R * R * R / pixel_volume;
-
-                if(use_sfr)
-                    sfr_density = sfr_real[pm_idx] / pixel_volume / (uvbg_params.UnitMass_in_g / SOLAR_MASS) * (uvbg_params.UnitTime_in_s / SEC_PER_YEAR); // In internal units
-                else
-                    sfr_density = star_real[pm_idx] / (uvbg_params.ReionSFRTimescale * hubble_time) / pixel_volume; // In internal units
-                const float J21_aux = (float)(sfr_density * J21_aux_constant);
-
-                // Check if ionised!
-                if (f_coll_stars > (1.0 / ReionEfficiency)) // IONISED!!!!
-                {
-                    // If it is the first crossing of the ionisation barrier for this cell (largest R), let's record J21
-                    if (xHI[pm_idx] > FLOAT_REL_TOL) {
-                        J21[pm_idx] = J21_aux;
-                    }
-
-                    // Mark as ionised
-                    xHI[pm_idx] = 0.0f;
-
-                    // TODO(smutch): Do we want to implement this?
-                    // r_bubble[i_real] = (float)R;
-                }
-                //TODO: implement CellSizeFactor for low-res
-                else if (last_step && (xHI[pm_idx] > FLOAT_REL_TOL)) {
-                    // Check if this is the last filtering step.
-                    // If so, assign partial ionisations to those cells which aren't fully ionised
-                     xHI[pm_idx] = (float)(1.0 - f_coll_stars * ReionEfficiency);
-                }
-
-            } // iz
-    // Find the volume and mass weighted neutral fractions
-    // TODO: The deltax grid will have rounding errors from forward and reverse
-    //       FFT. Should cache deltax slabs prior to ffts and reuse here.
-    if(last_step){
-
-#ifdef DEBUG
-        print_reion_debug_info(pm_mass,J21,xHI,mass_real,star_real,sfr_real);
-#endif
-
-        double volume_weighted_global_xHI = 0.0;
-        double mass_weighted_global_xHI = 0.0;
-        double mass_weight = 0.0;
-        int uvbg_dim = uvbg_params.UVBGdim;
-        int grid_n_real = uvbg_dim * uvbg_dim * uvbg_dim;
-#pragma omp parallel for collapse(3) reduction(+:volume_weighted_global_xHI,mass_weighted_global_xHI,mass_weight) private(pm_idx,density_over_mean)
-        for (int ix = 0; ix < pm_mass->real_space_region.size[0]; ix++)
-            for (int iy = 0; iy < pm_mass->real_space_region.size[1]; iy++)
-                for (int iz = 0; iz < pm_mass->real_space_region.size[2]; iz++) {
-                    pm_idx = grid_index(ix, iy, iz, pm_mass->real_space_region.strides);
-                    volume_weighted_global_xHI += (double)(xHI[pm_idx]);
-
-                    density_over_mean = deltax_conv_factor * mass_real[pm_idx];
-                    mass_weighted_global_xHI += (double)(xHI[pm_idx]) * density_over_mean;
-                    mass_weight += density_over_mean;
-
-                    //if we are on the last step, we re_use the mass grid to store J21 so it can be read out
-                    mass_real[pm_idx] = (double)(J21[pm_idx]);
-                }
-
-        MPI_Allreduce(MPI_IN_PLACE, &volume_weighted_global_xHI, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        MPI_Allreduce(MPI_IN_PLACE, &mass_weighted_global_xHI, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        MPI_Allreduce(MPI_IN_PLACE, &mass_weight, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-        volume_weighted_global_xHI /= grid_n_real;
-        mass_weighted_global_xHI /= mass_weight;
-        UVBGgrids.volume_weighted_global_xHI = volume_weighted_global_xHI;
-        UVBGgrids.mass_weighted_global_xHI = mass_weighted_global_xHI;
-        message(0,"vol weighted xhi : %f\n",volume_weighted_global_xHI);
-        message(0,"mass weighted xhi : %f\n",mass_weighted_global_xHI);
-    }
-
-}
-
-//readout J21 from grid to particle
-static void readout_J21(PetaPM * pm, int i, double * mesh, double weight) {
-    // Since we need to decide whether particles on the boundary are ionised or not,
-    // We choose to take the maximum J21 (of 8 cells) here.
-    //TODO: change the iterator in petapm for reionisation to use NGP to avoid the (minor) resolution effects
-    if (P[i].Type == 0 && mesh[0] > SPHP(i).local_J21){
-        SPHP(i).local_J21 = mesh[0];
-        //if particle has not been ionised yet, set its zreion
-        //the above conditional makes sure the particle is (partially) in an ionsied cell
-        if(SPHP(i).zreion == -1)
-            SPHP(i).zreion = 1/uvbg_params.Time - 1;
-    }
-}
-/* sets particle properties needed for the Excursion Set */
-static void init_particle_uvbg(){
-    /* need to convert halo mass to 1e10 solar */
-    double fesc_unit_conv = uvbg_params.UnitMass_in_g / SOLAR_MASS / 1e10 / uvbg_params.CP->HubbleParam;
-    double fesc_temp;
-
-    /* Reset local J21 */
-#pragma omp parallel for private(fesc_temp)
-    for(int ii = 0; ii < PartManager->NumPart; ii++) {
-        /* Init J21 and set escape fracitons for sph particles */
-        if(P[ii].Type == 0) {
-            SPHP(ii).local_J21 = 0.;
-            //P[i].EscapeFraction is currently halo mass (from fof.c)
-            if(!uvbg_params.ReionUseParticleSFR || SPHP(ii).EscapeFraction == 0) continue;
-
-            fesc_temp = uvbg_params.EscapeFractionNorm * pow(SPHP(ii).EscapeFraction
-                    * fesc_unit_conv, uvbg_params.EscapeFractionScaling);
-
-            if(fesc_temp > 1) fesc_temp = 1;
-            if(fesc_temp < 0) endrun(1,"negative escape fraction?\n");
-            SPHP(ii).EscapeFraction = fesc_temp;
-        }
-        /* Assign escape fractions to star particles */
-        else if(P[ii].Type == 4) {
-            if(STARP(ii).EscapeFraction == 0) continue;
-
-            fesc_temp = uvbg_params.EscapeFractionNorm * pow(STARP(ii).EscapeFraction
-                    * fesc_unit_conv, uvbg_params.EscapeFractionScaling);
-
-            if(fesc_temp > 1) fesc_temp = 1;
-            if(fesc_temp < 0) endrun(1,"negative escape fraction?\n");
-            STARP(ii).EscapeFraction = fesc_temp;
-        }
-    }
-}
-
-void calculate_uvbg(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr, int WriteSnapshot, int SnapshotFileCount, char * OutputDir, double Time, Cosmology * CP, const struct UnitSystem units){
-    //setup filter radius range
-    double Rmax = uvbg_params.ReionRBubbleMax;
-    double Rmin = uvbg_params.ReionRBubbleMin;
-    double Rdelta = uvbg_params.ReionDeltaRFactor;
-
-    //define particle structure with the info petapm needs
-    PetaPMParticleStruct pstruct = {
-        P,
-        sizeof(P[0]),
-        (char*) &P[0].Pos[0]  - (char*) P,
-        (char*) &P[0].Mass  - (char*) P,
-        /* Regions allocated inside _prepare*/
-        NULL,
-        /* By default all particles are active. For hybrid neutrinos set below.*/
-        NULL,
-        PartManager->NumPart,
-    };
-    PetaPMReionPartStruct rstruct = {
-        (char*) &P[0].Type  - (char*) P,
-        (char*) &P[0].PI  - (char*) P,
-        SphP,
-        sizeof(SphP[0]),
-        (char*) &SphP[0].Sfr  - (char*) SphP,
-        (char*) &SphP[0].EscapeFraction  - (char*) SphP,
-        StarP,
-        sizeof(StarP[0]),
-        (char*) &StarP[0].EscapeFraction - (char*) StarP,
-    };
-
-    uvbg_params.Time = Time;
-    uvbg_params.CP = CP;
-    uvbg_params.UnitMass_in_g = units.UnitMass_in_g;
-    uvbg_params.UnitLength_in_cm = units.UnitLength_in_cm;
-    uvbg_params.UnitTime_in_s = units.UnitTime_in_s;
-
-    PetaPMGlobalFunctions global_functions = {NULL, NULL, divide_by_ncell};
-
-    //TODO: set this up with all the filtering/reion loops
-    static PetaPMFunctions functions [] =
-    {
-        {"Reionisation", filter_pm, readout_J21},
-        {NULL, NULL, NULL},
-    };
-
-    //set local J21 = 0 and set escape fractions for all particles
-    init_particle_uvbg();
-    uvbg_params.Time = Time;
-    uvbg_params.CP = CP;
-
-    /* initialize grids */
-    int grid_n = pm_mass->real_space_region.size[0]
-        * pm_mass->real_space_region.size[1]
-        * pm_mass->real_space_region.size[2];
-
-    UVBGgrids.J21 = mymalloc("J21", sizeof(float) * grid_n);
-    float * J21 = UVBGgrids.J21;
-    UVBGgrids.xHI = mymalloc("xHI", sizeof(float) * grid_n);
-    float * xHI = UVBGgrids.xHI;
-
-    for (int ii = 0; ii < grid_n; ii++) {
-        J21[ii] = 0.0f;
-        xHI[ii] = 1.0f;
-    }
-
-    message(0, "Away to call find_HII_bubbles...\n");
-    petapm_reion(pm_mass,pm_star,pm_sfr,makeregion,&global_functions
-            ,functions,&pstruct,&rstruct,reion_loop_pm,Rmax,Rmin,Rdelta
-            ,uvbg_params.ReionUseParticleSFR,NULL);
-
-    //TODO: In line with Meraxes, should we multiply J21 with a halo bias parameter for particles in a group??
-
-    walltime_measure("/UVBG/find_HII_bubbles");
-
-    //since J21 is output to particles, we should only need to write these grids for debugging
-#ifdef DEBUG
-    if(WriteSnapshot) {
-        save_uvbg_grids(SnapshotFileCount, OutputDir, pm_mass);
-        message(0,"uvbg saved\n");
-    }
-    walltime_measure("/UVBG/save");
-#endif
-
-    myfree(UVBGgrids.xHI);
-    myfree(UVBGgrids.J21);
-
-    walltime_measure("/UVBG");
-}
-#endif // ifdef EXCUR_REION
diff --git a/libgadget/uvbg.h b/libgadget/uvbg.h
deleted file mode 100644
index 3058e416..00000000
--- a/libgadget/uvbg.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef UVBG_H
-#define UVBG_H
-
-#include "petapm.h"
-#include "utils/paramset.h"
-#include "fof.h"
-
-struct UVBGgrids_type {
-    float *J21;
-    float *xHI;
-
-    double volume_weighted_global_xHI;
-    double mass_weighted_global_xHI;
-};
-
-//extern struct UVBGgrids_type UVBGgrids; 
-
-void calculate_uvbg(PetaPM * pm_mass, PetaPM * pm_star, PetaPM * pm_sfr, int WriteSnapshot, int SnapshotFileCount, char * Outputdir, double Time, Cosmology * CP, const struct UnitSystem units);
-void set_uvbg_params(ParameterSet * ps);
-
-#endif

From 9bc2c678431255e82b5e7c5bdcb27619cf119cda Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Mon, 7 Oct 2024 22:51:23 -0700
Subject: [PATCH 104/120] Update license with new name

---
 LICENSE | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/LICENSE b/LICENSE
index 446508f6..46041b7c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -14,6 +14,4 @@ Redistribution and use in source and binary forms, with or without modification,
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-MP-Gadget is also available under the terms of the GNU Public License v2 or later at the option of the user. MP-Gadget is a derived code of Gadget-2 and has been re-licensed under the above license with the permission of all original copyright holders, including Volker Springel, the author of Gadget-2. 
-
-Note that as MP-Gadget depends on PFFT and the GNU scientific library, a compiled version is implicitly distributed under the terms of the GNU General Public License version under which the version of the library llinked against is available.
+Shenqi is a derived code of Gadget-2 and has been re-licensed under the above license with the permission of all original copyright holders, including Volker Springel, the author of Gadget-2. 

From eb704d608f986ccdf56e6d20e5fa8e4c092caf6b Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Tue, 8 Oct 2024 10:34:49 -0500
Subject: [PATCH 105/120] added cufftmp, cuda, nvshmem libs/includes to
 Options.mk.vista

---
 Makefile.rules                    |  4 ----
 platform-options/Options.mk.vista | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/Makefile.rules b/Makefile.rules
index acd86487..486a0880 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -5,10 +5,6 @@ MPICC ?= mpic++
 LOW_PRECISION ?= double
 
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
-GSL_INCL ?= $(shell pkg-config --cflags gsl)
-GSL_LIBS ?= $(shell pkg-config --libs gsl)
-#BOOST_INCL ?= $(shell pkg-config --cflags boost)
-#BOOST_LIBS ?= $(shell pkg-config --libs boost)
 
 ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
     # If found, set FITSIO_INCL with the cfitsio flags
diff --git a/platform-options/Options.mk.vista b/platform-options/Options.mk.vista
index 28c4d944..ae7ac597 100644
--- a/platform-options/Options.mk.vista
+++ b/platform-options/Options.mk.vista
@@ -1,5 +1,20 @@
 #These variables are set to useful defaults, but may be overriden if needed
 #This is a good optimized build default for nvc
+CUDA_LIBS=-L$(TACC_CUDA_LIB) -lcudart
+CUDA_INCL=-I$(TACC_CUDA_INC)
+
+BOOST_INCL=-I$(TACC_BOOST_INC)
+BOOST_LIBS=-L$(TACC_BOOST_LIB) -lboost_system -lboost_math_c99
+
+NVMATH_INCL=-I$(TACC_NVMATH_INC)
+NVMATH_LIBS=-L$(TACC_NVMATH_LIB)
+
+CUFFTMP_INCL=-I/home1/apps/nvidia/Linux_aarch64/24.7/math_libs/include/cufftmp
+CUFFTMP_LIBS=-L/home1/apps/nvidia/Linux_aarch64/24.7/math_libs/lib64 -lcufftMp
+
+NVSHMEM_INCL=-I$(TACC_NVSHMEM_INC)
+NVSHMEM_LIBS=-L$(TACC_NVSHMEM_LIB) -lnvshmem_host
+
 OPTIMIZE =  -mp -g -Wall -fast
 #This is a good non-optimized default for debugging
 #OPTIMIZE =  -mp -O0 -g -Wall

From b38fc82b7b8eba88fadfd59d66c3f57073e91a80 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Wed, 9 Oct 2024 23:37:05 -0500
Subject: [PATCH 106/120] merged box struct into region

---
 libgadget/petapm.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 4a58439c..9a0112c4 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -4,19 +4,15 @@
 
 #include "powerspectrum.h"
 
-using int64 = long long int;
 
-struct Box3D {
-    int64 lower[3];
-    int64 upper[3];
-    int64 strides[3];
-};
+using int64 = long long int;
 
 typedef struct Region {
     /* represents a region in the FFT Mesh */
-    ptrdiff_t offset[3];
-    ptrdiff_t size[3];
-    ptrdiff_t strides[3];
+    int64 offset[3];
+    int64 size[3];
+    int64 upper[3];
+    int64 strides[3];
 
 
     size_t totalsize;

From 30787867fc27de93d994185f19c3ce62888d46ef Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Wed, 9 Oct 2024 23:37:31 -0500
Subject: [PATCH 107/120] massive debugging petapm.c, almost works

---
 libgadget/petapm.c | 141 +++++++++++++++++++++++++++++++++------------
 1 file changed, 103 insertions(+), 38 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index a9738c2b..7a3e940d 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -110,12 +110,17 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
     int ThisTask;
     int NTask;
+    pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
+    pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
     MPI_Comm_rank(comm, &ThisTask);
     MPI_Comm_size(comm, &NTask);
+    
     int ndevices;
     cudaGetDeviceCount(&ndevices);
     cudaSetDevice(ThisTask % ndevices);
 
+    message(0, "Cuda Devices %d \n", ndevices);
+
     /* try to find a square 2d decomposition */
     /* CUDA NOTE: CufftMp only supports square decomposition, 
     so Ntask has to be a perfect square*/
@@ -126,6 +131,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     }
 
     message(0, "Using 2D Task mesh %td x %td \n", nranks1d, nranks1d);
+    
     // Define custom data distribution
     int64 nx               = Nmesh;
     int64 ny               = Nmesh;
@@ -134,45 +140,54 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     int64 nz_complex       = (nz/2+1);
     int64 nz_real_padded   = 2*nz_complex;
 
-    // Describe the data distribution using boxes
-    auto make_box = [](int64 lower[3], int64 upper[3], int64 strides[3]) {
-        Box3D box;
-        for(int i = 0; i < 3; i++) {
-            box.lower[i] = lower[i];
-            box.upper[i] = upper[i];
-            box.strides[i] = strides[i];
-        }
-        return box;
-    };
+    // create 2D cartesian MPI comm without pfft
+    int dims[2] = {nranks1d, nranks1d};
+    int periods[2] = {0, 0};  // non-periodic in both dimensions
+    // MPI could reorder the ranks in the cartesian communicator for efficiency,
+    // but reordering is disabled for now (reorder = 0) to be safe
+    int reorder = 0;
+    MPI_Cart_create(comm, 2, dims, periods, reorder, &pm->priv->comm_cart_2d);
+    if (pm->priv->comm_cart_2d == MPI_COMM_NULL) {
+        endrun(0, "Error: comm_cart_2d is MPI_COMM_NULL\n");
+    }
+    MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods, pm->ThisTask2d);
+    message(1, "Task = %d ThisTask2d = (%d, %d) Ntask2d = (%d, %d) \n", 
+        ThisTask, pm->ThisTask2d[0], pm->ThisTask2d[1], pm->NTask2d[0], pm->NTask2d[1]);
+
 
+    // compute offset, size and strides
     auto displacement = [](int64 length, int rank, int size) {
         int ranks_cutoff = length % size;
-        return (rank < ranks_cutoff ? rank * (length / size + 1) : ranks_cutoff * (length / size + 1) + (rank - ranks_cutoff) * (length / size));
+        int chunk_size = length / size;
+        return (rank < ranks_cutoff ? rank * (chunk_size + 1) : ranks_cutoff * (chunk_size + 1) + (rank - ranks_cutoff) * chunk_size);
     };
-
-    // Input data are real pencils in X & Y, along Z
-    // Strides are packed and in-place (i.e., real is padded)
-    Box3D box_real;
-    Box3D box_complex;
-    int i,j;
-    i = ThisTask / nranks1d;
-    j = ThisTask % nranks1d;
-
-    int64 lower[3]   = {displacement(nx, i,   nranks1d), displacement(ny, j,   nranks1d), 0};
-    int64 upper[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
-    int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
-    box_real = make_box(lower, upper, strides);
-
-    // Output data are complex pencils in X & Z, along Y (picked arbitrarily)
-    // Strides are packed
-    // For best performances, the local dimension in the input (Z, here) and output (Y, here) should be different
-    // to ensure cuFFTMp will only perform two communication phases.
-    // If Z was also local in the output, cuFFTMp would perform three communication phases, decreasing performances.
-    int64 lower_c[3]   = {displacement(nx, i,   nranks1d), 0,  displacement(nz_complex, j,   nranks1d)};
-    int64 upper_c[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
-    int64 strides_c[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
-    box_complex = make_box(lower_c, upper_c, strides_c);
-
+    
+    // update region properties
+    auto update_region = [](int64 lower[3], int64 upper[3], int64 strides[3], PetaPMRegion &region) {
+        for (int i = 0; i < 3; i++) {
+            region.offset[i]  = lower[i];
+            region.upper[i]   = upper[i];
+            region.size[i]    = upper[i] - lower[i];
+            region.strides[i] = strides[i];
+        }
+    };
+    
+    int i = ThisTask / nranks1d;
+    int j = ThisTask % nranks1d;
+    
+    // real region setup
+    // note the petapm->region has non-padded strides, while cufft takes in padded strides
+    int64 lower_real[3]   = {displacement(nx, i, nranks1d), displacement(ny, j, nranks1d), 0};
+    int64 upper_real[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
+    int64 strides_real[3] = {(upper_real[1] - lower_real[1]) * nz_real_padded, nz_real_padded, 1};
+    int64 strides_real_nopad[3] = {(upper_real[1] - lower_real[1]) * nz_real, nz_real, 1};
+    update_region(lower_real, upper_real, strides_real_nopad, pm->real_space_region);
+    
+    // complex region setup
+    int64 lower_fourier[3]   = {displacement(nx, i, nranks1d), 0, displacement(nz_complex, j, nranks1d)};
+    int64 upper_fourier[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
+    int64 strides_fourier[3] = {(upper_fourier[1] - lower_fourier[1]) * (upper_fourier[2] - lower_fourier[2]), (upper_fourier[2] - lower_fourier[2]), 1};
+    update_region(lower_fourier, upper_fourier, strides_fourier, pm->fourier_space_region);
 
     //===============================================================================================
     cudaStreamCreate(&pm->priv->stream);
@@ -187,8 +202,8 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
     // So, in both, the "input" box should be the real box and the "output" box should be the complex box
 
-    cufftXtSetDistribution(pm->priv->plan_forw, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
-    cufftXtSetDistribution(pm->priv->plan_back, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
+    cufftXtSetDistribution(pm->priv->plan_forw, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
+    cufftXtSetDistribution(pm->priv->plan_back, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
 
     // Set the stream
     cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
@@ -198,7 +213,56 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     size_t workspace;
     cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
     cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
+
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_DISTRIBUTED_INPUT, i.e., box_real
+    cudaLibXtDesc *desc;
+    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT);
+    pm->priv->fftsize = desc->descriptor->size[0];
     //===============================================================================================
+
+    message(1, "Task %d NGPUs=%d, pfftsize=%d \n", ThisTask, desc->descriptor->nGPUs, pm->priv->fftsize);
+    /* now lets fill up the mesh2task arrays */
+    #if 1
+        message(1, "Real Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
+                pm->real_space_region.offset[0],
+                pm->real_space_region.offset[1],
+                pm->real_space_region.offset[2],
+                pm->real_space_region.upper[0],
+                pm->real_space_region.upper[1],
+                pm->real_space_region.upper[2],
+                pm->real_space_region.strides[0],
+                pm->real_space_region.strides[1],
+                pm->real_space_region.strides[2]);
+        message(1, "Complex Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
+                pm->fourier_space_region.offset[0],
+                pm->fourier_space_region.offset[1],
+                pm->fourier_space_region.offset[2],
+                pm->fourier_space_region.upper[0],
+                pm->fourier_space_region.upper[1],
+                pm->fourier_space_region.upper[2],
+                pm->fourier_space_region.strides[0],
+                pm->fourier_space_region.strides[1],
+                pm->fourier_space_region.strides[2]);
+    #endif
+    
+        int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
+        int k;
+        for(k = 0; k < 2; k ++) {
+            for(i = 0; i < Nmesh; i ++) {
+                tmp[i] = 0;
+            }
+            for(i = 0; i < pm->real_space_region.size[k]; i ++) {
+                tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
+            }
+            /* which column / row hosts this tile? */
+            /* FIXME: this is very inefficient */
+            MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
+            for(i = 0; i < Nmesh; i ++) {
+                message(0, "Mesh2Task[%d][%d] == %d\n", k, i, pm->Mesh2Task[k][i]);
+            }
+        }
+        myfree(tmp);
 }
 
 void
@@ -644,7 +708,7 @@ layout_build_and_exchange_cells_to_fft(
             L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
             L->comm);
 
-#if 0
+#if 1
     double massExport = 0;
     for(i = 0; i < L->NcExport; i ++) {
         massExport += L->BufSend[i];
@@ -951,3 +1015,4 @@ static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
  * iCFT(CFT) = 2pi
  *
  * **************************8*/
+

From de0d7c4a66ebfdfe661c93508fa556f2e2503735 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Tue, 15 Oct 2024 14:26:11 -0500
Subject: [PATCH 108/120] Fix the OpenMP sort on ARM architectures.

The standard does not guarantee that the merges are atomic, it seems.
This new code seems to improve memory safety even on intel, so let's
make it unconditional. It will make the sort a bit slower but we can
live with that.
---
 libgadget/utils/openmpsort.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/libgadget/utils/openmpsort.c b/libgadget/utils/openmpsort.c
index c021323f..4ba823b5 100644
--- a/libgadget/utils/openmpsort.c
+++ b/libgadget/utils/openmpsort.c
@@ -302,6 +302,8 @@ void qsort_openmp(void *base, size_t nmemb, size_t size,
             if(key == 0 && color % 2 == 0) {
                 int nextT = tid + sep;
                 /*merge with next guy */
+#pragma omp critical
+		    {
                 /* only even leaders arrives to this point*/
                 if(nextT >= Nt) {
                     /* no next guy, copy directly.*/
@@ -318,21 +320,23 @@ void qsort_openmp(void *base, size_t nmemb, size_t size,
 #endif
                     merge(Abase[tid], Anmemb[tid], Abase[nextT], Anmemb[nextT], Atmp[tid], p.s, compar, indirect);
                     /* merge two lists */
-                    Anmemb[tid] = Anmemb[tid] + Anmemb[nextT];
+                    Anmemb[tid] += Anmemb[nextT];
                     Anmemb[nextT] = 0;
+		    }
                 }
             }
 
             /* now swap Abase and Atmp for next merge */
 #pragma omp barrier
-            if(tid == 0) {
+#pragma omp master
+	    {
                 void ** a = Abase;
                 Abase = Atmp;
                 Atmp = a;
-            }
+	    }
+#pragma omp barrier
             /* at this point Abase contains the sorted array */
         }
-#pragma omp barrier
         /* output was written to the tmp rather than desired location, copy it */
         if((!indirect && Abase[0] != base)
                 || (indirect && Abase[0] != (char *) tmp + nmemb * sizeof(void *))) {

From 0316e4f19cac9e8667beeaf23967a13d23c5d71e Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Tue, 15 Oct 2024 16:04:41 -0500
Subject: [PATCH 109/120] Remove stray GSL variable from makefile

---
 Makefile.rules | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.rules b/Makefile.rules
index 486a0880..6791fe6d 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -26,7 +26,7 @@ endif
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(GSL_INCL) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL)
+CFLAGS = $(OPTIONS) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
@@ -34,7 +34,7 @@ CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
 BUNDLEDLIBS = -lbigfile-mpi -lbigfile
-LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS) $(CUFFTMP_LIBS) $(NVSHMEM_LIBS)
+LIBS  = -lm  $(BOOST_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS) $(CUFFTMP_LIBS) $(NVSHMEM_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 

From 77b4618967c9c41df36e57bd58cbc777586251b2 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Tue, 15 Oct 2024 16:58:47 -0500
Subject: [PATCH 110/120] Fix C++ compilation error

---
 libgadget/treewalk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgadget/treewalk.c b/libgadget/treewalk.c
index 62acbd4a..959d5b1a 100644
--- a/libgadget/treewalk.c
+++ b/libgadget/treewalk.c
@@ -206,7 +206,7 @@ treewalk_build_queue(TreeWalk * tw, int * active_set, const size_t size, int may
     /* Explicitly deal with the case where the queue is zero and there is nothing to do.
      * Some OpenMP compilers (nvcc) seem to still execute the below loop in that case*/
     if(size == 0) {
-        tw->WorkSet = mymalloc("ActiveQueue", sizeof(int));
+        tw->WorkSet = (int *) mymalloc("ActiveQueue", sizeof(int));
         tw->WorkSetSize = size;
         return;
     }

From e5936c782659bd1a49e0767166ed7bbad7e67520 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 17 Oct 2024 23:00:48 -0500
Subject: [PATCH 111/120] add compile rules for cuda kernels

---
 Makefile.rules | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/Makefile.rules b/Makefile.rules
index 486a0880..5a9a59d8 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -2,6 +2,7 @@
 #
 AR ?= ar
 MPICC ?= mpic++
+# NVOPTIMIZE is defaulted (to -O3) in the USE_CUDA block below; an empty "?=" here would pre-empt that default.
 LOW_PRECISION ?= double
 
 OPTIMIZE ?= -O2 -g -fopenmp -Wall
@@ -21,29 +22,36 @@ ifneq ($(findstring -DUSE_CUDA, $(OPT)),)
     NVSHMEM_LIBS ?= -lnvshmem_host
     NVCC ?= nvcc
     NVOPTIMIZE ?= -O3
+    MPI_INCL ?=
 endif
 
 OPTIONS = $(OPTIMIZE) $(OPT)
 GADGET_TESTDATA_ROOT = $(CURDIR)/../
 
-CFLAGS = $(OPTIONS) $(GSL_INCL) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL)
+CFLAGS = $(OPTIONS) $(BOOST_INCL) $(FITSIO_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL)
 CFLAGS += -I../depends/include
 CFLAGS += -I../
 CFLAGS += "-DLOW_PRECISION=$(LOW_PRECISION)"
 #For tests
 TCFLAGS = $(CFLAGS) -DGADGET_TESTDATA_ROOT=\"$(GADGET_TESTDATA_ROOT)\"
 
-BUNDLEDLIBS = -lbigfile-mpi -lbigfile
-LIBS  = -lm  $(BOOST_LIBS) $(GSL_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS) $(CUFFTMP_LIBS) $(NVSHMEM_LIBS)
+CUDAFLAGS = $(BOOST_INCL) $(CUDA_INCL) $(CUFFTMP_INCL) $(NVSHMEM_INCL) $(MPI_INCL)
+
+BUNDLEDLIBS = -lbigfile-mpi -lbigfile 
+LIBS  = -lm $(BOOST_LIBS) $(FITSIO_LIBS) $(CUDA_LIBS) $(CUFFTMP_LIBS) $(NVSHMEM_LIBS)
 LIBS += -L../depends/lib $(BUNDLEDLIBS)
 V ?= 0
 
-.objs/%.o: %.c Makefile $(CONFIG)
-	@cmd="$(MPICC) -MMD -c -o $@ $(CFLAGS) $<"; \
+.objs/%.o: %.c $(INCL) Makefile $(CONFIG)
+	@cmd="$(MPICC) -c -o $@ $(CFLAGS) $<"; \
 	if test "x$(V)" = "x1" ; then echo $$cmd; fi; \
 	mkdir -p `dirname $@`; \
 	echo Compiling $<; $$cmd
 
-# Rule to compile .cu files (using nvcc)
-.objs/%.o: %.cu
-	$(NVCC) $(NVOPTIMIZE) -c $< -o $@
+
+.objs/%.o: %.cu Makefile $(CONFIG)
+	@cmd="$(NVCC) $(NVOPTIMIZE) -MMD -c -o $@ $(CUDAFLAGS) $<"; \
+	if test "x$(V)" = "x1" ; then echo $$cmd; fi; \
+	mkdir -p `dirname $@`; \
+	echo Compiling $<; $$cmd
+

From bff0683a6637d377d9ee3d8668b457917f8f3dea Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 17 Oct 2024 23:01:11 -0500
Subject: [PATCH 112/120] add mpi include into cuda compile rules

---
 platform-options/Options.mk.vista | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/platform-options/Options.mk.vista b/platform-options/Options.mk.vista
index ae7ac597..df699d6f 100644
--- a/platform-options/Options.mk.vista
+++ b/platform-options/Options.mk.vista
@@ -1,5 +1,8 @@
 #These variables are set to useful defaults, but may be overriden if needed
-#This is a good optimized build default for nvc
+MPICC=mpic++
+MPICCDEP=mpicc
+MPI_INCL=-I$(TACC_MPI_DIR)/include
+
 CUDA_LIBS=-L$(TACC_CUDA_LIB) -lcudart
 CUDA_INCL=-I$(TACC_CUDA_INC)
 
@@ -15,12 +18,26 @@ CUFFTMP_LIBS=-L/home1/apps/nvidia/Linux_aarch64/24.7/math_libs/lib64 -lcufftMp
 NVSHMEM_INCL=-I$(TACC_NVSHMEM_INC)
 NVSHMEM_LIBS=-L$(TACC_NVSHMEM_LIB) -lnvshmem_host
 
-OPTIMIZE =  -mp -g -Wall -fast
+NVCC=nvcc
+NVOPTIMIZE = -O3 -arch=sm_61 # specify architecture according to your GPU model, sm_90 shall be used for Vista's H100
+
+#GSL_LIBS=
+#GSL_INCL=
+#This is a good optimized build default for gcc
+#OPTIMIZE =  -fopenmp -O3 -g -Wall -ffast-math
+#nvc++ does not have the -ffast-math flag
+#OPTIMIZE =  -fopenmp -O3 -g -Wall -use_fast_math
 #This is a good non-optimized default for debugging
-#OPTIMIZE =  -mp -O0 -g -Wall
+OPTIMIZE =  -fopenmp -O0 -g -Wall
 
 #--------------------------------------- Basic operation mode of code
 #OPT += VALGRIND     # allow debugging with valgrind, disable the GADGET memory allocator.
-#OPT += -DDEBUG      # print a lot of debugging messages
+OPT += -DDEBUG      # print a lot of debugging messages
 #Disable openmp locking. This means no threading.
 #OPT += -DNO_OPENMP_SPINLOCK
+OPT += -DUSE_CUDA  #Enable GPU-specific CUDA code
+#-----------
+#OPT += -DEXCUR_REION  # reionization with excursion set
+
+#--------- CFITSIO (required only for saving potential plane files)
+# OPT += -DUSE_CFITSIO

From 223d7cbcf49862308e4c71b901b4d88905e5aa19 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 17 Oct 2024 23:01:35 -0500
Subject: [PATCH 113/120] add box iterator helper for cufftmp

---
 libgadget/box_iterator.cpp |   2 +
 libgadget/box_iterator.hpp | 238 +++++++++++++++++++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 libgadget/box_iterator.cpp
 create mode 100644 libgadget/box_iterator.hpp

diff --git a/libgadget/box_iterator.cpp b/libgadget/box_iterator.cpp
new file mode 100644
index 00000000..139597f9
--- /dev/null
+++ b/libgadget/box_iterator.cpp
@@ -0,0 +1,2 @@
+
+
diff --git a/libgadget/box_iterator.hpp b/libgadget/box_iterator.hpp
new file mode 100644
index 00000000..2764bbf6
--- /dev/null
+++ b/libgadget/box_iterator.hpp
@@ -0,0 +1,238 @@
+#ifndef __CUFFTMP_BOX_ITERATOR_HPP__
+#define __CUFFTMP_BOX_ITERATOR_HPP__
+
+#include <iterator>
+#include <cstddef> 
+#include <cufftXt.h>
+#include <tuple>
+
+
+/**
+ * This iterator lets one iterate through the underlying data
+ * associated to a (lower, upper, strides) box, and exposes the mapping
+ * between global 3D coordinates (x, y, z) and local linear
+ * indices.
+ * 
+ * This iterator can be used in __host__ or __device__ code
+ */
+
+using int64 = long long int;
+
+struct Box3D {
+    int64 lower[3];
+    int64 upper[3];
+    int64 strides[3];
+};
+
+template<typename T>
+struct BoxIterator 
+{
+    using iterator_category = std::random_access_iterator_tag;
+    using difference_type   = std::ptrdiff_t;
+    using value_type        = T;
+    using pointer           = T*;
+    using reference         = T&;
+
+    __host__ __device__ __forceinline__
+    BoxIterator(int64 i, Box3D box, T* ptr) : i_(i), box_(box), ptr_(ptr), 
+                                                    lx_(box.upper[0] - box.lower[0]),
+                                                    ly_(box.upper[1] - box.lower[1]),
+                                                    lz_(box.upper[2] - box.lower[2]) {
+        linear_to_box3d(i_, &x_, &y_, &z_);
+    };
+
+    __host__ __device__ __forceinline__
+    BoxIterator& operator++() { increment(1); return *this; } 
+
+    __host__ __device__
+    BoxIterator operator++(int) { 
+        BoxIterator tmp = *this; 
+        ++(*this); 
+        return tmp; 
+    } 
+
+    __host__ __device__ __forceinline__
+    BoxIterator& operator--() { increment(-1); return *this; } 
+
+    __host__ __device__
+    BoxIterator operator--(int) { 
+        BoxIterator tmp = *this; 
+        --(*this); 
+        return tmp; 
+    }  
+    
+    __host__ __device__ __forceinline__
+    BoxIterator& operator+=(difference_type rhs) { increment(rhs); return *this; }
+
+    __host__ __device__ __forceinline__
+    BoxIterator& operator-=(difference_type rhs) { increment(-rhs); return *this; }
+
+
+    __host__ __device__ __forceinline__
+    reference operator*() const { return ptr_[i()]; }
+
+    __host__ __device__ __forceinline__
+    pointer operator->() { return ptr_ + i(); }
+    // Element at offset rhs from the current position; yields the element itself, not an iterator.
+    __host__ __device__ __forceinline__
+    reference operator[](difference_type rhs) const { return *(*this + rhs); }
+
+    __host__ __device__ __forceinline__
+    friend difference_type operator-(const BoxIterator& a, const BoxIterator& b) {return a.i_ - b.i_; }
+    // The offset operators below take the iterator by value so the shifted copy can be modified and returned.
+    __host__ __device__ __forceinline__
+    friend BoxIterator operator-(BoxIterator a, difference_type n) { a -= n; return a; }
+
+    __host__ __device__ __forceinline__
+    friend BoxIterator operator+(BoxIterator a, difference_type n) { a += n; return a; }
+
+    __host__ __device__ __forceinline__ 
+    friend BoxIterator operator+(difference_type n, const BoxIterator& a) { return a+n; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator==(const BoxIterator& a, const BoxIterator& b) { return a.i_ == b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator!=(const BoxIterator& a, const BoxIterator& b) { return a.i_ != b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator>(const BoxIterator& a, const BoxIterator& b) { return a.i_ > b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator<(const BoxIterator& a, const BoxIterator& b) { return a.i_ < b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator>=(const BoxIterator& a, const BoxIterator& b) { return a.i_ >= b.i_; }
+
+    __host__ __device__ __forceinline__ 
+    friend bool operator<=(const BoxIterator& a, const BoxIterator& b) { return a.i_ <= b.i_; }
+
+    /**
+     * Return the global X coordinate of the iterator
+     */
+    __host__ __device__ __forceinline__
+    int64 x() const { return x_; }
+
+    /**
+     * Return the global Y coordinate of the iterator
+     */
+    __host__ __device__ __forceinline__
+    int64 y() const { return y_; }
+
+    /**
+     * Return the global Z coordinate of the iterator
+     */
+    __host__ __device__ __forceinline__
+    int64 z() const { return z_; }
+
+    /**
+     * Return the linear position of the iterator
+     * in the local data buffer
+     */
+    __host__ __device__ __forceinline__
+    int64 i() const {
+        return (x_ - box_.lower[0]) * box_.strides[0] + (y_ - box_.lower[1]) * box_.strides[1] + (z_ - box_.lower[2]) * box_.strides[2]; 
+    }
+
+private:
+
+    // Current 3D global index in the box
+    int64 x_, y_, z_;
+    // Current linear 3D index (not the location in memory)
+    int64 i_;
+    // Global box lower and upper corner and local strides
+    const Box3D box_;
+    // Underlying data pointer
+    T* ptr_;
+    // Length of the X, Y and Z dimensions
+    const int64 lx_, ly_, lz_;
+
+    // Linear to 3D coordinates
+    __host__ __device__ __forceinline__
+    void linear_to_box3d(int64 i, int64* x, int64* y, int64* z) {
+        if(lx_ * ly_ * lz_ > 0) {
+            *x  =   i  / (ly_ * lz_);
+            i  -= (*x) * (ly_ * lz_);
+            *y  =   i  / (lz_);
+            i  -= (*y) * (lz_);
+            *z  =   i;
+        } else {
+            *x = 0;
+            *y = 0;
+            *z = 0;
+        }
+        *x += box_.lower[0];
+        *y += box_.lower[1];
+        *z += box_.lower[2];
+    }
+
+    // Increment/decrement by n
+    __host__ __device__ __forceinline__
+    void increment(difference_type n) {
+        i_ += n;
+        linear_to_box3d(i_, &x_, &y_, &z_);
+    }
+
+};
+
+inline int64 slabs_displacement(int64 length, int rank, int size) {
+    int ranks_cutoff = length % size;
+    return (rank < ranks_cutoff ? rank * (length / size + 1) : ranks_cutoff * (length / size + 1) + (rank - ranks_cutoff) * (length / size));
+}
+
+inline Box3D buildBox3D(cufftXtSubFormat format, cufftType type, int rank, int size, int64 nx, int64 ny, int64 nz) {
+    if(format == CUFFT_XT_FORMAT_INPLACE) {
+        int64 x_start      = slabs_displacement(nx, rank,   size);
+        int64 x_end        = slabs_displacement(nx, rank+1, size);
+        int64 my_ny        = ny;
+        int64 my_nz        = nz;
+        int64 my_nz_padded = (type == CUFFT_C2C || type == CUFFT_Z2Z) ? my_nz : 2*(nz/2 + 1);
+        return {
+            {x_start, 0, 0}, {x_end, my_ny, my_nz}, {my_ny * my_nz_padded, my_nz_padded, 1}
+        };
+    } else {
+        int64 y_start      = slabs_displacement(ny, rank,   size);
+        int64 y_end        = slabs_displacement(ny, rank+1, size);
+        int64 my_nx        = nx;
+        int64 my_nz        = (type == CUFFT_C2C || type == CUFFT_Z2Z) ? nz : (nz/2 + 1);
+        int64 my_nz_padded = my_nz;
+        return {
+            {0, y_start, 0}, {my_nx, y_end, my_nz}, {(y_end-y_start) * my_nz_padded, my_nz_padded, 1}
+        };
+    }
+}
+
+
+template<typename T> __host__ __device__ __forceinline__ 
+BoxIterator<T> BoxIteratorBegin(Box3D box, T* ptr) {
+    return BoxIterator<T>(0, box, ptr);
+};
+
+template<typename T> __host__ __device__ __forceinline__
+BoxIterator<T> BoxIteratorEnd(Box3D box, T* ptr) {
+    return BoxIterator<T>( (box.upper[0] - box.lower[0]) * (box.upper[1] - box.lower[1]) * (box.upper[2] - box.lower[2]), box, ptr);
+};
+
+template<typename T>
+std::pair<BoxIterator<T>,BoxIterator<T>> BoxIterators(Box3D box, T* ptr) {
+    return {BoxIteratorBegin<T>(box, ptr),BoxIteratorEnd<T>(box, ptr)};
+}
+
+template<typename T>
+BoxIterator<T> BoxIteratorBegin(cufftXtSubFormat format, cufftType type, int rank, int size, int64 nx, int64 ny, int64 nz, T* ptr) {
+    Box3D box = buildBox3D(format, type, rank, size, nx, ny, nz);
+    return BoxIteratorBegin<T>(box, ptr);
+}
+
+template<typename T>
+BoxIterator<T> BoxIteratorEnd(cufftXtSubFormat format, cufftType type, int rank, int size, int64 nx, int64 ny, int64 nz, T* ptr) {
+    Box3D box = buildBox3D(format, type, rank, size, nx, ny, nz);
+    return BoxIteratorEnd<T>(box, ptr);
+}
+
+template<typename T>
+std::pair<BoxIterator<T>,BoxIterator<T>> BoxIterators(cufftXtSubFormat format, cufftType type, int rank, int size, int64 nx, int64 ny, int64 nz, T* ptr) {
+    return {BoxIteratorBegin<T>(format, type, rank, size, nx, ny, nz, ptr),BoxIteratorEnd<T>(format, type, rank, size, nx, ny, nz, ptr)};
+}
+
+#endif // __CUFFTMP_BOX_ITERATOR_HPP__
\ No newline at end of file

From 9028343bb5634f17c85dfbfeb9b7cfd44005b7bf Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 17 Oct 2024 23:02:07 -0500
Subject: [PATCH 114/120] move pm transfer functions to cuda kernel

---
 libgadget/pm_kernel.cu  | 134 ++++++++++++++++++++++++++++++++++++++++
 libgadget/pm_kernel.cuh |  15 +++++
 2 files changed, 149 insertions(+)
 create mode 100644 libgadget/pm_kernel.cu
 create mode 100644 libgadget/pm_kernel.cuh

diff --git a/libgadget/pm_kernel.cu b/libgadget/pm_kernel.cu
new file mode 100644
index 00000000..20b61bf3
--- /dev/null
+++ b/libgadget/pm_kernel.cu
@@ -0,0 +1,134 @@
+// pm_kernel.cu
+#include <cuda_runtime.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include "box_iterator.hpp"
+#include "petapm.h"
+
+
+/* unnormalized sinc function sin(x) / x */
+__device__ double sinc_unnormed(double x) {
+    if(x < 1e-5 && x > -1e-5) {
+        double x2 = x * x;
+        return 1.0 - x2 / 6. + x2  * x2 / 120.;
+    } else {
+        return sin(x) / x;
+    }
+}
+
+
+/* the transfer functions for force in fourier space applied to potential */
+/* super lanzcos in CH6 P 122 Digital Filters by Richard W. Hamming */
+__device__ double diff_kernel(double w) {
+/* order N = 1 */
+/*
+ * This is the same as GADGET-2 but in fourier space:
+ * see gadget-2 paper and Hamming's book.
+ * c1 = 2 / 3, c2 = 1 / 12
+ * */
+    return 1 / 6.0 * (8 * sin (w) - sin (2 * w));
+}
+
+
+/* Multiply each element of [begin, end) by the potential transfer factor; the k = 0 mode is set to zero. */
+__global__
+void potential_transfer_kernel(BoxIterator<cufftComplex> begin, BoxIterator<cufftComplex> end, PetaPM *pm) {
+    // One thread per element; blockIdx.x is widened first so the flat index is computed in 64 bits
+    const int64_t tid = threadIdx.x + (int64_t) blockIdx.x * blockDim.x;
+    begin += tid;
+    // NOTE(review): pm is dereferenced on the device below -- confirm it points to device-accessible memory
+    if (begin < end) {
+        // Get global 3D coordinates of the current element
+        int x = begin.x();
+        int y = begin.y();
+        int z = begin.z();
+
+        // Compute the corresponding wave numbers (kx, ky, kz), in grid unit
+        int kx = x<=pm->Nmesh/2 ? x : (x-pm->Nmesh);
+        int ky = y<=pm->Nmesh/2 ? y : (y-pm->Nmesh);
+        int kz = z<=pm->Nmesh/2 ? z : (z-pm->Nmesh);
+        const int64_t k2 = ((int64_t)kx) * kx + ((int64_t)ky) * ky + ((int64_t)kz) * kz;
+
+        // Handle zero mode first, so the division by k2 below is never evaluated for k2 == 0
+        if (k2 == 0) {
+            begin->x = 0.0;
+            begin->y = 0.0;
+            return;
+        }
+
+        const double asmth2 = pow((2 * M_PI) * pm->Asmth / pm->Nmesh, 2);
+        const double smth = exp(-k2 * asmth2) / k2;
+        const double pot_factor = -pm->G / (M_PI * pm->BoxSize);
+
+        // Apply CIC deconvolution: f is the product over the three axes of 1 / sinc^2
+        const int kpos[3] = {kx, ky, kz};
+        double f = 1.0;
+        for (int k = 0; k < 3; k++) {
+            double tmp = (kpos[k] * M_PI) / pm->Nmesh;
+            tmp = sinc_unnormed(tmp);
+            f *= 1.0 / (tmp * tmp);
+        }
+        //CUDA TODO: add massive neutrino back
+        const double fac = pot_factor * smth * f * f;
+
+        // Apply scaling factor
+        begin->x *= fac;
+        begin->y *= fac;
+    }
+}
+
+
+extern "C" void launch_potential_transfer(Box3D box_complex, cufftComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) {
+    auto [begin_d, end_d] = BoxIterators(box_complex, data);
+    const size_t num_elements = std::distance(begin_d, end_d);
+    if (num_elements == 0) return; // empty local box: a launch with zero blocks is an invalid configuration
+    const size_t num_threads = 256, num_blocks = (num_elements + num_threads - 1) / num_threads; // ceil-div
+    potential_transfer_kernel<<<num_blocks, num_threads, 0, stream>>>(begin_d, end_d, pm);
+}
+
+
+
+
+// static void force_transfer(PetaPM * pm, int k, cufftComplex * value) {
+//     double tmp0;
+//     double tmp1;
+//     /*
+//      * negative sign is from force_x = - Del_x pot
+//      *
+//      * filter is   i K(w)
+//      * */
+//     double fac = -1 * diff_kernel (k * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
+//     tmp0 = - value[0].y * fac;
+//     tmp1 = value[0].x * fac;
+//     value[0].x = tmp0;
+//     value[0].y = tmp1;
+// }
+// static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
+//     force_transfer(pm, kpos[0], value);
+// }
+// static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
+//     force_transfer(pm, kpos[1], value);
+// }
+// static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
+//     force_transfer(pm, kpos[2], value);
+// }
+// static void readout_potential(PetaPM * pm, int i, double * mesh, double weight) {
+//     P[i].Potential += weight * mesh[0];
+// }
+// static void readout_force_x(PetaPM * pm, int i, double * mesh, double weight) {
+//     P[i].GravPM[0] += weight * mesh[0];
+// }
+// static void readout_force_y(PetaPM * pm, int i, double * mesh, double weight) {
+//     P[i].GravPM[1] += weight * mesh[0];
+// }
+// static void readout_force_z(PetaPM * pm, int i, double * mesh, double weight) {
+//     P[i].GravPM[2] += weight * mesh[0];
+// }
+
+
+
+
+
+
diff --git a/libgadget/pm_kernel.cuh b/libgadget/pm_kernel.cuh
new file mode 100644
index 00000000..4a36a999
--- /dev/null
+++ b/libgadget/pm_kernel.cuh
@@ -0,0 +1,15 @@
+// pm_kernel.cuh
+#ifndef PM_KERNEL_H
+#define PM_KERNEL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void launch_potential_transfer(Box3D box_complex, cufftComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // PM_KERNEL_H
\ No newline at end of file

From 3f75f8976b32efa3c6d3f184de8c66daf9ad19da Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 17 Oct 2024 23:03:00 -0500
Subject: [PATCH 115/120] adapt pm for kernel transfer functions (half done)

---
 libgadget/gravpm.c |   2 +
 libgadget/petapm.c | 129 +++++++++++++++++++++++++++++----------------
 libgadget/petapm.h |   8 +--
 3 files changed, 91 insertions(+), 48 deletions(-)

diff --git a/libgadget/gravpm.c b/libgadget/gravpm.c
index f7f496d1..cdc2c69a 100644
--- a/libgadget/gravpm.c
+++ b/libgadget/gravpm.c
@@ -378,6 +378,8 @@ measure_power_spectrum(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value
     powerspectrum_add_mode(pm->ps, k2, kpos, value, f, pm->Nmesh);
 }
 
+
+/*  */
 static void
 potential_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex *value)
 {
diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 7a3e940d..c8d52e73 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -7,10 +7,12 @@
 
 #include "types.h"
 #include "petapm.h"
+#include "pm_kernel.cuh"
 
 #include "utils.h"
 #include "walltime.h"
 
+
 static void
 layout_prepare(PetaPM * pm,
                struct Layout * L,
@@ -107,7 +109,6 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     pm->CellSize = BoxSize / Nmesh;
     pm->comm = comm;
 
-
     int ThisTask;
     int NTask;
     pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
@@ -130,7 +131,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
         endrun(0, "Error: The number of MPI ranks has to be a perfect square for CufftMp\n");
     }
 
-    message(0, "Using 2D Task mesh %td x %td \n", nranks1d, nranks1d);
+    message(0, "Using 2D Task mesh %d x %d \n", nranks1d, nranks1d);
     
     // Define custom data distribution
     int64 nx               = Nmesh;
@@ -162,14 +163,21 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
         return (rank < ranks_cutoff ? rank * (chunk_size + 1) : ranks_cutoff * (chunk_size + 1) + (rank - ranks_cutoff) * chunk_size);
     };
     
-    // update region properties
-    auto update_region = [](int64 lower[3], int64 upper[3], int64 strides[3], PetaPMRegion &region) {
+    // update region properties, also have a redundant box struct for now to use box_iterator, will merge it to region
+    auto update_region_and_box = [](int64 lower[3], int64 upper[3], int64 strides[3], PetaPMRegion &region, Box3D &box) {
+        region.totalsize = 1;
         for (int i = 0; i < 3; i++) {
             region.offset[i]  = lower[i];
             region.upper[i]   = upper[i];
             region.size[i]    = upper[i] - lower[i];
             region.strides[i] = strides[i];
+            region.totalsize *= region.size[i];
+            // init box3d
+            box.lower[i] = lower[i];
+            box.upper[i] = upper[i];
+            box.strides[i] = strides[i];
         }
+        region.buffer = NULL;
     };
     
     int i = ThisTask / nranks1d;
@@ -181,13 +189,14 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     int64 upper_real[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
     int64 strides_real[3] = {(upper_real[1] - lower_real[1]) * nz_real_padded, nz_real_padded, 1};
     int64 strides_real_nopad[3] = {(upper_real[1] - lower_real[1]) * nz_real, nz_real, 1};
-    update_region(lower_real, upper_real, strides_real_nopad, pm->real_space_region);
+
+    update_region_and_box(lower_real, upper_real, strides_real_nopad, pm->real_space_region, pm->box_real);
     
     // complex region setup
     int64 lower_fourier[3]   = {displacement(nx, i, nranks1d), 0, displacement(nz_complex, j, nranks1d)};
     int64 upper_fourier[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
     int64 strides_fourier[3] = {(upper_fourier[1] - lower_fourier[1]) * (upper_fourier[2] - lower_fourier[2]), (upper_fourier[2] - lower_fourier[2]), 1};
-    update_region(lower_fourier, upper_fourier, strides_fourier, pm->fourier_space_region);
+    update_region_and_box(lower_fourier, upper_fourier, strides_fourier, pm->fourier_space_region, pm->box_complex);
 
     //===============================================================================================
     cudaStreamCreate(&pm->priv->stream);
@@ -201,7 +210,6 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
     // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
     // So, in both, the "input" box should be the real box and the "output" box should be the complex box
-
     cufftXtSetDistribution(pm->priv->plan_forw, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
     cufftXtSetDistribution(pm->priv->plan_back, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
 
@@ -223,17 +231,17 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
     message(1, "Task %d NGPUs=%d, pfftsize=%d \n", ThisTask, desc->descriptor->nGPUs, pm->priv->fftsize);
     /* now lets fill up the mesh2task arrays */
-    #if 1
-        message(1, "Real Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
-                pm->real_space_region.offset[0],
-                pm->real_space_region.offset[1],
-                pm->real_space_region.offset[2],
-                pm->real_space_region.upper[0],
-                pm->real_space_region.upper[1],
-                pm->real_space_region.upper[2],
-                pm->real_space_region.strides[0],
-                pm->real_space_region.strides[1],
-                pm->real_space_region.strides[2]);
+    #if 0
+        message(1, "Real Box3d %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
+                pm->box_real.lower[0],
+                pm->box_real.lower[1],
+                pm->box_real.lower[2],
+                pm->box_real.upper[0],
+                pm->box_real.upper[1],
+                pm->box_real.upper[2],
+                pm->box_real.strides[0],
+                pm->box_real.strides[1],
+                pm->box_real.strides[2]);
         message(1, "Complex Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
                 pm->fourier_space_region.offset[0],
                 pm->fourier_space_region.offset[1],
@@ -245,7 +253,6 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
                 pm->fourier_space_region.strides[1],
                 pm->fourier_space_region.strides[2]);
     #endif
-    
         int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
         int k;
         for(k = 0; k < 2; k ++) {
@@ -258,9 +265,9 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
             /* which column / row hosts this tile? */
             /* FIXME: this is very inefficient */
             MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
-            for(i = 0; i < Nmesh; i ++) {
-                message(0, "Mesh2Task[%d][%d] == %d\n", k, i, pm->Mesh2Task[k][i]);
-            }
+            // for(i = 0; i < Nmesh; i ++) {
+            //     message(0, "Mesh2Task[%d][%d] == %d\n", k, i, pm->Mesh2Task[k][i]);
+            // }
         }
         myfree(tmp);
 }
@@ -270,6 +277,7 @@ petapm_destroy(PetaPM * pm)
 {
     cufftDestroy(pm->priv->plan_forw);
     cufftDestroy(pm->priv->plan_back);
+    cudaStreamDestroy(pm->priv->stream);
     MPI_Comm_free(&pm->priv->comm_cart_2d);
     myfree(pm->Mesh2Task[0]);
 }
@@ -321,13 +329,17 @@ petapm_force_init(
     return regions;
 }
 
+
+
 static void pm_apply_transfer_function(PetaPM * pm,
         cufftComplex * src,
         cufftComplex * dst, petapm_transfer_func H
         ){
     size_t ip = 0;
+    
 
     PetaPMRegion * region = &pm->fourier_space_region;
+    message(1, "**region size %d; pfftsize %d \n", region->totalsize, pm->priv->fftsize);
 
 #pragma omp parallel for
     for(ip = 0; ip < region->totalsize; ip ++) {
@@ -354,8 +366,14 @@ static void pm_apply_transfer_function(PetaPM * pm,
         pos[0] = kpos[2];
         pos[1] = kpos[0];
         pos[2] = kpos[1];
+        message(1, "ip=%d\n", ip);
+
         dst[ip].x = src[ip].x;
         dst[ip].y = src[ip].y;
+
+        message(1, "dst=%f \n", dst[ip].x);
+        message(1, "src=%f \n", src[ip].x);
+        
         if(H) {
             H(pm, k2, pos, &dst[ip]);
         }
@@ -376,37 +394,41 @@ cufftComplex * petapm_force_r2c(PetaPM * pm,
     walltime_measure("/PMgrav/Verify");
 #endif
 
-    cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
-
     // CUDA TODO: figure out if this is needed
     // Allocate GPU memory, copy CPU data to GPU
     // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
     cufftXtMalloc(pm->priv->plan_forw, &pm->priv->desc, CUFFT_XT_FORMAT_INPLACE);
     // copy real array to gpu
-    cufftXtMemcpy(pm->priv->plan_back, (void*)pm->priv->desc, (void*)real, CUFFT_COPY_HOST_TO_DEVICE);
+    cufftXtMemcpy(pm->priv->plan_forw, pm->priv->desc, real, CUFFT_COPY_HOST_TO_DEVICE);
+    message(1, "Real array first element %f\n", real[0]);
+    
     // execute the plan
     cufftXtExecDescriptor(pm->priv->plan_forw, pm->priv->desc, pm->priv->desc, CUFFT_FORWARD);
     myfree(real);
 
      // CUDA TODO: need to check if the output complex array is transpose
      // need to verify
-     // can verify by using both version of the code
     //=============================== End of R2C =============================================
     //========================== Begin Transfer Function =====================================
+    int ThisTask;
+    int NTask;
+    MPI_Comm_rank(pm->comm, &ThisTask);
+    MPI_Comm_size(pm->comm, &NTask);
+    
     cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
 
+    launch_potential_transfer(pm->box_complex, (cufftComplex *) pm->priv->desc->descriptor->data[0], ThisTask, NTask, pm, pm->priv->stream);
+    message(1, "Simple kernel suceeded \n");
+        
     /*Do any analysis that may be required before the transfer function is applied*/
-    petapm_transfer_func global_readout = global_functions->global_readout;
-    if(global_readout)
-        pm_apply_transfer_function(pm, complx, rho_k, global_readout);
-    if(global_functions->global_analysis)
-        global_functions->global_analysis(pm);
-    /*Apply the transfer function*/
-    petapm_transfer_func global_transfer = global_functions->global_transfer;
-    pm_apply_transfer_function(pm, complx, rho_k, global_transfer);
+    /* CUDA Note: global readout and analysis is NULL unless CP->MassiveNuLinRespOn*/
+    /* CUDA TODO: add back the CP->MassiveNuLinRespOn function later*/
+    
+    // /*Apply the transfer function*/
+    /* global transfer is potential transfer in gravpm*/
+    // petapm_transfer_func global_transfer = global_functions->global_transfer;
+    // pm_apply_transfer_function(pm, complex_data, rho_k, global_transfer);
     walltime_measure("/PMgrav/r2c");
-
-    myfree(complx);
     return rho_k;
 }
 
@@ -417,32 +439,38 @@ petapm_force_c2r(PetaPM * pm,
         const int Nregions,
         PetaPMFunctions * functions)
 {
-
+    // For grav the functions are: potential, forcex, forcey, forcez, 
+    // where the potential has no transfer function, only readout
+    // as the potential transfer is applied in r2c
     PetaPMFunctions * f = functions;
     for (f = functions; f->name; f ++) {
         petapm_transfer_func transfer = f->transfer;
         petapm_readout_func readout = f->readout;
 
-        cufftComplex * complx = (cufftComplex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
         /* apply the greens function turn rho_k into potential in fourier space */
-        pm_apply_transfer_function(pm, rho_k, complx, transfer);
+        // pm_apply_transfer_function(pm, rho_k, (cufftComplex*) pm->priv->desc->descriptor->data[0], transfer);
         walltime_measure("/PMgrav/calc");
-        // double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-        /* CUDA TODO: BUT WHERE DO I INPUT THE ACTUAL ARRAY? */
+        // execute c2r
         cufftXtExecDescriptor(pm->priv->plan_back, pm->priv->desc, pm->priv->desc, CUFFT_INVERSE);
-        double * real = (double * ) pm->priv->desc->descriptor->data[0];
+        cudaStreamSynchronize(pm->priv->stream);
+        // copy data back to cpu
+        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+        cufftXtMemcpy(pm->priv->plan_back, real, pm->priv->desc, CUFFT_COPY_DEVICE_TO_HOST);
+        cufftXtFree(pm->priv->desc);
 
         walltime_measure("/PMgrav/c2r");
         if(f == functions) // Once
             report_memory_usage("PetaPM");
-        myfree(complx);
+        message(1, "FREED DESC ***************** \n");
         /* read out the potential: this will copy and free real.*/
         layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real);
         walltime_measure("/PMgrav/comm");
 
         pm_iterate(pm, readout, regions, Nregions);
         walltime_measure("/PMgrav/readout");
-    }
+//    }
+    
+        message(1, "READ OUT DONE ***************** \n");
 }
 
 void petapm_force_finish(PetaPM * pm) {
@@ -786,6 +814,7 @@ layout_iterate_cells(PetaPM * pm,
                      double * real)
 {
     int i;
+    message(1, "******** NpImport %d \n", L->NpImport);
 #pragma omp parallel for
     for(i = 0; i < L->NpImport; i ++) {
         struct Pencil * p = &L->PencilRecv[i];
@@ -802,6 +831,8 @@ layout_iterate_cells(PetaPM * pm,
             }
             linear0 += ix * pm->real_space_region.strides[k];
         }
+        
+        
         int j;
         for(j = 0; j < p->len; j ++) {
             int iz = p->offset[2] + j;
@@ -921,6 +952,7 @@ void petapm_region_init_strides(PetaPMRegion * region) {
     region->buffer = NULL;
 }
 
+
 static int pos_get_target(PetaPM * pm, const int pos[2]) {
     int k;
     int task2d[2];
@@ -946,6 +978,11 @@ static int pencil_cmp_target(const void * v1, const void * v2) {
         ((p2->meshbuf_first < p1->meshbuf_first) - (p1->meshbuf_first < p2->meshbuf_first));
 }
 
+
+
+
+/********************************************************************************************/
+
 #ifdef DEBUG
 static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize) {
     /* verify the density field */
@@ -983,6 +1020,9 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 
 
 
+
+
+
 /**************
  * functions iterating over particle / mesh pairs
  ***************/
@@ -1015,4 +1055,3 @@ static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
  * iCFT(CFT) = 2pi
  *
  * **************************8*/
-
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 9a0112c4..1c2d0b4c 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -1,12 +1,13 @@
 #ifndef __PETAPM_H__
 #define __PETAPM_H__
-#include <cufftMp.h>   // NC:library change
+#include <cufftMp.h>
 
 #include "powerspectrum.h"
-
+#include "box_iterator.hpp"
 
 using int64 = long long int;
 
+
 typedef struct Region {
     /* represents a region in the FFT Mesh */
     int64 offset[3];
@@ -14,7 +15,6 @@ typedef struct Region {
     int64 upper[3];
     int64 strides[3];
 
-
     size_t totalsize;
     double * buffer;
     /* below are used mostly for investigation */
@@ -74,6 +74,8 @@ typedef struct PetaPM {
     MPI_Comm comm;
     PetaPMRegion real_space_region;
     PetaPMRegion fourier_space_region;
+    Box3D box_real;
+    Box3D box_complex;
     double CellSize;
     int Nmesh;
     double Asmth;

From 4e1c8458ad0ac39d7b17d0ecd7553ecf3e47cb0c Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Thu, 17 Oct 2024 23:03:42 -0500
Subject: [PATCH 116/120] output obj files for cuda kernels

---
 libgadget/Makefile | 57 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/libgadget/Makefile b/libgadget/Makefile
index 49ab0270..facc7978 100644
--- a/libgadget/Makefile
+++ b/libgadget/Makefile
@@ -6,6 +6,53 @@ CONFIG ?= ../Options.mk
 
 include $(CONFIG)
 
+INCL = densitykernel.h \
+	forcetree.h \
+	hci.h \
+	petapm.h \
+	run.h \
+	timebinmgr.h \
+	treewalk.h \
+	partmanager.h \
+	cooling.h   \
+	cooling_rates.h cooling_qso_lightup.h \
+	domain.h   \
+	exchange.h \
+	slotsmanager.h     \
+	checkpoint.h \
+	physconst.h   \
+	sfr_eff.h \
+	stats.h \
+	winds.h \
+	timefac.h \
+	blackhole.h bhdynfric.h bhinfo.h \
+	gravity.h \
+	cosmology.h \
+	drift.h     \
+	fof.h  \
+	gravshort.h  \
+	petaio.h  \
+	powerspectrum.h  \
+	timestep.h  \
+	walltime.h \
+	neutrinos_lra.h \
+	omega_nu_single.h \
+	uvbg.h \
+	plane.h \
+utils/unitsystem.h \
+utils/peano.h \
+utils/interp.h \
+utils/paramset.h \
+utils/endrun.h \
+utils/memory.h \
+utils/mpsort.h \
+utils/mymalloc.h \
+utils/system.h \
+utils/event.h \
+utils/openmpsort.h \
+utils/spinlocks.h \
+utils/string.h
+
 UTILS_TESTED = memory openmpsort interp peano
 UTILS_MPI_TESTED = mpsort
 
@@ -56,12 +103,13 @@ GADGET_OBJS =  \
 	 gravshort-tree.o gravshort-pair.o hydra.o  timefac.o \
 	 gravpm.o powerspectrum.o \
 	 forcetree.o \
-	 petapm.o gravity.o \
+	 petapm.o pm_kernel.o gravity.o \
 	 densitykernel.o walltime.o\
 	 runtests.o \
 	 neutrinos_lra.o \
      omega_nu_single.o \
 	 config.o \
+	 uvbg.o \
 	 plane.o 
 
 GADGET_UTILS_OBJS= \
@@ -79,6 +127,7 @@ utils/unitsystem.o \
 utils/string.o \
 utils/spinlocks.o
 
+
 GADGET_OBJS := $(GADGET_OBJS:%=.objs/%)
 GADGET_UTILS_OBJS := $(GADGET_UTILS_OBJS:%=.objs/%)
 
@@ -143,12 +192,6 @@ config.c: $(CONFIG)
 	mkdir -p `dirname $@`
 	MPICC="$(MPICC)" CFLAGS="$(CFLAGS)" OPT="$(OPT)" OPTIMIZE="$(OPTIMIZE)" VERSION="$(VERSION)" bash makeconfig.sh $@
 
-GADGET_DEPS := $(GADGET_OBJS:.o=.d)
-GADGET_UTILS_DEPS := $(GADGET_UTILS_OBJS:.o=.d)
--include $(GADGET_DEPS)
--include $(GADGET_UTILS_DEPS)
-
-
 #This snippet works out the current git revision and the git revision in config.h.
 #It checks whether they are the same.
 #If they are not the same it makes config.h as a PHONY target to be rebuilt.

From ee60387184d728eef89a5b84c628e54d29aa8151 Mon Sep 17 00:00:00 2001
From: Simeon Bird <sbird@ucr.edu>
Date: Thu, 17 Oct 2024 22:36:58 -0700
Subject: [PATCH 117/120] Use cudaMallocManaged by default

We should use system malloc by default for large allocations so that we
can get the CUDA memory management. This removes the VALGRIND flag.
mymalloc calls now use the cuda allocation, but tamalloc continues to use
a private cpu heap.
---
 libgadget/utils/mymalloc.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/libgadget/utils/mymalloc.c b/libgadget/utils/mymalloc.c
index e8f31af7..819c8884 100644
--- a/libgadget/utils/mymalloc.c
+++ b/libgadget/utils/mymalloc.c
@@ -21,10 +21,6 @@ Allocator A_MAIN[1];
  * */
 Allocator A_TEMP[1];
 
-#ifdef VALGRIND
-#define allocator_init allocator_malloc_init
-#endif
-
 void
 tamalloc_init(void)
 {
@@ -64,7 +60,7 @@ mymalloc_init(double MaxMemSizePerNode)
         endrun(2, "Mem too small! MB/node=%g, nodespercpu = %g NTask = %d\n", MaxMemSizePerNode, nodespercpu, NTask);
 
 
-    if (MPIU_Any(ALLOC_ENOMEMORY == allocator_init(A_MAIN, "MAIN", n, 1, NULL), MPI_COMM_WORLD)) {
+    if (MPIU_Any(ALLOC_ENOMEMORY == allocator_malloc_init(A_MAIN, "MAIN", n, 1, NULL), MPI_COMM_WORLD)) {
         endrun(0, "Insufficient memory for the MAIN allocator on at least one nodes."
                   "Requestion %td bytes. Try reducing MaxMemSizePerNode. Also check the node health status.\n", n);
     }

From 424eb339f8e2d60c60155c5051767b9df269ff69 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Fri, 18 Oct 2024 09:50:53 -0500
Subject: [PATCH 118/120] fix compiler error

---
 libgadget/Makefile | 55 +++++-----------------------------------------
 libgadget/petapm.c |  2 +-
 2 files changed, 7 insertions(+), 50 deletions(-)

diff --git a/libgadget/Makefile b/libgadget/Makefile
index facc7978..f3ca903d 100644
--- a/libgadget/Makefile
+++ b/libgadget/Makefile
@@ -6,53 +6,6 @@ CONFIG ?= ../Options.mk
 
 include $(CONFIG)
 
-INCL = densitykernel.h \
-	forcetree.h \
-	hci.h \
-	petapm.h \
-	run.h \
-	timebinmgr.h \
-	treewalk.h \
-	partmanager.h \
-	cooling.h   \
-	cooling_rates.h cooling_qso_lightup.h \
-	domain.h   \
-	exchange.h \
-	slotsmanager.h     \
-	checkpoint.h \
-	physconst.h   \
-	sfr_eff.h \
-	stats.h \
-	winds.h \
-	timefac.h \
-	blackhole.h bhdynfric.h bhinfo.h \
-	gravity.h \
-	cosmology.h \
-	drift.h     \
-	fof.h  \
-	gravshort.h  \
-	petaio.h  \
-	powerspectrum.h  \
-	timestep.h  \
-	walltime.h \
-	neutrinos_lra.h \
-	omega_nu_single.h \
-	uvbg.h \
-	plane.h \
-utils/unitsystem.h \
-utils/peano.h \
-utils/interp.h \
-utils/paramset.h \
-utils/endrun.h \
-utils/memory.h \
-utils/mpsort.h \
-utils/mymalloc.h \
-utils/system.h \
-utils/event.h \
-utils/openmpsort.h \
-utils/spinlocks.h \
-utils/string.h
-
 UTILS_TESTED = memory openmpsort interp peano
 UTILS_MPI_TESTED = mpsort
 
@@ -109,7 +62,6 @@ GADGET_OBJS =  \
 	 neutrinos_lra.o \
      omega_nu_single.o \
 	 config.o \
-	 uvbg.o \
 	 plane.o 
 
 GADGET_UTILS_OBJS= \
@@ -127,7 +79,6 @@ utils/unitsystem.o \
 utils/string.o \
 utils/spinlocks.o
 
-
 GADGET_OBJS := $(GADGET_OBJS:%=.objs/%)
 GADGET_UTILS_OBJS := $(GADGET_UTILS_OBJS:%=.objs/%)
 
@@ -192,6 +143,12 @@ config.c: $(CONFIG)
 	mkdir -p `dirname $@`
 	MPICC="$(MPICC)" CFLAGS="$(CFLAGS)" OPT="$(OPT)" OPTIMIZE="$(OPTIMIZE)" VERSION="$(VERSION)" bash makeconfig.sh $@
 
+GADGET_DEPS := $(GADGET_OBJS:.o=.d)
+GADGET_UTILS_DEPS := $(GADGET_UTILS_OBJS:.o=.d)
+-include $(GADGET_DEPS)
+-include $(GADGET_UTILS_DEPS)
+
+
 #This snippet works out the current git revision and the git revision in config.h.
 #It checks whether they are the same.
 #If they are not the same it makes config.h as a PHONY target to be rebuilt.
diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index c8d52e73..02c7f598 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -443,7 +443,7 @@ petapm_force_c2r(PetaPM * pm,
     // where the potential has no transfer function, only readout
     // as the potential transfer is applied in r2c
     PetaPMFunctions * f = functions;
-    for (f = functions; f->name; f ++) {
+//    for (f = functions; f->name; f ++) {
         petapm_transfer_func transfer = f->transfer;
         petapm_readout_func readout = f->readout;
 

From c2c0cf04dd62aeb99ead9862c3bfb3f0ba854141 Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Fri, 18 Oct 2024 10:29:51 -0500
Subject: [PATCH 119/120] undo one commit from master for cudamalloc

---
 libgadget/utils/mymalloc.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/libgadget/utils/mymalloc.c b/libgadget/utils/mymalloc.c
index 819c8884..ae56d410 100644
--- a/libgadget/utils/mymalloc.c
+++ b/libgadget/utils/mymalloc.c
@@ -16,6 +16,10 @@
 /* The main allocator is used to store large objects, e.g. tree, toptree */
 Allocator A_MAIN[1];
 
+#ifdef VALGRIND
+#define allocator_init allocator_malloc_init
+#endif
+
 /* The temp allocator is used to store objects that lives on the stack;
  * replacing alloca and similar cases to avoid stack induced memory fragmentation
  * */
@@ -60,7 +64,7 @@ mymalloc_init(double MaxMemSizePerNode)
         endrun(2, "Mem too small! MB/node=%g, nodespercpu = %g NTask = %d\n", MaxMemSizePerNode, nodespercpu, NTask);
 
 
-    if (MPIU_Any(ALLOC_ENOMEMORY == allocator_malloc_init(A_MAIN, "MAIN", n, 1, NULL), MPI_COMM_WORLD)) {
+    if (MPIU_Any(ALLOC_ENOMEMORY == allocator_init(A_MAIN, "MAIN", n, 1, NULL), MPI_COMM_WORLD)) {
         endrun(0, "Insufficient memory for the MAIN allocator on at least one nodes."
                   "Requestion %td bytes. Try reducing MaxMemSizePerNode. Also check the node health status.\n", n);
     }

From cc38eb91019edd90a370dd1d29973e81df9fcdba Mon Sep 17 00:00:00 2001
From: Nianyi Chen <nianyi.chen7@gmail.com>
Date: Mon, 4 Nov 2024 13:11:22 -0600
Subject: [PATCH 120/120] latest updates from pm changes

---
 libgadget/petapm.c     | 295 ++++++++++++++++++++---------------------
 libgadget/petapm.h     |  31 ++++-
 libgadget/pm_kernel.cu | 106 +++++++++------
 3 files changed, 242 insertions(+), 190 deletions(-)

diff --git a/libgadget/petapm.c b/libgadget/petapm.c
index 02c7f598..3bd2fb09 100644
--- a/libgadget/petapm.c
+++ b/libgadget/petapm.c
@@ -48,10 +48,10 @@ static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, c
 static MPI_Datatype MPI_PENCIL;
 
 /*Used only in MP-GenIC*/
-cufftComplex *
+cufftDoubleComplex *
 petapm_alloc_rhok(PetaPM * pm)
 {
-    cufftComplex * rho_k = (cufftComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
+    cufftDoubleComplex * rho_k = (cufftDoubleComplex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
     memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
     return rho_k;
 }
@@ -199,49 +199,40 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     update_region_and_box(lower_fourier, upper_fourier, strides_fourier, pm->fourier_space_region, pm->box_complex);
 
     //===============================================================================================
-    cudaStreamCreate(&pm->priv->stream);
-    cufftCreate(&pm->priv->plan_forw);
-    cufftCreate(&pm->priv->plan_back);
+    CUDA_CHECK(cudaStreamCreate(&pm->priv->stream));
+    CUFFT_CHECK(cufftCreate(&pm->priv->plan_forw));
+    CUFFT_CHECK(cufftCreate(&pm->priv->plan_back));
+    
     // Attach the MPI communicator to the plans
-    cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm);
-    cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm);
-
+    CUFFT_CHECK(cufftMpAttachComm(pm->priv->plan_forw, CUFFT_COMM_MPI, &comm));
+    CUFFT_CHECK(cufftMpAttachComm(pm->priv->plan_back, CUFFT_COMM_MPI, &comm));
     // Describe the data distribution (only for custumized data decomposition, not needed for default slab decomposition)
     // R2C plans only support CUFFT_XT_FORMAT_DISTRIBUTED_INPUT and always perform a CUFFT_FORWARD transform
     // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
     // So, in both, the "input" box should be the real box and the "output" box should be the complex box
-    cufftXtSetDistribution(pm->priv->plan_forw, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
-    cufftXtSetDistribution(pm->priv->plan_back, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
+    CUFFT_CHECK(cufftXtSetDistribution(pm->priv->plan_forw, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier));
+    CUFFT_CHECK(cufftXtSetDistribution(pm->priv->plan_back, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier));
 
     // Set the stream
-    cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
-    cufftSetStream(pm->priv->plan_back, pm->priv->stream);
+    CUFFT_CHECK(cufftSetStream(pm->priv->plan_forw, pm->priv->stream));
+    CUFFT_CHECK(cufftSetStream(pm->priv->plan_back, pm->priv->stream));
 
     // Make the plan
     size_t workspace;
-    cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
-    cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
+    CUFFT_CHECK(cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_D2Z, &workspace));
+    CUFFT_CHECK(cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_Z2D, &workspace));
 
     // Allocate GPU memory, copy CPU data to GPU
     // Data is initially distributed according to CUFFT_XT_FORMAT_DISTRIBUTED_INPUT, i.e., box_real
-    cudaLibXtDesc *desc;
-    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT);
-    pm->priv->fftsize = desc->descriptor->size[0];
+//    cudaLibXtDesc *desc;
+    CUFFT_CHECK(cufftXtMalloc(pm->priv->plan_forw, &pm->priv->desc, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT));
+    
+    pm->priv->fftsize = (upper_real[0] - lower_real[0]) * strides_real[0];
+    pm->priv->fftsize_complex = (upper_fourier[0] - lower_fourier[0]) * strides_fourier[0];
     //===============================================================================================
-
-    message(1, "Task %d NGPUs=%d, pfftsize=%d \n", ThisTask, desc->descriptor->nGPUs, pm->priv->fftsize);
+    message(1, "Task %d NGPUs=%d, local real size (fftsize)=%d, local fourier size=%d\n", ThisTask, pm->priv->desc->descriptor->nGPUs, pm->priv->fftsize, pm->priv->fftsize_complex);
     /* now lets fill up the mesh2task arrays */
     #if 0
-        message(1, "Real Box3d %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
-                pm->box_real.lower[0],
-                pm->box_real.lower[1],
-                pm->box_real.lower[2],
-                pm->box_real.upper[0],
-                pm->box_real.upper[1],
-                pm->box_real.upper[2],
-                pm->box_real.strides[0],
-                pm->box_real.strides[1],
-                pm->box_real.strides[2]);
         message(1, "Complex Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
                 pm->fourier_space_region.offset[0],
                 pm->fourier_space_region.offset[1],
@@ -253,31 +244,31 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
                 pm->fourier_space_region.strides[1],
                 pm->fourier_space_region.strides[2]);
     #endif
-        int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
-        int k;
-        for(k = 0; k < 2; k ++) {
-            for(i = 0; i < Nmesh; i ++) {
-                tmp[i] = 0;
-            }
-            for(i = 0; i < pm->real_space_region.size[k]; i ++) {
-                tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
-            }
-            /* which column / row hosts this tile? */
-            /* FIXME: this is very inefficient */
-            MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
-            // for(i = 0; i < Nmesh; i ++) {
-            //     message(0, "Mesh2Task[%d][%d] == %d\n", k, i, pm->Mesh2Task[k][i]);
-            // }
+    int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
+    int k;
+    for(k = 0; k < 2; k ++) {
+        for(i = 0; i < Nmesh; i ++) {
+            tmp[i] = 0;
+        }
+        for(i = 0; i < pm->real_space_region.size[k]; i ++) {
+            tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
         }
-        myfree(tmp);
+        /* which column / row hosts this tile? */
+        /* FIXME: this is very inefficient */
+        MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
+        // for(i = 0; i < Nmesh; i ++) {
+        //     message(0, "Mesh2Task[%d][%d] == %d\n", k, i, pm->Mesh2Task[k][i]);
+        // }
+    }
+    myfree(tmp);
 }
 
 void
 petapm_destroy(PetaPM * pm)
 {
-    cufftDestroy(pm->priv->plan_forw);
-    cufftDestroy(pm->priv->plan_back);
-    cudaStreamDestroy(pm->priv->stream);
+    CUFFT_CHECK(cufftDestroy(pm->priv->plan_forw));
+    CUFFT_CHECK(cufftDestroy(pm->priv->plan_back));
+    CUDA_CHECK(cudaStreamDestroy(pm->priv->stream));
     MPI_Comm_free(&pm->priv->comm_cart_2d);
     myfree(pm->Mesh2Task[0]);
 }
@@ -288,10 +279,6 @@ petapm_destroy(PetaPM * pm)
  * */
 typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
 static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
-/* apply transfer function to value, kpos array is in x, y, z order */
-static void pm_apply_transfer_function(PetaPM * pm,
-        cufftComplex * src,
-        cufftComplex * dst, petapm_transfer_func H);
 
 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
 /*
@@ -320,7 +307,6 @@ petapm_force_init(
     *Nregions = 0;
     PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
     pm_init_regions(pm, regions, *Nregions);
-
     pm_iterate(pm, put_particle_to_mesh, regions, *Nregions);
 
     layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
@@ -330,80 +316,29 @@ petapm_force_init(
 }
 
 
-
-static void pm_apply_transfer_function(PetaPM * pm,
-        cufftComplex * src,
-        cufftComplex * dst, petapm_transfer_func H
-        ){
-    size_t ip = 0;
-    
-
-    PetaPMRegion * region = &pm->fourier_space_region;
-    message(1, "**region size %d; pfftsize %d \n", region->totalsize, pm->priv->fftsize);
-
-#pragma omp parallel for
-    for(ip = 0; ip < region->totalsize; ip ++) {
-        ptrdiff_t tmp = ip;
-        int pos[3];
-        int kpos[3];
-        int64_t k2 = 0.0;
-        int k;
-        for(k = 0; k < 3; k ++) {
-            pos[k] = tmp / region->strides[k];
-            tmp -= pos[k] * region->strides[k];
-            /* lets get the abs pos on the grid*/
-            pos[k] += region->offset[k];
-            /* check */
-            if(pos[k] >= pm->Nmesh) {
-                endrun(1, "position didn't make sense\n");
-            }
-            kpos[k] = petapm_mesh_to_k(pm, pos[k]);
-            /* Watch out the cast */
-            k2 += ((int64_t)kpos[k]) * kpos[k];
-        }
-        /* swap 0 and 1 because fourier space was transposed */
-        /* kpos is y, z, x */
-        pos[0] = kpos[2];
-        pos[1] = kpos[0];
-        pos[2] = kpos[1];
-        message(1, "ip=%d\n", ip);
-
-        dst[ip].x = src[ip].x;
-        dst[ip].y = src[ip].y;
-
-        message(1, "dst=%f \n", dst[ip].x);
-        message(1, "src=%f \n", src[ip].x);
-        
-        if(H) {
-            H(pm, k2, pos, &dst[ip]);
-        }
-    }
-
-}
-
-cufftComplex * petapm_force_r2c(PetaPM * pm,
+cufftDoubleComplex *
+petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ) {
      // CUDA TODO: figureout how to properly get fftsize
     double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
     memset(real, 0, sizeof(double) * pm->priv->fftsize);
     layout_build_and_exchange_cells_to_fft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
+    
     walltime_measure("/PMgrav/comm2");
 #ifdef DEBUG
     verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
     walltime_measure("/PMgrav/Verify");
 #endif
-
-    // CUDA TODO: figure out if this is needed
-    // Allocate GPU memory, copy CPU data to GPU
-    // Data is initially distributed according to CUFFT_XT_FORMAT_INPLACE
-    cufftXtMalloc(pm->priv->plan_forw, &pm->priv->desc, CUFFT_XT_FORMAT_INPLACE);
+    // pm->priv->desc allocated in init
     // copy real array to gpu
-    cufftXtMemcpy(pm->priv->plan_forw, pm->priv->desc, real, CUFFT_COPY_HOST_TO_DEVICE);
+    
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_forw, pm->priv->desc, real, CUFFT_COPY_HOST_TO_DEVICE));
     message(1, "Real array first element %f\n", real[0]);
     
     // execute the plan
-    cufftXtExecDescriptor(pm->priv->plan_forw, pm->priv->desc, pm->priv->desc, CUFFT_FORWARD);
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_forw, pm->priv->desc, pm->priv->desc, CUFFT_FORWARD));
+    // message(1, "complex array first element %f\n", ((cufftDoubleComplex*)pm->priv->desc->descriptor->data[0])[0].x);
     myfree(real);
 
      // CUDA TODO: need to check if the output complex array is transpose
@@ -414,27 +349,25 @@ cufftComplex * petapm_force_r2c(PetaPM * pm,
     int NTask;
     MPI_Comm_rank(pm->comm, &ThisTask);
     MPI_Comm_size(pm->comm, &NTask);
-    
-    cufftComplex * rho_k = (cufftComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
-
-    launch_potential_transfer(pm->box_complex, (cufftComplex *) pm->priv->desc->descriptor->data[0], ThisTask, NTask, pm, pm->priv->stream);
-    message(1, "Simple kernel suceeded \n");
-        
     /*Do any analysis that may be required before the transfer function is applied*/
     /* CUDA Note: global readout and analysis is NULL unless CP->MassiveNuLinRespOn*/
     /* CUDA TODO: add back the CP->MassiveNuLinRespOn function later*/
     
-    // /*Apply the transfer function*/
+    /*Apply the transfer function*/
     /* global transfer is potential transfer in gravpm*/
     // petapm_transfer_func global_transfer = global_functions->global_transfer;
     // pm_apply_transfer_function(pm, complex_data, rho_k, global_transfer);
+    
+    launch_potential_transfer(pm->box_complex, (cufftDoubleComplex *) pm->priv->desc->descriptor->data[0], ThisTask, NTask, pm, pm->priv->stream);
+    message(1, "Simple kernel suceeded \n");
     walltime_measure("/PMgrav/r2c");
+    cufftDoubleComplex * rho_k = (cufftDoubleComplex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
     return rho_k;
 }
 
 void
 petapm_force_c2r(PetaPM * pm,
-        cufftComplex * rho_k,
+        cufftDoubleComplex * rho_k,
         PetaPMRegion * regions,
         const int Nregions,
         PetaPMFunctions * functions)
@@ -442,35 +375,99 @@ petapm_force_c2r(PetaPM * pm,
     // For grav the functions are: potential, forcex, forcey, forcez, 
     // where the potential has no transfer function, only readout
     // as the potential transfer is applied in r2c
-    PetaPMFunctions * f = functions;
-//    for (f = functions; f->name; f ++) {
-        petapm_transfer_func transfer = f->transfer;
-        petapm_readout_func readout = f->readout;
-
-        /* apply the greens function turn rho_k into potential in fourier space */
-        // pm_apply_transfer_function(pm, rho_k, (cufftComplex*) pm->priv->desc->descriptor->data[0], transfer);
-        walltime_measure("/PMgrav/calc");
-        // execute c2r
-        cufftXtExecDescriptor(pm->priv->plan_back, pm->priv->desc, pm->priv->desc, CUFFT_INVERSE);
-        cudaStreamSynchronize(pm->priv->stream);
-        // copy data back to cpu
-        double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
-        cufftXtMemcpy(pm->priv->plan_back, real, pm->priv->desc, CUFFT_COPY_DEVICE_TO_HOST);
-        cufftXtFree(pm->priv->desc);
-
-        walltime_measure("/PMgrav/c2r");
-        if(f == functions) // Once
-            report_memory_usage("PetaPM");
-        message(1, "FREED DESC ***************** \n");
-        /* read out the potential: this will copy and free real.*/
-        layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real);
-        walltime_measure("/PMgrav/comm");
-
-        pm_iterate(pm, readout, regions, Nregions);
-        walltime_measure("/PMgrav/readout");
-//    }
+    PetaPMFunctions f;
+    petapm_readout_func readout;
+
+    // Per force axis: copy the k-space potential out of pm->priv->desc, apply the axis
+    // filter, c2r into a temporary descriptor, then readout. rho_k is not read here.
+    int ThisTask;
+    int NTask;
+    MPI_Comm_rank(pm->comm, &ThisTask);
+    MPI_Comm_size(pm->comm, &NTask);
+    /* complex: device scratch of size_cpy bytes, refilled from pm->priv->desc before each filter */
+    size_t size_cpy = pm->priv->desc->descriptor->size[0];
+    cufftDoubleComplex* complex;
     
-        message(1, "READ OUT DONE ***************** \n");
+    // -------------------- force x --------------------------------
+    double * real_fx = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    CUDA_CHECK(cudaMalloc(&complex, size_cpy));
+    CUDA_CHECK(cudaMemcpy(complex, pm->priv->desc->descriptor->data[0], size_cpy, cudaMemcpyDeviceToDevice));
+    launch_force_x_transfer(pm->box_complex, complex, 
+                            ThisTask, NTask, pm, pm->priv->stream);
+
+    cudaLibXtDesc *desc_fx;
+    CUFFT_CHECK(cufftXtMalloc(pm->priv->plan_back, &desc_fx, CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT));
+    CUDA_CHECK(cudaMemcpy(desc_fx->descriptor->data[0], complex, size_cpy, cudaMemcpyDeviceToDevice));
+    // fft back
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_back, desc_fx, desc_fx, CUFFT_INVERSE));
+    CUDA_CHECK(cudaStreamSynchronize(pm->priv->stream));
+    // copy back
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_back, real_fx, desc_fx, CUFFT_COPY_DEVICE_TO_HOST));
+    walltime_measure("/PMgrav/c2r");
+    layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real_fx);
+    f = functions[1];
+    readout = f.readout;
+    pm_iterate(pm, readout, regions, Nregions);
+    walltime_measure("/PMgrav/readout");
+    CUFFT_CHECK(cufftXtFree(desc_fx));
+
+    // -------------------- force y--------------------------------
+    double * real_fy = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    CUDA_CHECK(cudaMemcpy(complex, pm->priv->desc->descriptor->data[0], size_cpy, cudaMemcpyDeviceToDevice));
+    launch_force_y_transfer(pm->box_complex, complex, 
+                            ThisTask, NTask, pm, pm->priv->stream);
+
+    cudaLibXtDesc *desc_fy;
+    CUFFT_CHECK(cufftXtMalloc(pm->priv->plan_back, &desc_fy, CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT));
+    CUDA_CHECK(cudaMemcpy(desc_fy->descriptor->data[0], complex, size_cpy, cudaMemcpyDeviceToDevice));
+    // fft back
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_back, desc_fy, desc_fy, CUFFT_INVERSE));
+    CUDA_CHECK(cudaStreamSynchronize(pm->priv->stream));
+    // copy back
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_back, real_fy, desc_fy, CUFFT_COPY_DEVICE_TO_HOST));
+    walltime_measure("/PMgrav/c2r");
+    layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real_fy);
+    f = functions[2];
+    readout = f.readout;
+    pm_iterate(pm, readout, regions, Nregions);
+    walltime_measure("/PMgrav/readout");
+    CUFFT_CHECK(cufftXtFree(desc_fy));
+    // -------------------- force z --------------------------------
+    double * real_fz = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    CUDA_CHECK(cudaMemcpy(complex, pm->priv->desc->descriptor->data[0], size_cpy, cudaMemcpyDeviceToDevice));
+    launch_force_z_transfer(pm->box_complex, complex, 
+                            ThisTask, NTask, pm, pm->priv->stream);
+
+    cudaLibXtDesc *desc_fz;
+    CUFFT_CHECK(cufftXtMalloc(pm->priv->plan_back, &desc_fz, CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT));
+    CUDA_CHECK(cudaMemcpy(desc_fz->descriptor->data[0], complex, size_cpy, cudaMemcpyDeviceToDevice));
+    // fft back
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_back, desc_fz, desc_fz, CUFFT_INVERSE));
+    CUDA_CHECK(cudaStreamSynchronize(pm->priv->stream));
+    // copy back
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_back, real_fz, desc_fz, CUFFT_COPY_DEVICE_TO_HOST));
+    walltime_measure("/PMgrav/c2r");
+    layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real_fz);
+    f = functions[3];
+    readout = f.readout;
+    pm_iterate(pm, readout, regions, Nregions);
+    walltime_measure("/PMgrav/readout");
+    CUFFT_CHECK(cufftXtFree(desc_fz));
+    // -------------------- potential --------------------------------
+    double * real_pot = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
+    /* get potential out last: the in-place c2r overwrites the k-space data in pm->priv->desc */
+    // fft back
+    CUFFT_CHECK(cufftXtExecDescriptor(pm->priv->plan_back, pm->priv->desc, pm->priv->desc, CUFFT_INVERSE));
+    CUDA_CHECK(cudaStreamSynchronize(pm->priv->stream));
+    // copy data back to cpu
+    CUFFT_CHECK(cufftXtMemcpy(pm->priv->plan_back, real_pot, pm->priv->desc, CUFFT_COPY_DEVICE_TO_HOST));
+    layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real_pot);
+    walltime_measure("/PMgrav/comm");
+    f = functions[0];
+    readout = f.readout;
+    pm_iterate(pm, readout, regions, Nregions);
+    CUFFT_CHECK(cufftXtFree(pm->priv->desc));
+    CUDA_CHECK(cudaFree(complex));
 }
 
 void petapm_force_finish(PetaPM * pm) {
@@ -485,7 +482,7 @@ void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
         void * userdata) {
     int Nregions;
     PetaPMRegion * regions = petapm_force_init(pm, prepare, pstruct, &Nregions, userdata);
-    cufftComplex * rho_k = petapm_force_r2c(pm, global_functions);
+    cufftDoubleComplex * rho_k = petapm_force_r2c(pm, global_functions);
     if(functions)
         petapm_force_c2r(pm, rho_k, regions, Nregions, functions);
     myfree(rho_k);
diff --git a/libgadget/petapm.h b/libgadget/petapm.h
index 1c2d0b4c..8d057c62 100644
--- a/libgadget/petapm.h
+++ b/libgadget/petapm.h
@@ -7,6 +7,27 @@
 
 using int64 = long long int;
 
+#define CUDA_CHECK(ans) { gpu_checkAssert((ans), __FILE__, __LINE__); } // NOTE(review): bare-brace body; an unbraced `if (c) CUDA_CHECK(x); else ...` will not compile
+inline void gpu_checkAssert(cudaError_t code, const char *file, int line, bool abort=true) // reports a failed CUDA runtime call; exits unless abort is false
+{
+    if (code != cudaSuccess) 
+    {
+        fprintf(stderr,"CUDA_CHECK: %s %s %d\n", cudaGetErrorString(code), file, line); // error text, then source file and line of the call site
+        if (abort) exit(code); // the CUDA error code is used as the process exit status
+    }
+}
+
+#define CUFFT_CHECK(ans) { cufft_check((ans), __FILE__, __LINE__); } // NOTE(review): bare-brace body; an unbraced `if (c) CUFFT_CHECK(x); else ...` will not compile
+inline void cufft_check(int code, const char *file, int line, bool abort=true) // reports a failed cuFFT call; code is the status value converted to int
+{
+    if (code != CUFFT_SUCCESS) 
+    {
+        fprintf(stderr,"CUFFT_CHECK: %d %s %d\n", code, file, line); // numeric status only, then source file and line of the call site
+        if (abort) exit(code); // the cuFFT status value is used as the process exit status
+    }
+}
+
+
 
 typedef struct Region {
     /* represents a region in the FFT Mesh */
@@ -57,6 +78,7 @@ typedef struct PetaPMPriv {
     /* These varibles are initialized by petapm_init*/
 
     int fftsize;
+    int fftsize_complex;
     cufftHandle plan_forw; // NC:change plan function call
     cufftHandle plan_back;
     cudaStream_t stream;
@@ -98,7 +120,7 @@ typedef struct {
     int64_t NumPart;
 } PetaPMParticleStruct;
 
-typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value); //NC:change to complex type
+typedef void (*petapm_transfer_func)(PetaPM * pm, int64_t k2, int kpos[3], cufftDoubleComplex * value); //NC:change to complex type
 typedef void (*petapm_readout_func)(PetaPM * pm, int i, double * mesh, double weight);
 typedef PetaPMRegion * (*petapm_prepare_func)(PetaPM * pm, PetaPMParticleStruct * pstruct, void * data, int *Nregions);
 
@@ -138,11 +160,12 @@ PetaPMRegion * petapm_force_init(PetaPM * pm,
         PetaPMParticleStruct * pstruct,
         int * Nregions,
         void * userdata);
-cufftComplex * petapm_force_r2c(PetaPM * pm,
+cufftDoubleComplex * petapm_force_r2c(PetaPM * pm,
         PetaPMGlobalFunctions * global_functions
         ); // NC: changed returned complex type
 void petapm_force_c2r(PetaPM * pm,
-        cufftComplex * rho_k, PetaPMRegion * regions,
+            cufftDoubleComplex * rho_k,
+               PetaPMRegion * regions,
         const int Nregions,
         PetaPMFunctions * functions); // NC: changed input complex type
 void petapm_force_finish(PetaPM * pm);
@@ -152,5 +175,5 @@ PetaPMRegion * petapm_get_real_region(PetaPM * pm);
 int petapm_mesh_to_k(PetaPM * pm, int i);
 int *petapm_get_thistask2d(PetaPM * pm);
 int *petapm_get_ntask2d(PetaPM * pm);
-cufftComplex * petapm_alloc_rhok(PetaPM * pm); // NC: changed returned complex type
+cufftDoubleComplex * petapm_alloc_rhok(PetaPM * pm); // NC: changed returned complex type
 #endif
diff --git a/libgadget/pm_kernel.cu b/libgadget/pm_kernel.cu
index 20b61bf3..1b58fbdb 100644
--- a/libgadget/pm_kernel.cu
+++ b/libgadget/pm_kernel.cu
@@ -33,7 +33,7 @@ __device__ double diff_kernel(double w) {
 
 
 __global__
-void potential_transfer_kernel(BoxIterator<cufftComplex> begin, BoxIterator<cufftComplex> end, PetaPM *pm) {
+void potential_transfer_kernel(BoxIterator<cufftDoubleComplex> begin, BoxIterator<cufftDoubleComplex> end, PetaPM *pm) {
     const int tid = threadIdx.x + blockIdx.x * blockDim.x;
     begin += tid;
 
@@ -73,6 +73,10 @@ void potential_transfer_kernel(BoxIterator<cufftComplex> begin, BoxIterator<cuff
             begin->y = 0.0;
             return;
         }
+        if(tid < 10) {
+            printf("GPU data (after first transform): global 3D index [%d %d %d], local index %d is (%f,%f)\n", 
+                (int)begin.x(), (int)begin.y(), (int)begin.z(), (int)begin.i(), begin->x, begin->y);
+        }
         // Apply scaling factor
         begin->x *= fac;
         begin->y *= fac;
@@ -80,7 +84,44 @@ void potential_transfer_kernel(BoxIterator<cufftComplex> begin, BoxIterator<cuff
 }
 
 
-extern "C" void launch_potential_transfer(Box3D box_complex, cufftComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) {
+/* Apply the force filter along axis ik (0 = x, 1 = y, 2 = z) in place; each thread handles one element of [begin, end). */
+__global__ void force_transfer_kernel(BoxIterator<cufftDoubleComplex> begin, BoxIterator<cufftDoubleComplex> end, PetaPM *pm, int ik) {
+    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    begin += tid;
+    int pos;
+    
+    if (begin < end) {
+        // Global mesh index of this Fourier mode along the requested axis
+        switch (ik) {
+            case 0:
+                pos = begin.x();
+                break;
+            case 1:
+                pos = begin.y();
+                break;
+            case 2:
+                pos = begin.z();
+                break;
+            default:
+                return; /* unknown axis: leave the mode untouched rather than read an uninitialised pos */
+        }
+        // Compute the corresponding wave number along that axis, in grid units
+        int kpos = pos<=pm->Nmesh/2 ? pos : (pos-pm->Nmesh);
+        /*
+         * negative sign is from force_x = - Del_x pot
+         *
+         * filter is   i K(w)
+         * */
+        double fac = -1 * diff_kernel (kpos * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
+        const double tmp0 = - begin->y * fac;
+        const double tmp1 = begin->x * fac;
+        begin->x = tmp0;
+        begin->y = tmp1;
+    }
+}
+
+
+extern "C" void launch_potential_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) {
     auto [begin_d, end_d] = BoxIterators(box_complex, data);
     const size_t num_elements = std::distance(begin_d, end_d);
     const size_t num_threads  = 256;
@@ -89,43 +130,34 @@ extern "C" void launch_potential_transfer(Box3D box_complex, cufftComplex* data,
 }
 
 
+extern "C" void launch_force_x_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) { // rank and size are not used in this body
+    auto [begin_d, end_d] = BoxIterators(box_complex, data);
+    const size_t num_elements = std::distance(begin_d, end_d);
+    const size_t num_threads  = 256; // threads per block
+    const size_t num_blocks   = (num_elements + num_threads - 1) / num_threads; // ceiling division: one thread per element
+    force_transfer_kernel<<<num_blocks, num_threads, 0, stream>>>(begin_d, end_d, pm, 0); // last argument 0 selects begin.x() in the kernel; launch status is not checked here
+}
+
+extern "C" void launch_force_y_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) { // rank and size are not used in this body
+    auto [begin_d, end_d] = BoxIterators(box_complex, data);
+    const size_t num_elements = std::distance(begin_d, end_d);
+    const size_t num_threads  = 256; // threads per block
+    const size_t num_blocks   = (num_elements + num_threads - 1) / num_threads; // ceiling division: one thread per element
+    force_transfer_kernel<<<num_blocks, num_threads, 0, stream>>>(begin_d, end_d, pm, 1); // last argument 1 selects begin.y() in the kernel; launch status is not checked here
+}
+
+extern "C" void launch_force_z_transfer(Box3D box_complex, cufftDoubleComplex* data, int rank, int size, PetaPM *pm, cudaStream_t stream) { // rank and size are not used in this body
+    auto [begin_d, end_d] = BoxIterators(box_complex, data);
+    const size_t num_elements = std::distance(begin_d, end_d);
+    const size_t num_threads  = 256; // threads per block
+    const size_t num_blocks   = (num_elements + num_threads - 1) / num_threads; // ceiling division: one thread per element
+    force_transfer_kernel<<<num_blocks, num_threads, 0, stream>>>(begin_d, end_d, pm, 2); // last argument 2 selects begin.z() in the kernel; launch status is not checked here
+}
+
+
+
 
 
-// static void force_transfer(PetaPM * pm, int k, cufftComplex * value) {
-//     double tmp0;
-//     double tmp1;
-//     /*
-//      * negative sign is from force_x = - Del_x pot
-//      *
-//      * filter is   i K(w)
-//      * */
-//     double fac = -1 * diff_kernel (k * (2 * M_PI / pm->Nmesh)) * (pm->Nmesh / pm->BoxSize);
-//     tmp0 = - value[0].y * fac;
-//     tmp1 = value[0].x * fac;
-//     value[0].x = tmp0;
-//     value[0].y = tmp1;
-// }
-// static void force_x_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
-//     force_transfer(pm, kpos[0], value);
-// }
-// static void force_y_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
-//     force_transfer(pm, kpos[1], value);
-// }
-// static void force_z_transfer(PetaPM * pm, int64_t k2, int kpos[3], cufftComplex * value) {
-//     force_transfer(pm, kpos[2], value);
-// }
-// static void readout_potential(PetaPM * pm, int i, double * mesh, double weight) {
-//     P[i].Potential += weight * mesh[0];
-// }
-// static void readout_force_x(PetaPM * pm, int i, double * mesh, double weight) {
-//     P[i].GravPM[0] += weight * mesh[0];
-// }
-// static void readout_force_y(PetaPM * pm, int i, double * mesh, double weight) {
-//     P[i].GravPM[1] += weight * mesh[0];
-// }
-// static void readout_force_z(PetaPM * pm, int i, double * mesh, double weight) {
-//     P[i].GravPM[2] += weight * mesh[0];
-// }