From 193a18403d148a2d807a124286069d74d6826464 Mon Sep 17 00:00:00 2001
From: ACEsuit <christohortner@gmail.com>
Date: Tue, 18 Jun 2024 21:51:04 -0700
Subject: [PATCH 1/5] enable dynamic AA

---
 Project.toml                |   4 +-
 profile/profile_evaluate.jl |   2 +-
 src/UltraFastACE.jl         |   4 +
 src/auxiliary.jl            |  49 +++++++++++-
 src/splines.jl              |  10 ++-
 src/uface.jl                | 146 +++++-------------------------------
 src/uface_eval.jl           | 123 ++++++++++++++++++++++++++++++
 test/test_import_ace1.jl    |   6 +-
 8 files changed, 208 insertions(+), 136 deletions(-)
 create mode 100644 src/uface_eval.jl

diff --git a/Project.toml b/Project.toml
index 0e21464..7e0b817 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ version = "0.0.2"
 ACE1 = "e3f9bc04-086e-409a-ba78-e9769fe067bb"
 ACE1x = "5cc4c08c-8782-4a30-af6d-550b302e9707"
 ACEbase = "14bae519-eb20-449c-a949-9c58ed33163e"
+Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
 ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
 ChunkSplitters = "ae650224-84b6-46f8-82ea-d812ca08434e"
 DynamicPolynomials = "7c1d4256-1411-5781-91ec-d7bc3513ac07"
@@ -25,6 +26,7 @@ SpheriCart = "5caf2b29-02d9-47a3-9434-5931c85ba645"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StaticPolynomials = "62e018b1-6e46-5407-a5a7-97d4fbcae734"
 StrideArrays = "d1fa6d79-ef01-42a6-86c9-f7c551f8593b"
+WithAlloc = "fb1aa66a-603c-4c1d-9bc4-66947c7b08dd"
 
 [compat]
 ChunkSplitters = "2"
@@ -32,7 +34,7 @@ Folds = "0.2"
 LoopVectorization = "0.12"
 NamedTupleTools = "0.14"
 ObjectPools = "0.3"
-Polynomials4ML = "0.2.7"
+Polynomials4ML = "0.2.11"
 julia = "1"
 
 [extras]
diff --git a/profile/profile_evaluate.jl b/profile/profile_evaluate.jl
index f84d0b6..244783e 100644
--- a/profile/profile_evaluate.jl
+++ b/profile/profile_evaluate.jl
@@ -14,7 +14,7 @@ pot = model.potential
 mbpot = pot.components[2]
 
 # convert to UFACE format 
-uf_ace = UltraFastACE.uface_from_ace1(mbpot; n_spl_points = 100)
+uf_ace = UltraFastACE.uface_from_ace1(pot; n_spl_points = 100)
 
 ## ------------------------------------
 
diff --git a/src/UltraFastACE.jl b/src/UltraFastACE.jl
index 0fdb99c..7e750a1 100644
--- a/src/UltraFastACE.jl
+++ b/src/UltraFastACE.jl
@@ -4,6 +4,9 @@ import ACEbase
 import ACEbase: evaluate, evaluate!, 
                 evaluate_ed, evaluate_ed!
 
+using Bumper, WithAlloc, StrideArrays      
+import WithAlloc: whatalloc          
+
 _i2z(obj, i::Integer) = obj._i2z[i] 
 
 function _z2i(obj, Z)
@@ -28,6 +31,7 @@ include("auxiliary.jl")
 include("pair.jl")
 
 include("uface.jl")
+include("uface_eval.jl")
 
 include("julip_calculator.jl")
 
diff --git a/src/auxiliary.jl b/src/auxiliary.jl
index 35d99c5..c1a0e88 100644
--- a/src/auxiliary.jl
+++ b/src/auxiliary.jl
@@ -1,3 +1,5 @@
+
+
 #
 # some auxiliary functions for UF_ACE evaluation 
 #
@@ -35,11 +37,21 @@ _len_ylm(ybasis) = (_get_L(ybasis) + 1)^2
 
 function evaluate_ylm(ace, Rs)
    TF = eltype(eltype(Rs))
-   Zlm = acquire!(ace.pool, :Zlm, (length(Rs), _len_ylm(ace.ybasis)), TF)
+   Zlm = zeros(TF, length(Rs), _len_ylm(ace.ybasis))
+   evaluate_ylm!(Zlm, ace, Rs)
+   return Zlm 
+end
+
+function evaluate_ylm!(Zlm, ace, Rs)
    compute!(Zlm, ace.ybasis, Rs)
    return Zlm 
 end
 
+function whatalloc(::typeof(evaluate_ylm!), ace, Rs)
+   TF = eltype(eltype(Rs))
+   return (TF, length(Rs), _len_ylm(ace.ybasis))
+end
+
 function evaluate_ylm_ed(ace, Rs)
    TF = eltype(eltype(Rs))
    Zlm = acquire!(ace.pool, :Zlm, (length(Rs), _len_ylm(ace.ybasis)), TF)
@@ -52,11 +64,15 @@ end
 # ------------------------------ 
 #  element embedding 
 
-
-
 function embed_z(ace, Rs, Zs)
    TF = eltype(eltype(Rs))
-   Ez = acquire!(ace.pool, :Ez, (length(Zs), length(ace.rbasis)), TF)
+   # Ez = acquire!(ace.pool, :Ez, (length(Zs), length(ace.rbasis)), TF)
+   Ez = zeros(TF, length(Zs), length(ace.rbasis))
+   return embed_z!(Ez, ace, Rs, Zs)
+end
+
+
+function embed_z!(Ez, ace, Rs, Zs)
    fill!(Ez, 0)
    for (j, z) in enumerate(Zs)
       iz = _z2i(ace.rbasis, z)
@@ -66,3 +82,28 @@ function embed_z(ace, Rs, Zs)
 end
 
 
+function whatalloc(::typeof(embed_z!), ace, Rs, Zs)
+   TF = eltype(eltype(Rs))
+   return (TF, length(Zs), length(ace.rbasis))
+end
+
+
+# ------------------------------ 
+#  aadot via P4ML 
+
+using LinearAlgebra: dot 
+
+struct AADot{T, TAA}
+   cc::Vector{T} 
+   aabasis::TAA
+end
+
+function (aadot::AADot)(A)
+   @no_escape begin 
+      AA = @alloc(eltype(A), length(aadot.aabasis))
+      P4ML.evaluate!(AA, aadot.aabasis, A)
+      out = dot(aadot.cc, AA)
+   end
+   return out 
+end
+
diff --git a/src/splines.jl b/src/splines.jl
index 61b37b2..dc52d7f 100644
--- a/src/splines.jl
+++ b/src/splines.jl
@@ -25,11 +25,17 @@ end
 function evaluate(ace, basis::SplineRadialsZ, 
                   Rs::AbstractVector{<: SVector}, Zs::AbstractVector)
    TF = eltype(eltype(Rs))                  
-   Rn = acquire!(ace.pool, :Rn, (length(Rs), length(basis)), TF)
+   Rn = zeros(TF, length(Rs), length(basis))
    evaluate!(Rn, basis, Rs, Zs)
    return Rn 
 end
 
+function whatalloc(::typeof(ACEbase.evaluate!), 
+                   basis::SplineRadialsZ, Rs, Zs)
+   TF = eltype(eltype(Rs))                  
+   return (TF, length(Rs), length(basis))
+end
+
 
 function evaluate!(out, basis::SplineRadialsZ, Rs, Zs)
    nX = length(Rs)
@@ -49,6 +55,8 @@ function evaluate!(out, basis::SplineRadialsZ, Rs, Zs)
 end
 
 
+
+
 function evaluate_ed(ace, rbasis, Rs, Zs)
    TF = eltype(eltype(Rs))                  
    Rn = acquire!(ace.pool, :Rn, (length(Rs), length(rbasis)), TF)
diff --git a/src/uface.jl b/src/uface.jl
index d8dfd89..5b8c731 100644
--- a/src/uface.jl
+++ b/src/uface.jl
@@ -15,13 +15,13 @@ struct UFACE_inner{TR, TY, TA, TAA}
    abasis::TA
    aadot::TAA
    # ---------- admin and meta-data 
-   pool::TSafe{ArrayPool{FlexArrayCache}}
+   # pool::TSafe{ArrayPool{FlexArrayCache}}
    meta::Dict
 end
 
 UFACE_inner(rbasis, ybasis, abasis, aadot) = 
    UFACE_inner(rbasis, ybasis, abasis, aadot, 
-               TSafe(ArrayPool(FlexArrayCache)), 
+               # TSafe(ArrayPool(FlexArrayCache)), 
                Dict())
 
 struct UFACE{NZ, INNER, PAIR}
@@ -30,139 +30,17 @@ struct UFACE{NZ, INNER, PAIR}
    pairpot::PAIR
    E0s::Dict{Int, Float64}
    # ---------- 
-   pool::TSafe{ArrayPool{FlexArrayCache}}
+   # pool::TSafe{ArrayPool{FlexArrayCache}}
    meta::Dict
 end
 
 UFACE(_i2z, ace_inner, pairpot, E0s) = 
       UFACE(_i2z, ace_inner, pairpot, E0s,
-            TSafe(ArrayPool(FlexArrayCache)), 
+            # TSafe(ArrayPool(FlexArrayCache)), 
             Dict())
 
 
 
-function ACEbase.evaluate(ace::UFACE, Rs, Zs, zi) 
-   i_zi = _z2i(ace, zi)
-   ace_inner = ace.ace_inner[i_zi]
-   Ei = ( evaluate(ace_inner, Rs, Zs) + 
-          evaluate(ace.pairpot, Rs, Zs, zi) + 
-          ace.E0s[zi] )
-end
-
-# --------------------------------------------------------
-# UF_ACE evaluation code. 
-
-function ACEbase.evaluate(ace::UFACE_inner, Rs, Zs)
-   TF = eltype(eltype(Rs))
-   rbasis = ace.rbasis 
-   NZ = length(rbasis._i2z)
-   
-   # embeddings    
-   # element embedding 
-   Ez = embed_z(ace, Rs, Zs)
-   # radial embedding 
-   Rn = evaluate(ace, rbasis, Rs, Zs)
-   # angular embedding 
-   Zlm = evaluate_ylm(ace, Rs)
-   
-   # pooling 
-   A = ace.abasis((unwrap(Ez), unwrap(Rn), unwrap(Zlm)))
-
-   # n correlations
-   φ = ace.aadot(A)
-
-   # release the borrowed arrays 
-   release!(Zlm)
-   release!(Rn)
-   release!(Ez)
-   release!(A)
-
-   return φ
-end
-
-function evaluate_ed(ace::UFACE, Rs, Zs, z0)
-   ∇φ = acquire!(ace.pool, :out_dEs, (length(Rs),), eltype(Rs))
-   return evaluate_ed!(∇φ, ace, Rs, Zs, z0)
-end
-
-function evaluate_ed!(∇φ, ace::UFACE, Rs, Zs, z0)
-   i_z0 = _z2i(ace, z0)
-   ace_inner = ace.ace_inner[i_z0]
-   φ, _ = evaluate_ed!(∇φ, ace_inner, Rs, Zs)
-   add_evaluate_d!(∇φ, ace.pairpot, Rs, Zs, z0)
-   φ += ace.E0s[z0] + evaluate(ace.pairpot, Rs, Zs, z0)
-   return φ, ∇φ
-end
-
-
-function ACEbase.evaluate_ed!(∇φ, ace::UFACE_inner, Rs, Zs)
-   TF = eltype(eltype(Rs))
-   rbasis = ace.rbasis 
-   NZ = length(rbasis._i2z)
-
-   # embeddings    
-   # element embedding  (there is no gradient)
-   Ez = embed_z(ace, Rs, Zs)
-
-   # radial embedding 
-   Rn, dRn = evaluate_ed(ace, rbasis, Rs, Zs)
-
-   # angular embedding 
-   Zlm, dZlm = evaluate_ylm_ed(ace, Rs)
-
-   # pooling 
-   A = ace.abasis((Ez, Rn, Zlm))
-
-   # n correlations - compute with gradient, do it in-place 
-   ∂φ_∂A = acquire!(ace.pool, :∂A, size(A), TF)
-   φ = evaluate_and_gradient!(∂φ_∂A, ace.aadot, A)
-   
-   # backprop through A  =>  this part could be done more nicely I think
-   ∂φ_∂Ez = BlackHole(TF) 
-   # ∂φ_∂Ez = zeros(TF, size(Ez))
-   ∂φ_∂Rn = acquire!(ace.pool, :∂Rn, size(Rn), TF)
-   ∂φ_∂Zlm = acquire!(ace.pool, :∂Zlm, size(Zlm), TF)
-   fill!(∂φ_∂Rn, zero(TF))
-   fill!(∂φ_∂Zlm, zero(TF))
-   P4ML._pullback_evaluate!((∂φ_∂Ez, unwrap(∂φ_∂Rn), unwrap(∂φ_∂Zlm)), 
-                             unwrap(∂φ_∂A), 
-                             ace.abasis, 
-                             (unwrap(Ez), unwrap(Rn), unwrap(Zlm)); 
-                             sizecheck=false)
-
-   # backprop through the embeddings 
-   # depending on whether there is a bottleneck here, this can be 
-   # potentially implemented more efficiently without needing writing/reading 
-   # (to be investigated where the bottleneck is)
-   
-   # we just ignore Ez (hence the black hole)
-
-   # backprop through Rn 
-   # We already computed the gradients in the forward pass
-   fill!(∇φ, zero(SVector{3, TF}))
-   @inbounds for n = 1:size(Rn, 2)
-      @simd ivdep for j = 1:length(Rs)
-         ∇φ[j] += ∂φ_∂Rn[j, n] * dRn[j, n]
-      end
-   end
-
-   # ... and Ylm 
-   @inbounds for i_lm = 1:size(Zlm, 2)
-      @simd ivdep for j = 1:length(Rs)
-         ∇φ[j] += ∂φ_∂Zlm[j, i_lm] * dZlm[j, i_lm]
-      end
-   end
-
-   # release the borrowed arrays 
-   release!(Zlm); release!(dZlm)
-   release!(Rn); release!(dRn)
-   release!(Ez)
-   release!(A)
-   release!(∂φ_∂Rn); release!(∂φ_∂Zlm); release!(∂φ_∂A)
-
-   return φ, ∇φ 
-end
-
 
 # ------------------------------------------------------
 # transformation code : ACE1 -> UF_ACE models 
@@ -196,12 +74,18 @@ end
 
 
 
-function uface_from_ace1_inner(mbpot, iz; n_spl_points = 100)
+function uface_from_ace1_inner(mbpot, iz; 
+                           n_spl_points = 100, 
+                           aa_static = :auto)
    b1p = mbpot.pibasis.basis1p
    zlist = b1p.zlist.list 
    z0 = zlist[iz]
    t_zlist = tuple(zlist...)
 
+   if aa_static == :auto 
+      aa_static = (length(mbpot.pibasis.inner[1]) < 1_200)
+   end
+
    # radial embedding
    Rn_basis = mbpot.pibasis.basis1p.J
    LEN_Rn = length(Rn_basis.J)
@@ -258,7 +142,13 @@ function uface_from_ace1_inner(mbpot, iz; n_spl_points = 100)
 
    # AA_basis = P4ML.SparseSymmProd(spec_AA_inds)
    c_r_iz = AA_transform[:T]' * mbpot.coeffs[iz]
-   aadot = generate_AA_dot(spec_AA_inds, c_r_iz)
+
+   if aa_static
+      aadot = generate_AA_dot(spec_AA_inds, c_r_iz)
+   else 
+      aabasis = P4ML.SparseSymmProd(spec_AA_inds)
+      aadot = AADot(c_r_iz, aabasis)
+   end 
 
    return UFACE_inner(rbasis_new, rYlm_basis_sc, A_basis, aadot)
 end
diff --git a/src/uface_eval.jl b/src/uface_eval.jl
new file mode 100644
index 0000000..19f10c9
--- /dev/null
+++ b/src/uface_eval.jl
@@ -0,0 +1,123 @@
+
+function ACEbase.evaluate(ace::UFACE, Rs, Zs, zi) 
+   i_zi = _z2i(ace, zi)
+   ace_inner = ace.ace_inner[i_zi]
+   Ei = ( evaluate(ace_inner, Rs, Zs) + 
+          evaluate(ace.pairpot, Rs, Zs, zi) + 
+          ace.E0s[zi] )
+end
+
+# --------------------------------------------------------
+# UF_ACE evaluation code. 
+
+function ACEbase.evaluate(ace::UFACE_inner, Rs, Zs)
+   TF = eltype(eltype(Rs))
+   rbasis = ace.rbasis 
+   NZ = length(rbasis._i2z)
+  
+   @no_escape begin 
+   
+   # embeddings    
+   # element embedding 
+   Ez = @withalloc embed_z!(ace, Rs, Zs)
+   # radial embedding 
+   Rn = @withalloc evaluate!(rbasis, Rs, Zs)
+   # angular embedding 
+   Zlm = @withalloc evaluate_ylm!(ace, Rs)
+   
+   # pooling 
+   A = ace.abasis((Ez, Rn, Zlm))
+
+   # n correlations
+   φ = ace.aadot(A)
+
+   # release the borrowed arrays 
+   release!(A)
+
+   end
+
+   return φ
+end
+
+function evaluate_ed(ace::UFACE, Rs, Zs, z0)
+   ∇φ = acquire!(ace.pool, :out_dEs, (length(Rs),), eltype(Rs))
+   return evaluate_ed!(∇φ, ace, Rs, Zs, z0)
+end
+
+function evaluate_ed!(∇φ, ace::UFACE, Rs, Zs, z0)
+   i_z0 = _z2i(ace, z0)
+   ace_inner = ace.ace_inner[i_z0]
+   φ, _ = evaluate_ed!(∇φ, ace_inner, Rs, Zs)
+   add_evaluate_d!(∇φ, ace.pairpot, Rs, Zs, z0)
+   φ += ace.E0s[z0] + evaluate(ace.pairpot, Rs, Zs, z0)
+   return φ, ∇φ
+end
+
+
+function ACEbase.evaluate_ed!(∇φ, ace::UFACE_inner, Rs, Zs)
+   TF = eltype(eltype(Rs))
+   rbasis = ace.rbasis 
+   NZ = length(rbasis._i2z)
+
+   # embeddings    
+   # element embedding  (there is no gradient)
+   Ez = embed_z(ace, Rs, Zs)
+
+   # radial embedding 
+   Rn, dRn = evaluate_ed(ace, rbasis, Rs, Zs)
+
+   # angular embedding 
+   Zlm, dZlm = evaluate_ylm_ed(ace, Rs)
+
+   # pooling 
+   A = ace.abasis((Ez, Rn, Zlm))
+
+   # n correlations - compute with gradient, do it in-place 
+   ∂φ_∂A = acquire!(ace.pool, :∂A, size(A), TF)
+   φ = evaluate_and_gradient!(∂φ_∂A, ace.aadot, A)
+   
+   # backprop through A  =>  this part could be done more nicely I think
+   ∂φ_∂Ez = BlackHole(TF) 
+   # ∂φ_∂Ez = zeros(TF, size(Ez))
+   ∂φ_∂Rn = acquire!(ace.pool, :∂Rn, size(Rn), TF)
+   ∂φ_∂Zlm = acquire!(ace.pool, :∂Zlm, size(Zlm), TF)
+   fill!(∂φ_∂Rn, zero(TF))
+   fill!(∂φ_∂Zlm, zero(TF))
+   P4ML._pullback_evaluate!((∂φ_∂Ez, unwrap(∂φ_∂Rn), unwrap(∂φ_∂Zlm)), 
+                             unwrap(∂φ_∂A), 
+                             ace.abasis, 
+                             (unwrap(Ez), unwrap(Rn), unwrap(Zlm)); 
+                             sizecheck=false)
+
+   # backprop through the embeddings 
+   # depending on whether there is a bottleneck here, this can be 
+   # potentially implemented more efficiently without needing writing/reading 
+   # (to be investigated where the bottleneck is)
+   
+   # we just ignore Ez (hence the black hole)
+
+   # backprop through Rn 
+   # We already computed the gradients in the forward pass
+   fill!(∇φ, zero(SVector{3, TF}))
+   @inbounds for n = 1:size(Rn, 2)
+      @simd ivdep for j = 1:length(Rs)
+         ∇φ[j] += ∂φ_∂Rn[j, n] * dRn[j, n]
+      end
+   end
+
+   # ... and Ylm 
+   @inbounds for i_lm = 1:size(Zlm, 2)
+      @simd ivdep for j = 1:length(Rs)
+         ∇φ[j] += ∂φ_∂Zlm[j, i_lm] * dZlm[j, i_lm]
+      end
+   end
+
+   # release the borrowed arrays 
+   release!(Zlm); release!(dZlm)
+   release!(Rn); release!(dRn)
+   release!(Ez)
+   release!(A)
+   release!(∂φ_∂Rn); release!(∂φ_∂Zlm); release!(∂φ_∂A)
+
+   return φ, ∇φ 
+end
diff --git a/test/test_import_ace1.jl b/test/test_import_ace1.jl
index 945a097..ed2967a 100644
--- a/test/test_import_ace1.jl
+++ b/test/test_import_ace1.jl
@@ -15,13 +15,17 @@ end
 
 elements = [:Si,:O]
 
+# use totaldegree = 10, 12 for static aa 
+#     totaldegree > 15 for dynamic aa
+
 model = acemodel(; elements = elements, 
-                   order = 3, totaldegree = 10, 
+                   order = 3, totaldegree = 15, 
                    Eref = Dict(:Si => -1.234, :O => -0.432))
 pot = model.potential
 pairpot = pot.components[1]
 mbpot = pot.components[2]
 pot1 = pot.components[3]
+@show length(mbpot.pibasis.inner[1])
 
 ##
 # normalize the potential a bit so that all contributions are O(1) 

From 355378f9b9c2fe3bb8e81404e6590b4200ed11d8 Mon Sep 17 00:00:00 2001
From: ACEsuit <christohortner@gmail.com>
Date: Wed, 19 Jun 2024 08:38:47 -0700
Subject: [PATCH 2/5] test both static and dynamic

---
 src/auxiliary.jl         | 19 +++++++++++++++++
 src/ncorr.jl             |  4 +++-
 src/splines.jl           |  7 ++++++-
 src/uface_eval.jl        | 45 +++++++++++++++++++---------------------
 test/test_import_ace1.jl | 41 ++++++++++++++++++++++++++++--------
 5 files changed, 81 insertions(+), 35 deletions(-)

diff --git a/src/auxiliary.jl b/src/auxiliary.jl
index c1a0e88..c990b21 100644
--- a/src/auxiliary.jl
+++ b/src/auxiliary.jl
@@ -60,6 +60,18 @@ function evaluate_ylm_ed(ace, Rs)
    return Zlm, dZlm
 end
 
+function evaluate_ylm_ed!(Zlm, dZlm, ace, Rs)
+   compute_with_gradients!(Zlm, dZlm, ace.ybasis, Rs)
+   return Zlm, dZlm
+end
+
+function whatalloc(::typeof(evaluate_ylm_ed!), ace, Rs)
+   TF = eltype(eltype(Rs))
+   return (TF, length(Rs), _len_ylm(ace.ybasis)), 
+          (SVector{3, TF}, length(Rs), _len_ylm(ace.ybasis))
+end
+
+
 
 # ------------------------------ 
 #  element embedding 
@@ -107,3 +119,10 @@ function (aadot::AADot)(A)
    return out 
 end
 
+function eval_and_grad!(∇φ_A, aadot::AADot, A)
+   φ = aadot(A)
+   ∇φ_A_1 = P4ML._pb_evaluate(aadot.aabasis, aadot.cc, A)
+   ∇φ_A .= unwrap(∇φ_A_1)
+   release!(∇φ_A_1)
+   return φ
+end
\ No newline at end of file
diff --git a/src/ncorr.jl b/src/ncorr.jl
index 2707c79..5c82929 100644
--- a/src/ncorr.jl
+++ b/src/ncorr.jl
@@ -28,4 +28,6 @@ function generate_AA_dot(spec, c)
    return Polynomial(dynamic_poly)
 end
 
-
+function eval_and_grad!(∇_A, aadot, A)
+   return evaluate_and_gradient!(∇_A, aadot, A)
+end
diff --git a/src/splines.jl b/src/splines.jl
index dc52d7f..b6f1ff5 100644
--- a/src/splines.jl
+++ b/src/splines.jl
@@ -65,6 +65,11 @@ function evaluate_ed(ace, rbasis, Rs, Zs)
    return Rn, dRn 
 end
 
+function whatalloc(::typeof(evaluate_ed!), basis::SplineRadialsZ, Rs, Zs)
+   TF = eltype(eltype(Rs))
+   return (TF, length(Rs), length(basis)), 
+          (SVector{3, TF}, length(Rs), length(basis))
+end
 
 function evaluate_ed!(Rn, dRn, basis::SplineRadialsZ, Rs, Zs)
    nX = length(Rs)
@@ -87,6 +92,6 @@ function evaluate_ed!(Rn, dRn, basis::SplineRadialsZ, Rs, Zs)
          dRn[ij, n] = g[n] * 𝐫̂ij
       end
    end
-   return nothing 
+   return Rn, dRn 
 end
 
diff --git a/src/uface_eval.jl b/src/uface_eval.jl
index 19f10c9..8309cc8 100644
--- a/src/uface_eval.jl
+++ b/src/uface_eval.jl
@@ -26,28 +26,27 @@ function ACEbase.evaluate(ace::UFACE_inner, Rs, Zs)
    Zlm = @withalloc evaluate_ylm!(ace, Rs)
    
    # pooling 
-   A = ace.abasis((Ez, Rn, Zlm))
+   TA = promote_type(eltype(Ez), eltype(Rn), eltype(Zlm))
+   A = @alloc(TA, length(ace.abasis))
+   P4ML.evaluate!(A, ace.abasis, (Ez, Rn, Zlm))
 
    # n correlations
    φ = ace.aadot(A)
 
-   # release the borrowed arrays 
-   release!(A)
-
    end
 
    return φ
 end
 
 function evaluate_ed(ace::UFACE, Rs, Zs, z0)
-   ∇φ = acquire!(ace.pool, :out_dEs, (length(Rs),), eltype(Rs))
+   ∇φ = zeros(eltype(Rs), length(Rs))
    return evaluate_ed!(∇φ, ace, Rs, Zs, z0)
 end
 
 function evaluate_ed!(∇φ, ace::UFACE, Rs, Zs, z0)
    i_z0 = _z2i(ace, z0)
    ace_inner = ace.ace_inner[i_z0]
-   φ, _ = evaluate_ed!(∇φ, ace_inner, Rs, Zs)
+   φ, _∇φ = evaluate_ed!(∇φ, ace_inner, Rs, Zs)
    add_evaluate_d!(∇φ, ace.pairpot, Rs, Zs, z0)
    φ += ace.E0s[z0] + evaluate(ace.pairpot, Rs, Zs, z0)
    return φ, ∇φ
@@ -59,34 +58,37 @@ function ACEbase.evaluate_ed!(∇φ, ace::UFACE_inner, Rs, Zs)
    rbasis = ace.rbasis 
    NZ = length(rbasis._i2z)
 
+   @no_escape begin
+
    # embeddings    
    # element embedding  (there is no gradient)
-   Ez = embed_z(ace, Rs, Zs)
+   Ez = @withalloc embed_z!(ace, Rs, Zs)
 
    # radial embedding 
-   Rn, dRn = evaluate_ed(ace, rbasis, Rs, Zs)
+   Rn, dRn = @withalloc evaluate_ed!(rbasis, Rs, Zs)
 
    # angular embedding 
-   Zlm, dZlm = evaluate_ylm_ed(ace, Rs)
+   Zlm, dZlm = @withalloc evaluate_ylm_ed!(ace, Rs)
 
    # pooling 
-   A = ace.abasis((Ez, Rn, Zlm))
+   TA = promote_type(eltype(Ez), eltype(Rn), eltype(Zlm))
+   A = @alloc(TA, length(ace.abasis))
+   P4ML.evaluate!(A, ace.abasis, (Ez, Rn, Zlm))
 
    # n correlations - compute with gradient, do it in-place 
-   ∂φ_∂A = acquire!(ace.pool, :∂A, size(A), TF)
-   φ = evaluate_and_gradient!(∂φ_∂A, ace.aadot, A)
+   ∂φ_∂A = @alloc(TA, length(A))
+   φ = eval_and_grad!(∂φ_∂A, ace.aadot, A)
    
    # backprop through A  =>  this part could be done more nicely I think
    ∂φ_∂Ez = BlackHole(TF) 
    # ∂φ_∂Ez = zeros(TF, size(Ez))
-   ∂φ_∂Rn = acquire!(ace.pool, :∂Rn, size(Rn), TF)
-   ∂φ_∂Zlm = acquire!(ace.pool, :∂Zlm, size(Zlm), TF)
+   ∂φ_∂Rn = @alloc(TF, size(Rn)...)
+   ∂φ_∂Zlm = @alloc(TF, size(Zlm)...)
    fill!(∂φ_∂Rn, zero(TF))
    fill!(∂φ_∂Zlm, zero(TF))
-   P4ML._pullback_evaluate!((∂φ_∂Ez, unwrap(∂φ_∂Rn), unwrap(∂φ_∂Zlm)), 
-                             unwrap(∂φ_∂A), 
-                             ace.abasis, 
-                             (unwrap(Ez), unwrap(Rn), unwrap(Zlm)); 
+   P4ML._pullback_evaluate!((∂φ_∂Ez, ∂φ_∂Rn, ∂φ_∂Zlm), 
+                             ∂φ_∂A, 
+                             ace.abasis, (Ez, Rn, Zlm); 
                              sizecheck=false)
 
    # backprop through the embeddings 
@@ -112,12 +114,7 @@ function ACEbase.evaluate_ed!(∇φ, ace::UFACE_inner, Rs, Zs)
       end
    end
 
-   # release the borrowed arrays 
-   release!(Zlm); release!(dZlm)
-   release!(Rn); release!(dRn)
-   release!(Ez)
-   release!(A)
-   release!(∂φ_∂Rn); release!(∂φ_∂Zlm); release!(∂φ_∂A)
+   end # no_escape
 
    return φ, ∇φ 
 end
diff --git a/test/test_import_ace1.jl b/test/test_import_ace1.jl
index ed2967a..fec098b 100644
--- a/test/test_import_ace1.jl
+++ b/test/test_import_ace1.jl
@@ -18,19 +18,38 @@ elements = [:Si,:O]
 # use totaldegree = 10, 12 for static aa 
 #     totaldegree > 15 for dynamic aa
 
-model = acemodel(; elements = elements, 
+@info("Dynamic Model: totaldegree = 15")
+model_dyn = acemodel(; elements = elements, 
                    order = 3, totaldegree = 15, 
                    Eref = Dict(:Si => -1.234, :O => -0.432))
-pot = model.potential
-pairpot = pot.components[1]
-mbpot = pot.components[2]
-pot1 = pot.components[3]
-@show length(mbpot.pibasis.inner[1])
+pot_dyn = model_dyn.potential
+pairpot_dyn = pot_dyn.components[1]
+mbpot_dyn = pot_dyn.components[2]
+pot1_dyn = pot_dyn.components[3]
+@show length(mbpot_dyn.pibasis.inner[1])
+
+
+@info("Static Model: totaldegree = 10")
+model_st = acemodel(; elements = elements, 
+                   order = 3, totaldegree = 10, 
+                   Eref = Dict(:Si => -1.234, :O => -0.432))
+pot_st = model_st.potential
+pairpot_st = pot_st.components[1]
+mbpot_st = pot_st.components[2]
+pot1_st = pot_st.components[3]
+@show length(mbpot_st.pibasis.inner[1])
+
+## 
+
+MODELS = [ (model_dyn, pot_dyn, pairpot_dyn, mbpot_dyn, pot1_dyn), 
+           (model_st,  pot_st,  pairpot_st,  mbpot_st,  pot1_st) ]
 
 ##
 # normalize the potential a bit so that all contributions are O(1) 
 # pot1 will be O(1) by construction  
 
+for (model, pot, pairpot, mbpot, pot1) in MODELS
+
 Nsample = 10_000
 pairpot.coeffs[:] = randn(length(pairpot.coeffs)) 
 a = sum( evaluate(pairpot, rand_env()...) for _ = 1:Nsample ) / Nsample
@@ -45,13 +64,13 @@ sum( evaluate(mbpot_, rand_env()...) for _ = 1:Nsample ) / Nsample
 
 pot.components[2] = mbpot = mbpot_
 
-
 # convert to UFACE format 
 uf_ace = UltraFastACE.uface_from_ace1(pot; n_spl_points = 10_000)
 
-## ------------------------------------
+# ------------------------------------
 
 @info("Test Consistency of ACE1 with UFACE")
+
 for ntest = 1:50 
    Rs, Zs, z0 = rand_env()
 
@@ -73,8 +92,9 @@ end
 println()
 
 
-## check gradient
+# check gradient ------------------------------
 
+@info("test gradients")
 for ntest = 1:30 
    Rs, Zs, z0 = rand_env()
    v1, dv1 = evaluate_ed(uf_ace, Rs, Zs, z0)
@@ -84,4 +104,7 @@ for ntest = 1:30
    dF = t -> (dV = evaluate_ed(uf_ace, Rs + t * U, Zs, z0)[2]; 
             sum( dot(dv, u) for (dv, u) in zip(dV, U) ) )
    print_tf(@test ACEbase.Testing.fdtest(F, dF, 0.0; verbose=false))
+end
+println()
+
 end
\ No newline at end of file

From f7313f9883a55403f3d2d4582148427fe969dee6 Mon Sep 17 00:00:00 2001
From: ACEsuit <christohortner@gmail.com>
Date: Wed, 19 Jun 2024 08:44:16 -0700
Subject: [PATCH 3/5] fixing CI

---
 .github/workflows/CI.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 57c563d..84d85a6 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -19,8 +19,8 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.0'
           - '1.9'
+          - '1.10'
           - 'nightly'
         os:
           - ubuntu-latest
@@ -33,6 +33,10 @@ jobs:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
       - uses: julia-actions/cache@v1
+      - run: |
+             using Pkg
+             Pkg.pkg"registry add https://github.com/ACEsuit/ACEregistry"
+           shell: bash -c "julia --color=yes {0}"
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
   docs:

From 7baf4777d033785f98050a1cb236434549ac9a45 Mon Sep 17 00:00:00 2001
From: ACEsuit <christohortner@gmail.com>
Date: Wed, 19 Jun 2024 08:47:12 -0700
Subject: [PATCH 4/5] fixing CI

---
 .github/workflows/CI.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 84d85a6..0d2a76a 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -34,9 +34,9 @@ jobs:
           arch: ${{ matrix.arch }}
       - uses: julia-actions/cache@v1
       - run: |
-             using Pkg
-             Pkg.pkg"registry add https://github.com/ACEsuit/ACEregistry"
-           shell: bash -c "julia --color=yes {0}"
+          using Pkg
+          Pkg.pkg"registry add https://github.com/ACEsuit/ACEregistry"
+        shell: bash -c "julia --color=yes {0}"
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
   docs:

From 398782e3a5d33bb92714f5a550bf16d4cf894d26 Mon Sep 17 00:00:00 2001
From: ACEsuit <christohortner@gmail.com>
Date: Wed, 19 Jun 2024 08:53:19 -0700
Subject: [PATCH 5/5] add a seed to avoid random fail

---
 Project.toml             | 1 +
 test/test_import_ace1.jl | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 7e0b817..909b063 100644
--- a/Project.toml
+++ b/Project.toml
@@ -21,6 +21,7 @@ ObjectPools = "658cac36-ff0f-48ad-967c-110375d98c9d"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
 Polynomials4ML = "03c4bcba-a943-47e9-bfa1-b1661fc2974f"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SpheriCart = "5caf2b29-02d9-47a3-9434-5931c85ba645"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
diff --git a/test/test_import_ace1.jl b/test/test_import_ace1.jl
index fec098b..348d5f6 100644
--- a/test/test_import_ace1.jl
+++ b/test/test_import_ace1.jl
@@ -1,6 +1,6 @@
 
 using ACE1, ACE1x, JuLIP, StaticArrays, BenchmarkTools, 
-      LinearAlgebra, UltraFastACE, Test, ACEbase
+      LinearAlgebra, UltraFastACE, Test, ACEbase, Random 
 using ACEbase: evaluate, evaluate_ed
 using ACEbase.Testing: print_tf 
 
@@ -11,6 +11,8 @@ function rand_env(; Nat = rand(4:12), r0 = 0.8 * rnn(:Si), r1 = 2.0 * rnn(:Si))
       return Rs, Zs, z0 
 end
 
+Random.seed!(1234)
+
 ##
 
 elements = [:Si,:O]