diff --git a/Project.toml b/Project.toml index 0e21464..7e0b817 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.0.2" ACE1 = "e3f9bc04-086e-409a-ba78-e9769fe067bb" ACE1x = "5cc4c08c-8782-4a30-af6d-550b302e9707" ACEbase = "14bae519-eb20-449c-a949-9c58ed33163e" +Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2" ChunkSplitters = "ae650224-84b6-46f8-82ea-d812ca08434e" DynamicPolynomials = "7c1d4256-1411-5781-91ec-d7bc3513ac07" @@ -25,6 +26,7 @@ SpheriCart = "5caf2b29-02d9-47a3-9434-5931c85ba645" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StaticPolynomials = "62e018b1-6e46-5407-a5a7-97d4fbcae734" StrideArrays = "d1fa6d79-ef01-42a6-86c9-f7c551f8593b" +WithAlloc = "fb1aa66a-603c-4c1d-9bc4-66947c7b08dd" [compat] ChunkSplitters = "2" @@ -32,7 +34,7 @@ Folds = "0.2" LoopVectorization = "0.12" NamedTupleTools = "0.14" ObjectPools = "0.3" -Polynomials4ML = "0.2.7" +Polynomials4ML = "0.2.11" julia = "1" [extras] diff --git a/profile/profile_evaluate.jl b/profile/profile_evaluate.jl index f84d0b6..244783e 100644 --- a/profile/profile_evaluate.jl +++ b/profile/profile_evaluate.jl @@ -14,7 +14,7 @@ pot = model.potential mbpot = pot.components[2] # convert to UFACE format -uf_ace = UltraFastACE.uface_from_ace1(mbpot; n_spl_points = 100) +uf_ace = UltraFastACE.uface_from_ace1(pot; n_spl_points = 100) ## ------------------------------------ diff --git a/src/UltraFastACE.jl b/src/UltraFastACE.jl index 0fdb99c..7e750a1 100644 --- a/src/UltraFastACE.jl +++ b/src/UltraFastACE.jl @@ -4,6 +4,9 @@ import ACEbase import ACEbase: evaluate, evaluate!, evaluate_ed, evaluate_ed! +using Bumper, WithAlloc, StrideArrays +import WithAlloc: whatalloc + _i2z(obj, i::Integer) = obj._i2z[i] function _z2i(obj, Z) @@ -28,6 +31,7 @@ include("auxiliary.jl") include("pair.jl") include("uface.jl") +include("uface_eval.jl") include("julip_calculator.jl") diff --git a/src/auxiliary.jl b/src/auxiliary.jl index 35d99c5..c1a0e88 100644 --- a/src/auxiliary.jl +++ b/src/auxiliary.jl @@ -1,3 +1,5 @@ + + # # some auxiliary functions for UF_ACE evaluation # @@ -35,11 +37,21 @@ _len_ylm(ybasis) = (_get_L(ybasis) + 1)^2 function evaluate_ylm(ace, Rs) TF = eltype(eltype(Rs)) - Zlm = acquire!(ace.pool, :Zlm, (length(Rs), _len_ylm(ace.ybasis)), TF) + Zlm = zeros(TF, length(Rs), _len_ylm(ace.ybasis)) + evaluate_ylm!(Zlm, ace, Rs) + return Zlm +end + +function evaluate_ylm!(Zlm, ace, Rs) compute!(Zlm, ace.ybasis, Rs) return Zlm end +function whatalloc(::typeof(evaluate_ylm!), ace, Rs) + TF = eltype(eltype(Rs)) + return (TF, length(Rs), _len_ylm(ace.ybasis)) +end + function evaluate_ylm_ed(ace, Rs) TF = eltype(eltype(Rs)) Zlm = acquire!(ace.pool, :Zlm, (length(Rs), _len_ylm(ace.ybasis)), TF) @@ -52,11 +64,15 @@ end # ------------------------------ # element embedding - - function embed_z(ace, Rs, Zs) TF = eltype(eltype(Rs)) - Ez = acquire!(ace.pool, :Ez, (length(Zs), length(ace.rbasis)), TF) + # Ez = acquire!(ace.pool, :Ez, (length(Zs), length(ace.rbasis)), TF) + Ez = zeros(TF, length(Zs), length(ace.rbasis)) + return embed_z!(Ez, ace, Rs, Zs) +end + + +function embed_z!(Ez, ace, Rs, Zs) fill!(Ez, 0) for (j, z) in enumerate(Zs) iz = _z2i(ace.rbasis, z) @@ -66,3 +82,28 @@ function embed_z(ace, Rs, Zs) end +function whatalloc(::typeof(embed_z!), ace, Rs, Zs) + TF = eltype(eltype(Rs)) + return (TF, length(Zs), length(ace.rbasis)) +end + + +# ------------------------------ +# aadot via P4ML + +using LinearAlgebra: dot + +struct AADot{T, TAA} + cc::Vector{T} + aabasis::TAA +end + +function (aadot::AADot)(A) + @no_escape begin + AA = @alloc(eltype(A), length(aadot.aabasis)) + P4ML.evaluate!(AA, aadot.aabasis, A) + out = dot(aadot.cc, AA) + end + return out +end + diff --git a/src/splines.jl b/src/splines.jl index 61b37b2..dc52d7f 100644 --- a/src/splines.jl +++ b/src/splines.jl @@ -25,11 +25,17 @@ end function evaluate(ace, basis::SplineRadialsZ, Rs::AbstractVector{<: SVector}, Zs::AbstractVector) TF = eltype(eltype(Rs)) - Rn = acquire!(ace.pool, :Rn, (length(Rs), length(basis)), TF) + Rn = zeros(TF, length(Rs), length(basis)) evaluate!(Rn, basis, Rs, Zs) return Rn end +function whatalloc(::typeof(ACEbase.evaluate!), + basis::SplineRadialsZ, Rs, Zs) + TF = eltype(eltype(Rs)) + return (TF, length(Rs), length(basis)) +end + function evaluate!(out, basis::SplineRadialsZ, Rs, Zs) nX = length(Rs) @@ -49,6 +55,8 @@ function evaluate!(out, basis::SplineRadialsZ, Rs, Zs) end + + function evaluate_ed(ace, rbasis, Rs, Zs) TF = eltype(eltype(Rs)) Rn = acquire!(ace.pool, :Rn, (length(Rs), length(rbasis)), TF) diff --git a/src/uface.jl b/src/uface.jl index d8dfd89..5b8c731 100644 --- a/src/uface.jl +++ b/src/uface.jl @@ -15,13 +15,13 @@ struct UFACE_inner{TR, TY, TA, TAA} abasis::TA aadot::TAA # ---------- admin and meta-data - pool::TSafe{ArrayPool{FlexArrayCache}} + # pool::TSafe{ArrayPool{FlexArrayCache}} meta::Dict end UFACE_inner(rbasis, ybasis, abasis, aadot) = UFACE_inner(rbasis, ybasis, abasis, aadot, - TSafe(ArrayPool(FlexArrayCache)), + # TSafe(ArrayPool(FlexArrayCache)), Dict()) struct UFACE{NZ, INNER, PAIR} @@ -30,139 +30,17 @@ struct UFACE{NZ, INNER, PAIR} pairpot::PAIR E0s::Dict{Int, Float64} # ---------- - pool::TSafe{ArrayPool{FlexArrayCache}} + # pool::TSafe{ArrayPool{FlexArrayCache}} meta::Dict end UFACE(_i2z, ace_inner, pairpot, E0s) = UFACE(_i2z, ace_inner, pairpot, E0s, - TSafe(ArrayPool(FlexArrayCache)), + # TSafe(ArrayPool(FlexArrayCache)), Dict()) -function ACEbase.evaluate(ace::UFACE, Rs, Zs, zi) - i_zi = _z2i(ace, zi) - ace_inner = ace.ace_inner[i_zi] - Ei = ( evaluate(ace_inner, Rs, Zs) + - evaluate(ace.pairpot, Rs, Zs, zi) + - ace.E0s[zi] ) -end - -# -------------------------------------------------------- -# UF_ACE evaluation code. - -function ACEbase.evaluate(ace::UFACE_inner, Rs, Zs) - TF = eltype(eltype(Rs)) - rbasis = ace.rbasis - NZ = length(rbasis._i2z) - - # embeddings - # element embedding - Ez = embed_z(ace, Rs, Zs) - # radial embedding - Rn = evaluate(ace, rbasis, Rs, Zs) - # angular embedding - Zlm = evaluate_ylm(ace, Rs) - - # pooling - A = ace.abasis((unwrap(Ez), unwrap(Rn), unwrap(Zlm))) - - # n correlations - φ = ace.aadot(A) - - # release the borrowed arrays - release!(Zlm) - release!(Rn) - release!(Ez) - release!(A) - - return φ -end - -function evaluate_ed(ace::UFACE, Rs, Zs, z0) - ∇φ = acquire!(ace.pool, :out_dEs, (length(Rs),), eltype(Rs)) - return evaluate_ed!(∇φ, ace, Rs, Zs, z0) -end - -function evaluate_ed!(∇φ, ace::UFACE, Rs, Zs, z0) - i_z0 = _z2i(ace, z0) - ace_inner = ace.ace_inner[i_z0] - φ, _ = evaluate_ed!(∇φ, ace_inner, Rs, Zs) - add_evaluate_d!(∇φ, ace.pairpot, Rs, Zs, z0) - φ += ace.E0s[z0] + evaluate(ace.pairpot, Rs, Zs, z0) - return φ, ∇φ -end - - -function ACEbase.evaluate_ed!(∇φ, ace::UFACE_inner, Rs, Zs) - TF = eltype(eltype(Rs)) - rbasis = ace.rbasis - NZ = length(rbasis._i2z) - - # embeddings - # element embedding (there is no gradient) - Ez = embed_z(ace, Rs, Zs) - - # radial embedding - Rn, dRn = evaluate_ed(ace, rbasis, Rs, Zs) - - # angular embedding - Zlm, dZlm = evaluate_ylm_ed(ace, Rs) - - # pooling - A = ace.abasis((Ez, Rn, Zlm)) - - # n correlations - compute with gradient, do it in-place - ∂φ_∂A = acquire!(ace.pool, :∂A, size(A), TF) - φ = evaluate_and_gradient!(∂φ_∂A, ace.aadot, A) - - # backprop through A => this part could be done more nicely I think - ∂φ_∂Ez = BlackHole(TF) - # ∂φ_∂Ez = zeros(TF, size(Ez)) - ∂φ_∂Rn = acquire!(ace.pool, :∂Rn, size(Rn), TF) - ∂φ_∂Zlm = acquire!(ace.pool, :∂Zlm, size(Zlm), TF) - fill!(∂φ_∂Rn, zero(TF)) - fill!(∂φ_∂Zlm, zero(TF)) - P4ML._pullback_evaluate!((∂φ_∂Ez, unwrap(∂φ_∂Rn), unwrap(∂φ_∂Zlm)), - unwrap(∂φ_∂A), - ace.abasis, - (unwrap(Ez), unwrap(Rn), unwrap(Zlm)); - sizecheck=false) - - # backprop through the embeddings - # depending on whether there is a bottleneck here, this can be - # potentially implemented more efficiently without needing writing/reading - # (to be investigated where the bottleneck is) - - # we just ignore Ez (hence the black hole) - - # backprop through Rn - # We already computed the gradients in the forward pass - fill!(∇φ, zero(SVector{3, TF})) - @inbounds for n = 1:size(Rn, 2) - @simd ivdep for j = 1:length(Rs) - ∇φ[j] += ∂φ_∂Rn[j, n] * dRn[j, n] - end - end - - # ... and Ylm - @inbounds for i_lm = 1:size(Zlm, 2) - @simd ivdep for j = 1:length(Rs) - ∇φ[j] += ∂φ_∂Zlm[j, i_lm] * dZlm[j, i_lm] - end - end - - # release the borrowed arrays - release!(Zlm); release!(dZlm) - release!(Rn); release!(dRn) - release!(Ez) - release!(A) - release!(∂φ_∂Rn); release!(∂φ_∂Zlm); release!(∂φ_∂A) - - return φ, ∇φ -end - # ------------------------------------------------------ # transformation code : ACE1 -> UF_ACE models @@ -196,12 +74,18 @@ end -function uface_from_ace1_inner(mbpot, iz; n_spl_points = 100) +function uface_from_ace1_inner(mbpot, iz; + n_spl_points = 100, + aa_static = :auto) b1p = mbpot.pibasis.basis1p zlist = b1p.zlist.list z0 = zlist[iz] t_zlist = tuple(zlist...) + if aa_static == :auto + aa_static = (length(mbpot.pibasis.inner[1]) < 1_200) + end + # radial embedding Rn_basis = mbpot.pibasis.basis1p.J LEN_Rn = length(Rn_basis.J) @@ -258,7 +142,13 @@ function uface_from_ace1_inner(mbpot, iz; n_spl_points = 100) # AA_basis = P4ML.SparseSymmProd(spec_AA_inds) c_r_iz = AA_transform[:T]' * mbpot.coeffs[iz] - aadot = generate_AA_dot(spec_AA_inds, c_r_iz) + + if aa_static + aadot = generate_AA_dot(spec_AA_inds, c_r_iz) + else + aabasis = P4ML.SparseSymmProd(spec_AA_inds) + aadot = AADot(c_r_iz, aabasis) + end return UFACE_inner(rbasis_new, rYlm_basis_sc, A_basis, aadot) end diff --git a/src/uface_eval.jl b/src/uface_eval.jl new file mode 100644 index 0000000..19f10c9 --- /dev/null +++ b/src/uface_eval.jl @@ -0,0 +1,123 @@ + +function ACEbase.evaluate(ace::UFACE, Rs, Zs, zi) + i_zi = _z2i(ace, zi) + ace_inner = ace.ace_inner[i_zi] + Ei = ( evaluate(ace_inner, Rs, Zs) + + evaluate(ace.pairpot, Rs, Zs, zi) + + ace.E0s[zi] ) +end + +# -------------------------------------------------------- +# UF_ACE evaluation code. + +function ACEbase.evaluate(ace::UFACE_inner, Rs, Zs) + TF = eltype(eltype(Rs)) + rbasis = ace.rbasis + NZ = length(rbasis._i2z) + + @no_escape begin + + # embeddings + # element embedding + Ez = @withalloc embed_z!(ace, Rs, Zs) + # radial embedding + Rn = @withalloc evaluate!(rbasis, Rs, Zs) + # angular embedding + Zlm = @withalloc evaluate_ylm!(ace, Rs) + + # pooling + A = ace.abasis((Ez, Rn, Zlm)) + + # n correlations + φ = ace.aadot(A) + + # release the borrowed arrays + release!(A) + + end + + return φ +end + +function evaluate_ed(ace::UFACE, Rs, Zs, z0) + ∇φ = acquire!(ace.pool, :out_dEs, (length(Rs),), eltype(Rs)) + return evaluate_ed!(∇φ, ace, Rs, Zs, z0) +end + +function evaluate_ed!(∇φ, ace::UFACE, Rs, Zs, z0) + i_z0 = _z2i(ace, z0) + ace_inner = ace.ace_inner[i_z0] + φ, _ = evaluate_ed!(∇φ, ace_inner, Rs, Zs) + add_evaluate_d!(∇φ, ace.pairpot, Rs, Zs, z0) + φ += ace.E0s[z0] + evaluate(ace.pairpot, Rs, Zs, z0) + return φ, ∇φ +end + + +function ACEbase.evaluate_ed!(∇φ, ace::UFACE_inner, Rs, Zs) + TF = eltype(eltype(Rs)) + rbasis = ace.rbasis + NZ = length(rbasis._i2z) + + # embeddings + # element embedding (there is no gradient) + Ez = embed_z(ace, Rs, Zs) + + # radial embedding + Rn, dRn = evaluate_ed(ace, rbasis, Rs, Zs) + + # angular embedding + Zlm, dZlm = evaluate_ylm_ed(ace, Rs) + + # pooling + A = ace.abasis((Ez, Rn, Zlm)) + + # n correlations - compute with gradient, do it in-place + ∂φ_∂A = acquire!(ace.pool, :∂A, size(A), TF) + φ = evaluate_and_gradient!(∂φ_∂A, ace.aadot, A) + + # backprop through A => this part could be done more nicely I think + ∂φ_∂Ez = BlackHole(TF) + # ∂φ_∂Ez = zeros(TF, size(Ez)) + ∂φ_∂Rn = acquire!(ace.pool, :∂Rn, size(Rn), TF) + ∂φ_∂Zlm = acquire!(ace.pool, :∂Zlm, size(Zlm), TF) + fill!(∂φ_∂Rn, zero(TF)) + fill!(∂φ_∂Zlm, zero(TF)) + P4ML._pullback_evaluate!((∂φ_∂Ez, unwrap(∂φ_∂Rn), unwrap(∂φ_∂Zlm)), + unwrap(∂φ_∂A), + ace.abasis, + (unwrap(Ez), unwrap(Rn), unwrap(Zlm)); + sizecheck=false) + + # backprop through the embeddings + # depending on whether there is a bottleneck here, this can be + # potentially implemented more efficiently without needing writing/reading + # (to be investigated where the bottleneck is) + + # we just ignore Ez (hence the black hole) + + # backprop through Rn + # We already computed the gradients in the forward pass + fill!(∇φ, zero(SVector{3, TF})) + @inbounds for n = 1:size(Rn, 2) + @simd ivdep for j = 1:length(Rs) + ∇φ[j] += ∂φ_∂Rn[j, n] * dRn[j, n] + end + end + + # ... and Ylm + @inbounds for i_lm = 1:size(Zlm, 2) + @simd ivdep for j = 1:length(Rs) + ∇φ[j] += ∂φ_∂Zlm[j, i_lm] * dZlm[j, i_lm] + end + end + + # release the borrowed arrays + release!(Zlm); release!(dZlm) + release!(Rn); release!(dRn) + release!(Ez) + release!(A) + release!(∂φ_∂Rn); release!(∂φ_∂Zlm); release!(∂φ_∂A) + + return φ, ∇φ +end diff --git a/test/test_import_ace1.jl b/test/test_import_ace1.jl index 945a097..ed2967a 100644 --- a/test/test_import_ace1.jl +++ b/test/test_import_ace1.jl @@ -15,13 +15,17 @@ end elements = [:Si,:O] +# use totaldegree = 10, 12 for static aa +# totaldegree > 15 for dynamic aa + model = acemodel(; elements = elements, - order = 3, totaldegree = 10, + order = 3, totaldegree = 15, Eref = Dict(:Si => -1.234, :O => -0.432)) pot = model.potential pairpot = pot.components[1] mbpot = pot.components[2] pot1 = pot.components[3] +@show length(mbpot.pibasis.inner[1]) ## # normalize the potential a bit so that all contributions are O(1)