diff --git a/benchmarks/benchmark_fourier_transforms.jl b/benchmarks/benchmark_fourier_transforms.jl index 3a978417..840b63f6 100644 --- a/benchmarks/benchmark_fourier_transforms.jl +++ b/benchmarks/benchmark_fourier_transforms.jl @@ -28,10 +28,10 @@ function extract_modes(fft_result, mlow, mhigh, mtheta) for (i, m) in enumerate(mlow:mhigh) if m >= 0 # Positive frequencies - modes[i] = fft_result[m + 1] / mtheta # FFT normalization + modes[i] = fft_result[m+1] / mtheta # FFT normalization else # Negative frequencies (wrap around) - modes[i] = fft_result[mtheta + m + 1] / mtheta + modes[i] = fft_result[mtheta+m+1] / mtheta end end return modes @@ -39,10 +39,10 @@ end # Test configurations test_cases = [ - (name="Small (mtheta=128, mpert=10)", mtheta=128, mpert=10, mlow=-5), - (name="Medium (mtheta=256, mpert=20)", mtheta=256, mpert=20, mlow=-10), - (name="Large (mtheta=480, mpert=40)", mtheta=480, mpert=40, mlow=-20), - (name="Very Large (mtheta=1024, mpert=80)", mtheta=1024, mpert=80, mlow=-40), + (name="Small (mtheta=128, mpert=10)", mtheta=128, mpert=10, mlow=-5), + (name="Medium (mtheta=256, mpert=20)", mtheta=256, mpert=20, mlow=-10), + (name="Large (mtheta=480, mpert=40)", mtheta=480, mpert=40, mlow=-20), + (name="Very Large (mtheta=1024, mpert=80)", mtheta=1024, mpert=80, mlow=-40) ] for test in test_cases @@ -56,7 +56,7 @@ for test in test_cases mhigh = mlow + mpert - 1 # Create test data - theta = range(0, 2π, length=mtheta+1)[1:end-1] + theta = range(0, 2π; length=mtheta+1)[1:(end-1)] data = sin.(3 .* theta) .+ 0.5 .* cos.(7 .* theta) .+ 0.2 .* sin.(11 .* theta) # Initialize FourierTransform @@ -67,7 +67,8 @@ for test in test_cases theta_buffer = zeros(ComplexF64, mtheta) # Pre-allocate for low-level API - cslth, snlth = compute_fourier_coefficients(mtheta, mpert, mlow) + exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow) + cslth, snlth = real(exp_mn_basis), imag(exp_mn_basis) gij = reshape(data, mtheta, 1) # Matrix form gil = zeros(Float64, mtheta, mpert) @@ -99,7 +100,7 @@ for test in test_cases # Note: Our transform uses a different normalization and basis println("\n--- Accuracy Check ---") println("FourierTransform allocating vs in-place: ", - @sprintf("%.2e", maximum(abs.(modes_alloc .- modes_buffer)))) + @sprintf("%.2e", maximum(abs.(modes_alloc .- modes_buffer)))) # Compare magnitudes of modes (since basis might differ) println("Mode magnitudes comparison (FourierTransform vs FFTW):") @@ -129,9 +130,9 @@ for test in test_cases full_modes = zeros(ComplexF64, mtheta) for (i, m) in enumerate(mlow:mhigh) if m >= 0 - full_modes[m + 1] = modes_test[i] + full_modes[m+1] = modes_test[i] else - full_modes[mtheta + m + 1] = modes_test[i] + full_modes[mtheta+m+1] = modes_test[i] end end t6 = @benchmark ifft($full_modes) @@ -140,21 +141,21 @@ for test in test_cases # Accuracy check println("\n--- Inverse Accuracy Check ---") println("inverse() allocating vs in-place: ", - @sprintf("%.2e", maximum(abs.(theta_alloc .- theta_buffer)))) + @sprintf("%.2e", maximum(abs.(theta_alloc .- theta_buffer)))) println("Round-trip error (real part): ", - @sprintf("%.2e", maximum(abs.(real.(theta_alloc) .- data)))) + @sprintf("%.2e", maximum(abs.(real.(theta_alloc) .- data)))) # Performance summary println("\n--- Performance Summary ---") println(@sprintf("Forward transform speedup (in-place vs allocating): %.2fx", - median(t1).time / median(t2).time)) + median(t1).time / median(t2).time)) println(@sprintf("Allocations eliminated: %d → %d", - t1.allocs, t2.allocs)) + t1.allocs, t2.allocs)) # Compare to FFTW println(@sprintf("\nFourier vs FFTW (forward): %.2fx %s", - abs(median(t2).time / median(t3).time), - median(t2).time < median(t3).time ? "faster" : "slower")) + abs(median(t2).time / median(t3).time), + median(t2).time < median(t3).time ? "faster" : "slower")) println("Note: FFTW computes full DFT (all N modes), we compute truncated series ($mpert modes)") end @@ -169,7 +170,7 @@ mlow = -10 nbatch = 10 # Transform 10 functions simultaneously ft = FourierTransform(mtheta, mpert, mlow) -theta = range(0, 2π, length=mtheta+1)[1:end-1] +theta = range(0, 2π; length=mtheta+1)[1:(end-1)] # Create batch data data_matrix = zeros(Float64, mtheta, nbatch) @@ -182,7 +183,7 @@ modes_matrix = zeros(ComplexF64, mpert, nbatch) println("\nTransforming $nbatch functions of length $mtheta:") print("Allocating (loop): ") -@btime for i in 1:$nbatch +@btime for i in 1:($nbatch) modes = $ft($data_matrix[:, i]) end @@ -191,7 +192,7 @@ print("Allocating (matrix):") print("In-place (loop): ") modes_buffer = zeros(ComplexF64, mpert) -@btime for i in 1:$nbatch +@btime for i in 1:($nbatch) transform!($modes_buffer, $ft, $data_matrix[:, i]) end @@ -208,7 +209,8 @@ mpert = 10 mlow = 1 # Setup for low-level API -cslth, snlth = compute_fourier_coefficients(mtheta, mpert, mlow) +exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow) +cslth, snlth = real(exp_mn_basis), imag(exp_mn_basis) gij = randn(mtheta, mtheta) # Green's function matrix gil = zeros(Float64, mtheta, mpert) diff --git a/benchmarks/benchmark_vacuum_galerkin.jl b/benchmarks/benchmark_vacuum_galerkin.jl new file mode 100644 index 00000000..358c6240 --- /dev/null +++ b/benchmarks/benchmark_vacuum_galerkin.jl @@ -0,0 +1,399 @@ +#!/usr/bin/env julia + +using Printf +using LinearAlgebra +using TOML +using Plots + +using Pkg +Pkg.instantiate() +using BenchmarkTools + +using GeneralizedPerturbedEquilibrium +const GPEC = GeneralizedPerturbedEquilibrium + +""" + make_wall_settings(example_dir::AbstractString) + +Construct `Vacuum.WallShapeSettings` from the `[Wall]` section in `gpec.toml` +if present; otherwise return default settings. +""" +function make_wall_settings(example_dir::AbstractString) + inputs = TOML.parsefile(joinpath(example_dir, "gpec.toml")) + if haskey(inputs, "Wall") + return GPEC.Vacuum.WallShapeSettings(; (Symbol(k) => v for (k, v) in inputs["Wall"])...) + elseif haskey(inputs, "WALL") + # Some examples use legacy capitalized section name + return GPEC.Vacuum.WallShapeSettings(; (Symbol(k) => v for (k, v) in inputs["WALL"])...) + else + return GPEC.Vacuum.WallShapeSettings() + end +end + +""" + load_equilibrium(example_dir::AbstractString) + +Set up the equilibrium specified by `[Equilibrium]` in `gpec.toml` under `example_dir`. +""" +function load_equilibrium(example_dir::AbstractString) + inputs = TOML.parsefile(joinpath(example_dir, "gpec.toml")) + @assert haskey(inputs, "Equilibrium") "[Equilibrium] section missing in gpec.toml for $example_dir" + eq_cfg = GPEC.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], example_dir) + return GPEC.Equilibrium.setup_equilibrium(eq_cfg) +end + +""" + make_vacuum_input( + equil::GPEC.Equilibrium.PlasmaEquilibrium; + ψ::Real, + mtheta::Int, + nzeta::Int, + mpert::Int, + mlow::Int, + npert::Int, + nlow::Int, + use_galerkin::Bool, + ) -> GPEC.Vacuum.VacuumInput + +Construct a `VacuumInput` at flux surface `ψ` with the specified resolution and mode set. +The only parameter that differs between the two algorithms we compare is `use_galerkin`. +""" +function make_vacuum_input( + equil::GPEC.Equilibrium.PlasmaEquilibrium; + ψ::Real, + mtheta::Int, + nzeta::Int, + mpert::Int, + mlow::Int, + npert::Int, + nlow::Int, + use_galerkin::Bool +) + r, z, ν = GPEC.Vacuum.extract_plasma_surface_at_psi(equil, float(ψ)) + + return GPEC.Vacuum.VacuumInput(; + x=reverse(r), + z=reverse(z), + ν=reverse(ν), + mtheta_in=length(r), + nzeta_in=1, + mlow=mlow, + mpert=mpert, + nlow=nlow, + npert=npert, + mtheta=mtheta, + nzeta=nzeta, + force_wv_symmetry=true, + use_galerkin=use_galerkin + ) +end + +""" + benchmark_vacuum_2d( + example_dir::AbstractString; + ψ::Real = 0.95, + mtheta_values::AbstractVector{<:Integer} = 16 .* (2 .^ (0:9)), + mpert::Int = 32, + mlow::Int = 0, + npert::Int = 1, + nlow::Int = 1, + ) + +Benchmark `compute_vacuum_response` for 2D (nzeta = 1) using the Solovev example in +`example_dir`. Scans over `mtheta_values` and compares collocation (`use_galerkin=false`) +against Galerkin (`use_galerkin=true`) for convergence of the `wv` matrix and runtime. +""" +function benchmark_vacuum_2d( + example_dir::AbstractString; + ψ::Real=1.0, + mtheta_values::AbstractVector{<:Integer}=16 .* (2 .^ (0:7)), + mpert::Int=32, + mlow::Int=0, + npert::Int=1, + nlow::Int=1 +) + println("\n===== 2D Vacuum Benchmark (Solovev, $(basename(example_dir))) =====") + println("ψ = $(ψ), mtheta ∈ $(collect(mtheta_values)), mpert=$mpert, mlow=$mlow, nlow=$nlow, npert=$npert\n") + + equil = load_equilibrium(example_dir) + wall_settings = make_wall_settings(example_dir) + + mtheta_values = collect(mtheta_values) + nm = length(mtheta_values) + + times_colloc = zeros(Float64, nm) + times_galerkin = zeros(Float64, nm) + errs_colloc = zeros(Float64, nm) + errs_galerkin = zeros(Float64, nm) + + # Reference wv for convergence (highest resolution * 2) – always use galerkin + mtheta_ref = maximum(mtheta_values) * 2 + println("Computing 2D reference matrices at mtheta_ref = $mtheta_ref") + + input_ref_galerkin = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta_ref, + nzeta=1, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=true + ) + wv_ref_galerkin, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_ref_galerkin, wall_settings) + # IMPORTANT: `compute_vacuum_response` uses AdaptiveArrayPools; take a copy so + # later pooled allocations do not overwrite the reference storage. + wv_ref_galerkin = copy(wv_ref_galerkin) + ref_norm_galerkin = norm(wv_ref_galerkin) + + for (i, mtheta) in enumerate(mtheta_values) + println(" 2D: mtheta = $(rpad(string(mtheta), 5)) (nzeta = 1)") + + # Collocation + input_colloc = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta, + nzeta=1, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=false + ) + t_colloc = @belapsed GPEC.Vacuum.compute_vacuum_response($input_colloc, $wall_settings) + wv, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_colloc, wall_settings) + errs_colloc[i] = norm(wv .- wv_ref_galerkin) / ref_norm_galerkin + times_colloc[i] = t_colloc + + # Galerkin + input_galerkin = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta, + nzeta=1, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=true + ) + t_galerkin = @belapsed GPEC.Vacuum.compute_vacuum_response($input_galerkin, $wall_settings) + wv_g, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_galerkin, wall_settings) + errs_galerkin[i] = norm(wv_g .- wv_ref_galerkin) / ref_norm_galerkin + times_galerkin[i] = t_galerkin + + @printf(" Collocation: t = %.3f s, rel‖Δwv‖ = %.3e\n", t_colloc, errs_colloc[i]) + @printf(" Galerkin: t = %.3f s, rel‖Δwv‖ = %.3e\n", t_galerkin, errs_galerkin[i]) + end + + # Two‑pane plot: left = convergence, right = runtime + plt = plot(; layout=(1, 2), size=(1100, 420)) + + # Convergence + plot!(plt[1], mtheta_values, errs_colloc; + lw=2, marker=:circle, xscale=:log10, yscale=:log10, + label="Collocation", xlabel="mθ", ylabel="rel‖Δwv‖", + title="2D Vacuum: wv convergence vs mθ") + plot!(plt[1], mtheta_values, errs_galerkin; + lw=2, marker=:square, xscale=:log10, yscale=:log10, + label="Galerkin") + + # Runtime + plot!(plt[2], mtheta_values, times_colloc; + lw=2, marker=:circle, xscale=:log10, yscale=:log10, + label="Collocation", xlabel="mθ", ylabel="runtime [s]", + title="2D Vacuum: runtime vs mθ") + plot!(plt[2], mtheta_values, times_galerkin; + lw=2, marker=:square, xscale=:log10, yscale=:log10, + label="Galerkin") + + plot!(plt[1]; legend=:bottomleft) + plot!(plt[2]; legend=:topleft) + + outpath = joinpath(@__DIR__, "vacuum_galerkin_2d.png") + savefig(plt, outpath) + println("\n2D results saved to $(outpath)") + + return (; mtheta_values, times_colloc, times_galerkin, errs_colloc, errs_galerkin) +end + +""" + benchmark_vacuum_3d( + example_dir::AbstractString; + ψ::Real = 0.95, + mtheta_values::AbstractVector{<:Integer} = 16 .* (2 .^ (0:3)), + mpert::Int = 16, + mlow::Int = 0, + npert::Int = 1, + nlow::Int = 1, + ) + +Benchmark `compute_vacuum_response` for 3D (nzeta = mtheta) using the Solovev 3D example +in `example_dir`. Scans over `mtheta = nzeta` and compares collocation vs Galerkin. +""" +function benchmark_vacuum_3d( + example_dir::AbstractString; + ψ::Real=1.0, + mtheta_values::AbstractVector{<:Integer}=16 .* (2 .^ (0:2)), + mpert::Int=16, + mlow::Int=0, + npert::Int=1, + nlow::Int=1 +) + println("\n===== 3D Vacuum Benchmark (Solovev 3D, $(basename(example_dir))) =====") + println("ψ = $(ψ), mtheta = nzeta ∈ $(collect(mtheta_values)), mpert=$mpert, mlow=$mlow, nlow=$nlow, npert=$npert\n") + + equil = load_equilibrium(example_dir) + wall_settings = make_wall_settings(example_dir) + + mtheta_values = collect(mtheta_values) + nm = length(mtheta_values) + + times_colloc = zeros(Float64, nm) + times_galerkin = zeros(Float64, nm) + errs_colloc = zeros(Float64, nm) + errs_galerkin = zeros(Float64, nm) + + # Reference wv for convergence (highest resolution * 2) – always use galerkin + mtheta_ref = maximum(mtheta_values) * 2 + nzeta_ref = mtheta_ref + println("Computing 3D reference matrices at mtheta_ref = nzeta_ref = $mtheta_ref") + + input_ref_galerkin = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta_ref, + nzeta=nzeta_ref, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=true + ) + wv_ref_galerkin, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_ref_galerkin, wall_settings) + # Again, protect the reference matrix from being overwritten by pooled allocations. + wv_ref_galerkin = copy(wv_ref_galerkin) + ref_norm_galerkin = norm(wv_ref_galerkin) + + for (i, mtheta) in enumerate(mtheta_values) + nzeta = mtheta + println(" 3D: mtheta = $(rpad(string(mtheta), 5)), nzeta = $(rpad(string(nzeta), 5))") + + # Collocation + input_colloc = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta, + nzeta=nzeta, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=false + ) + t_colloc = @belapsed GPEC.Vacuum.compute_vacuum_response($input_colloc, $wall_settings) + wv3, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_colloc, wall_settings) + errs_colloc[i] = norm(wv3 .- wv_ref_galerkin) / ref_norm_galerkin + times_colloc[i] = t_colloc + + # Galerkin + input_galerkin = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta, + nzeta=nzeta, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=true + ) + t_galerkin = @belapsed GPEC.Vacuum.compute_vacuum_response($input_galerkin, $wall_settings) + wv3g, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_galerkin, wall_settings) + errs_galerkin[i] = norm(wv3g .- wv_ref_galerkin) / ref_norm_galerkin + times_galerkin[i] = t_galerkin + + @printf(" Collocation: t = %.3f s, rel‖Δwv‖ = %.3e\n", t_colloc, errs_colloc[i]) + @printf(" Galerkin: t = %.3f s, rel‖Δwv‖ = %.3e\n", t_galerkin, errs_galerkin[i]) + end + + # Two‑pane plot: left = convergence, right = runtime (mtheta = nzeta on x-axis) + plt = plot(; layout=(1, 2), size=(1100, 420)) + + # Convergence + plot!(plt[1], mtheta_values, errs_colloc; + lw=2, marker=:circle, xscale=:log10, yscale=:log10, + label="Collocation", xlabel="mθ = nzeta", ylabel="rel‖Δwv‖", + title="3D Vacuum: wv convergence vs mθ = nzeta") + plot!(plt[1], mtheta_values, errs_galerkin; + lw=2, marker=:square, xscale=:log10, yscale=:log10, + label="Galerkin") + + # Runtime + plot!(plt[2], mtheta_values, times_colloc; + lw=2, marker=:circle, xscale=:log10, yscale=:log10, + label="Collocation", xlabel="mθ = nzeta", ylabel="runtime [s]", + title="3D Vacuum: runtime vs mθ = nzeta") + plot!(plt[2], mtheta_values, times_galerkin; + lw=2, marker=:square, xscale=:log10, yscale=:log10, + label="Galerkin") + + plot!(plt[1]; legend=:bottomleft) + plot!(plt[2]; legend=:topleft) + + outpath = joinpath(@__DIR__, "vacuum_galerkin_3d.png") + savefig(plt, outpath) + println("\n3D results saved to $(outpath)") + + return (; mtheta_values, times_colloc, times_galerkin, errs_colloc, errs_galerkin) +end + +""" + main() + +Entry point when running this file as a script. + +Usage (from repository root): + +```bash +julia --project=. benchmarks/benchmark_vacuum_galerkin.jl +``` + +Edit the `mtheta_values` and other keyword arguments in the calls below to +explore different resolution ranges. +""" +function main() + # 2D Solovev example + example_2d = joinpath(@__DIR__, "..", "examples", "Solovev_ideal_example") + + # 3D Solovev example + example_3d = joinpath(@__DIR__, "..", "examples", "Solovev_ideal_example_3D") + + benchmark_vacuum_2d( + example_2d; + ψ=1.0, + mtheta_values=16 .* (2 .^ (0:9)), # 16 → 8192 (easily editable) + mpert=31, + mlow=-15, + npert=1, + nlow=1 + ) + + benchmark_vacuum_3d( + example_3d; + ψ=1.0, + mtheta_values=16 .* (2 .^ (0:3)), # 16 → 128 (easily editable) + mpert=31, + mlow=-15, + npert=1, + nlow=1 + ) + + return nothing +end + +if abspath(PROGRAM_FILE) == @__FILE__ + main() +end diff --git a/examples/DIIID-like_ideal_example/gpec.toml b/examples/DIIID-like_ideal_example/gpec.toml index 982f6817..f95e67dc 100644 --- a/examples/DIIID-like_ideal_example/gpec.toml +++ b/examples/DIIID-like_ideal_example/gpec.toml @@ -14,16 +14,6 @@ newq0 = 0 # Override for on-axis safety factor (0 etol = 1e-7 # Error tolerance for equilibrium solver force_termination = false # Terminate after equilibrium setup (skip stability calculations) -[Wall] -shape = "nowall" # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath) -a = 0.2415 # Distance from plasma (conformal) or shape parameter -aw = 0.05 # Half-thickness parameter for Dee-shaped walls -bw = 1.5 # Elongation parameter for wall shapes -cw = 0 # Offset of wall center from major radius -dw = 0.5 # Triangularity parameter for wall shapes -tw = 0.05 # Sharpness of wall corners (try 0.05 as initial value) -equal_arc_wall = true # Equal arc length distribution of nodes on wall - [ForceFreeStates] bal_flag = false # Ideal MHD ballooning criterion for short wavelengths mat_flag = true # Construct coefficient matrices for diagnostic purposes @@ -63,6 +53,16 @@ save_interval = 3 # Save every Nth ODE step (1=all, 10=every 10th). A singfac_min = 1e-4 # Fractional distance from rational q at which ideal jump enforced ucrit = 1e4 # Maximum fraction of solutions allowed before re-normalized +[Wall] +shape = "nowall" # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath) +a = 0.2415 # Distance from plasma (conformal) or shape parameter +aw = 0.05 # Half-thickness parameter for Dee-shaped walls +bw = 1.5 # Elongation parameter for wall shapes +cw = 0 # Offset of wall center from major radius +dw = 0.5 # Triangularity parameter for wall shapes +tw = 0.05 # Sharpness of wall corners (try 0.05 as initial value) +equal_arc_wall = true # Equal arc length distribution of nodes on wall + [ForcingTerms] forcing_data_file = "forcing.dat" # Path to forcing data file (n, m, complex amplitude) forcing_data_format = "ascii" # Format of forcing data: "ascii" or "hdf5" diff --git a/examples/Solovev_ideal_example/gpec.toml b/examples/Solovev_ideal_example/gpec.toml index 0065fde8..108c55bd 100644 --- a/examples/Solovev_ideal_example/gpec.toml +++ b/examples/Solovev_ideal_example/gpec.toml @@ -14,28 +14,6 @@ newq0 = 0 # Override for on-axis safety factor (0 etol = 1e-7 # Error tolerance for equilibrium solver force_termination = false # Terminate after equilibrium setup (skip stability calculations) - -[Wall] -shape = "conformal" # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath) -a = 0.2415 # Distance from plasma (conformal) or shape parameter -aw = 0.05 # Half-thickness parameter for Dee-shaped walls -bw = 1.5 # Elongation parameter for wall shapes -cw = 0 # Offset of wall center from major radius -dw = 0.5 # Triangularity parameter for wall shapes -tw = 0.05 # Sharpness of wall corners (try 0.05 as initial value) -equal_arc_wall = true # Equal arc length distribution of nodes on wall - -# [PerturbedEquilibrium] -# # Uncomment this section to enable perturbed equilibrium calculations -# forcing_data_file = "forcing.dat" # Path to forcing data (n, m, real, imag) -# forcing_data_format = "ascii" # "ascii" or "hdf5" -# fixed_boundary = false # Fixed boundary flag -# output_eigenmodes = true # Output mode fields as b-fields -# compute_response = true # Compute plasma response -# compute_singular_coupling = true # Compute singular coupling metrics -# verbose = true # Enable verbose logging -# write_outputs_to_HDF5 = true # Write outputs to HDF5 - [ForceFreeStates] bal_flag = false # Ideal MHD ballooning criterion for short wavelengths mat_flag = true # Construct coefficient matrices for diagnostic purposes @@ -75,7 +53,7 @@ ucrit = 1e3 # Maximum fraction of solutions allowed before re- force_wv_symmetry = true # Forces vacuum energy matrix symmetry save_interval = 3 # Save every Nth ODE step (1=all, 10=every 10th). Always saves near rational surfaces. -[WALL] +[Wall] shape = "conformal" # String selecting wall shape ["nowall", "conformal", "elliptical", "dee", "mod_dee", "from_file"] a = 0.2415 # The distance of the wall from the plasma in units of major radius (conformal), or minor radius parameter (others). aw = 0.05 # Half-thickness of the wall. diff --git a/examples/Solovev_ideal_example_3D/gpec.toml b/examples/Solovev_ideal_example_3D/gpec.toml index d9e8b663..93d4e995 100644 --- a/examples/Solovev_ideal_example_3D/gpec.toml +++ b/examples/Solovev_ideal_example_3D/gpec.toml @@ -32,8 +32,8 @@ nn_high = 1 # Largest toroidal mode number to include delta_mlow = 8 # Expands lower bound of Fourier harmonics delta_mhigh = 8 # Expands upper bound of Fourier harmonics delta_mband = 0 # Integration keeps only this wide a band... -mthvac = 96 # Number of points used in splines over poloidal angle at plasma-vacuum interface. -nzvac = 64 +mthvac = 128 # Number of points used in splines over poloidal angle at plasma-vacuum interface. +nzvac = 128 thmax0 = 1 # Linear multiplier on the automatic choice of theta integration bounds kin_flag = false # Kinetic EL equation (default: false) @@ -63,14 +63,3 @@ cw = 0 # Offset of wall center from major radiu dw = 0.5 # Triangularity parameter for wall shapes tw = 0.05 # Sharpness of wall corners (try 0.05 as initial value) equal_arc_wall = false # Equal arc length distribution of nodes on wall - -# [PerturbedEquilibrium] -# # Uncomment this section to enable perturbed equilibrium calculations -# forcing_data_file = "forcing.dat" # Path to forcing data (n, m, real, imag) -# forcing_data_format = "ascii" # "ascii" or "hdf5" -# fixed_boundary = false # Fixed boundary flag -# output_eigenmodes = true # Output mode fields as b-fields -# compute_response = true # Compute plasma response -# compute_singular_coupling = true # Compute singular coupling metrics -# verbose = true # Enable verbose logging -# write_outputs_to_HDF5 = true # Write outputs to HDF5 diff --git a/examples/Solovev_ideal_example_3D/run_example.jl b/examples/Solovev_ideal_example_3D/run_example.jl new file mode 100644 index 00000000..fa0a9e53 --- /dev/null +++ b/examples/Solovev_ideal_example_3D/run_example.jl @@ -0,0 +1,4 @@ +using Pkg; +Pkg.activate(joinpath(@__DIR__, "../..")) +using GeneralizedPerturbedEquilibrium +GeneralizedPerturbedEquilibrium.main([dirname(@__FILE__)]) diff --git a/src/ForceFreeStates/Free.jl b/src/ForceFreeStates/Free.jl index 9c8ec89c..f0e4a9be 100644 --- a/src/ForceFreeStates/Free.jl +++ b/src/ForceFreeStates/Free.jl @@ -32,10 +32,7 @@ and data dumping. # Scale by (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126] singfac = vec((mlow:mhigh) .- qlim .* (nlow:nhigh)') - @inbounds for ipert in 1:numpert_total - @views vac_data.wv[ipert, :] .*= singfac[ipert] - @views vac_data.wv[:, ipert] .*= singfac[ipert] - end + @inbounds @views vac_data.wv .*= singfac .* singfac' # Compute complex energy eigenvalues and vectors vac_data.wt .= wp .+ vac_data.wv @@ -75,13 +72,11 @@ and data dumping. # Compute plasma and vacuum contributions. # wpt = wt' * wp * wt ; wvt = wt' * wv * wt mul!(tmp_mat, wp, vac_data.wt) - mul!(wpt, adjoint(vac_data.wt), tmp_mat) + mul!(wpt, vac_data.wt', tmp_mat) mul!(tmp_mat, vac_data.wv, vac_data.wt) - mul!(wvt, adjoint(vac_data.wt), tmp_mat) - for ipert in 1:numpert_total - vac_data.ep[ipert] = wpt[ipert, ipert] - vac_data.ev[ipert] = wvt[ipert, ipert] - end + mul!(wvt, vac_data.wt', tmp_mat) + vac_data.ep .= diag(wpt) + vac_data.ev .= diag(wvt) # Normalize eigenvectors based on scaled wt coeffs = odet.u[:, :, 1, end] \ (vac_data.wt .* (2π * equil.psio * 1e-3)) @@ -123,8 +118,8 @@ function free_compute_wv_spline(ctrl::ForceFreeStatesControl, equil::Equilibrium # TODO: 4 spline points is arbitrary - is there a better way? qedge = profiles.q_spline(ctrl.psiedge) npsi = max(4, ceil(Int, (intr.qlim - qedge) * intr.nhigh * 4)) - psi_array = zeros(Float64, npsi + 1) - wv_array = zeros(ComplexF64, npsi + 1, intr.numpert_total, intr.numpert_total) + psi_array = zeros!(pool, Float64, npsi + 1) + wv_array = zeros!(pool, ComplexF64, npsi + 1, intr.numpert_total, intr.numpert_total) for i in 1:(npsi+1) # Space points evenly in q @@ -143,10 +138,7 @@ function free_compute_wv_spline(ctrl::ForceFreeStatesControl, equil::Equilibrium # Apply singular factor scaling: (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126] singfac = vec((intr.mlow:intr.mhigh) .- qi .* (intr.nlow:intr.nhigh)') - @inbounds for ipert in 1:intr.numpert_total - @views wv[ipert, :] .*= singfac[ipert] - @views wv[:, ipert] .*= singfac[ipert] - end + @inbounds @views wv .*= singfac .* singfac' @views wv_array[i, :, :] .= wv end diff --git a/src/PerturbedEquilibrium/SingularCoupling.jl b/src/PerturbedEquilibrium/SingularCoupling.jl index b27da960..40a8c1ce 100644 --- a/src/PerturbedEquilibrium/SingularCoupling.jl +++ b/src/PerturbedEquilibrium/SingularCoupling.jl @@ -154,7 +154,7 @@ function compute_singular_coupling_metrics!( # Compute Green's functions at this surface for this n # TODO: This assumes an initial 2D equilibrum, getting 2D Green's functions for independent n vac_input = Vacuum.VacuumInput(equil, sing_surf.psifac, mtheta, 1, mpert, mlow, 1, nn) - _, grri, grre, _, _ = Vacuum.compute_vacuum_response(vac_input, wall_settings; green_only=true) + _, grri, grre, _, _ = Vacuum.compute_vacuum_response(vac_input, wall_settings) # Store in singular surface struct (overwrites for each n) ffs_intr.sing[s].grri = grri diff --git a/src/Utilities/FourierTransforms.jl b/src/Utilities/FourierTransforms.jl index 36a53e0b..ae3fe8ce 100644 --- a/src/Utilities/FourierTransforms.jl +++ b/src/Utilities/FourierTransforms.jl @@ -66,19 +66,16 @@ just use the n argument. In 3D, we need to compute the basis for all modes and g # Returns - 2D - - `cos_mn_basis::Matrix{Float64}`: Cosine coefficients `cos(m*θ - n*ν)` [mtheta, mpert] - - `sin_mn_basis::Matrix{Float64}`: Sine coefficients `sin(m*θ - n*ν)` [mtheta, mpert] + - `exp_mn_basis::Matrix{ComplexF64}`: Exponential coefficients `exp(i(m*θ - n*ν))` [mtheta, mpert] - 3D - - `cos_mn_basis::Matrix{Float64}`: Cosine coefficients `cos(m*θ - n*ν - n*ϕ)` [mtheta * nzeta, mpert * npert] - - `sin_mn_basis::Matrix{Float64}`: Sine coefficients `sin(m*θ - n*ν - n*ϕ)` [mtheta * nzeta, mpert * npert] + - `exp_mn_basis::Matrix{ComplexF64}`: Exponential coefficients `exp(i(m*θ - n*ν - n*ϕ))` [mtheta * nzeta, mpert * npert] # Notes The theta and phi grids are uniform: `θᵢ = 2π*i/mtheta` for `i = 0:mtheta-1` and `ϕⱼ = 2π*j/nzeta` for `j = 0:nzeta-1` When `n=0, ν=0` (default), this reduces to simple harmonic basis: -- `cos_mn_basis[i,l] = cos(m*θᵢ)` -- `sin_mn_basis[i,l] = sin(m*θᵢ)` +- `exp_mn_basis[i,l] = exp(i(m*θᵢ))` """ function compute_fourier_coefficients( mtheta::Int, @@ -100,17 +97,15 @@ function compute_fourier_coefficients( @assert length(ν) == mtheta "ν must have length mtheta" # In 2D, we only use one toroidal mode at a time - # Compute sin(mθ - nν) and cos(mθ - nν) - sin_mn_basis = sin.((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν) - cos_mn_basis = cos.((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν) + # Compute exp(i(mθ - nν)) + exp_mn_basis = exp.(im .* ((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν)) else # 3D @assert (n_2D === nothing && ν === nothing) "n_2D and ν should be nothing for 3D" # In 3D, we need to compute the basis for all modes and grid points - # Compute sin(mθ - nζ) and cos(mθ - nζ) + # Compute exp(i(mθ - nζ)) ζ_grid = range(; start=0, length=nzeta, step=2π/nzeta) - sin_mn_basis = zeros(mtheta * nzeta, mpert * npert) - cos_mn_basis = zeros(mtheta * nzeta, mpert * npert) + exp_mn_basis = zeros(ComplexF64, mtheta * nzeta, mpert * npert) for idx_n in 1:npert n = nlow + idx_n - 1 n_col_offset = (idx_n - 1) * mpert @@ -119,16 +114,13 @@ function compute_fourier_coefficients( col = idx_m + n_col_offset for (j, ζ) in enumerate(ζ_grid), (i, θ) in enumerate(θ_grid) idx = i + (j-1)*mtheta - arg = m * θ - n * ζ - s, c = sincos(arg) - cos_mn_basis[idx, col] = c - sin_mn_basis[idx, col] = s + exp_mn_basis[idx, col] = exp(im * (m * θ - n * ζ)) end end end end - return cos_mn_basis, sin_mn_basis + return exp_mn_basis end """ @@ -207,8 +199,8 @@ function FourierTransform( n::Int=0, ν::Vector{Float64}=zeros(Float64, mtheta) ) - cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, 1, 1, 1; n_2D=n, ν=ν) - return FourierTransform(mtheta, mpert, mlow, cos_mn_basis, sin_mn_basis) + exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, 1, 1, 1; n_2D=n, ν=ν) + return FourierTransform(mtheta, mpert, mlow, real(exp_mn_basis), imag(exp_mn_basis)) end """ diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl index eac91b01..17748804 100644 --- a/src/Vacuum/Kernel2D.jl +++ b/src/Vacuum/Kernel2D.jl @@ -59,56 +59,124 @@ const GL8_LAGRANGE_STENCILS = precompute_lagrange_stencils(GL8.x) # and per-n sinh/cosh cache are defined in PnQuadCache.jl. """ - kernel!(grad_greenfunction, greenfunction, observer, source, n) + compute_2D_kernel_matrices!(K, G, observer, source, n, Z, Gram) -Compute kernels of integral equation for Laplace's equation in a torus. -**WARNING: This kernel only supports closed toroidal walls currently. -The residue calculation needs to be updated for open walls.** +Compute the **Fourier/Galerkin-projected** 2D vacuum boundary-integral kernel blocks for +Laplace’s equation in an axisymmetric torus, **without ever forming the dense +`M×M` “point-to-point” kernel matrices. -# Arguments +This is the fused “evaluate kernel + project” path that the vacuum solver uses: - - `grad_greenfunction`: Gradient Green's function matrix (output) - - `greenfunction`: Green's function matrix (output) - - `observer`: Observer geometry struct (PlasmaGeometry or WallGeometry) - - `source`: Source geometry struct (PlasmaGeometry or WallGeometry) - - `n`: Toroidal mode number + Kc = Zᴴ * K * Z + Gc = Zᴴ * G -# Returns +where: -Modifies `grad_greenfunction` and `greenfunction` in place. -Note that greenfunction is zeroed each time this function is called, -but grad_greenfunction is not since it fills a different block of the -(2 * mtheta, 2 * mtheta) depending on the source/observer. + - `K` is the **double-layer** kernel (normal derivative of the Green’s function), + - `G` is the **single-layer** kernel (Green’s function itself; only needed for plasma-as-source), + - `Z ∈ ℂ^{M×P}` is the complex Fourier basis on the poloidal grid, + and `Zᴴ` is its conjugate transpose. -# Notes +Rather than computing a full kernel row `K[j, :]` and then multiplying by `Z`, this routine +projects **on the fly**: + + - For each observer node `j`, it accumulates the projected row-vector + `proj_k = (K[j,:] * weights) · Z` and (optionally) `proj_g = (G[j,:] * weights) · Z` + into length-`P` work buffers. + + - It then performs a **rank-1 update** into the appropriate projected block: + + Kc += conj(Z[j, :])' * proj_k + Gc += conj(Z[j, :])' * proj_g + +This reduces peak memory from `O(M^2)` to `O(MP + P^2)` while keeping the same +mathematical discretization. + +## Arguments + + - `Kc`: Complex global projected double-layer kernel matrix. + - `Gc`: Complex global projected single-layer kernel matrix. + - `observer`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`. + - `source`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`. + - `n`: Integer representing the order of the toroidal Fourier component. + - `Z`: Complex Fourier basis sampled on the `θ` grid. + - `Gram`: Diagonal of the mode-space Gram matrix for this basis on the discrete grid. + +## Block layout + +`Kc` and `Gc` are the complex global projected matrices. `Kc` contains four blocks corresponding to +plasma/wall as observer/source, `Gc` contains two blocks corresponding to plasma/wall as observer. +This function writes **only one block** to each of `Kc` and `Gc` per call: + + - `Kc_block` is a `P×P` view into `Kc` selected by `(observer isa PlasmaGeometry ? 1 : 2, source isa PlasmaGeometry ? 1 : 2)`. + - `Gc_block` is a `P×(2P)` view into `Gc` with the same observer block-row; only the columns + corresponding to the source being plasma are populated (when `source isa PlasmaGeometry`). + +## Toroidal Green's functions + + - The scalar `G_n` returned by `green(...)` is `2π * 𝒢ⁿ(θ, θ′)` (Chance 1997), + the `n`-th toroidal Fourier component of the Laplace Green’s function in axisymmetry. + - The scalar `gradG_n` returned by `green(...)` corresponds to the toroidal-mode `n` + contribution to the **double-layer** integrand `𝒥 * (∇′𝒢ⁿ · ∇′ℒ)` + (Chance 1997), i.e. the normal-derivative factor multiplied by the geometric Jacobian. + - `gradG_0` is the `n = 0` piece used for analytic diagonal/singularity bookkeeping. + +## Numerical treatment of the singularity + +The toroidal Green’s function kernel is weakly singular as `θ′ → θ`. The implementation follows +Chance *Phys. Plasmas* **4**, 2161 (1997) and uses a mixed strategy: + + - **Far field (nonsingular region)**: composite Simpson’s `1/3` rule on the uniform `θ` grid, + skipping the near-singular stencil around `j`. + - **Near field (singular panels)**: 8-point Gauss–Legendre quadrature on the two panels + of length `dtheta` immediately to the left/right of `θ_j`, using: + + periodic cubic splines of `source.x(θ)` and `source.z(θ)` to evaluate geometry at off-grid nodes, + + precomputed 5-point Lagrange stencils to map each Gauss node back to the five neighboring + discrete source indices `[j-2, j-1, j, j+1, j+2]` *without allocations*, + + analytic logarithmic/singular correction terms (`log_correction_array`) for the single-layer + kernel when the observer/source block is plasma–plasma (Chance 1997, e.g. eqs. 75, 78), + + an analytic diagonal/residue correction for the double-layer kernel (Chance 1997, Table I / residues). - - Uses Simpson's rule for integration away from singular points - - Uses Gaussian quadrature near singular points for improved accuracy - - Implements analytical singularity removal [Chance Phys. Plasmas 1997 2161] +## Performance and allocation avoidance (hot-path optimizations) + +This routine is intentionally written to be allocation-light in tight loops: + + - **Precomputed quadrature tables**: `GL8` and `GL8_LAGRANGE_STENCILS` are global constants. + - **Hoisted n-dependent constants**: `gamma_prefactor = 2√π * Γ(1/2 - n)` is computed once per call + and passed into `green(...)` rather than recomputed per quadrature node. + - **Spline derivative batching**: `∂R/∂θ` and `∂Z/∂θ` are evaluated on the full grid once (for Simpson), + while off-grid Gauss points evaluate splines directly. + - **Projection reuse**: the transpose `Zt = transpose(Z)` is materialized so that “row-accumulate” + operations `_accum_row!` can access `Z` with contiguous column memory. + - **Rank-1 assembly**: the final projected update uses `conj(Z[j,:]) ⊗ proj_*` via `_rank1_conj!`, + avoiding constructing intermediate `P×P` temporaries. + +## Caveats / limitations + + - **Closed-wall assumption**: the current residue/diagonal handling is written for closed, + periodic toroidal boundaries. Open-wall residue logic is not implemented. """ @with_pool pool function compute_2D_kernel_matrices!( - grad_greenfunction::AbstractMatrix{Float64}, - greenfunction::AbstractMatrix{Float64}, + Kc::AbstractMatrix{ComplexF64}, + Gc::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, - n::Int + n::Int, + Z::AbstractMatrix{ComplexF64}, + Gram::AbstractVector{ComplexF64} ) - mtheta = length(observer.x) - dtheta = 2π / mtheta - theta_grid = range(; start=0, length=mtheta, step=dtheta) + M, P = size(Z) # M = mtheta, P = num_modes + Zt = Matrix{ComplexF64}(transpose(Z)) # [P × M] for contiguous column access + dtheta = 2π / M + theta_grid = range(; start=0, length=M, step=dtheta) # Take a view of the corresponding block of the grad_greenfunction - col_index = (source isa PlasmaGeometry ? 1 : 2) - row_index = (observer isa PlasmaGeometry ? 1 : 2) - grad_greenfunction_block = view( - grad_greenfunction, - ((row_index-1)*mtheta+1):(row_index*mtheta), - ((col_index-1)*mtheta+1):(col_index*mtheta) - ) + col_idx = (source isa PlasmaGeometry ? 1 : 2) + row_idx = (observer isa PlasmaGeometry ? 1 : 2) + Kc_block = view(Kc, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P)) + Gc_block = view(Gc, ((row_idx-1)*P+1):(row_idx*P), :) - # Zero out greenfunction at start of each kernel call - fill!(greenfunction, 0.0) # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997) populate_greenfunction = source isa PlasmaGeometry @@ -119,7 +187,7 @@ but grad_greenfunction is not since it fills a different block of the log_correction_array = SVector(log_correction_2, log_correction_1, log_correction_0, log_correction_1, log_correction_2) # Precompute the n-dependent prefactor 2√π·Γ(1/2-n) [Chance Phys. Plasmas 1997 2161 eq. 40] - # This is constant for all source/observer point pairs within this kernel call. + # This constant is only computed once for each n gamma_prefactor = 2 * sqrt(π) * gamma(0.5 - n) # Set up periodic splines used for off-grid Gaussian quadrature points @@ -134,42 +202,59 @@ but grad_greenfunction is not since it fills a different block of the # Precompute source derivatives on the theta grid once used in Simpson integration # The Gaussian singular-panel points are off-grid, so those still use spline evaluation directly. - dx_dtheta_grid = acquire!(pool, eltype(source.x), mtheta) - dz_dtheta_grid = acquire!(pool, eltype(source.z), mtheta) + dx_dtheta_grid = acquire!(pool, eltype(source.x), M) + dz_dtheta_grid = acquire!(pool, eltype(source.z), M) # Call in-place API to avoid allocations d1_spline_x(dx_dtheta_grid, theta_grid) d1_spline_z(dz_dtheta_grid, theta_grid) + # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition) + legendre_buf = acquire!(pool, Float64, n + 2) + + # Per-observer projection vectors: proj = (kernel row) · Z + proj_k = zeros!(pool, ComplexF64, P) + proj_g = zeros!(pool, ComplexF64, P) + # Loop through observer points - for j in 1:mtheta + for j in 1:M # Get observer coordinates x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j] - # Perform Simpson integration for nonsingular source points + # Zero out projection terms + fill!(proj_k, 0.0) + fill!(proj_g, 0.0) + diag_accum = 0.0 + + # ============================================================ + # FAR FIELD: Simpson integration for nonsingular source points + # ============================================================ # Nonsingular region endpoints are at j±2, so exclude j-1, j, and j+1. - @inbounds for k in 1:(mtheta-3) - isrc = mod1(j + 1 + k, mtheta) - G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n; gamma_prefactor) + @inbounds for k in 1:(M-3) + isrc = mod1(j + 1 + k, M) + G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf; gamma_prefactor) # Composite Simpson's 1/3 rule weights, excluding singular points # Note we set to 4 for even/2 for odd since we index from 1 while the formula assumes indexing from 0 - wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2)) + wsimpson = dtheta / 3 * ((k == 1 || k == M - 3) ? 1 : (iseven(k) ? 4 : 2)) - # Sum contributions to Green's function matrices using Simpson weight + # Sum and project contributions to Green's function matrices if populate_greenfunction - greenfunction[j, isrc] += G_n * wsimpson + _accum_row!(proj_g, G_n * wsimpson, Zt, isrc) end - grad_greenfunction_block[j, isrc] += gradG_n * wsimpson + _accum_row!(proj_k, gradG_n * wsimpson, Zt, isrc) # Subtract regular integral component of δⱼᵢK⁰ [Chance Phys. Plasmas 1997 2161 eq. 83] - grad_greenfunction_block[j, j] -= gradG_0 * wsimpson + diag_accum -= gradG_0 * wsimpson end - # Perform Gaussian quadrature for singular points (source = obs point) + # ============================================================ + # NEAR FIELD: Gaussian quadrature with singular correction + # ============================================================ # Indices of the singularity region, [j-2, j-1, j, j+1, j+2] (allocation-free) for (offset_idx, offset) in enumerate(-2:2) - sing_idx[offset_idx] = mod1(j + offset + mtheta, mtheta) + sing_idx[offset_idx] = mod1(j + offset + M, M) end + # Integrate region of length 2 * dtheta on left/right of singularity for leftpanel in (true, false) gauss_mid = theta_obs + (leftpanel ? -dtheta : dtheta) @@ -181,7 +266,7 @@ but grad_greenfunction is not since it fills a different block of the dx_dtheta_gauss = d1_spline_x(theta_gauss0) z_gauss = spline_z(theta_gauss0) dz_dtheta_gauss = d1_spline_z(theta_gauss0) - G_n, gradG_n, gradG_0 = green(x_obs, z_obs, x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n; gamma_prefactor) + G_n, gradG_n, gradG_0 = green(x_obs, z_obs, x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n, legendre_buf; gamma_prefactor) # Get stencil and weight for the Gaussian point s = leftpanel ? stencils_left[ig] : stencils_right[ig] @@ -194,54 +279,71 @@ but grad_greenfunction is not since it fills a different block of the G_n += log((theta_obs - theta_gauss)^2) / x_obs end @inbounds for stencil_idx in 1:5 - greenfunction[j, sing_idx[stencil_idx]] += G_n * s[stencil_idx] * wgauss + _accum_row!(proj_g, G_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx]) end end # Second type of singularity: 𝒦ⁿ [Chance Phys. Plasmas 1997 2161 eq. 83, 86] @inbounds for stencil_idx in 1:5 - grad_greenfunction_block[j, sing_idx[stencil_idx]] += gradG_n * s[stencil_idx] * wgauss + _accum_row!(proj_k, gradG_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx]) end # Subtract off the diverging singular n=0 component - grad_greenfunction_block[j, j] -= gradG_0 * wgauss + diag_accum -= gradG_0 * wgauss end end # Subtract off analytic singular integral [Chance Phys. Plasmas 1997 2161 eq. 75] if plasma-plasma block if populate_greenfunction && observer isa PlasmaGeometry @inbounds for stencil_idx in 1:5 - greenfunction[j, sing_idx[stencil_idx]] -= log_correction_array[stencil_idx] / x_obs + _accum_row!(proj_g, -log_correction_array[stencil_idx] / x_obs, Zt, sing_idx[stencil_idx]) end end + + # Project the n=0 diagonal accumulation + _accum_row!(proj_k, diag_accum, Zt, j) + + # ── Rank-1 accumulate: K/G += conj(Z[j,:]) ⋅ proj_k/g ── + _rank1_conj!(Kc_block, Zt, j, proj_k) + if populate_greenfunction + _rank1_conj!(Gc_block, Zt, j, proj_g) + end end # Normals need to point outward from vacuum region. In VACUUM clockwise θ convention, normal points # out of vacuum for wall but inward for plasma, so we multiply by -1 for plasma sources if source isa PlasmaGeometry - grad_greenfunction_block .*= -1 + Kc_block .*= -1 end # Add analytic singular integral (second type) to block diagonal [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89] + # The residue term is diagonal in mode space and is scaled by the Gram diagonal. residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0) - @inbounds for i in 1:mtheta - grad_greenfunction_block[i, i] += residue + @inbounds for p in 1:P + Kc_block[p, p] += residue * Gram[p] end # Since we computed 2π𝒢, divide by 2π to get 𝒢 if populate_greenfunction - greenfunction ./= 2π + Gc_block ./= 2π end end -# Dispatch wrapper for unified 2D/3D vacuum: forwards to 5-arg compute_2D_kernel_matrices! with params.n +""" + kernel!(Kc, Gc, observer, source, params::KernelParams2D, Z, Gram) + +Public 2D kernel entry point. This is a thin wrapper that forwards to +`compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram)`. +""" function kernel!( - grad_greenfunction::AbstractMatrix{Float64}, - greenfunction::AbstractMatrix{Float64}, + Kc::AbstractMatrix{ComplexF64}, + Gc::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, - params::KernelParams2D + params::KernelParams2D, + Z::AbstractMatrix{ComplexF64}, + Gram::AbstractVector{ComplexF64} ) - return compute_2D_kernel_matrices!(grad_greenfunction, greenfunction, observer, source, params.n) + return compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram) end ############################################################# @@ -639,19 +741,22 @@ according to equations (36)-(42) of Chance 1997. Replaces `green` from Fortran c - Implements analytical derivatives from Chance 1997 equations - The coupling terms include the Jacobian factor from the coordinate transformation - By default uses the 2007 Legendre function implementation (Bulirsch + Gaussian integration) + +An overload accepting a pre-allocated `legendre_buf::Vector{Float64}` of length `n+2` is available. +Callers in tight loops should allocate this buffer once and pass it in to avoid per-call pool acquisition. """ -@with_pool pool function green( +function green( x_obs::Float64, z_obs::Float64, x_source::Float64, z_source::Float64, dx_dtheta::Float64, dz_dtheta::Float64, - n::Int; + n::Int, + legendre::AbstractVector{Float64}; gamma_prefactor::Float64=2 * sqrt(π) * gamma(0.5 - n), uselegacygreenfunction::Bool=false ) - x_obs2 = x_obs^2 x_source2 = x_source^2 x_minus2 = (x_obs - x_source)^2 @@ -670,9 +775,7 @@ according to equations (36)-(42) of Chance 1997. Replaces `green` from Fortran c # Argument of Legendre function 𝘴 [Chance Phys. Plasmas 1997 2161 eq. 42] s = (x_obs2 + x_source2 + ζ2) / R2 - # Legendre functions for - # P⁰ = p0, P¹ = p1, Pⁿ = pn, Pⁿ⁺¹ = pnp1 - legendre = acquire!(pool, Float64, n + 2) + # Legendre functions: P⁰ = p0, P¹ = p1, Pⁿ = pn, Pⁿ⁺¹ = pnp1 if uselegacygreenfunction Pn_minus_half_1997!(legendre, s, n) else diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl index df28dd45..dead2b31 100644 --- a/src/Vacuum/Kernel3D.jl +++ b/src/Vacuum/Kernel3D.jl @@ -177,74 +177,35 @@ function get_singular_quadrature(PATCH_RAD::Int, RAD_DIM::Int, INTERP_ORDER::Int end """ - laplace_single_layer(x_obs, x_src) -> Float64 + laplace_kernel(ox, oy, oz, sx, sy, sz, nx, ny, nz) -> (single, double) -Evaluate the Laplace single-layer (FxU) kernel between two 3D points. Returns -0.0 if the observation point coincides with the source point to avoid singularity. +Fused scalar-argument Laplace kernels for the 3D vacuum BIE. -The single-layer kernel φ is the fundamental solution to Laplace's equation: +Returns a tuple `(single, double)` where: -``` -φ(x_obs, x_src) = 1 / |x_obs - x_src| -``` + - `single = 1/r` is the single-layer kernel + - `double = (Δx⋅n)/r^3` is the double-layer kernel -# Arguments - - - `x_obs`: Observation point (3D Cartesian coordinates, any AbstractVector) - - `x_src`: Source point (3D Cartesian coordinates, any AbstractVector) - -# Returns - - - `Float64`: Kernel value φ(x_obs, x_src) -""" -function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}) - @inbounds begin - dx = x_obs[1] - x_src[1] - dy = x_obs[2] - x_src[2] - dz = x_obs[3] - x_src[3] - end - r2 = dx*dx + dy*dy + dz*dz - r2 < 1e-30 && return 0.0 - return inv(sqrt(r2)) -end - -""" - laplace_double_layer(x_obs, x_src, n_src) -> Float64 - -Evaluate the Laplace double-layer (DxU) kernel between a point and a surface element. Returns -0.0 if the observation point coincides with the source point to avoid singularity. Allocation-free -scalar arithmetic is used for maximum performance. - -The double-layer kernel K is the normal derivative of the fundamental solution: - -``` -K(x_obs, x_src, n_src) = ∇_{x_src} φ · n_src = (x_obs - x_src) · n_src / |x_obs - x_src|³ -``` - -# Arguments - - - `x_obs`: Observation point (3D Cartesian coordinates, any AbstractVector) - - `x_src`: Source point on surface (3D Cartesian coordinates, any AbstractVector) - - `n_src`: Outward UNIT normal at source point (must be normalized!, any AbstractVector) - -# Returns - - - `Float64`: Kernel value K(x_obs, x_src, n_src) +This is used when `compute_3D_kernel_matrices!` needs **both** kernels for the same pair, so the +distance computation (`sqrt(r²)`) is shared. Returns `(0.0, 0.0)` when `r² < 1e-30`. """ -function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}, n_src::AbstractVector{<:Real}) - @inbounds begin - dx = x_obs[1] - x_src[1] - dy = x_obs[2] - x_src[2] - dz = x_obs[3] - x_src[3] - nx = n_src[1] - ny = n_src[2] - nz = n_src[3] - end +@fastmath @inline function laplace_kernel( + ox::Float64, oy::Float64, oz::Float64, + sx::Float64, sy::Float64, sz::Float64, + nx::Float64, ny::Float64, nz::Float64 +) + dx = ox - sx + dy = oy - sy + dz = oz - sz r2 = dx*dx + dy*dy + dz*dz - r2 < 1e-30 && return 0.0 + r2 < 1e-30 && return (0.0, 0.0) rinv = inv(sqrt(r2)) + # single-layer: 1/r + single = rinv + # double-layer: (Δx·n)/r^3 r3inv = rinv * rinv * rinv - return (dx*nx + dy*ny + dz*nz) * r3inv + double = (dx*nx + dy*ny + dz*nz) * r3inv + return (single, double) end """ @@ -281,8 +242,7 @@ end interpolate_to_polar!(polar_data, patch, quad_data) Interpolate Cartesian patch data to polar quadrature points using sparse matrix multiply. -Overwrites `polar_data` using mul! function arguments, mul!(C, A, B, α, β) -> C where -C = α * A * B + β * C. +Overwrites `polar_data` using mul! function arguments, mul!(C, A, B) -> C where C = A * B. # Arguments @@ -291,17 +251,17 @@ C = α * A * B + β * C. - `P2G`: Sparse interpolation matrix """ function interpolate_to_polar!(polar_data::Array{Float64,3}, patch::Array{Float64,3}, P2G::SparseMatrixCSC{Float64,Int}) - # Flatten patch to (Ngrid × dof), apply P2G' to get (Npolar × dof) patch_flat = reshape(patch, :, size(patch, 3)) - mul!(reshape(polar_data, :, size(patch, 3)), P2G', patch_flat, 1.0, 0.0) + mul!(reshape(polar_data, :, size(patch, 3)), P2G', patch_flat) end """ - compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar) + compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, normal_orient) Compute normal vector (= ∂r/∂θ × ∂r/∂ζ) at polar quadrature points from interpolated tangent vectors. We already scaled the normals by normal_orient in the geometry construction, so we need to reapply -that here since we are recomputing the normals from the derivatives. +that here since we are recomputing the normals from the derivatives. We use inline cross products +to avoid slice allocation. # Arguments @@ -311,7 +271,6 @@ that here since we are recomputing the normals from the derivatives. - `normal_orient`: Multiplier applied to normals to make them orient out of vacuum region (+1 or -1) """ function compute_polar_normal!(n_polar::Array{Float64,3}, dr_dθ::Array{Float64,3}, dr_dζ::Array{Float64,3}, normal_orient::Int) - # Inline cross product to avoid slice allocation @inbounds for ia in axes(dr_dθ, 2), ir in axes(dr_dθ, 1) n_polar[ir, ia, 1] = dr_dθ[ir, ia, 2] * dr_dζ[ir, ia, 3] - dr_dθ[ir, ia, 3] * dr_dζ[ir, ia, 2] n_polar[ir, ia, 2] = dr_dθ[ir, ia, 3] * dr_dζ[ir, ia, 1] - dr_dθ[ir, ia, 1] * dr_dζ[ir, ia, 3] @@ -362,63 +321,128 @@ function KernelWorkspace(PATCH_DIM::Int, RAD_DIM::Int, ANG_DIM::Int) end """ - compute_3D_kernel_matrices!(grad_greenfunction, greenfunction, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER) + compute_3D_kernel_matrices!(K, G, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, Z, Gram) -Compute boundary integral kernel matrices for 3D geometries with the singular correction -algorithm from [Malhotra Plasma Phys. and Cont. Fusion 2019 024004]. -Uses multi-threading for parallel computation over observer points. +Compute the **Fourier/Galerkin-projected** 3D vacuum boundary-integral kernel blocks for +Laplace’s equation, using a high-order singular quadrature / partition-of-unity (POU) +scheme on a tensor-product `(θ, ζ)` surface grid. - - Far regions: Rectangle rule with uniform weights (1/N) - - Singular regions: Polar quadrature with partition-of-unity blending +Like the 2D kernel, this routine implements the **fused projection path** used by the vacuum solve: +it produces the projected operators in mode space **without materializing a dense** +`N×N` point-space kernel (where `N = mtheta * nzeta`). -grad_greenfunction is the double-layer kernel matrix, where each entry is -∇_{x_src} φ(x_obs, x_src) · n_src, and greenfunction is the single-layer kernel matrix, -where each entry is φ(x_obs, x_src). +## Mathematical object being discretized -# Arguments +Let `x(θ, ζ) ∈ ℝ^3` be a surface parametrization (plasma or wall surface) with outward +unit normal `n(θ, ζ)`. The Laplace kernels are: + + - **Single-layer**: `φ(x_obs, x_src) = 1 / |x_obs - x_src|` + - **Double-layer**: `∂φ/∂n_src = ∇_{x_src} φ ⋅ n_src = (x_obs - x_src) ⋅ n_src / |x_obs - x_src|^3` + +This routine computes the *discrete, projected* operators corresponding to these kernels, +using a uniform quadrature weight `dθdζ = 4π^2 / N` for the far field and a specialized +near-field correction for the singular region. + +## Arguments and block layout + + - `Kc`: Complex global projected double-layer kernel matrix (2P×2P). + - `Gc`: Complex global projected single-layer kernel matrix (2P×P). + - `observer`: `PlasmaGeometry3D` or `WallGeometry3D` object providing geometry data. + - `source`: `PlasmaGeometry3D` or `WallGeometry3D` object providing geometry data. + - `PATCH_RAD`: Half-width of the singular patch in grid points. Must satisfy `PATCH_RAD ≤ (min(source.mtheta, source.nzeta) - 1) ÷ 2` to avoid errors. + - `RAD_DIM`: Radial quadrature order on the polar grid (angular order is `2*RAD_DIM`). + - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`). + - `Z`: Complex Fourier basis sampled on the surface grid, shaped `N×P` (`P = number of retained modes`). `Z[idx, :]` contains the basis values at the surface node `idx`. + - `Gram`: Diagonal of the mode-space Gram matrix used to add the analytic “identity” term when `typeof(source) == typeof(observer)` (i.e. the same operator block that receives the Green’s-identity diagonal contribution). + +This routine fills exactly one `P×P` block view `Kc_block` (and optionally the corresponding `Gc_block`) +selected by whether observer/source are plasma or wall. - - `grad_greenfunction`: Double-layer kernel matrix (Nobs × Nsrc) filled in place +## Numerical treatment of the singularity - - `greenfunction`: Single-layer kernel matrix (Nobs × Nsrc) filled in place - - `observer`: Observer geometry (PlasmaGeometry3D) - - `source`: Source geometry (PlasmaGeometry3D) - - `PATCH_RAD`: Number of points adjacent to source point to treat as singular +The kernel is weakly singular as `x_src → x_obs`. The implementation follows the +approach used in [Malhotra Journal of Comp. Phys. 2019 108791 eq. 38] - + Total patch size in # of gridpoints = (2 * PATCH_RAD + 1) x (2 * PATCH_RAD + 1) - - `RAD_DIM`: Polar radial quadrature order. Angular order = 2 * RAD_DIM - - `INTERP_ORDER`: Lagrange interpolation order + - **Far field** (nonsingular sources): - + Must be ≤ (2 * PATCH_RAD + 1) + + Use a uniform trapezoidal/rectangle rule on the `(θ, ζ)` grid. + + For each observer point, a square patch of size `PATCH_DIM = 2*PATCH_RAD+1` + surrounding the singularity is excluded from the far-field sum. -# Threading + - **Near field** (singular patch): -This function automatically uses all available threads (`Threads.nthreads()`). -Start Julia with `julia -t auto` or set `JULIA_NUM_THREADS` to enable multi-threading. + + Extract a Cartesian `PATCH_DIM×PATCH_DIM` patch of the source geometry around the + observer-aligned source index. + + Interpolate the patch to a **polar quadrature grid** (`RAD_DIM × ANG_DIM`, with `ANG_DIM=2*RAD_DIM`) + using a precomputed sparse interpolation operator `P2G` built from tensor-product + Lagrange stencils (`INTERP_ORDER` controls the stencil width). + + Evaluate kernels on the polar grid and weight them with a **partition-of-unity** + quadrature factor `Ppou` that includes the polar Jacobian factor (roughly `r * dr * dθ`) + and a smooth cutoff function `χ(ρ)` that localizes the singular correction. + + Map the polar correction back onto the Cartesian patch via `P2G` and blend with the + far-field trapezoid contribution using `Gpou`, so the combined weight is effectively + `trap*(1-χ) + singular_correction`. + +## Fused projection and threading + +This function is written to be parallel over observer points: + + - Each thread owns a `KernelWorkspace` (scratch arrays for patch extraction, polar interpolation, + and temporary kernel values), plus per-thread accumulation buffers `proj_k` / `proj_g` + (length `P`) and a boolean `is_patch` mask to skip patch indices in the far-field loop. + + - For a given observer index `idx_obs`, the code accumulates the **projected row** + `(kernel row idx_obs) · Z` directly into `proj_k` / `proj_g` using `_accum_row!`, and then writes + these into shared buffers `KZt[:, idx_obs]` and `GZt[:, idx_obs]`. This is race-free because + each observer writes to a unique column. + + - After the threaded loop completes, the final `P×P` blocks are assembled efficiently with BLAS: + + Kc = Zᴴ * (KZt)' + Gc = Zᴴ * (GZt)' + + implemented as `mul!(Kc_block, Z', transpose(KZt))` (and similarly for `Gc`). + +Normalization by `2π` is applied to match the 2D kernel convention so the downstream “add identity” +logic is consistent between 2D/3D. + +## Important parameters + + - `PATCH_RAD`: half-width of the singular patch in grid points. Must satisfy `PATCH_RAD ≤ (min(source.mtheta, source.nzeta) - 1) ÷ 2` to avoid errors. + - `RAD_DIM`: radial quadrature order on the polar grid (angular order is `2*RAD_DIM`). + - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`). + +## Performance notes / numerical optimizations + + - **Cached quadrature data**: `get_singular_quadrature` memoizes `P2G`, `Gpou`, `Ppou`, etc. for a given + `(PATCH_RAD, RAD_DIM, INTERP_ORDER)` triple to avoid expensive rebuilds. + - **Allocation control**: all near-field arrays live in thread-local `KernelWorkspace` objects; no per-observer + heap allocation is intended in the hot path. + - **Scalar kernel evaluation**: the Laplace kernels have scalar-argument overloads to avoid view/slice creation + and to enable LLVM to keep values in registers. """ function compute_3D_kernel_matrices!( - grad_greenfunction::AbstractMatrix{Float64}, - greenfunction::AbstractMatrix{Float64}, + K::AbstractMatrix{ComplexF64}, + G::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry3D,WallGeometry3D}, source::Union{PlasmaGeometry3D,WallGeometry3D}, PATCH_RAD::Int, RAD_DIM::Int, - INTERP_ORDER::Int + INTERP_ORDER::Int, + Z::AbstractMatrix{ComplexF64}, + Gram::AbstractVector{ComplexF64} ) - num_points = observer.mtheta * observer.nzeta - dθdζ = 4π^2 / (num_points) + N, P = size(Z) # N = mtheta * nzeta, P = num_modes + dθdζ = 4π^2 / N + Zt = Matrix{ComplexF64}(transpose(Z)) # [P × M] for contiguous column access - # Get block of grad green function matrix + # Take a view of the corresponding block of the K and G matrices col_index = (source isa PlasmaGeometry3D ? 1 : 2) row_index = (observer isa PlasmaGeometry3D ? 1 : 2) - grad_greenfunction_block = view( - grad_greenfunction, - ((row_index-1)*num_points+1):(row_index*num_points), - ((col_index-1)*num_points+1):(col_index*num_points) - ) + K_block = view(K, ((row_index-1)*P+1):(row_index*P), ((col_index-1)*P+1):(col_index*P)) + G_block = view(G, ((row_index-1)*P+1):(row_index*P), :) - # Zero out green function matrix - fill!(greenfunction, 0.0) - # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997) + # G only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997) populate_greenfunction = source isa PlasmaGeometry3D # Initialize quadrature data @@ -432,38 +456,60 @@ function compute_3D_kernel_matrices!( @assert observer.mtheta ≥ PATCH_DIM "Must have observer.mtheta ≥ PATCH_DIM, got observer.mtheta=$(observer.mtheta), PATCH_DIM=$PATCH_DIM" @assert observer.nzeta ≥ PATCH_DIM "Must have observer.nzeta ≥ PATCH_DIM, got observer.nzeta=$(observer.nzeta), PATCH_DIM=$PATCH_DIM" + # Buffers for the projection: column idx_obs holds (kernel row idx_obs) · Z + KZt = zeros(ComplexF64, P, N) + GZt = zeros(ComplexF64, P, N) + # Allocate thread-local workspaces (one per thread) max_threadid = Threads.maxthreadid() workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_threadid] + proj_k_all = [zeros(ComplexF64, P) for _ in 1:max_threadid] + proj_g_all = [zeros(ComplexF64, P) for _ in 1:max_threadid] + is_patch_all = [falses(N) for _ in 1:max_threadid] # Parallel loop through observer points - Threads.@threads for idx_obs in 1:num_points + Threads.@threads for idx_obs in 1:N # Get thread-local workspace ws = workspaces[Threads.threadid()] (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar, n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws + proj_k = proj_k_all[Threads.threadid()] + proj_g = proj_g_all[Threads.threadid()] + is_patch = is_patch_all[Threads.threadid()] + + fill!(proj_k, 0.0) + fill!(proj_g, 0.0) + fill!(is_patch, false) # Convert linear index to 2D indices i_obs = mod1(idx_obs, observer.mtheta) j_obs = (idx_obs - 1) ÷ observer.mtheta + 1 - r_obs = @view observer.r[idx_obs, :] + ox = observer.r[idx_obs, 1] + oy = observer.r[idx_obs, 2] + oz = observer.r[idx_obs, 3] + + # Mark patch source indices so the far-field loop can skip them + @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM + idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta) + idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta) + is_patch[idx_pol+source.mtheta*(idx_tor-1)] = true + end # ============================================================ # FAR FIELD: Trapezoidal rule for nonsingular source points # Note: kernels return zero for r_src = r_obs # ============================================================ - @inbounds for idx_src in 1:num_points - # Evaluate kernels at grid points - r_src = @view source.r[idx_src, :] - n_src = @view source.normal[idx_src, :] - K_single = laplace_single_layer(r_obs, r_src) - K_double = laplace_double_layer(r_obs, r_src, n_src) - - # Apply weights (periodic trapezoidal rule = constant weights) + @inbounds for idx_src in 1:N + is_patch[idx_src] && continue + + sr = view(source.r, idx_src, :) + sn = view(source.normal, idx_src, :) + + far_single, far_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* dθdζ + _accum_row!(proj_k, far_double, Zt, idx_src) if populate_greenfunction - greenfunction[idx_obs, idx_src] = K_single * dθdζ + _accum_row!(proj_g, far_single, Zt, idx_src) end - grad_greenfunction_block[idx_obs, idx_src] = K_double * dθdζ end # ============================================================ @@ -482,80 +528,93 @@ function compute_3D_kernel_matrices!( # Compute normal vectors at polar points from interpolated tangent vectors compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient) - # Evaluate kernels at polar points with POU weighting + # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM - # Evaluate kernels using recomputed normal (use @view to avoid allocation) - r_src = @view r_polar[ir, ia, :] - n_src = @view n_polar[ir, ia, :] - K_single = laplace_single_layer(r_obs, r_src) - K_double = laplace_double_layer(r_obs, r_src, n_src) - - # Apply quadrature weights: area element × POU, where POU contains rdrdθ already - M_polar_single[ir, ia] = K_single * Ppou[ir, ia] * dθdζ - M_polar_double[ir, ia] = K_double * Ppou[ir, ia] * dθdζ + sr = view(r_polar, ir, ia, :) + sn = view(n_polar, ir, ia, :) + w_single, w_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* Ppou[ir, ia] .* dθdζ + M_polar_single[ir, ia] = w_single + M_polar_double[ir, ia] = w_double end # Distribute polar singular corrections back to Cartesian grid using sparse matrix # grid = P2G * polar (maps Npolar → Ngrid) - mul!(M_grid_single_flat, P2G, vec(M_polar_single)) mul!(M_grid_double_flat, P2G, vec(M_polar_double)) - M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM) M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM) + if populate_greenfunction + mul!(M_grid_single_flat, P2G, vec(M_polar_single)) + M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM) + end - # Compute remaining far-field POU contribution and near-field polar quadrature result - # We include this region in the far-field trapezoidal rule, so use Gpou = -χ here to get 1-χ + # POU correction: singular correction + (1 + Gpou) * far-field terms @inbounds for j in 1:PATCH_DIM, i in 1:PATCH_DIM # Map back to global indices idx_pol = periodic_wrap(i_obs - PATCH_RAD + i - 1, source.mtheta) idx_tor = periodic_wrap(j_obs - PATCH_RAD + j - 1, source.nzeta) idx_src = idx_pol + source.mtheta * (idx_tor - 1) + sr = view(source.r, idx_src, :) + sn = view(source.normal, idx_src, :) - # Remainder of far-field contribution on the singular grid: Gpou = -χ - r_src = @view source.r[idx_src, :] - n_src = @view source.normal[idx_src, :] - far_single = laplace_single_layer(r_obs, r_src) * Gpou[i, j] * dθdζ - far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[i, j] * dθdζ - - # Apply near + far contributions + w_single, w_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* (1.0 + Gpou[i, j]) .* dθdζ + _accum_row!(proj_k, M_grid_double[i, j] + w_double, Zt, idx_src) if populate_greenfunction - greenfunction[idx_obs, idx_src] += M_grid_single[i, j] + far_single + _accum_row!(proj_g, M_grid_single[i, j] + w_single, Zt, idx_src) + end + end + + # ── Write projected column to buffer (each idx_obs owns its column) ── + @inbounds for p in 1:P + KZt[p, idx_obs] = proj_k[p] + end + if populate_greenfunction + @inbounds for p in 1:P + GZt[p, idx_obs] = proj_g[p] end - grad_greenfunction_block[idx_obs, idx_src] += M_grid_double[i, j] + far_double end end - # Use the same normalization as in the 2D kernel so we can just add I to the diagonal + # Use the same normalization as in the 2D kernel so we can just add Gram to the diagonal # This makes the grri logic identical to the 2D kernel. - grad_greenfunction_block ./= 2π - greenfunction ./= 2π + mul!(K_block, Z', transpose(KZt)) + K_block ./= 2π + if populate_greenfunction + mul!(G_block, Z', transpose(GZt)) + G_block ./= 2π + end - # Add the term that comes from the volume integral of Green's identity - typeof(source) == typeof(observer) && begin - for i in 1:num_points - grad_greenfunction_block[i, i] += 1.0 + # Add the term that comes from the volume integral of Green's identity. + if typeof(source) == typeof(observer) + @inbounds for p in 1:P + K_block[p, p] += Gram[p] end end end """ - kernel!(grad_greenfunction, greenfunction, observer, source, params::KernelParams3D) + kernel!(Kc, Gc, observer, source, params::KernelParams3D, Z, Gram) + +Public 3D kernel entry point. Forwards to: -Dispatch wrapper for 3D kernel that forwards to `compute_3D_kernel_matrices!` with params. +`compute_3D_kernel_matrices!(Kc, Gc, observer, source, params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER, Z, Gram)`. """ function kernel!( - grad_greenfunction::AbstractMatrix{Float64}, - greenfunction::AbstractMatrix{Float64}, + Kc::AbstractMatrix{ComplexF64}, + Gc::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry3D,WallGeometry3D}, source::Union{PlasmaGeometry3D,WallGeometry3D}, - params::KernelParams3D + params::KernelParams3D, + Z::AbstractMatrix{ComplexF64}, + Gram::AbstractVector{ComplexF64} ) return compute_3D_kernel_matrices!( - grad_greenfunction, - greenfunction, + Kc, + Gc, observer, source, params.PATCH_RAD, params.RAD_DIM, - params.INTERP_ORDER + params.INTERP_ORDER, + Z, + Gram ) end diff --git a/src/Vacuum/Utilities.jl b/src/Vacuum/Utilities.jl index de4d220b..ac69a9e1 100644 --- a/src/Vacuum/Utilities.jl +++ b/src/Vacuum/Utilities.jl @@ -152,3 +152,28 @@ Inline function for fast cross product of two 3D vectors at a given index. c[idx, 3] = a1*b2 - a2*b1 end end + +# ── Helpers for small-P accumulation (avoids BLAS dispatch overhead) ────────── +""" +Accumulate `proj += w * Zt[:, col]` with SIMD. Replaces BLAS.axpy! for small P. +""" +@inline function _accum_row!(proj::AbstractVector{ComplexF64}, w::Float64, + Zt::AbstractMatrix{ComplexF64}, col::Int) + @inbounds @simd for p in eachindex(proj) + proj[p] += w * Zt[p, col] + end +end + +""" +Rank-1 update `A += conj(Zt[:, j]) * y^T`. Avoids allocating a conjugated temporary. +""" +@inline function _rank1_conj!(A::AbstractMatrix{ComplexF64}, + Zt::AbstractMatrix{ComplexF64}, j::Int, + y::AbstractVector{ComplexF64}) + @inbounds for p2 in eachindex(y) + y_p2 = y[p2] + for p1 in axes(A, 1) + A[p1, p2] += conj(Zt[p1, j]) * y_p2 + end + end +end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 2f6335d6..72ad0ad4 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -9,7 +9,7 @@ using AdaptiveArrayPools # Import parent modules import ..Equilibrium -using ..Utilities.FourierTransforms: compute_fourier_coefficients, fourier_transform!, fourier_inverse_transform! +using ..Utilities.FourierTransforms: compute_fourier_coefficients include("Utilities.jl") include("DataTypes.jl") @@ -23,44 +23,76 @@ export compute_vacuum_response, compute_vacuum_response!, compute_vacuum_field export extract_plasma_surface_at_psi """ - compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings; - green_only=false) + _compute_vacuum_response_single!( + wv, grri_in, grre_in, plasma_pts, wall_pts, + inputs::VacuumInput, wall_settings::WallShapeSettings; + n_override=nothing + ) -Compute the vacuum response matrix and both Green's functions using provided vacuum inputs. +Compute a single vacuum solve (one coupled 3D solve, or one `n`-slice in 2D) by building and solving +the boundary integral equation in mode space with an optional conducting wall present and writing out the results: -Single entry point for vacuum calculations. + - `wv`: complex vacuum response matrix in straight-fieldline mode space + - `grri_in`: interior Green's function sampled on the plasma surface in straight-fieldline mode space (real layout for backward compatibility) + - `grre_in`: exterior Green's function sampled on the plasma surface in straight-fieldline mode space (real layout for backward compatibility) + - `plasma_pts` / `wall_pts`: output point clouds for downstream plotting/diagnostics - - For **3D** (`inputs.nzeta > 1`), computes the full coupled response across all (m,n) modes defined - by `inputs.(mlow, mpert, nlow, npert)`. +## Fused kernel assembly + projection - - For **2D geometry** (`inputs.nzeta == 1`), supports either: +This routine uses a **newer kernel evaluation path** that never forms dense point-space kernel matrices. +Instead, it fuses kernel evaluation and Fourier/Galerkin projection into a single pass. - + **single-n** (`inputs.npert == 1`): computes (m,n) response for `n = inputs.nlow` - + **multi-n** (`inputs.npert > 1`): loops over `n = inputs.nlow:(inputs.nlow+inputs.npert-1)` and returns - **blocks** of the full response matrices with one block per toroidal mode number. +The key idea is: -This is the pure Julia implementation that replaces the Fortran `mscvac` function. -It computes both interior (grri) and exterior (grre) Green's functions for GPEC response calculations. + - Assemble and solve the boundary integral equation directly in `P×P` mode space. -# Arguments + - Avoid materializing `M×M` (2D) or `N×N` (3D) kernel matrices. - - `inputs`: `VacuumInput` struct with mode numbers, grid resolution, and boundary info. - - `wall_settings::WallShapeSettings`: Wall geometry configuration. - - `green_only`: If true, skip building the response matrix `wv` and return zeros for `wv` and `xzpts`. + - Uses complex basis `Z = C + iS` so projected operators are `P×P` complex. -# Returns + - The projected operators are accumulated row-by-row while kernel values are computed. + + - Memory drops from `O(M^2)` (or `O(N^2)`) down to `O(MP + P^2)` (or `O(NP + P^2)`). + + - FLOPs remain dominated by the same scaling as the two-step approach (kernel evaluation + projection), + plus an additional `O(P^3)` for the LU factorization/solve in mode space. + + - **Projected matrices** + + + Exterior projected kernel blocks are assembled into `K_ext` and `G_ext`. + + Interior operators are formed from the exterior ones using the discrete Green-identity diagonal term: + the implementation uses `K_int = 2*Gram - K_ext` for same-type source/observer blocks. This effectively + computes the kernel with an negative normal direction without recalculating the kernel. - - `wv`: Complex vacuum response matrix. + - **Solves** - + 2D single-n: `mpert × mpert` - + 2D multi-n: `(mpert*npert) × (mpert*npert)` (block diagonal) - + 3D: `num_modes × num_modes` (full coupled) + + If `nowall`, solve the plasma-only `P×P` system. + + If a wall is present, solve the coupled `2P×2P` block system. - - `grri`: Interior Green's function matrix. + - **Back-compat outputs** - - `grre`: Exterior Green's function matrix. + + Although the solve is performed in mode space, `grri_in` and `grre_in` are reconstructed into the + legacy real `M×(2P)` layout for downstream code paths that still expect that shape. - - `xzpts`: Coordinate array (mtheta×4 for 2D, mtheta*nzeta×4 for 3D) [R_plasma, Z_plasma, R_wall, Z_wall]. +## Arguments + + - **`wv::AbstractMatrix{ComplexF64}`**: output vacuum response matrix (modified in-place) + - **`grri_in::AbstractMatrix{Float64}`**: output interior Green's function (modified in-place; real/legacy layout) + - **`grre_in::AbstractMatrix{Float64}`**: output exterior Green's function (modified in-place; real/legacy layout) + - **`plasma_pts::AbstractMatrix{Float64}`**: plasma surface coordinates (modified in-place) + - **`wall_pts::AbstractMatrix{Float64}`**: wall surface coordinates (modified in-place) + - **`inputs::VacuumInput`**: mode ranges, grid resolution, and geometry settings + - **`wall_settings::WallShapeSettings`**: wall geometry configuration + - **`n_override::Union{Nothing,Int}`**: optional toroidal mode number override (only used for 2D) + +## 2D vs 3D behavior + + - **3D (`inputs.nzeta > 1`)**: computes the full coupled response across all `(m, n)` modes specified by + `inputs.(mlow, mpert, nlow, npert)` in a single call using the 3D kernel method in Kernel3D.jl. + - **2D (`inputs.nzeta == 1`)**: + + If `inputs.npert == 1`, computes the response for `n = inputs.nlow` using the 2D kernel method in Kernel2D.jl. + + If `inputs.npert > 1`, the public driver loops over `n` and calls this function once per `n`, + writing block columns into the full output matrices using the 2D kernel method in Kernel2D.jl. """ @with_pool pool function _compute_vacuum_response_single!( wv::AbstractMatrix{ComplexF64}, @@ -70,109 +102,127 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC wall_pts::AbstractMatrix{Float64}, inputs::VacuumInput, wall_settings::WallShapeSettings; - n_override::Union{Nothing,Int}=nothing, - green_only::Bool=false + n_override::Union{Nothing,Int}=nothing ) + (; mtheta, mpert, mlow, nzeta, npert, nlow) = inputs + # Initialize surface geometries - plasma_surf = inputs.nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs) - wall = inputs.nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings) + plasma_surf = nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs) + wall = nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings) # Compute Fourier basis coefficients ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing - cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(inputs.mtheta, inputs.mpert, inputs.mlow, inputs.nzeta, inputs.npert, inputs.nlow; n_2D=n_override, ν=ν) - num_points_surf, num_modes = size(cos_mn_basis) + exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν) + num_points, num_modes = size(exp_mn_basis) # Create kernel parameters structs used to dispatch to the correct kernel - if inputs.nzeta > 1 - # Hardcode these values for now - can expose to the user in the future - kparams = KernelParams3D(11, 20, 5) - else - kparams = KernelParams2D(n_override) + # Hardcode these values for now - can expose to the user in the future + PATCH_RAD = 11 + RAD_DIM = 20 + INTERP_ORDER = 5 + kparams = nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override) + + # Scales kernel matrix sizes by a factor of 2 if a wall is present (don't allocate unless needed) + wall_fac = wall.nowall ? 1 : 2 + + # Gram matrix diagonal for the discrete Fourier basis on the uniform grid. + # + # For the basis produced by `compute_fourier_coefficients(...)` (complex exponentials sampled on a + # uniform grid), the discrete inner products satisfy: + # + # ZᴴZ = num_points · I + # + # up to roundoff, so the Gram matrix is diagonal. Store only the diagonal as a length-P vector. + Gram = acquire!(pool, ComplexF64, num_modes) + fill!(Gram, ComplexF64(num_points)) + + # Projected kernel matrices [P × P complex] + K_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, wall_fac * num_modes) + G_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, num_modes) + K_int = similar!(pool, K_ext) + G_int = similar!(pool, G_ext) + + # Fused projected kernel: compute Z^H K Z and Z^H G Z for all operator blocks + # Plasma-plasma block + kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) + # Wall-plasma, plasma-wall, wall-wall blocks + if !wall.nowall + # Wall-plasma, plasma-wall, wall-wall blocks + kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram) end - # Active rows for computation (plasma only if no wall, plasma+wall if wall present) - num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf + # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext + K_int .= -K_ext + @inbounds for p in 1:num_modes + K_int[p, p] += 2 * Gram[p] + end + if !wall.nowall + @inbounds for p in 1:num_modes + K_int[num_modes+p, num_modes+p] += 2 * Gram[p] + end + end + G_int .= G_ext + + # Solve projected BIEs for exterior and interior kernels + F_ext = lu!(K_ext) + ldiv!(F_ext, G_ext) + F_int = lu!(K_int) + ldiv!(F_int, G_int) + + # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G + wv .= view(G_ext, 1:num_modes, :) + @inbounds for p in 1:num_modes + @views wv[p, :] .*= Gram[p] + end + wv .*= (4π^2 / num_points) - # Local work arrays - grad_green = zeros!(pool, num_points_total, num_points_total) - green_temp = zeros!(pool, num_points_surf, num_points_surf) + # Enforce Hermitian symmetry if desired + inputs.force_wv_symmetry && hermitianpart!(wv) + # Backward-compatible reconstruction: grre/grri in M×2P real layout + # Need to convert mode space to physical space and unpack the real and imaginary parts + # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code + # perhaps make it a complex P * P matrix? Then don't need any of this section # Views into output Green's function matrices for the active rows/columns - grre = @view grre_in[1:num_points_total, :] - grri = @view grri_in[1:num_points_total, :] - - # Plasma–Plasma block - kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) - - # Fourier transform obs=plasma, src=plasma block - fourier_transform!(grre, green_temp, cos_mn_basis) - fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes) - + grre = @view grre_in[1:(wall_fac*num_points), :] + grri = @view grri_in[1:(wall_fac*num_points), :] + temp = zeros!(pool, ComplexF64, num_points, num_modes) + + mul!(temp, exp_mn_basis, view(G_ext, 1:num_modes, :)) + @view(grre[1:num_points, 1:num_modes]) .= real.(temp) + @view(grre[1:num_points, (num_modes+1):(2*num_modes)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, 1:num_modes, :)) + @view(grri[1:num_points, 1:num_modes]) .= real.(temp) + @view(grri[1:num_points, (num_modes+1):(2*num_modes)]) .= imag.(temp) if !wall.nowall - # Plasma–Wall block - kernel!(grad_green, green_temp, plasma_surf, wall, kparams) - # Wall–Wall block - kernel!(grad_green, green_temp, wall, wall, kparams) - # Wall–Plasma block - kernel!(grad_green, green_temp, wall, plasma_surf, kparams) - # Fourier transform obs=wall, src=plasma block - fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf) - fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes) + mul!(temp, exp_mn_basis, view(G_ext, (num_modes+1):(2*num_modes), :)) + @view(grre[(num_points+1):(2*num_points), 1:num_modes]) .= real.(temp) + @view(grre[(num_points+1):(2*num_points), (num_modes+1):(2*num_modes)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, (num_modes+1):(2*num_modes), :)) + @view(grri[(num_points+1):(2*num_points), 1:num_modes]) .= real.(temp) + @view(grri[(num_points+1):(2*num_points), (num_modes+1):(2*num_modes)]) .= imag.(temp) end - # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1) - grri .= grre # start from same as exterior - grad_green_interior = similar!(pool, grad_green) - grad_green_interior .= grad_green - - # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel - F_ext = lu!(grad_green) - ldiv!(F_ext, grre) - - # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal - grad_green_interior .*= -1 - for i in 1:num_points_total - grad_green_interior[i, i] += 2.0 - end - F_int = lu!(grad_green_interior) - ldiv!(F_int, grri) - - # Always initialise wv to zero so that green_only keeps it zeroed - if !green_only - # Perform inverse Fourier transforms to get response matrix components [Chance Phys. Plasmas 2007 052506 eq. 115-118] - arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4) - fourier_inverse_transform!(arr, grre, cos_mn_basis) - fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes) - fourier_inverse_transform!(ari, grre, sin_mn_basis) - fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes) - - # Final form of vacuum response matrix [Chance Phys. Plasmas 2007 052506 eq. 114] - wv .= complex.(arr .+ aii, air .- ari) - inputs.force_wv_symmetry && hermitianpart!(wv) - - # Fill coordinate arrays - if inputs.nzeta > 1 # 3D - plasma_pts .= plasma_surf.r - wall_pts .= wall.r - else # 2D - @views begin - plasma_pts[:, 1] .= plasma_surf.x - plasma_pts[:, 2] .= 0.0 - plasma_pts[:, 3] .= plasma_surf.z - wall_pts[:, 1] .= wall.x - wall_pts[:, 2] .= 0.0 - wall_pts[:, 3] .= wall.z - end + if nzeta > 1 # 3D + plasma_pts .= plasma_surf.r + wall_pts .= wall.r + else # 2D + @views begin + plasma_pts[:, 1] .= plasma_surf.x + plasma_pts[:, 2] .= 0.0 + plasma_pts[:, 3] .= plasma_surf.z + wall_pts[:, 1] .= wall.x + wall_pts[:, 2] .= 0.0 + wall_pts[:, 3] .= wall.z end end end """ - compute_vacuum_response( - inputs::VacuumInput, - wall_settings::WallShapeSettings; - green_only=false) + compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings) Allocate and return the vacuum response matrix and Green's functions for the given vacuum inputs. @@ -182,7 +232,7 @@ implementation. For performance‑critical paths that already own preallocated s (e.g. `ForceFreeStates.VacuumData`), prefer the in‑place method to avoid extra heap allocations. """ -@with_pool pool function compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings; green_only::Bool=false) +@with_pool pool function compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings) # Allocate storage for the vacuum response matrix and Green's functions numpoints = inputs.mtheta * inputs.nzeta @@ -195,17 +245,13 @@ heap allocations. wall_pts=zeros!(pool, numpoints, 3) ) - compute_vacuum_response!(vac, inputs, wall_settings; green_only=green_only) + compute_vacuum_response!(vac, inputs, wall_settings) return vac.wv, vac.grri, vac.grre, vac.plasma_pts, vac.wall_pts end """ - compute_vacuum_response!( - vac_data, - inputs::VacuumInput, - wall_settings::WallShapeSettings; - green_only=false) + compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings) In-place variant that computes the vacuum response and directly populates the arrays stored in `vac_data`. @@ -222,7 +268,7 @@ compatible sizes: This is designed to work with `ForceFreeStates.VacuumData` but does not depend on its concrete type (duck-typed on field names only). """ -function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings; green_only::Bool=false) +function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings) mpert = inputs.mpert npert = inputs.npert @@ -237,8 +283,7 @@ function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings:: vac_data.plasma_pts, vac_data.wall_pts, inputs, - wall_settings; - green_only=green_only + wall_settings ) else # 2D vacuum: fill diagonal blocks of the response matrix @@ -262,8 +307,7 @@ function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings:: vac_data.wall_pts, inputs, wall_settings; - n_override=n, - green_only=green_only + n_override=n ) end end diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl index debb8299..3ae4b648 100644 --- a/test/runtests_vacuum.jl +++ b/test/runtests_vacuum.jl @@ -1,4 +1,5 @@ @testset "Vacuum.jl Unit Tests" begin + using LinearAlgebra @testset "Vacuum.jl (2D)" begin @@ -445,6 +446,120 @@ @test size(plasma_pts) == (16, 3) end end + + # ------------------------------------------------------------------------- + @testset "fused vs two-step Galerkin (2D, nowall)" begin + # Small case where both Galerkin paths are cheap: compare K_c, G_c + # assembled via the full M×M kernel + projection against the fused + # projected kernels from the unified `kernel!` API. + inputs = VacuumInput( + mtheta_in=17, + nzeta_in=1, + x=collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=17))), + z=collect(0.3 .* sin.(range(0, 2π, length=17))), + ν=zeros(17), + mlow=1, + mpert=2, + nlow=1, + npert=1, + nzeta=1, + mtheta=32 + ) + wall_settings = WallShapeSettings(shape="nowall") + + plasma_surf = GeneralizedPerturbedEquilibrium.Vacuum.PlasmaGeometry(inputs) + kparams = GeneralizedPerturbedEquilibrium.Vacuum.KernelParams2D(inputs.nlow) + + # Fourier basis on the surface grid + exp_mn_basis = GeneralizedPerturbedEquilibrium.Utilities.FourierTransforms.compute_fourier_coefficients( + inputs.mtheta, + inputs.mpert, + inputs.mlow, + inputs.nzeta, + inputs.npert, + inputs.nlow; + n_2D=inputs.nlow, + ν=plasma_surf.ν + ) + M, P = size(exp_mn_basis) + Gram = fill(ComplexF64(M), P) + + # --- Two-step Galerkin: materialize full kernels then project --- + grad_green_full = zeros(Float64, 2M, 2M) + green_full = zeros(Float64, M, M) + GeneralizedPerturbedEquilibrium.Vacuum.kernel!( + grad_green_full, + green_full, + plasma_surf, + plasma_surf, + kparams + ) + + # Exterior projected kernels from full matrices: K_c = Z^H K Z, G_c = Z^H G Z + K_c_two = zeros(ComplexF64, P, P) + G_c_two = zeros(ComplexF64, P, P) + tmp = zeros(ComplexF64, M, P) + + grad_pp = @view grad_green_full[1:M, 1:M] + mul!(tmp, grad_pp, exp_mn_basis) + mul!(K_c_two, exp_mn_basis', tmp) + mul!(tmp, green_full, exp_mn_basis) + mul!(G_c_two, exp_mn_basis', tmp) + + # --- Fused Galerkin via unified kernel! --- + K_c_fused = zeros(ComplexF64, P, P) + G_c_fused = zeros(ComplexF64, P, P) + GeneralizedPerturbedEquilibrium.Vacuum.projected_kernel!( + K_c_fused, + G_c_fused, + plasma_surf, + plasma_surf, + kparams, + exp_mn_basis, + Gram + ) + + @test isapprox(K_c_fused, K_c_two; rtol=1e-10, atol=1e-12) + @test isapprox(G_c_fused, G_c_two; rtol=1e-10, atol=1e-12) + end + + # ------------------------------------------------------------------------- + @testset "wall Galerkin vs collocation (2D, conformal)" begin + mtheta_eq = 17 + mtheta = 128 + mpert = 3 + boundary_x = collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=mtheta_eq))) + boundary_z = collect(0.3 .* sin.(range(0, 2π, length=mtheta_eq))) + + inputs_colloc = VacuumInput( + mtheta_in=mtheta_eq, nzeta_in=1, + x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq), + mlow=1, mpert=mpert, nlow=1, npert=1, + nzeta=1, mtheta=mtheta, + use_galerkin=false + ) + inputs_galerkin = VacuumInput( + mtheta_in=mtheta_eq, nzeta_in=1, + x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq), + mlow=1, mpert=mpert, nlow=1, npert=1, + nzeta=1, mtheta=mtheta, + use_galerkin=true + ) + + wall_settings = WallShapeSettings(shape="conformal", a=0.5) + + wv_c, grri_c, grre_c, _, _ = compute_vacuum_response(inputs_colloc, wall_settings) + wv_g, grri_g, grre_g, _, _ = compute_vacuum_response(inputs_galerkin, wall_settings) + + M = mtheta + P = mpert + + @test isapprox(wv_g, wv_c; rtol=1e-8) + @test isapprox(grre_g[1:M, 1:(2*P)], grre_c[1:M, 1:(2*P)]; rtol=1e-8) + @test isapprox(grri_g[1:M, 1:(2*P)], grri_c[1:M, 1:(2*P)]; rtol=1e-8) + @test isapprox(grre_g[(M+1):(2*M), 1:(2*P)], grre_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8) + @test isapprox(grri_g[(M+1):(2*M), 1:(2*P)], grri_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8) + end end # ------------------------------------------------------------------------- @@ -631,19 +746,43 @@ @test isapprox(wv, wv', rtol=1e-12) end - @testset "compute_vacuum_response 3D green_only" begin - inputs = _make_3d_inputs(mtheta=32, nzeta=32, mtheta_eq=17) - wall_settings = WallShapeSettings(shape="nowall") - wv, grri, grre, plasma_pts, wall_pts = compute_vacuum_response(inputs, wall_settings; green_only=true) + @testset "wall Galerkin vs collocation (3D, conformal)" begin + mtheta_eq = 17 + mtheta = 32 + nzeta = 32 + mpert = 2 + npert = 2 + boundary_x = collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=mtheta_eq))) + boundary_z = collect(0.3 .* sin.(range(0, 2π, length=mtheta_eq))) + + inputs_colloc = VacuumInput( + mtheta_in=mtheta_eq, nzeta_in=1, + x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq), + mlow=1, mpert=mpert, nlow=0, npert=npert, + nzeta=nzeta, mtheta=mtheta, + use_galerkin=false + ) + inputs_galerkin = VacuumInput( + mtheta_in=mtheta_eq, nzeta_in=1, + x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq), + mlow=1, mpert=mpert, nlow=0, npert=npert, + nzeta=nzeta, mtheta=mtheta, + use_galerkin=true + ) - numpoints = inputs.mtheta * inputs.nzeta - num_modes = inputs.mpert * inputs.npert - @test size(wv) == (num_modes, num_modes) - @test all(wv .== 0) - @test size(grri) == (2 * numpoints, 2 * num_modes) - @test size(grre) == (2 * numpoints, 2 * num_modes) - @test all(isfinite, grri) - @test all(isfinite, grre) + wall_settings = WallShapeSettings(shape="conformal", a=0.3) + + wv_c, grri_c, grre_c, _, _ = compute_vacuum_response(inputs_colloc, wall_settings) + wv_g, grri_g, grre_g, _, _ = compute_vacuum_response(inputs_galerkin, wall_settings) + + M = mtheta * nzeta + P = mpert * npert + + @test isapprox(wv_g, wv_c; rtol=1e-8) + @test isapprox(grre_g[1:M, 1:(2*P)], grre_c[1:M, 1:(2*P)]; rtol=1e-8) + @test isapprox(grri_g[1:M, 1:(2*P)], grri_c[1:M, 1:(2*P)]; rtol=1e-8) + @test isapprox(grre_g[(M+1):(2*M), 1:(2*P)], grre_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8) + @test isapprox(grri_g[(M+1):(2*M), 1:(2*P)], grri_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8) end @testset "Kernel3D laplace_single_layer" begin