From b4ebfee60160849c6fe0dc5e802fd3f953f2f429 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Fri, 13 Mar 2026 10:34:16 -0400 Subject: [PATCH 01/23] VACUUM - WIP - removing green_only and adding a galerkin projection before inverting the system --- src/ForceFreeStates/Free.jl | 2 +- src/PerturbedEquilibrium/SingularCoupling.jl | 2 +- src/Vacuum/DataTypes.jl | 10 +- src/Vacuum/Vacuum.jl | 206 ++++++++++++------- test/runtests_vacuum.jl | 15 -- 5 files changed, 140 insertions(+), 95 deletions(-) diff --git a/src/ForceFreeStates/Free.jl b/src/ForceFreeStates/Free.jl index 9c8ec89c..65455d9e 100644 --- a/src/ForceFreeStates/Free.jl +++ b/src/ForceFreeStates/Free.jl @@ -28,7 +28,7 @@ and data dumping. # Compute vacuum response matrix in-place (handles 2D single-n, 2D multi-n block-diagonal, and 3D) vac_inputs = Vacuum.VacuumInput(equil, psilim, ctrl.mthvac, ctrl.nzvac, mpert, mlow, npert, nlow; force_wv_symmetry=ctrl.force_wv_symmetry) - Vacuum.compute_vacuum_response!(vac_data, vac_inputs, wall_settings) + @time Vacuum.compute_vacuum_response!(vac_data, vac_inputs, wall_settings) # Scale by (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 
126] singfac = vec((mlow:mhigh) .- qlim .* (nlow:nhigh)') diff --git a/src/PerturbedEquilibrium/SingularCoupling.jl b/src/PerturbedEquilibrium/SingularCoupling.jl index b27da960..40a8c1ce 100644 --- a/src/PerturbedEquilibrium/SingularCoupling.jl +++ b/src/PerturbedEquilibrium/SingularCoupling.jl @@ -154,7 +154,7 @@ function compute_singular_coupling_metrics!( # Compute Green's functions at this surface for this n # TODO: This assumes an initial 2D equilibrum, getting 2D Green's functions for independent n vac_input = Vacuum.VacuumInput(equil, sing_surf.psifac, mtheta, 1, mpert, mlow, 1, nn) - _, grri, grre, _, _ = Vacuum.compute_vacuum_response(vac_input, wall_settings; green_only=true) + _, grri, grre, _, _ = Vacuum.compute_vacuum_response(vac_input, wall_settings) # Store in singular surface struct (overwrites for each n) ffs_intr.sing[s].grri = grri diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl index 63d029b4..01811b41 100644 --- a/src/Vacuum/DataTypes.jl +++ b/src/Vacuum/DataTypes.jl @@ -22,6 +22,9 @@ nzeta > 1 for 3D vacuum calculation. - `mtheta::Int`: Number of vacuum calculation poloidal grid points - `nzeta::Int`: Number of vacuum calculation toroidal grid points (1 for 2D vacuum calculation, > 1 for 3D vacuum calculation) - `force_wv_symmetry::Bool`: Boolean flag to enforce symmetry in the vacuum response matrix + - `use_galerkin::Bool`: Use Galerkin projection to solve in truncated Fourier space [O(P³)] + instead of full collocation [O(M³)]. Only applies to the no-wall case; wall cases always + use collocation. Defaults to `false`. """ @kwdef struct VacuumInput x::Vector{Float64} = Float64[] @@ -37,6 +40,7 @@ nzeta > 1 for 3D vacuum calculation. 
mtheta::Int = 1 nzeta::Int = 1 force_wv_symmetry::Bool = true + use_galerkin::Bool = false end """ @@ -76,7 +80,8 @@ function VacuumInput( mlow::Int, npert::Int, nlow::Int; - force_wv_symmetry::Bool=true + force_wv_symmetry::Bool=true, + use_galerkin::Bool=false ) # Extract plasma surface geometry at this psi r, z, ν = extract_plasma_surface_at_psi(equil, ψ) @@ -92,7 +97,8 @@ function VacuumInput( npert=npert, mtheta=mtheta, nzeta=nzeta, - force_wv_symmetry=force_wv_symmetry + force_wv_symmetry=force_wv_symmetry, + use_galerkin=true ) end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 2f6335d6..99ae36b1 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -23,8 +23,7 @@ export compute_vacuum_response, compute_vacuum_response!, compute_vacuum_field export extract_plasma_surface_at_psi """ - compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings; - green_only=false) + compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings) Compute the vacuum response matrix and both Green's functions using provided vacuum inputs. @@ -46,7 +45,6 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC - `inputs`: `VacuumInput` struct with mode numbers, grid resolution, and boundary info. - `wall_settings::WallShapeSettings`: Wall geometry configuration. - - `green_only`: If true, skip building the response matrix `wv` and return zeros for `wv` and `xzpts`. 
# Returns @@ -70,8 +68,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC wall_pts::AbstractMatrix{Float64}, inputs::VacuumInput, wall_settings::WallShapeSettings; - n_override::Union{Nothing,Int}=nothing, - green_only::Bool=false + n_override::Union{Nothing,Int}=nothing ) # Initialize surface geometries @@ -84,12 +81,11 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC num_points_surf, num_modes = size(cos_mn_basis) # Create kernel parameters structs used to dispatch to the correct kernel - if inputs.nzeta > 1 - # Hardcode these values for now - can expose to the user in the future - kparams = KernelParams3D(11, 20, 5) - else - kparams = KernelParams2D(n_override) - end + # Hardcode these values for now - can expose to the user in the future + PATCH_RAD = 11 + RAD_DIM = 20 + INTERP_ORDER = 5 + kparams = inputs.nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override) # Active rows for computation (plasma only if no wall, plasma+wall if wall present) num_points_total = wall.nowall ? 
num_points_surf : 2 * num_points_surf @@ -105,74 +101,138 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # Plasma–Plasma block kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) - # Fourier transform obs=plasma, src=plasma block - fourier_transform!(grre, green_temp, cos_mn_basis) - fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes) - - if !wall.nowall - # Plasma–Wall block - kernel!(grad_green, green_temp, plasma_surf, wall, kparams) - # Wall–Wall block - kernel!(grad_green, green_temp, wall, wall, kparams) - # Wall–Plasma block - kernel!(grad_green, green_temp, wall, plasma_surf, kparams) - # Fourier transform obs=wall, src=plasma block - fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf) - fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes) - end + if wall.nowall && inputs.use_galerkin + # ================================================================ + # Galerkin projection: solve in Fourier space [2P × 2P] instead of + # the full collocation system [num_points_surf × num_points_surf]. + # + # Instead of: wv = F_inv * (K \ (G * F)) O(M³) + # We compute: wv ~ (F'KF) \ (F'GF) O(M²P + P³) + # + # where M = num_points_surf and P = num_modes and + # F = [cos_basis | sin_basis] is the [M × 2P] Fourier basis and + # K = grad_green is the [M × M] double-layer kernel matrix. 
+ # ================================================================ + temp = zeros!(pool, num_points_surf, num_modes) + + # K_proj = F' * grad_green * F [2 * num_modes × 2 * num_modes] + K_proj = zeros(2 * num_modes, 2 * num_modes) + mul!(temp, grad_green, cos_mn_basis) + mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp) + mul!(@view(K_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp) + mul!(temp, grad_green, sin_mn_basis) + mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) + mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) + + # G_proj = F' * green_temp * F [2 * num_modes × 2 * num_modes] + G_proj = zeros(2 * num_modes, 2 * num_modes) + mul!(temp, green_temp, cos_mn_basis) + mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp) + mul!(@view(G_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp) + mul!(temp, green_temp, sin_mn_basis) + mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) + mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) + + # Gram matrix F'F (needed for interior kernel and wv normalization) + FtF = zeros(2 * num_modes, 2 * num_modes) + mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis) + mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis) + mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis) + mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis) + + # Solve projected systems via SVD-based pseudoinverse. The truncated Fourier + # basis with the SFL angle correction (ν) can make the projected operators + # rank-deficient — the interior BIE operator in particular has a physical + # null space (constant potential mode). 
The pseudoinverse finds the + # minimum-norm solution, correctly projecting out numerically null directions + # without affecting well-resolved modes. + Y_ext = pinv(K_proj) * G_proj + + # Interior kernel in projected space: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj + K_proj_int = 2 .* FtF .- K_proj + Y_int = pinv(K_proj_int) * G_proj + + # Reconstruct physical-space Green's functions for backward compatibility + # grre = F * Y = cos * Y[1:P, :] + sin * Y[P+1:2P, :] + mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :])) + mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) + mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :])) + mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) + + # Extract wv: the [arr air; ari aii] blocks equal (4π²/M) * F'F * Y_ext, + # then wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114] + wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext) + wv .= complex.( + @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), + @view(wv_blocks[1:num_modes, (num_modes+1):(2*num_modes)]) .- @view(wv_blocks[(num_modes+1):(2*num_modes), 1:num_modes]) + ) + else + # ================================================================ + # Collocation approach: solve full physical-space system [M × M] + # Handles both no-wall and wall cases. + # ================================================================ + + # FT plasma→plasma Green's function (must precede kernel! 
calls that overwrite green_temp) + fourier_transform!(grre, green_temp, cos_mn_basis) + fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes) + + if !wall.nowall + # Plasma–Wall block + kernel!(grad_green, green_temp, plasma_surf, wall, kparams) + # Wall–Wall block + kernel!(grad_green, green_temp, wall, wall, kparams) + # Wall–Plasma block + kernel!(grad_green, green_temp, wall, plasma_surf, kparams) + # Fourier transform obs=wall, src=plasma block + fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf) + fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes) + end - # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1) - grri .= grre # start from same as exterior - grad_green_interior = similar!(pool, grad_green) - grad_green_interior .= grad_green + # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1) + grri .= grre # start from same as exterior + grad_green_interior = similar!(pool, grad_green) + grad_green_interior .= grad_green - # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel - F_ext = lu!(grad_green) - ldiv!(F_ext, grre) + # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel + F_ext = lu!(grad_green) + ldiv!(F_ext, grre) - # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal - grad_green_interior .*= -1 - for i in 1:num_points_total - grad_green_interior[i, i] += 2.0 - end - F_int = lu!(grad_green_interior) - ldiv!(F_int, grri) + # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal + grad_green_interior .*= -1 + for i in 1:num_points_total + grad_green_interior[i, i] += 2.0 + end + F_int = lu!(grad_green_interior) + ldiv!(F_int, grri) - # Always initialise wv to zero 
so that green_only keeps it zeroed - if !green_only - # Perform inverse Fourier transforms to get response matrix components [Chance Phys. Plasmas 2007 052506 eq. 115-118] + # Inverse Fourier transform to extract wv [Chance Phys. Plasmas 2007 052506 eq. 115-118] arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4) fourier_inverse_transform!(arr, grre, cos_mn_basis) fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes) fourier_inverse_transform!(ari, grre, sin_mn_basis) fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes) - - # Final form of vacuum response matrix [Chance Phys. Plasmas 2007 052506 eq. 114] wv .= complex.(arr .+ aii, air .- ari) - inputs.force_wv_symmetry && hermitianpart!(wv) - - # Fill coordinate arrays - if inputs.nzeta > 1 # 3D - plasma_pts .= plasma_surf.r - wall_pts .= wall.r - else # 2D - @views begin - plasma_pts[:, 1] .= plasma_surf.x - plasma_pts[:, 2] .= 0.0 - plasma_pts[:, 3] .= plasma_surf.z - wall_pts[:, 1] .= wall.x - wall_pts[:, 2] .= 0.0 - wall_pts[:, 3] .= wall.z - end + end + + inputs.force_wv_symmetry && hermitianpart!(wv) + + if inputs.nzeta > 1 # 3D + plasma_pts .= plasma_surf.r + wall_pts .= wall.r + else # 2D + @views begin + plasma_pts[:, 1] .= plasma_surf.x + plasma_pts[:, 2] .= 0.0 + plasma_pts[:, 3] .= plasma_surf.z + wall_pts[:, 1] .= wall.x + wall_pts[:, 2] .= 0.0 + wall_pts[:, 3] .= wall.z end end end """ - compute_vacuum_response( - inputs::VacuumInput, - wall_settings::WallShapeSettings; - green_only=false) + compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings) Allocate and return the vacuum response matrix and Green's functions for the given vacuum inputs. @@ -182,7 +242,7 @@ implementation. For performance‑critical paths that already own preallocated s (e.g. `ForceFreeStates.VacuumData`), prefer the in‑place method to avoid extra heap allocations. 
""" -@with_pool pool function compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings; green_only::Bool=false) +@with_pool pool function compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings) # Allocate storage for the vacuum response matrix and Green's functions numpoints = inputs.mtheta * inputs.nzeta @@ -195,17 +255,13 @@ heap allocations. wall_pts=zeros!(pool, numpoints, 3) ) - compute_vacuum_response!(vac, inputs, wall_settings; green_only=green_only) + compute_vacuum_response!(vac, inputs, wall_settings) return vac.wv, vac.grri, vac.grre, vac.plasma_pts, vac.wall_pts end """ - compute_vacuum_response!( - vac_data, - inputs::VacuumInput, - wall_settings::WallShapeSettings; - green_only=false) + compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings) In-place variant that computes the vacuum response and directly populates the arrays stored in `vac_data`. @@ -222,7 +278,7 @@ compatible sizes: This is designed to work with `ForceFreeStates.VacuumData` but does not depend on its concrete type (duck-typed on field names only). 
""" -function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings; green_only::Bool=false) +function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings) mpert = inputs.mpert npert = inputs.npert @@ -237,8 +293,7 @@ function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings:: vac_data.plasma_pts, vac_data.wall_pts, inputs, - wall_settings; - green_only=green_only + wall_settings ) else # 2D vacuum: fill diagonal blocks of the response matrix @@ -262,8 +317,7 @@ function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings:: vac_data.wall_pts, inputs, wall_settings; - n_override=n, - green_only=green_only + n_override=n ) end end diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl index debb8299..d51bd622 100644 --- a/test/runtests_vacuum.jl +++ b/test/runtests_vacuum.jl @@ -631,21 +631,6 @@ @test isapprox(wv, wv', rtol=1e-12) end - @testset "compute_vacuum_response 3D green_only" begin - inputs = _make_3d_inputs(mtheta=32, nzeta=32, mtheta_eq=17) - wall_settings = WallShapeSettings(shape="nowall") - wv, grri, grre, plasma_pts, wall_pts = compute_vacuum_response(inputs, wall_settings; green_only=true) - - numpoints = inputs.mtheta * inputs.nzeta - num_modes = inputs.mpert * inputs.npert - @test size(wv) == (num_modes, num_modes) - @test all(wv .== 0) - @test size(grri) == (2 * numpoints, 2 * num_modes) - @test size(grre) == (2 * numpoints, 2 * num_modes) - @test all(isfinite, grri) - @test all(isfinite, grre) - end - @testset "Kernel3D laplace_single_layer" begin x_obs = [1.0, 0.0, 0.0] x_src = [2.0, 0.0, 0.0] From 388240dcdd885abad3efa786069215f50f8a8648 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Fri, 13 Mar 2026 11:04:10 -0400 Subject: [PATCH 02/23] VACUUM - WIP - adding rough profiling for the code --- src/Vacuum/Vacuum.jl | 211 +++++++++++++++++++++++++------------------ 1 file changed, 122 insertions(+), 89 deletions(-) diff --git 
a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 99ae36b1..8796048d 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -72,12 +72,27 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC ) # Initialize surface geometries - plasma_surf = inputs.nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs) - wall = inputs.nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings) + geom_timing = @timed begin + plasma_surf = inputs.nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs) + wall = inputs.nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings) + end + println(" Compute geometry TIME=$(round(geom_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(geom_timing.bytes))") # Compute Fourier basis coefficients - ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing - cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(inputs.mtheta, inputs.mpert, inputs.mlow, inputs.nzeta, inputs.npert, inputs.nlow; n_2D=n_override, ν=ν) + basis_timing = @timed begin + ν = hasproperty(plasma_surf, :ν) ? 
plasma_surf.ν : nothing + cos_mn_basis, sin_mn_basis = compute_fourier_coefficients( + inputs.mtheta, + inputs.mpert, + inputs.mlow, + inputs.nzeta, + inputs.npert, + inputs.nlow; + n_2D=n_override, + ν=ν + ) + end + println(" Compute Fourier basis TIME=$(round(basis_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(basis_timing.bytes))") num_points_surf, num_modes = size(cos_mn_basis) # Create kernel parameters structs used to dispatch to the correct kernel @@ -99,7 +114,10 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC grri = @view grri_in[1:num_points_total, :] # Plasma–Plasma block - kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) + pp_kernel_timing = @timed begin + kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) + end + println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") if wall.nowall && inputs.use_galerkin # ================================================================ @@ -115,57 +133,60 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # ================================================================ temp = zeros!(pool, num_points_surf, num_modes) - # K_proj = F' * grad_green * F [2 * num_modes × 2 * num_modes] - K_proj = zeros(2 * num_modes, 2 * num_modes) - mul!(temp, grad_green, cos_mn_basis) - mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp) - mul!(@view(K_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp) - mul!(temp, grad_green, sin_mn_basis) - mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) - mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) - - # G_proj = F' * green_temp * F [2 * num_modes × 2 * num_modes] - G_proj = zeros(2 * num_modes, 2 * num_modes) - mul!(temp, green_temp, cos_mn_basis) - mul!(@view(G_proj[1:num_modes, 
1:num_modes]), cos_mn_basis', temp) - mul!(@view(G_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp) - mul!(temp, green_temp, sin_mn_basis) - mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) - mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) - - # Gram matrix F'F (needed for interior kernel and wv normalization) - FtF = zeros(2 * num_modes, 2 * num_modes) - mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis) - mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis) - mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis) - mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis) - - # Solve projected systems via SVD-based pseudoinverse. The truncated Fourier - # basis with the SFL angle correction (ν) can make the projected operators - # rank-deficient — the interior BIE operator in particular has a physical - # null space (constant potential mode). The pseudoinverse finds the - # minimum-norm solution, correctly projecting out numerically null directions - # without affecting well-resolved modes. - Y_ext = pinv(K_proj) * G_proj - - # Interior kernel in projected space: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj - K_proj_int = 2 .* FtF .- K_proj - Y_int = pinv(K_proj_int) * G_proj - - # Reconstruct physical-space Green's functions for backward compatibility - # grre = F * Y = cos * Y[1:P, :] + sin * Y[P+1:2P, :] - mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :])) - mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) - mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :])) - mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) - - # Extract wv: the [arr air; ari aii] blocks equal (4π²/M) * F'F * Y_ext, - # then wv = complex(arr + aii, air - ari) [Chance 2007 eq. 
114] - wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext) - wv .= complex.( - @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), - @view(wv_blocks[1:num_modes, (num_modes+1):(2*num_modes)]) .- @view(wv_blocks[(num_modes+1):(2*num_modes), 1:num_modes]) - ) + proj_timing = @timed begin + # K_proj = F' * grad_green * F [2 * num_modes × 2 * num_modes] + K_proj = zeros(2 * num_modes, 2 * num_modes) + mul!(temp, grad_green, cos_mn_basis) + mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp) + mul!(@view(K_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp) + mul!(temp, grad_green, sin_mn_basis) + mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) + mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) + + # G_proj = F' * green_temp * F [2 * num_modes × 2 * num_modes] + G_proj = zeros(2 * num_modes, 2 * num_modes) + mul!(temp, green_temp, cos_mn_basis) + mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp) + mul!(@view(G_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp) + mul!(temp, green_temp, sin_mn_basis) + mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) + mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) + + # Gram matrix F'F (needed for interior kernel and wv normalization) + FtF = zeros(2 * num_modes, 2 * num_modes) + mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis) + mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis) + mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis) + mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis) + + # Solve projected systems via SVD-based pseudoinverse. 
The truncated Fourier + # basis with the SFL angle correction (ν) can make the projected operators + # rank-deficient — the interior BIE operator in particular has a physical + # null space (constant potential mode). The pseudoinverse finds the + # minimum-norm solution, correctly projecting out numerically null directions + # without affecting well-resolved modes. + Y_ext = pinv(K_proj) * G_proj + + # Interior kernel in projected space: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj + K_proj_int = 2 .* FtF .- K_proj + Y_int = pinv(K_proj_int) * G_proj + + # Reconstruct physical-space Green's functions for backward compatibility + # grre = F * Y = cos * Y[1:P, :] + sin * Y[P+1:2P, :] + mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :])) + mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) + mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :])) + mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) + + # Extract wv: the [arr air; ari aii] blocks equal (4π²/M) * F'F * Y_ext, + # then wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114] + wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext) + wv .= complex.( + @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), + @view(wv_blocks[1:num_modes, (num_modes+1):(2*num_modes)]) .- @view(wv_blocks[(num_modes+1):(2*num_modes), 1:num_modes]) + ) + end + println(" Galerkin Project and Solve TIME=$(round(proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))") else # ================================================================ # Collocation approach: solve full physical-space system [M × M] @@ -173,45 +194,57 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # ================================================================ # FT plasma→plasma Green's function (must precede kernel! 
calls that overwrite green_temp) - fourier_transform!(grre, green_temp, cos_mn_basis) - fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes) + colloc_ft_timing = @timed begin + fourier_transform!(grre, green_temp, cos_mn_basis) + fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes) + end + println(" Plasma Fourier Transform TIME=$(round(colloc_ft_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(colloc_ft_timing.bytes))") if !wall.nowall - # Plasma–Wall block - kernel!(grad_green, green_temp, plasma_surf, wall, kparams) - # Wall–Wall block - kernel!(grad_green, green_temp, wall, wall, kparams) - # Wall–Plasma block - kernel!(grad_green, green_temp, wall, plasma_surf, kparams) - # Fourier transform obs=wall, src=plasma block - fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf) - fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes) + wall_block_timing = @timed begin + # Plasma–Wall block + kernel!(grad_green, green_temp, plasma_surf, wall, kparams) + # Wall–Wall block + kernel!(grad_green, green_temp, wall, wall, kparams) + # Wall–Plasma block + kernel!(grad_green, green_temp, wall, plasma_surf, kparams) + # Fourier transform obs=wall, src=plasma block + fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf) + fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes) + end + println(" Wall Kernel and Fourier Transform TIME=$(round(wall_block_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(wall_block_timing.bytes))") end # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1) - grri .= grre # start from same as exterior - grad_green_interior = similar!(pool, grad_green) - grad_green_interior .= grad_green - - # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel - F_ext = lu!(grad_green) - 
ldiv!(F_ext, grre) - - # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal - grad_green_interior .*= -1 - for i in 1:num_points_total - grad_green_interior[i, i] += 2.0 + solve_timing = @timed begin + grri .= grre # start from same as exterior + grad_green_interior = similar!(pool, grad_green) + grad_green_interior .= grad_green + + # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel + F_ext = lu!(grad_green) + ldiv!(F_ext, grre) + + # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal + grad_green_interior .*= -1 + for i in 1:num_points_total + grad_green_interior[i, i] += 2.0 + end + F_int = lu!(grad_green_interior) + ldiv!(F_int, grri) + end + println(" Invert and Solve TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") + + invft_timing = @timed begin + # Inverse Fourier transform to extract wv [Chance Phys. Plasmas 2007 052506 eq. 115-118] + arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4) + fourier_inverse_transform!(arr, grre, cos_mn_basis) + fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes) + fourier_inverse_transform!(ari, grre, sin_mn_basis) + fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes) + wv .= complex.(arr .+ aii, air .- ari) end - F_int = lu!(grad_green_interior) - ldiv!(F_int, grri) - - # Inverse Fourier transform to extract wv [Chance Phys. Plasmas 2007 052506 eq. 
115-118] - arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4) - fourier_inverse_transform!(arr, grre, cos_mn_basis) - fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes) - fourier_inverse_transform!(ari, grre, sin_mn_basis) - fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes) - wv .= complex.(arr .+ aii, air .- ari) + println(" Compute Wv TIME=$(round(invft_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(invft_timing.bytes))") end inputs.force_wv_symmetry && hermitianpart!(wv) From 7733df93e7750227051da9fb83db7274a044491a Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Fri, 13 Mar 2026 11:52:27 -0400 Subject: [PATCH 03/23] VACUUM - WIP - fused kernel operations so full matrix is never stored in memory. Works for 1 thread, but currently some nondeterministic results in multi-threading --- src/Vacuum/DataTypes.jl | 10 +- src/Vacuum/ProjectedKernel.jl | 464 ++++++++++++++++++++++++++++++++++ src/Vacuum/Vacuum.jl | 121 ++++++--- 3 files changed, 561 insertions(+), 34 deletions(-) create mode 100644 src/Vacuum/ProjectedKernel.jl diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl index 01811b41..10c1f8bd 100644 --- a/src/Vacuum/DataTypes.jl +++ b/src/Vacuum/DataTypes.jl @@ -25,6 +25,9 @@ nzeta > 1 for 3D vacuum calculation. - `use_galerkin::Bool`: Use Galerkin projection to solve in truncated Fourier space [O(P³)] instead of full collocation [O(M³)]. Only applies to the no-wall case; wall cases always use collocation. Defaults to `false`. + - `fuse_projection::Bool`: When combined with `use_galerkin`, fuse the kernel assembly with + the Fourier projection so that the full M×M kernel matrices are never materialized. + Reduces memory from O(M²) to O(MP). Requires `use_galerkin = true`. Defaults to `false`. """ @kwdef struct VacuumInput x::Vector{Float64} = Float64[] @@ -41,6 +44,7 @@ nzeta > 1 for 3D vacuum calculation. 
nzeta::Int = 1 force_wv_symmetry::Bool = true use_galerkin::Bool = false + fuse_projection::Bool = false end """ @@ -81,7 +85,8 @@ function VacuumInput( npert::Int, nlow::Int; force_wv_symmetry::Bool=true, - use_galerkin::Bool=false + use_galerkin::Bool=false, + fuse_projection::Bool=false ) # Extract plasma surface geometry at this psi r, z, ν = extract_plasma_surface_at_psi(equil, ψ) @@ -98,7 +103,8 @@ function VacuumInput( mtheta=mtheta, nzeta=nzeta, force_wv_symmetry=force_wv_symmetry, - use_galerkin=true + use_galerkin=use_galerkin, + fuse_projection=fuse_projection ) end diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl new file mode 100644 index 00000000..bd17a9a9 --- /dev/null +++ b/src/Vacuum/ProjectedKernel.jl @@ -0,0 +1,464 @@ +# Fused kernel assembly + Fourier projection for Galerkin vacuum solve. +# +# Instead of materializing the full M×M kernel matrices and then projecting, +# these functions accumulate the P×P projected matrices row by row as the +# kernel values are computed, reducing memory from O(M²) to O(MP). +# +# K_c = Z^H K Z and G_c = Z^H G Z +# +# where Z = C + iS is the [M × P] complex Fourier basis, K is the double-layer +# kernel, and G is the single-layer kernel. For each observer point j, the +# kernel row is projected and accumulated via rank-1 updates: +# +# K_c += conj(Z[j,:]) ⊗ (K[j,:] · Z) +# +# FLOP cost is identical to the two-step approach O(M²P), but memory drops +# from O(M²) to O(MP + P²). + +""" + projected_kernel!(K_c, G_c, observer, source, params, cos_basis, sin_basis, Gram) + +Compute the Fourier-projected kernel matrices K_c = Z^H K Z and G_c = Z^H G Z +directly, without materializing the full M×M kernel matrices. + +Dispatches to the 2D or 3D implementation based on the geometry/params types. 
+
+# Arguments
+
+  - `K_c::Matrix{ComplexF64}`: Output P×P projected double-layer kernel [filled in-place]
+  - `G_c::Matrix{ComplexF64}`: Output P×P projected single-layer kernel [filled in-place]
+  - `observer`: Observer geometry struct
+  - `source`: Source geometry struct
+  - `params`: Kernel parameters (KernelParams2D or KernelParams3D)
+  - `cos_basis::Matrix{Float64}`: [M × P] cosine Fourier basis
+  - `sin_basis::Matrix{Float64}`: [M × P] sine Fourier basis
+  - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term)
+"""
+function projected_kernel! end
+
+# 2D method: selected when both geometries are 2D and `params` is a KernelParams2D.
+# Only `params.n` is forwarded to the 2D implementation.
+function projected_kernel!(
+    K_c::Matrix{ComplexF64},
+    G_c::Matrix{ComplexF64},
+    observer::Union{PlasmaGeometry,WallGeometry},
+    source::Union{PlasmaGeometry,WallGeometry},
+    params::KernelParams2D,
+    cos_basis::Matrix{Float64},
+    sin_basis::Matrix{Float64},
+    Gram::Matrix{ComplexF64}
+)
+    _projected_kernel_2D!(K_c, G_c, observer, source, params.n, cos_basis, sin_basis, Gram)
+end
+
+# 3D method: selected when both geometries are 3D and `params` is a KernelParams3D.
+# The three quadrature settings are unpacked from `params` and passed positionally.
+function projected_kernel!(
+    K_c::Matrix{ComplexF64},
+    G_c::Matrix{ComplexF64},
+    observer::Union{PlasmaGeometry3D,WallGeometry3D},
+    source::Union{PlasmaGeometry3D,WallGeometry3D},
+    params::KernelParams3D,
+    cos_basis::Matrix{Float64},
+    sin_basis::Matrix{Float64},
+    Gram::Matrix{ComplexF64}
+)
+    _projected_kernel_3D!(K_c, G_c, observer, source,
+        params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER,
+        cos_basis, sin_basis, Gram)
+end
+
+
+# ============================================================================
+# 2D fused projected kernel
+# ============================================================================
+
+"""
+    _projected_kernel_2D!(K_c, G_c, observer, source, n, cos_basis, sin_basis, Gram)
+
+Fused 2D kernel assembly + projection. Mirrors the loop structure of
+`compute_2D_kernel_matrices!` but accumulates rank-1 contributions into the
+P×P projected matrices instead of filling the M×M kernel matrices.
+
+Memory: O(MP) instead of O(M²).
+"""
+@with_pool pool function _projected_kernel_2D!(
+    K_c::Matrix{ComplexF64},
+    G_c::Matrix{ComplexF64},
+    observer::Union{PlasmaGeometry,WallGeometry},
+    source::Union{PlasmaGeometry,WallGeometry},
+    n::Int,
+    cos_basis::Matrix{Float64},
+    sin_basis::Matrix{Float64},
+    Gram::Matrix{ComplexF64}
+)
+    M, P = size(cos_basis)
+    mtheta = length(observer.x)
+    dtheta = 2π / mtheta
+    # Uniform grid θ_k = (k-1)·dtheta for k = 1..mtheta (the point 2π is excluded)
+    theta_grid = range(; start=0, length=mtheta, step=dtheta)
+
+    # The basis columns Ct[:, isrc] / Ct[:, j] are indexed by grid-point number under
+    # @inbounds below, so the basis must have exactly one row per grid point.
+    # Fail loudly instead of reading out of bounds or silently dropping basis rows.
+    M == mtheta || throw(DimensionMismatch(
+        "cos_basis has $M rows but the observer contour has $mtheta points"))
+
+    # Single-layer (G) terms are only accumulated for a plasma source;
+    # for any other source G_c is returned as all zeros.
+    populate_greenfunction = source isa PlasmaGeometry
+
+    # S₁ᵢ logarithmic correction factors [Chance Phys. Plasmas 1997 2161 eq. 78]
+    log_correction_0 = 16.0 * dtheta * (log(2 * dtheta) - 68.0 / 15.0) / 15.0
+    log_correction_1 = 128.0 * dtheta * (log(2 * dtheta) - 8.0 / 15.0) / 45.0
+    log_correction_2 = 4.0 * dtheta * (7.0 * log(2 * dtheta) - 11.0 / 15.0) / 45.0
+    # Ordered to match the stencil offsets -2:2 stored in sing_idx
+    log_correction_array = SVector(log_correction_2, log_correction_1, log_correction_0, log_correction_1, log_correction_2)
+
+    # Loop-invariant factor, forwarded to green() as a keyword argument
+    gamma_prefactor = 2 * sqrt(π) * gamma(0.5 - n)
+
+    # Periodic cubic splines of the source contour; used to evaluate the geometry
+    # at the off-grid Gauss nodes. Grid derivatives are tabulated once below.
+    spline_x = cubic_interp(theta_grid, source.x; bc=PeriodicBC(; endpoint=:exclusive, period=2π))
+    spline_z = cubic_interp(theta_grid, source.z; bc=PeriodicBC(; endpoint=:exclusive, period=2π))
+    d1_spline_x = deriv1(spline_x)
+    d1_spline_z = deriv1(spline_z)
+
+    stencils_left, stencils_right = GL8_LAGRANGE_STENCILS
+    # Wrapped indices of the 5 source points j-2..j+2 (refilled for every observer j)
+    sing_idx = zeros!(pool, Int, 5)
+
+    dx_dtheta_grid = acquire!(pool, eltype(source.x), mtheta)
+    dz_dtheta_grid = acquire!(pool, eltype(source.z), mtheta)
+    d1_spline_x(dx_dtheta_grid, theta_grid)
+    d1_spline_z(dz_dtheta_grid, theta_grid)
+
+    # Pre-transpose basis for contiguous column access: Ct[:, k] = C[k, :]
+    Ct = acquire!(pool, Float64, P, M)
+    St = acquire!(pool, Float64, P, M)
+    Ct .= cos_basis'
+    St .= sin_basis'
+
+    # Real/imaginary accumulators for P×P projected matrices
+    # NOTE(review): these and the proj_* vectors are heap-allocated with zeros()
+    # rather than taken from `pool`.
+    K_re = zeros(P, P)
+    K_im = zeros(P, P)
+    G_re = zeros(P, P)
+    G_im = zeros(P, P)
+
+    # Per-observer projection vectors (P-length)
+    proj_kc = zeros(P)
+    proj_ks = zeros(P)
+    proj_gc = zeros(P)
+    proj_gs = zeros(P)
+
+    for j in 1:mtheta
+        x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j]
+
+        fill!(proj_kc, 0.0)
+        fill!(proj_ks, 0.0)
+        fill!(proj_gc, 0.0)
+        fill!(proj_gs, 0.0)
+        # Collects −gradG_0·weight over the Simpson and Gauss regions; it is added
+        # to the coefficient of source point j (the diagonal entry) further down.
+        diag_accum = 0.0
+
+        # ── Simpson integration for nonsingular source points ──
+        # k = 1..mtheta-3 maps to source indices j+2, j+3, ..., j-2 (wrapped).
+        # Weights are dtheta/3 × (1 at both ends, 4 for even k, 2 for odd interior k).
+        @inbounds for k in 1:(mtheta-3)
+            isrc = mod1(j + 1 + k, mtheta)
+            G_n, gradG_n, gradG_0 = green(x_obs, z_obs,
+                source.x[isrc], source.z[isrc],
+                dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n;
+                gamma_prefactor)
+
+            wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2))
+
+            if populate_greenfunction
+                w_g = G_n * wsimpson
+                BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
+                BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+            end
+            w_k = gradG_n * wsimpson
+            BLAS.axpy!(w_k, @view(Ct[:, isrc]), proj_kc)
+            BLAS.axpy!(w_k, @view(St[:, isrc]), proj_ks)
+
+            diag_accum -= gradG_0 * wsimpson
+        end
+
+        # ── Gaussian quadrature for singular points ──
+        for (offset_idx, offset) in enumerate(-2:2)
+            sing_idx[offset_idx] = mod1(j + offset + mtheta, mtheta)
+        end
+
+        # Two 8-node panels centred at θ_obs ∓ dtheta; node offsets are GL8.x·dtheta
+        # (presumably GL8.x ∈ [-1, 1], so each panel spans 2·dtheta — confirm).
+        # Values at the Gauss nodes are spread onto the 5 grid points in sing_idx
+        # through the per-node stencil weights s[1:5].
+        for leftpanel in (true, false)
+            gauss_mid = theta_obs + (leftpanel ? -dtheta : dtheta)
+            @inbounds for ig in 1:8
+                theta_gauss = gauss_mid + GL8.x[ig] * dtheta
+                # Wrapped angle is used for spline evaluation only
+                theta_gauss0 = mod(theta_gauss, 2π)
+                x_gauss = spline_x(theta_gauss0)
+                dx_dtheta_gauss = d1_spline_x(theta_gauss0)
+                z_gauss = spline_z(theta_gauss0)
+                dz_dtheta_gauss = d1_spline_z(theta_gauss0)
+                G_n, gradG_n, gradG_0 = green(x_obs, z_obs,
+                    x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n;
+                    gamma_prefactor)
+
+                s = leftpanel ? stencils_left[ig] : stencils_right[ig]
+                wgauss = GL8.w[ig] * dtheta
+
+                if populate_greenfunction
+                    if observer isa PlasmaGeometry
+                        # Log term uses the unwrapped angle difference; the analytic
+                        # log correction is applied separately after the panels.
+                        G_n += log((theta_obs - theta_gauss)^2) / x_obs
+                    end
+                    @inbounds for stencil_idx in 1:5
+                        w_g = G_n * s[stencil_idx] * wgauss
+                        isrc = sing_idx[stencil_idx]
+                        BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
+                        BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+                    end
+                end
+
+                @inbounds for stencil_idx in 1:5
+                    w_k = gradG_n * s[stencil_idx] * wgauss
+                    isrc = sing_idx[stencil_idx]
+                    BLAS.axpy!(w_k, @view(Ct[:, isrc]), proj_kc)
+                    BLAS.axpy!(w_k, @view(St[:, isrc]), proj_ks)
+                end
+
+                diag_accum -= gradG_0 * wgauss
+            end
+        end
+
+        # Analytic singular integral correction [Chance 1997 eq. 75]
+        if populate_greenfunction && observer isa PlasmaGeometry
+            @inbounds for stencil_idx in 1:5
+                w_g = -log_correction_array[stencil_idx] / x_obs
+                isrc = sing_idx[stencil_idx]
+                BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
+                BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+            end
+        end
+
+        # Fold diagonal accumulation into projection
+        BLAS.axpy!(diag_accum, @view(Ct[:, j]), proj_kc)
+        BLAS.axpy!(diag_accum, @view(St[:, j]), proj_ks)
+
+        # ── Rank-1 accumulate into P×P projection matrices ──
+        # With Z = C + iS: conj(Z[j,:]) ⊗ (proj_c + i·proj_s) expands to the four
+        # real rank-1 updates below.
+        # K_c_re += C[j,:] ⊗ proj_kc + S[j,:] ⊗ proj_ks
+        BLAS.ger!(1.0, @view(Ct[:, j]), proj_kc, K_re)
+        BLAS.ger!(1.0, @view(St[:, j]), proj_ks, K_re)
+        # K_c_im += C[j,:] ⊗ proj_ks − S[j,:] ⊗ proj_kc
+        BLAS.ger!(1.0, @view(Ct[:, j]), proj_ks, K_im)
+        BLAS.ger!(-1.0, @view(St[:, j]), proj_kc, K_im)
+
+        if populate_greenfunction
+            BLAS.ger!(1.0, @view(Ct[:, j]), proj_gc, G_re)
+            BLAS.ger!(1.0, @view(St[:, j]), proj_gs, G_re)
+            BLAS.ger!(1.0, @view(Ct[:, j]), proj_gs, G_im)
+            BLAS.ger!(-1.0, @view(St[:, j]), proj_gc, G_im)
+        end
+    end
+
+    # ── Post-processing (mirrors compute_2D_kernel_matrices!) ──
+
+    # Normals point out of vacuum for wall but inward for plasma → flip sign for plasma source
+    if source isa PlasmaGeometry
+        K_re .*= -1
+        K_im .*= -1
+    end
+
+    # Diagonal residue: K += residue·I → K_c += residue·Gram
+    # [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
+    # `Gram` must already hold Z^H Z when this function is called.
+    residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
+    if residue != 0.0
+        K_re .+= residue .* real.(Gram)
+        K_im .+= residue .* imag.(Gram)
+    end
+
+    # 2π𝒢 → 𝒢
+    if populate_greenfunction
+        G_re ./= 2π
+        G_im ./= 2π
+    end
+
+    # Outputs are overwritten, not accumulated into
+    K_c .= complex.(K_re, K_im)
+    G_c .= complex.(G_re, G_im)
+end
+
+
+# ============================================================================
+# 3D fused projected kernel
+# ============================================================================
+
+"""
+    _projected_kernel_3D!(K_c, G_c, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, cos_basis, sin_basis, Gram)
+
+Fused 3D kernel assembly + projection. Mirrors the loop structure of
+`compute_3D_kernel_matrices!` (including multi-threading and BIEST singular correction)
+but writes projected P-vectors to per-observer rows of [M × P] buffers instead of
+filling the M×M kernel matrices. The P×P assembly is done after the parallel loop
+via sequential GEMM calls.
+
+Each observer writes to its own row of the shared buffers, so there are no
+cross-thread accumulation races — the same write pattern as the original
+`compute_3D_kernel_matrices!`.
+
+Memory: O(4MP + P²) instead of O(M²).
+"""
+function _projected_kernel_3D!(
+    K_c::Matrix{ComplexF64},
+    G_c::Matrix{ComplexF64},
+    observer::Union{PlasmaGeometry3D,WallGeometry3D},
+    source::Union{PlasmaGeometry3D,WallGeometry3D},
+    PATCH_RAD::Int,
+    RAD_DIM::Int,
+    INTERP_ORDER::Int,
+    cos_basis::Matrix{Float64},
+    sin_basis::Matrix{Float64},
+    Gram::Matrix{ComplexF64}
+)
+    M, P = size(cos_basis)
+    num_points = observer.mtheta * observer.nzeta
+    # Uniform quadrature weight per grid point: (2π/mtheta)·(2π/nzeta)
+    dθdζ = 4π^2 / num_points
+
+    # Rows of the [M × P] buffers and columns of Ct/St are indexed by grid-point
+    # number under @inbounds below, so the basis must have one row per grid point.
+    # NOTE(review): the far-field loop also indexes `source` with 1:num_points, so
+    # source and observer grids are presumably the same size — confirm.
+    M == num_points || throw(DimensionMismatch(
+        "cos_basis has $M rows but the observer surface has $num_points points"))
+
+    # Single-layer (G) terms are only accumulated for a plasma source
+    populate_greenfunction = source isa PlasmaGeometry3D
+
+    # Clamp the patch radius to what the source grid can hold. A fresh
+    # single-assignment local is used instead of reassigning the PATCH_RAD argument:
+    # a reassigned variable captured by the @threads closure below would be boxed,
+    # making the inner patch loop type-unstable.
+    max_patch_rad = (min(source.mtheta, source.nzeta) - 1) ÷ 2
+    if PATCH_RAD > max_patch_rad
+        @warn "PATCH_RAD clamped in projected kernel" max_PATCH_RAD=max_patch_rad
+    end
+    patch_rad = min(PATCH_RAD, max_patch_rad)
+    quad_data = get_singular_quadrature(patch_rad, RAD_DIM, INTERP_ORDER)
+    (; PATCH_DIM, ANG_DIM, Ppou, Gpou, P2G) = quad_data
+
+    # Pre-transpose basis for contiguous column access in the inner loop
+    Ct = Matrix(cos_basis')  # [P × M]
+    St = Matrix(sin_basis')  # [P × M]
+
+    # [M × P] buffers for projected kernel rows.
+    # Row idx_obs = Σ_k K[idx_obs, k] · basis[k, :] — each observer writes to
+    # its own row, so no cross-thread races.
+    KZ_c = zeros(M, P)
+    KZ_s = zeros(M, P)
+    GZ_c = zeros(M, P)
+    GZ_s = zeros(M, P)
+
+    # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors)
+    max_tid = Threads.maxthreadid()
+    workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_tid]
+    proj_kc_all = [zeros(P) for _ in 1:max_tid]
+    proj_ks_all = [zeros(P) for _ in 1:max_tid]
+    proj_gc_all = [zeros(P) for _ in 1:max_tid]
+    proj_gs_all = [zeros(P) for _ in 1:max_tid]
+
+    # :static scheduling pins each chunk of iterations to one thread, so
+    # Threads.threadid() is stable within an iteration and indexing the
+    # per-thread buffers by `tid` is safe.
+    Threads.@threads :static for idx_obs in 1:num_points
+        tid = Threads.threadid()
+        ws = workspaces[tid]
+        (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar,
+            n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws
+
+        proj_kc = proj_kc_all[tid]
+        proj_ks = proj_ks_all[tid]
+        proj_gc = proj_gc_all[tid]
+        proj_gs = proj_gs_all[tid]
+
+        fill!(proj_kc, 0.0)
+        fill!(proj_ks, 0.0)
+        fill!(proj_gc, 0.0)
+        fill!(proj_gs, 0.0)
+
+        # Linear index → (poloidal i_obs, toroidal j_obs); poloidal index runs fastest
+        i_obs = mod1(idx_obs, observer.mtheta)
+        j_obs = (idx_obs - 1) ÷ observer.mtheta + 1
+        r_obs = @view observer.r[idx_obs, :]
+
+        # ── FAR FIELD: Trapezoidal rule ──
+        # Runs over every source point, including those inside the singular patch;
+        # the near-field block below adds further contributions for the patch points.
+        @inbounds for idx_src in 1:num_points
+            r_src = @view source.r[idx_src, :]
+            n_src = @view source.normal[idx_src, :]
+            w_double = laplace_double_layer(r_obs, r_src, n_src) * dθdζ
+            @inbounds @simd for m in 1:P
+                proj_kc[m] += w_double * Ct[m, idx_src]
+                proj_ks[m] += w_double * St[m, idx_src]
+            end
+
+            if populate_greenfunction
+                w_single = laplace_single_layer(r_obs, r_src) * dθdζ
+                @inbounds @simd for m in 1:P
+                    proj_gc[m] += w_single * Ct[m, idx_src]
+                    proj_gs[m] += w_single * St[m, idx_src]
+                end
+            end
+        end
+
+        # ── NEAR FIELD: Polar quadrature with BIEST singular correction ──
+        # A PATCH_DIM×PATCH_DIM patch of source data around (i_obs, j_obs) is
+        # interpolated to a polar grid, the kernels are evaluated there (weighted by
+        # Ppou), and the result is mapped back to per-grid-point weights via P2G.
+        extract_patch!(r_patch, source.r, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
+        extract_patch!(dr_dθ_patch, source.dr_dθ, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
+        extract_patch!(dr_dζ_patch, source.dr_dζ, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
+
+        interpolate_to_polar!(r_polar, r_patch, P2G)
+        interpolate_to_polar!(dr_dθ_polar, dr_dθ_patch, P2G)
+        interpolate_to_polar!(dr_dζ_polar, dr_dζ_patch, P2G)
+
+        compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient)
+
+        @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM
+            r_src = @view r_polar[ir, ia, :]
+            n_src = @view n_polar[ir, ia, :]
+            M_polar_single[ir, ia] = laplace_single_layer(r_obs, r_src) * Ppou[ir, ia] * dθdζ
+            M_polar_double[ir, ia] = laplace_double_layer(r_obs, r_src, n_src) * Ppou[ir, ia] * dθdζ
+        end
+
+        mul!(M_grid_single_flat, P2G, vec(M_polar_single))
+        mul!(M_grid_double_flat, P2G, vec(M_polar_double))
+        M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
+        M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM)
+
+        # Patch weight = polar-quadrature contribution + Gpou-weighted direct kernel value
+        @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
+            idx_pol = periodic_wrap(i_obs - patch_rad + ii - 1, source.mtheta)
+            idx_tor = periodic_wrap(j_obs - patch_rad + jj - 1, source.nzeta)
+            idx_src = idx_pol + source.mtheta * (idx_tor - 1)
+
+            r_src = @view source.r[idx_src, :]
+            n_src = @view source.normal[idx_src, :]
+            far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[ii, jj] * dθdζ
+            w_double = M_grid_double[ii, jj] + far_double
+            @simd for m in 1:P
+                proj_kc[m] += w_double * Ct[m, idx_src]
+                proj_ks[m] += w_double * St[m, idx_src]
+            end
+
+            if populate_greenfunction
+                far_single = laplace_single_layer(r_obs, r_src) * Gpou[ii, jj] * dθdζ
+                w_single = M_grid_single[ii, jj] + far_single
+                @simd for m in 1:P
+                    proj_gc[m] += w_single * Ct[m, idx_src]
+                    proj_gs[m] += w_single * St[m, idx_src]
+                end
+            end
+        end
+
+        # ── Write projected row to buffer (each idx_obs owns its row) ──
+        @inbounds for m in 1:P
+            KZ_c[idx_obs, m] = proj_kc[m]
+            KZ_s[idx_obs, m] = proj_ks[m]
+        end
+        if populate_greenfunction
+            @inbounds for m in 1:P
+                GZ_c[idx_obs, m] = proj_gc[m]
+                GZ_s[idx_obs, m] = proj_gs[m]
+            end
+        end
+    end
+
+    # ── Assemble P×P projected matrices via GEMM (sequential, after barrier) ──
+    # K_c = Z^H K Z = (C'·KZ_c + S'·KZ_s) + i(C'·KZ_s − S'·KZ_c)
+    K_re = zeros(P, P)
+    K_im = zeros(P, P)
+    mul!(K_re, cos_basis', KZ_c)
+    mul!(K_re, sin_basis', KZ_s, 1.0, 1.0)
+    mul!(K_im, cos_basis', KZ_s)
+    mul!(K_im, sin_basis', KZ_c, -1.0, 1.0)
+
+    # G_re/G_im stay zero when the source is not the plasma surface
+    G_re = zeros(P, P)
+    G_im = zeros(P, P)
+    if populate_greenfunction
+        mul!(G_re, cos_basis', GZ_c)
+        mul!(G_re, sin_basis', GZ_s, 1.0, 1.0)
+        mul!(G_im, cos_basis', GZ_s)
+        mul!(G_im, sin_basis', GZ_c, -1.0, 1.0)
+    end
+
+    # ── Post-processing (mirrors compute_3D_kernel_matrices!) ──
+    # Unlike the 2D routine, both K and G are scaled by 1/2π and no sign flip is applied.
+    K_re ./= 2π
+    K_im ./= 2π
+    G_re ./= 2π
+    G_im ./= 2π
+
+    # Diagonal: K += I → K_c += Gram [for same-type source/observer]
+    # `Gram` must already hold Z^H Z when this function is called.
+    if typeof(source) == typeof(observer)
+        K_re .+= real.(Gram)
+        K_im .+= imag.(Gram)
+    end
+
+    # Outputs are overwritten, not accumulated into
+    K_c .= complex.(K_re, K_im)
+    G_c .= complex.(G_re, G_im)
+end
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 8796048d..1970786b 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -16,6 +16,7 @@ include("DataTypes.jl")
 include("PnQuadCache.jl")
 include("Kernel2D.jl")
 include("Kernel3D.jl")
+include("ProjectedKernel.jl")
 include("Field.jl")
 
 export VacuumInput, WallShapeSettings
@@ -105,36 +106,88 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     # Active rows for computation (plasma only if no wall, plasma+wall if wall present)
     num_points_total = wall.nowall ?
num_points_surf : 2 * num_points_surf - # Local work arrays - grad_green = zeros!(pool, num_points_total, num_points_total) - green_temp = zeros!(pool, num_points_surf, num_points_surf) - # Views into output Green's function matrices for the active rows/columns grre = @view grre_in[1:num_points_total, :] grri = @view grri_in[1:num_points_total, :] - # Plasma–Plasma block - pp_kernel_timing = @timed begin - kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) - end - println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") - - if wall.nowall && inputs.use_galerkin + if wall.nowall && inputs.use_galerkin && inputs.fuse_projection # ================================================================ - # Galerkin projection: solve in Fourier space [2P × 2P] instead of - # the full collocation system [num_points_surf × num_points_surf]. + # Fused Galerkin: kernel assembly + Fourier projection in one pass. + # The full M×M kernel matrices are never materialized — instead the + # P×P projected matrices K_c and G_c are accumulated row by row as + # kernel values are computed. 
# - # Instead of: wv = F_inv * (K \ (G * F)) O(M³) - # We compute: wv ~ (F'KF) \ (F'GF) O(M²P + P³) + # Memory: O(MP + P²) instead of O(M²) + # FLOPs: O(M²P + P³) — same as two-step Galerkin + # ================================================================ + P = num_modes + M = num_points_surf + + fused_timing = @timed begin + # Gram matrix Gram = Z^H Z [P × P complex] + Gram = complex.(cos_mn_basis' * cos_mn_basis .+ sin_mn_basis' * sin_mn_basis, + cos_mn_basis' * sin_mn_basis .- sin_mn_basis' * cos_mn_basis) + + # Fused projected kernel: K_c = Z^H K Z, G_c = Z^H G Z [P × P complex] + K_c = zeros(ComplexF64, P, P) + G_c = zeros(ComplexF64, P, P) + projected_kernel!(K_c, G_c, plasma_surf, plasma_surf, kparams, + cos_mn_basis, sin_mn_basis, Gram) + end + println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") + + solve_timing = @timed begin + # Solve projected BIE via SVD-based pseudoinverse [pinv handles rank deficiency] + c_ext = pinv(K_c) * G_c + + # Interior kernel: K_int = -K + 2I → K_c_int = 2·Gram - K_c + K_c_int = 2 .* Gram .- K_c + c_int = pinv(K_c_int) * G_c + + # wv = (4π²/M) · Gram · c_ext [P × P complex, Chance 2007 eq. 114] + wv .= (4π^2 / M) .* (Gram * c_ext) + + # ── Backward-compatible reconstruction of real grri/grre ───────── + # The downstream code (ForceFreeStates, PerturbedEquilibrium) still expects + # grre/grri as [M × 2P] real matrices. Reconstruct from the complex P×P + # solution coefficients. This section can be removed once the downstream + # modules are updated to work directly in mode space. 
+ c_ext_r, c_ext_i = real.(c_ext), imag.(c_ext) + c_int_r, c_int_i = real.(c_int), imag.(c_int) + + mul!(@view(grre[1:M, 1:P]), cos_mn_basis, c_ext_r) + mul!(@view(grre[1:M, 1:P]), sin_mn_basis, c_ext_i, -1.0, 1.0) + mul!(@view(grre[1:M, (P+1):(2*P)]), cos_mn_basis, c_ext_i) + mul!(@view(grre[1:M, (P+1):(2*P)]), sin_mn_basis, c_ext_r, 1.0, 1.0) + + mul!(@view(grri[1:M, 1:P]), cos_mn_basis, c_int_r) + mul!(@view(grri[1:M, 1:P]), sin_mn_basis, c_int_i, -1.0, 1.0) + mul!(@view(grri[1:M, (P+1):(2*P)]), cos_mn_basis, c_int_i) + mul!(@view(grri[1:M, (P+1):(2*P)]), sin_mn_basis, c_int_r, 1.0, 1.0) + end + println(" Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") + + elseif wall.nowall && inputs.use_galerkin + # ================================================================ + # Two-step Galerkin: full M×M kernel → project → solve in P×P. # - # where M = num_points_surf and P = num_modes and - # F = [cos_basis | sin_basis] is the [M × 2P] Fourier basis and - # K = grad_green is the [M × M] double-layer kernel matrix. 
+ # Memory: O(M²) for kernel storage + # FLOPs: O(M²P + P³) # ================================================================ + + # Full-size kernel matrices + grad_green = zeros!(pool, num_points_total, num_points_total) + green_temp = zeros!(pool, num_points_surf, num_points_surf) + + pp_kernel_timing = @timed begin + kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) + end + println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") + temp = zeros!(pool, num_points_surf, num_modes) proj_timing = @timed begin - # K_proj = F' * grad_green * F [2 * num_modes × 2 * num_modes] + # K_proj = F' * grad_green * F [2P × 2P] K_proj = zeros(2 * num_modes, 2 * num_modes) mul!(temp, grad_green, cos_mn_basis) mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp) @@ -143,7 +196,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) - # G_proj = F' * green_temp * F [2 * num_modes × 2 * num_modes] + # G_proj = F' * green_temp * F [2P × 2P] G_proj = zeros(2 * num_modes, 2 * num_modes) mul!(temp, green_temp, cos_mn_basis) mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp) @@ -152,34 +205,28 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) - # Gram matrix F'F (needed for interior kernel and wv normalization) + # Gram matrix F'F FtF = zeros(2 * num_modes, 2 * num_modes) mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis) mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis) 
mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis) mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis) - # Solve projected systems via SVD-based pseudoinverse. The truncated Fourier - # basis with the SFL angle correction (ν) can make the projected operators - # rank-deficient — the interior BIE operator in particular has a physical - # null space (constant potential mode). The pseudoinverse finds the - # minimum-norm solution, correctly projecting out numerically null directions - # without affecting well-resolved modes. + # Solve projected systems via SVD-based pseudoinverse [pinv handles rank deficiency] Y_ext = pinv(K_proj) * G_proj - # Interior kernel in projected space: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj + # Interior kernel: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj K_proj_int = 2 .* FtF .- K_proj Y_int = pinv(K_proj_int) * G_proj - # Reconstruct physical-space Green's functions for backward compatibility - # grre = F * Y = cos * Y[1:P, :] + sin * Y[P+1:2P, :] + # ── Backward-compatible reconstruction of real grri/grre ───────── + # This section can be removed once downstream modules work in mode space. mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :])) mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :])) mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) - # Extract wv: the [arr air; ari aii] blocks equal (4π²/M) * F'F * Y_ext, - # then wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114] + # wv = complex(arr + aii, air - ari) [Chance 2007 eq. 
114] wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext) wv .= complex.( @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), @@ -187,12 +234,22 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC ) end println(" Galerkin Project and Solve TIME=$(round(proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))") + else # ================================================================ # Collocation approach: solve full physical-space system [M × M] # Handles both no-wall and wall cases. # ================================================================ + # Full-size kernel matrices + grad_green = zeros!(pool, num_points_total, num_points_total) + green_temp = zeros!(pool, num_points_surf, num_points_surf) + + pp_kernel_timing = @timed begin + kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) + end + println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") + # FT plasma→plasma Green's function (must precede kernel! calls that overwrite green_temp) colloc_ft_timing = @timed begin fourier_transform!(grre, green_temp, cos_mn_basis) From 2e1e728b7ee1a46be02f722875d2b509c8dba99a Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Sat, 14 Mar 2026 16:48:59 -0400 Subject: [PATCH 04/23] VACUUM - WIP - condensing tri operations into a single complex operation, some pooling optimizations --- src/Vacuum/ProjectedKernel.jl | 33 +++--- src/Vacuum/Vacuum.jl | 189 ++++++++++++++++------------------ 2 files changed, 104 insertions(+), 118 deletions(-) diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl index bd17a9a9..4bca9062 100644 --- a/src/Vacuum/ProjectedKernel.jl +++ b/src/Vacuum/ProjectedKernel.jl @@ -15,6 +15,9 @@ # FLOP cost is identical to the two-step approach O(M²P), but memory drops # from O(M²) to O(MP + P²). 
+# ============================================================================ +# 2D fused projected kernel +# ============================================================================ """ projected_kernel!(K_c, G_c, observer, source, params, cos_basis, sin_basis, Gram) @@ -37,38 +40,32 @@ Dispatches to the 2D or 3D implementation based on the geometry/params types. function projected_kernel! end function projected_kernel!( - K_c::Matrix{ComplexF64}, - G_c::Matrix{ComplexF64}, + K_c::AbstractMatrix{ComplexF64}, + G_c::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, params::KernelParams2D, cos_basis::Matrix{Float64}, sin_basis::Matrix{Float64}, - Gram::Matrix{ComplexF64} + Gram::AbstractMatrix{ComplexF64} ) _projected_kernel_2D!(K_c, G_c, observer, source, params.n, cos_basis, sin_basis, Gram) end function projected_kernel!( - K_c::Matrix{ComplexF64}, - G_c::Matrix{ComplexF64}, + K_c::AbstractMatrix{ComplexF64}, + G_c::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry3D,WallGeometry3D}, source::Union{PlasmaGeometry3D,WallGeometry3D}, params::KernelParams3D, cos_basis::Matrix{Float64}, sin_basis::Matrix{Float64}, - Gram::Matrix{ComplexF64} + Gram::AbstractMatrix{ComplexF64} ) _projected_kernel_3D!(K_c, G_c, observer, source, params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER, cos_basis, sin_basis, Gram) end - - -# ============================================================================ -# 2D fused projected kernel -# ============================================================================ - """ _projected_kernel_2D!(K_c, G_c, observer, source, n, cos_basis, sin_basis, Gram) @@ -79,14 +76,14 @@ P×P projected matrices instead of filling the M×M kernel matrices. Memory: O(MP) instead of O(M²). 
""" @with_pool pool function _projected_kernel_2D!( - K_c::Matrix{ComplexF64}, - G_c::Matrix{ComplexF64}, + K_c::AbstractMatrix{ComplexF64}, + G_c::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, n::Int, cos_basis::Matrix{Float64}, sin_basis::Matrix{Float64}, - Gram::Matrix{ComplexF64} + Gram::AbstractMatrix{ComplexF64} ) M, P = size(cos_basis) mtheta = length(observer.x) @@ -286,8 +283,8 @@ cross-thread accumulation races — the same write pattern as the original Memory: O(4MP + P²) instead of O(M²). """ function _projected_kernel_3D!( - K_c::Matrix{ComplexF64}, - G_c::Matrix{ComplexF64}, + K_c::AbstractMatrix{ComplexF64}, + G_c::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry3D,WallGeometry3D}, source::Union{PlasmaGeometry3D,WallGeometry3D}, PATCH_RAD::Int, @@ -295,7 +292,7 @@ function _projected_kernel_3D!( INTERP_ORDER::Int, cos_basis::Matrix{Float64}, sin_basis::Matrix{Float64}, - Gram::Matrix{ComplexF64} + Gram::AbstractMatrix{ComplexF64} ) M, P = size(cos_basis) num_points = observer.mtheta * observer.nzeta diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 1970786b..64decca8 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -72,28 +72,15 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC n_override::Union{Nothing,Int}=nothing ) + (; mtheta, mpert, mlow, nzeta, npert, nlow, use_galerkin, fuse_projection) = inputs + # Initialize surface geometries - geom_timing = @timed begin - plasma_surf = inputs.nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs) - wall = inputs.nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings) - end - println(" Compute geometry TIME=$(round(geom_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(geom_timing.bytes))") + plasma_surf = nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs) + wall = nzeta > 1 ? 
WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings) # Compute Fourier basis coefficients - basis_timing = @timed begin - ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing - cos_mn_basis, sin_mn_basis = compute_fourier_coefficients( - inputs.mtheta, - inputs.mpert, - inputs.mlow, - inputs.nzeta, - inputs.npert, - inputs.nlow; - n_2D=n_override, - ν=ν - ) - end - println(" Compute Fourier basis TIME=$(round(basis_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(basis_timing.bytes))") + ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing + cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν) num_points_surf, num_modes = size(cos_mn_basis) # Create kernel parameters structs used to dispatch to the correct kernel @@ -101,7 +88,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC PATCH_RAD = 11 RAD_DIM = 20 INTERP_ORDER = 5 - kparams = inputs.nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override) + kparams = nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override) # Active rows for computation (plasma only if no wall, plasma+wall if wall present) num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf @@ -110,12 +97,12 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC grre = @view grre_in[1:num_points_total, :] grri = @view grri_in[1:num_points_total, :] - if wall.nowall && inputs.use_galerkin && inputs.fuse_projection + if wall.nowall && use_galerkin && fuse_projection # ================================================================ # Fused Galerkin: kernel assembly + Fourier projection in one pass. # The full M×M kernel matrices are never materialized — instead the - # P×P projected matrices K_c and G_c are accumulated row by row as - # kernel values are computed. 
+        # P×P projected matrices grad_green_fourier and green_fourier are accumulated
+        # row by row as kernel values are computed.
         #
         # Memory: O(MP + P²) instead of O(M²)
         # FLOPs: O(M²P + P³) — same as two-step Galerkin
@@ -123,115 +110,117 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         P = num_modes
         M = num_points_surf
 
-        fused_timing = @timed begin
-            # Gram matrix Gram = Z^H Z [P × P complex]
-            Gram = complex.(cos_mn_basis' * cos_mn_basis .+ sin_mn_basis' * sin_mn_basis,
-                cos_mn_basis' * sin_mn_basis .- sin_mn_basis' * cos_mn_basis)
+        # Temporary matrices
+        exp_mn_basis = zeros!(pool, ComplexF64, M, P)
+        exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis)
+        Gram = zeros!(pool, ComplexF64, P, P)
+        # Gram matrix Gram = Z^H Z [P × P complex]
+        # Must be filled BEFORE projected_kernel! is called: the kernel routine adds
+        # residue·Gram as the diagonal term, so a still-zero Gram would drop it.
+        mul!(Gram, exp_mn_basis', exp_mn_basis)
+
+        # Projected kernel matrices
+        grad_green_fourier = zeros!(pool, ComplexF64, P, P)
+        green_fourier = zeros!(pool, ComplexF64, P, P)
+        grad_green_fourier_int = similar!(pool, grad_green_fourier)
+        green_fourier_int = similar!(pool, green_fourier)
 
-            # Fused projected kernel: K_c = Z^H K Z, G_c = Z^H G Z [P × P complex]
-            K_c = zeros(ComplexF64, P, P)
-            G_c = zeros(ComplexF64, P, P)
-            projected_kernel!(K_c, G_c, plasma_surf, plasma_surf, kparams,
+        fused_timing = @timed begin
+            # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z [P × P complex]
+            projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams,
                 cos_mn_basis, sin_mn_basis, Gram)
         end
         println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
 
         solve_timing = @timed begin
-            # Solve projected BIE via SVD-based pseudoinverse [pinv handles rank deficiency]
-            c_ext = pinv(K_c) * G_c
+            # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
+            # (computed before the in-place LU below overwrites grad_green_fourier)
+            grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier
+            green_fourier_int .= green_fourier
 
-            # Interior
kernel: K_int = -K + 2I → K_c_int = 2·Gram - K_c - K_c_int = 2 .* Gram .- K_c - c_int = pinv(K_c_int) * G_c + # Solve projected BIEs for exterior and interior kernels + F = lu!(grad_green_fourier) + ldiv!(F, green_fourier) + F = lu!(grad_green_fourier_int) + ldiv!(F, green_fourier_int) # wv = (4π²/M) · Gram · c_ext [P × P complex, Chance 2007 eq. 114] - wv .= (4π^2 / M) .* (Gram * c_ext) + wv .= (4π^2 / M) .* (Gram * green_fourier) # ── Backward-compatible reconstruction of real grri/grre ───────── - # The downstream code (ForceFreeStates, PerturbedEquilibrium) still expects - # grre/grri as [M × 2P] real matrices. Reconstruct from the complex P×P - # solution coefficients. This section can be removed once the downstream - # modules are updated to work directly in mode space. - c_ext_r, c_ext_i = real.(c_ext), imag.(c_ext) - c_int_r, c_int_i = real.(c_int), imag.(c_int) - - mul!(@view(grre[1:M, 1:P]), cos_mn_basis, c_ext_r) - mul!(@view(grre[1:M, 1:P]), sin_mn_basis, c_ext_i, -1.0, 1.0) - mul!(@view(grre[1:M, (P+1):(2*P)]), cos_mn_basis, c_ext_i) - mul!(@view(grre[1:M, (P+1):(2*P)]), sin_mn_basis, c_ext_r, 1.0, 1.0) - - mul!(@view(grri[1:M, 1:P]), cos_mn_basis, c_int_r) - mul!(@view(grri[1:M, 1:P]), sin_mn_basis, c_int_i, -1.0, 1.0) - mul!(@view(grri[1:M, (P+1):(2*P)]), cos_mn_basis, c_int_i) - mul!(@view(grri[1:M, (P+1):(2*P)]), sin_mn_basis, c_int_r, 1.0, 1.0) + # Reconstruct M×2P real from P×P complex: grre = real(Z·c_ext), imag(Z·c_ext). + # This section can be removed once downstream modules work in mode space. 
+ temp = zeros!(pool, ComplexF64, M, P) + mul!(temp, exp_mn_basis, green_fourier) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, green_fourier_int) + @view(grri[1:M, 1:P]) .= real.(temp) + @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) end println(" Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") - elseif wall.nowall && inputs.use_galerkin + elseif wall.nowall && use_galerkin # ================================================================ # Two-step Galerkin: full M×M kernel → project → solve in P×P. + # Uses complex basis Z = C + iS so projected matrices are P×P complex. # # Memory: O(M²) for kernel storage # FLOPs: O(M²P + P³) # ================================================================ + P = num_modes + M = num_points_surf + # Full-size kernel matrices grad_green = zeros!(pool, num_points_total, num_points_total) green_temp = zeros!(pool, num_points_surf, num_points_surf) + # Projected kernel matrices + grad_green_fourier = zeros!(pool, ComplexF64, P, P) + green_fourier = zeros!(pool, ComplexF64, P, P) + Gram = zeros!(pool, ComplexF64, P, P) + green_fourier_int = similar!(pool, green_fourier) + grad_green_fourier_int = similar!(pool, grad_green_fourier) + + # Temporary matrices + exp_mn_basis = zeros!(pool, ComplexF64, M, P) + exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis) + temp = zeros!(pool, ComplexF64, M, P) + pp_kernel_timing = @timed begin kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) end println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") - temp = zeros!(pool, num_points_surf, num_modes) - proj_timing = @timed begin - # K_proj = F' * grad_green * F [2P × 2P] - K_proj = zeros(2 * num_modes, 2 * num_modes) - mul!(temp, grad_green, cos_mn_basis) - mul!(@view(K_proj[1:num_modes, 1:num_modes]), 
cos_mn_basis', temp) - mul!(@view(K_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp) - mul!(temp, grad_green, sin_mn_basis) - mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) - mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) - - # G_proj = F' * green_temp * F [2P × 2P] - G_proj = zeros(2 * num_modes, 2 * num_modes) - mul!(temp, green_temp, cos_mn_basis) - mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp) - mul!(@view(G_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp) - mul!(temp, green_temp, sin_mn_basis) - mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp) - mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp) - - # Gram matrix F'F - FtF = zeros(2 * num_modes, 2 * num_modes) - mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis) - mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis) - mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis) - mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis) - - # Solve projected systems via SVD-based pseudoinverse [pinv handles rank deficiency] - Y_ext = pinv(K_proj) * G_proj - - # Interior kernel: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj - K_proj_int = 2 .* FtF .- K_proj - Y_int = pinv(K_proj_int) * G_proj + # Project matrices to mode space + # grad_green_fourier = Z^H * grad_green * Z + mul!(temp, grad_green, exp_mn_basis) + mul!(grad_green_fourier, exp_mn_basis', temp) + # green_fourier = Z^H * green_temp * Z + mul!(temp, green_temp, exp_mn_basis) + mul!(green_fourier, exp_mn_basis', temp) + + # Interior kernel: grad_green_fourier_int = 2·Gram - grad_green_fourier + # Gram = Z^H Z [P × P complex] + mul!(Gram, exp_mn_basis', exp_mn_basis) + grad_green_fourier_int .= 2 
.* Gram .- grad_green_fourier + + # Solve projected BIEs for exterior and interior kernels + F = lu!(grad_green_fourier) + ldiv!(F, green_fourier) + F = lu!(grad_green_fourier_int) + ldiv!(F, green_fourier_int) + + # wv = (4π²/M) · Gram · green_fourier [P × P complex, Chance 2007 eq. 114] + wv .= (4π^2 / M) .* (Gram * green_fourier) # ── Backward-compatible reconstruction of real grri/grre ───────── - # This section can be removed once downstream modules work in mode space. - mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :])) - mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) - mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :])) - mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0) - - # wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114] - wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext) - wv .= complex.( - @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), - @view(wv_blocks[1:num_modes, (num_modes+1):(2*num_modes)]) .- @view(wv_blocks[(num_modes+1):(2*num_modes), 1:num_modes]) - ) + # Reconstruct M×2P real from P×P complex: grre = real(Z·c_ext), imag(Z·c_ext). 
+ mul!(temp, exp_mn_basis, green_fourier) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, green_fourier_int) + @view(grri[1:M, 1:P]) .= real.(temp) + @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) end println(" Galerkin Project and Solve TIME=$(round(proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))") @@ -306,7 +295,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC inputs.force_wv_symmetry && hermitianpart!(wv) - if inputs.nzeta > 1 # 3D + if nzeta > 1 # 3D plasma_pts .= plasma_surf.r wall_pts .= wall.r else # 2D From feb07e4e2f33330b033f044506507f9f318bcb75 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Sat, 14 Mar 2026 16:56:19 -0400 Subject: [PATCH 05/23] VACUUM - WIP - combining galerkin and fused galerkin into one main with a small kernel subbranch for fused or not --- .../Solovev_ideal_example_3D/run_example.jl | 4 + src/Vacuum/Vacuum.jl | 125 ++++++------------ 2 files changed, 41 insertions(+), 88 deletions(-) create mode 100644 examples/Solovev_ideal_example_3D/run_example.jl diff --git a/examples/Solovev_ideal_example_3D/run_example.jl b/examples/Solovev_ideal_example_3D/run_example.jl new file mode 100644 index 00000000..fa0a9e53 --- /dev/null +++ b/examples/Solovev_ideal_example_3D/run_example.jl @@ -0,0 +1,4 @@ +using Pkg; +Pkg.activate(joinpath(@__DIR__, "../..")) +using GeneralizedPerturbedEquilibrium +GeneralizedPerturbedEquilibrium.main([dirname(@__FILE__)]) diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 64decca8..2004ffc4 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -97,124 +97,73 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC grre = @view grre_in[1:num_points_total, :] grri = @view grri_in[1:num_points_total, :] - if wall.nowall && use_galerkin && fuse_projection + if wall.nowall && use_galerkin # 
================================================================ - # Fused Galerkin: kernel assembly + Fourier projection in one pass. - # The full M×M kernel matrices are never materialized — instead the - # P×P projected matrices grad_green_fourier and G_c are accumulated - # row by row as kernel values are computed. + # Galerkin: solve in P×P mode space. Uses complex basis Z = C + iS + # so projected matrices are P×P complex. # + # Fused (fuse_projection=true): kernel assembly + Fourier projection + # in one pass. The full M×M kernel matrices are never materialized — + # instead the P×P projected matrices grad_green_fourier and G_c are + # accumulated row by row as kernel values are computed. # Memory: O(MP + P²) instead of O(M²) - # FLOPs: O(M²P + P³) — same as two-step Galerkin + # + # Two-step (fuse_projection=false): full M×M kernel → project → solve. + # Memory: O(M²) for kernel storage + # + # FLOPs (both): O(M²P + P³) # ================================================================ P = num_modes M = num_points_surf - # Temporary matrices + # Temporary and projected kernel matrices [P × P complex] exp_mn_basis = zeros!(pool, ComplexF64, M, P) exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis) Gram = zeros!(pool, ComplexF64, P, P) - - # Projected kernel matrices grad_green_fourier = zeros!(pool, ComplexF64, P, P) green_fourier = zeros!(pool, ComplexF64, P, P) grad_green_fourier_int = similar!(pool, grad_green_fourier) green_fourier_int = similar!(pool, green_fourier) - - fused_timing = @timed begin - # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z [P × P complex] - projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams, - cos_mn_basis, sin_mn_basis, Gram) - end - println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") - - solve_timing = @timed begin - # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 
2·Gram - grad_green_fourier - # Gram matrix Gram = Z^H Z [P × P complex] - mul!(Gram, exp_mn_basis', exp_mn_basis) - grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier - green_fourier_int .= green_fourier - - # Solve projected BIEs for exterior and interior kernels - F = lu!(grad_green_fourier) - ldiv!(F, green_fourier) - F = lu!(grad_green_fourier_int) - ldiv!(F, green_fourier_int) - - # wv = (4π²/M) · Gram · c_ext [P × P complex, Chance 2007 eq. 114] - wv .= (4π^2 / M) .* (Gram * green_fourier) - - # ── Backward-compatible reconstruction of real grri/grre ───────── - # Reconstruct M×2P real from P×P complex: grre = real(Z·c_ext), imag(Z·c_ext). - # This section can be removed once downstream modules work in mode space. - temp = zeros!(pool, ComplexF64, M, P) - mul!(temp, exp_mn_basis, green_fourier) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, green_fourier_int) - @view(grri[1:M, 1:P]) .= real.(temp) - @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) - end - println(" Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") - - elseif wall.nowall && use_galerkin - # ================================================================ - # Two-step Galerkin: full M×M kernel → project → solve in P×P. - # Uses complex basis Z = C + iS so projected matrices are P×P complex. 
- # - # Memory: O(M²) for kernel storage - # FLOPs: O(M²P + P³) - # ================================================================ - - P = num_modes - M = num_points_surf - - # Full-size kernel matrices - grad_green = zeros!(pool, num_points_total, num_points_total) - green_temp = zeros!(pool, num_points_surf, num_points_surf) - - # Projected kernel matrices - grad_green_fourier = zeros!(pool, ComplexF64, P, P) - green_fourier = zeros!(pool, ComplexF64, P, P) - Gram = zeros!(pool, ComplexF64, P, P) - green_fourier_int = similar!(pool, green_fourier) - grad_green_fourier_int = similar!(pool, grad_green_fourier) - - # Temporary matrices - exp_mn_basis = zeros!(pool, ComplexF64, M, P) - exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis) temp = zeros!(pool, ComplexF64, M, P) - pp_kernel_timing = @timed begin - kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) - end - println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") - - proj_timing = @timed begin - # Project matrices to mode space - # grad_green_fourier = Z^H * grad_green * Z + if fuse_projection + # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z + fused_timing = @timed begin + projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams, + cos_mn_basis, sin_mn_basis, Gram) + end + println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") + else + # Full-size kernel matrices, then project to mode space + grad_green = zeros!(pool, num_points_total, num_points_total) + green_temp = zeros!(pool, num_points_surf, num_points_surf) + pp_kernel_timing = @timed begin + kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) + end + println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") + # Project the kernels 
to mode space - Z^H * K * Z and Z^H * G * Z mul!(temp, grad_green, exp_mn_basis) mul!(grad_green_fourier, exp_mn_basis', temp) - # green_fourier = Z^H * green_temp * Z mul!(temp, green_temp, exp_mn_basis) mul!(green_fourier, exp_mn_basis', temp) + end - # Interior kernel: grad_green_fourier_int = 2·Gram - grad_green_fourier - # Gram = Z^H Z [P × P complex] + solve_timing = @timed begin + # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier mul!(Gram, exp_mn_basis', exp_mn_basis) grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier + green_fourier_int .= green_fourier - # Solve projected BIEs for exterior and interior kernels + # Solve projected BIEs for exterior and interior. F = lu!(grad_green_fourier) ldiv!(F, green_fourier) F = lu!(grad_green_fourier_int) ldiv!(F, green_fourier_int) - # wv = (4π²/M) · Gram · green_fourier [P × P complex, Chance 2007 eq. 114] + # wv = (4π²/M) · Gram · green_fourier [Chance 2007 eq. 114] wv .= (4π^2 / M) .* (Gram * green_fourier) - # ── Backward-compatible reconstruction of real grri/grre ───────── - # Reconstruct M×2P real from P×P complex: grre = real(Z·c_ext), imag(Z·c_ext). + # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real. 
mul!(temp, exp_mn_basis, green_fourier) @view(grre[1:M, 1:P]) .= real.(temp) @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) @@ -222,7 +171,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC @view(grri[1:M, 1:P]) .= real.(temp) @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) end - println(" Galerkin Project and Solve TIME=$(round(proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))") + println(" Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") else # ================================================================ From 0e3b7e5fdaf894ca82e7062f9cdc26fc33a140b3 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Sat, 14 Mar 2026 18:17:41 -0400 Subject: [PATCH 06/23] VACUUM - WIP - consolidating cos/sin_mn_basis into exp_mn_basis and propagating it through the vacuum module --- benchmarks/benchmark_fourier_transforms.jl | 44 ++--- src/Utilities/FourierTransforms.jl | 30 ++- src/Vacuum/ProjectedKernel.jl | 202 ++++++--------------- src/Vacuum/Vacuum.jl | 82 +++++---- test/runtests_vacuum.jl | 76 ++++++++ 5 files changed, 210 insertions(+), 224 deletions(-) diff --git a/benchmarks/benchmark_fourier_transforms.jl b/benchmarks/benchmark_fourier_transforms.jl index 3a978417..840b63f6 100644 --- a/benchmarks/benchmark_fourier_transforms.jl +++ b/benchmarks/benchmark_fourier_transforms.jl @@ -28,10 +28,10 @@ function extract_modes(fft_result, mlow, mhigh, mtheta) for (i, m) in enumerate(mlow:mhigh) if m >= 0 # Positive frequencies - modes[i] = fft_result[m + 1] / mtheta # FFT normalization + modes[i] = fft_result[m+1] / mtheta # FFT normalization else # Negative frequencies (wrap around) - modes[i] = fft_result[mtheta + m + 1] / mtheta + modes[i] = fft_result[mtheta+m+1] / mtheta end end return modes @@ -39,10 +39,10 @@ end # Test configurations test_cases = [ - (name="Small (mtheta=128, mpert=10)", mtheta=128, mpert=10, mlow=-5), - 
(name="Medium (mtheta=256, mpert=20)", mtheta=256, mpert=20, mlow=-10), - (name="Large (mtheta=480, mpert=40)", mtheta=480, mpert=40, mlow=-20), - (name="Very Large (mtheta=1024, mpert=80)", mtheta=1024, mpert=80, mlow=-40), + (name="Small (mtheta=128, mpert=10)", mtheta=128, mpert=10, mlow=-5), + (name="Medium (mtheta=256, mpert=20)", mtheta=256, mpert=20, mlow=-10), + (name="Large (mtheta=480, mpert=40)", mtheta=480, mpert=40, mlow=-20), + (name="Very Large (mtheta=1024, mpert=80)", mtheta=1024, mpert=80, mlow=-40) ] for test in test_cases @@ -56,7 +56,7 @@ for test in test_cases mhigh = mlow + mpert - 1 # Create test data - theta = range(0, 2π, length=mtheta+1)[1:end-1] + theta = range(0, 2π; length=mtheta+1)[1:(end-1)] data = sin.(3 .* theta) .+ 0.5 .* cos.(7 .* theta) .+ 0.2 .* sin.(11 .* theta) # Initialize FourierTransform @@ -67,7 +67,8 @@ for test in test_cases theta_buffer = zeros(ComplexF64, mtheta) # Pre-allocate for low-level API - cslth, snlth = compute_fourier_coefficients(mtheta, mpert, mlow) + exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow) + cslth, snlth = real(exp_mn_basis), imag(exp_mn_basis) gij = reshape(data, mtheta, 1) # Matrix form gil = zeros(Float64, mtheta, mpert) @@ -99,7 +100,7 @@ for test in test_cases # Note: Our transform uses a different normalization and basis println("\n--- Accuracy Check ---") println("FourierTransform allocating vs in-place: ", - @sprintf("%.2e", maximum(abs.(modes_alloc .- modes_buffer)))) + @sprintf("%.2e", maximum(abs.(modes_alloc .- modes_buffer)))) # Compare magnitudes of modes (since basis might differ) println("Mode magnitudes comparison (FourierTransform vs FFTW):") @@ -129,9 +130,9 @@ for test in test_cases full_modes = zeros(ComplexF64, mtheta) for (i, m) in enumerate(mlow:mhigh) if m >= 0 - full_modes[m + 1] = modes_test[i] + full_modes[m+1] = modes_test[i] else - full_modes[mtheta + m + 1] = modes_test[i] + full_modes[mtheta+m+1] = modes_test[i] end end t6 = @benchmark 
ifft($full_modes) @@ -140,21 +141,21 @@ for test in test_cases # Accuracy check println("\n--- Inverse Accuracy Check ---") println("inverse() allocating vs in-place: ", - @sprintf("%.2e", maximum(abs.(theta_alloc .- theta_buffer)))) + @sprintf("%.2e", maximum(abs.(theta_alloc .- theta_buffer)))) println("Round-trip error (real part): ", - @sprintf("%.2e", maximum(abs.(real.(theta_alloc) .- data)))) + @sprintf("%.2e", maximum(abs.(real.(theta_alloc) .- data)))) # Performance summary println("\n--- Performance Summary ---") println(@sprintf("Forward transform speedup (in-place vs allocating): %.2fx", - median(t1).time / median(t2).time)) + median(t1).time / median(t2).time)) println(@sprintf("Allocations eliminated: %d → %d", - t1.allocs, t2.allocs)) + t1.allocs, t2.allocs)) # Compare to FFTW println(@sprintf("\nFourier vs FFTW (forward): %.2fx %s", - abs(median(t2).time / median(t3).time), - median(t2).time < median(t3).time ? "faster" : "slower")) + abs(median(t2).time / median(t3).time), + median(t2).time < median(t3).time ? 
"faster" : "slower")) println("Note: FFTW computes full DFT (all N modes), we compute truncated series ($mpert modes)") end @@ -169,7 +170,7 @@ mlow = -10 nbatch = 10 # Transform 10 functions simultaneously ft = FourierTransform(mtheta, mpert, mlow) -theta = range(0, 2π, length=mtheta+1)[1:end-1] +theta = range(0, 2π; length=mtheta+1)[1:(end-1)] # Create batch data data_matrix = zeros(Float64, mtheta, nbatch) @@ -182,7 +183,7 @@ modes_matrix = zeros(ComplexF64, mpert, nbatch) println("\nTransforming $nbatch functions of length $mtheta:") print("Allocating (loop): ") -@btime for i in 1:$nbatch +@btime for i in 1:($nbatch) modes = $ft($data_matrix[:, i]) end @@ -191,7 +192,7 @@ print("Allocating (matrix):") print("In-place (loop): ") modes_buffer = zeros(ComplexF64, mpert) -@btime for i in 1:$nbatch +@btime for i in 1:($nbatch) transform!($modes_buffer, $ft, $data_matrix[:, i]) end @@ -208,7 +209,8 @@ mpert = 10 mlow = 1 # Setup for low-level API -cslth, snlth = compute_fourier_coefficients(mtheta, mpert, mlow) +exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow) +cslth, snlth = real(exp_mn_basis), imag(exp_mn_basis) gij = randn(mtheta, mtheta) # Green's function matrix gil = zeros(Float64, mtheta, mpert) diff --git a/src/Utilities/FourierTransforms.jl b/src/Utilities/FourierTransforms.jl index 36a53e0b..ae3fe8ce 100644 --- a/src/Utilities/FourierTransforms.jl +++ b/src/Utilities/FourierTransforms.jl @@ -66,19 +66,16 @@ just use the n argument. 
In 3D, we need to compute the basis for all modes and g # Returns - 2D - - `cos_mn_basis::Matrix{Float64}`: Cosine coefficients `cos(m*θ - n*ν)` [mtheta, mpert] - - `sin_mn_basis::Matrix{Float64}`: Sine coefficients `sin(m*θ - n*ν)` [mtheta, mpert] + - `exp_mn_basis::Matrix{ComplexF64}`: Exponential coefficients `exp(i(m*θ - n*ν))` [mtheta, mpert] - 3D - - `cos_mn_basis::Matrix{Float64}`: Cosine coefficients `cos(m*θ - n*ν - n*ϕ)` [mtheta * nzeta, mpert * npert] - - `sin_mn_basis::Matrix{Float64}`: Sine coefficients `sin(m*θ - n*ν - n*ϕ)` [mtheta * nzeta, mpert * npert] + - `exp_mn_basis::Matrix{ComplexF64}`: Exponential coefficients `exp(i(m*θ - n*ν - n*ϕ))` [mtheta * nzeta, mpert * npert] # Notes The theta and phi grids are uniform: `θᵢ = 2π*i/mtheta` for `i = 0:mtheta-1` and `ϕⱼ = 2π*j/nzeta` for `j = 0:nzeta-1` When `n=0, ν=0` (default), this reduces to simple harmonic basis: -- `cos_mn_basis[i,l] = cos(m*θᵢ)` -- `sin_mn_basis[i,l] = sin(m*θᵢ)` +- `exp_mn_basis[i,l] = exp(i(m*θᵢ))` """ function compute_fourier_coefficients( mtheta::Int, @@ -100,17 +97,15 @@ function compute_fourier_coefficients( @assert length(ν) == mtheta "ν must have length mtheta" # In 2D, we only use one toroidal mode at a time - # Compute sin(mθ - nν) and cos(mθ - nν) - sin_mn_basis = sin.((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν) - cos_mn_basis = cos.((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν) + # Compute exp(i(mθ - nν)) + exp_mn_basis = exp.(im .* ((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν)) else # 3D @assert (n_2D === nothing && ν === nothing) "n_2D and ν should be nothing for 3D" # In 3D, we need to compute the basis for all modes and grid points - # Compute sin(mθ - nζ) and cos(mθ - nζ) + # Compute exp(i(mθ - nζ)) ζ_grid = range(; start=0, length=nzeta, step=2π/nzeta) - sin_mn_basis = zeros(mtheta * nzeta, mpert * npert) - cos_mn_basis = zeros(mtheta * nzeta, mpert * npert) + exp_mn_basis = zeros(ComplexF64, mtheta * nzeta, mpert * npert) for idx_n in 1:npert n = 
nlow + idx_n - 1 n_col_offset = (idx_n - 1) * mpert @@ -119,16 +114,13 @@ function compute_fourier_coefficients( col = idx_m + n_col_offset for (j, ζ) in enumerate(ζ_grid), (i, θ) in enumerate(θ_grid) idx = i + (j-1)*mtheta - arg = m * θ - n * ζ - s, c = sincos(arg) - cos_mn_basis[idx, col] = c - sin_mn_basis[idx, col] = s + exp_mn_basis[idx, col] = exp(im * (m * θ - n * ζ)) end end end end - return cos_mn_basis, sin_mn_basis + return exp_mn_basis end """ @@ -207,8 +199,8 @@ function FourierTransform( n::Int=0, ν::Vector{Float64}=zeros(Float64, mtheta) ) - cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, 1, 1, 1; n_2D=n, ν=ν) - return FourierTransform(mtheta, mpert, mlow, cos_mn_basis, sin_mn_basis) + exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, 1, 1, 1; n_2D=n, ν=ν) + return FourierTransform(mtheta, mpert, mlow, real(exp_mn_basis), imag(exp_mn_basis)) end """ diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl index 4bca9062..06f8b3f2 100644 --- a/src/Vacuum/ProjectedKernel.jl +++ b/src/Vacuum/ProjectedKernel.jl @@ -19,7 +19,7 @@ # 2D fused projected kernel # ============================================================================ """ - projected_kernel!(K_c, G_c, observer, source, params, cos_basis, sin_basis, Gram) + projected_kernel!(K_c, G_c, observer, source, params, exp_mn_basis, Gram) Compute the Fourier-projected kernel matrices K_c = Z^H K Z and G_c = Z^H G Z directly, without materializing the full M×M kernel matrices. @@ -33,8 +33,7 @@ Dispatches to the 2D or 3D implementation based on the geometry/params types. 
- `observer`: Observer geometry struct - `source`: Source geometry struct - `params`: Kernel parameters (KernelParams2D or KernelParams3D) - - `cos_basis::Matrix{Float64}`: [M × P] cosine Fourier basis - - `sin_basis::Matrix{Float64}`: [M × P] sine Fourier basis + - `exp_mn_basis::Matrix{ComplexF64}`: [M × P] complex Fourier basis Z = exp(i(mθ − nζ)) - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term) """ function projected_kernel! end @@ -45,11 +44,10 @@ function projected_kernel!( observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, params::KernelParams2D, - cos_basis::Matrix{Float64}, - sin_basis::Matrix{Float64}, + exp_mn_basis::AbstractMatrix{ComplexF64}, Gram::AbstractMatrix{ComplexF64} ) - _projected_kernel_2D!(K_c, G_c, observer, source, params.n, cos_basis, sin_basis, Gram) + _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram) end function projected_kernel!( @@ -58,16 +56,15 @@ function projected_kernel!( observer::Union{PlasmaGeometry3D,WallGeometry3D}, source::Union{PlasmaGeometry3D,WallGeometry3D}, params::KernelParams3D, - cos_basis::Matrix{Float64}, - sin_basis::Matrix{Float64}, + exp_mn_basis::AbstractMatrix{ComplexF64}, Gram::AbstractMatrix{ComplexF64} ) _projected_kernel_3D!(K_c, G_c, observer, source, params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER, - cos_basis, sin_basis, Gram) + exp_mn_basis, Gram) end """ - _projected_kernel_2D!(K_c, G_c, observer, source, n, cos_basis, sin_basis, Gram) + _projected_kernel_2D!(K_c, G_c, observer, source, n, exp_mn_basis, Gram) Fused 2D kernel assembly + projection. Mirrors the loop structure of `compute_2D_kernel_matrices!` but accumulates rank-1 contributions into the @@ -81,11 +78,11 @@ Memory: O(MP) instead of O(M²). 
observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, n::Int, - cos_basis::Matrix{Float64}, - sin_basis::Matrix{Float64}, + exp_mn_basis::AbstractMatrix{ComplexF64}, Gram::AbstractMatrix{ComplexF64} ) - M, P = size(cos_basis) + M, P = size(exp_mn_basis) + Z = exp_mn_basis mtheta = length(observer.x) dtheta = 2π / mtheta theta_grid = range(; start=0, length=mtheta, step=dtheta) @@ -113,31 +110,19 @@ Memory: O(MP) instead of O(M²). d1_spline_x(dx_dtheta_grid, theta_grid) d1_spline_z(dz_dtheta_grid, theta_grid) - # Pre-transpose basis for contiguous column access: Ct[:, k] = C[k, :] - Ct = acquire!(pool, Float64, P, M) - St = acquire!(pool, Float64, P, M) - Ct .= cos_basis' - St .= sin_basis' + # Zero output matrices; we accumulate rank-1 updates (conj(Z[j,:]) ⊗ proj_z) + fill!(K_c, 0.0) + fill!(G_c, 0.0) - # Real/imaginary accumulators for P×P projected matrices - K_re = zeros(P, P) - K_im = zeros(P, P) - G_re = zeros(P, P) - G_im = zeros(P, P) - - # Per-observer projection vectors (P-length) - proj_kc = zeros(P) - proj_ks = zeros(P) - proj_gc = zeros(P) - proj_gs = zeros(P) + # Per-observer projection vectors (P-length complex): proj_z = (kernel row) · Z + proj_kz = zeros(ComplexF64, P) + proj_gz = zeros(ComplexF64, P) for j in 1:mtheta x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j] - fill!(proj_kc, 0.0) - fill!(proj_ks, 0.0) - fill!(proj_gc, 0.0) - fill!(proj_gs, 0.0) + fill!(proj_kz, 0.0) + fill!(proj_gz, 0.0) diag_accum = 0.0 # ── Simpson integration for nonsingular source points ── @@ -152,12 +137,10 @@ Memory: O(MP) instead of O(M²). 
if populate_greenfunction w_g = G_n * wsimpson - BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc) - BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs) + BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz) end w_k = gradG_n * wsimpson - BLAS.axpy!(w_k, @view(Ct[:, isrc]), proj_kc) - BLAS.axpy!(w_k, @view(St[:, isrc]), proj_ks) + BLAS.axpy!(ComplexF64(w_k), @view(Z[isrc, :]), proj_kz) diag_accum -= gradG_0 * wsimpson end @@ -190,16 +173,14 @@ Memory: O(MP) instead of O(M²). @inbounds for stencil_idx in 1:5 w_g = G_n * s[stencil_idx] * wgauss isrc = sing_idx[stencil_idx] - BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc) - BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs) + BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz) end end @inbounds for stencil_idx in 1:5 w_k = gradG_n * s[stencil_idx] * wgauss isrc = sing_idx[stencil_idx] - BLAS.axpy!(w_k, @view(Ct[:, isrc]), proj_kc) - BLAS.axpy!(w_k, @view(St[:, isrc]), proj_ks) + BLAS.axpy!(ComplexF64(w_k), @view(Z[isrc, :]), proj_kz) end diag_accum -= gradG_0 * wgauss @@ -211,28 +192,17 @@ Memory: O(MP) instead of O(M²). 
@inbounds for stencil_idx in 1:5 w_g = -log_correction_array[stencil_idx] / x_obs isrc = sing_idx[stencil_idx] - BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc) - BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs) + BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz) end end # Fold diagonal accumulation into projection - BLAS.axpy!(diag_accum, @view(Ct[:, j]), proj_kc) - BLAS.axpy!(diag_accum, @view(St[:, j]), proj_ks) - - # ── Rank-1 accumulate into P×P projection matrices ── - # K_c_re += C[j,:] ⊗ proj_kc + S[j,:] ⊗ proj_ks - BLAS.ger!(1.0, @view(Ct[:, j]), proj_kc, K_re) - BLAS.ger!(1.0, @view(St[:, j]), proj_ks, K_re) - # K_c_im += C[j,:] ⊗ proj_ks − S[j,:] ⊗ proj_kc - BLAS.ger!(1.0, @view(Ct[:, j]), proj_ks, K_im) - BLAS.ger!(-1.0, @view(St[:, j]), proj_kc, K_im) + BLAS.axpy!(ComplexF64(diag_accum), @view(Z[j, :]), proj_kz) + # ── Rank-1 accumulate: K_c += conj(Z[j,:]) ⊗ proj_kz ── + BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_kz, K_c) if populate_greenfunction - BLAS.ger!(1.0, @view(Ct[:, j]), proj_gc, G_re) - BLAS.ger!(1.0, @view(St[:, j]), proj_gs, G_re) - BLAS.ger!(1.0, @view(Ct[:, j]), proj_gs, G_im) - BLAS.ger!(-1.0, @view(St[:, j]), proj_gc, G_im) + BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_gz, G_c) end end @@ -240,26 +210,20 @@ Memory: O(MP) instead of O(M²). # Normals point out of vacuum for wall but inward for plasma → flip sign for plasma source if source isa PlasmaGeometry - K_re .*= -1 - K_im .*= -1 + K_c .*= -1 end # Diagonal residue: K += residue·I → K_c += residue·Gram # [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89] residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 
2.0 : -2.0) if residue != 0.0 - K_re .+= residue .* real.(Gram) - K_im .+= residue .* imag.(Gram) + K_c .+= residue .* Gram end # 2π𝒢 → 𝒢 if populate_greenfunction - G_re ./= 2π - G_im ./= 2π + G_c ./= 2π end - - K_c .= complex.(K_re, K_im) - G_c .= complex.(G_re, G_im) end @@ -268,7 +232,7 @@ end # ============================================================================ """ - _projected_kernel_3D!(K_c, G_c, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, cos_basis, sin_basis, Gram) + _projected_kernel_3D!(K_c, G_c, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, exp_mn_basis, Gram) Fused 3D kernel assembly + projection. Mirrors the loop structure of `compute_3D_kernel_matrices!` (including multi-threading and BIEST singular correction) @@ -280,7 +244,7 @@ Each observer writes to its own row of the shared buffers, so there are no cross-thread accumulation races — the same write pattern as the original `compute_3D_kernel_matrices!`. -Memory: O(4MP + P²) instead of O(M²). +Memory: O(2MP + P²) instead of O(M²). """ function _projected_kernel_3D!( K_c::AbstractMatrix{ComplexF64}, @@ -290,11 +254,11 @@ function _projected_kernel_3D!( PATCH_RAD::Int, RAD_DIM::Int, INTERP_ORDER::Int, - cos_basis::Matrix{Float64}, - sin_basis::Matrix{Float64}, + exp_mn_basis::AbstractMatrix{ComplexF64}, Gram::AbstractMatrix{ComplexF64} ) - M, P = size(cos_basis) + M, P = size(exp_mn_basis) + Z = exp_mn_basis num_points = observer.mtheta * observer.nzeta dθdζ = 4π^2 / num_points @@ -307,25 +271,15 @@ function _projected_kernel_3D!( quad_data = get_singular_quadrature(PATCH_RAD, RAD_DIM, INTERP_ORDER) (; PATCH_DIM, ANG_DIM, Ppou, Gpou, P2G) = quad_data - # Pre-transpose basis for contiguous column access in the inner loop - Ct = Matrix(cos_basis') # [P × M] - St = Matrix(sin_basis') # [P × M] - - # [M × P] buffers for projected kernel rows. - # Row idx_obs = Σ_k K[idx_obs, k] · basis[k, :] — each observer writes to - # its own row, so no cross-thread races. 
- KZ_c = zeros(M, P) - KZ_s = zeros(M, P) - GZ_c = zeros(M, P) - GZ_s = zeros(M, P) + # [M × P] buffers: row idx_obs holds (kernel row idx_obs) · Z + KZ = zeros(ComplexF64, M, P) + GZ = zeros(ComplexF64, M, P) # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors) max_tid = Threads.maxthreadid() workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_tid] - proj_kc_all = [zeros(P) for _ in 1:max_tid] - proj_ks_all = [zeros(P) for _ in 1:max_tid] - proj_gc_all = [zeros(P) for _ in 1:max_tid] - proj_gs_all = [zeros(P) for _ in 1:max_tid] + proj_kz_all = [zeros(ComplexF64, P) for _ in 1:max_tid] + proj_gz_all = [zeros(ComplexF64, P) for _ in 1:max_tid] Threads.@threads :static for idx_obs in 1:num_points tid = Threads.threadid() @@ -333,15 +287,11 @@ function _projected_kernel_3D!( (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar, n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws - proj_kc = proj_kc_all[tid] - proj_ks = proj_ks_all[tid] - proj_gc = proj_gc_all[tid] - proj_gs = proj_gs_all[tid] + proj_kz = proj_kz_all[tid] + proj_gz = proj_gz_all[tid] - fill!(proj_kc, 0.0) - fill!(proj_ks, 0.0) - fill!(proj_gc, 0.0) - fill!(proj_gs, 0.0) + fill!(proj_kz, 0.0) + fill!(proj_gz, 0.0) i_obs = mod1(idx_obs, observer.mtheta) j_obs = (idx_obs - 1) ÷ observer.mtheta + 1 @@ -352,17 +302,11 @@ function _projected_kernel_3D!( r_src = @view source.r[idx_src, :] n_src = @view source.normal[idx_src, :] w_double = laplace_double_layer(r_obs, r_src, n_src) * dθdζ - @inbounds @simd for m in 1:P - proj_kc[m] += w_double * Ct[m, idx_src] - proj_ks[m] += w_double * St[m, idx_src] - end + BLAS.axpy!(ComplexF64(w_double), @view(Z[idx_src, :]), proj_kz) if populate_greenfunction w_single = laplace_single_layer(r_obs, r_src) * dθdζ - @inbounds @simd for m in 1:P - proj_gc[m] += w_single * Ct[m, idx_src] - proj_gs[m] += w_single * St[m, idx_src] - end + BLAS.axpy!(ComplexF64(w_single), 
@view(Z[idx_src, :]), proj_gz) end end @@ -398,64 +342,34 @@ function _projected_kernel_3D!( n_src = @view source.normal[idx_src, :] far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[ii, jj] * dθdζ w_double = M_grid_double[ii, jj] + far_double - @simd for m in 1:P - proj_kc[m] += w_double * Ct[m, idx_src] - proj_ks[m] += w_double * St[m, idx_src] - end + BLAS.axpy!(ComplexF64(w_double), @view(Z[idx_src, :]), proj_kz) if populate_greenfunction far_single = laplace_single_layer(r_obs, r_src) * Gpou[ii, jj] * dθdζ w_single = M_grid_single[ii, jj] + far_single - @simd for m in 1:P - proj_gc[m] += w_single * Ct[m, idx_src] - proj_gs[m] += w_single * St[m, idx_src] - end + BLAS.axpy!(ComplexF64(w_single), @view(Z[idx_src, :]), proj_gz) end end # ── Write projected row to buffer (each idx_obs owns its row) ── - @inbounds for m in 1:P - KZ_c[idx_obs, m] = proj_kc[m] - KZ_s[idx_obs, m] = proj_ks[m] - end + @inbounds KZ[idx_obs, :] .= proj_kz if populate_greenfunction - @inbounds for m in 1:P - GZ_c[idx_obs, m] = proj_gc[m] - GZ_s[idx_obs, m] = proj_gs[m] - end + @inbounds GZ[idx_obs, :] .= proj_gz end end - # ── Assemble P×P projected matrices via GEMM (sequential, after barrier) ── - # K_c = Z^H K Z = (C'·KZ_c + S'·KZ_s) + i(C'·KZ_s − S'·KZ_c) - K_re = zeros(P, P) - K_im = zeros(P, P) - mul!(K_re, cos_basis', KZ_c) - mul!(K_re, sin_basis', KZ_s, 1.0, 1.0) - mul!(K_im, cos_basis', KZ_s) - mul!(K_im, sin_basis', KZ_c, -1.0, 1.0) - - G_re = zeros(P, P) - G_im = zeros(P, P) + # ── Assemble P×P projected matrices: K_c = Z^H K Z, G_c = Z^H G Z ── + mul!(K_c, Z', KZ) + K_c ./= 2π if populate_greenfunction - mul!(G_re, cos_basis', GZ_c) - mul!(G_re, sin_basis', GZ_s, 1.0, 1.0) - mul!(G_im, cos_basis', GZ_s) - mul!(G_im, sin_basis', GZ_c, -1.0, 1.0) + mul!(G_c, Z', GZ) + G_c ./= 2π + else + fill!(G_c, 0.0) end - # ── Post-processing (mirrors compute_3D_kernel_matrices!) 
── - K_re ./= 2π - K_im ./= 2π - G_re ./= 2π - G_im ./= 2π - # Diagonal: K += I → K_c += Gram [for same-type source/observer] if typeof(source) == typeof(observer) - K_re .+= real.(Gram) - K_im .+= imag.(Gram) + K_c .+= Gram end - - K_c .= complex.(K_re, K_im) - G_c .= complex.(G_re, G_im) end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 2004ffc4..ac804eb3 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -9,7 +9,7 @@ using AdaptiveArrayPools # Import parent modules import ..Equilibrium -using ..Utilities.FourierTransforms: compute_fourier_coefficients, fourier_transform!, fourier_inverse_transform! +using ..Utilities.FourierTransforms: compute_fourier_coefficients include("Utilities.jl") include("DataTypes.jl") @@ -80,8 +80,8 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # Compute Fourier basis coefficients ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing - cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν) - num_points_surf, num_modes = size(cos_mn_basis) + exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν) + num_points_surf, num_modes = size(exp_mn_basis) # Create kernel parameters structs used to dispatch to the correct kernel # Hardcode these values for now - can expose to the user in the future @@ -97,10 +97,15 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC grre = @view grre_in[1:num_points_total, :] grri = @view grri_in[1:num_points_total, :] + # Complex buffer for projecting to mode space (G*Z) and back; grre/grri stay real for backwards compatibility + M = num_points_surf + P = num_modes + temp = zeros!(pool, ComplexF64, M, P) + if wall.nowall && use_galerkin # ================================================================ - # Galerkin: solve in P×P mode space. 
Uses complex basis Z = C + iS - # so projected matrices are P×P complex. + # Galerkin: solve system in P×P mode space. Uses complex basis + # Z = C + iS so projected matrices are P×P complex. # # Fused (fuse_projection=true): kernel assembly + Fourier projection # in one pass. The full M×M kernel matrices are never materialized — @@ -113,24 +118,21 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # # FLOPs (both): O(M²P + P³) # ================================================================ - P = num_modes - M = num_points_surf - - # Temporary and projected kernel matrices [P × P complex] - exp_mn_basis = zeros!(pool, ComplexF64, M, P) - exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis) - Gram = zeros!(pool, ComplexF64, P, P) + # Projected kernel matrices [P × P complex] grad_green_fourier = zeros!(pool, ComplexF64, P, P) green_fourier = zeros!(pool, ComplexF64, P, P) grad_green_fourier_int = similar!(pool, grad_green_fourier) green_fourier_int = similar!(pool, green_fourier) - temp = zeros!(pool, ComplexF64, M, P) + + # Gram matrix required by projected_kernel! 
for the diagonal residue and for interior solve + Gram = zeros!(pool, ComplexF64, P, P) + mul!(Gram, exp_mn_basis', exp_mn_basis) if fuse_projection # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z fused_timing = @timed begin projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams, - cos_mn_basis, sin_mn_basis, Gram) + exp_mn_basis, Gram) end println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") else @@ -142,25 +144,27 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC end println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") # Project the kernels to mode space - Z^H * K * Z and Z^H * G * Z - mul!(temp, grad_green, exp_mn_basis) - mul!(grad_green_fourier, exp_mn_basis', temp) - mul!(temp, green_temp, exp_mn_basis) - mul!(green_fourier, exp_mn_basis', temp) + proj_timing = @timed begin + mul!(temp, grad_green, exp_mn_basis) + mul!(grad_green_fourier, exp_mn_basis', temp) + mul!(temp, green_temp, exp_mn_basis) + mul!(green_fourier, exp_mn_basis', temp) + end + println(" Project Kernel TIME=$(round(proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))") end solve_timing = @timed begin # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier - mul!(Gram, exp_mn_basis', exp_mn_basis) grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier green_fourier_int .= green_fourier - # Solve projected BIEs for exterior and interior. + # Solve projected BIEs for exterior and interior F = lu!(grad_green_fourier) ldiv!(F, green_fourier) F = lu!(grad_green_fourier_int) ldiv!(F, green_fourier_int) - # wv = (4π²/M) · Gram · green_fourier [Chance 2007 eq. 
114] + # wv = (4π²/M) · Gram · green_fourier wv .= (4π^2 / M) .* (Gram * green_fourier) # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real. @@ -178,7 +182,6 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # Collocation approach: solve full physical-space system [M × M] # Handles both no-wall and wall cases. # ================================================================ - # Full-size kernel matrices grad_green = zeros!(pool, num_points_total, num_points_total) green_temp = zeros!(pool, num_points_surf, num_points_surf) @@ -188,12 +191,13 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC end println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") - # FT plasma→plasma Green's function (must precede kernel! calls that overwrite green_temp) - colloc_ft_timing = @timed begin - fourier_transform!(grre, green_temp, cos_mn_basis) - fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes) + # Project plasma→plasma Green's function to mode space: grre[1:M, 1:2P] = real/imag(G*Z) + colloc_proj_timing = @timed begin + mul!(temp, green_temp, exp_mn_basis) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) end - println(" Plasma Fourier Transform TIME=$(round(colloc_ft_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(colloc_ft_timing.bytes))") + println(" Plasma Project TIME=$(round(colloc_proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(colloc_proj_timing.bytes))") if !wall.nowall wall_block_timing = @timed begin @@ -203,11 +207,12 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC kernel!(grad_green, green_temp, wall, wall, kparams) # Wall–Plasma block kernel!(grad_green, green_temp, wall, plasma_surf, kparams) - # Fourier transform obs=wall, src=plasma block - fourier_transform!(grre, 
green_temp, cos_mn_basis; row_offset=num_points_surf) - fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes) + # Project obs=wall, src=plasma block to mode space + mul!(temp, green_temp, exp_mn_basis) + @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) end - println(" Wall Kernel and Fourier Transform TIME=$(round(wall_block_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(wall_block_timing.bytes))") + println(" Wall Kernel and Project TIME=$(round(wall_block_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(wall_block_timing.bytes))") end # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1) @@ -230,16 +235,13 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC end println(" Invert and Solve TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") - invft_timing = @timed begin - # Inverse Fourier transform to extract wv [Chance Phys. Plasmas 2007 052506 eq. 115-118] - arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4) - fourier_inverse_transform!(arr, grre, cos_mn_basis) - fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes) - fourier_inverse_transform!(ari, grre, sin_mn_basis) - fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes) - wv .= complex.(arr .+ aii, air .- ari) + wv_timing = @timed begin + # wv = (4π²/M) · Z^H · grre_complex [Chance Phys. Plasmas 2007 052506 eq. 
115-118] + temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)])) + mul!(wv, exp_mn_basis', temp) + wv .*= (4π^2 / M) end - println(" Compute Wv TIME=$(round(invft_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(invft_timing.bytes))") + println(" Compute Wv TIME=$(round(wv_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(wv_timing.bytes))") end inputs.force_wv_symmetry && hermitianpart!(wv) diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl index d51bd622..bba2ff61 100644 --- a/test/runtests_vacuum.jl +++ b/test/runtests_vacuum.jl @@ -445,6 +445,82 @@ @test size(plasma_pts) == (16, 3) end end + + # ------------------------------------------------------------------------- + @testset "fused vs two-step Galerkin (2D, nowall)" begin + # Small case where both Galerkin paths are cheap: compare K_c, G_c + # assembled via the full M×M kernel + projection against the fused + # projected kernels from the unified `kernel!` API. + inputs = VacuumInput( + mtheta_in=17, + nzeta_in=1, + x=collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=17))), + z=collect(0.3 .* sin.(range(0, 2π, length=17))), + ν=zeros(17), + mlow=1, + mpert=2, + nlow=1, + npert=1, + nzeta=1, + mtheta=32 + ) + wall_settings = WallShapeSettings(shape="nowall") + + plasma_surf = GeneralizedPerturbedEquilibrium.Vacuum.PlasmaGeometry(inputs) + kparams = GeneralizedPerturbedEquilibrium.Vacuum.KernelParams2D(inputs.nlow) + + # Fourier basis on the surface grid + exp_mn_basis = GeneralizedPerturbedEquilibrium.Utilities.FourierTransforms.compute_fourier_coefficients( + inputs.mtheta, + inputs.mpert, + inputs.mlow, + inputs.nzeta, + inputs.npert, + inputs.nlow; + n_2D=inputs.nlow, + ν=plasma_surf.ν + ) + M, P = size(exp_mn_basis) + Gram = exp_mn_basis' * exp_mn_basis + + # --- Two-step Galerkin: materialize full kernels then project --- + grad_green_full = zeros(Float64, 2M, 2M) + green_full = zeros(Float64, M, M) + GeneralizedPerturbedEquilibrium.Vacuum.kernel!( + 
grad_green_full, + green_full, + plasma_surf, + plasma_surf, + kparams + ) + + # Exterior projected kernels from full matrices: K_c = Z^H K Z, G_c = Z^H G Z + K_c_two = zeros(ComplexF64, P, P) + G_c_two = zeros(ComplexF64, P, P) + tmp = zeros(ComplexF64, M, P) + + grad_pp = @view grad_green_full[1:M, 1:M] + mul!(tmp, grad_pp, exp_mn_basis) + mul!(K_c_two, exp_mn_basis', tmp) + mul!(tmp, green_full, exp_mn_basis) + mul!(G_c_two, exp_mn_basis', tmp) + + # --- Fused Galerkin via unified kernel! --- + K_c_fused = zeros(ComplexF64, P, P) + G_c_fused = zeros(ComplexF64, P, P) + GeneralizedPerturbedEquilibrium.Vacuum.projected_kernel!( + K_c_fused, + G_c_fused, + plasma_surf, + plasma_surf, + kparams, + exp_mn_basis=exp_mn_basis, + Gram=Gram + ) + + @test isapprox(K_c_fused, K_c_two; rtol=1e-10, atol=1e-12) + @test isapprox(G_c_fused, G_c_two; rtol=1e-10, atol=1e-12) + end end # ------------------------------------------------------------------------- From c29d41692d0562f2aa9810d6731915d73191e30c Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Sat, 14 Mar 2026 18:55:59 -0400 Subject: [PATCH 07/23] VACUUM - WIP - wall implementation of the galerkin method (working for solovev already) --- src/Vacuum/DataTypes.jl | 5 ++- src/Vacuum/Vacuum.jl | 87 ++++++++++++++++++++++++++++++++++++++++- test/runtests_vacuum.jl | 81 +++++++++++++++++++++++++++++++++++++- 3 files changed, 168 insertions(+), 5 deletions(-) diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl index 10c1f8bd..e8d4fad4 100644 --- a/src/Vacuum/DataTypes.jl +++ b/src/Vacuum/DataTypes.jl @@ -23,8 +23,9 @@ nzeta > 1 for 3D vacuum calculation. - `nzeta::Int`: Number of vacuum calculation toroidal grid points (1 for 2D vacuum calculation, > 1 for 3D vacuum calculation) - `force_wv_symmetry::Bool`: Boolean flag to enforce symmetry in the vacuum response matrix - `use_galerkin::Bool`: Use Galerkin projection to solve in truncated Fourier space [O(P³)] - instead of full collocation [O(M³)]. 
Only applies to the no-wall case; wall cases always - use collocation. Defaults to `false`. + instead of full collocation [O(M³)]. Applies to both no-wall and wall cases. For the wall + case, both plasma and wall unknowns are represented in (m,n) mode space, yielding a 2P×2P + system with no M² storage. Defaults to `false`. - `fuse_projection::Bool`: When combined with `use_galerkin`, fuse the kernel assembly with the Fourier projection so that the full M×M kernel matrices are never materialized. Reduces memory from O(M²) to O(MP). Requires `use_galerkin = true`. Defaults to `false`. diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index ac804eb3..d7a41f4b 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -158,7 +158,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier green_fourier_int .= green_fourier - # Solve projected BIEs for exterior and interior + # Solve projected BIEs for exterior and interior kernels F = lu!(grad_green_fourier) ldiv!(F, green_fourier) F = lu!(grad_green_fourier_int) @@ -168,6 +168,8 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC wv .= (4π^2 / M) .* (Gram * green_fourier) # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real. + # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code + # perhaps make it a complex P * P matrix? 
Then don't need any of this section mul!(temp, exp_mn_basis, green_fourier) @view(grre[1:M, 1:P]) .= real.(temp) @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) @@ -177,6 +179,89 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC end println(" Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") + elseif !wall.nowall && use_galerkin + # ================================================================ + # Wall Galerkin: both plasma and wall unknowns in (m,n) mode space. + # Builds four P×P projected kernel blocks (pp, pw, wp, ww) via + # projected_kernel!, assembles a 2P×2P system, and solves directly. + # Same exp_mn_basis and Gram for all blocks (same angular grid). + # Memory: O(MP + P²), no M² or (2M)² storage. + # ================================================================ + + Gram = zeros!(pool, ComplexF64, P, P) + mul!(Gram, exp_mn_basis', exp_mn_basis) + + # Four projected kernel blocks [P × P complex each] + K_pp_c = zeros!(pool, ComplexF64, P, P) + G_pp_c = zeros!(pool, ComplexF64, P, P) + K_pw_c = zeros!(pool, ComplexF64, P, P) + G_pw_c = zeros!(pool, ComplexF64, P, P) + K_wp_c = zeros!(pool, ComplexF64, P, P) + G_wp_c = zeros!(pool, ComplexF64, P, P) + K_ww_c = zeros!(pool, ComplexF64, P, P) + G_ww_c = zeros!(pool, ComplexF64, P, P) + + kernel_timing = @timed begin + projected_kernel!(K_pp_c, G_pp_c, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) + projected_kernel!(K_pw_c, G_pw_c, plasma_surf, wall, kparams, exp_mn_basis, Gram) + projected_kernel!(K_wp_c, G_wp_c, wall, plasma_surf, kparams, exp_mn_basis, Gram) + projected_kernel!(K_ww_c, G_ww_c, wall, wall, kparams, exp_mn_basis, Gram) + end + println(" Wall Galerkin Projected Kernels TIME=$(round(kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))") + + solve_timing = @timed begin + # Assemble 2P×2P exterior system and interior system (before LU 
overwrites) + K_ext = zeros!(pool, ComplexF64, 2P, 2P) + K_ext[1:P, 1:P] .= K_pp_c + K_ext[1:P, (P+1):(2P)] .= K_pw_c + K_ext[(P+1):(2P), 1:P] .= K_wp_c + K_ext[(P+1):(2P), (P+1):(2P)] .= K_ww_c + + K_int = zeros!(pool, ComplexF64, 2P, 2P) + K_int[1:P, 1:P] .= 2 .* Gram .- K_pp_c + K_int[1:P, (P+1):(2P)] .= .-K_pw_c + K_int[(P+1):(2P), 1:P] .= .-K_wp_c + K_int[(P+1):(2P), (P+1):(2P)] .= 2 .* Gram .- K_ww_c + + # RHS [2P × P]: single-layer blocks (only plasma-source blocks are nonzero) + G_rhs_ext = zeros!(pool, ComplexF64, 2P, P) + G_rhs_ext[1:P, :] .= G_pp_c + G_rhs_ext[(P+1):(2P), :] .= G_wp_c + + G_rhs_int = similar!(pool, G_rhs_ext) + G_rhs_int .= G_rhs_ext + + # Exterior solve: K_ext * C_ext = G_rhs_ext + F_ext = lu!(K_ext) + ldiv!(F_ext, G_rhs_ext) + c_p_ext = @view G_rhs_ext[1:P, :] + c_w_ext = @view G_rhs_ext[(P+1):(2P), :] + + # Interior solve: K_int * C_int = G_rhs_int + F_int = lu!(K_int) + ldiv!(F_int, G_rhs_int) + c_p_int = @view G_rhs_int[1:P, :] + c_w_int = @view G_rhs_int[(P+1):(2P), :] + + # wv = (4π²/M) · Gram · c_p_ext (plasma observer only) + wv .= (4π^2 / M) .* (Gram * c_p_ext) + + # Backward-compatible reconstruction: grre/grri in M×2P real layout + mul!(temp, exp_mn_basis, c_p_ext) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, c_p_int) + @view(grri[1:M, 1:P]) .= real.(temp) + @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, c_w_ext) + @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, c_w_int) + @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) + end + println(" Wall Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") + else # ================================================================ # Collocation approach: solve full physical-space system [M × M] 
diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl index bba2ff61..472c6c68 100644 --- a/test/runtests_vacuum.jl +++ b/test/runtests_vacuum.jl @@ -514,13 +514,51 @@ plasma_surf, plasma_surf, kparams, - exp_mn_basis=exp_mn_basis, - Gram=Gram + exp_mn_basis, + Gram ) @test isapprox(K_c_fused, K_c_two; rtol=1e-10, atol=1e-12) @test isapprox(G_c_fused, G_c_two; rtol=1e-10, atol=1e-12) end + + # ------------------------------------------------------------------------- + @testset "wall Galerkin vs collocation (2D, conformal)" begin + mtheta_eq = 17 + mtheta = 128 + mpert = 3 + boundary_x = collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=mtheta_eq))) + boundary_z = collect(0.3 .* sin.(range(0, 2π, length=mtheta_eq))) + + inputs_colloc = VacuumInput( + mtheta_in=mtheta_eq, nzeta_in=1, + x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq), + mlow=1, mpert=mpert, nlow=1, npert=1, + nzeta=1, mtheta=mtheta, + use_galerkin=false + ) + inputs_galerkin = VacuumInput( + mtheta_in=mtheta_eq, nzeta_in=1, + x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq), + mlow=1, mpert=mpert, nlow=1, npert=1, + nzeta=1, mtheta=mtheta, + use_galerkin=true + ) + + wall_settings = WallShapeSettings(shape="conformal", a=0.5) + + wv_c, grri_c, grre_c, _, _ = compute_vacuum_response(inputs_colloc, wall_settings) + wv_g, grri_g, grre_g, _, _ = compute_vacuum_response(inputs_galerkin, wall_settings) + + M = mtheta + P = mpert + + @test isapprox(wv_g, wv_c; rtol=1e-8) + @test isapprox(grre_g[1:M, 1:(2*P)], grre_c[1:M, 1:(2*P)]; rtol=1e-8) + @test isapprox(grri_g[1:M, 1:(2*P)], grri_c[1:M, 1:(2*P)]; rtol=1e-8) + @test isapprox(grre_g[(M+1):(2*M), 1:(2*P)], grre_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8) + @test isapprox(grri_g[(M+1):(2*M), 1:(2*P)], grri_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8) + end end # ------------------------------------------------------------------------- @@ -707,6 +745,45 @@ @test isapprox(wv, wv', rtol=1e-12) end + @testset "wall Galerkin vs collocation (3D, conformal)" begin + mtheta_eq = 
17 + mtheta = 32 + nzeta = 32 + mpert = 2 + npert = 2 + boundary_x = collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=mtheta_eq))) + boundary_z = collect(0.3 .* sin.(range(0, 2π, length=mtheta_eq))) + + inputs_colloc = VacuumInput( + mtheta_in=mtheta_eq, nzeta_in=1, + x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq), + mlow=1, mpert=mpert, nlow=0, npert=npert, + nzeta=nzeta, mtheta=mtheta, + use_galerkin=false + ) + inputs_galerkin = VacuumInput( + mtheta_in=mtheta_eq, nzeta_in=1, + x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq), + mlow=1, mpert=mpert, nlow=0, npert=npert, + nzeta=nzeta, mtheta=mtheta, + use_galerkin=true + ) + + wall_settings = WallShapeSettings(shape="conformal", a=0.3) + + wv_c, grri_c, grre_c, _, _ = compute_vacuum_response(inputs_colloc, wall_settings) + wv_g, grri_g, grre_g, _, _ = compute_vacuum_response(inputs_galerkin, wall_settings) + + M = mtheta * nzeta + P = mpert * npert + + @test isapprox(wv_g, wv_c; rtol=1e-8) + @test isapprox(grre_g[1:M, 1:(2*P)], grre_c[1:M, 1:(2*P)]; rtol=1e-8) + @test isapprox(grri_g[1:M, 1:(2*P)], grri_c[1:M, 1:(2*P)]; rtol=1e-8) + @test isapprox(grre_g[(M+1):(2*M), 1:(2*P)], grre_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8) + @test isapprox(grri_g[(M+1):(2*M), 1:(2*P)], grri_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8) + end + @testset "Kernel3D laplace_single_layer" begin x_obs = [1.0, 0.0, 0.0] x_src = [2.0, 0.0, 0.0] From 98851de232fd44a6da1baa6f6d4d95092cc97872 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Mon, 16 Mar 2026 08:15:08 -0400 Subject: [PATCH 08/23] VACUUM - WIP - renaming matrices --- src/Vacuum/Vacuum.jl | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index d7a41f4b..c65f3774 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -119,10 +119,10 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # FLOPs (both): O(M²P + P³) # 
================================================================ # Projected kernel matrices [P × P complex] - grad_green_fourier = zeros!(pool, ComplexF64, P, P) - green_fourier = zeros!(pool, ComplexF64, P, P) - grad_green_fourier_int = similar!(pool, grad_green_fourier) - green_fourier_int = similar!(pool, green_fourier) + K_ext = zeros!(pool, ComplexF64, P, P) + G_ext = zeros!(pool, ComplexF64, P, P) + K_int = similar!(pool, K_ext) + G_int = similar!(pool, G_ext) # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve Gram = zeros!(pool, ComplexF64, P, P) @@ -131,38 +131,37 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC if fuse_projection # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z fused_timing = @timed begin - projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams, - exp_mn_basis, Gram) + projected_kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) end println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") else # Full-size kernel matrices, then project to mode space - grad_green = zeros!(pool, num_points_total, num_points_total) - green_temp = zeros!(pool, num_points_surf, num_points_surf) + K_ext_temp = zeros!(pool, num_points_total, num_points_total) + G_ext_temp = zeros!(pool, num_points_surf, num_points_surf) pp_kernel_timing = @timed begin - kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) + kernel!(K_ext_temp, G_ext_temp, plasma_surf, plasma_surf, kparams) end println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") # Project the kernels to mode space - Z^H * K * Z and Z^H * G * Z proj_timing = @timed begin - mul!(temp, grad_green, exp_mn_basis) - mul!(grad_green_fourier, exp_mn_basis', temp) - mul!(temp, green_temp, 
exp_mn_basis) - mul!(green_fourier, exp_mn_basis', temp) + mul!(temp, K_ext_temp, exp_mn_basis) + mul!(K_ext, exp_mn_basis', temp) + mul!(temp, G_ext_temp, exp_mn_basis) + mul!(G_ext, exp_mn_basis', temp) end println(" Project Kernel TIME=$(round(proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))") end solve_timing = @timed begin # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier - grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier - green_fourier_int .= green_fourier + K_int .= 2 .* Gram .- K_ext + G_int .= G_ext # Solve projected BIEs for exterior and interior kernels - F = lu!(grad_green_fourier) - ldiv!(F, green_fourier) - F = lu!(grad_green_fourier_int) - ldiv!(F, green_fourier_int) + F = lu!(K_ext) + ldiv!(F, G_ext) + F = lu!(K_int) + ldiv!(F, G_int) # wv = (4π²/M) · Gram · green_fourier wv .= (4π^2 / M) .* (Gram * green_fourier) @@ -223,7 +222,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC K_int[(P+1):(2P), 1:P] .= .-K_wp_c K_int[(P+1):(2P), (P+1):(2P)] .= 2 .* Gram .- K_ww_c - # RHS [2P × P]: single-layer blocks (only plasma-source blocks are nonzero) + # RHS [2P × P]: single-layer blocks with plasma as source G_rhs_ext = zeros!(pool, ComplexF64, 2P, P) G_rhs_ext[1:P, :] .= G_pp_c G_rhs_ext[(P+1):(2P), :] .= G_wp_c @@ -247,6 +246,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC wv .= (4π^2 / M) .* (Gram * c_p_ext) # Backward-compatible reconstruction: grre/grri in M×2P real layout + # Need to convert mode space to physical space and unpack the real and imaginary parts mul!(temp, exp_mn_basis, c_p_ext) @view(grre[1:M, 1:P]) .= real.(temp) @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) From d872ce9cdbb0dfdc88c9b9a7a2b6fe257add8f33 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Mon, 16 Mar 2026 08:20:35 -0400 Subject: [PATCH 09/23] VACUUM - WIP - removing the non-fused galerkin method --- 
src/Vacuum/DataTypes.jl | 10 ++-------- src/Vacuum/Vacuum.jl | 39 +++++++++------------------------------ 2 files changed, 11 insertions(+), 38 deletions(-) diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl index e8d4fad4..5522c37b 100644 --- a/src/Vacuum/DataTypes.jl +++ b/src/Vacuum/DataTypes.jl @@ -26,9 +26,6 @@ nzeta > 1 for 3D vacuum calculation. instead of full collocation [O(M³)]. Applies to both no-wall and wall cases. For the wall case, both plasma and wall unknowns are represented in (m,n) mode space, yielding a 2P×2P system with no M² storage. Defaults to `false`. - - `fuse_projection::Bool`: When combined with `use_galerkin`, fuse the kernel assembly with - the Fourier projection so that the full M×M kernel matrices are never materialized. - Reduces memory from O(M²) to O(MP). Requires `use_galerkin = true`. Defaults to `false`. """ @kwdef struct VacuumInput x::Vector{Float64} = Float64[] @@ -45,7 +42,6 @@ nzeta > 1 for 3D vacuum calculation. nzeta::Int = 1 force_wv_symmetry::Bool = true use_galerkin::Bool = false - fuse_projection::Bool = false end """ @@ -86,8 +82,7 @@ function VacuumInput( npert::Int, nlow::Int; force_wv_symmetry::Bool=true, - use_galerkin::Bool=false, - fuse_projection::Bool=false + use_galerkin::Bool=false ) # Extract plasma surface geometry at this psi r, z, ν = extract_plasma_surface_at_psi(equil, ψ) @@ -104,8 +99,7 @@ function VacuumInput( mtheta=mtheta, nzeta=nzeta, force_wv_symmetry=force_wv_symmetry, - use_galerkin=true, - fuse_projection=true + use_galerkin=true ) end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index c65f3774..a50cfe6b 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -72,7 +72,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC n_override::Union{Nothing,Int}=nothing ) - (; mtheta, mpert, mlow, nzeta, npert, nlow, use_galerkin, fuse_projection) = inputs + (; mtheta, mpert, mlow, nzeta, npert, nlow, use_galerkin) = inputs # Initialize 
surface geometries plasma_surf = nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs) @@ -113,10 +113,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # accumulated row by row as kernel values are computed. # Memory: O(MP + P²) instead of O(M²) # - # Two-step (fuse_projection=false): full M×M kernel → project → solve. - # Memory: O(M²) for kernel storage - # - # FLOPs (both): O(M²P + P³) + # FLOPs: O(M²P + P³) # ================================================================ # Projected kernel matrices [P × P complex] K_ext = zeros!(pool, ComplexF64, P, P) @@ -128,29 +125,11 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC Gram = zeros!(pool, ComplexF64, P, P) mul!(Gram, exp_mn_basis', exp_mn_basis) - if fuse_projection - # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z - fused_timing = @timed begin - projected_kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) - end - println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") - else - # Full-size kernel matrices, then project to mode space - K_ext_temp = zeros!(pool, num_points_total, num_points_total) - G_ext_temp = zeros!(pool, num_points_surf, num_points_surf) - pp_kernel_timing = @timed begin - kernel!(K_ext_temp, G_ext_temp, plasma_surf, plasma_surf, kparams) - end - println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") - # Project the kernels to mode space - Z^H * K * Z and Z^H * G * Z - proj_timing = @timed begin - mul!(temp, K_ext_temp, exp_mn_basis) - mul!(K_ext, exp_mn_basis', temp) - mul!(temp, G_ext_temp, exp_mn_basis) - mul!(G_ext, exp_mn_basis', temp) - end - println(" Project Kernel TIME=$(round(proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))") + # Fused projected kernel: 
grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z + fused_timing = @timed begin + projected_kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) end + println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") solve_timing = @timed begin # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier @@ -164,15 +143,15 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC ldiv!(F, G_int) # wv = (4π²/M) · Gram · green_fourier - wv .= (4π^2 / M) .* (Gram * green_fourier) + wv .= (4π^2 / M) .* (Gram * G_ext) # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real. # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code # perhaps make it a complex P * P matrix? Then don't need any of this section - mul!(temp, exp_mn_basis, green_fourier) + mul!(temp, exp_mn_basis, G_ext) @view(grre[1:M, 1:P]) .= real.(temp) @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, green_fourier_int) + mul!(temp, exp_mn_basis, G_int) @view(grri[1:M, 1:P]) .= real.(temp) @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) end From eb0e5f79ee8e617ccd03d2634aca71edb919df88 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Mon, 16 Mar 2026 08:53:34 -0400 Subject: [PATCH 10/23] temp --- examples/Solovev_ideal_example_3D/gpec.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/Solovev_ideal_example_3D/gpec.toml b/examples/Solovev_ideal_example_3D/gpec.toml index d9e8b663..3dd466a0 100644 --- a/examples/Solovev_ideal_example_3D/gpec.toml +++ b/examples/Solovev_ideal_example_3D/gpec.toml @@ -32,8 +32,8 @@ nn_high = 1 # Largest toroidal mode number to include delta_mlow = 8 # Expands lower bound of Fourier harmonics delta_mhigh = 8 # Expands upper bound of Fourier harmonics delta_mband = 0 # Integration keeps only this wide a band... 
-mthvac = 96 # Number of points used in splines over poloidal angle at plasma-vacuum interface. -nzvac = 64 +mthvac = 128 # Number of points used in splines over poloidal angle at plasma-vacuum interface. +nzvac = 128 thmax0 = 1 # Linear multiplier on the automatic choice of theta integration bounds kin_flag = false # Kinetic EL equation (default: false) From afbc93397ad3d0a787cea580466ceda2f5d5e29e Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Mon, 16 Mar 2026 10:49:31 -0400 Subject: [PATCH 11/23] VACUUM - WIP - modifying kernels to take views of larger K and G matrices --- src/Vacuum/ProjectedKernel.jl | 49 +++---- src/Vacuum/Vacuum.jl | 240 +++++++++++++++------------------- 2 files changed, 134 insertions(+), 155 deletions(-) diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl index 06f8b3f2..ac8e526f 100644 --- a/src/Vacuum/ProjectedKernel.jl +++ b/src/Vacuum/ProjectedKernel.jl @@ -19,7 +19,7 @@ # 2D fused projected kernel # ============================================================================ """ - projected_kernel!(K_c, G_c, observer, source, params, exp_mn_basis, Gram) + kernel!(K_c, G_c, observer, source, params, exp_mn_basis, Gram) Compute the Fourier-projected kernel matrices K_c = Z^H K Z and G_c = Z^H G Z directly, without materializing the full M×M kernel matrices. @@ -36,9 +36,7 @@ Dispatches to the 2D or 3D implementation based on the geometry/params types. - `exp_mn_basis::Matrix{ComplexF64}`: [M × P] complex Fourier basis Z = exp(i(mθ − nζ)) - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term) """ -function projected_kernel! 
end - -function projected_kernel!( +function kernel!( K_c::AbstractMatrix{ComplexF64}, G_c::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, @@ -50,7 +48,7 @@ function projected_kernel!( _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram) end -function projected_kernel!( +function kernel!( K_c::AbstractMatrix{ComplexF64}, G_c::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry3D,WallGeometry3D}, @@ -87,6 +85,12 @@ Memory: O(MP) instead of O(M²). dtheta = 2π / mtheta theta_grid = range(; start=0, length=mtheta, step=dtheta) + # Take a view of the corresponding block of the K_c and G_c matrices + col_idx = (source isa PlasmaGeometry ? 1 : 2) + row_idx = (observer isa PlasmaGeometry ? 1 : 2) + K_c_block = view(K_c, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P)) + G_c_block = view(G_c, ((row_idx-1)*P+1):(row_idx*P), :) + populate_greenfunction = source isa PlasmaGeometry # S₁ᵢ logarithmic correction factors [Chance Phys. Plasmas 1997 2161 eq. 78] @@ -110,13 +114,9 @@ Memory: O(MP) instead of O(M²). d1_spline_x(dx_dtheta_grid, theta_grid) d1_spline_z(dz_dtheta_grid, theta_grid) - # Zero output matrices; we accumulate rank-1 updates (conj(Z[j,:]) ⊗ proj_z) - fill!(K_c, 0.0) - fill!(G_c, 0.0) - # Per-observer projection vectors (P-length complex): proj_z = (kernel row) · Z - proj_kz = zeros(ComplexF64, P) - proj_gz = zeros(ComplexF64, P) + proj_kz = zeros!(pool, ComplexF64, P) + proj_gz = zeros!(pool, ComplexF64, P) for j in 1:mtheta x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j] @@ -200,9 +200,9 @@ Memory: O(MP) instead of O(M²). 
BLAS.axpy!(ComplexF64(diag_accum), @view(Z[j, :]), proj_kz) # ── Rank-1 accumulate: K_c += conj(Z[j,:]) ⊗ proj_kz ── - BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_kz, K_c) + BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_kz, K_c_block) if populate_greenfunction - BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_gz, G_c) + BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_gz, G_c_block) end end @@ -210,19 +210,19 @@ Memory: O(MP) instead of O(M²). # Normals point out of vacuum for wall but inward for plasma → flip sign for plasma source if source isa PlasmaGeometry - K_c .*= -1 + K_c_block .*= -1 end # Diagonal residue: K += residue·I → K_c += residue·Gram # [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89] residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0) if residue != 0.0 - K_c .+= residue .* Gram + K_c_block .+= residue .* Gram end # 2π𝒢 → 𝒢 if populate_greenfunction - G_c ./= 2π + G_c_block ./= 2π end end @@ -262,6 +262,11 @@ function _projected_kernel_3D!( num_points = observer.mtheta * observer.nzeta dθdζ = 4π^2 / num_points + # Take a view of the corresponding block of the K_c and G_c matrices + col_idx = (source isa PlasmaGeometry3D ? 1 : 2) + row_idx = (observer isa PlasmaGeometry3D ? 
1 : 2) + K_c_block = view(K_c, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P)) + G_c_block = view(G_c, ((row_idx-1)*P+1):(row_idx*P), :) populate_greenfunction = source isa PlasmaGeometry3D if PATCH_RAD > (min(source.mtheta, source.nzeta) - 1) ÷ 2 @@ -359,17 +364,15 @@ function _projected_kernel_3D!( end # ── Assemble P×P projected matrices: K_c = Z^H K Z, G_c = Z^H G Z ── - mul!(K_c, Z', KZ) - K_c ./= 2π + mul!(K_c_block, Z', KZ) + K_c_block ./= 2π if populate_greenfunction - mul!(G_c, Z', GZ) - G_c ./= 2π - else - fill!(G_c, 0.0) + mul!(G_c_block, Z', GZ) + G_c_block ./= 2π end # Diagonal: K += I → K_c += Gram [for same-type source/observer] if typeof(source) == typeof(observer) - K_c .+= Gram + K_c_block .+= Gram end end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index a50cfe6b..e603fe1c 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -102,144 +102,120 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC P = num_modes temp = zeros!(pool, ComplexF64, M, P) - if wall.nowall && use_galerkin - # ================================================================ - # Galerkin: solve system in P×P mode space. Uses complex basis - # Z = C + iS so projected matrices are P×P complex. - # - # Fused (fuse_projection=true): kernel assembly + Fourier projection - # in one pass. The full M×M kernel matrices are never materialized — - # instead the P×P projected matrices grad_green_fourier and G_c are - # accumulated row by row as kernel values are computed. - # Memory: O(MP + P²) instead of O(M²) - # - # FLOPs: O(M²P + P³) - # ================================================================ - # Projected kernel matrices [P × P complex] - K_ext = zeros!(pool, ComplexF64, P, P) - G_ext = zeros!(pool, ComplexF64, P, P) - K_int = similar!(pool, K_ext) - G_int = similar!(pool, G_ext) - + if use_galerkin # Gram matrix required by projected_kernel! 
for the diagonal residue and for interior solve Gram = zeros!(pool, ComplexF64, P, P) mul!(Gram, exp_mn_basis', exp_mn_basis) - # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z - fused_timing = @timed begin - projected_kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) - end - println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") - - solve_timing = @timed begin - # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier - K_int .= 2 .* Gram .- K_ext - G_int .= G_ext - - # Solve projected BIEs for exterior and interior kernels - F = lu!(K_ext) - ldiv!(F, G_ext) - F = lu!(K_int) - ldiv!(F, G_int) - - # wv = (4π²/M) · Gram · green_fourier - wv .= (4π^2 / M) .* (Gram * G_ext) - - # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real. - # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code - # perhaps make it a complex P * P matrix? Then don't need any of this section - mul!(temp, exp_mn_basis, G_ext) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, G_int) - @view(grri[1:M, 1:P]) .= real.(temp) - @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) - end - println(" Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") - - elseif !wall.nowall && use_galerkin - # ================================================================ - # Wall Galerkin: both plasma and wall unknowns in (m,n) mode space. - # Builds four P×P projected kernel blocks (pp, pw, wp, ww) via - # projected_kernel!, assembles a 2P×2P system, and solves directly. - # Same exp_mn_basis and Gram for all blocks (same angular grid). - # Memory: O(MP + P²), no M² or (2M)² storage. 
- # ================================================================ - - Gram = zeros!(pool, ComplexF64, P, P) - mul!(Gram, exp_mn_basis', exp_mn_basis) - - # Four projected kernel blocks [P × P complex each] - K_pp_c = zeros!(pool, ComplexF64, P, P) - G_pp_c = zeros!(pool, ComplexF64, P, P) - K_pw_c = zeros!(pool, ComplexF64, P, P) - G_pw_c = zeros!(pool, ComplexF64, P, P) - K_wp_c = zeros!(pool, ComplexF64, P, P) - G_wp_c = zeros!(pool, ComplexF64, P, P) - K_ww_c = zeros!(pool, ComplexF64, P, P) - G_ww_c = zeros!(pool, ComplexF64, P, P) - - kernel_timing = @timed begin - projected_kernel!(K_pp_c, G_pp_c, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) - projected_kernel!(K_pw_c, G_pw_c, plasma_surf, wall, kparams, exp_mn_basis, Gram) - projected_kernel!(K_wp_c, G_wp_c, wall, plasma_surf, kparams, exp_mn_basis, Gram) - projected_kernel!(K_ww_c, G_ww_c, wall, wall, kparams, exp_mn_basis, Gram) - end - println(" Wall Galerkin Projected Kernels TIME=$(round(kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))") - - solve_timing = @timed begin - # Assemble 2P×2P exterior system and interior system (before LU overwrites) + if wall.nowall + # ================================================================ + # Galerkin (no wall): solve system in P×P mode space. Uses complex basis + # Z = C + iS so projected matrices are P×P complex. + # + # Fused (fuse_projection=true): kernel assembly + Fourier projection + # in one pass. The full M×M kernel matrices are never materialized — + # instead the P×P projected matrices grad_green_fourier and G_c are + # accumulated row by row as kernel values are computed. 
+ # Memory: O(MP + P²) instead of O(M²) + # + # FLOPs: O(M²P + P³) + # ================================================================ + # Projected kernel matrices [P × P complex] + K_ext = zeros!(pool, ComplexF64, P, P) + G_ext = zeros!(pool, ComplexF64, P, P) + K_int = similar!(pool, K_ext) + G_int = similar!(pool, G_ext) + + # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z + fused_timing = @timed begin + kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) + end + println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") + + solve_timing = @timed begin + # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier + K_int .= 2 .* Gram .- K_ext + G_int .= G_ext + + # Solve projected BIEs for exterior and interior kernels + F = lu!(K_ext) + ldiv!(F, G_ext) + F = lu!(K_int) + ldiv!(F, G_int) + + # wv = (4π²/M) · Gram · green_fourier + wv .= (4π^2 / M) .* (Gram * G_ext) + + # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real. + # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code + # perhaps make it a complex P * P matrix? Then don't need any of this section + mul!(temp, exp_mn_basis, G_ext) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, G_int) + @view(grri[1:M, 1:P]) .= real.(temp) + @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) + end + println(" Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") + + else + # ================================================================ + # Wall Galerkin: both plasma and wall unknowns in (m,n) mode space. + # Builds four P×P projected kernel blocks (pp, pw, wp, ww) via + # projected_kernel!, assembles a 2P×2P system, and solves directly. 
+ # Same exp_mn_basis and Gram for all blocks (same angular grid). + # Memory: O(MP + P²), no M² or (2M)² storage. + # ================================================================ + + # Four projected kernel blocks [P × P complex each] K_ext = zeros!(pool, ComplexF64, 2P, 2P) - K_ext[1:P, 1:P] .= K_pp_c - K_ext[1:P, (P+1):(2P)] .= K_pw_c - K_ext[(P+1):(2P), 1:P] .= K_wp_c - K_ext[(P+1):(2P), (P+1):(2P)] .= K_ww_c - - K_int = zeros!(pool, ComplexF64, 2P, 2P) - K_int[1:P, 1:P] .= 2 .* Gram .- K_pp_c - K_int[1:P, (P+1):(2P)] .= .-K_pw_c - K_int[(P+1):(2P), 1:P] .= .-K_wp_c - K_int[(P+1):(2P), (P+1):(2P)] .= 2 .* Gram .- K_ww_c - - # RHS [2P × P]: single-layer blocks with plasma as source - G_rhs_ext = zeros!(pool, ComplexF64, 2P, P) - G_rhs_ext[1:P, :] .= G_pp_c - G_rhs_ext[(P+1):(2P), :] .= G_wp_c - - G_rhs_int = similar!(pool, G_rhs_ext) - G_rhs_int .= G_rhs_ext - - # Exterior solve: K_ext * C_ext = G_rhs_ext - F_ext = lu!(K_ext) - ldiv!(F_ext, G_rhs_ext) - c_p_ext = @view G_rhs_ext[1:P, :] - c_w_ext = @view G_rhs_ext[(P+1):(2P), :] - - # Interior solve: K_int * C_int = G_rhs_int - F_int = lu!(K_int) - ldiv!(F_int, G_rhs_int) - c_p_int = @view G_rhs_int[1:P, :] - c_w_int = @view G_rhs_int[(P+1):(2P), :] - - # wv = (4π²/M) · Gram · c_p_ext (plasma observer only) - wv .= (4π^2 / M) .* (Gram * c_p_ext) - - # Backward-compatible reconstruction: grre/grri in M×2P real layout - # Need to convert mode space to physical space and unpack the real and imaginary parts - mul!(temp, exp_mn_basis, c_p_ext) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, c_p_int) - @view(grri[1:M, 1:P]) .= real.(temp) - @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, c_w_ext) - @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, c_w_int) - @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= 
imag.(temp) + G_ext = zeros!(pool, ComplexF64, 2P, P) + K_int = similar!(pool, K_ext) + G_int = similar!(pool, G_ext) + + kernel_timing = @timed begin + kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram) + end + println(" Wall Galerkin Projected Kernels TIME=$(round(kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))") + + solve_timing = @timed begin + # Compute interior system: K_int = 2·Gram - K_ext + K_int .= -K_ext + K_int[1:P, 1:P] .+= 2 .* Gram + K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram + G_int .= G_ext + + # Exterior solve: K_ext * G_ext = G_ext + F_ext = lu!(K_ext) + ldiv!(F_ext, G_ext) + + # Interior solve: K_int * C_int = G_rhs_int + F_int = lu!(K_int) + ldiv!(F_int, G_int) + + # wv = (4π²/M) · Gram · G_p_ext (plasma observer only) + wv .= (4π^2 / M) .* (Gram * view(G_ext, 1:P, :)) + + # Backward-compatible reconstruction: grre/grri in M×2P real layout + # Need to convert mode space to physical space and unpack the real and imaginary parts + mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, 1:P, :)) + @view(grri[1:M, 1:P]) .= real.(temp) + @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :)) + @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :)) + @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) + end + println(" Wall Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") end - println(" Wall 
Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") else # ================================================================ From 4925412f8ba1c007bc7107128183aadd4833a45f Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Mon, 16 Mar 2026 11:16:13 -0400 Subject: [PATCH 12/23] VACUUM - WIP - consolidating nowall and wall into one branch for galerkin --- src/Vacuum/Vacuum.jl | 145 +++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 88 deletions(-) diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index e603fe1c..7b058e19 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -102,111 +102,81 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC P = num_modes temp = zeros!(pool, ComplexF64, M, P) + # ================================================================ + # Galerkin: solve system in P×P mode space. Uses complex basis + # Z = C + iS so projected matrices are P×P complex. + # + # Fused (fuse_projection=true): kernel assembly + Fourier projection + # in one pass. The full M×M kernel matrices are never materialized — + # instead the P×P projected matrices grad_green_fourier and G_c are + # accumulated row by row as kernel values are computed. + # Memory: O(MP + P²) instead of O(M²) + # + # FLOPs: O(M²P + P³) + # ================================================================ if use_galerkin # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve Gram = zeros!(pool, ComplexF64, P, P) mul!(Gram, exp_mn_basis', exp_mn_basis) - if wall.nowall - # ================================================================ - # Galerkin (no wall): solve system in P×P mode space. Uses complex basis - # Z = C + iS so projected matrices are P×P complex. - # - # Fused (fuse_projection=true): kernel assembly + Fourier projection - # in one pass. 
The full M×M kernel matrices are never materialized — - # instead the P×P projected matrices grad_green_fourier and G_c are - # accumulated row by row as kernel values are computed. - # Memory: O(MP + P²) instead of O(M²) - # - # FLOPs: O(M²P + P³) - # ================================================================ - # Projected kernel matrices [P × P complex] - K_ext = zeros!(pool, ComplexF64, P, P) - G_ext = zeros!(pool, ComplexF64, P, P) - K_int = similar!(pool, K_ext) - G_int = similar!(pool, G_ext) - - # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z - fused_timing = @timed begin - kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) - end - println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") - - solve_timing = @timed begin - # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier - K_int .= 2 .* Gram .- K_ext - G_int .= G_ext - - # Solve projected BIEs for exterior and interior kernels - F = lu!(K_ext) - ldiv!(F, G_ext) - F = lu!(K_int) - ldiv!(F, G_int) - - # wv = (4π²/M) · Gram · green_fourier - wv .= (4π^2 / M) .* (Gram * G_ext) - - # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real. - # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code - # perhaps make it a complex P * P matrix? 
Then don't need any of this section - mul!(temp, exp_mn_basis, G_ext) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, G_int) - @view(grri[1:M, 1:P]) .= real.(temp) - @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) - end - println(" Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") - - else - # ================================================================ - # Wall Galerkin: both plasma and wall unknowns in (m,n) mode space. - # Builds four P×P projected kernel blocks (pp, pw, wp, ww) via - # projected_kernel!, assembles a 2P×2P system, and solves directly. - # Same exp_mn_basis and Gram for all blocks (same angular grid). - # Memory: O(MP + P²), no M² or (2M)² storage. - # ================================================================ - - # Four projected kernel blocks [P × P complex each] - K_ext = zeros!(pool, ComplexF64, 2P, 2P) - G_ext = zeros!(pool, ComplexF64, 2P, P) - K_int = similar!(pool, K_ext) - G_int = similar!(pool, G_ext) + # Projected kernel matrices [P × P complex] + K_ext = zeros!(pool, ComplexF64, 2P, 2P) + G_ext = zeros!(pool, ComplexF64, 2P, P) + K_int = similar!(pool, K_ext) + G_int = similar!(pool, G_ext) + # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z + fused_timing = @timed begin + kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) + end + println(" Fused Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") + if !wall.nowall kernel_timing = @timed begin - kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram) kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram) kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram) end println(" Wall Galerkin Projected Kernels 
TIME=$(round(kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))") + end - solve_timing = @timed begin - # Compute interior system: K_int = 2·Gram - K_ext - K_int .= -K_ext - K_int[1:P, 1:P] .+= 2 .* Gram + solve_timing = @timed begin + # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext + K_int .= -K_ext + K_int[1:P, 1:P] .+= 2 .* Gram + if !wall.nowall K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram - G_int .= G_ext - - # Exterior solve: K_ext * G_ext = G_ext + end + G_int .= G_ext + + # Solve projected BIEs for exterior and interior kernels + if wall.nowall + F_ext = lu!(K_ext[1:P, 1:P]) + ldiv!(F_ext, @view(G_ext[1:P, :])) + F_int = lu!(K_int[1:P, 1:P]) + ldiv!(F_int, @view(G_int[1:P, :])) + else F_ext = lu!(K_ext) ldiv!(F_ext, G_ext) - - # Interior solve: K_int * C_int = G_rhs_int F_int = lu!(K_int) ldiv!(F_int, G_int) + end - # wv = (4π²/M) · Gram · G_p_ext (plasma observer only) - wv .= (4π^2 / M) .* (Gram * view(G_ext, 1:P, :)) - - # Backward-compatible reconstruction: grre/grri in M×2P real layout - # Need to convert mode space to physical space and unpack the real and imaginary parts - mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, view(G_int, 1:P, :)) - @view(grri[1:M, 1:P]) .= real.(temp) - @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) + # wv = (4π²/M) · Gram · green_fourier + wv .= (4π^2 / M) .* (Gram * view(G_ext, 1:P, :)) + end + println(" Galerkin Solve TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") + reconstruct_timing = @timed begin + # Backward-compatible reconstruction: grre/grri in M×2P real layout + # Need to convert mode space to physical space and unpack the real and imaginary parts + # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code + # perhaps make it a complex P * P 
matrix? Then don't need any of this section + mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, 1:P, :)) + @view(grri[1:M, 1:P]) .= real.(temp) + @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) + if !wall.nowall mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :)) @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) @@ -214,9 +184,8 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) end - println(" Wall Galerkin Solve + Reconstruct TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") end - + println(" Reconstruct TIME=$(round(reconstruct_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(reconstruct_timing.bytes))") else # ================================================================ # Collocation approach: solve full physical-space system [M × M] From 8bda42053fd875dd68b1ac2a183f6304940c5d6d Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Mon, 16 Mar 2026 12:41:15 -0400 Subject: [PATCH 13/23] VACUUM - WIP - optimizing the fused Galerkin code (mostly 3D benefits) --- src/Vacuum/Kernel2D.jl | 20 +++-- src/Vacuum/Kernel3D.jl | 95 +++++++++++++++------- src/Vacuum/ProjectedKernel.jl | 148 +++++++++++++++++++++++----------- src/Vacuum/Vacuum.jl | 5 +- 4 files changed, 181 insertions(+), 87 deletions(-) diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl index eac91b01..b518ff4c 100644 --- a/src/Vacuum/Kernel2D.jl +++ b/src/Vacuum/Kernel2D.jl @@ -141,6 +141,9 @@ but grad_greenfunction is not since it fills a different block of the d1_spline_x(dx_dtheta_grid, theta_grid) d1_spline_z(dz_dtheta_grid, theta_grid) + # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition) + 
legendre_buf = acquire!(pool, Float64, n + 2) + # Loop through observer points for j in 1:mtheta # Get observer coordinates @@ -150,7 +153,7 @@ but grad_greenfunction is not since it fills a different block of the # Nonsingular region endpoints are at j±2, so exclude j-1, j, and j+1. @inbounds for k in 1:(mtheta-3) isrc = mod1(j + 1 + k, mtheta) - G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n; gamma_prefactor) + G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf; gamma_prefactor) # Composite Simpson's 1/3 rule weights, excluding singular points # Note we set to 4 for even/2 for odd since we index from 1 while the formula assumes indexing from 0 @@ -181,7 +184,7 @@ but grad_greenfunction is not since it fills a different block of the dx_dtheta_gauss = d1_spline_x(theta_gauss0) z_gauss = spline_z(theta_gauss0) dz_dtheta_gauss = d1_spline_z(theta_gauss0) - G_n, gradG_n, gradG_0 = green(x_obs, z_obs, x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n; gamma_prefactor) + G_n, gradG_n, gradG_0 = green(x_obs, z_obs, x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n, legendre_buf; gamma_prefactor) # Get stencil and weight for the Gaussian point s = leftpanel ? stencils_left[ig] : stencils_right[ig] @@ -639,19 +642,22 @@ according to equations (36)-(42) of Chance 1997. Replaces `green` from Fortran c - Implements analytical derivatives from Chance 1997 equations - The coupling terms include the Jacobian factor from the coordinate transformation - By default uses the 2007 Legendre function implementation (Bulirsch + Gaussian integration) + +An overload accepting a pre-allocated `legendre_buf::Vector{Float64}` of length `n+2` is available. +Callers in tight loops should allocate this buffer once and pass it in to avoid per-call pool acquisition. 
""" -@with_pool pool function green( +function green( x_obs::Float64, z_obs::Float64, x_source::Float64, z_source::Float64, dx_dtheta::Float64, dz_dtheta::Float64, - n::Int; + n::Int, + legendre::AbstractVector{Float64}; gamma_prefactor::Float64=2 * sqrt(π) * gamma(0.5 - n), uselegacygreenfunction::Bool=false ) - x_obs2 = x_obs^2 x_source2 = x_source^2 x_minus2 = (x_obs - x_source)^2 @@ -670,9 +676,7 @@ according to equations (36)-(42) of Chance 1997. Replaces `green` from Fortran c # Argument of Legendre function 𝘴 [Chance Phys. Plasmas 1997 2161 eq. 42] s = (x_obs2 + x_source2 + ζ2) / R2 - # Legendre functions for - # P⁰ = p0, P¹ = p1, Pⁿ = pn, Pⁿ⁺¹ = pnp1 - legendre = acquire!(pool, Float64, n + 2) + # Legendre functions: P⁰ = p0, P¹ = p1, Pⁿ = pn, Pⁿ⁺¹ = pnp1 if uselegacygreenfunction Pn_minus_half_1997!(legendre, s, n) else diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl index df28dd45..c158734e 100644 --- a/src/Vacuum/Kernel3D.jl +++ b/src/Vacuum/Kernel3D.jl @@ -197,7 +197,7 @@ The single-layer kernel φ is the fundamental solution to Laplace's equation: - `Float64`: Kernel value φ(x_obs, x_src) """ -function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}) +@fastmath function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}) @inbounds begin dx = x_obs[1] - x_src[1] dy = x_obs[2] - x_src[2] @@ -208,6 +208,21 @@ function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVect return inv(sqrt(r2)) end +""" +Scalar-argument single-layer kernel. Avoids view creation in tight loops. 
+""" +@fastmath @inline function laplace_single_layer( + ox::Float64, oy::Float64, oz::Float64, + sx::Float64, sy::Float64, sz::Float64 +) + dx = ox - sx; + dy = oy - sy; + dz = oz - sz + r2 = dx*dx + dy*dy + dz*dz + r2 < 1e-30 && return 0.0 + return inv(sqrt(r2)) +end + """ laplace_double_layer(x_obs, x_src, n_src) -> Float64 @@ -231,7 +246,7 @@ K(x_obs, x_src, n_src) = ∇_{x_src} φ · n_src = (x_obs - x_src) · n_src / |x - `Float64`: Kernel value K(x_obs, x_src, n_src) """ -function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}, n_src::AbstractVector{<:Real}) +@fastmath function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}, n_src::AbstractVector{<:Real}) @inbounds begin dx = x_obs[1] - x_src[1] dy = x_obs[2] - x_src[2] @@ -247,6 +262,24 @@ function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVect return (dx*nx + dy*ny + dz*nz) * r3inv end +""" +Scalar-argument double-layer kernel. Avoids view creation in tight loops. +""" +@fastmath @inline function laplace_double_layer( + ox::Float64, oy::Float64, oz::Float64, + sx::Float64, sy::Float64, sz::Float64, + nx::Float64, ny::Float64, nz::Float64 +) + dx = ox - sx; + dy = oy - sy; + dz = oz - sz + r2 = dx*dx + dy*dy + dz*dz + r2 < 1e-30 && return 0.0 + rinv = inv(sqrt(r2)) + r3inv = rinv * rinv * rinv + return (dx*nx + dy*ny + dz*nz) * r3inv +end + """ extract_patch!(patch, data, idx_pol_center, idx_tor_center, npol, ntor, PATCH_DIM) @@ -380,12 +413,17 @@ where each entry is φ(x_obs, x_src). 
- `grad_greenfunction`: Double-layer kernel matrix (Nobs × Nsrc) filled in place - `greenfunction`: Single-layer kernel matrix (Nobs × Nsrc) filled in place + - `observer`: Observer geometry (PlasmaGeometry3D) + - `source`: Source geometry (PlasmaGeometry3D) + - `PATCH_RAD`: Number of points adjacent to source point to treat as singular + Total patch size in # of gridpoints = (2 * PATCH_RAD + 1) x (2 * PATCH_RAD + 1) + - `RAD_DIM`: Polar radial quadrature order. Angular order = 2 * RAD_DIM + - `INTERP_ORDER`: Lagrange interpolation order + Must be ≤ (2 * PATCH_RAD + 1) @@ -446,24 +484,26 @@ function compute_3D_kernel_matrices!( # Convert linear index to 2D indices i_obs = mod1(idx_obs, observer.mtheta) j_obs = (idx_obs - 1) ÷ observer.mtheta + 1 - r_obs = @view observer.r[idx_obs, :] + @inbounds ox = observer.r[idx_obs, 1] + @inbounds oy = observer.r[idx_obs, 2] + @inbounds oz = observer.r[idx_obs, 3] # ============================================================ # FAR FIELD: Trapezoidal rule for nonsingular source points # Note: kernels return zero for r_src = r_obs # ============================================================ @inbounds for idx_src in 1:num_points - # Evaluate kernels at grid points - r_src = @view source.r[idx_src, :] - n_src = @view source.normal[idx_src, :] - K_single = laplace_single_layer(r_obs, r_src) - K_double = laplace_double_layer(r_obs, r_src, n_src) - + sx = source.r[idx_src, 1]; + sy = source.r[idx_src, 2]; + sz = source.r[idx_src, 3] + nx = source.normal[idx_src, 1]; + ny = source.normal[idx_src, 2]; + nz = source.normal[idx_src, 3] # Apply weights (periodic trapezoidal rule = constant weights) + grad_greenfunction_block[idx_obs, idx_src] = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ if populate_greenfunction - greenfunction[idx_obs, idx_src] = K_single * dθdζ + greenfunction[idx_obs, idx_src] = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ end - grad_greenfunction_block[idx_obs, idx_src] = K_double * dθdζ 
end # ============================================================ @@ -484,15 +524,15 @@ function compute_3D_kernel_matrices!( # Evaluate kernels at polar points with POU weighting @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM - # Evaluate kernels using recomputed normal (use @view to avoid allocation) - r_src = @view r_polar[ir, ia, :] - n_src = @view n_polar[ir, ia, :] - K_single = laplace_single_layer(r_obs, r_src) - K_double = laplace_double_layer(r_obs, r_src, n_src) - - # Apply quadrature weights: area element × POU, where POU contains rdrdθ already - M_polar_single[ir, ia] = K_single * Ppou[ir, ia] * dθdζ - M_polar_double[ir, ia] = K_double * Ppou[ir, ia] * dθdζ + # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already + rsx = r_polar[ir, ia, 1]; + rsy = r_polar[ir, ia, 2]; + rsz = r_polar[ir, ia, 3] + nsx = n_polar[ir, ia, 1]; + nsy = n_polar[ir, ia, 2]; + nsz = n_polar[ir, ia, 3] + M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ + M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ end # Distribute polar singular corrections back to Cartesian grid using sparse matrix @@ -502,25 +542,22 @@ function compute_3D_kernel_matrices!( M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM) M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM) - # Compute remaining far-field POU contribution and near-field polar quadrature result - # We include this region in the far-field trapezoidal rule, so use Gpou = -χ here to get 1-χ + # POU correction: read back far-field trapezoidal values instead of re-evaluating kernels. 
+ # trap + M_grid + trap*Gpou = trap*(1+Gpou) + M_grid = trap*(1-χ) + M_grid @inbounds for j in 1:PATCH_DIM, i in 1:PATCH_DIM # Map back to global indices idx_pol = periodic_wrap(i_obs - PATCH_RAD + i - 1, source.mtheta) idx_tor = periodic_wrap(j_obs - PATCH_RAD + j - 1, source.nzeta) idx_src = idx_pol + source.mtheta * (idx_tor - 1) - # Remainder of far-field contribution on the singular grid: Gpou = -χ - r_src = @view source.r[idx_src, :] - n_src = @view source.normal[idx_src, :] - far_single = laplace_single_layer(r_obs, r_src) * Gpou[i, j] * dθdζ - far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[i, j] * dθdζ + trap_double = grad_greenfunction_block[idx_obs, idx_src] + grad_greenfunction_block[idx_obs, idx_src] = trap_double + M_grid_double[i, j] + trap_double * Gpou[i, j] # Apply near + far contributions if populate_greenfunction - greenfunction[idx_obs, idx_src] += M_grid_single[i, j] + far_single + trap_single = greenfunction[idx_obs, idx_src] + greenfunction[idx_obs, idx_src] = trap_single + M_grid_single[i, j] + trap_single * Gpou[i, j] end - grad_greenfunction_block[idx_obs, idx_src] += M_grid_double[i, j] + far_double end end diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl index ac8e526f..8218739d 100644 --- a/src/Vacuum/ProjectedKernel.jl +++ b/src/Vacuum/ProjectedKernel.jl @@ -15,6 +15,32 @@ # FLOP cost is identical to the two-step approach O(M²P), but memory drops # from O(M²) to O(MP + P²). +# ── Helpers for small-P accumulation (avoids BLAS dispatch overhead) ────────── + +""" +Accumulate `proj += w * Zt[:, col]` with SIMD. Replaces BLAS.axpy! for small P. +""" +@inline function _accum_row!(proj::AbstractVector{ComplexF64}, w::Float64, + Zt::AbstractMatrix{ComplexF64}, col::Int) + @inbounds @simd for p in eachindex(proj) + proj[p] += w * Zt[p, col] + end +end + +""" +Rank-1 update `A += conj(Zt[:, j]) * y^T`. Avoids allocating a conjugated temporary. 
+""" +@inline function _rank1_conj!(A::AbstractMatrix{ComplexF64}, + Zt::AbstractMatrix{ComplexF64}, j::Int, + y::AbstractVector{ComplexF64}) + @inbounds for p2 in eachindex(y) + y_p2 = y[p2] + for p1 in axes(A, 1) + A[p1, p2] += conj(Zt[p1, j]) * y_p2 + end + end +end + # ============================================================================ # 2D fused projected kernel # ============================================================================ @@ -81,6 +107,7 @@ Memory: O(MP) instead of O(M²). ) M, P = size(exp_mn_basis) Z = exp_mn_basis + Zt = Matrix{ComplexF64}(transpose(Z)) # [P × M] for contiguous column access mtheta = length(observer.x) dtheta = 2π / mtheta theta_grid = range(; start=0, length=mtheta, step=dtheta) @@ -114,6 +141,9 @@ Memory: O(MP) instead of O(M²). d1_spline_x(dx_dtheta_grid, theta_grid) d1_spline_z(dz_dtheta_grid, theta_grid) + # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition) + legendre_buf = Vector{Float64}(undef, n + 2) + # Per-observer projection vectors (P-length complex): proj_z = (kernel row) · Z proj_kz = zeros!(pool, ComplexF64, P) proj_gz = zeros!(pool, ComplexF64, P) @@ -130,17 +160,15 @@ Memory: O(MP) instead of O(M²). isrc = mod1(j + 1 + k, mtheta) G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], - dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n; + dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf; gamma_prefactor) wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2)) if populate_greenfunction - w_g = G_n * wsimpson - BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz) + _accum_row!(proj_gz, G_n * wsimpson, Zt, isrc) end - w_k = gradG_n * wsimpson - BLAS.axpy!(ComplexF64(w_k), @view(Z[isrc, :]), proj_kz) + _accum_row!(proj_kz, gradG_n * wsimpson, Zt, isrc) diag_accum -= gradG_0 * wsimpson end @@ -160,7 +188,7 @@ Memory: O(MP) instead of O(M²). 
z_gauss = spline_z(theta_gauss0) dz_dtheta_gauss = d1_spline_z(theta_gauss0) G_n, gradG_n, gradG_0 = green(x_obs, z_obs, - x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n; + x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n, legendre_buf; gamma_prefactor) s = leftpanel ? stencils_left[ig] : stencils_right[ig] @@ -171,16 +199,12 @@ Memory: O(MP) instead of O(M²). G_n += log((theta_obs - theta_gauss)^2) / x_obs end @inbounds for stencil_idx in 1:5 - w_g = G_n * s[stencil_idx] * wgauss - isrc = sing_idx[stencil_idx] - BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz) + _accum_row!(proj_gz, G_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx]) end end @inbounds for stencil_idx in 1:5 - w_k = gradG_n * s[stencil_idx] * wgauss - isrc = sing_idx[stencil_idx] - BLAS.axpy!(ComplexF64(w_k), @view(Z[isrc, :]), proj_kz) + _accum_row!(proj_kz, gradG_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx]) end diag_accum -= gradG_0 * wgauss @@ -190,19 +214,17 @@ Memory: O(MP) instead of O(M²). # Analytic singular integral correction [Chance 1997 eq. 
75] if populate_greenfunction && observer isa PlasmaGeometry @inbounds for stencil_idx in 1:5 - w_g = -log_correction_array[stencil_idx] / x_obs - isrc = sing_idx[stencil_idx] - BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz) + _accum_row!(proj_gz, -log_correction_array[stencil_idx] / x_obs, Zt, sing_idx[stencil_idx]) end end # Fold diagonal accumulation into projection - BLAS.axpy!(ComplexF64(diag_accum), @view(Z[j, :]), proj_kz) + _accum_row!(proj_kz, diag_accum, Zt, j) # ── Rank-1 accumulate: K_c += conj(Z[j,:]) ⊗ proj_kz ── - BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_kz, K_c_block) + _rank1_conj!(K_c_block, Zt, j, proj_kz) if populate_greenfunction - BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_gz, G_c_block) + _rank1_conj!(G_c_block, Zt, j, proj_gz) end end @@ -259,6 +281,7 @@ function _projected_kernel_3D!( ) M, P = size(exp_mn_basis) Z = exp_mn_basis + Zt = Matrix{ComplexF64}(transpose(Z)) # [P × M] for contiguous column access num_points = observer.mtheta * observer.nzeta dθdζ = 4π^2 / num_points @@ -276,15 +299,16 @@ function _projected_kernel_3D!( quad_data = get_singular_quadrature(PATCH_RAD, RAD_DIM, INTERP_ORDER) (; PATCH_DIM, ANG_DIM, Ppou, Gpou, P2G) = quad_data - # [M × P] buffers: row idx_obs holds (kernel row idx_obs) · Z - KZ = zeros(ComplexF64, M, P) - GZ = zeros(ComplexF64, M, P) + # [P × M] buffers: column idx_obs holds (kernel row idx_obs) · Z + KZt = zeros(ComplexF64, P, M) + GZt = zeros(ComplexF64, P, M) - # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors) + # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors + patch mask) max_tid = Threads.maxthreadid() workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_tid] proj_kz_all = [zeros(ComplexF64, P) for _ in 1:max_tid] proj_gz_all = [zeros(ComplexF64, P) for _ in 1:max_tid] + is_patch_all = [falses(num_points) for _ in 1:max_tid] Threads.@threads :static for idx_obs in 1:num_points tid = 
Threads.threadid() @@ -294,24 +318,40 @@ function _projected_kernel_3D!( proj_kz = proj_kz_all[tid] proj_gz = proj_gz_all[tid] + is_patch = is_patch_all[tid] fill!(proj_kz, 0.0) fill!(proj_gz, 0.0) + fill!(is_patch, false) i_obs = mod1(idx_obs, observer.mtheta) j_obs = (idx_obs - 1) ÷ observer.mtheta + 1 - r_obs = @view observer.r[idx_obs, :] + @inbounds ox = observer.r[idx_obs, 1] + @inbounds oy = observer.r[idx_obs, 2] + @inbounds oz = observer.r[idx_obs, 3] + + # Mark patch source indices so the far-field loop can skip them + @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM + idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta) + idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta) + is_patch[idx_pol+source.mtheta*(idx_tor-1)] = true + end - # ── FAR FIELD: Trapezoidal rule ── + # ── FAR FIELD: Trapezoidal rule (skip patch — handled in POU correction) ── @inbounds for idx_src in 1:num_points - r_src = @view source.r[idx_src, :] - n_src = @view source.normal[idx_src, :] - w_double = laplace_double_layer(r_obs, r_src, n_src) * dθdζ - BLAS.axpy!(ComplexF64(w_double), @view(Z[idx_src, :]), proj_kz) + is_patch[idx_src] && continue + sx = source.r[idx_src, 1]; + sy = source.r[idx_src, 2]; + sz = source.r[idx_src, 3] + nx = source.normal[idx_src, 1]; + ny = source.normal[idx_src, 2]; + nz = source.normal[idx_src, 3] + w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ + _accum_row!(proj_kz, w_double, Zt, idx_src) if populate_greenfunction - w_single = laplace_single_layer(r_obs, r_src) * dθdζ - BLAS.axpy!(ComplexF64(w_single), @view(Z[idx_src, :]), proj_gz) + w_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ + _accum_row!(proj_gz, w_single, Zt, idx_src) end end @@ -327,10 +367,14 @@ function _projected_kernel_3D!( compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient) @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM - r_src = @view r_polar[ir, ia, :] - n_src = @view n_polar[ir, ia, 
:] - M_polar_single[ir, ia] = laplace_single_layer(r_obs, r_src) * Ppou[ir, ia] * dθdζ - M_polar_double[ir, ia] = laplace_double_layer(r_obs, r_src, n_src) * Ppou[ir, ia] * dθdζ + rsx = r_polar[ir, ia, 1]; + rsy = r_polar[ir, ia, 2]; + rsz = r_polar[ir, ia, 3] + nsx = n_polar[ir, ia, 1]; + nsy = n_polar[ir, ia, 2]; + nsz = n_polar[ir, ia, 3] + M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ + M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ end mul!(M_grid_single_flat, P2G, vec(M_polar_single)) @@ -338,36 +382,44 @@ function _projected_kernel_3D!( M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM) M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM) + # POU correction: evaluate kernel once with combined weight (1+Gpou) = (1-χ) + # since far-field skipped patch points, we include the full trapezoidal + polar here @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta) idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta) idx_src = idx_pol + source.mtheta * (idx_tor - 1) - r_src = @view source.r[idx_src, :] - n_src = @view source.normal[idx_src, :] - far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[ii, jj] * dθdζ - w_double = M_grid_double[ii, jj] + far_double - BLAS.axpy!(ComplexF64(w_double), @view(Z[idx_src, :]), proj_kz) + sx = source.r[idx_src, 1]; + sy = source.r[idx_src, 2]; + sz = source.r[idx_src, 3] + nx = source.normal[idx_src, 1]; + ny = source.normal[idx_src, 2]; + nz = source.normal[idx_src, 3] + full_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[ii, jj]) * dθdζ + _accum_row!(proj_kz, M_grid_double[ii, jj] + full_double, Zt, idx_src) if populate_greenfunction - far_single = laplace_single_layer(r_obs, r_src) * Gpou[ii, jj] * dθdζ - w_single = M_grid_single[ii, jj] + far_single - 
BLAS.axpy!(ComplexF64(w_single), @view(Z[idx_src, :]), proj_gz) + full_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * (1.0 + Gpou[ii, jj]) * dθdζ + _accum_row!(proj_gz, M_grid_single[ii, jj] + full_single, Zt, idx_src) end end - # ── Write projected row to buffer (each idx_obs owns its row) ── - @inbounds KZ[idx_obs, :] .= proj_kz + # ── Write projected column to buffer (each idx_obs owns its column) ── + @inbounds for p in 1:P + KZt[p, idx_obs] = proj_kz[p] + end if populate_greenfunction - @inbounds GZ[idx_obs, :] .= proj_gz + @inbounds for p in 1:P + GZt[p, idx_obs] = proj_gz[p] + end end end - # ── Assemble P×P projected matrices: K_c = Z^H K Z, G_c = Z^H G Z ── - mul!(K_c_block, Z', KZ) + # ── Assemble P×P projected matrices: K_c = Z^H * KZt^T, G_c = Z^H * GZt^T ── + mul!(K_c_block, Z', transpose(KZt)) K_c_block ./= 2π if populate_greenfunction - mul!(G_c_block, Z', GZ) + mul!(G_c_block, Z', transpose(GZt)) G_c_block ./= 2π end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 7b058e19..4fb1387c 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -161,8 +161,9 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC ldiv!(F_int, G_int) end - # wv = (4π²/M) · Gram · green_fourier - wv .= (4π^2 / M) .* (Gram * view(G_ext, 1:P, :)) + # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G + mul!(wv, Gram, view(G_ext, 1:P, :)) + wv .*= (4π^2 / M) end println(" Galerkin Solve TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") reconstruct_timing = @timed begin From b185e95b6d3e1eed3e31f35f5a38e7d1633f5666 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Mon, 16 Mar 2026 13:44:53 -0400 Subject: [PATCH 14/23] VACUUM - WIP - removing timers, adding benchmark --- benchmarks/benchmark_vacuum_galerkin.jl | 399 ++++++++++++++++++++++++ src/Vacuum/Vacuum.jl | 182 +++++------ 2 files changed, 477 insertions(+), 104 deletions(-) create mode 100644 
benchmarks/benchmark_vacuum_galerkin.jl diff --git a/benchmarks/benchmark_vacuum_galerkin.jl b/benchmarks/benchmark_vacuum_galerkin.jl new file mode 100644 index 00000000..358c6240 --- /dev/null +++ b/benchmarks/benchmark_vacuum_galerkin.jl @@ -0,0 +1,399 @@ +#!/usr/bin/env julia + +using Printf +using LinearAlgebra +using TOML +using Plots + +using Pkg +Pkg.instantiate() +using BenchmarkTools + +using GeneralizedPerturbedEquilibrium +const GPEC = GeneralizedPerturbedEquilibrium + +""" + make_wall_settings(example_dir::AbstractString) + +Construct `Vacuum.WallShapeSettings` from the `[Wall]` section in `gpec.toml` +if present; otherwise return default settings. +""" +function make_wall_settings(example_dir::AbstractString) + inputs = TOML.parsefile(joinpath(example_dir, "gpec.toml")) + if haskey(inputs, "Wall") + return GPEC.Vacuum.WallShapeSettings(; (Symbol(k) => v for (k, v) in inputs["Wall"])...) + elseif haskey(inputs, "WALL") + # Some examples use legacy capitalized section name + return GPEC.Vacuum.WallShapeSettings(; (Symbol(k) => v for (k, v) in inputs["WALL"])...) + else + return GPEC.Vacuum.WallShapeSettings() + end +end + +""" + load_equilibrium(example_dir::AbstractString) + +Set up the equilibrium specified by `[Equilibrium]` in `gpec.toml` under `example_dir`. +""" +function load_equilibrium(example_dir::AbstractString) + inputs = TOML.parsefile(joinpath(example_dir, "gpec.toml")) + @assert haskey(inputs, "Equilibrium") "[Equilibrium] section missing in gpec.toml for $example_dir" + eq_cfg = GPEC.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], example_dir) + return GPEC.Equilibrium.setup_equilibrium(eq_cfg) +end + +""" + make_vacuum_input( + equil::GPEC.Equilibrium.PlasmaEquilibrium; + ψ::Real, + mtheta::Int, + nzeta::Int, + mpert::Int, + mlow::Int, + npert::Int, + nlow::Int, + use_galerkin::Bool, + ) -> GPEC.Vacuum.VacuumInput + +Construct a `VacuumInput` at flux surface `ψ` with the specified resolution and mode set. 
+The only parameter that differs between the two algorithms we compare is `use_galerkin`. +""" +function make_vacuum_input( + equil::GPEC.Equilibrium.PlasmaEquilibrium; + ψ::Real, + mtheta::Int, + nzeta::Int, + mpert::Int, + mlow::Int, + npert::Int, + nlow::Int, + use_galerkin::Bool +) + r, z, ν = GPEC.Vacuum.extract_plasma_surface_at_psi(equil, float(ψ)) + + return GPEC.Vacuum.VacuumInput(; + x=reverse(r), + z=reverse(z), + ν=reverse(ν), + mtheta_in=length(r), + nzeta_in=1, + mlow=mlow, + mpert=mpert, + nlow=nlow, + npert=npert, + mtheta=mtheta, + nzeta=nzeta, + force_wv_symmetry=true, + use_galerkin=use_galerkin + ) +end + +""" + benchmark_vacuum_2d( + example_dir::AbstractString; + ψ::Real = 1.0, + mtheta_values::AbstractVector{<:Integer} = 16 .* (2 .^ (0:7)), + mpert::Int = 32, + mlow::Int = 0, + npert::Int = 1, + nlow::Int = 1, + ) + +Benchmark `compute_vacuum_response` for 2D (nzeta = 1) using the Solovev example in +`example_dir`. Scans over `mtheta_values` and compares collocation (`use_galerkin=false`) +against Galerkin (`use_galerkin=true`) for convergence of the `wv` matrix and runtime. 
+""" +function benchmark_vacuum_2d( + example_dir::AbstractString; + ψ::Real=1.0, + mtheta_values::AbstractVector{<:Integer}=16 .* (2 .^ (0:7)), + mpert::Int=32, + mlow::Int=0, + npert::Int=1, + nlow::Int=1 +) + println("\n===== 2D Vacuum Benchmark (Solovev, $(basename(example_dir))) =====") + println("ψ = $(ψ), mtheta ∈ $(collect(mtheta_values)), mpert=$mpert, mlow=$mlow, nlow=$nlow, npert=$npert\n") + + equil = load_equilibrium(example_dir) + wall_settings = make_wall_settings(example_dir) + + mtheta_values = collect(mtheta_values) + nm = length(mtheta_values) + + times_colloc = zeros(Float64, nm) + times_galerkin = zeros(Float64, nm) + errs_colloc = zeros(Float64, nm) + errs_galerkin = zeros(Float64, nm) + + # Reference wv for convergence (highest resolution * 2) – always use galerkin + mtheta_ref = maximum(mtheta_values) * 2 + println("Computing 2D reference matrices at mtheta_ref = $mtheta_ref") + + input_ref_galerkin = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta_ref, + nzeta=1, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=true + ) + wv_ref_galerkin, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_ref_galerkin, wall_settings) + # IMPORTANT: `compute_vacuum_response` uses AdaptiveArrayPools; take a copy so + # later pooled allocations do not overwrite the reference storage. 
+ wv_ref_galerkin = copy(wv_ref_galerkin) + ref_norm_galerkin = norm(wv_ref_galerkin) + + for (i, mtheta) in enumerate(mtheta_values) + println(" 2D: mtheta = $(rpad(string(mtheta), 5)) (nzeta = 1)") + + # Collocation + input_colloc = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta, + nzeta=1, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=false + ) + t_colloc = @belapsed GPEC.Vacuum.compute_vacuum_response($input_colloc, $wall_settings) + wv, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_colloc, wall_settings) + errs_colloc[i] = norm(wv .- wv_ref_galerkin) / ref_norm_galerkin + times_colloc[i] = t_colloc + + # Galerkin + input_galerkin = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta, + nzeta=1, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=true + ) + t_galerkin = @belapsed GPEC.Vacuum.compute_vacuum_response($input_galerkin, $wall_settings) + wv_g, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_galerkin, wall_settings) + errs_galerkin[i] = norm(wv_g .- wv_ref_galerkin) / ref_norm_galerkin + times_galerkin[i] = t_galerkin + + @printf(" Collocation: t = %.3f s, rel‖Δwv‖ = %.3e\n", t_colloc, errs_colloc[i]) + @printf(" Galerkin: t = %.3f s, rel‖Δwv‖ = %.3e\n", t_galerkin, errs_galerkin[i]) + end + + # Two‑pane plot: left = convergence, right = runtime + plt = plot(; layout=(1, 2), size=(1100, 420)) + + # Convergence + plot!(plt[1], mtheta_values, errs_colloc; + lw=2, marker=:circle, xscale=:log10, yscale=:log10, + label="Collocation", xlabel="mθ", ylabel="rel‖Δwv‖", + title="2D Vacuum: wv convergence vs mθ") + plot!(plt[1], mtheta_values, errs_galerkin; + lw=2, marker=:square, xscale=:log10, yscale=:log10, + label="Galerkin") + + # Runtime + plot!(plt[2], mtheta_values, times_colloc; + lw=2, marker=:circle, xscale=:log10, yscale=:log10, + label="Collocation", xlabel="mθ", ylabel="runtime [s]", + title="2D Vacuum: runtime vs mθ") + plot!(plt[2], mtheta_values, times_galerkin; + lw=2, 
marker=:square, xscale=:log10, yscale=:log10, + label="Galerkin") + + plot!(plt[1]; legend=:bottomleft) + plot!(plt[2]; legend=:topleft) + + outpath = joinpath(@__DIR__, "vacuum_galerkin_2d.png") + savefig(plt, outpath) + println("\n2D results saved to $(outpath)") + + return (; mtheta_values, times_colloc, times_galerkin, errs_colloc, errs_galerkin) +end + +""" + benchmark_vacuum_3d( + example_dir::AbstractString; + ψ::Real = 1.0, + mtheta_values::AbstractVector{<:Integer} = 16 .* (2 .^ (0:2)), + mpert::Int = 16, + mlow::Int = 0, + npert::Int = 1, + nlow::Int = 1, + ) + +Benchmark `compute_vacuum_response` for 3D (nzeta = mtheta) using the Solovev 3D example +in `example_dir`. Scans over `mtheta = nzeta` and compares collocation vs Galerkin. 
+ # Again, protect the reference matrix from being overwritten by pooled allocations. + wv_ref_galerkin = copy(wv_ref_galerkin) + ref_norm_galerkin = norm(wv_ref_galerkin) + + for (i, mtheta) in enumerate(mtheta_values) + nzeta = mtheta + println(" 3D: mtheta = $(rpad(string(mtheta), 5)), nzeta = $(rpad(string(nzeta), 5))") + + # Collocation + input_colloc = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta, + nzeta=nzeta, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=false + ) + t_colloc = @belapsed GPEC.Vacuum.compute_vacuum_response($input_colloc, $wall_settings) + wv3, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_colloc, wall_settings) + errs_colloc[i] = norm(wv3 .- wv_ref_galerkin) / ref_norm_galerkin + times_colloc[i] = t_colloc + + # Galerkin + input_galerkin = make_vacuum_input( + equil; + ψ=ψ, + mtheta=mtheta, + nzeta=nzeta, + mpert=mpert, + mlow=mlow, + npert=npert, + nlow=nlow, + use_galerkin=true + ) + t_galerkin = @belapsed GPEC.Vacuum.compute_vacuum_response($input_galerkin, $wall_settings) + wv3g, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_galerkin, wall_settings) + errs_galerkin[i] = norm(wv3g .- wv_ref_galerkin) / ref_norm_galerkin + times_galerkin[i] = t_galerkin + + @printf(" Collocation: t = %.3f s, rel‖Δwv‖ = %.3e\n", t_colloc, errs_colloc[i]) + @printf(" Galerkin: t = %.3f s, rel‖Δwv‖ = %.3e\n", t_galerkin, errs_galerkin[i]) + end + + # Two‑pane plot: left = convergence, right = runtime (mtheta = nzeta on x-axis) + plt = plot(; layout=(1, 2), size=(1100, 420)) + + # Convergence + plot!(plt[1], mtheta_values, errs_colloc; + lw=2, marker=:circle, xscale=:log10, yscale=:log10, + label="Collocation", xlabel="mθ = nzeta", ylabel="rel‖Δwv‖", + title="3D Vacuum: wv convergence vs mθ = nzeta") + plot!(plt[1], mtheta_values, errs_galerkin; + lw=2, marker=:square, xscale=:log10, yscale=:log10, + label="Galerkin") + + # Runtime + plot!(plt[2], mtheta_values, times_colloc; + lw=2, marker=:circle, 
xscale=:log10, yscale=:log10, + label="Collocation", xlabel="mθ = nzeta", ylabel="runtime [s]", + title="3D Vacuum: runtime vs mθ = nzeta") + plot!(plt[2], mtheta_values, times_galerkin; + lw=2, marker=:square, xscale=:log10, yscale=:log10, + label="Galerkin") + + plot!(plt[1]; legend=:bottomleft) + plot!(plt[2]; legend=:topleft) + + outpath = joinpath(@__DIR__, "vacuum_galerkin_3d.png") + savefig(plt, outpath) + println("\n3D results saved to $(outpath)") + + return (; mtheta_values, times_colloc, times_galerkin, errs_colloc, errs_galerkin) +end + +""" + main() + +Entry point when running this file as a script. + +Usage (from repository root): + +```bash +julia --project=. benchmarks/benchmark_vacuum_galerkin.jl +``` + +Edit the `mtheta_values` and other keyword arguments in the calls below to +explore different resolution ranges. +""" +function main() + # 2D Solovev example + example_2d = joinpath(@__DIR__, "..", "examples", "Solovev_ideal_example") + + # 3D Solovev example + example_3d = joinpath(@__DIR__, "..", "examples", "Solovev_ideal_example_3D") + + benchmark_vacuum_2d( + example_2d; + ψ=1.0, + mtheta_values=16 .* (2 .^ (0:9)), # 16 → 8192 (easily editable) + mpert=31, + mlow=-15, + npert=1, + nlow=1 + ) + + benchmark_vacuum_3d( + example_3d; + ψ=1.0, + mtheta_values=16 .* (2 .^ (0:3)), # 16 → 128 (easily editable) + mpert=31, + mlow=-15, + npert=1, + nlow=1 + ) + + return nothing +end + +if abspath(PROGRAM_FILE) == @__FILE__ + main() +end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 4fb1387c..c1c50514 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -125,68 +125,57 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC K_int = similar!(pool, K_ext) G_int = similar!(pool, G_ext) - # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z - fused_timing = @timed begin - kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) - end - println(" Fused 
Projected Kernel TIME=$(round(fused_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))") + # Fused projected kernel: compute Z^H K Z and Z^H G Z + kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) if !wall.nowall - kernel_timing = @timed begin - kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram) - kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram) - kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram) - end - println(" Wall Galerkin Projected Kernels TIME=$(round(kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))") + kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram) end - solve_timing = @timed begin - # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext - K_int .= -K_ext - K_int[1:P, 1:P] .+= 2 .* Gram - if !wall.nowall - K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram - end - G_int .= G_ext - - # Solve projected BIEs for exterior and interior kernels - if wall.nowall - F_ext = lu!(K_ext[1:P, 1:P]) - ldiv!(F_ext, @view(G_ext[1:P, :])) - F_int = lu!(K_int[1:P, 1:P]) - ldiv!(F_int, @view(G_int[1:P, :])) - else - F_ext = lu!(K_ext) - ldiv!(F_ext, G_ext) - F_int = lu!(K_int) - ldiv!(F_int, G_int) - end - - # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G - mul!(wv, Gram, view(G_ext, 1:P, :)) - wv .*= (4π^2 / M) + # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext + K_int .= -K_ext + K_int[1:P, 1:P] .+= 2 .* Gram + if !wall.nowall + K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram + end + G_int .= G_ext + + # Solve projected BIEs for exterior and interior kernels + if wall.nowall + F_ext = lu!(K_ext[1:P, 1:P]) + ldiv!(F_ext, @view(G_ext[1:P, :])) + F_int = lu!(K_int[1:P, 1:P]) + 
ldiv!(F_int, @view(G_int[1:P, :])) + else + F_ext = lu!(K_ext) + ldiv!(F_ext, G_ext) + F_int = lu!(K_int) + ldiv!(F_int, G_int) end - println(" Galerkin Solve TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") - reconstruct_timing = @timed begin - # Backward-compatible reconstruction: grre/grri in M×2P real layout - # Need to convert mode space to physical space and unpack the real and imaginary parts - # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code - # perhaps make it a complex P * P matrix? Then don't need any of this section - mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, view(G_int, 1:P, :)) - @view(grri[1:M, 1:P]) .= real.(temp) - @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) - if !wall.nowall - mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :)) - @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :)) - @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) - end + + # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G + mul!(wv, Gram, view(G_ext, 1:P, :)) + wv .*= (4π^2 / M) + + # Backward-compatible reconstruction: grre/grri in M×2P real layout + # Need to convert mode space to physical space and unpack the real and imaginary parts + # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code + # perhaps make it a complex P * P matrix? 
Then don't need any of this section + mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, 1:P, :)) + @view(grri[1:M, 1:P]) .= real.(temp) + @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) + if !wall.nowall + mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :)) + @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :)) + @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) end - println(" Reconstruct TIME=$(round(reconstruct_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(reconstruct_timing.bytes))") else # ================================================================ # Collocation approach: solve full physical-space system [M × M] @@ -196,62 +185,47 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC grad_green = zeros!(pool, num_points_total, num_points_total) green_temp = zeros!(pool, num_points_surf, num_points_surf) - pp_kernel_timing = @timed begin - kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) - end - println(" Plasma Kernel TIME=$(round(pp_kernel_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))") + kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) # Project plasma→plasma Green's function to mode space: grre[1:M, 1:2P] = real/imag(G*Z) - colloc_proj_timing = @timed begin - mul!(temp, green_temp, exp_mn_basis) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - end - println(" Plasma Project TIME=$(round(colloc_proj_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(colloc_proj_timing.bytes))") + mul!(temp, green_temp, exp_mn_basis) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) if !wall.nowall - 
wall_block_timing = @timed begin - # Plasma–Wall block - kernel!(grad_green, green_temp, plasma_surf, wall, kparams) - # Wall–Wall block - kernel!(grad_green, green_temp, wall, wall, kparams) - # Wall–Plasma block - kernel!(grad_green, green_temp, wall, plasma_surf, kparams) - # Project obs=wall, src=plasma block to mode space - mul!(temp, green_temp, exp_mn_basis) - @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) - end - println(" Wall Kernel and Project TIME=$(round(wall_block_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(wall_block_timing.bytes))") + # Plasma–Wall block + kernel!(grad_green, green_temp, plasma_surf, wall, kparams) + # Wall–Wall block + kernel!(grad_green, green_temp, wall, wall, kparams) + # Wall–Plasma block + kernel!(grad_green, green_temp, wall, plasma_surf, kparams) + # Project obs=wall, src=plasma block to mode space + mul!(temp, green_temp, exp_mn_basis) + @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) end # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1) - solve_timing = @timed begin - grri .= grre # start from same as exterior - grad_green_interior = similar!(pool, grad_green) - grad_green_interior .= grad_green - - # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel - F_ext = lu!(grad_green) - ldiv!(F_ext, grre) - - # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal - grad_green_interior .*= -1 - for i in 1:num_points_total - grad_green_interior[i, i] += 2.0 - end - F_int = lu!(grad_green_interior) - ldiv!(F_int, grri) + grri .= grre # start from same as exterior + grad_green_interior = similar!(pool, grad_green) + grad_green_interior .= grad_green + + # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel + F_ext = 
lu!(grad_green) + ldiv!(F_ext, grre) + + # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal + grad_green_interior .*= -1 + for i in 1:num_points_total + grad_green_interior[i, i] += 2.0 end - println(" Invert and Solve TIME=$(round(solve_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))") + F_int = lu!(grad_green_interior) + ldiv!(F_int, grri) - wv_timing = @timed begin - # wv = (4π²/M) · Z^H · grre_complex [Chance Phys. Plasmas 2007 052506 eq. 115-118] - temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)])) - mul!(wv, exp_mn_basis', temp) - wv .*= (4π^2 / M) - end - println(" Compute Wv TIME=$(round(wv_timing.time; digits=6)) s ALLOCATIONS=$(Base.format_bytes(wv_timing.bytes))") + # wv = (4π²/M) · Z^H · grre_complex [Chance Phys. Plasmas 2007 052506 eq. 115-118] + temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)])) + mul!(wv, exp_mn_basis', temp) + wv .*= (4π^2 / M) end inputs.force_wv_symmetry && hermitianpart!(wv) From 0c5ef1c4d66bcb7cdaf38060c0b9dbbd05ba0999 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Mon, 16 Mar 2026 14:16:46 -0400 Subject: [PATCH 15/23] VACUUM - WIP - simplifying some math in Free.jl --- src/ForceFreeStates/Free.jl | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/ForceFreeStates/Free.jl b/src/ForceFreeStates/Free.jl index 65455d9e..f0e4a9be 100644 --- a/src/ForceFreeStates/Free.jl +++ b/src/ForceFreeStates/Free.jl @@ -28,14 +28,11 @@ and data dumping. 
# Compute vacuum response matrix in-place (handles 2D single-n, 2D multi-n block-diagonal, and 3D) vac_inputs = Vacuum.VacuumInput(equil, psilim, ctrl.mthvac, ctrl.nzvac, mpert, mlow, npert, nlow; force_wv_symmetry=ctrl.force_wv_symmetry) - @time Vacuum.compute_vacuum_response!(vac_data, vac_inputs, wall_settings) + Vacuum.compute_vacuum_response!(vac_data, vac_inputs, wall_settings) # Scale by (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126] singfac = vec((mlow:mhigh) .- qlim .* (nlow:nhigh)') - @inbounds for ipert in 1:numpert_total - @views vac_data.wv[ipert, :] .*= singfac[ipert] - @views vac_data.wv[:, ipert] .*= singfac[ipert] - end + @inbounds @views vac_data.wv .*= singfac .* singfac' # Compute complex energy eigenvalues and vectors vac_data.wt .= wp .+ vac_data.wv @@ -75,13 +72,11 @@ and data dumping. # Compute plasma and vacuum contributions. # wpt = wt' * wp * wt ; wvt = wt' * wv * wt mul!(tmp_mat, wp, vac_data.wt) - mul!(wpt, adjoint(vac_data.wt), tmp_mat) + mul!(wpt, vac_data.wt', tmp_mat) mul!(tmp_mat, vac_data.wv, vac_data.wt) - mul!(wvt, adjoint(vac_data.wt), tmp_mat) - for ipert in 1:numpert_total - vac_data.ep[ipert] = wpt[ipert, ipert] - vac_data.ev[ipert] = wvt[ipert, ipert] - end + mul!(wvt, vac_data.wt', tmp_mat) + vac_data.ep .= diag(wpt) + vac_data.ev .= diag(wvt) # Normalize eigenvectors based on scaled wt coeffs = odet.u[:, :, 1, end] \ (vac_data.wt .* (2π * equil.psio * 1e-3)) @@ -123,8 +118,8 @@ function free_compute_wv_spline(ctrl::ForceFreeStatesControl, equil::Equilibrium # TODO: 4 spline points is arbitrary - is there a better way? 
qedge = profiles.q_spline(ctrl.psiedge) npsi = max(4, ceil(Int, (intr.qlim - qedge) * intr.nhigh * 4)) - psi_array = zeros(Float64, npsi + 1) - wv_array = zeros(ComplexF64, npsi + 1, intr.numpert_total, intr.numpert_total) + psi_array = zeros!(pool, Float64, npsi + 1) + wv_array = zeros!(pool, ComplexF64, npsi + 1, intr.numpert_total, intr.numpert_total) for i in 1:(npsi+1) # Space points evenly in q @@ -143,10 +138,7 @@ function free_compute_wv_spline(ctrl::ForceFreeStatesControl, equil::Equilibrium # Apply singular factor scaling: (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126] singfac = vec((intr.mlow:intr.mhigh) .- qi .* (intr.nlow:intr.nhigh)') - @inbounds for ipert in 1:intr.numpert_total - @views wv[ipert, :] .*= singfac[ipert] - @views wv[:, ipert] .*= singfac[ipert] - end + @inbounds @views wv .*= singfac .* singfac' @views wv_array[i, :, :] .= wv end From 8c2a78a62a3ff3699beed50bd9e8b6f857ed2ad2 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Tue, 17 Mar 2026 09:50:05 -0400 Subject: [PATCH 16/23] EXAMPLES - IMPROVEMENT - small cleanups to example tomls --- examples/DIIID-like_ideal_example/gpec.toml | 20 +++++++++---------- examples/Solovev_ideal_example/gpec.toml | 22 --------------------- examples/Solovev_ideal_example_3D/gpec.toml | 11 ----------- 3 files changed, 10 insertions(+), 43 deletions(-) diff --git a/examples/DIIID-like_ideal_example/gpec.toml b/examples/DIIID-like_ideal_example/gpec.toml index 982f6817..f95e67dc 100644 --- a/examples/DIIID-like_ideal_example/gpec.toml +++ b/examples/DIIID-like_ideal_example/gpec.toml @@ -14,16 +14,6 @@ newq0 = 0 # Override for on-axis safety factor (0 etol = 1e-7 # Error tolerance for equilibrium solver force_termination = false # Terminate after equilibrium setup (skip stability calculations) -[Wall] -shape = "nowall" # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath) -a = 0.2415 # Distance from plasma (conformal) or shape parameter -aw = 0.05 # Half-thickness parameter for 
Dee-shaped walls -bw = 1.5 # Elongation parameter for wall shapes -cw = 0 # Offset of wall center from major radius -dw = 0.5 # Triangularity parameter for wall shapes -tw = 0.05 # Sharpness of wall corners (try 0.05 as initial value) -equal_arc_wall = true # Equal arc length distribution of nodes on wall - [ForceFreeStates] bal_flag = false # Ideal MHD ballooning criterion for short wavelengths mat_flag = true # Construct coefficient matrices for diagnostic purposes @@ -63,6 +53,16 @@ save_interval = 3 # Save every Nth ODE step (1=all, 10=every 10th). A singfac_min = 1e-4 # Fractional distance from rational q at which ideal jump enforced ucrit = 1e4 # Maximum fraction of solutions allowed before re-normalized +[Wall] +shape = "nowall" # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath) +a = 0.2415 # Distance from plasma (conformal) or shape parameter +aw = 0.05 # Half-thickness parameter for Dee-shaped walls +bw = 1.5 # Elongation parameter for wall shapes +cw = 0 # Offset of wall center from major radius +dw = 0.5 # Triangularity parameter for wall shapes +tw = 0.05 # Sharpness of wall corners (try 0.05 as initial value) +equal_arc_wall = true # Equal arc length distribution of nodes on wall + [ForcingTerms] forcing_data_file = "forcing.dat" # Path to forcing data file (n, m, complex amplitude) forcing_data_format = "ascii" # Format of forcing data: "ascii" or "hdf5" diff --git a/examples/Solovev_ideal_example/gpec.toml b/examples/Solovev_ideal_example/gpec.toml index 0065fde8..77c6d2e0 100644 --- a/examples/Solovev_ideal_example/gpec.toml +++ b/examples/Solovev_ideal_example/gpec.toml @@ -14,28 +14,6 @@ newq0 = 0 # Override for on-axis safety factor (0 etol = 1e-7 # Error tolerance for equilibrium solver force_termination = false # Terminate after equilibrium setup (skip stability calculations) - -[Wall] -shape = "conformal" # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath) -a = 0.2415 # Distance from plasma (conformal) or 
shape parameter -aw = 0.05 # Half-thickness parameter for Dee-shaped walls -bw = 1.5 # Elongation parameter for wall shapes -cw = 0 # Offset of wall center from major radius -dw = 0.5 # Triangularity parameter for wall shapes -tw = 0.05 # Sharpness of wall corners (try 0.05 as initial value) -equal_arc_wall = true # Equal arc length distribution of nodes on wall - -# [PerturbedEquilibrium] -# # Uncomment this section to enable perturbed equilibrium calculations -# forcing_data_file = "forcing.dat" # Path to forcing data (n, m, real, imag) -# forcing_data_format = "ascii" # "ascii" or "hdf5" -# fixed_boundary = false # Fixed boundary flag -# output_eigenmodes = true # Output mode fields as b-fields -# compute_response = true # Compute plasma response -# compute_singular_coupling = true # Compute singular coupling metrics -# verbose = true # Enable verbose logging -# write_outputs_to_HDF5 = true # Write outputs to HDF5 - [ForceFreeStates] bal_flag = false # Ideal MHD ballooning criterion for short wavelengths mat_flag = true # Construct coefficient matrices for diagnostic purposes diff --git a/examples/Solovev_ideal_example_3D/gpec.toml b/examples/Solovev_ideal_example_3D/gpec.toml index 3dd466a0..93d4e995 100644 --- a/examples/Solovev_ideal_example_3D/gpec.toml +++ b/examples/Solovev_ideal_example_3D/gpec.toml @@ -63,14 +63,3 @@ cw = 0 # Offset of wall center from major radiu dw = 0.5 # Triangularity parameter for wall shapes tw = 0.05 # Sharpness of wall corners (try 0.05 as initial value) equal_arc_wall = false # Equal arc length distribution of nodes on wall - -# [PerturbedEquilibrium] -# # Uncomment this section to enable perturbed equilibrium calculations -# forcing_data_file = "forcing.dat" # Path to forcing data (n, m, real, imag) -# forcing_data_format = "ascii" # "ascii" or "hdf5" -# fixed_boundary = false # Fixed boundary flag -# output_eigenmodes = true # Output mode fields as b-fields -# compute_response = true # Compute plasma response -# 
compute_singular_coupling = true # Compute singular coupling metrics -# verbose = true # Enable verbose logging -# write_outputs_to_HDF5 = true # Write outputs to HDF5 From 52cceab9ecc1b319e7f6ba6a0607838792f6115d Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Tue, 17 Mar 2026 13:24:33 -0400 Subject: [PATCH 17/23] VACUUM - WIP - merging the projected kernel functions into the regular ones, 2D working --- examples/Solovev_ideal_example/gpec.toml | 2 +- src/Vacuum/DataTypes.jl | 11 +- src/Vacuum/Kernel2D.jl | 116 ++++++++++++--------- src/Vacuum/ProjectedKernel.jl | 48 ++------- src/Vacuum/Utilities.jl | 25 +++++ src/Vacuum/Vacuum.jl | 126 +++++++++++------------ 6 files changed, 170 insertions(+), 158 deletions(-) diff --git a/examples/Solovev_ideal_example/gpec.toml b/examples/Solovev_ideal_example/gpec.toml index 77c6d2e0..108c55bd 100644 --- a/examples/Solovev_ideal_example/gpec.toml +++ b/examples/Solovev_ideal_example/gpec.toml @@ -53,7 +53,7 @@ ucrit = 1e3 # Maximum fraction of solutions allowed before re- force_wv_symmetry = true # Forces vacuum energy matrix symmetry save_interval = 3 # Save every Nth ODE step (1=all, 10=every 10th). Always saves near rational surfaces. -[WALL] +[Wall] shape = "conformal" # String selecting wall shape ["nowall", "conformal", "elliptical", "dee", "mod_dee", "from_file"] a = 0.2415 # The distance of the wall from the plasma in units of major radius (conformal), or minor radius parameter (others). aw = 0.05 # Half-thickness of the wall. diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl index 5522c37b..63d029b4 100644 --- a/src/Vacuum/DataTypes.jl +++ b/src/Vacuum/DataTypes.jl @@ -22,10 +22,6 @@ nzeta > 1 for 3D vacuum calculation. 
- `mtheta::Int`: Number of vacuum calculation poloidal grid points - `nzeta::Int`: Number of vacuum calculation toroidal grid points (1 for 2D vacuum calculation, > 1 for 3D vacuum calculation) - `force_wv_symmetry::Bool`: Boolean flag to enforce symmetry in the vacuum response matrix - - `use_galerkin::Bool`: Use Galerkin projection to solve in truncated Fourier space [O(P³)] - instead of full collocation [O(M³)]. Applies to both no-wall and wall cases. For the wall - case, both plasma and wall unknowns are represented in (m,n) mode space, yielding a 2P×2P - system with no M² storage. Defaults to `false`. """ @kwdef struct VacuumInput x::Vector{Float64} = Float64[] @@ -41,7 +37,6 @@ nzeta > 1 for 3D vacuum calculation. mtheta::Int = 1 nzeta::Int = 1 force_wv_symmetry::Bool = true - use_galerkin::Bool = false end """ @@ -81,8 +76,7 @@ function VacuumInput( mlow::Int, npert::Int, nlow::Int; - force_wv_symmetry::Bool=true, - use_galerkin::Bool=false + force_wv_symmetry::Bool=true ) # Extract plasma surface geometry at this psi r, z, ν = extract_plasma_surface_at_psi(equil, ψ) @@ -98,8 +92,7 @@ function VacuumInput( npert=npert, mtheta=mtheta, nzeta=nzeta, - force_wv_symmetry=force_wv_symmetry, - use_galerkin=true + force_wv_symmetry=force_wv_symmetry ) end diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl index b518ff4c..d4be71b1 100644 --- a/src/Vacuum/Kernel2D.jl +++ b/src/Vacuum/Kernel2D.jl @@ -67,17 +67,17 @@ The residue calculation needs to be updated for open walls.** # Arguments - - `grad_greenfunction`: Gradient Green's function matrix (output) - - `greenfunction`: Green's function matrix (output) + - `K`: Fourier-space Gradient Green's function matrix (output) + - `G`: Fourier-space Green's function matrix (output) - `observer`: Observer geometry struct (PlasmaGeometry or WallGeometry) - `source`: Source geometry struct (PlasmaGeometry or WallGeometry) - `n`: Toroidal mode number # Returns -Modifies `grad_greenfunction` and `greenfunction` in 
place. -Note that greenfunction is zeroed each time this function is called, -but grad_greenfunction is not since it fills a different block of the +Modifies `K` and `G` in place. +Note that G is zeroed each time this function is called, +but K is not since it fills a different block of the (2 * mtheta, 2 * mtheta) depending on the source/observer. # Notes @@ -87,28 +87,26 @@ but grad_greenfunction is not since it fills a different block of the - Implements analytical singularity removal [Chance Phys. Plasmas 1997 2161] """ @with_pool pool function compute_2D_kernel_matrices!( - grad_greenfunction::AbstractMatrix{Float64}, - greenfunction::AbstractMatrix{Float64}, + K::AbstractMatrix{ComplexF64}, + G::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, - n::Int + n::Int, + Z::AbstractMatrix{ComplexF64}, + Gram::AbstractMatrix{ComplexF64} ) - mtheta = length(observer.x) - dtheta = 2π / mtheta - theta_grid = range(; start=0, length=mtheta, step=dtheta) + M, P = size(Z) # M = mtheta, P = num_modes + Zt = Matrix{ComplexF64}(transpose(Z)) # [P × M] for contiguous column access + dtheta = 2π / M + theta_grid = range(; start=0, length=M, step=dtheta) # Take a view of the corresponding block of the grad_greenfunction - col_index = (source isa PlasmaGeometry ? 1 : 2) - row_index = (observer isa PlasmaGeometry ? 1 : 2) - grad_greenfunction_block = view( - grad_greenfunction, - ((row_index-1)*mtheta+1):(row_index*mtheta), - ((col_index-1)*mtheta+1):(col_index*mtheta) - ) + col_idx = (source isa PlasmaGeometry ? 1 : 2) + row_idx = (observer isa PlasmaGeometry ? 1 : 2) + K_block = view(K, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P)) + G_block = view(G, ((row_idx-1)*P+1):(row_idx*P), :) - # Zero out greenfunction at start of each kernel call - fill!(greenfunction, 0.0) # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 
26/27 in Chance 1997) populate_greenfunction = source isa PlasmaGeometry @@ -119,7 +117,7 @@ but grad_greenfunction is not since it fills a different block of the log_correction_array = SVector(log_correction_2, log_correction_1, log_correction_0, log_correction_1, log_correction_2) # Precompute the n-dependent prefactor 2√π·Γ(1/2-n) [Chance Phys. Plasmas 1997 2161 eq. 40] - # This is constant for all source/observer point pairs within this kernel call. + # This constant is only computed once for each n gamma_prefactor = 2 * sqrt(π) * gamma(0.5 - n) # Set up periodic splines used for off-grid Gaussian quadrature points @@ -134,8 +132,8 @@ but grad_greenfunction is not since it fills a different block of the # Precompute source derivatives on the theta grid once used in Simpson integration # The Gaussian singular-panel points are off-grid, so those still use spline evaluation directly. - dx_dtheta_grid = acquire!(pool, eltype(source.x), mtheta) - dz_dtheta_grid = acquire!(pool, eltype(source.z), mtheta) + dx_dtheta_grid = acquire!(pool, eltype(source.x), M) + dz_dtheta_grid = acquire!(pool, eltype(source.z), M) # Call in-place API to avoid allocations d1_spline_x(dx_dtheta_grid, theta_grid) @@ -144,35 +142,49 @@ but grad_greenfunction is not since it fills a different block of the # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition) legendre_buf = acquire!(pool, Float64, n + 2) + # Per-observer projection vectors: proj = (kernel row) · Z + proj_k = zeros!(pool, ComplexF64, P) + proj_g = zeros!(pool, ComplexF64, P) + # Loop through observer points - for j in 1:mtheta + for j in 1:M # Get observer coordinates x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j] - # Perform Simpson integration for nonsingular source points + # Zero out projection terms + fill!(proj_k, 0.0) + fill!(proj_g, 0.0) + diag_accum = 0.0 + + # ============================================================ + # FAR FIELD: Simpson integration 
for nonsingular source points + # ============================================================ # Nonsingular region endpoints are at j±2, so exclude j-1, j, and j+1. - @inbounds for k in 1:(mtheta-3) - isrc = mod1(j + 1 + k, mtheta) + @inbounds for k in 1:(M-3) + isrc = mod1(j + 1 + k, M) G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf; gamma_prefactor) # Composite Simpson's 1/3 rule weights, excluding singular points # Note we set to 4 for even/2 for odd since we index from 1 while the formula assumes indexing from 0 - wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2)) + wsimpson = dtheta / 3 * ((k == 1 || k == M - 3) ? 1 : (iseven(k) ? 4 : 2)) - # Sum contributions to Green's function matrices using Simpson weight + # Sum and project contributions to Green's function matrices if populate_greenfunction - greenfunction[j, isrc] += G_n * wsimpson + _accum_row!(proj_g, G_n * wsimpson, Zt, isrc) end - grad_greenfunction_block[j, isrc] += gradG_n * wsimpson + _accum_row!(proj_k, gradG_n * wsimpson, Zt, isrc) # Subtract regular integral component of δⱼᵢK⁰ [Chance Phys. Plasmas 1997 2161 eq. 83] - grad_greenfunction_block[j, j] -= gradG_0 * wsimpson + diag_accum -= gradG_0 * wsimpson end - # Perform Gaussian quadrature for singular points (source = obs point) + # ============================================================ + # NEAR FIELD: Gaussian quadrature with singular correction + # ============================================================ # Indices of the singularity region, [j-2, j-1, j, j+1, j+2] (allocation-free) for (offset_idx, offset) in enumerate(-2:2) - sing_idx[offset_idx] = mod1(j + offset + mtheta, mtheta) + sing_idx[offset_idx] = mod1(j + offset + M, M) end + # Integrate region of length 2 * dtheta on left/right of singularity for leftpanel in (true, false) gauss_mid = theta_obs + (leftpanel ? 
-dtheta : dtheta) @@ -197,54 +209,64 @@ but grad_greenfunction is not since it fills a different block of the G_n += log((theta_obs - theta_gauss)^2) / x_obs end @inbounds for stencil_idx in 1:5 - greenfunction[j, sing_idx[stencil_idx]] += G_n * s[stencil_idx] * wgauss + _accum_row!(proj_g, G_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx]) end end # Second type of singularity: 𝒦ⁿ [Chance Phys. Plasmas 1997 2161 eq. 83, 86] @inbounds for stencil_idx in 1:5 - grad_greenfunction_block[j, sing_idx[stencil_idx]] += gradG_n * s[stencil_idx] * wgauss + _accum_row!(proj_k, gradG_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx]) end # Subtract off the diverging singular n=0 component - grad_greenfunction_block[j, j] -= gradG_0 * wgauss + diag_accum -= gradG_0 * wgauss end end # Subtract off analytic singular integral [Chance Phys. Plasmas 1997 2161 eq. 75] if plasma-plasma block if populate_greenfunction && observer isa PlasmaGeometry @inbounds for stencil_idx in 1:5 - greenfunction[j, sing_idx[stencil_idx]] -= log_correction_array[stencil_idx] / x_obs + _accum_row!(proj_g, -log_correction_array[stencil_idx] / x_obs, Zt, sing_idx[stencil_idx]) end end + + # Project the n=0 diagonal accumulation + _accum_row!(proj_k, diag_accum, Zt, j) + + # ── Rank-1 accumulate: K/G += conj(Z[j,:]) ⋅ proj_k/g ── + _rank1_conj!(K_block, Zt, j, proj_k) + if populate_greenfunction + _rank1_conj!(G_block, Zt, j, proj_g) + end end # Normals need to point outward from vacuum region. In VACUUM clockwise θ convention, normal points # out of vacuum for wall but inward for plasma, so we multiply by -1 for plasma sources if source isa PlasmaGeometry - grad_greenfunction_block .*= -1 + K_block .*= -1 end # Add analytic singular integral (second type) to block diagonal [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89] + # The Gram matrix is a result of the projection onto a scalar, Z⋅Zᵀ * residue residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 
2.0 : -2.0) - @inbounds for i in 1:mtheta - grad_greenfunction_block[i, i] += residue - end + K_block .+= residue .* Gram # Since we computed 2π𝒢, divide by 2π to get 𝒢 if populate_greenfunction - greenfunction ./= 2π + G_block ./= 2π end end # Dispatch wrapper for unified 2D/3D vacuum: forwards to 5-arg compute_2D_kernel_matrices! with params.n function kernel!( - grad_greenfunction::AbstractMatrix{Float64}, - greenfunction::AbstractMatrix{Float64}, + K::AbstractMatrix{ComplexF64}, + G::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, - params::KernelParams2D + params::KernelParams2D, + Z::AbstractMatrix{ComplexF64}, + Gram::AbstractMatrix{ComplexF64} ) - return compute_2D_kernel_matrices!(grad_greenfunction, greenfunction, observer, source, params.n) + return compute_2D_kernel_matrices!(K, G, observer, source, params.n, Z, Gram) end ############################################################# diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl index 8218739d..5c9c5369 100644 --- a/src/Vacuum/ProjectedKernel.jl +++ b/src/Vacuum/ProjectedKernel.jl @@ -15,32 +15,6 @@ # FLOP cost is identical to the two-step approach O(M²P), but memory drops # from O(M²) to O(MP + P²). -# ── Helpers for small-P accumulation (avoids BLAS dispatch overhead) ────────── - -""" -Accumulate `proj += w * Zt[:, col]` with SIMD. Replaces BLAS.axpy! for small P. -""" -@inline function _accum_row!(proj::AbstractVector{ComplexF64}, w::Float64, - Zt::AbstractMatrix{ComplexF64}, col::Int) - @inbounds @simd for p in eachindex(proj) - proj[p] += w * Zt[p, col] - end -end - -""" -Rank-1 update `A += conj(Zt[:, j]) * y^T`. Avoids allocating a conjugated temporary. 
-""" -@inline function _rank1_conj!(A::AbstractMatrix{ComplexF64}, - Zt::AbstractMatrix{ComplexF64}, j::Int, - y::AbstractVector{ComplexF64}) - @inbounds for p2 in eachindex(y) - y_p2 = y[p2] - for p1 in axes(A, 1) - A[p1, p2] += conj(Zt[p1, j]) * y_p2 - end - end -end - # ============================================================================ # 2D fused projected kernel # ============================================================================ @@ -62,17 +36,17 @@ Dispatches to the 2D or 3D implementation based on the geometry/params types. - `exp_mn_basis::Matrix{ComplexF64}`: [M × P] complex Fourier basis Z = exp(i(mθ − nζ)) - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term) """ -function kernel!( - K_c::AbstractMatrix{ComplexF64}, - G_c::AbstractMatrix{ComplexF64}, - observer::Union{PlasmaGeometry,WallGeometry}, - source::Union{PlasmaGeometry,WallGeometry}, - params::KernelParams2D, - exp_mn_basis::AbstractMatrix{ComplexF64}, - Gram::AbstractMatrix{ComplexF64} -) - _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram) -end +# function kernel!( +# K_c::AbstractMatrix{ComplexF64}, +# G_c::AbstractMatrix{ComplexF64}, +# observer::Union{PlasmaGeometry,WallGeometry}, +# source::Union{PlasmaGeometry,WallGeometry}, +# params::KernelParams2D, +# exp_mn_basis::AbstractMatrix{ComplexF64}, +# Gram::AbstractMatrix{ComplexF64} +# ) +# _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram) +# end function kernel!( K_c::AbstractMatrix{ComplexF64}, diff --git a/src/Vacuum/Utilities.jl b/src/Vacuum/Utilities.jl index de4d220b..ac69a9e1 100644 --- a/src/Vacuum/Utilities.jl +++ b/src/Vacuum/Utilities.jl @@ -152,3 +152,28 @@ Inline function for fast cross product of two 3D vectors at a given index. c[idx, 3] = a1*b2 - a2*b1 end end + +# ── Helpers for small-P accumulation (avoids BLAS dispatch overhead) ────────── +""" +Accumulate `proj += w * Zt[:, col]` with SIMD. 
Replaces BLAS.axpy! for small P. +""" +@inline function _accum_row!(proj::AbstractVector{ComplexF64}, w::Float64, + Zt::AbstractMatrix{ComplexF64}, col::Int) + @inbounds @simd for p in eachindex(proj) + proj[p] += w * Zt[p, col] + end +end + +""" +Rank-1 update `A += conj(Zt[:, j]) * y^T`. Avoids allocating a conjugated temporary. +""" +@inline function _rank1_conj!(A::AbstractMatrix{ComplexF64}, + Zt::AbstractMatrix{ComplexF64}, j::Int, + y::AbstractVector{ComplexF64}) + @inbounds for p2 in eachindex(y) + y_p2 = y[p2] + for p1 in axes(A, 1) + A[p1, p2] += conj(Zt[p1, j]) * y_p2 + end + end +end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index c1c50514..7d994531 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -72,7 +72,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC n_override::Union{Nothing,Int}=nothing ) - (; mtheta, mpert, mlow, nzeta, npert, nlow, use_galerkin) = inputs + (; mtheta, mpert, mlow, nzeta, npert, nlow) = inputs # Initialize surface geometries plasma_surf = nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs) @@ -114,69 +114,68 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # # FLOPs: O(M²P + P³) # ================================================================ - if use_galerkin - # Gram matrix required by projected_kernel! 
for the diagonal residue and for interior solve - Gram = zeros!(pool, ComplexF64, P, P) - mul!(Gram, exp_mn_basis', exp_mn_basis) - - # Projected kernel matrices [P × P complex] - K_ext = zeros!(pool, ComplexF64, 2P, 2P) - G_ext = zeros!(pool, ComplexF64, 2P, P) - K_int = similar!(pool, K_ext) - G_int = similar!(pool, G_ext) - - # Fused projected kernel: compute Z^H K Z and Z^H G Z - kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) - if !wall.nowall - kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram) - kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram) - kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram) - end - - # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext - K_int .= -K_ext - K_int[1:P, 1:P] .+= 2 .* Gram - if !wall.nowall - K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram - end - G_int .= G_ext - - # Solve projected BIEs for exterior and interior kernels - if wall.nowall - F_ext = lu!(K_ext[1:P, 1:P]) - ldiv!(F_ext, @view(G_ext[1:P, :])) - F_int = lu!(K_int[1:P, 1:P]) - ldiv!(F_int, @view(G_int[1:P, :])) - else - F_ext = lu!(K_ext) - ldiv!(F_ext, G_ext) - F_int = lu!(K_int) - ldiv!(F_int, G_int) - end - - # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G - mul!(wv, Gram, view(G_ext, 1:P, :)) - wv .*= (4π^2 / M) + # Gram matrix required by projected_kernel! 
for the diagonal residue and for interior solve + Gram = zeros!(pool, ComplexF64, P, P) + mul!(Gram, exp_mn_basis', exp_mn_basis) + + # Projected kernel matrices [P × P complex] + K_ext = zeros!(pool, ComplexF64, 2P, 2P) + G_ext = zeros!(pool, ComplexF64, 2P, P) + K_int = similar!(pool, K_ext) + G_int = similar!(pool, G_ext) + + # Fused projected kernel: compute Z^H K Z and Z^H G Z + kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) + if !wall.nowall + kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram) + kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram) + end - # Backward-compatible reconstruction: grre/grri in M×2P real layout - # Need to convert mode space to physical space and unpack the real and imaginary parts - # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code - # perhaps make it a complex P * P matrix? Then don't need any of this section - mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, view(G_int, 1:P, :)) - @view(grri[1:M, 1:P]) .= real.(temp) - @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) - if !wall.nowall - mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :)) - @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :)) - @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) - end + # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext + K_int .= -K_ext + K_int[1:P, 1:P] .+= 2 .* Gram + if !wall.nowall + K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram + end + G_int .= G_ext + + # Solve projected BIEs for exterior and interior kernels + if wall.nowall + F_ext = lu!(K_ext[1:P, 1:P]) + ldiv!(F_ext, 
@view(G_ext[1:P, :])) + F_int = lu!(K_int[1:P, 1:P]) + ldiv!(F_int, @view(G_int[1:P, :])) else + F_ext = lu!(K_ext) + ldiv!(F_ext, G_ext) + F_int = lu!(K_int) + ldiv!(F_int, G_int) + end + + # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G + mul!(wv, Gram, view(G_ext, 1:P, :)) + wv .*= (4π^2 / M) + + # Backward-compatible reconstruction: grre/grri in M×2P real layout + # Need to convert mode space to physical space and unpack the real and imaginary parts + # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code + # perhaps make it a complex P * P matrix? Then don't need any of this section + mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) + @view(grre[1:M, 1:P]) .= real.(temp) + @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, 1:P, :)) + @view(grri[1:M, 1:P]) .= real.(temp) + @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) + if !wall.nowall + mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :)) + @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :)) + @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) + @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) + end + """ # ================================================================ # Collocation approach: solve full physical-space system [M × M] # Handles both no-wall and wall cases. @@ -226,8 +225,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)])) mul!(wv, exp_mn_basis', temp) wv .*= (4π^2 / M) - end - +""" inputs.force_wv_symmetry && hermitianpart!(wv) if nzeta > 1 # 3D From f96030cfa35f075ecabf82c1415b1fdd563866c7 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Tue, 17 Mar 2026 14:49:11 -0400 Subject: [PATCH 18/23] VACUUM - IMPROVEMENT - full implementation of the projected kernel in both 2D and 3D. 
Removed dead code --- src/Vacuum/Kernel2D.jl | 139 +++++++++--- src/Vacuum/Kernel3D.jl | 252 +++++++++++++++------ src/Vacuum/ProjectedKernel.jl | 404 ---------------------------------- src/Vacuum/Vacuum.jl | 57 +---- 4 files changed, 295 insertions(+), 557 deletions(-) delete mode 100644 src/Vacuum/ProjectedKernel.jl diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl index d4be71b1..49772e96 100644 --- a/src/Vacuum/Kernel2D.jl +++ b/src/Vacuum/Kernel2D.jl @@ -59,36 +59,106 @@ const GL8_LAGRANGE_STENCILS = precompute_lagrange_stencils(GL8.x) # and per-n sinh/cosh cache are defined in PnQuadCache.jl. """ - kernel!(grad_greenfunction, greenfunction, observer, source, n) + compute_2D_kernel_matrices!(Kc, Gc, observer, source, n, Z, Gram) -Compute kernels of integral equation for Laplace's equation in a torus. -**WARNING: This kernel only supports closed toroidal walls currently. -The residue calculation needs to be updated for open walls.** +Compute the **Fourier/Galerkin-projected** 2D vacuum boundary-integral kernel blocks for +Laplace’s equation in an axisymmetric torus, **without ever forming the dense +`M×M` “point-to-point” kernel matrices**. -# Arguments +This is the fused “evaluate kernel + project” path that the vacuum solver uses: - - `K`: Fourier-space Gradient Green's function matrix (output) - - `G`: Fourier-space Green's function matrix (output) - - `observer`: Observer geometry struct (PlasmaGeometry or WallGeometry) - - `source`: Source geometry struct (PlasmaGeometry or WallGeometry) - - `n`: Toroidal mode number + Kc = Zᴴ * K * Z + Gc = Zᴴ * G * Z -# Returns +where: -Modifies `K` and `G` in place. -Note that G is zeroed each time this function is called, -but K is not since it fills a different block of the -(2 * mtheta, 2 * mtheta) depending on the source/observer. 
+ - `K` is the **double-layer** kernel (normal derivative of the Green’s function), + - `G` is the **single-layer** kernel (Green’s function itself; only needed for plasma-as-source), + - `Z ∈ ℂ^{M×P}` is the complex Fourier basis on the poloidal grid, + and `Zᴴ` is its conjugate transpose. -# Notes +Rather than computing a full kernel row `K[j, :]` and then multiplying by `Z`, this routine +projects **on the fly**: + + - For each observer node `j`, it accumulates the projected row-vector + `proj_k = (K[j,:] * weights) · Z` and (optionally) `proj_g = (G[j,:] * weights) · Z` + into length-`P` work buffers. + + - It then performs a **rank-1 update** into the appropriate projected block: + + Kc += conj(Z[j, :])' * proj_k + Gc += conj(Z[j, :])' * proj_g + +This reduces peak memory from `O(M^2)` to `O(MP + P^2)` while keeping the same +mathematical discretization. + +## Arguments + + - `Kc`: Complex global projected double-layer kernel matrix. + - `Gc`: Complex global projected single-layer kernel matrix. + - `observer`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`. + - `source`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`. + - `n`: Integer representing the order of the toroidal Fourier component. + - `Z`: Complex Fourier basis sampled on the `θ` grid. + - `Gram`: Mode-space Gram matrix for this basis on the discrete grid. + +## Block layout + +`Kc` and `Gc` are the complex global projected matrices. `Kc` contains four blocks corresponding to +plasma/wall as observer/source, `Gc` contains two blocks corresponding to plasma/wall as observer. +This function writes **only one block** to each of `Kc` and `Gc` per call: + + - `Kc_block` is a `P×P` view into `Kc` selected by `(observer isa PlasmaGeometry ? 1 : 2, source isa PlasmaGeometry ? 1 : 2)`. 
+ - `Gc_block` is a `P×P` view into `Gc` covering the same observer block-row and all `P` columns; + it is populated only when the source is the plasma (`source isa PlasmaGeometry`). - - Uses Simpson's rule for integration away from singular points - - Uses Gaussian quadrature near singular points for improved accuracy - - Implements analytical singularity removal [Chance Phys. Plasmas 1997 2161] +## Toroidal Green's functions + + - The scalar `G_n` returned by `green(...)` is `2π * 𝒢ⁿ(θ, θ′)` (Chance 1997), + the `n`-th toroidal Fourier component of the Laplace Green’s function in axisymmetry. + - The scalar `gradG_n` returned by `green(...)` corresponds to the toroidal-mode `n` + contribution to the **double-layer** integrand `𝒥 * (∇′𝒢ⁿ · ∇′ℒ)` + (Chance 1997), i.e. the normal-derivative factor multiplied by the geometric Jacobian. + - `gradG_0` is the `n = 0` piece used for analytic diagonal/singularity bookkeeping. + +## Numerical treatment of the singularity + +The toroidal Green’s function kernel is weakly singular as `θ′ → θ`. The implementation follows +Chance *Phys. Plasmas* **4**, 2161 (1997) and uses a mixed strategy: + + - **Far field (nonsingular region)**: composite Simpson’s `1/3` rule on the uniform `θ` grid, + skipping the near-singular stencil around `j`. + - **Near field (singular panels)**: 8-point Gauss–Legendre quadrature on the two panels + of length `dtheta` immediately to the left/right of `θ_j`, using: + + periodic cubic splines of `source.x(θ)` and `source.z(θ)` to evaluate geometry at off-grid nodes, + + precomputed 5-point Lagrange stencils to map each Gauss node back to the five neighboring + discrete source indices `[j-2, j-1, j, j+1, j+2]` *without allocations*, + + analytic logarithmic/singular correction terms (`log_correction_array`) for the single-layer + kernel when the observer/source block is plasma–plasma (Chance 1997, e.g. eqs. 
75, 78), + + an analytic diagonal/residue correction for the double-layer kernel (Chance 1997, Table I / residues). + +## Performance and allocation avoidance (hot-path optimizations) + +This routine is intentionally written to be allocation-light in tight loops: + + - **Precomputed quadrature tables**: `GL8` and `GL8_LAGRANGE_STENCILS` are global constants. + - **Hoisted n-dependent constants**: `gamma_prefactor = 2√π * Γ(1/2 - n)` is computed once per call + and passed into `green(...)` rather than recomputed per quadrature node. + - **Spline derivative batching**: `∂R/∂θ` and `∂Z/∂θ` are evaluated on the full grid once (for Simpson), + while off-grid Gauss points evaluate splines directly. + - **Projection reuse**: the transpose `Zt = transpose(Z)` is materialized so that “row-accumulate” + operations `_accum_row!` can access `Z` with contiguous column memory. + - **Rank-1 assembly**: the final projected update uses `conj(Z[j,:]) ⊗ proj_*` via `_rank1_conj!`, + avoiding constructing intermediate `P×P` temporaries. + +## Caveats / limitations + + - **Closed-wall assumption**: the current residue/diagonal handling is written for closed, + periodic toroidal boundaries. Open-wall residue logic is not implemented. """ @with_pool pool function compute_2D_kernel_matrices!( - K::AbstractMatrix{ComplexF64}, - G::AbstractMatrix{ComplexF64}, + Kc::AbstractMatrix{ComplexF64}, + Gc::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, n::Int, @@ -104,8 +174,8 @@ but K is not since it fills a different block of the # Take a view of the corresponding block of the grad_greenfunction col_idx = (source isa PlasmaGeometry ? 1 : 2) row_idx = (observer isa PlasmaGeometry ? 
1 : 2) - K_block = view(K, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P)) - G_block = view(G, ((row_idx-1)*P+1):(row_idx*P), :) + Kc_block = view(Kc, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P)) + Gc_block = view(Gc, ((row_idx-1)*P+1):(row_idx*P), :) # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997) populate_greenfunction = source isa PlasmaGeometry @@ -233,40 +303,45 @@ but K is not since it fills a different block of the _accum_row!(proj_k, diag_accum, Zt, j) # ── Rank-1 accumulate: K/G += conj(Z[j,:]) ⋅ proj_k/g ── - _rank1_conj!(K_block, Zt, j, proj_k) + _rank1_conj!(Kc_block, Zt, j, proj_k) if populate_greenfunction - _rank1_conj!(G_block, Zt, j, proj_g) + _rank1_conj!(Gc_block, Zt, j, proj_g) end end # Normals need to point outward from vacuum region. In VACUUM clockwise θ convention, normal points # out of vacuum for wall but inward for plasma, so we multiply by -1 for plasma sources if source isa PlasmaGeometry - K_block .*= -1 + Kc_block .*= -1 end # Add analytic singular integral (second type) to block diagonal [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89] # The Gram matrix is a result of the projection onto a scalar, Z⋅Zᵀ * residue residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0) - K_block .+= residue .* Gram + Kc_block .+= residue .* Gram # Since we computed 2π𝒢, divide by 2π to get 𝒢 if populate_greenfunction - G_block ./= 2π + Gc_block ./= 2π end end -# Dispatch wrapper for unified 2D/3D vacuum: forwards to 5-arg compute_2D_kernel_matrices! with params.n +""" + kernel!(Kc, Gc, observer, source, params::KernelParams2D, Z, Gram) + +Public 2D kernel entry point. This is a thin wrapper that forwards to +`compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram)`. 
+""" function kernel!( - K::AbstractMatrix{ComplexF64}, - G::AbstractMatrix{ComplexF64}, + Kc::AbstractMatrix{ComplexF64}, + Gc::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry,WallGeometry}, source::Union{PlasmaGeometry,WallGeometry}, params::KernelParams2D, Z::AbstractMatrix{ComplexF64}, Gram::AbstractMatrix{ComplexF64} ) - return compute_2D_kernel_matrices!(K, G, observer, source, params.n, Z, Gram) + return compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram) end ############################################################# diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl index c158734e..e066e788 100644 --- a/src/Vacuum/Kernel3D.jl +++ b/src/Vacuum/Kernel3D.jl @@ -395,68 +395,128 @@ function KernelWorkspace(PATCH_DIM::Int, RAD_DIM::Int, ANG_DIM::Int) end """ - compute_3D_kernel_matrices!(grad_greenfunction, greenfunction, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER) + compute_3D_kernel_matrices!(K, G, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, Z, Gram) -Compute boundary integral kernel matrices for 3D geometries with the singular correction -algorithm from [Malhotra Plasma Phys. and Cont. Fusion 2019 024004]. -Uses multi-threading for parallel computation over observer points. +Compute the **Fourier/Galerkin-projected** 3D vacuum boundary-integral kernel blocks for +Laplace’s equation, using a high-order singular quadrature / partition-of-unity (POU) +scheme on a tensor-product `(θ, ζ)` surface grid. - - Far regions: Rectangle rule with uniform weights (1/N) - - Singular regions: Polar quadrature with partition-of-unity blending +Like the 2D kernel, this routine implements the **fused projection path** used by the vacuum solve: +it produces the projected operators in mode space **without materializing a dense** +`N×N` point-space kernel (where `N = mtheta * nzeta`). 
-grad_greenfunction is the double-layer kernel matrix, where each entry is -∇_{x_src} φ(x_obs, x_src) · n_src, and greenfunction is the single-layer kernel matrix, -where each entry is φ(x_obs, x_src). +## Mathematical object being discretized -# Arguments +Let `x(θ, ζ) ∈ ℝ^3` be a surface parametrization (plasma or wall surface) with outward +unit normal `n(θ, ζ)`. The Laplace kernels are: + + - **Single-layer**: `φ(x_obs, x_src) = 1 / |x_obs - x_src|` + - **Double-layer**: `∂φ/∂n_src = ∇_{x_src} φ ⋅ n_src = (x_obs - x_src) ⋅ n_src / |x_obs - x_src|^3` + +This routine computes the *discrete, projected* operators corresponding to these kernels, +using a uniform quadrature weight `dθdζ = 4π^2 / N` for the far field and a specialized +near-field correction for the singular region. + +## Arguments and block layout + + - `Kc`: Complex global projected double-layer kernel matrix (2P×2P). + - `Gc`: Complex global projected single-layer kernel matrix (2P×P). + - `observer`: `PlasmaGeometry3D` or `WallGeometry3D` object providing geometry data. + - `source`: `PlasmaGeometry3D` or `WallGeometry3D` object providing geometry data. + - `PATCH_RAD`: Half-width of the singular patch in grid points. Must satisfy `PATCH_RAD ≤ (min(source.mtheta, source.nzeta) - 1) ÷ 2` to avoid errors. + - `RAD_DIM`: Radial quadrature order on the polar grid (angular order is `2*RAD_DIM`). + - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`). + - `Z`: Complex Fourier basis sampled on the surface grid, shaped `N×P` (`P = number of retained modes`). `Z[idx, :]` contains the basis values at the surface node `idx`. + - `Gram`: Mode-space Gram matrix used to add the analytic “identity” term when `typeof(source) == typeof(observer)` (i.e. the same operator block that receives the Green’s-identity diagonal contribution). 
+ +This routine fills exactly one `P×P` block view `Kc_block` (and optionally the corresponding `Gc_block`) +selected by whether observer/source are plasma or wall. + +## Numerical treatment of the singularity + +The kernel is weakly singular as `x_src → x_obs`. The implementation follows the +approach used in [Malhotra Journal of Comp. Phys. 2019 108791 eq. 38] - - `grad_greenfunction`: Double-layer kernel matrix (Nobs × Nsrc) filled in place + - **Far field** (nonsingular sources): - - `greenfunction`: Single-layer kernel matrix (Nobs × Nsrc) filled in place + + Use a uniform trapezoidal/rectangle rule on the `(θ, ζ)` grid. + + For each observer point, a square patch of size `PATCH_DIM = 2*PATCH_RAD+1` + surrounding the singularity is excluded from the far-field sum. - - `observer`: Observer geometry (PlasmaGeometry3D) + - **Near field** (singular patch): - - `source`: Source geometry (PlasmaGeometry3D) + + Extract a Cartesian `PATCH_DIM×PATCH_DIM` patch of the source geometry around the + observer-aligned source index. + + Interpolate the patch to a **polar quadrature grid** (`RAD_DIM × ANG_DIM`, with `ANG_DIM=2*RAD_DIM`) + using a precomputed sparse interpolation operator `P2G` built from tensor-product + Lagrange stencils (`INTERP_ORDER` controls the stencil width). + + Evaluate kernels on the polar grid and weight them with a **partition-of-unity** + quadrature factor `Ppou` that includes the polar Jacobian factor (roughly `r * dr * dθ`) + and a smooth cutoff function `χ(ρ)` that localizes the singular correction. + + Map the polar correction back onto the Cartesian patch via `P2G` and blend with the + far-field trapezoid contribution using `Gpou`, so the combined weight is effectively + `trap*(1-χ) + singular_correction`. 
- - `PATCH_RAD`: Number of points adjacent to source point to treat as singular +## Fused projection and threading - + Total patch size in # of gridpoints = (2 * PATCH_RAD + 1) x (2 * PATCH_RAD + 1) +This function is written to be parallel over observer points: - - `RAD_DIM`: Polar radial quadrature order. Angular order = 2 * RAD_DIM + - Each thread owns a `KernelWorkspace` (scratch arrays for patch extraction, polar interpolation, + and temporary kernel values), plus per-thread accumulation buffers `proj_k` / `proj_g` + (length `P`) and a boolean `is_patch` mask to skip patch indices in the far-field loop. - - `INTERP_ORDER`: Lagrange interpolation order + - For a given observer index `idx_obs`, the code accumulates the **projected row** + `(kernel row idx_obs) · Z` directly into `proj_k` / `proj_g` using `_accum_row!`, and then writes + these into shared buffers `KZt[:, idx_obs]` and `GZt[:, idx_obs]`. This is race-free because + each observer writes to a unique column. - + Must be ≤ (2 * PATCH_RAD + 1) + - After the threaded loop completes, the final `P×P` blocks are assembled efficiently with BLAS: -# Threading + Kc = Zᴴ * transpose(KZt) + Gc = Zᴴ * transpose(GZt) -This function automatically uses all available threads (`Threads.nthreads()`). -Start Julia with `julia -t auto` or set `JULIA_NUM_THREADS` to enable multi-threading. + implemented as `mul!(Kc_block, Z', transpose(KZt))` (and similarly for `Gc`). + +Normalization by `2π` is applied to match the 2D kernel convention so the downstream “add identity” +logic is consistent between 2D/3D. + +## Important parameters + + - `PATCH_RAD`: half-width of the singular patch in grid points. Must satisfy `PATCH_RAD ≤ (min(source.mtheta, source.nzeta) - 1) ÷ 2` to avoid errors. + - `RAD_DIM`: radial quadrature order on the polar grid (angular order is `2*RAD_DIM`). + - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`). 
+ +## Performance notes / numerical optimizations + + - **Cached quadrature data**: `get_singular_quadrature` memoizes `P2G`, `Gpou`, `Ppou`, etc. for a given + `(PATCH_RAD, RAD_DIM, INTERP_ORDER)` triple to avoid expensive rebuilds. + - **Allocation control**: all near-field arrays live in thread-local `KernelWorkspace` objects; no per-observer + heap allocation is intended in the hot path. + - **Scalar kernel evaluation**: the Laplace kernels have scalar-argument overloads to avoid view/slice creation + and to enable LLVM to keep values in registers. """ function compute_3D_kernel_matrices!( - grad_greenfunction::AbstractMatrix{Float64}, - greenfunction::AbstractMatrix{Float64}, + K::AbstractMatrix{ComplexF64}, + G::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry3D,WallGeometry3D}, source::Union{PlasmaGeometry3D,WallGeometry3D}, PATCH_RAD::Int, RAD_DIM::Int, - INTERP_ORDER::Int + INTERP_ORDER::Int, + Z::AbstractMatrix{ComplexF64}, + Gram::AbstractMatrix{ComplexF64} ) - num_points = observer.mtheta * observer.nzeta - dθdζ = 4π^2 / (num_points) + N, P = size(Z) # N = mtheta * nzeta, P = num_modes + dθdζ = 4π^2 / N + Zt = Matrix{ComplexF64}(transpose(Z)) # [P × M] for contiguous column access - # Get block of grad green function matrix + # Take a view of the corresponding block of the K and G matrices col_index = (source isa PlasmaGeometry3D ? 1 : 2) row_index = (observer isa PlasmaGeometry3D ? 1 : 2) - grad_greenfunction_block = view( - grad_greenfunction, - ((row_index-1)*num_points+1):(row_index*num_points), - ((col_index-1)*num_points+1):(col_index*num_points) - ) + K_block = view(K, ((row_index-1)*P+1):(row_index*P), ((col_index-1)*P+1):(col_index*P)) + G_block = view(G, ((row_index-1)*P+1):(row_index*P), :) - # Zero out green function matrix - fill!(greenfunction, 0.0) - # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997) + # G only needed for plasma as source term (RHS of eqs. 
26/27 in Chance 1997) populate_greenfunction = source isa PlasmaGeometry3D # Initialize quadrature data @@ -470,16 +530,30 @@ function compute_3D_kernel_matrices!( @assert observer.mtheta ≥ PATCH_DIM "Must have observer.mtheta ≥ PATCH_DIM, got observer.mtheta=$(observer.mtheta), PATCH_DIM=$PATCH_DIM" @assert observer.nzeta ≥ PATCH_DIM "Must have observer.nzeta ≥ PATCH_DIM, got observer.nzeta=$(observer.nzeta), PATCH_DIM=$PATCH_DIM" + # Buffers for the projection: column idx_obs holds (kernel row idx_obs) · Z + KZt = zeros(ComplexF64, P, N) + GZt = zeros(ComplexF64, P, N) + # Allocate thread-local workspaces (one per thread) max_threadid = Threads.maxthreadid() workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_threadid] + proj_k_all = [zeros(ComplexF64, P) for _ in 1:max_threadid] + proj_g_all = [zeros(ComplexF64, P) for _ in 1:max_threadid] + is_patch_all = [falses(N) for _ in 1:max_threadid] # Parallel loop through observer points - Threads.@threads for idx_obs in 1:num_points + Threads.@threads for idx_obs in 1:N # Get thread-local workspace ws = workspaces[Threads.threadid()] (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar, n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws + proj_k = proj_k_all[Threads.threadid()] + proj_g = proj_g_all[Threads.threadid()] + is_patch = is_patch_all[Threads.threadid()] + + fill!(proj_k, 0.0) + fill!(proj_g, 0.0) + fill!(is_patch, false) # Convert linear index to 2D indices i_obs = mod1(idx_obs, observer.mtheta) @@ -488,21 +562,36 @@ function compute_3D_kernel_matrices!( @inbounds oy = observer.r[idx_obs, 2] @inbounds oz = observer.r[idx_obs, 3] + # Mark patch source indices so the far-field loop can skip them + @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM + idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta) + idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta) + is_patch[idx_pol+source.mtheta*(idx_tor-1)] = true 
+ end + # ============================================================ # FAR FIELD: Trapezoidal rule for nonsingular source points # Note: kernels return zero for r_src = r_obs # ============================================================ - @inbounds for idx_src in 1:num_points - sx = source.r[idx_src, 1]; - sy = source.r[idx_src, 2]; - sz = source.r[idx_src, 3] - nx = source.normal[idx_src, 1]; - ny = source.normal[idx_src, 2]; - nz = source.normal[idx_src, 3] - # Apply weights (periodic trapezoidal rule = constant weights) - grad_greenfunction_block[idx_obs, idx_src] = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ + @inbounds for idx_src in 1:N + is_patch[idx_src] && continue + w_double = + laplace_double_layer( + ox, + oy, + oz, + source.r[idx_src, 1], + source.r[idx_src, 2], + source.r[idx_src, 3], + source.normal[idx_src, 1], + source.normal[idx_src, 2], + source.normal[idx_src, 3] + ) * dθdζ + _accum_row!(proj_k, w_double, Zt, idx_src) + if populate_greenfunction - greenfunction[idx_obs, idx_src] = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ + w_single = laplace_single_layer(ox, oy, oz, source.r[idx_src, 1], source.r[idx_src, 2], source.r[idx_src, 3]) * dθdζ + _accum_row!(proj_g, w_single, Zt, idx_src) end end @@ -525,11 +614,11 @@ function compute_3D_kernel_matrices!( # Evaluate kernels at polar points with POU weighting @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already - rsx = r_polar[ir, ia, 1]; - rsy = r_polar[ir, ia, 2]; + rsx = r_polar[ir, ia, 1] + rsy = r_polar[ir, ia, 2] rsz = r_polar[ir, ia, 3] - nsx = n_polar[ir, ia, 1]; - nsy = n_polar[ir, ia, 2]; + nsx = n_polar[ir, ia, 1] + nsy = n_polar[ir, ia, 2] nsz = n_polar[ir, ia, 3] M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * 
dθdζ @@ -550,49 +639,74 @@ function compute_3D_kernel_matrices!( idx_tor = periodic_wrap(j_obs - PATCH_RAD + j - 1, source.nzeta) idx_src = idx_pol + source.mtheta * (idx_tor - 1) - trap_double = grad_greenfunction_block[idx_obs, idx_src] - grad_greenfunction_block[idx_obs, idx_src] = trap_double + M_grid_double[i, j] + trap_double * Gpou[i, j] + sx = source.r[idx_src, 1] + sy = source.r[idx_src, 2] + sz = source.r[idx_src, 3] + nx = source.normal[idx_src, 1] + ny = source.normal[idx_src, 2] + nz = source.normal[idx_src, 3] + + far_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[i, j]) * dθdζ + _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src) # Apply near + far contributions if populate_greenfunction - trap_single = greenfunction[idx_obs, idx_src] - greenfunction[idx_obs, idx_src] = trap_single + M_grid_single[i, j] + trap_single * Gpou[i, j] + far_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * (1.0 + Gpou[i, j]) * dθdζ + _accum_row!(proj_g, M_grid_single[i, j] + far_single, Zt, idx_src) + end + end + + # ── Write projected column to buffer (each idx_obs owns its column) ── + @inbounds for p in 1:P + KZt[p, idx_obs] = proj_k[p] + end + if populate_greenfunction + @inbounds for p in 1:P + GZt[p, idx_obs] = proj_g[p] end end end # Use the same normalization as in the 2D kernel so we can just add I to the diagonal # This makes the grri logic identical to the 2D kernel. 
- grad_greenfunction_block ./= 2π - greenfunction ./= 2π + mul!(K_block, Z', transpose(KZt)) + K_block ./= 2π + if populate_greenfunction + mul!(G_block, Z', transpose(GZt)) + G_block ./= 2π + end # Add the term that comes from the volume integral of Green's identity - typeof(source) == typeof(observer) && begin - for i in 1:num_points - grad_greenfunction_block[i, i] += 1.0 - end + if typeof(source) == typeof(observer) + K_block .+= Gram end end """ - kernel!(grad_greenfunction, greenfunction, observer, source, params::KernelParams3D) + kernel!(Kc, Gc, observer, source, params::KernelParams3D, Z, Gram) + +Public 3D kernel entry point. Forwards to: -Dispatch wrapper for 3D kernel that forwards to `compute_3D_kernel_matrices!` with params. +`compute_3D_kernel_matrices!(Kc, Gc, observer, source, params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER, Z, Gram)`. """ function kernel!( - grad_greenfunction::AbstractMatrix{Float64}, - greenfunction::AbstractMatrix{Float64}, + Kc::AbstractMatrix{ComplexF64}, + Gc::AbstractMatrix{ComplexF64}, observer::Union{PlasmaGeometry3D,WallGeometry3D}, source::Union{PlasmaGeometry3D,WallGeometry3D}, - params::KernelParams3D + params::KernelParams3D, + Z::AbstractMatrix{ComplexF64}, + Gram::AbstractMatrix{ComplexF64} ) return compute_3D_kernel_matrices!( - grad_greenfunction, - greenfunction, + Kc, + Gc, observer, source, params.PATCH_RAD, params.RAD_DIM, - params.INTERP_ORDER + params.INTERP_ORDER, + Z, + Gram ) end diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl deleted file mode 100644 index 5c9c5369..00000000 --- a/src/Vacuum/ProjectedKernel.jl +++ /dev/null @@ -1,404 +0,0 @@ -# Fused kernel assembly + Fourier projection for Galerkin vacuum solve. -# -# Instead of materializing the full M×M kernel matrices and then projecting, -# these functions accumulate the P×P projected matrices row by row as the -# kernel values are computed, reducing memory from O(M²) to O(MP). 
-# -# K_c = Z^H K Z and G_c = Z^H G Z -# -# where Z = C + iS is the [M × P] complex Fourier basis, K is the double-layer -# kernel, and G is the single-layer kernel. For each observer point j, the -# kernel row is projected and accumulated via rank-1 updates: -# -# K_c += conj(Z[j,:]) ⊗ (K[j,:] · Z) -# -# FLOP cost is identical to the two-step approach O(M²P), but memory drops -# from O(M²) to O(MP + P²). - -# ============================================================================ -# 2D fused projected kernel -# ============================================================================ -""" - kernel!(K_c, G_c, observer, source, params, exp_mn_basis, Gram) - -Compute the Fourier-projected kernel matrices K_c = Z^H K Z and G_c = Z^H G Z -directly, without materializing the full M×M kernel matrices. - -Dispatches to the 2D or 3D implementation based on the geometry/params types. - -# Arguments - - - `K_c::Matrix{ComplexF64}`: Output P×P projected double-layer kernel [filled in-place] - - `G_c::Matrix{ComplexF64}`: Output P×P projected single-layer kernel [filled in-place] - - `observer`: Observer geometry struct - - `source`: Source geometry struct - - `params`: Kernel parameters (KernelParams2D or KernelParams3D) - - `exp_mn_basis::Matrix{ComplexF64}`: [M × P] complex Fourier basis Z = exp(i(mθ − nζ)) - - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term) -""" -# function kernel!( -# K_c::AbstractMatrix{ComplexF64}, -# G_c::AbstractMatrix{ComplexF64}, -# observer::Union{PlasmaGeometry,WallGeometry}, -# source::Union{PlasmaGeometry,WallGeometry}, -# params::KernelParams2D, -# exp_mn_basis::AbstractMatrix{ComplexF64}, -# Gram::AbstractMatrix{ComplexF64} -# ) -# _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram) -# end - -function kernel!( - K_c::AbstractMatrix{ComplexF64}, - G_c::AbstractMatrix{ComplexF64}, - observer::Union{PlasmaGeometry3D,WallGeometry3D}, - 
source::Union{PlasmaGeometry3D,WallGeometry3D}, - params::KernelParams3D, - exp_mn_basis::AbstractMatrix{ComplexF64}, - Gram::AbstractMatrix{ComplexF64} -) - _projected_kernel_3D!(K_c, G_c, observer, source, - params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER, - exp_mn_basis, Gram) -end -""" - _projected_kernel_2D!(K_c, G_c, observer, source, n, exp_mn_basis, Gram) - -Fused 2D kernel assembly + projection. Mirrors the loop structure of -`compute_2D_kernel_matrices!` but accumulates rank-1 contributions into the -P×P projected matrices instead of filling the M×M kernel matrices. - -Memory: O(MP) instead of O(M²). -""" -@with_pool pool function _projected_kernel_2D!( - K_c::AbstractMatrix{ComplexF64}, - G_c::AbstractMatrix{ComplexF64}, - observer::Union{PlasmaGeometry,WallGeometry}, - source::Union{PlasmaGeometry,WallGeometry}, - n::Int, - exp_mn_basis::AbstractMatrix{ComplexF64}, - Gram::AbstractMatrix{ComplexF64} -) - M, P = size(exp_mn_basis) - Z = exp_mn_basis - Zt = Matrix{ComplexF64}(transpose(Z)) # [P × M] for contiguous column access - mtheta = length(observer.x) - dtheta = 2π / mtheta - theta_grid = range(; start=0, length=mtheta, step=dtheta) - - # Take a view of the corresponding block of the K_c and G_c matrices - col_idx = (source isa PlasmaGeometry ? 1 : 2) - row_idx = (observer isa PlasmaGeometry ? 1 : 2) - K_c_block = view(K_c, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P)) - G_c_block = view(G_c, ((row_idx-1)*P+1):(row_idx*P), :) - - populate_greenfunction = source isa PlasmaGeometry - - # S₁ᵢ logarithmic correction factors [Chance Phys. Plasmas 1997 2161 eq. 
78] - log_correction_0 = 16.0 * dtheta * (log(2 * dtheta) - 68.0 / 15.0) / 15.0 - log_correction_1 = 128.0 * dtheta * (log(2 * dtheta) - 8.0 / 15.0) / 45.0 - log_correction_2 = 4.0 * dtheta * (7.0 * log(2 * dtheta) - 11.0 / 15.0) / 45.0 - log_correction_array = SVector(log_correction_2, log_correction_1, log_correction_0, log_correction_1, log_correction_2) - - gamma_prefactor = 2 * sqrt(π) * gamma(0.5 - n) - - spline_x = cubic_interp(theta_grid, source.x; bc=PeriodicBC(; endpoint=:exclusive, period=2π)) - spline_z = cubic_interp(theta_grid, source.z; bc=PeriodicBC(; endpoint=:exclusive, period=2π)) - d1_spline_x = deriv1(spline_x) - d1_spline_z = deriv1(spline_z) - - stencils_left, stencils_right = GL8_LAGRANGE_STENCILS - sing_idx = zeros!(pool, Int, 5) - - dx_dtheta_grid = acquire!(pool, eltype(source.x), mtheta) - dz_dtheta_grid = acquire!(pool, eltype(source.z), mtheta) - d1_spline_x(dx_dtheta_grid, theta_grid) - d1_spline_z(dz_dtheta_grid, theta_grid) - - # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition) - legendre_buf = Vector{Float64}(undef, n + 2) - - # Per-observer projection vectors (P-length complex): proj_z = (kernel row) · Z - proj_kz = zeros!(pool, ComplexF64, P) - proj_gz = zeros!(pool, ComplexF64, P) - - for j in 1:mtheta - x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j] - - fill!(proj_kz, 0.0) - fill!(proj_gz, 0.0) - diag_accum = 0.0 - - # ── Simpson integration for nonsingular source points ── - @inbounds for k in 1:(mtheta-3) - isrc = mod1(j + 1 + k, mtheta) - G_n, gradG_n, gradG_0 = green(x_obs, z_obs, - source.x[isrc], source.z[isrc], - dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf; - gamma_prefactor) - - wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 
4 : 2)) - - if populate_greenfunction - _accum_row!(proj_gz, G_n * wsimpson, Zt, isrc) - end - _accum_row!(proj_kz, gradG_n * wsimpson, Zt, isrc) - - diag_accum -= gradG_0 * wsimpson - end - - # ── Gaussian quadrature for singular points ── - for (offset_idx, offset) in enumerate(-2:2) - sing_idx[offset_idx] = mod1(j + offset + mtheta, mtheta) - end - - for leftpanel in (true, false) - gauss_mid = theta_obs + (leftpanel ? -dtheta : dtheta) - @inbounds for ig in 1:8 - theta_gauss = gauss_mid + GL8.x[ig] * dtheta - theta_gauss0 = mod(theta_gauss, 2π) - x_gauss = spline_x(theta_gauss0) - dx_dtheta_gauss = d1_spline_x(theta_gauss0) - z_gauss = spline_z(theta_gauss0) - dz_dtheta_gauss = d1_spline_z(theta_gauss0) - G_n, gradG_n, gradG_0 = green(x_obs, z_obs, - x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n, legendre_buf; - gamma_prefactor) - - s = leftpanel ? stencils_left[ig] : stencils_right[ig] - wgauss = GL8.w[ig] * dtheta - - if populate_greenfunction - if observer isa PlasmaGeometry - G_n += log((theta_obs - theta_gauss)^2) / x_obs - end - @inbounds for stencil_idx in 1:5 - _accum_row!(proj_gz, G_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx]) - end - end - - @inbounds for stencil_idx in 1:5 - _accum_row!(proj_kz, gradG_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx]) - end - - diag_accum -= gradG_0 * wgauss - end - end - - # Analytic singular integral correction [Chance 1997 eq. 75] - if populate_greenfunction && observer isa PlasmaGeometry - @inbounds for stencil_idx in 1:5 - _accum_row!(proj_gz, -log_correction_array[stencil_idx] / x_obs, Zt, sing_idx[stencil_idx]) - end - end - - # Fold diagonal accumulation into projection - _accum_row!(proj_kz, diag_accum, Zt, j) - - # ── Rank-1 accumulate: K_c += conj(Z[j,:]) ⊗ proj_kz ── - _rank1_conj!(K_c_block, Zt, j, proj_kz) - if populate_greenfunction - _rank1_conj!(G_c_block, Zt, j, proj_gz) - end - end - - # ── Post-processing (mirrors compute_2D_kernel_matrices!) 
── - - # Normals point out of vacuum for wall but inward for plasma → flip sign for plasma source - if source isa PlasmaGeometry - K_c_block .*= -1 - end - - # Diagonal residue: K += residue·I → K_c += residue·Gram - # [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89] - residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0) - if residue != 0.0 - K_c_block .+= residue .* Gram - end - - # 2π𝒢 → 𝒢 - if populate_greenfunction - G_c_block ./= 2π - end -end - - -# ============================================================================ -# 3D fused projected kernel -# ============================================================================ - -""" - _projected_kernel_3D!(K_c, G_c, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, exp_mn_basis, Gram) - -Fused 3D kernel assembly + projection. Mirrors the loop structure of -`compute_3D_kernel_matrices!` (including multi-threading and BIEST singular correction) -but writes projected P-vectors to per-observer rows of [M × P] buffers instead of -filling the M×M kernel matrices. The P×P assembly is done after the parallel loop -via sequential GEMM calls. - -Each observer writes to its own row of the shared buffers, so there are no -cross-thread accumulation races — the same write pattern as the original -`compute_3D_kernel_matrices!`. - -Memory: O(2MP + P²) instead of O(M²). 
-""" -function _projected_kernel_3D!( - K_c::AbstractMatrix{ComplexF64}, - G_c::AbstractMatrix{ComplexF64}, - observer::Union{PlasmaGeometry3D,WallGeometry3D}, - source::Union{PlasmaGeometry3D,WallGeometry3D}, - PATCH_RAD::Int, - RAD_DIM::Int, - INTERP_ORDER::Int, - exp_mn_basis::AbstractMatrix{ComplexF64}, - Gram::AbstractMatrix{ComplexF64} -) - M, P = size(exp_mn_basis) - Z = exp_mn_basis - Zt = Matrix{ComplexF64}(transpose(Z)) # [P × M] for contiguous column access - num_points = observer.mtheta * observer.nzeta - dθdζ = 4π^2 / num_points - - # Take a view of the corresponding block of the K_c and G_c matrices - col_idx = (source isa PlasmaGeometry3D ? 1 : 2) - row_idx = (observer isa PlasmaGeometry3D ? 1 : 2) - K_c_block = view(K_c, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P)) - G_c_block = view(G_c, ((row_idx-1)*P+1):(row_idx*P), :) - populate_greenfunction = source isa PlasmaGeometry3D - - if PATCH_RAD > (min(source.mtheta, source.nzeta) - 1) ÷ 2 - @warn "PATCH_RAD clamped in projected kernel" max_PATCH_RAD=(min(source.mtheta, source.nzeta) - 1) ÷ 2 - PATCH_RAD = (min(source.mtheta, source.nzeta) - 1) ÷ 2 - end - quad_data = get_singular_quadrature(PATCH_RAD, RAD_DIM, INTERP_ORDER) - (; PATCH_DIM, ANG_DIM, Ppou, Gpou, P2G) = quad_data - - # [P × M] buffers: column idx_obs holds (kernel row idx_obs) · Z - KZt = zeros(ComplexF64, P, M) - GZt = zeros(ComplexF64, P, M) - - # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors + patch mask) - max_tid = Threads.maxthreadid() - workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_tid] - proj_kz_all = [zeros(ComplexF64, P) for _ in 1:max_tid] - proj_gz_all = [zeros(ComplexF64, P) for _ in 1:max_tid] - is_patch_all = [falses(num_points) for _ in 1:max_tid] - - Threads.@threads :static for idx_obs in 1:num_points - tid = Threads.threadid() - ws = workspaces[tid] - (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar, - n_polar, 
M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws - - proj_kz = proj_kz_all[tid] - proj_gz = proj_gz_all[tid] - is_patch = is_patch_all[tid] - - fill!(proj_kz, 0.0) - fill!(proj_gz, 0.0) - fill!(is_patch, false) - - i_obs = mod1(idx_obs, observer.mtheta) - j_obs = (idx_obs - 1) ÷ observer.mtheta + 1 - @inbounds ox = observer.r[idx_obs, 1] - @inbounds oy = observer.r[idx_obs, 2] - @inbounds oz = observer.r[idx_obs, 3] - - # Mark patch source indices so the far-field loop can skip them - @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM - idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta) - idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta) - is_patch[idx_pol+source.mtheta*(idx_tor-1)] = true - end - - # ── FAR FIELD: Trapezoidal rule (skip patch — handled in POU correction) ── - @inbounds for idx_src in 1:num_points - is_patch[idx_src] && continue - sx = source.r[idx_src, 1]; - sy = source.r[idx_src, 2]; - sz = source.r[idx_src, 3] - nx = source.normal[idx_src, 1]; - ny = source.normal[idx_src, 2]; - nz = source.normal[idx_src, 3] - w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ - _accum_row!(proj_kz, w_double, Zt, idx_src) - - if populate_greenfunction - w_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ - _accum_row!(proj_gz, w_single, Zt, idx_src) - end - end - - # ── NEAR FIELD: Polar quadrature with BIEST singular correction ── - extract_patch!(r_patch, source.r, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM) - extract_patch!(dr_dθ_patch, source.dr_dθ, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM) - extract_patch!(dr_dζ_patch, source.dr_dζ, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM) - - interpolate_to_polar!(r_polar, r_patch, P2G) - interpolate_to_polar!(dr_dθ_polar, dr_dθ_patch, P2G) - interpolate_to_polar!(dr_dζ_polar, dr_dζ_patch, P2G) - - compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient) - - @inbounds for 
ia in 1:ANG_DIM, ir in 1:RAD_DIM - rsx = r_polar[ir, ia, 1]; - rsy = r_polar[ir, ia, 2]; - rsz = r_polar[ir, ia, 3] - nsx = n_polar[ir, ia, 1]; - nsy = n_polar[ir, ia, 2]; - nsz = n_polar[ir, ia, 3] - M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ - M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ - end - - mul!(M_grid_single_flat, P2G, vec(M_polar_single)) - mul!(M_grid_double_flat, P2G, vec(M_polar_double)) - M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM) - M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM) - - # POU correction: evaluate kernel once with combined weight (1+Gpou) = (1-χ) - # since far-field skipped patch points, we include the full trapezoidal + polar here - @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM - idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta) - idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta) - idx_src = idx_pol + source.mtheta * (idx_tor - 1) - - sx = source.r[idx_src, 1]; - sy = source.r[idx_src, 2]; - sz = source.r[idx_src, 3] - nx = source.normal[idx_src, 1]; - ny = source.normal[idx_src, 2]; - nz = source.normal[idx_src, 3] - full_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[ii, jj]) * dθdζ - _accum_row!(proj_kz, M_grid_double[ii, jj] + full_double, Zt, idx_src) - - if populate_greenfunction - full_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * (1.0 + Gpou[ii, jj]) * dθdζ - _accum_row!(proj_gz, M_grid_single[ii, jj] + full_single, Zt, idx_src) - end - end - - # ── Write projected column to buffer (each idx_obs owns its column) ── - @inbounds for p in 1:P - KZt[p, idx_obs] = proj_kz[p] - end - if populate_greenfunction - @inbounds for p in 1:P - GZt[p, idx_obs] = proj_gz[p] - end - end - end - - # ── Assemble P×P projected matrices: K_c = Z^H * KZt^T, G_c = Z^H * GZt^T ── - mul!(K_c_block, Z', transpose(KZt)) 
- K_c_block ./= 2π - if populate_greenfunction - mul!(G_c_block, Z', transpose(GZt)) - G_c_block ./= 2π - end - - # Diagonal: K += I → K_c += Gram [for same-type source/observer] - if typeof(source) == typeof(observer) - K_c_block .+= Gram - end -end diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 7d994531..398d0ff5 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -16,7 +16,6 @@ include("DataTypes.jl") include("PnQuadCache.jl") include("Kernel2D.jl") include("Kernel3D.jl") -include("ProjectedKernel.jl") include("Field.jl") export VacuumInput, WallShapeSettings @@ -124,9 +123,12 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC K_int = similar!(pool, K_ext) G_int = similar!(pool, G_ext) - # Fused projected kernel: compute Z^H K Z and Z^H G Z + # Fused projected kernel: compute Z^H K Z and Z^H G Z for all operator blocks + # Plasma-plasma block kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram) + # Wall-plasma, plasma-wall, wall-wall blocks if !wall.nowall + # Wall-plasma, plasma-wall, wall-wall blocks kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram) kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram) kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram) @@ -175,57 +177,8 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) end - """ - # ================================================================ - # Collocation approach: solve full physical-space system [M × M] - # Handles both no-wall and wall cases. 
- # ================================================================ - # Full-size kernel matrices - grad_green = zeros!(pool, num_points_total, num_points_total) - green_temp = zeros!(pool, num_points_surf, num_points_surf) - - kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams) - - # Project plasma→plasma Green's function to mode space: grre[1:M, 1:2P] = real/imag(G*Z) - mul!(temp, green_temp, exp_mn_basis) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - - if !wall.nowall - # Plasma–Wall block - kernel!(grad_green, green_temp, plasma_surf, wall, kparams) - # Wall–Wall block - kernel!(grad_green, green_temp, wall, wall, kparams) - # Wall–Plasma block - kernel!(grad_green, green_temp, wall, plasma_surf, kparams) - # Project obs=wall, src=plasma block to mode space - mul!(temp, green_temp, exp_mn_basis) - @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) - end - - # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1) - grri .= grre # start from same as exterior - grad_green_interior = similar!(pool, grad_green) - grad_green_interior .= grad_green - - # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel - F_ext = lu!(grad_green) - ldiv!(F_ext, grre) - # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal - grad_green_interior .*= -1 - for i in 1:num_points_total - grad_green_interior[i, i] += 2.0 - end - F_int = lu!(grad_green_interior) - ldiv!(F_int, grri) - - # wv = (4π²/M) · Z^H · grre_complex [Chance Phys. Plasmas 2007 052506 eq. 
115-118] - temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)])) - mul!(wv, exp_mn_basis', temp) - wv .*= (4π^2 / M) -""" + # Enforce symmetry in the vacuum response matrix if desired inputs.force_wv_symmetry && hermitianpart!(wv) if nzeta > 1 # 3D From 84542557e147691e05c905fdc4b428f090262ca7 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Tue, 17 Mar 2026 15:09:15 -0400 Subject: [PATCH 19/23] VACUUM - IMPROVEMENT - updating the main docstring --- src/Vacuum/Vacuum.jl | 85 ++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 26 deletions(-) diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 398d0ff5..b2ab97fd 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -23,42 +23,76 @@ export compute_vacuum_response, compute_vacuum_response!, compute_vacuum_field export extract_plasma_surface_at_psi """ - compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings) + _compute_vacuum_response_single!( + wv, grri_in, grre_in, plasma_pts, wall_pts, + inputs::VacuumInput, wall_settings::WallShapeSettings; + n_override=nothing + ) -Compute the vacuum response matrix and both Green's functions using provided vacuum inputs. +Compute a single vacuum solve (one coupled 3D solve, or one `n`-slice in 2D) by building and solving +the boundary integral equation in mode space with an optional conducting wall present and writing out the results: -Single entry point for vacuum calculations. 
+ - `wv`: complex vacuum response matrix in straight-fieldline mode space + - `grri_in`: interior Green's function sampled on the plasma surface in straight-fieldline mode space (real layout for backward compatibility) + - `grre_in`: exterior Green's function sampled on the plasma surface in straight-fieldline mode space (real layout for backward compatibility) + - `plasma_pts` / `wall_pts`: output point clouds for downstream plotting/diagnostics - - For **3D** (`inputs.nzeta > 1`), computes the full coupled response across all (m,n) modes defined - by `inputs.(mlow, mpert, nlow, npert)`. +## Fused kernel assembly + projection - - For **2D geometry** (`inputs.nzeta == 1`), supports either: +This routine uses a **newer kernel evaluation path** that never forms dense point-space kernel matrices. +Instead, it fuses kernel evaluation and Fourier/Galerkin projection into a single pass. - + **single-n** (`inputs.npert == 1`): computes (m,n) response for `n = inputs.nlow` - + **multi-n** (`inputs.npert > 1`): loops over `n = inputs.nlow:(inputs.nlow+inputs.npert-1)` and returns - **blocks** of the full response matrices with one block per toroidal mode number. +The key idea is: -This is the pure Julia implementation that replaces the Fortran `mscvac` function. -It computes both interior (grri) and exterior (grre) Green's functions for GPEC response calculations. + - Assemble and solve the boundary integral equation directly in `P×P` mode space. -# Arguments + - Avoid materializing `M×M` (2D) or `N×N` (3D) kernel matrices. - - `inputs`: `VacuumInput` struct with mode numbers, grid resolution, and boundary info. - - `wall_settings::WallShapeSettings`: Wall geometry configuration. + - Uses complex basis `Z = C + iS` so projected operators are `P×P` complex. -# Returns + - The projected operators are accumulated row-by-row while kernel values are computed. + + - Memory drops from `O(M^2)` (or `O(N^2)`) down to `O(MP + P^2)` (or `O(NP + P^2)`). 
+ + - FLOPs remain dominated by the same scaling as the two-step approach (kernel evaluation + projection), + plus an additional `O(P^3)` for the LU factorization/solve in mode space. + + - **Projected matrices** + + + Exterior projected kernel blocks are assembled into `K_ext` and `G_ext`. + + Interior operators are formed from the exterior ones using the discrete Green-identity diagonal term: + the implementation uses `K_int = 2*Gram - K_ext` for same-type source/observer blocks. This effectively + computes the kernel with a negative normal direction without recalculating the kernel. + + - **Solves** - - `wv`: Complex vacuum response matrix. + + If `nowall`, solve the plasma-only `P×P` system. + + If a wall is present, solve the coupled `2P×2P` block system. - + 2D single-n: `mpert × mpert` - + 2D multi-n: `(mpert*npert) × (mpert*npert)` (block diagonal) - + 3D: `num_modes × num_modes` (full coupled) + - **Back-compat outputs** - - `grri`: Interior Green's function matrix. + + Although the solve is performed in mode space, `grri_in` and `grre_in` are reconstructed into the + legacy real `M×(2P)` layout for downstream code paths that still expect that shape. - - `grre`: Exterior Green's function matrix. +## Arguments - - `xzpts`: Coordinate array (mtheta×4 for 2D, mtheta*nzeta×4 for 3D) [R_plasma, Z_plasma, R_wall, Z_wall].
+ - **`wv::AbstractMatrix{ComplexF64}`**: output vacuum response matrix (modified in-place) + - **`grri_in::AbstractMatrix{Float64}`**: output interior Green's function (modified in-place; real/legacy layout) + - **`grre_in::AbstractMatrix{Float64}`**: output exterior Green's function (modified in-place; real/legacy layout) + - **`plasma_pts::AbstractMatrix{Float64}`**: plasma surface coordinates (modified in-place) + - **`wall_pts::AbstractMatrix{Float64}`**: wall surface coordinates (modified in-place) + - **`inputs::VacuumInput`**: mode ranges, grid resolution, and geometry settings + - **`wall_settings::WallShapeSettings`**: wall geometry configuration + - **`n_override::Union{Nothing,Int}`**: optional toroidal mode number override (only used for 2D) + +## 2D vs 3D behavior + + - **3D (`inputs.nzeta > 1`)**: computes the full coupled response across all `(m, n)` modes specified by + `inputs.(mlow, mpert, nlow, npert)` in a single call using the 3D kernel method in Kernel3D.jl. + - **2D (`inputs.nzeta == 1`)**: + + If `inputs.npert == 1`, computes the response for `n = inputs.nlow` using the 2D kernel method in Kernel2D.jl. + + If `inputs.npert > 1`, the public driver loops over `n` and calls this function once per `n`, + writing block columns into the full output matrices using the 2D kernel method in Kernel2D.jl. """ @with_pool pool function _compute_vacuum_response_single!( wv::AbstractMatrix{ComplexF64}, @@ -92,10 +126,6 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # Active rows for computation (plasma only if no wall, plasma+wall if wall present) num_points_total = wall.nowall ? 
num_points_surf : 2 * num_points_surf - # Views into output Green's function matrices for the active rows/columns - grre = @view grre_in[1:num_points_total, :] - grri = @view grri_in[1:num_points_total, :] - # Complex buffer for projecting to mode space (G*Z) and back; grre/grri stay real for backwards compatibility M = num_points_surf P = num_modes @@ -163,6 +193,9 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC # Need to convert mode space to physical space and unpack the real and imaginary parts # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code # perhaps make it a complex P * P matrix? Then don't need any of this section + # Views into output Green's function matrices for the active rows/columns + grre = @view grre_in[1:num_points_total, :] + grri = @view grri_in[1:num_points_total, :] mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) @view(grre[1:M, 1:P]) .= real.(temp) @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) From c719a6b0d7d104d6e43f37542b52a588b7e87b18 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Tue, 17 Mar 2026 15:26:46 -0400 Subject: [PATCH 20/23] VACUUM - IMPROVEMENT - adding back in the logic that only allotes the full matrices if a wall is present --- src/Vacuum/Vacuum.jl | 90 +++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index b2ab97fd..400dacc5 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -114,7 +114,7 @@ The key idea is: # Compute Fourier basis coefficients ν = hasproperty(plasma_surf, :ν) ? 
plasma_surf.ν : nothing exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν) - num_points_surf, num_modes = size(exp_mn_basis) + num_points, num_modes = size(exp_mn_basis) # Create kernel parameters structs used to dispatch to the correct kernel # Hardcode these values for now - can expose to the user in the future @@ -123,33 +123,16 @@ The key idea is: INTERP_ORDER = 5 kparams = nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override) - # Active rows for computation (plasma only if no wall, plasma+wall if wall present) - num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf - - # Complex buffer for projecting to mode space (G*Z) and back; grre/grri stay real for backwards compatibility - M = num_points_surf - P = num_modes - temp = zeros!(pool, ComplexF64, M, P) - - # ================================================================ - # Galerkin: solve system in P×P mode space. Uses complex basis - # Z = C + iS so projected matrices are P×P complex. - # - # Fused (fuse_projection=true): kernel assembly + Fourier projection - # in one pass. The full M×M kernel matrices are never materialized — - # instead the P×P projected matrices grad_green_fourier and G_c are - # accumulated row by row as kernel values are computed. - # Memory: O(MP + P²) instead of O(M²) - # - # FLOPs: O(M²P + P³) - # ================================================================ + # Scales kernel matrix sizes by a factor of 2 if a wall is present (don't allocate unless needed) + wall_fac = wall.nowall ? 1 : 2 + # Gram matrix required by projected_kernel! 
for the diagonal residue and for interior solve - Gram = zeros!(pool, ComplexF64, P, P) + Gram = zeros!(pool, ComplexF64, num_modes, num_modes) mul!(Gram, exp_mn_basis', exp_mn_basis) # Projected kernel matrices [P × P complex] - K_ext = zeros!(pool, ComplexF64, 2P, 2P) - G_ext = zeros!(pool, ComplexF64, 2P, P) + K_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, wall_fac * num_modes) + G_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, num_modes) K_int = similar!(pool, K_ext) G_int = similar!(pool, G_ext) @@ -166,54 +149,49 @@ The key idea is: # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext K_int .= -K_ext - K_int[1:P, 1:P] .+= 2 .* Gram + K_int[1:num_modes, 1:num_modes] .+= 2 .* Gram if !wall.nowall - K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram + K_int[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)] .+= 2 .* Gram end G_int .= G_ext # Solve projected BIEs for exterior and interior kernels - if wall.nowall - F_ext = lu!(K_ext[1:P, 1:P]) - ldiv!(F_ext, @view(G_ext[1:P, :])) - F_int = lu!(K_int[1:P, 1:P]) - ldiv!(F_int, @view(G_int[1:P, :])) - else - F_ext = lu!(K_ext) - ldiv!(F_ext, G_ext) - F_int = lu!(K_int) - ldiv!(F_int, G_int) - end + F_ext = lu!(K_ext) + ldiv!(F_ext, G_ext) + F_int = lu!(K_int) + ldiv!(F_int, G_int) # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G - mul!(wv, Gram, view(G_ext, 1:P, :)) - wv .*= (4π^2 / M) + mul!(wv, Gram, view(G_ext, 1:num_modes, :)) + wv .*= (4π^2 / num_points) + + # Enforce Hermitian symmetry if desired + inputs.force_wv_symmetry && hermitianpart!(wv) # Backward-compatible reconstruction: grre/grri in M×2P real layout # Need to convert mode space to physical space and unpack the real and imaginary parts # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code # perhaps make it a complex P * P matrix? 
Then don't need any of this section # Views into output Green's function matrices for the active rows/columns - grre = @view grre_in[1:num_points_total, :] - grri = @view grri_in[1:num_points_total, :] - mul!(temp, exp_mn_basis, view(G_ext, 1:P, :)) - @view(grre[1:M, 1:P]) .= real.(temp) - @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, view(G_int, 1:P, :)) - @view(grri[1:M, 1:P]) .= real.(temp) - @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp) + grre = @view grre_in[1:(wall_fac*num_points), :] + grri = @view grri_in[1:(wall_fac*num_points), :] + temp = zeros!(pool, ComplexF64, num_points, num_modes) + + mul!(temp, exp_mn_basis, view(G_ext, 1:num_modes, :)) + @view(grre[1:num_points, 1:num_modes]) .= real.(temp) + @view(grre[1:num_points, (num_modes+1):(2*num_modes)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, 1:num_modes, :)) + @view(grri[1:num_points, 1:num_modes]) .= real.(temp) + @view(grri[1:num_points, (num_modes+1):(2*num_modes)]) .= imag.(temp) if !wall.nowall - mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :)) - @view(grre[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) - mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :)) - @view(grri[(M+1):(2*M), 1:P]) .= real.(temp) - @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_ext, (num_modes+1):(2*num_modes), :)) + @view(grre[(num_points+1):(2*num_points), 1:num_modes]) .= real.(temp) + @view(grre[(num_points+1):(2*num_points), (num_modes+1):(2*num_modes)]) .= imag.(temp) + mul!(temp, exp_mn_basis, view(G_int, (num_modes+1):(2*num_modes), :)) + @view(grri[(num_points+1):(2*num_points), 1:num_modes]) .= real.(temp) + @view(grri[(num_points+1):(2*num_points), (num_modes+1):(2*num_modes)]) .= imag.(temp) end - # Enforce symmetry in the vacuum response matrix if desired - inputs.force_wv_symmetry && hermitianpart!(wv) - if nzeta > 1 # 3D plasma_pts .= plasma_surf.r wall_pts .= wall.r From 
4c001f46432b71f3ff997be6c72277d521104445 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Wed, 18 Mar 2026 10:02:02 -0400 Subject: [PATCH 21/23] VACUUM - IMPROVEMENT - using a vector for the diagonal of the gram matrix instead of a dense matrix --- src/Vacuum/Kernel2D.jl | 12 +++++++----- src/Vacuum/Kernel3D.jl | 35 +++++++++++++++++------------------ src/Vacuum/Vacuum.jl | 26 ++++++++++++++++++++------ test/runtests_vacuum.jl | 3 ++- 4 files changed, 46 insertions(+), 30 deletions(-) diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl index 49772e96..17748804 100644 --- a/src/Vacuum/Kernel2D.jl +++ b/src/Vacuum/Kernel2D.jl @@ -100,7 +100,7 @@ mathematical discretization. - `source`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`. - `n`: Integer representing the order of the toroidal Fourier component. - `Z`: Complex Fourier basis sampled on the `θ` grid. - - `Gram`: Mode-space Gram matrix for this basis on the discrete grid. + - `Gram`: Diagonal of the mode-space Gram matrix for this basis on the discrete grid. ## Block layout @@ -163,7 +163,7 @@ This routine is intentionally written to be allocation-light in tight loops: source::Union{PlasmaGeometry,WallGeometry}, n::Int, Z::AbstractMatrix{ComplexF64}, - Gram::AbstractMatrix{ComplexF64} + Gram::AbstractVector{ComplexF64} ) M, P = size(Z) # M = mtheta, P = num_modes @@ -316,9 +316,11 @@ This routine is intentionally written to be allocation-light in tight loops: end # Add analytic singular integral (second type) to block diagonal [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89] - # The Gram matrix is a result of the projection onto a scalar, Z⋅Zᵀ * residue + # The residue term is diagonal in mode space and is scaled by the Gram diagonal. residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 
2.0 : -2.0) - Kc_block .+= residue .* Gram + @inbounds for p in 1:P + Kc_block[p, p] += residue * Gram[p] + end # Since we computed 2π𝒢, divide by 2π to get 𝒢 if populate_greenfunction @@ -339,7 +341,7 @@ function kernel!( source::Union{PlasmaGeometry,WallGeometry}, params::KernelParams2D, Z::AbstractMatrix{ComplexF64}, - Gram::AbstractMatrix{ComplexF64} + Gram::AbstractVector{ComplexF64} ) return compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram) end diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl index e066e788..d7abd667 100644 --- a/src/Vacuum/Kernel3D.jl +++ b/src/Vacuum/Kernel3D.jl @@ -427,7 +427,7 @@ near-field correction for the singular region. - `RAD_DIM`: Radial quadrature order on the polar grid (angular order is `2*RAD_DIM`). - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`). - `Z`: Complex Fourier basis sampled on the surface grid, shaped `N×P` (`P = number of retained modes`). `Z[idx, :]` contains the basis values at the surface node `idx`. - - `Gram`: Mode-space Gram matrix used to add the analytic “identity” term when `typeof(source) == typeof(observer)` (i.e. the same operator block that receives the Green’s-identity diagonal contribution). + - `Gram`: Diagonal of the mode-space Gram matrix used to add the analytic “identity” term when `typeof(source) == typeof(observer)` (i.e. the same operator block that receives the Green’s-identity diagonal contribution). This routine fills exactly one `P×P` block view `Kc_block` (and optionally the corresponding `Gc_block`) selected by whether observer/source are plasma or wall. 
@@ -504,7 +504,7 @@ function compute_3D_kernel_matrices!( RAD_DIM::Int, INTERP_ORDER::Int, Z::AbstractMatrix{ComplexF64}, - Gram::AbstractMatrix{ComplexF64} + Gram::AbstractVector{ComplexF64} ) N, P = size(Z) # N = mtheta * nzeta, P = num_modes dθdζ = 4π^2 / N @@ -575,22 +575,19 @@ function compute_3D_kernel_matrices!( # ============================================================ @inbounds for idx_src in 1:N is_patch[idx_src] && continue - w_double = - laplace_double_layer( - ox, - oy, - oz, - source.r[idx_src, 1], - source.r[idx_src, 2], - source.r[idx_src, 3], - source.normal[idx_src, 1], - source.normal[idx_src, 2], - source.normal[idx_src, 3] - ) * dθdζ + + sx = source.r[idx_src, 1] + sy = source.r[idx_src, 2] + sz = source.r[idx_src, 3] + nx = source.normal[idx_src, 1] + ny = source.normal[idx_src, 2] + nz = source.normal[idx_src, 3] + + w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ _accum_row!(proj_k, w_double, Zt, idx_src) if populate_greenfunction - w_single = laplace_single_layer(ox, oy, oz, source.r[idx_src, 1], source.r[idx_src, 2], source.r[idx_src, 3]) * dθdζ + w_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ _accum_row!(proj_g, w_single, Zt, idx_src) end end @@ -676,9 +673,11 @@ function compute_3D_kernel_matrices!( G_block ./= 2π end - # Add the term that comes from the volume integral of Green's identity + # Add the term that comes from the volume integral of Green's identity. 
if typeof(source) == typeof(observer) - K_block .+= Gram + @inbounds for p in 1:P + K_block[p, p] += Gram[p] + end end end @@ -696,7 +695,7 @@ function kernel!( source::Union{PlasmaGeometry3D,WallGeometry3D}, params::KernelParams3D, Z::AbstractMatrix{ComplexF64}, - Gram::AbstractMatrix{ComplexF64} + Gram::AbstractVector{ComplexF64} ) return compute_3D_kernel_matrices!( Kc, diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl index 400dacc5..72ad0ad4 100644 --- a/src/Vacuum/Vacuum.jl +++ b/src/Vacuum/Vacuum.jl @@ -126,9 +126,16 @@ The key idea is: # Scales kernel matrix sizes by a factor of 2 if a wall is present (don't allocate unless needed) wall_fac = wall.nowall ? 1 : 2 - # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve - Gram = zeros!(pool, ComplexF64, num_modes, num_modes) - mul!(Gram, exp_mn_basis', exp_mn_basis) + # Gram matrix diagonal for the discrete Fourier basis on the uniform grid. + # + # For the basis produced by `compute_fourier_coefficients(...)` (complex exponentials sampled on a + # uniform grid), the discrete inner products satisfy: + # + # ZᴴZ = num_points · I + # + # up to roundoff, so the Gram matrix is diagonal. Store only the diagonal as a length-P vector. 
+ Gram = acquire!(pool, ComplexF64, num_modes) + fill!(Gram, ComplexF64(num_points)) # Projected kernel matrices [P × P complex] K_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, wall_fac * num_modes) @@ -149,9 +156,13 @@ The key idea is: # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext K_int .= -K_ext - K_int[1:num_modes, 1:num_modes] .+= 2 .* Gram + @inbounds for p in 1:num_modes + K_int[p, p] += 2 * Gram[p] + end if !wall.nowall - K_int[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)] .+= 2 .* Gram + @inbounds for p in 1:num_modes + K_int[num_modes+p, num_modes+p] += 2 * Gram[p] + end end G_int .= G_ext @@ -162,7 +173,10 @@ The key idea is: ldiv!(F_int, G_int) # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G - mul!(wv, Gram, view(G_ext, 1:num_modes, :)) + wv .= view(G_ext, 1:num_modes, :) + @inbounds for p in 1:num_modes + @views wv[p, :] .*= Gram[p] + end wv .*= (4π^2 / num_points) # Enforce Hermitian symmetry if desired diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl index 472c6c68..3ae4b648 100644 --- a/test/runtests_vacuum.jl +++ b/test/runtests_vacuum.jl @@ -1,4 +1,5 @@ @testset "Vacuum.jl Unit Tests" begin + using LinearAlgebra @testset "Vacuum.jl (2D)" begin @@ -481,7 +482,7 @@ ν=plasma_surf.ν ) M, P = size(exp_mn_basis) - Gram = exp_mn_basis' * exp_mn_basis + Gram = fill(ComplexF64(M), P) # --- Two-step Galerkin: materialize full kernels then project --- grad_green_full = zeros(Float64, 2M, 2M) From 46f24d9548b4ad10800f7173886dfa16eb1e76d9 Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Wed, 18 Mar 2026 10:17:39 -0400 Subject: [PATCH 22/23] VACUUM - IMPROVEMENT - merging the single and double layer kernels to reduce extra computations, reduces runtime by around 10% for the 3D solovev example with wall --- src/Vacuum/Kernel3D.jl | 61 +++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/src/Vacuum/Kernel3D.jl 
b/src/Vacuum/Kernel3D.jl index d7abd667..58a35b3d 100644 --- a/src/Vacuum/Kernel3D.jl +++ b/src/Vacuum/Kernel3D.jl @@ -280,6 +280,34 @@ Scalar-argument double-layer kernel. Avoids view creation in tight loops. return (dx*nx + dy*ny + dz*nz) * r3inv end +""" + laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) -> (single, double) + +Fused scalar-argument Laplace single-layer and double-layer kernels. + +This is the hot-path variant used in the 3D projected-kernel assembly when both kernels are needed +(`populate_greenfunction == true`). It shares the distance computation (`sqrt(r²)`) so we only pay +for one `sqrt`/reciprocal pipeline per source/observer pair. +""" +@fastmath @inline function laplace_single_double_layer( + ox::Float64, oy::Float64, oz::Float64, + sx::Float64, sy::Float64, sz::Float64, + nx::Float64, ny::Float64, nz::Float64 +) + dx = ox - sx + dy = oy - sy + dz = oz - sz + r2 = dx*dx + dy*dy + dz*dz + r2 < 1e-30 && return (0.0, 0.0) + rinv = inv(sqrt(r2)) + # single-layer: 1/r + single = rinv + # double-layer: (Δx·n)/r^3 + r3inv = rinv * rinv * rinv + double = (dx*nx + dy*ny + dz*nz) * r3inv + return (single, double) +end + """ extract_patch!(patch, data, idx_pol_center, idx_tor_center, npol, ntor, PATCH_DIM) @@ -583,12 +611,13 @@ function compute_3D_kernel_matrices!( ny = source.normal[idx_src, 2] nz = source.normal[idx_src, 3] - w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ - _accum_row!(proj_k, w_double, Zt, idx_src) - if populate_greenfunction - w_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ - _accum_row!(proj_g, w_single, Zt, idx_src) + w_single, w_double = laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) + _accum_row!(proj_k, w_double * dθdζ, Zt, idx_src) + _accum_row!(proj_g, w_single * dθdζ, Zt, idx_src) + else + w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ + _accum_row!(proj_k, w_double, Zt, idx_src) end end @@ -617,8 +646,14 @@ function 
compute_3D_kernel_matrices!( nsx = n_polar[ir, ia, 1] nsy = n_polar[ir, ia, 2] nsz = n_polar[ir, ia, 3] - M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ - M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ + if populate_greenfunction + w_single, w_double = laplace_single_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) + M_polar_single[ir, ia] = w_single * Ppou[ir, ia] * dθdζ + M_polar_double[ir, ia] = w_double * Ppou[ir, ia] * dθdζ + else + # Only the double-layer kernel is needed when the source is the wall. + M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ + end end # Distribute polar singular corrections back to Cartesian grid using sparse matrix @@ -643,13 +678,15 @@ function compute_3D_kernel_matrices!( ny = source.normal[idx_src, 2] nz = source.normal[idx_src, 3] - far_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[i, j]) * dθdζ - _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src) - - # Apply near + far contributions if populate_greenfunction - far_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * (1.0 + Gpou[i, j]) * dθdζ + w_single, w_double = laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) + far_double = w_double * (1.0 + Gpou[i, j]) * dθdζ + far_single = w_single * (1.0 + Gpou[i, j]) * dθdζ + _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src) _accum_row!(proj_g, M_grid_single[i, j] + far_single, Zt, idx_src) + else + far_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[i, j]) * dθdζ + _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src) end end From c85493e0983eddc614aa1b35a0d249ae64f44f7b Mon Sep 17 00:00:00 2001 From: Jake Halpern Date: Wed, 18 Mar 2026 11:48:12 -0400 Subject: [PATCH 23/23] VACUUM - IMPROVEMENT - combining kernels into one operation, 
reduces 3D time by around 10% --- src/Vacuum/Kernel3D.jl | 204 ++++++++--------------------------------- 1 file changed, 38 insertions(+), 166 deletions(-) diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl index 58a35b3d..dead2b31 100644 --- a/src/Vacuum/Kernel3D.jl +++ b/src/Vacuum/Kernel3D.jl @@ -177,119 +177,19 @@ function get_singular_quadrature(PATCH_RAD::Int, RAD_DIM::Int, INTERP_ORDER::Int end """ - laplace_single_layer(x_obs, x_src) -> Float64 + laplace_kernel(ox, oy, oz, sx, sy, sz, nx, ny, nz) -> (single, double) -Evaluate the Laplace single-layer (FxU) kernel between two 3D points. Returns -0.0 if the observation point coincides with the source point to avoid singularity. +Fused scalar-argument Laplace kernels for the 3D vacuum BIE. -The single-layer kernel φ is the fundamental solution to Laplace's equation: +Returns a tuple `(single, double)` where: -``` -φ(x_obs, x_src) = 1 / |x_obs - x_src| -``` + - `single = 1/r` is the single-layer kernel + - `double = (Δx⋅n)/r^3` is the double-layer kernel -# Arguments - - - `x_obs`: Observation point (3D Cartesian coordinates, any AbstractVector) - - `x_src`: Source point (3D Cartesian coordinates, any AbstractVector) - -# Returns - - - `Float64`: Kernel value φ(x_obs, x_src) -""" -@fastmath function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}) - @inbounds begin - dx = x_obs[1] - x_src[1] - dy = x_obs[2] - x_src[2] - dz = x_obs[3] - x_src[3] - end - r2 = dx*dx + dy*dy + dz*dz - r2 < 1e-30 && return 0.0 - return inv(sqrt(r2)) -end - -""" -Scalar-argument single-layer kernel. Avoids view creation in tight loops. +This is used when `compute_3D_kernel_matrices!` needs **both** kernels for the same pair, so the +distance computation (`sqrt(r²)`) is shared. Returns `(0.0, 0.0)` when `r² < 1e-30`. 
""" -@fastmath @inline function laplace_single_layer( - ox::Float64, oy::Float64, oz::Float64, - sx::Float64, sy::Float64, sz::Float64 -) - dx = ox - sx; - dy = oy - sy; - dz = oz - sz - r2 = dx*dx + dy*dy + dz*dz - r2 < 1e-30 && return 0.0 - return inv(sqrt(r2)) -end - -""" - laplace_double_layer(x_obs, x_src, n_src) -> Float64 - -Evaluate the Laplace double-layer (DxU) kernel between a point and a surface element. Returns -0.0 if the observation point coincides with the source point to avoid singularity. Allocation-free -scalar arithmetic is used for maximum performance. - -The double-layer kernel K is the normal derivative of the fundamental solution: - -``` -K(x_obs, x_src, n_src) = ∇_{x_src} φ · n_src = (x_obs - x_src) · n_src / |x_obs - x_src|³ -``` - -# Arguments - - - `x_obs`: Observation point (3D Cartesian coordinates, any AbstractVector) - - `x_src`: Source point on surface (3D Cartesian coordinates, any AbstractVector) - - `n_src`: Outward UNIT normal at source point (must be normalized!, any AbstractVector) - -# Returns - - - `Float64`: Kernel value K(x_obs, x_src, n_src) -""" -@fastmath function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}, n_src::AbstractVector{<:Real}) - @inbounds begin - dx = x_obs[1] - x_src[1] - dy = x_obs[2] - x_src[2] - dz = x_obs[3] - x_src[3] - nx = n_src[1] - ny = n_src[2] - nz = n_src[3] - end - r2 = dx*dx + dy*dy + dz*dz - r2 < 1e-30 && return 0.0 - rinv = inv(sqrt(r2)) - r3inv = rinv * rinv * rinv - return (dx*nx + dy*ny + dz*nz) * r3inv -end - -""" -Scalar-argument double-layer kernel. Avoids view creation in tight loops. 
-""" -@fastmath @inline function laplace_double_layer( - ox::Float64, oy::Float64, oz::Float64, - sx::Float64, sy::Float64, sz::Float64, - nx::Float64, ny::Float64, nz::Float64 -) - dx = ox - sx; - dy = oy - sy; - dz = oz - sz - r2 = dx*dx + dy*dy + dz*dz - r2 < 1e-30 && return 0.0 - rinv = inv(sqrt(r2)) - r3inv = rinv * rinv * rinv - return (dx*nx + dy*ny + dz*nz) * r3inv -end - -""" - laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) -> (single, double) - -Fused scalar-argument Laplace single-layer and double-layer kernels. - -This is the hot-path variant used in the 3D projected-kernel assembly when both kernels are needed -(`populate_greenfunction == true`). It shares the distance computation (`sqrt(r²)`) so we only pay -for one `sqrt`/reciprocal pipeline per source/observer pair. -""" -@fastmath @inline function laplace_single_double_layer( +@fastmath @inline function laplace_kernel( ox::Float64, oy::Float64, oz::Float64, sx::Float64, sy::Float64, sz::Float64, nx::Float64, ny::Float64, nz::Float64 @@ -342,8 +242,7 @@ end interpolate_to_polar!(polar_data, patch, quad_data) Interpolate Cartesian patch data to polar quadrature points using sparse matrix multiply. -Overwrites `polar_data` using mul! function arguments, mul!(C, A, B, α, β) -> C where -C = α * A * B + β * C. +Overwrites `polar_data` using mul! function arguments, mul!(C, A, B) -> C where C = A * B. # Arguments @@ -352,17 +251,17 @@ C = α * A * B + β * C. 
- `P2G`: Sparse interpolation matrix """ function interpolate_to_polar!(polar_data::Array{Float64,3}, patch::Array{Float64,3}, P2G::SparseMatrixCSC{Float64,Int}) - # Flatten patch to (Ngrid × dof), apply P2G' to get (Npolar × dof) patch_flat = reshape(patch, :, size(patch, 3)) - mul!(reshape(polar_data, :, size(patch, 3)), P2G', patch_flat, 1.0, 0.0) + mul!(reshape(polar_data, :, size(patch, 3)), P2G', patch_flat) end """ - compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar) + compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, normal_orient) Compute normal vector (= ∂r/∂θ × ∂r/∂ζ) at polar quadrature points from interpolated tangent vectors. We already scaled the normals by normal_orient in the geometry construction, so we need to reapply -that here since we are recomputing the normals from the derivatives. +that here since we are recomputing the normals from the derivatives. We use inline cross products +to avoid slice allocation. # Arguments @@ -372,7 +271,6 @@ that here since we are recomputing the normals from the derivatives. 
- `normal_orient`: Multiplier applied to normals to make them orient out of vacuum region (+1 or -1) """ function compute_polar_normal!(n_polar::Array{Float64,3}, dr_dθ::Array{Float64,3}, dr_dζ::Array{Float64,3}, normal_orient::Int) - # Inline cross product to avoid slice allocation @inbounds for ia in axes(dr_dθ, 2), ir in axes(dr_dθ, 1) n_polar[ir, ia, 1] = dr_dθ[ir, ia, 2] * dr_dζ[ir, ia, 3] - dr_dθ[ir, ia, 3] * dr_dζ[ir, ia, 2] n_polar[ir, ia, 2] = dr_dθ[ir, ia, 3] * dr_dζ[ir, ia, 1] - dr_dθ[ir, ia, 1] * dr_dζ[ir, ia, 3] @@ -586,9 +484,9 @@ function compute_3D_kernel_matrices!( # Convert linear index to 2D indices i_obs = mod1(idx_obs, observer.mtheta) j_obs = (idx_obs - 1) ÷ observer.mtheta + 1 - @inbounds ox = observer.r[idx_obs, 1] - @inbounds oy = observer.r[idx_obs, 2] - @inbounds oz = observer.r[idx_obs, 3] + ox = observer.r[idx_obs, 1] + oy = observer.r[idx_obs, 2] + oz = observer.r[idx_obs, 3] # Mark patch source indices so the far-field loop can skip them @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM @@ -604,20 +502,13 @@ function compute_3D_kernel_matrices!( @inbounds for idx_src in 1:N is_patch[idx_src] && continue - sx = source.r[idx_src, 1] - sy = source.r[idx_src, 2] - sz = source.r[idx_src, 3] - nx = source.normal[idx_src, 1] - ny = source.normal[idx_src, 2] - nz = source.normal[idx_src, 3] + sr = view(source.r, idx_src, :) + sn = view(source.normal, idx_src, :) + far_single, far_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* dθdζ + _accum_row!(proj_k, far_double, Zt, idx_src) if populate_greenfunction - w_single, w_double = laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) - _accum_row!(proj_k, w_double * dθdζ, Zt, idx_src) - _accum_row!(proj_g, w_single * dθdζ, Zt, idx_src) - else - w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ - _accum_row!(proj_k, w_double, Zt, idx_src) + _accum_row!(proj_g, far_single, Zt, idx_src) end end @@ -637,56 +528,37 @@ function 
compute_3D_kernel_matrices!( # Compute normal vectors at polar points from interpolated tangent vectors compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient) - # Evaluate kernels at polar points with POU weighting + # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM - # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already - rsx = r_polar[ir, ia, 1] - rsy = r_polar[ir, ia, 2] - rsz = r_polar[ir, ia, 3] - nsx = n_polar[ir, ia, 1] - nsy = n_polar[ir, ia, 2] - nsz = n_polar[ir, ia, 3] - if populate_greenfunction - w_single, w_double = laplace_single_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) - M_polar_single[ir, ia] = w_single * Ppou[ir, ia] * dθdζ - M_polar_double[ir, ia] = w_double * Ppou[ir, ia] * dθdζ - else - # Only the double-layer kernel is needed when the source is the wall. - M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ - end + sr = view(r_polar, ir, ia, :) + sn = view(n_polar, ir, ia, :) + w_single, w_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* Ppou[ir, ia] .* dθdζ + M_polar_single[ir, ia] = w_single + M_polar_double[ir, ia] = w_double end # Distribute polar singular corrections back to Cartesian grid using sparse matrix # grid = P2G * polar (maps Npolar → Ngrid) - mul!(M_grid_single_flat, P2G, vec(M_polar_single)) mul!(M_grid_double_flat, P2G, vec(M_polar_double)) - M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM) M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM) + if populate_greenfunction + mul!(M_grid_single_flat, P2G, vec(M_polar_single)) + M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM) + end - # POU correction: read back far-field trapezoidal values instead of re-evaluating kernels. 
- # trap + M_grid + trap*Gpou = trap*(1+Gpou) + M_grid = trap*(1-χ) + M_grid + # POU correction: singular correction + (1 + Gpou) * far-field terms @inbounds for j in 1:PATCH_DIM, i in 1:PATCH_DIM # Map back to global indices idx_pol = periodic_wrap(i_obs - PATCH_RAD + i - 1, source.mtheta) idx_tor = periodic_wrap(j_obs - PATCH_RAD + j - 1, source.nzeta) idx_src = idx_pol + source.mtheta * (idx_tor - 1) + sr = view(source.r, idx_src, :) + sn = view(source.normal, idx_src, :) - sx = source.r[idx_src, 1] - sy = source.r[idx_src, 2] - sz = source.r[idx_src, 3] - nx = source.normal[idx_src, 1] - ny = source.normal[idx_src, 2] - nz = source.normal[idx_src, 3] - + w_single, w_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* (1.0 + Gpou[i, j]) .* dθdζ + _accum_row!(proj_k, M_grid_double[i, j] + w_double, Zt, idx_src) if populate_greenfunction - w_single, w_double = laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) - far_double = w_double * (1.0 + Gpou[i, j]) * dθdζ - far_single = w_single * (1.0 + Gpou[i, j]) * dθdζ - _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src) - _accum_row!(proj_g, M_grid_single[i, j] + far_single, Zt, idx_src) - else - far_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[i, j]) * dθdζ - _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src) + _accum_row!(proj_g, M_grid_single[i, j] + w_single, Zt, idx_src) end end @@ -701,7 +573,7 @@ function compute_3D_kernel_matrices!( end end - # Use the same normalization as in the 2D kernel so we can just add I to the diagonal + # Use the same normalization as in the 2D kernel so we can just add Gram to the diagonal # This makes the grri logic identical to the 2D kernel. mul!(K_block, Z', transpose(KZt)) K_block ./= 2π