From b4ebfee60160849c6fe0dc5e802fd3f953f2f429 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Fri, 13 Mar 2026 10:34:16 -0400
Subject: [PATCH 01/23] VACUUM - WIP - removing green_only and adding a
 galerkin projection before inverting the system

---
 src/ForceFreeStates/Free.jl                  |   2 +-
 src/PerturbedEquilibrium/SingularCoupling.jl |   2 +-
 src/Vacuum/DataTypes.jl                      |  10 +-
 src/Vacuum/Vacuum.jl                         | 206 ++++++++++++-------
 test/runtests_vacuum.jl                      |  15 --
 5 files changed, 140 insertions(+), 95 deletions(-)

diff --git a/src/ForceFreeStates/Free.jl b/src/ForceFreeStates/Free.jl
index 9c8ec89c..65455d9e 100644
--- a/src/ForceFreeStates/Free.jl
+++ b/src/ForceFreeStates/Free.jl
@@ -28,7 +28,7 @@ and data dumping.
 
     # Compute vacuum response matrix in-place (handles 2D single-n, 2D multi-n block-diagonal, and 3D)
     vac_inputs = Vacuum.VacuumInput(equil, psilim, ctrl.mthvac, ctrl.nzvac, mpert, mlow, npert, nlow; force_wv_symmetry=ctrl.force_wv_symmetry)
-    Vacuum.compute_vacuum_response!(vac_data, vac_inputs, wall_settings)
+    @time Vacuum.compute_vacuum_response!(vac_data, vac_inputs, wall_settings)
 
     # Scale by (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126]
     singfac = vec((mlow:mhigh) .- qlim .* (nlow:nhigh)')
diff --git a/src/PerturbedEquilibrium/SingularCoupling.jl b/src/PerturbedEquilibrium/SingularCoupling.jl
index b27da960..40a8c1ce 100644
--- a/src/PerturbedEquilibrium/SingularCoupling.jl
+++ b/src/PerturbedEquilibrium/SingularCoupling.jl
@@ -154,7 +154,7 @@ function compute_singular_coupling_metrics!(
             # Compute Green's functions at this surface for this n
             # TODO: This assumes an initial 2D equilibrum, getting 2D Green's functions for independent n
             vac_input = Vacuum.VacuumInput(equil, sing_surf.psifac, mtheta, 1, mpert, mlow, 1, nn)
-            _, grri, grre, _, _ = Vacuum.compute_vacuum_response(vac_input, wall_settings; green_only=true)
+            _, grri, grre, _, _ = Vacuum.compute_vacuum_response(vac_input, wall_settings)
 
             # Store in singular surface struct (overwrites for each n)
             ffs_intr.sing[s].grri = grri
diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl
index 63d029b4..01811b41 100644
--- a/src/Vacuum/DataTypes.jl
+++ b/src/Vacuum/DataTypes.jl
@@ -22,6 +22,9 @@ nzeta > 1 for 3D vacuum calculation.
   - `mtheta::Int`: Number of vacuum calculation poloidal grid points
   - `nzeta::Int`: Number of vacuum calculation toroidal grid points (1 for 2D vacuum calculation, > 1 for 3D vacuum calculation)
   - `force_wv_symmetry::Bool`: Boolean flag to enforce symmetry in the vacuum response matrix
+  - `use_galerkin::Bool`: Use Galerkin projection to solve in truncated Fourier space [O(P³)]
+    instead of full collocation [O(M³)]. Only applies to the no-wall case; wall cases always
+    use collocation. Defaults to `false`.
 """
 @kwdef struct VacuumInput
     x::Vector{Float64} = Float64[]
@@ -37,6 +40,7 @@ nzeta > 1 for 3D vacuum calculation.
     mtheta::Int = 1
     nzeta::Int = 1
     force_wv_symmetry::Bool = true
+    use_galerkin::Bool = false
 end
 
 """
@@ -76,7 +80,8 @@ function VacuumInput(
     mlow::Int,
     npert::Int,
     nlow::Int;
-    force_wv_symmetry::Bool=true
+    force_wv_symmetry::Bool=true,
+    use_galerkin::Bool=false
 )
     # Extract plasma surface geometry at this psi
     r, z, ν = extract_plasma_surface_at_psi(equil, ψ)
@@ -92,7 +97,8 @@ function VacuumInput(
         npert=npert,
         mtheta=mtheta,
         nzeta=nzeta,
-        force_wv_symmetry=force_wv_symmetry
+        force_wv_symmetry=force_wv_symmetry,
+        use_galerkin=use_galerkin
     )
 end
 
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 2f6335d6..99ae36b1 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -23,8 +23,7 @@ export compute_vacuum_response, compute_vacuum_response!, compute_vacuum_field
 export extract_plasma_surface_at_psi
 
 """
-    compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings;
-        green_only=false)
+    compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings)
 
 Compute the vacuum response matrix and both Green's functions using provided vacuum inputs.
 
@@ -46,7 +45,6 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
 
   - `inputs`: `VacuumInput` struct with mode numbers, grid resolution, and boundary info.
   - `wall_settings::WallShapeSettings`: Wall geometry configuration.
-  - `green_only`: If true, skip building the response matrix `wv` and return zeros for `wv` and `xzpts`.
 
 # Returns
 
@@ -70,8 +68,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     wall_pts::AbstractMatrix{Float64},
     inputs::VacuumInput,
     wall_settings::WallShapeSettings;
-    n_override::Union{Nothing,Int}=nothing,
-    green_only::Bool=false
+    n_override::Union{Nothing,Int}=nothing
 )
 
     # Initialize surface geometries
@@ -84,12 +81,11 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     num_points_surf, num_modes = size(cos_mn_basis)
 
     # Create kernel parameters structs used to dispatch to the correct kernel
-    if inputs.nzeta > 1
-        # Hardcode these values for now - can expose to the user in the future
-        kparams = KernelParams3D(11, 20, 5)
-    else
-        kparams = KernelParams2D(n_override)
-    end
+    # Hardcode these values for now - can expose to the user in the future
+    PATCH_RAD = 11
+    RAD_DIM = 20
+    INTERP_ORDER = 5
+    kparams = inputs.nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override)
 
     # Active rows for computation (plasma only if no wall, plasma+wall if wall present)
     num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf
@@ -105,74 +101,138 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     # Plasma–Plasma block
     kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
 
-    # Fourier transform obs=plasma, src=plasma block
-    fourier_transform!(grre, green_temp, cos_mn_basis)
-    fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes)
-
-    if !wall.nowall
-        # Plasma–Wall block
-        kernel!(grad_green, green_temp, plasma_surf, wall, kparams)
-        # Wall–Wall block
-        kernel!(grad_green, green_temp, wall, wall, kparams)
-        # Wall–Plasma block
-        kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
-        # Fourier transform obs=wall, src=plasma block
-        fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf)
-        fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes)
-    end
+    if wall.nowall && inputs.use_galerkin
+        # ================================================================
+        # Galerkin projection: solve in Fourier space [2P × 2P] instead of
+        # the full collocation system [num_points_surf × num_points_surf].
+        #
+        # Instead of:  wv = F_inv * (K \ (G * F))       O(M³)
+        # We compute:  wv ~ (F'KF) \ (F'GF)             O(M²P + P³)
+        #
+        # where M = num_points_surf and P = num_modes and
+        # F = [cos_basis | sin_basis] is the [M × 2P] Fourier basis and
+        # K = grad_green is the [M × M] double-layer kernel matrix.
+        # ================================================================
+        temp = zeros!(pool, num_points_surf, num_modes)
+
+        # K_proj = F' * grad_green * F  [2 * num_modes × 2 * num_modes]
+        K_proj = zeros(2 * num_modes, 2 * num_modes)
+        mul!(temp, grad_green, cos_mn_basis)
+        mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
+        mul!(@view(K_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp)
+        mul!(temp, grad_green, sin_mn_basis)
+        mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
+        mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
+
+        # G_proj = F' * green_temp * F  [2 * num_modes × 2 * num_modes]
+        G_proj = zeros(2 * num_modes, 2 * num_modes)
+        mul!(temp, green_temp, cos_mn_basis)
+        mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
+        mul!(@view(G_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp)
+        mul!(temp, green_temp, sin_mn_basis)
+        mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
+        mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
+
+        # Gram matrix F'F (needed for interior kernel and wv normalization)
+        FtF = zeros(2 * num_modes, 2 * num_modes)
+        mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis)
+        mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis)
+        mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis)
+        mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis)
+
+        # Solve projected systems via SVD-based pseudoinverse. The truncated Fourier
+        # basis with the SFL angle correction (ν) can make the projected operators
+        # rank-deficient — the interior BIE operator in particular has a physical
+        # null space (constant potential mode). The pseudoinverse finds the
+        # minimum-norm solution, correctly projecting out numerically null directions
+        # without affecting well-resolved modes.
+        Y_ext = pinv(K_proj) * G_proj
+
+        # Interior kernel in projected space: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj
+        K_proj_int = 2 .* FtF .- K_proj
+        Y_int = pinv(K_proj_int) * G_proj
+
+        # Reconstruct physical-space Green's functions for backward compatibility
+        # grre = F * Y = cos * Y[1:P, :] + sin * Y[P+1:2P, :]
+        mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :]))
+        mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
+        mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :]))
+        mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
+
+        # Extract wv: the [arr air; ari aii] blocks equal (4π²/M) * F'F * Y_ext,
+        # then wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114]
+        wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext)
+        wv .= complex.(
+            @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]),
+            @view(wv_blocks[1:num_modes, (num_modes+1):(2*num_modes)]) .- @view(wv_blocks[(num_modes+1):(2*num_modes), 1:num_modes])
+        )
+    else
+        # ================================================================
+        # Collocation approach: solve full physical-space system [M × M]
+        # Handles both no-wall and wall cases.
+        # ================================================================
+
+        # FT plasma→plasma Green's function (must precede kernel! calls that overwrite green_temp)
+        fourier_transform!(grre, green_temp, cos_mn_basis)
+        fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes)
+
+        if !wall.nowall
+            # Plasma–Wall block
+            kernel!(grad_green, green_temp, plasma_surf, wall, kparams)
+            # Wall–Wall block
+            kernel!(grad_green, green_temp, wall, wall, kparams)
+            # Wall–Plasma block
+            kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
+            # Fourier transform obs=wall, src=plasma block
+            fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf)
+            fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes)
+        end
 
-    # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1)
-    grri .= grre # start from same as exterior
-    grad_green_interior = similar!(pool, grad_green)
-    grad_green_interior .= grad_green
+        # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1)
+        grri .= grre # start from same as exterior
+        grad_green_interior = similar!(pool, grad_green)
+        grad_green_interior .= grad_green
 
-    # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel
-    F_ext = lu!(grad_green)
-    ldiv!(F_ext, grre)
+        # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel
+        F_ext = lu!(grad_green)
+        ldiv!(F_ext, grre)
 
-    # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal
-    grad_green_interior .*= -1
-    for i in 1:num_points_total
-        grad_green_interior[i, i] += 2.0
-    end
-    F_int = lu!(grad_green_interior)
-    ldiv!(F_int, grri)
+        # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal
+        grad_green_interior .*= -1
+        for i in 1:num_points_total
+            grad_green_interior[i, i] += 2.0
+        end
+        F_int = lu!(grad_green_interior)
+        ldiv!(F_int, grri)
 
-    # Always initialise wv to zero so that green_only keeps it zeroed
-    if !green_only
-        # Perform inverse Fourier transforms to get response matrix components [Chance Phys. Plasmas 2007 052506 eq. 115-118]
+        # Inverse Fourier transform to extract wv [Chance Phys. Plasmas 2007 052506 eq. 115-118]
         arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4)
         fourier_inverse_transform!(arr, grre, cos_mn_basis)
         fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes)
         fourier_inverse_transform!(ari, grre, sin_mn_basis)
         fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes)
-
-        # Final form of vacuum response matrix [Chance Phys. Plasmas 2007 052506 eq. 114]
         wv .= complex.(arr .+ aii, air .- ari)
-        inputs.force_wv_symmetry && hermitianpart!(wv)
-
-        # Fill coordinate arrays
-        if inputs.nzeta > 1 # 3D
-            plasma_pts .= plasma_surf.r
-            wall_pts .= wall.r
-        else # 2D
-            @views begin
-                plasma_pts[:, 1] .= plasma_surf.x
-                plasma_pts[:, 2] .= 0.0
-                plasma_pts[:, 3] .= plasma_surf.z
-                wall_pts[:, 1] .= wall.x
-                wall_pts[:, 2] .= 0.0
-                wall_pts[:, 3] .= wall.z
-            end
+    end
+
+    inputs.force_wv_symmetry && hermitianpart!(wv)
+
+    if inputs.nzeta > 1 # 3D
+        plasma_pts .= plasma_surf.r
+        wall_pts .= wall.r
+    else # 2D
+        @views begin
+            plasma_pts[:, 1] .= plasma_surf.x
+            plasma_pts[:, 2] .= 0.0
+            plasma_pts[:, 3] .= plasma_surf.z
+            wall_pts[:, 1] .= wall.x
+            wall_pts[:, 2] .= 0.0
+            wall_pts[:, 3] .= wall.z
         end
     end
 end
 
 """
-    compute_vacuum_response(
-        inputs::VacuumInput,
-        wall_settings::WallShapeSettings;
-        green_only=false)
+    compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings)
 
 Allocate and return the vacuum response matrix and Green's functions for the given
 vacuum inputs.
@@ -182,7 +242,7 @@ implementation. For performance‑critical paths that already own preallocated s
 (e.g. `ForceFreeStates.VacuumData`), prefer the in‑place method to avoid extra
 heap allocations.
 """
-@with_pool pool function compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings; green_only::Bool=false)
+@with_pool pool function compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings)
 
     # Allocate storage for the vacuum response matrix and Green's functions
     numpoints = inputs.mtheta * inputs.nzeta
@@ -195,17 +255,13 @@ heap allocations.
         wall_pts=zeros!(pool, numpoints, 3)
     )
 
-    compute_vacuum_response!(vac, inputs, wall_settings; green_only=green_only)
+    compute_vacuum_response!(vac, inputs, wall_settings)
 
     return vac.wv, vac.grri, vac.grre, vac.plasma_pts, vac.wall_pts
 end
 
 """
-    compute_vacuum_response!(
-        vac_data,
-        inputs::VacuumInput,
-        wall_settings::WallShapeSettings;
-        green_only=false)
+    compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings)
 
 In-place variant that computes the vacuum response and directly populates the arrays
 stored in `vac_data`.
@@ -222,7 +278,7 @@ compatible sizes:
 This is designed to work with `ForceFreeStates.VacuumData` but does not depend on
 its concrete type (duck-typed on field names only).
 """
-function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings; green_only::Bool=false)
+function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings)
 
     mpert = inputs.mpert
     npert = inputs.npert
@@ -237,8 +293,7 @@ function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::
             vac_data.plasma_pts,
             vac_data.wall_pts,
             inputs,
-            wall_settings;
-            green_only=green_only
+            wall_settings
         )
     else
         # 2D vacuum: fill diagonal blocks of the response matrix
@@ -262,8 +317,7 @@ function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::
                 vac_data.wall_pts,
                 inputs,
                 wall_settings;
-                n_override=n,
-                green_only=green_only
+                n_override=n
             )
         end
     end
diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl
index debb8299..d51bd622 100644
--- a/test/runtests_vacuum.jl
+++ b/test/runtests_vacuum.jl
@@ -631,21 +631,6 @@
             @test isapprox(wv, wv', rtol=1e-12)
         end
 
-        @testset "compute_vacuum_response 3D green_only" begin
-            inputs = _make_3d_inputs(mtheta=32, nzeta=32, mtheta_eq=17)
-            wall_settings = WallShapeSettings(shape="nowall")
-            wv, grri, grre, plasma_pts, wall_pts = compute_vacuum_response(inputs, wall_settings; green_only=true)
-
-            numpoints = inputs.mtheta * inputs.nzeta
-            num_modes = inputs.mpert * inputs.npert
-            @test size(wv) == (num_modes, num_modes)
-            @test all(wv .== 0)
-            @test size(grri) == (2 * numpoints, 2 * num_modes)
-            @test size(grre) == (2 * numpoints, 2 * num_modes)
-            @test all(isfinite, grri)
-            @test all(isfinite, grre)
-        end
-
         @testset "Kernel3D laplace_single_layer" begin
             x_obs = [1.0, 0.0, 0.0]
             x_src = [2.0, 0.0, 0.0]

From 388240dcdd885abad3efa786069215f50f8a8648 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Fri, 13 Mar 2026 11:04:10 -0400
Subject: [PATCH 02/23] VACUUM - WIP - adding rough profiling for the code

---
 src/Vacuum/Vacuum.jl | 211 +++++++++++++++++++++++++------------------
 1 file changed, 122 insertions(+), 89 deletions(-)

diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 99ae36b1..8796048d 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -72,12 +72,27 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
 )
 
     # Initialize surface geometries
-    plasma_surf = inputs.nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs)
-    wall = inputs.nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings)
+    geom_timing = @timed begin
+        plasma_surf = inputs.nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs)
+        wall = inputs.nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings)
+    end
+    println(" Compute geometry  TIME=$(round(geom_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(geom_timing.bytes))")
 
     # Compute Fourier basis coefficients
-    ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing
-    cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(inputs.mtheta, inputs.mpert, inputs.mlow, inputs.nzeta, inputs.npert, inputs.nlow; n_2D=n_override, ν=ν)
+    basis_timing = @timed begin
+        ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing
+        cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(
+            inputs.mtheta,
+            inputs.mpert,
+            inputs.mlow,
+            inputs.nzeta,
+            inputs.npert,
+            inputs.nlow;
+            n_2D=n_override,
+            ν=ν
+        )
+    end
+    println(" Compute Fourier basis  TIME=$(round(basis_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(basis_timing.bytes))")
     num_points_surf, num_modes = size(cos_mn_basis)
 
     # Create kernel parameters structs used to dispatch to the correct kernel
@@ -99,7 +114,10 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     grri = @view grri_in[1:num_points_total, :]
 
     # Plasma–Plasma block
-    kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
+    pp_kernel_timing = @timed begin
+        kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
+    end
+    println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
 
     if wall.nowall && inputs.use_galerkin
         # ================================================================
@@ -115,57 +133,60 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         # ================================================================
         temp = zeros!(pool, num_points_surf, num_modes)
 
-        # K_proj = F' * grad_green * F  [2 * num_modes × 2 * num_modes]
-        K_proj = zeros(2 * num_modes, 2 * num_modes)
-        mul!(temp, grad_green, cos_mn_basis)
-        mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
-        mul!(@view(K_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp)
-        mul!(temp, grad_green, sin_mn_basis)
-        mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
-        mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
-
-        # G_proj = F' * green_temp * F  [2 * num_modes × 2 * num_modes]
-        G_proj = zeros(2 * num_modes, 2 * num_modes)
-        mul!(temp, green_temp, cos_mn_basis)
-        mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
-        mul!(@view(G_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp)
-        mul!(temp, green_temp, sin_mn_basis)
-        mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
-        mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
-
-        # Gram matrix F'F (needed for interior kernel and wv normalization)
-        FtF = zeros(2 * num_modes, 2 * num_modes)
-        mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis)
-        mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis)
-        mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis)
-        mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis)
-
-        # Solve projected systems via SVD-based pseudoinverse. The truncated Fourier
-        # basis with the SFL angle correction (ν) can make the projected operators
-        # rank-deficient — the interior BIE operator in particular has a physical
-        # null space (constant potential mode). The pseudoinverse finds the
-        # minimum-norm solution, correctly projecting out numerically null directions
-        # without affecting well-resolved modes.
-        Y_ext = pinv(K_proj) * G_proj
-
-        # Interior kernel in projected space: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj
-        K_proj_int = 2 .* FtF .- K_proj
-        Y_int = pinv(K_proj_int) * G_proj
-
-        # Reconstruct physical-space Green's functions for backward compatibility
-        # grre = F * Y = cos * Y[1:P, :] + sin * Y[P+1:2P, :]
-        mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :]))
-        mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
-        mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :]))
-        mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
-
-        # Extract wv: the [arr air; ari aii] blocks equal (4π²/M) * F'F * Y_ext,
-        # then wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114]
-        wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext)
-        wv .= complex.(
-            @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]),
-            @view(wv_blocks[1:num_modes, (num_modes+1):(2*num_modes)]) .- @view(wv_blocks[(num_modes+1):(2*num_modes), 1:num_modes])
-        )
+        proj_timing = @timed begin
+            # K_proj = F' * grad_green * F  [2 * num_modes × 2 * num_modes]
+            K_proj = zeros(2 * num_modes, 2 * num_modes)
+            mul!(temp, grad_green, cos_mn_basis)
+            mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
+            mul!(@view(K_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp)
+            mul!(temp, grad_green, sin_mn_basis)
+            mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
+            mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
+
+            # G_proj = F' * green_temp * F  [2 * num_modes × 2 * num_modes]
+            G_proj = zeros(2 * num_modes, 2 * num_modes)
+            mul!(temp, green_temp, cos_mn_basis)
+            mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
+            mul!(@view(G_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp)
+            mul!(temp, green_temp, sin_mn_basis)
+            mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
+            mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
+
+            # Gram matrix F'F (needed for interior kernel and wv normalization)
+            FtF = zeros(2 * num_modes, 2 * num_modes)
+            mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis)
+            mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis)
+            mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis)
+            mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis)
+
+            # Solve projected systems via SVD-based pseudoinverse. The truncated Fourier
+            # basis with the SFL angle correction (ν) can make the projected operators
+            # rank-deficient — the interior BIE operator in particular has a physical
+            # null space (constant potential mode). The pseudoinverse finds the
+            # minimum-norm solution, correctly projecting out numerically null directions
+            # without affecting well-resolved modes.
+            Y_ext = pinv(K_proj) * G_proj
+
+            # Interior kernel in projected space: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj
+            K_proj_int = 2 .* FtF .- K_proj
+            Y_int = pinv(K_proj_int) * G_proj
+
+            # Reconstruct physical-space Green's functions for backward compatibility
+            # grre = F * Y = cos * Y[1:P, :] + sin * Y[P+1:2P, :]
+            mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :]))
+            mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
+            mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :]))
+            mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
+
+            # Extract wv: the [arr air; ari aii] blocks equal (4π²/M) * F'F * Y_ext,
+            # then wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114]
+            wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext)
+            wv .= complex.(
+                @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]),
+                @view(wv_blocks[1:num_modes, (num_modes+1):(2*num_modes)]) .- @view(wv_blocks[(num_modes+1):(2*num_modes), 1:num_modes])
+            )
+        end
+        println(" Galerkin Project and Solve  TIME=$(round(proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))")
     else
         # ================================================================
         # Collocation approach: solve full physical-space system [M × M]
@@ -173,45 +194,57 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         # ================================================================
 
         # FT plasma→plasma Green's function (must precede kernel! calls that overwrite green_temp)
-        fourier_transform!(grre, green_temp, cos_mn_basis)
-        fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes)
+        colloc_ft_timing = @timed begin
+            fourier_transform!(grre, green_temp, cos_mn_basis)
+            fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes)
+        end
+        println(" Plasma Fourier Transform  TIME=$(round(colloc_ft_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(colloc_ft_timing.bytes))")
 
         if !wall.nowall
-            # Plasma–Wall block
-            kernel!(grad_green, green_temp, plasma_surf, wall, kparams)
-            # Wall–Wall block
-            kernel!(grad_green, green_temp, wall, wall, kparams)
-            # Wall–Plasma block
-            kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
-            # Fourier transform obs=wall, src=plasma block
-            fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf)
-            fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes)
+            wall_block_timing = @timed begin
+                # Plasma–Wall block
+                kernel!(grad_green, green_temp, plasma_surf, wall, kparams)
+                # Wall–Wall block
+                kernel!(grad_green, green_temp, wall, wall, kparams)
+                # Wall–Plasma block
+                kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
+                # Fourier transform obs=wall, src=plasma block
+                fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf)
+                fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes)
+            end
+            println(" Wall Kernel and Fourier Transform  TIME=$(round(wall_block_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(wall_block_timing.bytes))")
         end
 
         # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1)
-        grri .= grre # start from same as exterior
-        grad_green_interior = similar!(pool, grad_green)
-        grad_green_interior .= grad_green
-
-        # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel
-        F_ext = lu!(grad_green)
-        ldiv!(F_ext, grre)
-
-        # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal
-        grad_green_interior .*= -1
-        for i in 1:num_points_total
-            grad_green_interior[i, i] += 2.0
+        solve_timing = @timed begin
+            grri .= grre # start from same as exterior
+            grad_green_interior = similar!(pool, grad_green)
+            grad_green_interior .= grad_green
+
+            # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel
+            F_ext = lu!(grad_green)
+            ldiv!(F_ext, grre)
+
+            # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal
+            grad_green_interior .*= -1
+            for i in 1:num_points_total
+                grad_green_interior[i, i] += 2.0
+            end
+            F_int = lu!(grad_green_interior)
+            ldiv!(F_int, grri)
+        end
+        println(" Invert and Solve  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
+
+        invft_timing = @timed begin
+            # Inverse Fourier transform to extract wv [Chance Phys. Plasmas 2007 052506 eq. 115-118]
+            arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4)
+            fourier_inverse_transform!(arr, grre, cos_mn_basis)
+            fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes)
+            fourier_inverse_transform!(ari, grre, sin_mn_basis)
+            fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes)
+            wv .= complex.(arr .+ aii, air .- ari)
         end
-        F_int = lu!(grad_green_interior)
-        ldiv!(F_int, grri)
-
-        # Inverse Fourier transform to extract wv [Chance Phys. Plasmas 2007 052506 eq. 115-118]
-        arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4)
-        fourier_inverse_transform!(arr, grre, cos_mn_basis)
-        fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes)
-        fourier_inverse_transform!(ari, grre, sin_mn_basis)
-        fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes)
-        wv .= complex.(arr .+ aii, air .- ari)
+        println(" Compute Wv  TIME=$(round(invft_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(invft_timing.bytes))")
     end
 
     inputs.force_wv_symmetry && hermitianpart!(wv)

From 7733df93e7750227051da9fb83db7274a044491a Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Fri, 13 Mar 2026 11:52:27 -0400
Subject: [PATCH 03/23] VACUUM - WIP - fused kernel operations so full matrix
 is never stored in memory. Works for 1 thread, but results are currently
 nondeterministic under multi-threading

---
 src/Vacuum/DataTypes.jl       |  10 +-
 src/Vacuum/ProjectedKernel.jl | 464 ++++++++++++++++++++++++++++++++++
 src/Vacuum/Vacuum.jl          | 121 ++++++---
 3 files changed, 561 insertions(+), 34 deletions(-)
 create mode 100644 src/Vacuum/ProjectedKernel.jl

diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl
index 01811b41..10c1f8bd 100644
--- a/src/Vacuum/DataTypes.jl
+++ b/src/Vacuum/DataTypes.jl
@@ -25,6 +25,9 @@ nzeta > 1 for 3D vacuum calculation.
   - `use_galerkin::Bool`: Use Galerkin projection to solve in truncated Fourier space [O(P³)]
     instead of full collocation [O(M³)]. Only applies to the no-wall case; wall cases always
     use collocation. Defaults to `false`.
+  - `fuse_projection::Bool`: When combined with `use_galerkin`, fuse the kernel assembly with
+    the Fourier projection so that the full M×M kernel matrices are never materialized.
+    Reduces memory from O(M²) to O(MP). Requires `use_galerkin = true`. Defaults to `false`.
 """
 @kwdef struct VacuumInput
     x::Vector{Float64} = Float64[]
@@ -41,6 +44,7 @@ nzeta > 1 for 3D vacuum calculation.
     nzeta::Int = 1
     force_wv_symmetry::Bool = true
     use_galerkin::Bool = false
+    fuse_projection::Bool = false
 end
 
 """
@@ -81,7 +85,8 @@ function VacuumInput(
     npert::Int,
     nlow::Int;
     force_wv_symmetry::Bool=true,
-    use_galerkin::Bool=false
+    use_galerkin::Bool=false,
+    fuse_projection::Bool=false
 )
     # Extract plasma surface geometry at this psi
     r, z, ν = extract_plasma_surface_at_psi(equil, ψ)
@@ -98,7 +103,8 @@ function VacuumInput(
         mtheta=mtheta,
         nzeta=nzeta,
         force_wv_symmetry=force_wv_symmetry,
-        use_galerkin=true
+        use_galerkin=true,
+        fuse_projection=true
     )
 end
 
diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl
new file mode 100644
index 00000000..bd17a9a9
--- /dev/null
+++ b/src/Vacuum/ProjectedKernel.jl
@@ -0,0 +1,464 @@
+# Fused kernel assembly + Fourier projection for Galerkin vacuum solve.
+#
+# Instead of materializing the full M×M kernel matrices and then projecting,
+# these functions accumulate the P×P projected matrices row by row as the
+# kernel values are computed, reducing memory from O(M²) to O(MP).
+#
+# K_c = Z^H K Z  and  G_c = Z^H G Z
+#
+# where Z = C + iS is the [M × P] complex Fourier basis, K is the double-layer
+# kernel, and G is the single-layer kernel. For each observer point j, the
+# kernel row is projected and accumulated via rank-1 updates:
+#
+#   K_c += conj(Z[j,:]) ⊗ (K[j,:] · Z)
+#
+# FLOP cost is identical to the two-step approach O(M²P), but memory drops
+# from O(M²) to O(MP + P²).
+
+"""
+    projected_kernel!(K_c, G_c, observer, source, params, cos_basis, sin_basis, Gram)
+
+Compute the Fourier-projected kernel matrices K_c = Z^H K Z and G_c = Z^H G Z
+directly, without materializing the full M×M kernel matrices.
+
+Dispatches to the 2D or 3D implementation based on the geometry/params types.
+
+# Arguments
+
+  - `K_c::Matrix{ComplexF64}`: Output P×P projected double-layer kernel [filled in-place]
+  - `G_c::Matrix{ComplexF64}`: Output P×P projected single-layer kernel [filled in-place]
+  - `observer`: Observer geometry struct
+  - `source`: Source geometry struct
+  - `params`: Kernel parameters (KernelParams2D or KernelParams3D)
+  - `cos_basis::Matrix{Float64}`: [M × P] cosine Fourier basis
+  - `sin_basis::Matrix{Float64}`: [M × P] sine Fourier basis
+  - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term)
+"""
+function projected_kernel! end
+
+function projected_kernel!(
+    K_c::Matrix{ComplexF64},
+    G_c::Matrix{ComplexF64},
+    observer::Union{PlasmaGeometry,WallGeometry},
+    source::Union{PlasmaGeometry,WallGeometry},
+    params::KernelParams2D,
+    cos_basis::Matrix{Float64},
+    sin_basis::Matrix{Float64},
+    Gram::Matrix{ComplexF64}
+)
+    _projected_kernel_2D!(K_c, G_c, observer, source, params.n, cos_basis, sin_basis, Gram)
+end
+
+function projected_kernel!(
+    K_c::Matrix{ComplexF64},
+    G_c::Matrix{ComplexF64},
+    observer::Union{PlasmaGeometry3D,WallGeometry3D},
+    source::Union{PlasmaGeometry3D,WallGeometry3D},
+    params::KernelParams3D,
+    cos_basis::Matrix{Float64},
+    sin_basis::Matrix{Float64},
+    Gram::Matrix{ComplexF64}
+)
+    _projected_kernel_3D!(K_c, G_c, observer, source,
+        params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER,
+        cos_basis, sin_basis, Gram)
+end
+
+
+# ============================================================================
+# 2D fused projected kernel
+# ============================================================================
+
+"""
+    _projected_kernel_2D!(K_c, G_c, observer, source, n, cos_basis, sin_basis, Gram)
+
+Fused 2D kernel assembly + projection. Mirrors the loop structure of
+`compute_2D_kernel_matrices!` but accumulates rank-1 contributions into the
+P×P projected matrices instead of filling the M×M kernel matrices.
+
+Memory: O(MP) instead of O(M²).
+"""
+@with_pool pool function _projected_kernel_2D!(
+    K_c::Matrix{ComplexF64},
+    G_c::Matrix{ComplexF64},
+    observer::Union{PlasmaGeometry,WallGeometry},
+    source::Union{PlasmaGeometry,WallGeometry},
+    n::Int,
+    cos_basis::Matrix{Float64},
+    sin_basis::Matrix{Float64},
+    Gram::Matrix{ComplexF64}
+)
+    M, P = size(cos_basis)
+    mtheta = length(observer.x)
+    dtheta = 2π / mtheta
+    theta_grid = range(; start=0, length=mtheta, step=dtheta)
+
+    populate_greenfunction = source isa PlasmaGeometry
+
+    # S₁ᵢ logarithmic correction factors [Chance Phys. Plasmas 1997 2161 eq. 78]
+    log_correction_0 = 16.0 * dtheta * (log(2 * dtheta) - 68.0 / 15.0) / 15.0
+    log_correction_1 = 128.0 * dtheta * (log(2 * dtheta) - 8.0 / 15.0) / 45.0
+    log_correction_2 = 4.0 * dtheta * (7.0 * log(2 * dtheta) - 11.0 / 15.0) / 45.0
+    log_correction_array = SVector(log_correction_2, log_correction_1, log_correction_0, log_correction_1, log_correction_2)
+
+    gamma_prefactor = 2 * sqrt(π) * gamma(0.5 - n)
+
+    spline_x = cubic_interp(theta_grid, source.x; bc=PeriodicBC(; endpoint=:exclusive, period=2π))
+    spline_z = cubic_interp(theta_grid, source.z; bc=PeriodicBC(; endpoint=:exclusive, period=2π))
+    d1_spline_x = deriv1(spline_x)
+    d1_spline_z = deriv1(spline_z)
+
+    stencils_left, stencils_right = GL8_LAGRANGE_STENCILS
+    sing_idx = zeros!(pool, Int, 5)
+
+    dx_dtheta_grid = acquire!(pool, eltype(source.x), mtheta)
+    dz_dtheta_grid = acquire!(pool, eltype(source.z), mtheta)
+    d1_spline_x(dx_dtheta_grid, theta_grid)
+    d1_spline_z(dz_dtheta_grid, theta_grid)
+
+    # Pre-transpose basis for contiguous column access: Ct[:, k] = C[k, :]
+    Ct = acquire!(pool, Float64, P, M)
+    St = acquire!(pool, Float64, P, M)
+    Ct .= cos_basis'
+    St .= sin_basis'
+
+    # Real/imaginary accumulators for P×P projected matrices
+    K_re = zeros(P, P)
+    K_im = zeros(P, P)
+    G_re = zeros(P, P)
+    G_im = zeros(P, P)
+
+    # Per-observer projection vectors (P-length)
+    proj_kc = zeros(P)
+    proj_ks = zeros(P)
+    proj_gc = zeros(P)
+    proj_gs = zeros(P)
+
+    for j in 1:mtheta
+        x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j]
+
+        fill!(proj_kc, 0.0)
+        fill!(proj_ks, 0.0)
+        fill!(proj_gc, 0.0)
+        fill!(proj_gs, 0.0)
+        diag_accum = 0.0
+
+        # ── Simpson integration for nonsingular source points ──
+        @inbounds for k in 1:(mtheta-3)
+            isrc = mod1(j + 1 + k, mtheta)
+            G_n, gradG_n, gradG_0 = green(x_obs, z_obs,
+                source.x[isrc], source.z[isrc],
+                dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n;
+                gamma_prefactor)
+
+            wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2))
+
+            if populate_greenfunction
+                w_g = G_n * wsimpson
+                BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
+                BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+            end
+            w_k = gradG_n * wsimpson
+            BLAS.axpy!(w_k, @view(Ct[:, isrc]), proj_kc)
+            BLAS.axpy!(w_k, @view(St[:, isrc]), proj_ks)
+
+            diag_accum -= gradG_0 * wsimpson
+        end
+
+        # ── Gaussian quadrature for singular points ──
+        for (offset_idx, offset) in enumerate(-2:2)
+            sing_idx[offset_idx] = mod1(j + offset + mtheta, mtheta)
+        end
+
+        for leftpanel in (true, false)
+            gauss_mid = theta_obs + (leftpanel ? -dtheta : dtheta)
+            @inbounds for ig in 1:8
+                theta_gauss = gauss_mid + GL8.x[ig] * dtheta
+                theta_gauss0 = mod(theta_gauss, 2π)
+                x_gauss = spline_x(theta_gauss0)
+                dx_dtheta_gauss = d1_spline_x(theta_gauss0)
+                z_gauss = spline_z(theta_gauss0)
+                dz_dtheta_gauss = d1_spline_z(theta_gauss0)
+                G_n, gradG_n, gradG_0 = green(x_obs, z_obs,
+                    x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n;
+                    gamma_prefactor)
+
+                s = leftpanel ? stencils_left[ig] : stencils_right[ig]
+                wgauss = GL8.w[ig] * dtheta
+
+                if populate_greenfunction
+                    if observer isa PlasmaGeometry
+                        G_n += log((theta_obs - theta_gauss)^2) / x_obs
+                    end
+                    @inbounds for stencil_idx in 1:5
+                        w_g = G_n * s[stencil_idx] * wgauss
+                        isrc = sing_idx[stencil_idx]
+                        BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
+                        BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+                    end
+                end
+
+                @inbounds for stencil_idx in 1:5
+                    w_k = gradG_n * s[stencil_idx] * wgauss
+                    isrc = sing_idx[stencil_idx]
+                    BLAS.axpy!(w_k, @view(Ct[:, isrc]), proj_kc)
+                    BLAS.axpy!(w_k, @view(St[:, isrc]), proj_ks)
+                end
+
+                diag_accum -= gradG_0 * wgauss
+            end
+        end
+
+        # Analytic singular integral correction [Chance 1997 eq. 75]
+        if populate_greenfunction && observer isa PlasmaGeometry
+            @inbounds for stencil_idx in 1:5
+                w_g = -log_correction_array[stencil_idx] / x_obs
+                isrc = sing_idx[stencil_idx]
+                BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
+                BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+            end
+        end
+
+        # Fold diagonal accumulation into projection
+        BLAS.axpy!(diag_accum, @view(Ct[:, j]), proj_kc)
+        BLAS.axpy!(diag_accum, @view(St[:, j]), proj_ks)
+
+        # ── Rank-1 accumulate into P×P projection matrices ──
+        # K_c_re += C[j,:] ⊗ proj_kc + S[j,:] ⊗ proj_ks
+        BLAS.ger!(1.0, @view(Ct[:, j]), proj_kc, K_re)
+        BLAS.ger!(1.0, @view(St[:, j]), proj_ks, K_re)
+        # K_c_im += C[j,:] ⊗ proj_ks − S[j,:] ⊗ proj_kc
+        BLAS.ger!(1.0, @view(Ct[:, j]), proj_ks, K_im)
+        BLAS.ger!(-1.0, @view(St[:, j]), proj_kc, K_im)
+
+        if populate_greenfunction
+            BLAS.ger!(1.0, @view(Ct[:, j]), proj_gc, G_re)
+            BLAS.ger!(1.0, @view(St[:, j]), proj_gs, G_re)
+            BLAS.ger!(1.0, @view(Ct[:, j]), proj_gs, G_im)
+            BLAS.ger!(-1.0, @view(St[:, j]), proj_gc, G_im)
+        end
+    end
+
+    # ── Post-processing (mirrors compute_2D_kernel_matrices!) ──
+
+    # Normals point out of vacuum for wall but inward for plasma → flip sign for plasma source
+    if source isa PlasmaGeometry
+        K_re .*= -1
+        K_im .*= -1
+    end
+
+    # Diagonal residue: K += residue·I  →  K_c += residue·Gram
+    # [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
+    residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
+    if residue != 0.0
+        K_re .+= residue .* real.(Gram)
+        K_im .+= residue .* imag.(Gram)
+    end
+
+    # 2π𝒢 → 𝒢
+    if populate_greenfunction
+        G_re ./= 2π
+        G_im ./= 2π
+    end
+
+    K_c .= complex.(K_re, K_im)
+    G_c .= complex.(G_re, G_im)
+end
+
+
+# ============================================================================
+# 3D fused projected kernel
+# ============================================================================
+
+"""
+    _projected_kernel_3D!(K_c, G_c, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, cos_basis, sin_basis, Gram)
+
+Fused 3D kernel assembly + projection. Mirrors the loop structure of
+`compute_3D_kernel_matrices!` (including multi-threading and BIEST singular correction)
+but writes projected P-vectors to per-observer rows of [M × P] buffers instead of
+filling the M×M kernel matrices. The P×P assembly is done after the parallel loop
+via sequential GEMM calls.
+
+Each observer writes to its own row of the shared buffers, so there are no
+cross-thread accumulation races — the same write pattern as the original
+`compute_3D_kernel_matrices!`.
+
+Memory: O(4MP + P²) instead of O(M²).
+"""
+function _projected_kernel_3D!(
+    K_c::Matrix{ComplexF64},
+    G_c::Matrix{ComplexF64},
+    observer::Union{PlasmaGeometry3D,WallGeometry3D},
+    source::Union{PlasmaGeometry3D,WallGeometry3D},
+    PATCH_RAD::Int,
+    RAD_DIM::Int,
+    INTERP_ORDER::Int,
+    cos_basis::Matrix{Float64},
+    sin_basis::Matrix{Float64},
+    Gram::Matrix{ComplexF64}
+)
+    M, P = size(cos_basis)
+    num_points = observer.mtheta * observer.nzeta
+    dθdζ = 4π^2 / num_points
+
+    populate_greenfunction = source isa PlasmaGeometry3D
+
+    if PATCH_RAD > (min(source.mtheta, source.nzeta) - 1) ÷ 2
+        @warn "PATCH_RAD clamped in projected kernel" max_PATCH_RAD=(min(source.mtheta, source.nzeta) - 1) ÷ 2
+        PATCH_RAD = (min(source.mtheta, source.nzeta) - 1) ÷ 2
+    end
+    quad_data = get_singular_quadrature(PATCH_RAD, RAD_DIM, INTERP_ORDER)
+    (; PATCH_DIM, ANG_DIM, Ppou, Gpou, P2G) = quad_data
+
+    # Pre-transpose basis for contiguous column access in the inner loop
+    Ct = Matrix(cos_basis')   # [P × M]
+    St = Matrix(sin_basis')   # [P × M]
+
+    # [M × P] buffers for projected kernel rows.
+    # Row idx_obs = Σ_k K[idx_obs, k] · basis[k, :] — each observer writes to
+    # its own row, so no cross-thread races.
+    KZ_c = zeros(M, P)
+    KZ_s = zeros(M, P)
+    GZ_c = zeros(M, P)
+    GZ_s = zeros(M, P)
+
+    # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors)
+    max_tid = Threads.maxthreadid()
+    workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_tid]
+    proj_kc_all = [zeros(P) for _ in 1:max_tid]
+    proj_ks_all = [zeros(P) for _ in 1:max_tid]
+    proj_gc_all = [zeros(P) for _ in 1:max_tid]
+    proj_gs_all = [zeros(P) for _ in 1:max_tid]
+
+    Threads.@threads :static for idx_obs in 1:num_points
+        tid = Threads.threadid()
+        ws = workspaces[tid]
+        (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar,
+            n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws
+
+        proj_kc = proj_kc_all[tid]
+        proj_ks = proj_ks_all[tid]
+        proj_gc = proj_gc_all[tid]
+        proj_gs = proj_gs_all[tid]
+
+        fill!(proj_kc, 0.0)
+        fill!(proj_ks, 0.0)
+        fill!(proj_gc, 0.0)
+        fill!(proj_gs, 0.0)
+
+        i_obs = mod1(idx_obs, observer.mtheta)
+        j_obs = (idx_obs - 1) ÷ observer.mtheta + 1
+        r_obs = @view observer.r[idx_obs, :]
+
+        # ── FAR FIELD: Trapezoidal rule ──
+        @inbounds for idx_src in 1:num_points
+            r_src = @view source.r[idx_src, :]
+            n_src = @view source.normal[idx_src, :]
+            w_double = laplace_double_layer(r_obs, r_src, n_src) * dθdζ
+            @inbounds @simd for m in 1:P
+                proj_kc[m] += w_double * Ct[m, idx_src]
+                proj_ks[m] += w_double * St[m, idx_src]
+            end
+
+            if populate_greenfunction
+                w_single = laplace_single_layer(r_obs, r_src) * dθdζ
+                @inbounds @simd for m in 1:P
+                    proj_gc[m] += w_single * Ct[m, idx_src]
+                    proj_gs[m] += w_single * St[m, idx_src]
+                end
+            end
+        end
+
+        # ── NEAR FIELD: Polar quadrature with BIEST singular correction ──
+        extract_patch!(r_patch, source.r, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
+        extract_patch!(dr_dθ_patch, source.dr_dθ, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
+        extract_patch!(dr_dζ_patch, source.dr_dζ, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
+
+        interpolate_to_polar!(r_polar, r_patch, P2G)
+        interpolate_to_polar!(dr_dθ_polar, dr_dθ_patch, P2G)
+        interpolate_to_polar!(dr_dζ_polar, dr_dζ_patch, P2G)
+
+        compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient)
+
+        @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM
+            r_src = @view r_polar[ir, ia, :]
+            n_src = @view n_polar[ir, ia, :]
+            M_polar_single[ir, ia] = laplace_single_layer(r_obs, r_src) * Ppou[ir, ia] * dθdζ
+            M_polar_double[ir, ia] = laplace_double_layer(r_obs, r_src, n_src) * Ppou[ir, ia] * dθdζ
+        end
+
+        mul!(M_grid_single_flat, P2G, vec(M_polar_single))
+        mul!(M_grid_double_flat, P2G, vec(M_polar_double))
+        M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
+        M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM)
+
+        @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
+            idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta)
+            idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta)
+            idx_src = idx_pol + source.mtheta * (idx_tor - 1)
+
+            r_src = @view source.r[idx_src, :]
+            n_src = @view source.normal[idx_src, :]
+            far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[ii, jj] * dθdζ
+            w_double = M_grid_double[ii, jj] + far_double
+            @simd for m in 1:P
+                proj_kc[m] += w_double * Ct[m, idx_src]
+                proj_ks[m] += w_double * St[m, idx_src]
+            end
+
+            if populate_greenfunction
+                far_single = laplace_single_layer(r_obs, r_src) * Gpou[ii, jj] * dθdζ
+                w_single = M_grid_single[ii, jj] + far_single
+                @simd for m in 1:P
+                    proj_gc[m] += w_single * Ct[m, idx_src]
+                    proj_gs[m] += w_single * St[m, idx_src]
+                end
+            end
+        end
+
+        # ── Write projected row to buffer (each idx_obs owns its row) ──
+        @inbounds for m in 1:P
+            KZ_c[idx_obs, m] = proj_kc[m]
+            KZ_s[idx_obs, m] = proj_ks[m]
+        end
+        if populate_greenfunction
+            @inbounds for m in 1:P
+                GZ_c[idx_obs, m] = proj_gc[m]
+                GZ_s[idx_obs, m] = proj_gs[m]
+            end
+        end
+    end
+
+    # ── Assemble P×P projected matrices via GEMM (sequential, after barrier) ──
+    # K_c = Z^H K Z = (C'·KZ_c + S'·KZ_s) + i(C'·KZ_s − S'·KZ_c)
+    K_re = zeros(P, P)
+    K_im = zeros(P, P)
+    mul!(K_re, cos_basis', KZ_c)
+    mul!(K_re, sin_basis', KZ_s, 1.0, 1.0)
+    mul!(K_im, cos_basis', KZ_s)
+    mul!(K_im, sin_basis', KZ_c, -1.0, 1.0)
+
+    G_re = zeros(P, P)
+    G_im = zeros(P, P)
+    if populate_greenfunction
+        mul!(G_re, cos_basis', GZ_c)
+        mul!(G_re, sin_basis', GZ_s, 1.0, 1.0)
+        mul!(G_im, cos_basis', GZ_s)
+        mul!(G_im, sin_basis', GZ_c, -1.0, 1.0)
+    end
+
+    # ── Post-processing (mirrors compute_3D_kernel_matrices!) ──
+    K_re ./= 2π
+    K_im ./= 2π
+    G_re ./= 2π
+    G_im ./= 2π
+
+    # Diagonal: K += I → K_c += Gram [for same-type source/observer]
+    if typeof(source) == typeof(observer)
+        K_re .+= real.(Gram)
+        K_im .+= imag.(Gram)
+    end
+
+    K_c .= complex.(K_re, K_im)
+    G_c .= complex.(G_re, G_im)
+end
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 8796048d..1970786b 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -16,6 +16,7 @@ include("DataTypes.jl")
 include("PnQuadCache.jl")
 include("Kernel2D.jl")
 include("Kernel3D.jl")
+include("ProjectedKernel.jl")
 include("Field.jl")
 
 export VacuumInput, WallShapeSettings
@@ -105,36 +106,88 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     # Active rows for computation (plasma only if no wall, plasma+wall if wall present)
     num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf
 
-    # Local work arrays
-    grad_green = zeros!(pool, num_points_total, num_points_total)
-    green_temp = zeros!(pool, num_points_surf, num_points_surf)
-
     # Views into output Green's function matrices for the active rows/columns
     grre = @view grre_in[1:num_points_total, :]
     grri = @view grri_in[1:num_points_total, :]
 
-    # Plasma–Plasma block
-    pp_kernel_timing = @timed begin
-        kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
-    end
-    println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
-
-    if wall.nowall && inputs.use_galerkin
+    if wall.nowall && inputs.use_galerkin && inputs.fuse_projection
         # ================================================================
-        # Galerkin projection: solve in Fourier space [2P × 2P] instead of
-        # the full collocation system [num_points_surf × num_points_surf].
+        # Fused Galerkin: kernel assembly + Fourier projection in one pass.
+        # The full M×M kernel matrices are never materialized — instead the
+        # P×P projected matrices K_c and G_c are accumulated row by row as
+        # kernel values are computed.
         #
-        # Instead of:  wv = F_inv * (K \ (G * F))       O(M³)
-        # We compute:  wv ~ (F'KF) \ (F'GF)             O(M²P + P³)
+        # Memory:  O(MP + P²)  instead of  O(M²)
+        # FLOPs:   O(M²P + P³) — same as two-step Galerkin
+        # ================================================================
+        P = num_modes
+        M = num_points_surf
+
+        fused_timing = @timed begin
+            # Gram matrix Gram = Z^H Z  [P × P complex]
+            Gram = complex.(cos_mn_basis' * cos_mn_basis .+ sin_mn_basis' * sin_mn_basis,
+                cos_mn_basis' * sin_mn_basis .- sin_mn_basis' * cos_mn_basis)
+
+            # Fused projected kernel: K_c = Z^H K Z, G_c = Z^H G Z  [P × P complex]
+            K_c = zeros(ComplexF64, P, P)
+            G_c = zeros(ComplexF64, P, P)
+            projected_kernel!(K_c, G_c, plasma_surf, plasma_surf, kparams,
+                cos_mn_basis, sin_mn_basis, Gram)
+        end
+        println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
+
+        solve_timing = @timed begin
+            # Solve projected BIE via SVD-based pseudoinverse [pinv handles rank deficiency]
+            c_ext = pinv(K_c) * G_c
+
+            # Interior kernel: K_int = -K + 2I → K_c_int = 2·Gram - K_c
+            K_c_int = 2 .* Gram .- K_c
+            c_int = pinv(K_c_int) * G_c
+
+            # wv = (4π²/M) · Gram · c_ext  [P × P complex, Chance 2007 eq. 114]
+            wv .= (4π^2 / M) .* (Gram * c_ext)
+
+            # ── Backward-compatible reconstruction of real grri/grre ─────────
+            # The downstream code (ForceFreeStates, PerturbedEquilibrium) still expects
+            # grre/grri as [M × 2P] real matrices. Reconstruct from the complex P×P
+            # solution coefficients.  This section can be removed once the downstream
+            # modules are updated to work directly in mode space.
+            c_ext_r, c_ext_i = real.(c_ext), imag.(c_ext)
+            c_int_r, c_int_i = real.(c_int), imag.(c_int)
+
+            mul!(@view(grre[1:M, 1:P]), cos_mn_basis, c_ext_r)
+            mul!(@view(grre[1:M, 1:P]), sin_mn_basis, c_ext_i, -1.0, 1.0)
+            mul!(@view(grre[1:M, (P+1):(2*P)]), cos_mn_basis, c_ext_i)
+            mul!(@view(grre[1:M, (P+1):(2*P)]), sin_mn_basis, c_ext_r, 1.0, 1.0)
+
+            mul!(@view(grri[1:M, 1:P]), cos_mn_basis, c_int_r)
+            mul!(@view(grri[1:M, 1:P]), sin_mn_basis, c_int_i, -1.0, 1.0)
+            mul!(@view(grri[1:M, (P+1):(2*P)]), cos_mn_basis, c_int_i)
+            mul!(@view(grri[1:M, (P+1):(2*P)]), sin_mn_basis, c_int_r, 1.0, 1.0)
+        end
+        println(" Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
+
+    elseif wall.nowall && inputs.use_galerkin
+        # ================================================================
+        # Two-step Galerkin: full M×M kernel → project → solve in P×P.
         #
-        # where M = num_points_surf and P = num_modes and
-        # F = [cos_basis | sin_basis] is the [M × 2P] Fourier basis and
-        # K = grad_green is the [M × M] double-layer kernel matrix.
+        # Memory:  O(M²) for kernel storage
+        # FLOPs:   O(M²P + P³)
         # ================================================================
+
+        # Full-size kernel matrices
+        grad_green = zeros!(pool, num_points_total, num_points_total)
+        green_temp = zeros!(pool, num_points_surf, num_points_surf)
+
+        pp_kernel_timing = @timed begin
+            kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
+        end
+        println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
+
         temp = zeros!(pool, num_points_surf, num_modes)
 
         proj_timing = @timed begin
-            # K_proj = F' * grad_green * F  [2 * num_modes × 2 * num_modes]
+            # K_proj = F' * grad_green * F  [2P × 2P]
             K_proj = zeros(2 * num_modes, 2 * num_modes)
             mul!(temp, grad_green, cos_mn_basis)
             mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
@@ -143,7 +196,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
             mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
 
-            # G_proj = F' * green_temp * F  [2 * num_modes × 2 * num_modes]
+            # G_proj = F' * green_temp * F  [2P × 2P]
             G_proj = zeros(2 * num_modes, 2 * num_modes)
             mul!(temp, green_temp, cos_mn_basis)
             mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
@@ -152,34 +205,28 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
             mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
 
-            # Gram matrix F'F (needed for interior kernel and wv normalization)
+            # Gram matrix F'F
             FtF = zeros(2 * num_modes, 2 * num_modes)
             mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis)
             mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis)
             mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis)
             mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis)
 
-            # Solve projected systems via SVD-based pseudoinverse. The truncated Fourier
-            # basis with the SFL angle correction (ν) can make the projected operators
-            # rank-deficient — the interior BIE operator in particular has a physical
-            # null space (constant potential mode). The pseudoinverse finds the
-            # minimum-norm solution, correctly projecting out numerically null directions
-            # without affecting well-resolved modes.
+            # Solve projected systems via SVD-based pseudoinverse [pinv handles rank deficiency]
             Y_ext = pinv(K_proj) * G_proj
 
-            # Interior kernel in projected space: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj
+            # Interior kernel: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj
             K_proj_int = 2 .* FtF .- K_proj
             Y_int = pinv(K_proj_int) * G_proj
 
-            # Reconstruct physical-space Green's functions for backward compatibility
-            # grre = F * Y = cos * Y[1:P, :] + sin * Y[P+1:2P, :]
+            # ── Backward-compatible reconstruction of real grri/grre ─────────
+            # This section can be removed once downstream modules work in mode space.
             mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :]))
             mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
             mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :]))
             mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
 
-            # Extract wv: the [arr air; ari aii] blocks equal (4π²/M) * F'F * Y_ext,
-            # then wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114]
+            # wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114]
             wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext)
             wv .= complex.(
                 @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]),
@@ -187,12 +234,22 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             )
         end
         println(" Galerkin Project and Solve  TIME=$(round(proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))")
+
     else
         # ================================================================
         # Collocation approach: solve full physical-space system [M × M]
         # Handles both no-wall and wall cases.
         # ================================================================
 
+        # Full-size kernel matrices
+        grad_green = zeros!(pool, num_points_total, num_points_total)
+        green_temp = zeros!(pool, num_points_surf, num_points_surf)
+
+        pp_kernel_timing = @timed begin
+            kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
+        end
+        println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
+
         # FT plasma→plasma Green's function (must precede kernel! calls that overwrite green_temp)
         colloc_ft_timing = @timed begin
             fourier_transform!(grre, green_temp, cos_mn_basis)

From 2e1e728b7ee1a46be02f722875d2b509c8dba99a Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Sat, 14 Mar 2026 16:48:59 -0400
Subject: [PATCH 04/23] VACUUM - WIP - condensing trig operations into a single
 complex operation, some pooling optimizations

---
 src/Vacuum/ProjectedKernel.jl |  33 +++---
 src/Vacuum/Vacuum.jl          | 189 ++++++++++++++++------------------
 2 files changed, 104 insertions(+), 118 deletions(-)

diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl
index bd17a9a9..4bca9062 100644
--- a/src/Vacuum/ProjectedKernel.jl
+++ b/src/Vacuum/ProjectedKernel.jl
@@ -15,6 +15,9 @@
 # FLOP cost is identical to the two-step approach O(M²P), but memory drops
 # from O(M²) to O(MP + P²).
 
+# ============================================================================
+# 2D fused projected kernel
+# ============================================================================
 """
     projected_kernel!(K_c, G_c, observer, source, params, cos_basis, sin_basis, Gram)
 
@@ -37,38 +40,32 @@ Dispatches to the 2D or 3D implementation based on the geometry/params types.
 function projected_kernel! end
 
 function projected_kernel!(
-    K_c::Matrix{ComplexF64},
-    G_c::Matrix{ComplexF64},
+    K_c::AbstractMatrix{ComplexF64},
+    G_c::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
     params::KernelParams2D,
     cos_basis::Matrix{Float64},
     sin_basis::Matrix{Float64},
-    Gram::Matrix{ComplexF64}
+    Gram::AbstractMatrix{ComplexF64}
 )
     _projected_kernel_2D!(K_c, G_c, observer, source, params.n, cos_basis, sin_basis, Gram)
 end
 
 function projected_kernel!(
-    K_c::Matrix{ComplexF64},
-    G_c::Matrix{ComplexF64},
+    K_c::AbstractMatrix{ComplexF64},
+    G_c::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry3D,WallGeometry3D},
     source::Union{PlasmaGeometry3D,WallGeometry3D},
     params::KernelParams3D,
     cos_basis::Matrix{Float64},
     sin_basis::Matrix{Float64},
-    Gram::Matrix{ComplexF64}
+    Gram::AbstractMatrix{ComplexF64}
 )
     _projected_kernel_3D!(K_c, G_c, observer, source,
         params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER,
         cos_basis, sin_basis, Gram)
 end
-
-
-# ============================================================================
-# 2D fused projected kernel
-# ============================================================================
-
 """
     _projected_kernel_2D!(K_c, G_c, observer, source, n, cos_basis, sin_basis, Gram)
 
@@ -79,14 +76,14 @@ P×P projected matrices instead of filling the M×M kernel matrices.
 Memory: O(MP) instead of O(M²).
 """
 @with_pool pool function _projected_kernel_2D!(
-    K_c::Matrix{ComplexF64},
-    G_c::Matrix{ComplexF64},
+    K_c::AbstractMatrix{ComplexF64},
+    G_c::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
     n::Int,
     cos_basis::Matrix{Float64},
     sin_basis::Matrix{Float64},
-    Gram::Matrix{ComplexF64}
+    Gram::AbstractMatrix{ComplexF64}
 )
     M, P = size(cos_basis)
     mtheta = length(observer.x)
@@ -286,8 +283,8 @@ cross-thread accumulation races — the same write pattern as the original
 Memory: O(4MP + P²) instead of O(M²).
 """
 function _projected_kernel_3D!(
-    K_c::Matrix{ComplexF64},
-    G_c::Matrix{ComplexF64},
+    K_c::AbstractMatrix{ComplexF64},
+    G_c::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry3D,WallGeometry3D},
     source::Union{PlasmaGeometry3D,WallGeometry3D},
     PATCH_RAD::Int,
@@ -295,7 +292,7 @@ function _projected_kernel_3D!(
     INTERP_ORDER::Int,
     cos_basis::Matrix{Float64},
     sin_basis::Matrix{Float64},
-    Gram::Matrix{ComplexF64}
+    Gram::AbstractMatrix{ComplexF64}
 )
     M, P = size(cos_basis)
     num_points = observer.mtheta * observer.nzeta
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 1970786b..64decca8 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -72,28 +72,15 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     n_override::Union{Nothing,Int}=nothing
 )
 
+    (; mtheta, mpert, mlow, nzeta, npert, nlow, use_galerkin, fuse_projection) = inputs
+
     # Initialize surface geometries
-    geom_timing = @timed begin
-        plasma_surf = inputs.nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs)
-        wall = inputs.nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings)
-    end
-    println(" Compute geometry  TIME=$(round(geom_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(geom_timing.bytes))")
+    plasma_surf = nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs)
+    wall = nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings)
 
     # Compute Fourier basis coefficients
-    basis_timing = @timed begin
-        ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing
-        cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(
-            inputs.mtheta,
-            inputs.mpert,
-            inputs.mlow,
-            inputs.nzeta,
-            inputs.npert,
-            inputs.nlow;
-            n_2D=n_override,
-            ν=ν
-        )
-    end
-    println(" Compute Fourier basis  TIME=$(round(basis_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(basis_timing.bytes))")
+    ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing
+    cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν)
     num_points_surf, num_modes = size(cos_mn_basis)
 
     # Create kernel parameters structs used to dispatch to the correct kernel
@@ -101,7 +88,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     PATCH_RAD = 11
     RAD_DIM = 20
     INTERP_ORDER = 5
-    kparams = inputs.nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override)
+    kparams = nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override)
 
     # Active rows for computation (plasma only if no wall, plasma+wall if wall present)
     num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf
@@ -110,12 +97,12 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     grre = @view grre_in[1:num_points_total, :]
     grri = @view grri_in[1:num_points_total, :]
 
-    if wall.nowall && inputs.use_galerkin && inputs.fuse_projection
+    if wall.nowall && use_galerkin && fuse_projection
         # ================================================================
         # Fused Galerkin: kernel assembly + Fourier projection in one pass.
         # The full M×M kernel matrices are never materialized — instead the
-        # P×P projected matrices K_c and G_c are accumulated row by row as
-        # kernel values are computed.
+        # P×P projected matrices grad_green_fourier and G_c are accumulated
+        # row by row as kernel values are computed.
         #
         # Memory:  O(MP + P²)  instead of  O(M²)
         # FLOPs:   O(M²P + P³) — same as two-step Galerkin
@@ -123,115 +110,117 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         P = num_modes
         M = num_points_surf
 
-        fused_timing = @timed begin
-            # Gram matrix Gram = Z^H Z  [P × P complex]
-            Gram = complex.(cos_mn_basis' * cos_mn_basis .+ sin_mn_basis' * sin_mn_basis,
-                cos_mn_basis' * sin_mn_basis .- sin_mn_basis' * cos_mn_basis)
+        # Temporary matrices
+        exp_mn_basis = zeros!(pool, ComplexF64, M, P)
+        exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis)
+        Gram = zeros!(pool, ComplexF64, P, P)
+
+        # Projected kernel matrices
+        grad_green_fourier = zeros!(pool, ComplexF64, P, P)
+        green_fourier = zeros!(pool, ComplexF64, P, P)
+        grad_green_fourier_int = similar!(pool, grad_green_fourier)
+        green_fourier_int = similar!(pool, green_fourier)
 
-            # Fused projected kernel: K_c = Z^H K Z, G_c = Z^H G Z  [P × P complex]
-            K_c = zeros(ComplexF64, P, P)
-            G_c = zeros(ComplexF64, P, P)
-            projected_kernel!(K_c, G_c, plasma_surf, plasma_surf, kparams,
+        fused_timing = @timed begin
+            # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z  [P × P complex]
+            projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams,
                 cos_mn_basis, sin_mn_basis, Gram)
         end
         println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
 
         solve_timing = @timed begin
-            # Solve projected BIE via SVD-based pseudoinverse [pinv handles rank deficiency]
-            c_ext = pinv(K_c) * G_c
+            # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
+            # Gram matrix Gram = Z^H Z  [P × P complex]
+            mul!(Gram, exp_mn_basis', exp_mn_basis)
+            grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier
+            green_fourier_int .= green_fourier
 
-            # Interior kernel: K_int = -K + 2I → K_c_int = 2·Gram - K_c
-            K_c_int = 2 .* Gram .- K_c
-            c_int = pinv(K_c_int) * G_c
+            # Solve projected BIEs for exterior and interior kernels
+            F = lu!(grad_green_fourier)
+            ldiv!(F, green_fourier)
+            F = lu!(grad_green_fourier_int)
+            ldiv!(F, green_fourier_int)
 
             # wv = (4π²/M) · Gram · c_ext  [P × P complex, Chance 2007 eq. 114]
-            wv .= (4π^2 / M) .* (Gram * c_ext)
+            wv .= (4π^2 / M) .* (Gram * green_fourier)
 
             # ── Backward-compatible reconstruction of real grri/grre ─────────
-            # The downstream code (ForceFreeStates, PerturbedEquilibrium) still expects
-            # grre/grri as [M × 2P] real matrices. Reconstruct from the complex P×P
-            # solution coefficients.  This section can be removed once the downstream
-            # modules are updated to work directly in mode space.
-            c_ext_r, c_ext_i = real.(c_ext), imag.(c_ext)
-            c_int_r, c_int_i = real.(c_int), imag.(c_int)
-
-            mul!(@view(grre[1:M, 1:P]), cos_mn_basis, c_ext_r)
-            mul!(@view(grre[1:M, 1:P]), sin_mn_basis, c_ext_i, -1.0, 1.0)
-            mul!(@view(grre[1:M, (P+1):(2*P)]), cos_mn_basis, c_ext_i)
-            mul!(@view(grre[1:M, (P+1):(2*P)]), sin_mn_basis, c_ext_r, 1.0, 1.0)
-
-            mul!(@view(grri[1:M, 1:P]), cos_mn_basis, c_int_r)
-            mul!(@view(grri[1:M, 1:P]), sin_mn_basis, c_int_i, -1.0, 1.0)
-            mul!(@view(grri[1:M, (P+1):(2*P)]), cos_mn_basis, c_int_i)
-            mul!(@view(grri[1:M, (P+1):(2*P)]), sin_mn_basis, c_int_r, 1.0, 1.0)
+            # Reconstruct M×2P real from P×P complex: grre = real(Z·c_ext), imag(Z·c_ext).
+            # This section can be removed once downstream modules work in mode space.
+            temp = zeros!(pool, ComplexF64, M, P)
+            mul!(temp, exp_mn_basis, green_fourier)
+            @view(grre[1:M, 1:P]) .= real.(temp)
+            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
+            mul!(temp, exp_mn_basis, green_fourier_int)
+            @view(grri[1:M, 1:P]) .= real.(temp)
+            @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
         end
         println(" Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
 
-    elseif wall.nowall && inputs.use_galerkin
+    elseif wall.nowall && use_galerkin
         # ================================================================
         # Two-step Galerkin: full M×M kernel → project → solve in P×P.
+        # Uses complex basis Z = C + iS so projected matrices are P×P complex.
         #
         # Memory:  O(M²) for kernel storage
         # FLOPs:   O(M²P + P³)
         # ================================================================
 
+        P = num_modes
+        M = num_points_surf
+
         # Full-size kernel matrices
         grad_green = zeros!(pool, num_points_total, num_points_total)
         green_temp = zeros!(pool, num_points_surf, num_points_surf)
 
+        # Projected kernel matrices
+        grad_green_fourier = zeros!(pool, ComplexF64, P, P)
+        green_fourier = zeros!(pool, ComplexF64, P, P)
+        Gram = zeros!(pool, ComplexF64, P, P)
+        green_fourier_int = similar!(pool, green_fourier)
+        grad_green_fourier_int = similar!(pool, grad_green_fourier)
+
+        # Temporary matrices
+        exp_mn_basis = zeros!(pool, ComplexF64, M, P)
+        exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis)
+        temp = zeros!(pool, ComplexF64, M, P)
+
         pp_kernel_timing = @timed begin
             kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
         end
         println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
 
-        temp = zeros!(pool, num_points_surf, num_modes)
-
         proj_timing = @timed begin
-            # K_proj = F' * grad_green * F  [2P × 2P]
-            K_proj = zeros(2 * num_modes, 2 * num_modes)
-            mul!(temp, grad_green, cos_mn_basis)
-            mul!(@view(K_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
-            mul!(@view(K_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp)
-            mul!(temp, grad_green, sin_mn_basis)
-            mul!(@view(K_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
-            mul!(@view(K_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
-
-            # G_proj = F' * green_temp * F  [2P × 2P]
-            G_proj = zeros(2 * num_modes, 2 * num_modes)
-            mul!(temp, green_temp, cos_mn_basis)
-            mul!(@view(G_proj[1:num_modes, 1:num_modes]), cos_mn_basis', temp)
-            mul!(@view(G_proj[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', temp)
-            mul!(temp, green_temp, sin_mn_basis)
-            mul!(@view(G_proj[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', temp)
-            mul!(@view(G_proj[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', temp)
-
-            # Gram matrix F'F
-            FtF = zeros(2 * num_modes, 2 * num_modes)
-            mul!(@view(FtF[1:num_modes, 1:num_modes]), cos_mn_basis', cos_mn_basis)
-            mul!(@view(FtF[1:num_modes, (num_modes+1):(2*num_modes)]), cos_mn_basis', sin_mn_basis)
-            mul!(@view(FtF[(num_modes+1):(2*num_modes), 1:num_modes]), sin_mn_basis', cos_mn_basis)
-            mul!(@view(FtF[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]), sin_mn_basis', sin_mn_basis)
-
-            # Solve projected systems via SVD-based pseudoinverse [pinv handles rank deficiency]
-            Y_ext = pinv(K_proj) * G_proj
-
-            # Interior kernel: K_int = -K + 2I → K_proj_int = 2*F'F - K_proj
-            K_proj_int = 2 .* FtF .- K_proj
-            Y_int = pinv(K_proj_int) * G_proj
+            # Project matrices to mode space
+            # grad_green_fourier = Z^H * grad_green * Z
+            mul!(temp, grad_green, exp_mn_basis)
+            mul!(grad_green_fourier, exp_mn_basis', temp)
+            # green_fourier = Z^H * green_temp * Z
+            mul!(temp, green_temp, exp_mn_basis)
+            mul!(green_fourier, exp_mn_basis', temp)
+
+            # Interior kernel: grad_green_fourier_int = 2·Gram - grad_green_fourier
+            # Gram = Z^H Z  [P × P complex]
+            mul!(Gram, exp_mn_basis', exp_mn_basis)
+            grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier
+
+            # Solve projected BIEs for exterior and interior kernels
+            F = lu!(grad_green_fourier)
+            ldiv!(F, green_fourier)
+            F = lu!(grad_green_fourier_int)
+            ldiv!(F, green_fourier_int)
+
+            # wv = (4π²/M) · Gram · green_fourier  [P × P complex, Chance 2007 eq. 114]
+            wv .= (4π^2 / M) .* (Gram * green_fourier)
 
             # ── Backward-compatible reconstruction of real grri/grre ─────────
-            # This section can be removed once downstream modules work in mode space.
-            mul!(grre, cos_mn_basis, @view(Y_ext[1:num_modes, :]))
-            mul!(grre, sin_mn_basis, @view(Y_ext[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
-            mul!(grri, cos_mn_basis, @view(Y_int[1:num_modes, :]))
-            mul!(grri, sin_mn_basis, @view(Y_int[(num_modes+1):(2*num_modes), :]), 1.0, 1.0)
-
-            # wv = complex(arr + aii, air - ari) [Chance 2007 eq. 114]
-            wv_blocks = (4π^2 / num_points_surf) .* (FtF * Y_ext)
-            wv .= complex.(
-                @view(wv_blocks[1:num_modes, 1:num_modes]) .+ @view(wv_blocks[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)]),
-                @view(wv_blocks[1:num_modes, (num_modes+1):(2*num_modes)]) .- @view(wv_blocks[(num_modes+1):(2*num_modes), 1:num_modes])
-            )
+            # Reconstruct M×2P real from P×P complex: grre = real(Z·c_ext), imag(Z·c_ext).
+            mul!(temp, exp_mn_basis, green_fourier)
+            @view(grre[1:M, 1:P]) .= real.(temp)
+            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
+            mul!(temp, exp_mn_basis, green_fourier_int)
+            @view(grri[1:M, 1:P]) .= real.(temp)
+            @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
         end
         println(" Galerkin Project and Solve  TIME=$(round(proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))")
 
@@ -306,7 +295,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
 
     inputs.force_wv_symmetry && hermitianpart!(wv)
 
-    if inputs.nzeta > 1 # 3D
+    if nzeta > 1 # 3D
         plasma_pts .= plasma_surf.r
         wall_pts .= wall.r
     else # 2D

From feb07e4e2f33330b033f044506507f9f318bcb75 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Sat, 14 Mar 2026 16:56:19 -0400
Subject: [PATCH 05/23] VACUUM - WIP - combining galerkin and fused galerkin
 into one main branch with a small kernel sub-branch for fused or not

---
 .../Solovev_ideal_example_3D/run_example.jl   |   4 +
 src/Vacuum/Vacuum.jl                          | 125 ++++++------------
 2 files changed, 41 insertions(+), 88 deletions(-)
 create mode 100644 examples/Solovev_ideal_example_3D/run_example.jl

diff --git a/examples/Solovev_ideal_example_3D/run_example.jl b/examples/Solovev_ideal_example_3D/run_example.jl
new file mode 100644
index 00000000..fa0a9e53
--- /dev/null
+++ b/examples/Solovev_ideal_example_3D/run_example.jl
@@ -0,0 +1,4 @@
+using Pkg;
+Pkg.activate(joinpath(@__DIR__, "../.."))
+using GeneralizedPerturbedEquilibrium
+GeneralizedPerturbedEquilibrium.main([dirname(@__FILE__)])
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 64decca8..2004ffc4 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -97,124 +97,73 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     grre = @view grre_in[1:num_points_total, :]
     grri = @view grri_in[1:num_points_total, :]
 
-    if wall.nowall && use_galerkin && fuse_projection
+    if wall.nowall && use_galerkin
         # ================================================================
-        # Fused Galerkin: kernel assembly + Fourier projection in one pass.
-        # The full M×M kernel matrices are never materialized — instead the
-        # P×P projected matrices grad_green_fourier and G_c are accumulated
-        # row by row as kernel values are computed.
+        # Galerkin: solve in P×P mode space. Uses complex basis Z = C + iS
+        # so projected matrices are P×P complex.
         #
+        # Fused (fuse_projection=true): kernel assembly + Fourier projection
+        # in one pass. The full M×M kernel matrices are never materialized —
+        # instead the P×P projected matrices grad_green_fourier and green_fourier
+        # are accumulated row by row as kernel values are computed.
         # Memory:  O(MP + P²)  instead of  O(M²)
-        # FLOPs:   O(M²P + P³) — same as two-step Galerkin
+        #
+        # Two-step (fuse_projection=false): full M×M kernel → project → solve.
+        # Memory:  O(M²) for kernel storage
+        #
+        # FLOPs (both):  O(M²P + P³)
         # ================================================================
         P = num_modes
         M = num_points_surf
 
-        # Temporary matrices
+        # Temporary and projected kernel matrices [P × P complex]
         exp_mn_basis = zeros!(pool, ComplexF64, M, P)
         exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis)
         Gram = zeros!(pool, ComplexF64, P, P)
-
-        # Projected kernel matrices
         grad_green_fourier = zeros!(pool, ComplexF64, P, P)
         green_fourier = zeros!(pool, ComplexF64, P, P)
         grad_green_fourier_int = similar!(pool, grad_green_fourier)
         green_fourier_int = similar!(pool, green_fourier)
-
-        fused_timing = @timed begin
-            # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z  [P × P complex]
-            projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams,
-                cos_mn_basis, sin_mn_basis, Gram)
-        end
-        println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
-
-        solve_timing = @timed begin
-            # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
-            # Gram matrix Gram = Z^H Z  [P × P complex]
-            mul!(Gram, exp_mn_basis', exp_mn_basis)
-            grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier
-            green_fourier_int .= green_fourier
-
-            # Solve projected BIEs for exterior and interior kernels
-            F = lu!(grad_green_fourier)
-            ldiv!(F, green_fourier)
-            F = lu!(grad_green_fourier_int)
-            ldiv!(F, green_fourier_int)
-
-            # wv = (4π²/M) · Gram · c_ext  [P × P complex, Chance 2007 eq. 114]
-            wv .= (4π^2 / M) .* (Gram * green_fourier)
-
-            # ── Backward-compatible reconstruction of real grri/grre ─────────
-            # Reconstruct M×2P real from P×P complex: grre = real(Z·c_ext), imag(Z·c_ext).
-            # This section can be removed once downstream modules work in mode space.
-            temp = zeros!(pool, ComplexF64, M, P)
-            mul!(temp, exp_mn_basis, green_fourier)
-            @view(grre[1:M, 1:P]) .= real.(temp)
-            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-            mul!(temp, exp_mn_basis, green_fourier_int)
-            @view(grri[1:M, 1:P]) .= real.(temp)
-            @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
-        end
-        println(" Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
-
-    elseif wall.nowall && use_galerkin
-        # ================================================================
-        # Two-step Galerkin: full M×M kernel → project → solve in P×P.
-        # Uses complex basis Z = C + iS so projected matrices are P×P complex.
-        #
-        # Memory:  O(M²) for kernel storage
-        # FLOPs:   O(M²P + P³)
-        # ================================================================
-
-        P = num_modes
-        M = num_points_surf
-
-        # Full-size kernel matrices
-        grad_green = zeros!(pool, num_points_total, num_points_total)
-        green_temp = zeros!(pool, num_points_surf, num_points_surf)
-
-        # Projected kernel matrices
-        grad_green_fourier = zeros!(pool, ComplexF64, P, P)
-        green_fourier = zeros!(pool, ComplexF64, P, P)
-        Gram = zeros!(pool, ComplexF64, P, P)
-        green_fourier_int = similar!(pool, green_fourier)
-        grad_green_fourier_int = similar!(pool, grad_green_fourier)
-
-        # Temporary matrices
-        exp_mn_basis = zeros!(pool, ComplexF64, M, P)
-        exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis)
         temp = zeros!(pool, ComplexF64, M, P)
 
-        pp_kernel_timing = @timed begin
-            kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
-        end
-        println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
-
-        proj_timing = @timed begin
-            # Project matrices to mode space
-            # grad_green_fourier = Z^H * grad_green * Z
+        if fuse_projection
+            # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
+            fused_timing = @timed begin
+                projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams,
+                    cos_mn_basis, sin_mn_basis, Gram)
+            end
+            println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
+        else
+            # Full-size kernel matrices, then project to mode space
+            grad_green = zeros!(pool, num_points_total, num_points_total)
+            green_temp = zeros!(pool, num_points_surf, num_points_surf)
+            pp_kernel_timing = @timed begin
+                kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
+            end
+            println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
+            # Project the kernels to mode space - Z^H * K * Z and Z^H * G * Z
             mul!(temp, grad_green, exp_mn_basis)
             mul!(grad_green_fourier, exp_mn_basis', temp)
-            # green_fourier = Z^H * green_temp * Z
             mul!(temp, green_temp, exp_mn_basis)
             mul!(green_fourier, exp_mn_basis', temp)
+        end
 
-            # Interior kernel: grad_green_fourier_int = 2·Gram - grad_green_fourier
-            # Gram = Z^H Z  [P × P complex]
+        solve_timing = @timed begin
+            # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
             mul!(Gram, exp_mn_basis', exp_mn_basis)
             grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier
+            green_fourier_int .= green_fourier
 
-            # Solve projected BIEs for exterior and interior kernels
+            # Solve projected BIEs for exterior and interior.
             F = lu!(grad_green_fourier)
             ldiv!(F, green_fourier)
             F = lu!(grad_green_fourier_int)
             ldiv!(F, green_fourier_int)
 
-            # wv = (4π²/M) · Gram · green_fourier  [P × P complex, Chance 2007 eq. 114]
+            # wv = (4π²/M) · Gram · green_fourier  [Chance 2007 eq. 114]
             wv .= (4π^2 / M) .* (Gram * green_fourier)
 
-            # ── Backward-compatible reconstruction of real grri/grre ─────────
-            # Reconstruct M×2P real from P×P complex: grre = real(Z·c_ext), imag(Z·c_ext).
+            # Backward-compatible reconstruction: grre/grri = [real(Z·c) | imag(Z·c)], each M×2P real.
             mul!(temp, exp_mn_basis, green_fourier)
             @view(grre[1:M, 1:P]) .= real.(temp)
             @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
@@ -222,7 +171,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             @view(grri[1:M, 1:P]) .= real.(temp)
             @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
         end
-        println(" Galerkin Project and Solve  TIME=$(round(proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))")
+        println(" Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
 
     else
         # ================================================================

From 0e3b7e5fdaf894ca82e7062f9cdc26fc33a140b3 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Sat, 14 Mar 2026 18:17:41 -0400
Subject: [PATCH 06/23] VACUUM - WIP - consolidating cos/sin_mn_basis into
 exp_mn_basis and propagating it through the vacuum module

---
 benchmarks/benchmark_fourier_transforms.jl |  44 ++---
 src/Utilities/FourierTransforms.jl         |  30 ++-
 src/Vacuum/ProjectedKernel.jl              | 202 ++++++---------------
 src/Vacuum/Vacuum.jl                       |  82 +++++----
 test/runtests_vacuum.jl                    |  76 ++++++++
 5 files changed, 210 insertions(+), 224 deletions(-)

diff --git a/benchmarks/benchmark_fourier_transforms.jl b/benchmarks/benchmark_fourier_transforms.jl
index 3a978417..840b63f6 100644
--- a/benchmarks/benchmark_fourier_transforms.jl
+++ b/benchmarks/benchmark_fourier_transforms.jl
@@ -28,10 +28,10 @@ function extract_modes(fft_result, mlow, mhigh, mtheta)
     for (i, m) in enumerate(mlow:mhigh)
         if m >= 0
             # Positive frequencies
-            modes[i] = fft_result[m + 1] / mtheta  # FFT normalization
+            modes[i] = fft_result[m+1] / mtheta  # FFT normalization
         else
             # Negative frequencies (wrap around)
-            modes[i] = fft_result[mtheta + m + 1] / mtheta
+            modes[i] = fft_result[mtheta+m+1] / mtheta
         end
     end
     return modes
@@ -39,10 +39,10 @@ end
 
 # Test configurations
 test_cases = [
-    (name="Small (mtheta=128, mpert=10)",   mtheta=128,  mpert=10,  mlow=-5),
-    (name="Medium (mtheta=256, mpert=20)",  mtheta=256,  mpert=20,  mlow=-10),
-    (name="Large (mtheta=480, mpert=40)",   mtheta=480,  mpert=40,  mlow=-20),
-    (name="Very Large (mtheta=1024, mpert=80)", mtheta=1024, mpert=80, mlow=-40),
+    (name="Small (mtheta=128, mpert=10)", mtheta=128, mpert=10, mlow=-5),
+    (name="Medium (mtheta=256, mpert=20)", mtheta=256, mpert=20, mlow=-10),
+    (name="Large (mtheta=480, mpert=40)", mtheta=480, mpert=40, mlow=-20),
+    (name="Very Large (mtheta=1024, mpert=80)", mtheta=1024, mpert=80, mlow=-40)
 ]
 
 for test in test_cases
@@ -56,7 +56,7 @@ for test in test_cases
     mhigh = mlow + mpert - 1
 
     # Create test data
-    theta = range(0, 2π, length=mtheta+1)[1:end-1]
+    theta = range(0, 2π; length=mtheta+1)[1:(end-1)]
     data = sin.(3 .* theta) .+ 0.5 .* cos.(7 .* theta) .+ 0.2 .* sin.(11 .* theta)
 
     # Initialize FourierTransform
@@ -67,7 +67,8 @@ for test in test_cases
     theta_buffer = zeros(ComplexF64, mtheta)
 
     # Pre-allocate for low-level API
-    cslth, snlth = compute_fourier_coefficients(mtheta, mpert, mlow)
+    exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow)
+    cslth, snlth = real(exp_mn_basis), imag(exp_mn_basis)
     gij = reshape(data, mtheta, 1)  # Matrix form
     gil = zeros(Float64, mtheta, mpert)
 
@@ -99,7 +100,7 @@ for test in test_cases
     # Note: Our transform uses a different normalization and basis
     println("\n--- Accuracy Check ---")
     println("FourierTransform allocating vs in-place: ",
-            @sprintf("%.2e", maximum(abs.(modes_alloc .- modes_buffer))))
+        @sprintf("%.2e", maximum(abs.(modes_alloc .- modes_buffer))))
 
     # Compare magnitudes of modes (since basis might differ)
     println("Mode magnitudes comparison (FourierTransform vs FFTW):")
@@ -129,9 +130,9 @@ for test in test_cases
     full_modes = zeros(ComplexF64, mtheta)
     for (i, m) in enumerate(mlow:mhigh)
         if m >= 0
-            full_modes[m + 1] = modes_test[i]
+            full_modes[m+1] = modes_test[i]
         else
-            full_modes[mtheta + m + 1] = modes_test[i]
+            full_modes[mtheta+m+1] = modes_test[i]
         end
     end
     t6 = @benchmark ifft($full_modes)
@@ -140,21 +141,21 @@ for test in test_cases
     # Accuracy check
     println("\n--- Inverse Accuracy Check ---")
     println("inverse() allocating vs in-place: ",
-            @sprintf("%.2e", maximum(abs.(theta_alloc .- theta_buffer))))
+        @sprintf("%.2e", maximum(abs.(theta_alloc .- theta_buffer))))
     println("Round-trip error (real part):     ",
-            @sprintf("%.2e", maximum(abs.(real.(theta_alloc) .- data))))
+        @sprintf("%.2e", maximum(abs.(real.(theta_alloc) .- data))))
 
     # Performance summary
     println("\n--- Performance Summary ---")
     println(@sprintf("Forward transform speedup (in-place vs allocating): %.2fx",
-            median(t1).time / median(t2).time))
+        median(t1).time / median(t2).time))
     println(@sprintf("Allocations eliminated: %d → %d",
-            t1.allocs, t2.allocs))
+        t1.allocs, t2.allocs))
 
     # Compare to FFTW
     println(@sprintf("\nFourier vs FFTW (forward): %.2fx %s",
-            abs(median(t2).time / median(t3).time),
-            median(t2).time < median(t3).time ? "faster" : "slower"))
+        abs(median(t2).time / median(t3).time),
+        median(t2).time < median(t3).time ? "faster" : "slower"))
     println("Note: FFTW computes full DFT (all N modes), we compute truncated series ($mpert modes)")
 end
 
@@ -169,7 +170,7 @@ mlow = -10
 nbatch = 10  # Transform 10 functions simultaneously
 
 ft = FourierTransform(mtheta, mpert, mlow)
-theta = range(0, 2π, length=mtheta+1)[1:end-1]
+theta = range(0, 2π; length=mtheta+1)[1:(end-1)]
 
 # Create batch data
 data_matrix = zeros(Float64, mtheta, nbatch)
@@ -182,7 +183,7 @@ modes_matrix = zeros(ComplexF64, mpert, nbatch)
 println("\nTransforming $nbatch functions of length $mtheta:")
 
 print("Allocating (loop):  ")
-@btime for i in 1:$nbatch
+@btime for i in 1:($nbatch)
     modes = $ft($data_matrix[:, i])
 end
 
@@ -191,7 +192,7 @@ print("Allocating (matrix):")
 
 print("In-place (loop):    ")
 modes_buffer = zeros(ComplexF64, mpert)
-@btime for i in 1:$nbatch
+@btime for i in 1:($nbatch)
     transform!($modes_buffer, $ft, $data_matrix[:, i])
 end
 
@@ -208,7 +209,8 @@ mpert = 10
 mlow = 1
 
 # Setup for low-level API
-cslth, snlth = compute_fourier_coefficients(mtheta, mpert, mlow)
+exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow)
+cslth, snlth = real(exp_mn_basis), imag(exp_mn_basis)
 gij = randn(mtheta, mtheta)  # Green's function matrix
 gil = zeros(Float64, mtheta, mpert)
 
diff --git a/src/Utilities/FourierTransforms.jl b/src/Utilities/FourierTransforms.jl
index 36a53e0b..ae3fe8ce 100644
--- a/src/Utilities/FourierTransforms.jl
+++ b/src/Utilities/FourierTransforms.jl
@@ -66,19 +66,16 @@ just use the n argument. In 3D, we need to compute the basis for all modes and g
 # Returns
 
 - 2D
-  - `cos_mn_basis::Matrix{Float64}`: Cosine coefficients `cos(m*θ - n*ν)` [mtheta, mpert]
-  - `sin_mn_basis::Matrix{Float64}`: Sine coefficients `sin(m*θ - n*ν)` [mtheta, mpert]
+  - `exp_mn_basis::Matrix{ComplexF64}`: Exponential coefficients `exp(i(m*θ - n*ν))` [mtheta, mpert]
 - 3D
-  - `cos_mn_basis::Matrix{Float64}`: Cosine coefficients `cos(m*θ - n*ν - n*ϕ)` [mtheta * nzeta, mpert * npert]
-  - `sin_mn_basis::Matrix{Float64}`: Sine coefficients `sin(m*θ - n*ν - n*ϕ)` [mtheta * nzeta, mpert * npert]
+  - `exp_mn_basis::Matrix{ComplexF64}`: Exponential coefficients `exp(i(m*θ - n*ϕ))` [mtheta * nzeta, mpert * npert]
 
 # Notes
 
 The theta and phi grids are uniform: `θᵢ = 2π*i/mtheta` for `i = 0:mtheta-1` and `ϕⱼ = 2π*j/nzeta` for `j = 0:nzeta-1`
 
 When `n=0, ν=0` (default), this reduces to simple harmonic basis:
-- `cos_mn_basis[i,l] = cos(m*θᵢ)`
-- `sin_mn_basis[i,l] = sin(m*θᵢ)`
+- `exp_mn_basis[i,l] = exp(im*m*θᵢ)` with `m = mlow + l - 1`
 """
 function compute_fourier_coefficients(
     mtheta::Int,
@@ -100,17 +97,15 @@ function compute_fourier_coefficients(
         @assert length(ν) == mtheta "ν must have length mtheta"
 
         # In 2D, we only use one toroidal mode at a time
-        # Compute sin(mθ - nν) and cos(mθ - nν)
-        sin_mn_basis = sin.((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν)
-        cos_mn_basis = cos.((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν)
+        # Compute exp(i(mθ - nν))
+        exp_mn_basis = exp.(im .* ((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν))
     else # 3D
         @assert (n_2D === nothing && ν === nothing) "n_2D and ν should be nothing for 3D"
 
         # In 3D, we need to compute the basis for all modes and grid points
-        # Compute sin(mθ - nζ) and cos(mθ - nζ)
+        # Compute exp(i(mθ - nζ))
         ζ_grid = range(; start=0, length=nzeta, step=2π/nzeta)
-        sin_mn_basis = zeros(mtheta * nzeta, mpert * npert)
-        cos_mn_basis = zeros(mtheta * nzeta, mpert * npert)
+        exp_mn_basis = zeros(ComplexF64, mtheta * nzeta, mpert * npert)
         for idx_n in 1:npert
             n = nlow + idx_n - 1
             n_col_offset = (idx_n - 1) * mpert
@@ -119,16 +114,13 @@ function compute_fourier_coefficients(
                 col = idx_m + n_col_offset
                 for (j, ζ) in enumerate(ζ_grid), (i, θ) in enumerate(θ_grid)
                     idx = i + (j-1)*mtheta
-                    arg = m * θ - n * ζ
-                    s, c = sincos(arg)
-                    cos_mn_basis[idx, col] = c
-                    sin_mn_basis[idx, col] = s
+                    exp_mn_basis[idx, col] = exp(im * (m * θ - n * ζ))
                 end
             end
         end
     end
 
-    return cos_mn_basis, sin_mn_basis
+    return exp_mn_basis
 end
 
 """
@@ -207,8 +199,8 @@ function FourierTransform(
     n::Int=0,
     ν::Vector{Float64}=zeros(Float64, mtheta)
 )
-    cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, 1, 1, 1; n_2D=n, ν=ν)
-    return FourierTransform(mtheta, mpert, mlow, cos_mn_basis, sin_mn_basis)
+    exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, 1, 1, 1; n_2D=n, ν=ν)
+    return FourierTransform(mtheta, mpert, mlow, real(exp_mn_basis), imag(exp_mn_basis))
 end
 
 """
diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl
index 4bca9062..06f8b3f2 100644
--- a/src/Vacuum/ProjectedKernel.jl
+++ b/src/Vacuum/ProjectedKernel.jl
@@ -19,7 +19,7 @@
 # 2D fused projected kernel
 # ============================================================================
 """
-    projected_kernel!(K_c, G_c, observer, source, params, cos_basis, sin_basis, Gram)
+    projected_kernel!(K_c, G_c, observer, source, params, exp_mn_basis, Gram)
 
 Compute the Fourier-projected kernel matrices K_c = Z^H K Z and G_c = Z^H G Z
 directly, without materializing the full M×M kernel matrices.
@@ -33,8 +33,7 @@ Dispatches to the 2D or 3D implementation based on the geometry/params types.
   - `observer`: Observer geometry struct
   - `source`: Source geometry struct
   - `params`: Kernel parameters (KernelParams2D or KernelParams3D)
-  - `cos_basis::Matrix{Float64}`: [M × P] cosine Fourier basis
-  - `sin_basis::Matrix{Float64}`: [M × P] sine Fourier basis
+  - `exp_mn_basis::Matrix{ComplexF64}`: [M × P] complex Fourier basis Z = exp(i(mθ − nζ))
   - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term)
 """
 function projected_kernel! end
@@ -45,11 +44,10 @@ function projected_kernel!(
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
     params::KernelParams2D,
-    cos_basis::Matrix{Float64},
-    sin_basis::Matrix{Float64},
+    exp_mn_basis::AbstractMatrix{ComplexF64},
     Gram::AbstractMatrix{ComplexF64}
 )
-    _projected_kernel_2D!(K_c, G_c, observer, source, params.n, cos_basis, sin_basis, Gram)
+    _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram)
 end
 
 function projected_kernel!(
@@ -58,16 +56,15 @@ function projected_kernel!(
     observer::Union{PlasmaGeometry3D,WallGeometry3D},
     source::Union{PlasmaGeometry3D,WallGeometry3D},
     params::KernelParams3D,
-    cos_basis::Matrix{Float64},
-    sin_basis::Matrix{Float64},
+    exp_mn_basis::AbstractMatrix{ComplexF64},
     Gram::AbstractMatrix{ComplexF64}
 )
     _projected_kernel_3D!(K_c, G_c, observer, source,
         params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER,
-        cos_basis, sin_basis, Gram)
+        exp_mn_basis, Gram)
 end
 """
-    _projected_kernel_2D!(K_c, G_c, observer, source, n, cos_basis, sin_basis, Gram)
+    _projected_kernel_2D!(K_c, G_c, observer, source, n, exp_mn_basis, Gram)
 
 Fused 2D kernel assembly + projection. Mirrors the loop structure of
 `compute_2D_kernel_matrices!` but accumulates rank-1 contributions into the
@@ -81,11 +78,11 @@ Memory: O(MP) instead of O(M²).
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
     n::Int,
-    cos_basis::Matrix{Float64},
-    sin_basis::Matrix{Float64},
+    exp_mn_basis::AbstractMatrix{ComplexF64},
     Gram::AbstractMatrix{ComplexF64}
 )
-    M, P = size(cos_basis)
+    M, P = size(exp_mn_basis)
+    Z = exp_mn_basis
     mtheta = length(observer.x)
     dtheta = 2π / mtheta
     theta_grid = range(; start=0, length=mtheta, step=dtheta)
@@ -113,31 +110,19 @@ Memory: O(MP) instead of O(M²).
     d1_spline_x(dx_dtheta_grid, theta_grid)
     d1_spline_z(dz_dtheta_grid, theta_grid)
 
-    # Pre-transpose basis for contiguous column access: Ct[:, k] = C[k, :]
-    Ct = acquire!(pool, Float64, P, M)
-    St = acquire!(pool, Float64, P, M)
-    Ct .= cos_basis'
-    St .= sin_basis'
+    # Zero output matrices; we accumulate rank-1 updates (conj(Z[j,:]) ⊗ proj_z)
+    fill!(K_c, 0.0)
+    fill!(G_c, 0.0)
 
-    # Real/imaginary accumulators for P×P projected matrices
-    K_re = zeros(P, P)
-    K_im = zeros(P, P)
-    G_re = zeros(P, P)
-    G_im = zeros(P, P)
-
-    # Per-observer projection vectors (P-length)
-    proj_kc = zeros(P)
-    proj_ks = zeros(P)
-    proj_gc = zeros(P)
-    proj_gs = zeros(P)
+    # Per-observer projection vectors (P-length complex): proj_z = (kernel row) · Z
+    proj_kz = zeros(ComplexF64, P)
+    proj_gz = zeros(ComplexF64, P)
 
     for j in 1:mtheta
         x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j]
 
-        fill!(proj_kc, 0.0)
-        fill!(proj_ks, 0.0)
-        fill!(proj_gc, 0.0)
-        fill!(proj_gs, 0.0)
+        fill!(proj_kz, 0.0)
+        fill!(proj_gz, 0.0)
         diag_accum = 0.0
 
         # ── Simpson integration for nonsingular source points ──
@@ -152,12 +137,10 @@ Memory: O(MP) instead of O(M²).
 
             if populate_greenfunction
                 w_g = G_n * wsimpson
-                BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
-                BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+                BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz)
             end
             w_k = gradG_n * wsimpson
-            BLAS.axpy!(w_k, @view(Ct[:, isrc]), proj_kc)
-            BLAS.axpy!(w_k, @view(St[:, isrc]), proj_ks)
+            BLAS.axpy!(ComplexF64(w_k), @view(Z[isrc, :]), proj_kz)
 
             diag_accum -= gradG_0 * wsimpson
         end
@@ -190,16 +173,14 @@ Memory: O(MP) instead of O(M²).
                     @inbounds for stencil_idx in 1:5
                         w_g = G_n * s[stencil_idx] * wgauss
                         isrc = sing_idx[stencil_idx]
-                        BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
-                        BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+                        BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz)
                     end
                 end
 
                 @inbounds for stencil_idx in 1:5
                     w_k = gradG_n * s[stencil_idx] * wgauss
                     isrc = sing_idx[stencil_idx]
-                    BLAS.axpy!(w_k, @view(Ct[:, isrc]), proj_kc)
-                    BLAS.axpy!(w_k, @view(St[:, isrc]), proj_ks)
+                    BLAS.axpy!(ComplexF64(w_k), @view(Z[isrc, :]), proj_kz)
                 end
 
                 diag_accum -= gradG_0 * wgauss
@@ -211,28 +192,17 @@ Memory: O(MP) instead of O(M²).
             @inbounds for stencil_idx in 1:5
                 w_g = -log_correction_array[stencil_idx] / x_obs
                 isrc = sing_idx[stencil_idx]
-                BLAS.axpy!(w_g, @view(Ct[:, isrc]), proj_gc)
-                BLAS.axpy!(w_g, @view(St[:, isrc]), proj_gs)
+                BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz)
             end
         end
 
         # Fold diagonal accumulation into projection
-        BLAS.axpy!(diag_accum, @view(Ct[:, j]), proj_kc)
-        BLAS.axpy!(diag_accum, @view(St[:, j]), proj_ks)
-
-        # ── Rank-1 accumulate into P×P projection matrices ──
-        # K_c_re += C[j,:] ⊗ proj_kc + S[j,:] ⊗ proj_ks
-        BLAS.ger!(1.0, @view(Ct[:, j]), proj_kc, K_re)
-        BLAS.ger!(1.0, @view(St[:, j]), proj_ks, K_re)
-        # K_c_im += C[j,:] ⊗ proj_ks − S[j,:] ⊗ proj_kc
-        BLAS.ger!(1.0, @view(Ct[:, j]), proj_ks, K_im)
-        BLAS.ger!(-1.0, @view(St[:, j]), proj_kc, K_im)
+        BLAS.axpy!(ComplexF64(diag_accum), @view(Z[j, :]), proj_kz)
 
+        # ── Rank-1 accumulate: K_c += conj(Z[j,:]) ⊗ proj_kz ──
+        BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_kz, K_c)
         if populate_greenfunction
-            BLAS.ger!(1.0, @view(Ct[:, j]), proj_gc, G_re)
-            BLAS.ger!(1.0, @view(St[:, j]), proj_gs, G_re)
-            BLAS.ger!(1.0, @view(Ct[:, j]), proj_gs, G_im)
-            BLAS.ger!(-1.0, @view(St[:, j]), proj_gc, G_im)
+            BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_gz, G_c)
         end
     end
 
@@ -240,26 +210,20 @@ Memory: O(MP) instead of O(M²).
 
     # Normals point out of vacuum for wall but inward for plasma → flip sign for plasma source
     if source isa PlasmaGeometry
-        K_re .*= -1
-        K_im .*= -1
+        K_c .*= -1
     end
 
     # Diagonal residue: K += residue·I  →  K_c += residue·Gram
     # [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
     residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
     if residue != 0.0
-        K_re .+= residue .* real.(Gram)
-        K_im .+= residue .* imag.(Gram)
+        K_c .+= residue .* Gram
     end
 
     # 2π𝒢 → 𝒢
     if populate_greenfunction
-        G_re ./= 2π
-        G_im ./= 2π
+        G_c ./= 2π
     end
-
-    K_c .= complex.(K_re, K_im)
-    G_c .= complex.(G_re, G_im)
 end
 
 
@@ -268,7 +232,7 @@ end
 # ============================================================================
 
 """
-    _projected_kernel_3D!(K_c, G_c, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, cos_basis, sin_basis, Gram)
+    _projected_kernel_3D!(K_c, G_c, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, exp_mn_basis, Gram)
 
 Fused 3D kernel assembly + projection. Mirrors the loop structure of
 `compute_3D_kernel_matrices!` (including multi-threading and BIEST singular correction)
@@ -280,7 +244,7 @@ Each observer writes to its own row of the shared buffers, so there are no
 cross-thread accumulation races — the same write pattern as the original
 `compute_3D_kernel_matrices!`.
 
-Memory: O(4MP + P²) instead of O(M²).
+Memory: O(2MP + P²) instead of O(M²).
 """
 function _projected_kernel_3D!(
     K_c::AbstractMatrix{ComplexF64},
@@ -290,11 +254,11 @@ function _projected_kernel_3D!(
     PATCH_RAD::Int,
     RAD_DIM::Int,
     INTERP_ORDER::Int,
-    cos_basis::Matrix{Float64},
-    sin_basis::Matrix{Float64},
+    exp_mn_basis::AbstractMatrix{ComplexF64},
     Gram::AbstractMatrix{ComplexF64}
 )
-    M, P = size(cos_basis)
+    M, P = size(exp_mn_basis)
+    Z = exp_mn_basis
     num_points = observer.mtheta * observer.nzeta
     dθdζ = 4π^2 / num_points
 
@@ -307,25 +271,15 @@ function _projected_kernel_3D!(
     quad_data = get_singular_quadrature(PATCH_RAD, RAD_DIM, INTERP_ORDER)
     (; PATCH_DIM, ANG_DIM, Ppou, Gpou, P2G) = quad_data
 
-    # Pre-transpose basis for contiguous column access in the inner loop
-    Ct = Matrix(cos_basis')   # [P × M]
-    St = Matrix(sin_basis')   # [P × M]
-
-    # [M × P] buffers for projected kernel rows.
-    # Row idx_obs = Σ_k K[idx_obs, k] · basis[k, :] — each observer writes to
-    # its own row, so no cross-thread races.
-    KZ_c = zeros(M, P)
-    KZ_s = zeros(M, P)
-    GZ_c = zeros(M, P)
-    GZ_s = zeros(M, P)
+    # [M × P] buffers: row idx_obs holds (kernel row idx_obs) · Z
+    KZ = zeros(ComplexF64, M, P)
+    GZ = zeros(ComplexF64, M, P)
 
     # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors)
     max_tid = Threads.maxthreadid()
     workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_tid]
-    proj_kc_all = [zeros(P) for _ in 1:max_tid]
-    proj_ks_all = [zeros(P) for _ in 1:max_tid]
-    proj_gc_all = [zeros(P) for _ in 1:max_tid]
-    proj_gs_all = [zeros(P) for _ in 1:max_tid]
+    proj_kz_all = [zeros(ComplexF64, P) for _ in 1:max_tid]
+    proj_gz_all = [zeros(ComplexF64, P) for _ in 1:max_tid]
 
     Threads.@threads :static for idx_obs in 1:num_points
         tid = Threads.threadid()
@@ -333,15 +287,11 @@ function _projected_kernel_3D!(
         (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar,
             n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws
 
-        proj_kc = proj_kc_all[tid]
-        proj_ks = proj_ks_all[tid]
-        proj_gc = proj_gc_all[tid]
-        proj_gs = proj_gs_all[tid]
+        proj_kz = proj_kz_all[tid]
+        proj_gz = proj_gz_all[tid]
 
-        fill!(proj_kc, 0.0)
-        fill!(proj_ks, 0.0)
-        fill!(proj_gc, 0.0)
-        fill!(proj_gs, 0.0)
+        fill!(proj_kz, 0.0)
+        fill!(proj_gz, 0.0)
 
         i_obs = mod1(idx_obs, observer.mtheta)
         j_obs = (idx_obs - 1) ÷ observer.mtheta + 1
@@ -352,17 +302,11 @@ function _projected_kernel_3D!(
             r_src = @view source.r[idx_src, :]
             n_src = @view source.normal[idx_src, :]
             w_double = laplace_double_layer(r_obs, r_src, n_src) * dθdζ
-            @inbounds @simd for m in 1:P
-                proj_kc[m] += w_double * Ct[m, idx_src]
-                proj_ks[m] += w_double * St[m, idx_src]
-            end
+            BLAS.axpy!(ComplexF64(w_double), @view(Z[idx_src, :]), proj_kz)
 
             if populate_greenfunction
                 w_single = laplace_single_layer(r_obs, r_src) * dθdζ
-                @inbounds @simd for m in 1:P
-                    proj_gc[m] += w_single * Ct[m, idx_src]
-                    proj_gs[m] += w_single * St[m, idx_src]
-                end
+                BLAS.axpy!(ComplexF64(w_single), @view(Z[idx_src, :]), proj_gz)
             end
         end
 
@@ -398,64 +342,34 @@ function _projected_kernel_3D!(
             n_src = @view source.normal[idx_src, :]
             far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[ii, jj] * dθdζ
             w_double = M_grid_double[ii, jj] + far_double
-            @simd for m in 1:P
-                proj_kc[m] += w_double * Ct[m, idx_src]
-                proj_ks[m] += w_double * St[m, idx_src]
-            end
+            BLAS.axpy!(ComplexF64(w_double), @view(Z[idx_src, :]), proj_kz)
 
             if populate_greenfunction
                 far_single = laplace_single_layer(r_obs, r_src) * Gpou[ii, jj] * dθdζ
                 w_single = M_grid_single[ii, jj] + far_single
-                @simd for m in 1:P
-                    proj_gc[m] += w_single * Ct[m, idx_src]
-                    proj_gs[m] += w_single * St[m, idx_src]
-                end
+                BLAS.axpy!(ComplexF64(w_single), @view(Z[idx_src, :]), proj_gz)
             end
         end
 
         # ── Write projected row to buffer (each idx_obs owns its row) ──
-        @inbounds for m in 1:P
-            KZ_c[idx_obs, m] = proj_kc[m]
-            KZ_s[idx_obs, m] = proj_ks[m]
-        end
+        @inbounds KZ[idx_obs, :] .= proj_kz
         if populate_greenfunction
-            @inbounds for m in 1:P
-                GZ_c[idx_obs, m] = proj_gc[m]
-                GZ_s[idx_obs, m] = proj_gs[m]
-            end
+            @inbounds GZ[idx_obs, :] .= proj_gz
         end
     end
 
-    # ── Assemble P×P projected matrices via GEMM (sequential, after barrier) ──
-    # K_c = Z^H K Z = (C'·KZ_c + S'·KZ_s) + i(C'·KZ_s − S'·KZ_c)
-    K_re = zeros(P, P)
-    K_im = zeros(P, P)
-    mul!(K_re, cos_basis', KZ_c)
-    mul!(K_re, sin_basis', KZ_s, 1.0, 1.0)
-    mul!(K_im, cos_basis', KZ_s)
-    mul!(K_im, sin_basis', KZ_c, -1.0, 1.0)
-
-    G_re = zeros(P, P)
-    G_im = zeros(P, P)
+    # ── Assemble P×P projected matrices: K_c = Z^H K Z, G_c = Z^H G Z ──
+    mul!(K_c, Z', KZ)
+    K_c ./= 2π
     if populate_greenfunction
-        mul!(G_re, cos_basis', GZ_c)
-        mul!(G_re, sin_basis', GZ_s, 1.0, 1.0)
-        mul!(G_im, cos_basis', GZ_s)
-        mul!(G_im, sin_basis', GZ_c, -1.0, 1.0)
+        mul!(G_c, Z', GZ)
+        G_c ./= 2π
+    else
+        fill!(G_c, 0.0)
     end
 
-    # ── Post-processing (mirrors compute_3D_kernel_matrices!) ──
-    K_re ./= 2π
-    K_im ./= 2π
-    G_re ./= 2π
-    G_im ./= 2π
-
     # Diagonal: K += I → K_c += Gram [for same-type source/observer]
     if typeof(source) == typeof(observer)
-        K_re .+= real.(Gram)
-        K_im .+= imag.(Gram)
+        K_c .+= Gram
     end
-
-    K_c .= complex.(K_re, K_im)
-    G_c .= complex.(G_re, G_im)
 end
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 2004ffc4..ac804eb3 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -9,7 +9,7 @@ using AdaptiveArrayPools
 
 # Import parent modules
 import ..Equilibrium
-using ..Utilities.FourierTransforms: compute_fourier_coefficients, fourier_transform!, fourier_inverse_transform!
+using ..Utilities.FourierTransforms: compute_fourier_coefficients
 
 include("Utilities.jl")
 include("DataTypes.jl")
@@ -80,8 +80,8 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
 
     # Compute Fourier basis coefficients
     ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing
-    cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν)
-    num_points_surf, num_modes = size(cos_mn_basis)
+    exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν)
+    num_points_surf, num_modes = size(exp_mn_basis)
 
     # Create kernel parameters structs used to dispatch to the correct kernel
     # Hardcode these values for now - can expose to the user in the future
@@ -97,10 +97,15 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     grre = @view grre_in[1:num_points_total, :]
     grri = @view grri_in[1:num_points_total, :]
 
+    # Complex buffer for projecting to mode space (G*Z) and back; grre/grri stay real for backwards compatibility
+    M = num_points_surf
+    P = num_modes
+    temp = zeros!(pool, ComplexF64, M, P)
+
     if wall.nowall && use_galerkin
         # ================================================================
-        # Galerkin: solve in P×P mode space. Uses complex basis Z = C + iS
-        # so projected matrices are P×P complex.
+        # Galerkin: solve system in P×P mode space. Uses complex basis
+        # Z = C + iS so projected matrices are P×P complex.
         #
         # Fused (fuse_projection=true): kernel assembly + Fourier projection
         # in one pass. The full M×M kernel matrices are never materialized —
@@ -113,24 +118,21 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         #
         # FLOPs (both):  O(M²P + P³)
         # ================================================================
-        P = num_modes
-        M = num_points_surf
-
-        # Temporary and projected kernel matrices [P × P complex]
-        exp_mn_basis = zeros!(pool, ComplexF64, M, P)
-        exp_mn_basis .= complex.(cos_mn_basis, sin_mn_basis)
-        Gram = zeros!(pool, ComplexF64, P, P)
+        # Projected kernel matrices [P × P complex]
         grad_green_fourier = zeros!(pool, ComplexF64, P, P)
         green_fourier = zeros!(pool, ComplexF64, P, P)
         grad_green_fourier_int = similar!(pool, grad_green_fourier)
         green_fourier_int = similar!(pool, green_fourier)
-        temp = zeros!(pool, ComplexF64, M, P)
+
+        # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve
+        Gram = zeros!(pool, ComplexF64, P, P)
+        mul!(Gram, exp_mn_basis', exp_mn_basis)
 
         if fuse_projection
             # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
             fused_timing = @timed begin
                 projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams,
-                    cos_mn_basis, sin_mn_basis, Gram)
+                    exp_mn_basis, Gram)
             end
             println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
         else
@@ -142,25 +144,27 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             end
             println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
             # Project the kernels to mode space - Z^H * K * Z and Z^H * G * Z
-            mul!(temp, grad_green, exp_mn_basis)
-            mul!(grad_green_fourier, exp_mn_basis', temp)
-            mul!(temp, green_temp, exp_mn_basis)
-            mul!(green_fourier, exp_mn_basis', temp)
+            proj_timing = @timed begin
+                mul!(temp, grad_green, exp_mn_basis)
+                mul!(grad_green_fourier, exp_mn_basis', temp)
+                mul!(temp, green_temp, exp_mn_basis)
+                mul!(green_fourier, exp_mn_basis', temp)
+            end
+            println(" Project Kernel  TIME=$(round(proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))")
         end
 
         solve_timing = @timed begin
             # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
-            mul!(Gram, exp_mn_basis', exp_mn_basis)
             grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier
             green_fourier_int .= green_fourier
 
-            # Solve projected BIEs for exterior and interior.
+            # Solve projected BIEs for exterior and interior
             F = lu!(grad_green_fourier)
             ldiv!(F, green_fourier)
             F = lu!(grad_green_fourier_int)
             ldiv!(F, green_fourier_int)
 
-            # wv = (4π²/M) · Gram · green_fourier  [Chance 2007 eq. 114]
+            # wv = (4π²/M) · Gram · green_fourier
             wv .= (4π^2 / M) .* (Gram * green_fourier)
 
             # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real.
@@ -178,7 +182,6 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         # Collocation approach: solve full physical-space system [M × M]
         # Handles both no-wall and wall cases.
         # ================================================================
-
         # Full-size kernel matrices
         grad_green = zeros!(pool, num_points_total, num_points_total)
         green_temp = zeros!(pool, num_points_surf, num_points_surf)
@@ -188,12 +191,13 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         end
         println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
 
-        # FT plasma→plasma Green's function (must precede kernel! calls that overwrite green_temp)
-        colloc_ft_timing = @timed begin
-            fourier_transform!(grre, green_temp, cos_mn_basis)
-            fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes)
+        # Project plasma→plasma Green's function to mode space: grre[1:M, 1:2P] = real/imag(G*Z)
+        colloc_proj_timing = @timed begin
+            mul!(temp, green_temp, exp_mn_basis)
+            @view(grre[1:M, 1:P]) .= real.(temp)
+            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
         end
-        println(" Plasma Fourier Transform  TIME=$(round(colloc_ft_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(colloc_ft_timing.bytes))")
+        println(" Plasma Project  TIME=$(round(colloc_proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(colloc_proj_timing.bytes))")
 
         if !wall.nowall
             wall_block_timing = @timed begin
@@ -203,11 +207,12 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
                 kernel!(grad_green, green_temp, wall, wall, kparams)
                 # Wall–Plasma block
                 kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
-                # Fourier transform obs=wall, src=plasma block
-                fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf)
-                fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes)
+                # Project obs=wall, src=plasma block to mode space
+                mul!(temp, green_temp, exp_mn_basis)
+                @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
+                @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
             end
-            println(" Wall Kernel and Fourier Transform  TIME=$(round(wall_block_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(wall_block_timing.bytes))")
+            println(" Wall Kernel and Project  TIME=$(round(wall_block_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(wall_block_timing.bytes))")
         end
 
         # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1)
@@ -230,16 +235,13 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         end
         println(" Invert and Solve  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
 
-        invft_timing = @timed begin
-            # Inverse Fourier transform to extract wv [Chance Phys. Plasmas 2007 052506 eq. 115-118]
-            arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4)
-            fourier_inverse_transform!(arr, grre, cos_mn_basis)
-            fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes)
-            fourier_inverse_transform!(ari, grre, sin_mn_basis)
-            fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes)
-            wv .= complex.(arr .+ aii, air .- ari)
+        wv_timing = @timed begin
+            # wv = (4π²/M) · Z^H · grre_complex  [Chance Phys. Plasmas 2007 052506 eq. 115-118]
+            temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)]))
+            mul!(wv, exp_mn_basis', temp)
+            wv .*= (4π^2 / M)
         end
-        println(" Compute Wv  TIME=$(round(invft_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(invft_timing.bytes))")
+        println(" Compute Wv  TIME=$(round(wv_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(wv_timing.bytes))")
     end
 
     inputs.force_wv_symmetry && hermitianpart!(wv)
diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl
index d51bd622..bba2ff61 100644
--- a/test/runtests_vacuum.jl
+++ b/test/runtests_vacuum.jl
@@ -445,6 +445,82 @@
                 @test size(plasma_pts) == (16, 3)
             end
         end
+
+        # -------------------------------------------------------------------------
+        @testset "fused vs two-step Galerkin (2D, nowall)" begin
+            # Small case where both Galerkin paths are cheap: compare K_c, G_c
+            # assembled via the full M×M kernel + projection against the fused
+            # projected kernels from the unified `kernel!` API.
+            inputs = VacuumInput(
+                mtheta_in=17,
+                nzeta_in=1,
+                x=collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=17))),
+                z=collect(0.3 .* sin.(range(0, 2π, length=17))),
+                ν=zeros(17),
+                mlow=1,
+                mpert=2,
+                nlow=1,
+                npert=1,
+                nzeta=1,
+                mtheta=32
+            )
+            wall_settings = WallShapeSettings(shape="nowall")
+
+            plasma_surf = GeneralizedPerturbedEquilibrium.Vacuum.PlasmaGeometry(inputs)
+            kparams = GeneralizedPerturbedEquilibrium.Vacuum.KernelParams2D(inputs.nlow)
+
+            # Fourier basis on the surface grid
+            exp_mn_basis = GeneralizedPerturbedEquilibrium.Utilities.FourierTransforms.compute_fourier_coefficients(
+                inputs.mtheta,
+                inputs.mpert,
+                inputs.mlow,
+                inputs.nzeta,
+                inputs.npert,
+                inputs.nlow;
+                n_2D=inputs.nlow,
+                ν=plasma_surf.ν
+            )
+            M, P = size(exp_mn_basis)
+            Gram = exp_mn_basis' * exp_mn_basis
+
+            # --- Two-step Galerkin: materialize full kernels then project ---
+            grad_green_full = zeros(Float64, 2M, 2M)
+            green_full = zeros(Float64, M, M)
+            GeneralizedPerturbedEquilibrium.Vacuum.kernel!(
+                grad_green_full,
+                green_full,
+                plasma_surf,
+                plasma_surf,
+                kparams
+            )
+
+            # Exterior projected kernels from full matrices: K_c = Z^H K Z, G_c = Z^H G Z
+            K_c_two = zeros(ComplexF64, P, P)
+            G_c_two = zeros(ComplexF64, P, P)
+            tmp = zeros(ComplexF64, M, P)
+
+            grad_pp = @view grad_green_full[1:M, 1:M]
+            mul!(tmp, grad_pp, exp_mn_basis)
+            mul!(K_c_two, exp_mn_basis', tmp)
+            mul!(tmp, green_full, exp_mn_basis)
+            mul!(G_c_two, exp_mn_basis', tmp)
+
+            # --- Fused Galerkin via unified kernel! ---
+            K_c_fused = zeros(ComplexF64, P, P)
+            G_c_fused = zeros(ComplexF64, P, P)
+            GeneralizedPerturbedEquilibrium.Vacuum.projected_kernel!(
+                K_c_fused,
+                G_c_fused,
+                plasma_surf,
+                plasma_surf,
+                kparams,
+                exp_mn_basis=exp_mn_basis,
+                Gram=Gram
+            )
+
+            @test isapprox(K_c_fused, K_c_two; rtol=1e-10, atol=1e-12)
+            @test isapprox(G_c_fused, G_c_two; rtol=1e-10, atol=1e-12)
+        end
     end
 
     # -------------------------------------------------------------------------

From c29d41692d0562f2aa9810d6731915d73191e30c Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Sat, 14 Mar 2026 18:55:59 -0400
Subject: [PATCH 07/23] VACUUM - WIP - wall implementation of the galerkin
 method (working for solovev already)

---
 src/Vacuum/DataTypes.jl |  5 ++-
 src/Vacuum/Vacuum.jl    | 87 ++++++++++++++++++++++++++++++++++++++++-
 test/runtests_vacuum.jl | 81 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 168 insertions(+), 5 deletions(-)

diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl
index 10c1f8bd..e8d4fad4 100644
--- a/src/Vacuum/DataTypes.jl
+++ b/src/Vacuum/DataTypes.jl
@@ -23,8 +23,9 @@ nzeta > 1 for 3D vacuum calculation.
   - `nzeta::Int`: Number of vacuum calculation toroidal grid points (1 for 2D vacuum calculation, > 1 for 3D vacuum calculation)
   - `force_wv_symmetry::Bool`: Boolean flag to enforce symmetry in the vacuum response matrix
   - `use_galerkin::Bool`: Use Galerkin projection to solve in truncated Fourier space [O(P³)]
-    instead of full collocation [O(M³)]. Only applies to the no-wall case; wall cases always
-    use collocation. Defaults to `false`.
+    instead of full collocation [O(M³)]. Applies to both no-wall and wall cases. For the wall
+    case, both plasma and wall unknowns are represented in (m,n) mode space, yielding a 2P×2P
+    system with no M² storage. Defaults to `false`.
   - `fuse_projection::Bool`: When combined with `use_galerkin`, fuse the kernel assembly with
     the Fourier projection so that the full M×M kernel matrices are never materialized.
     Reduces memory from O(M²) to O(MP). Requires `use_galerkin = true`. Defaults to `false`.
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index ac804eb3..d7a41f4b 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -158,7 +158,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier
             green_fourier_int .= green_fourier
 
-            # Solve projected BIEs for exterior and interior
+            # Solve projected BIEs for exterior and interior kernels
             F = lu!(grad_green_fourier)
             ldiv!(F, green_fourier)
             F = lu!(grad_green_fourier_int)
@@ -168,6 +168,8 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             wv .= (4π^2 / M) .* (Gram * green_fourier)
 
             # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real.
+            # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
+            # perhaps make it a complex P * P matrix? Then don't need any of this section
             mul!(temp, exp_mn_basis, green_fourier)
             @view(grre[1:M, 1:P]) .= real.(temp)
             @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
@@ -177,6 +179,89 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         end
         println(" Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
 
+    elseif !wall.nowall && use_galerkin
+        # ================================================================
+        # Wall Galerkin: both plasma and wall unknowns in (m,n) mode space.
+        # Builds four P×P projected kernel blocks (pp, pw, wp, ww) via
+        # projected_kernel!, assembles a 2P×2P system, and solves directly.
+        # Same exp_mn_basis and Gram for all blocks (same angular grid).
+        # Memory: O(MP + P²), no M² or (2M)² storage.
+        # ================================================================
+
+        Gram = zeros!(pool, ComplexF64, P, P)
+        mul!(Gram, exp_mn_basis', exp_mn_basis)
+
+        # Four projected kernel blocks [P × P complex each]
+        K_pp_c = zeros!(pool, ComplexF64, P, P)
+        G_pp_c = zeros!(pool, ComplexF64, P, P)
+        K_pw_c = zeros!(pool, ComplexF64, P, P)
+        G_pw_c = zeros!(pool, ComplexF64, P, P)
+        K_wp_c = zeros!(pool, ComplexF64, P, P)
+        G_wp_c = zeros!(pool, ComplexF64, P, P)
+        K_ww_c = zeros!(pool, ComplexF64, P, P)
+        G_ww_c = zeros!(pool, ComplexF64, P, P)
+
+        kernel_timing = @timed begin
+            projected_kernel!(K_pp_c, G_pp_c, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
+            projected_kernel!(K_pw_c, G_pw_c, plasma_surf, wall, kparams, exp_mn_basis, Gram)
+            projected_kernel!(K_wp_c, G_wp_c, wall, plasma_surf, kparams, exp_mn_basis, Gram)
+            projected_kernel!(K_ww_c, G_ww_c, wall, wall, kparams, exp_mn_basis, Gram)
+        end
+        println(" Wall Galerkin Projected Kernels  TIME=$(round(kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))")
+
+        solve_timing = @timed begin
+            # Assemble 2P×2P exterior system and interior system (before LU overwrites)
+            K_ext = zeros!(pool, ComplexF64, 2P, 2P)
+            K_ext[1:P, 1:P] .= K_pp_c
+            K_ext[1:P, (P+1):(2P)] .= K_pw_c
+            K_ext[(P+1):(2P), 1:P] .= K_wp_c
+            K_ext[(P+1):(2P), (P+1):(2P)] .= K_ww_c
+
+            K_int = zeros!(pool, ComplexF64, 2P, 2P)
+            K_int[1:P, 1:P] .= 2 .* Gram .- K_pp_c
+            K_int[1:P, (P+1):(2P)] .= .-K_pw_c
+            K_int[(P+1):(2P), 1:P] .= .-K_wp_c
+            K_int[(P+1):(2P), (P+1):(2P)] .= 2 .* Gram .- K_ww_c
+
+            # RHS [2P × P]: single-layer blocks (only plasma-source blocks are nonzero)
+            G_rhs_ext = zeros!(pool, ComplexF64, 2P, P)
+            G_rhs_ext[1:P, :] .= G_pp_c
+            G_rhs_ext[(P+1):(2P), :] .= G_wp_c
+
+            G_rhs_int = similar!(pool, G_rhs_ext)
+            G_rhs_int .= G_rhs_ext
+
+            # Exterior solve: K_ext * C_ext = G_rhs_ext
+            F_ext = lu!(K_ext)
+            ldiv!(F_ext, G_rhs_ext)
+            c_p_ext = @view G_rhs_ext[1:P, :]
+            c_w_ext = @view G_rhs_ext[(P+1):(2P), :]
+
+            # Interior solve: K_int * C_int = G_rhs_int
+            F_int = lu!(K_int)
+            ldiv!(F_int, G_rhs_int)
+            c_p_int = @view G_rhs_int[1:P, :]
+            c_w_int = @view G_rhs_int[(P+1):(2P), :]
+
+            # wv = (4π²/M) · Gram · c_p_ext (plasma observer only)
+            wv .= (4π^2 / M) .* (Gram * c_p_ext)
+
+            # Backward-compatible reconstruction: grre/grri in M×2P real layout
+            mul!(temp, exp_mn_basis, c_p_ext)
+            @view(grre[1:M, 1:P]) .= real.(temp)
+            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
+            mul!(temp, exp_mn_basis, c_p_int)
+            @view(grri[1:M, 1:P]) .= real.(temp)
+            @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
+            mul!(temp, exp_mn_basis, c_w_ext)
+            @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
+            @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+            mul!(temp, exp_mn_basis, c_w_int)
+            @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
+            @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+        end
+        println(" Wall Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
+
     else
         # ================================================================
         # Collocation approach: solve full physical-space system [M × M]
diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl
index bba2ff61..472c6c68 100644
--- a/test/runtests_vacuum.jl
+++ b/test/runtests_vacuum.jl
@@ -514,13 +514,51 @@
                 plasma_surf,
                 plasma_surf,
                 kparams,
-                exp_mn_basis=exp_mn_basis,
-                Gram=Gram
+                exp_mn_basis,
+                Gram
             )
 
             @test isapprox(K_c_fused, K_c_two; rtol=1e-10, atol=1e-12)
             @test isapprox(G_c_fused, G_c_two; rtol=1e-10, atol=1e-12)
         end
+
+        # -------------------------------------------------------------------------
+        @testset "wall Galerkin vs collocation (2D, conformal)" begin
+            mtheta_eq = 17
+            mtheta = 128
+            mpert = 3
+            boundary_x = collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=mtheta_eq)))
+            boundary_z = collect(0.3 .* sin.(range(0, 2π, length=mtheta_eq)))
+
+            inputs_colloc = VacuumInput(
+                mtheta_in=mtheta_eq, nzeta_in=1,
+                x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq),
+                mlow=1, mpert=mpert, nlow=1, npert=1,
+                nzeta=1, mtheta=mtheta,
+                use_galerkin=false
+            )
+            inputs_galerkin = VacuumInput(
+                mtheta_in=mtheta_eq, nzeta_in=1,
+                x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq),
+                mlow=1, mpert=mpert, nlow=1, npert=1,
+                nzeta=1, mtheta=mtheta,
+                use_galerkin=true
+            )
+
+            wall_settings = WallShapeSettings(shape="conformal", a=0.5)
+
+            wv_c, grri_c, grre_c, _, _ = compute_vacuum_response(inputs_colloc, wall_settings)
+            wv_g, grri_g, grre_g, _, _ = compute_vacuum_response(inputs_galerkin, wall_settings)
+
+            M = mtheta
+            P = mpert
+
+            @test isapprox(wv_g, wv_c; rtol=1e-8)
+            @test isapprox(grre_g[1:M, 1:(2*P)], grre_c[1:M, 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grri_g[1:M, 1:(2*P)], grri_c[1:M, 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grre_g[(M+1):(2*M), 1:(2*P)], grre_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grri_g[(M+1):(2*M), 1:(2*P)], grri_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8)
+        end
     end
 
     # -------------------------------------------------------------------------
@@ -707,6 +745,45 @@
             @test isapprox(wv, wv', rtol=1e-12)
         end
 
+        @testset "wall Galerkin vs collocation (3D, conformal)" begin
+            mtheta_eq = 17
+            mtheta = 32
+            nzeta = 32
+            mpert = 2
+            npert = 2
+            boundary_x = collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=mtheta_eq)))
+            boundary_z = collect(0.3 .* sin.(range(0, 2π, length=mtheta_eq)))
+
+            inputs_colloc = VacuumInput(
+                mtheta_in=mtheta_eq, nzeta_in=1,
+                x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq),
+                mlow=1, mpert=mpert, nlow=0, npert=npert,
+                nzeta=nzeta, mtheta=mtheta,
+                use_galerkin=false
+            )
+            inputs_galerkin = VacuumInput(
+                mtheta_in=mtheta_eq, nzeta_in=1,
+                x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq),
+                mlow=1, mpert=mpert, nlow=0, npert=npert,
+                nzeta=nzeta, mtheta=mtheta,
+                use_galerkin=true
+            )
+
+            wall_settings = WallShapeSettings(shape="conformal", a=0.3)
+
+            wv_c, grri_c, grre_c, _, _ = compute_vacuum_response(inputs_colloc, wall_settings)
+            wv_g, grri_g, grre_g, _, _ = compute_vacuum_response(inputs_galerkin, wall_settings)
+
+            M = mtheta * nzeta
+            P = mpert * npert
+
+            @test isapprox(wv_g, wv_c; rtol=1e-8)
+            @test isapprox(grre_g[1:M, 1:(2*P)], grre_c[1:M, 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grri_g[1:M, 1:(2*P)], grri_c[1:M, 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grre_g[(M+1):(2*M), 1:(2*P)], grre_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grri_g[(M+1):(2*M), 1:(2*P)], grri_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8)
+        end
+
         @testset "Kernel3D laplace_single_layer" begin
             x_obs = [1.0, 0.0, 0.0]
             x_src = [2.0, 0.0, 0.0]

From 98851de232fd44a6da1baa6f6d4d95092cc97872 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Mon, 16 Mar 2026 08:15:08 -0400
Subject: [PATCH 08/23] VACUUM - WIP - renaming matrices

---
 src/Vacuum/Vacuum.jl | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index d7a41f4b..c65f3774 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -119,10 +119,10 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         # FLOPs (both):  O(M²P + P³)
         # ================================================================
         # Projected kernel matrices [P × P complex]
-        grad_green_fourier = zeros!(pool, ComplexF64, P, P)
-        green_fourier = zeros!(pool, ComplexF64, P, P)
-        grad_green_fourier_int = similar!(pool, grad_green_fourier)
-        green_fourier_int = similar!(pool, green_fourier)
+        K_ext = zeros!(pool, ComplexF64, P, P)
+        G_ext = zeros!(pool, ComplexF64, P, P)
+        K_int = similar!(pool, K_ext)
+        G_int = similar!(pool, G_ext)
 
         # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve
         Gram = zeros!(pool, ComplexF64, P, P)
@@ -131,38 +131,37 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         if fuse_projection
             # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
             fused_timing = @timed begin
-                projected_kernel!(grad_green_fourier, green_fourier, plasma_surf, plasma_surf, kparams,
-                    exp_mn_basis, Gram)
+                projected_kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
             end
             println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
         else
             # Full-size kernel matrices, then project to mode space
-            grad_green = zeros!(pool, num_points_total, num_points_total)
-            green_temp = zeros!(pool, num_points_surf, num_points_surf)
+            K_ext_temp = zeros!(pool, num_points_total, num_points_total)
+            G_ext_temp = zeros!(pool, num_points_surf, num_points_surf)
             pp_kernel_timing = @timed begin
-                kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
+                kernel!(K_ext_temp, G_ext_temp, plasma_surf, plasma_surf, kparams)
             end
             println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
             # Project the kernels to mode space - Z^H * K * Z and Z^H * G * Z
             proj_timing = @timed begin
-                mul!(temp, grad_green, exp_mn_basis)
-                mul!(grad_green_fourier, exp_mn_basis', temp)
-                mul!(temp, green_temp, exp_mn_basis)
-                mul!(green_fourier, exp_mn_basis', temp)
+                mul!(temp, K_ext_temp, exp_mn_basis)
+                mul!(K_ext, exp_mn_basis', temp)
+                mul!(temp, G_ext_temp, exp_mn_basis)
+                mul!(G_ext, exp_mn_basis', temp)
             end
             println(" Project Kernel  TIME=$(round(proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))")
         end
 
         solve_timing = @timed begin
             # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
-            grad_green_fourier_int .= 2 .* Gram .- grad_green_fourier
-            green_fourier_int .= green_fourier
+            K_int .= 2 .* Gram .- K_ext
+            G_int .= G_ext
 
             # Solve projected BIEs for exterior and interior kernels
-            F = lu!(grad_green_fourier)
-            ldiv!(F, green_fourier)
-            F = lu!(grad_green_fourier_int)
-            ldiv!(F, green_fourier_int)
+            F = lu!(K_ext)
+            ldiv!(F, G_ext)
+            F = lu!(K_int)
+            ldiv!(F, G_int)
 
             # wv = (4π²/M) · Gram · green_fourier
             wv .= (4π^2 / M) .* (Gram * green_fourier)
@@ -223,7 +222,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             K_int[(P+1):(2P), 1:P] .= .-K_wp_c
             K_int[(P+1):(2P), (P+1):(2P)] .= 2 .* Gram .- K_ww_c
 
-            # RHS [2P × P]: single-layer blocks (only plasma-source blocks are nonzero)
+            # RHS [2P × P]: single-layer blocks with plasma as source
             G_rhs_ext = zeros!(pool, ComplexF64, 2P, P)
             G_rhs_ext[1:P, :] .= G_pp_c
             G_rhs_ext[(P+1):(2P), :] .= G_wp_c
@@ -247,6 +246,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             wv .= (4π^2 / M) .* (Gram * c_p_ext)
 
             # Backward-compatible reconstruction: grre/grri in M×2P real layout
+            # Need to convert mode space to physical space and unpack the real and imaginary parts
             mul!(temp, exp_mn_basis, c_p_ext)
             @view(grre[1:M, 1:P]) .= real.(temp)
             @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)

From d872ce9cdbb0dfdc88c9b9a7a2b6fe257add8f33 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Mon, 16 Mar 2026 08:20:35 -0400
Subject: [PATCH 09/23] VACUUM - WIP - removing the non-fused galerkin method

---
 src/Vacuum/DataTypes.jl | 10 ++--------
 src/Vacuum/Vacuum.jl    | 39 +++++++++------------------------------
 2 files changed, 11 insertions(+), 38 deletions(-)

diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl
index e8d4fad4..5522c37b 100644
--- a/src/Vacuum/DataTypes.jl
+++ b/src/Vacuum/DataTypes.jl
@@ -26,9 +26,6 @@ nzeta > 1 for 3D vacuum calculation.
     instead of full collocation [O(M³)]. Applies to both no-wall and wall cases. For the wall
     case, both plasma and wall unknowns are represented in (m,n) mode space, yielding a 2P×2P
     system with no M² storage. Defaults to `false`.
-  - `fuse_projection::Bool`: When combined with `use_galerkin`, fuse the kernel assembly with
-    the Fourier projection so that the full M×M kernel matrices are never materialized.
-    Reduces memory from O(M²) to O(MP). Requires `use_galerkin = true`. Defaults to `false`.
 """
 @kwdef struct VacuumInput
     x::Vector{Float64} = Float64[]
@@ -45,7 +42,6 @@ nzeta > 1 for 3D vacuum calculation.
     nzeta::Int = 1
     force_wv_symmetry::Bool = true
     use_galerkin::Bool = false
-    fuse_projection::Bool = false
 end
 
 """
@@ -86,8 +82,7 @@ function VacuumInput(
     npert::Int,
     nlow::Int;
     force_wv_symmetry::Bool=true,
-    use_galerkin::Bool=false,
-    fuse_projection::Bool=false
+    use_galerkin::Bool=false
 )
     # Extract plasma surface geometry at this psi
     r, z, ν = extract_plasma_surface_at_psi(equil, ψ)
@@ -104,8 +99,7 @@ function VacuumInput(
         mtheta=mtheta,
         nzeta=nzeta,
         force_wv_symmetry=force_wv_symmetry,
-        use_galerkin=true,
-        fuse_projection=true
+        use_galerkin=true
     )
 end
 
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index c65f3774..a50cfe6b 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -72,7 +72,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     n_override::Union{Nothing,Int}=nothing
 )
 
-    (; mtheta, mpert, mlow, nzeta, npert, nlow, use_galerkin, fuse_projection) = inputs
+    (; mtheta, mpert, mlow, nzeta, npert, nlow, use_galerkin) = inputs
 
     # Initialize surface geometries
     plasma_surf = nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs)
@@ -113,10 +113,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         # accumulated row by row as kernel values are computed.
         # Memory:  O(MP + P²)  instead of  O(M²)
         #
-        # Two-step (fuse_projection=false): full M×M kernel → project → solve.
-        # Memory:  O(M²) for kernel storage
-        #
-        # FLOPs (both):  O(M²P + P³)
+        # FLOPs:  O(M²P + P³)
         # ================================================================
         # Projected kernel matrices [P × P complex]
         K_ext = zeros!(pool, ComplexF64, P, P)
@@ -128,29 +125,11 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         Gram = zeros!(pool, ComplexF64, P, P)
         mul!(Gram, exp_mn_basis', exp_mn_basis)
 
-        if fuse_projection
-            # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
-            fused_timing = @timed begin
-                projected_kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
-            end
-            println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
-        else
-            # Full-size kernel matrices, then project to mode space
-            K_ext_temp = zeros!(pool, num_points_total, num_points_total)
-            G_ext_temp = zeros!(pool, num_points_surf, num_points_surf)
-            pp_kernel_timing = @timed begin
-                kernel!(K_ext_temp, G_ext_temp, plasma_surf, plasma_surf, kparams)
-            end
-            println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
-            # Project the kernels to mode space - Z^H * K * Z and Z^H * G * Z
-            proj_timing = @timed begin
-                mul!(temp, K_ext_temp, exp_mn_basis)
-                mul!(K_ext, exp_mn_basis', temp)
-                mul!(temp, G_ext_temp, exp_mn_basis)
-                mul!(G_ext, exp_mn_basis', temp)
-            end
-            println(" Project Kernel  TIME=$(round(proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(proj_timing.bytes))")
+        # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
+        fused_timing = @timed begin
+            projected_kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
         end
+        println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
 
         solve_timing = @timed begin
             # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
@@ -164,15 +143,15 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
             ldiv!(F, G_int)
 
             # wv = (4π²/M) · Gram · green_fourier
-            wv .= (4π^2 / M) .* (Gram * green_fourier)
+            wv .= (4π^2 / M) .* (Gram * G_ext)
 
             # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real.
             # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
             # perhaps make it a complex P * P matrix? Then don't need any of this section
-            mul!(temp, exp_mn_basis, green_fourier)
+            mul!(temp, exp_mn_basis, G_ext)
             @view(grre[1:M, 1:P]) .= real.(temp)
             @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-            mul!(temp, exp_mn_basis, green_fourier_int)
+            mul!(temp, exp_mn_basis, G_int)
             @view(grri[1:M, 1:P]) .= real.(temp)
             @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
         end

From eb0e5f79ee8e617ccd03d2634aca71edb919df88 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Mon, 16 Mar 2026 08:53:34 -0400
Subject: [PATCH 10/23] temp - Solovev 3D example: raise mthvac, nzvac to 128

---
 examples/Solovev_ideal_example_3D/gpec.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/Solovev_ideal_example_3D/gpec.toml b/examples/Solovev_ideal_example_3D/gpec.toml
index d9e8b663..3dd466a0 100644
--- a/examples/Solovev_ideal_example_3D/gpec.toml
+++ b/examples/Solovev_ideal_example_3D/gpec.toml
@@ -32,8 +32,8 @@ nn_high = 1                   # Largest toroidal mode number to include
 delta_mlow = 8                # Expands lower bound of Fourier harmonics
 delta_mhigh = 8               # Expands upper bound of Fourier harmonics
 delta_mband = 0               # Integration keeps only this wide a band...
-mthvac = 96                  # Number of points used in splines over poloidal angle at plasma-vacuum interface.
-nzvac = 64
+mthvac = 128                  # Number of points used in splines over poloidal angle at plasma-vacuum interface.
+nzvac = 128
 thmax0 = 1                    # Linear multiplier on the automatic choice of theta integration bounds
 
 kin_flag = false              # Kinetic EL equation (default: false)

From afbc93397ad3d0a787cea580466ceda2f5d5e29e Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Mon, 16 Mar 2026 10:49:31 -0400
Subject: [PATCH 11/23] VACUUM - WIP - modifying kernels to take views of
 larger K and G matrices

---
 src/Vacuum/ProjectedKernel.jl |  49 +++----
 src/Vacuum/Vacuum.jl          | 240 +++++++++++++++-------------------
 2 files changed, 134 insertions(+), 155 deletions(-)

diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl
index 06f8b3f2..ac8e526f 100644
--- a/src/Vacuum/ProjectedKernel.jl
+++ b/src/Vacuum/ProjectedKernel.jl
@@ -19,7 +19,7 @@
 # 2D fused projected kernel
 # ============================================================================
 """
-    projected_kernel!(K_c, G_c, observer, source, params, exp_mn_basis, Gram)
+    kernel!(K_c, G_c, observer, source, params, exp_mn_basis, Gram)
 
 Compute the Fourier-projected kernel matrices K_c = Z^H K Z and G_c = Z^H G Z
 directly, without materializing the full M×M kernel matrices.
@@ -36,9 +36,7 @@ Dispatches to the 2D or 3D implementation based on the geometry/params types.
   - `exp_mn_basis::Matrix{ComplexF64}`: [M × P] complex Fourier basis Z = exp(i(mθ − nζ))
   - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term)
 """
-function projected_kernel! end
-
-function projected_kernel!(
+function kernel!(
     K_c::AbstractMatrix{ComplexF64},
     G_c::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
@@ -50,7 +48,7 @@ function projected_kernel!(
     _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram)
 end
 
-function projected_kernel!(
+function kernel!(
     K_c::AbstractMatrix{ComplexF64},
     G_c::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry3D,WallGeometry3D},
@@ -87,6 +85,12 @@ Memory: O(MP) instead of O(M²).
     dtheta = 2π / mtheta
     theta_grid = range(; start=0, length=mtheta, step=dtheta)
 
+    # Take a view of the corresponding block of the K_c and G_c matrices
+    col_idx = (source isa PlasmaGeometry ? 1 : 2)
+    row_idx = (observer isa PlasmaGeometry ? 1 : 2)
+    K_c_block = view(K_c, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P))
+    G_c_block = view(G_c, ((row_idx-1)*P+1):(row_idx*P), :)
+
     populate_greenfunction = source isa PlasmaGeometry
 
     # S₁ᵢ logarithmic correction factors [Chance Phys. Plasmas 1997 2161 eq. 78]
@@ -110,13 +114,9 @@ Memory: O(MP) instead of O(M²).
     d1_spline_x(dx_dtheta_grid, theta_grid)
     d1_spline_z(dz_dtheta_grid, theta_grid)
 
-    # Zero output matrices; we accumulate rank-1 updates (conj(Z[j,:]) ⊗ proj_z)
-    fill!(K_c, 0.0)
-    fill!(G_c, 0.0)
-
     # Per-observer projection vectors (P-length complex): proj_z = (kernel row) · Z
-    proj_kz = zeros(ComplexF64, P)
-    proj_gz = zeros(ComplexF64, P)
+    proj_kz = zeros!(pool, ComplexF64, P)
+    proj_gz = zeros!(pool, ComplexF64, P)
 
     for j in 1:mtheta
         x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j]
@@ -200,9 +200,9 @@ Memory: O(MP) instead of O(M²).
         BLAS.axpy!(ComplexF64(diag_accum), @view(Z[j, :]), proj_kz)
 
         # ── Rank-1 accumulate: K_c += conj(Z[j,:]) ⊗ proj_kz ──
-        BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_kz, K_c)
+        BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_kz, K_c_block)
         if populate_greenfunction
-            BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_gz, G_c)
+            BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_gz, G_c_block)
         end
     end
 
@@ -210,19 +210,19 @@ Memory: O(MP) instead of O(M²).
 
     # Normals point out of vacuum for wall but inward for plasma → flip sign for plasma source
     if source isa PlasmaGeometry
-        K_c .*= -1
+        K_c_block .*= -1
     end
 
     # Diagonal residue: K += residue·I  →  K_c += residue·Gram
     # [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
     residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
     if residue != 0.0
-        K_c .+= residue .* Gram
+        K_c_block .+= residue .* Gram
     end
 
     # 2π𝒢 → 𝒢
     if populate_greenfunction
-        G_c ./= 2π
+        G_c_block ./= 2π
     end
 end
 
@@ -262,6 +262,11 @@ function _projected_kernel_3D!(
     num_points = observer.mtheta * observer.nzeta
     dθdζ = 4π^2 / num_points
 
+    # Take a view of the corresponding block of the K_c and G_c matrices
+    col_idx = (source isa PlasmaGeometry3D ? 1 : 2)
+    row_idx = (observer isa PlasmaGeometry3D ? 1 : 2)
+    K_c_block = view(K_c, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P))
+    G_c_block = view(G_c, ((row_idx-1)*P+1):(row_idx*P), :)
     populate_greenfunction = source isa PlasmaGeometry3D
 
     if PATCH_RAD > (min(source.mtheta, source.nzeta) - 1) ÷ 2
@@ -359,17 +364,15 @@ function _projected_kernel_3D!(
     end
 
     # ── Assemble P×P projected matrices: K_c = Z^H K Z, G_c = Z^H G Z ──
-    mul!(K_c, Z', KZ)
-    K_c ./= 2π
+    mul!(K_c_block, Z', KZ)
+    K_c_block ./= 2π
     if populate_greenfunction
-        mul!(G_c, Z', GZ)
-        G_c ./= 2π
-    else
-        fill!(G_c, 0.0)
+        mul!(G_c_block, Z', GZ)
+        G_c_block ./= 2π
     end
 
     # Diagonal: K += I → K_c += Gram [for same-type source/observer]
     if typeof(source) == typeof(observer)
-        K_c .+= Gram
+        K_c_block .+= Gram
     end
 end
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index a50cfe6b..e603fe1c 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -102,144 +102,120 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     P = num_modes
     temp = zeros!(pool, ComplexF64, M, P)
 
-    if wall.nowall && use_galerkin
-        # ================================================================
-        # Galerkin: solve system in P×P mode space. Uses complex basis
-        # Z = C + iS so projected matrices are P×P complex.
-        #
-        # Fused (fuse_projection=true): kernel assembly + Fourier projection
-        # in one pass. The full M×M kernel matrices are never materialized —
-        # instead the P×P projected matrices grad_green_fourier and G_c are
-        # accumulated row by row as kernel values are computed.
-        # Memory:  O(MP + P²)  instead of  O(M²)
-        #
-        # FLOPs:  O(M²P + P³)
-        # ================================================================
-        # Projected kernel matrices [P × P complex]
-        K_ext = zeros!(pool, ComplexF64, P, P)
-        G_ext = zeros!(pool, ComplexF64, P, P)
-        K_int = similar!(pool, K_ext)
-        G_int = similar!(pool, G_ext)
-
+    if use_galerkin
         # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve
         Gram = zeros!(pool, ComplexF64, P, P)
         mul!(Gram, exp_mn_basis', exp_mn_basis)
 
-        # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
-        fused_timing = @timed begin
-            projected_kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
-        end
-        println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
-
-        solve_timing = @timed begin
-            # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
-            K_int .= 2 .* Gram .- K_ext
-            G_int .= G_ext
-
-            # Solve projected BIEs for exterior and interior kernels
-            F = lu!(K_ext)
-            ldiv!(F, G_ext)
-            F = lu!(K_int)
-            ldiv!(F, G_int)
-
-            # wv = (4π²/M) · Gram · green_fourier
-            wv .= (4π^2 / M) .* (Gram * G_ext)
-
-            # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real.
-            # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
-            # perhaps make it a complex P * P matrix? Then don't need any of this section
-            mul!(temp, exp_mn_basis, G_ext)
-            @view(grre[1:M, 1:P]) .= real.(temp)
-            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-            mul!(temp, exp_mn_basis, G_int)
-            @view(grri[1:M, 1:P]) .= real.(temp)
-            @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
-        end
-        println(" Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
-
-    elseif !wall.nowall && use_galerkin
-        # ================================================================
-        # Wall Galerkin: both plasma and wall unknowns in (m,n) mode space.
-        # Builds four P×P projected kernel blocks (pp, pw, wp, ww) via
-        # projected_kernel!, assembles a 2P×2P system, and solves directly.
-        # Same exp_mn_basis and Gram for all blocks (same angular grid).
-        # Memory: O(MP + P²), no M² or (2M)² storage.
-        # ================================================================
-
-        Gram = zeros!(pool, ComplexF64, P, P)
-        mul!(Gram, exp_mn_basis', exp_mn_basis)
-
-        # Four projected kernel blocks [P × P complex each]
-        K_pp_c = zeros!(pool, ComplexF64, P, P)
-        G_pp_c = zeros!(pool, ComplexF64, P, P)
-        K_pw_c = zeros!(pool, ComplexF64, P, P)
-        G_pw_c = zeros!(pool, ComplexF64, P, P)
-        K_wp_c = zeros!(pool, ComplexF64, P, P)
-        G_wp_c = zeros!(pool, ComplexF64, P, P)
-        K_ww_c = zeros!(pool, ComplexF64, P, P)
-        G_ww_c = zeros!(pool, ComplexF64, P, P)
-
-        kernel_timing = @timed begin
-            projected_kernel!(K_pp_c, G_pp_c, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
-            projected_kernel!(K_pw_c, G_pw_c, plasma_surf, wall, kparams, exp_mn_basis, Gram)
-            projected_kernel!(K_wp_c, G_wp_c, wall, plasma_surf, kparams, exp_mn_basis, Gram)
-            projected_kernel!(K_ww_c, G_ww_c, wall, wall, kparams, exp_mn_basis, Gram)
-        end
-        println(" Wall Galerkin Projected Kernels  TIME=$(round(kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))")
-
-        solve_timing = @timed begin
-            # Assemble 2P×2P exterior system and interior system (before LU overwrites)
+        if wall.nowall
+            # ================================================================
+            # Galerkin (no wall): solve system in P×P mode space. Uses complex basis
+            # Z = C + iS so projected matrices are P×P complex.
+            #
+            # Fused (fuse_projection=true): kernel assembly + Fourier projection
+            # in one pass. The full M×M kernel matrices are never materialized —
+            # instead the P×P projected matrices grad_green_fourier and G_c are
+            # accumulated row by row as kernel values are computed.
+            # Memory:  O(MP + P²)  instead of  O(M²)
+            #
+            # FLOPs:  O(M²P + P³)
+            # ================================================================
+            # Projected kernel matrices [P × P complex]
+            K_ext = zeros!(pool, ComplexF64, P, P)
+            G_ext = zeros!(pool, ComplexF64, P, P)
+            K_int = similar!(pool, K_ext)
+            G_int = similar!(pool, G_ext)
+
+            # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
+            fused_timing = @timed begin
+                kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
+            end
+            println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
+
+            solve_timing = @timed begin
+                # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
+                K_int .= 2 .* Gram .- K_ext
+                G_int .= G_ext
+
+                # Solve projected BIEs for exterior and interior kernels
+                F = lu!(K_ext)
+                ldiv!(F, G_ext)
+                F = lu!(K_int)
+                ldiv!(F, G_int)
+
+                # wv = (4π²/M) · Gram · green_fourier
+                wv .= (4π^2 / M) .* (Gram * G_ext)
+
+                # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real.
+                # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
+                # perhaps make it a complex P * P matrix? Then don't need any of this section
+                mul!(temp, exp_mn_basis, G_ext)
+                @view(grre[1:M, 1:P]) .= real.(temp)
+                @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
+                mul!(temp, exp_mn_basis, G_int)
+                @view(grri[1:M, 1:P]) .= real.(temp)
+                @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
+            end
+            println(" Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
+
+        else
+            # ================================================================
+            # Wall Galerkin: both plasma and wall unknowns in (m,n) mode space.
+            # Builds four P×P projected kernel blocks (pp, pw, wp, ww) via
+            # projected_kernel!, assembles a 2P×2P system, and solves directly.
+            # Same exp_mn_basis and Gram for all blocks (same angular grid).
+            # Memory: O(MP + P²), no M² or (2M)² storage.
+            # ================================================================
+
+            # Four projected kernel blocks [P × P complex each]
             K_ext = zeros!(pool, ComplexF64, 2P, 2P)
-            K_ext[1:P, 1:P] .= K_pp_c
-            K_ext[1:P, (P+1):(2P)] .= K_pw_c
-            K_ext[(P+1):(2P), 1:P] .= K_wp_c
-            K_ext[(P+1):(2P), (P+1):(2P)] .= K_ww_c
-
-            K_int = zeros!(pool, ComplexF64, 2P, 2P)
-            K_int[1:P, 1:P] .= 2 .* Gram .- K_pp_c
-            K_int[1:P, (P+1):(2P)] .= .-K_pw_c
-            K_int[(P+1):(2P), 1:P] .= .-K_wp_c
-            K_int[(P+1):(2P), (P+1):(2P)] .= 2 .* Gram .- K_ww_c
-
-            # RHS [2P × P]: single-layer blocks with plasma as source
-            G_rhs_ext = zeros!(pool, ComplexF64, 2P, P)
-            G_rhs_ext[1:P, :] .= G_pp_c
-            G_rhs_ext[(P+1):(2P), :] .= G_wp_c
-
-            G_rhs_int = similar!(pool, G_rhs_ext)
-            G_rhs_int .= G_rhs_ext
-
-            # Exterior solve: K_ext * C_ext = G_rhs_ext
-            F_ext = lu!(K_ext)
-            ldiv!(F_ext, G_rhs_ext)
-            c_p_ext = @view G_rhs_ext[1:P, :]
-            c_w_ext = @view G_rhs_ext[(P+1):(2P), :]
-
-            # Interior solve: K_int * C_int = G_rhs_int
-            F_int = lu!(K_int)
-            ldiv!(F_int, G_rhs_int)
-            c_p_int = @view G_rhs_int[1:P, :]
-            c_w_int = @view G_rhs_int[(P+1):(2P), :]
-
-            # wv = (4π²/M) · Gram · c_p_ext (plasma observer only)
-            wv .= (4π^2 / M) .* (Gram * c_p_ext)
-
-            # Backward-compatible reconstruction: grre/grri in M×2P real layout
-            # Need to convert mode space to physical space and unpack the real and imaginary parts
-            mul!(temp, exp_mn_basis, c_p_ext)
-            @view(grre[1:M, 1:P]) .= real.(temp)
-            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-            mul!(temp, exp_mn_basis, c_p_int)
-            @view(grri[1:M, 1:P]) .= real.(temp)
-            @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
-            mul!(temp, exp_mn_basis, c_w_ext)
-            @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
-            @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
-            mul!(temp, exp_mn_basis, c_w_int)
-            @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
-            @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+            G_ext = zeros!(pool, ComplexF64, 2P, P)
+            K_int = similar!(pool, K_ext)
+            G_int = similar!(pool, G_ext)
+
+            kernel_timing = @timed begin
+                kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
+                kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram)
+                kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram)
+                kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram)
+            end
+            println(" Wall Galerkin Projected Kernels  TIME=$(round(kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))")
+
+            solve_timing = @timed begin
+                # Compute interior system: K_int = 2·Gram - K_ext
+                K_int .= -K_ext
+                K_int[1:P, 1:P] .+= 2 .* Gram
+                K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram
+                G_int .= G_ext
+
+                # Exterior solve: K_ext * G_ext = G_ext
+                F_ext = lu!(K_ext)
+                ldiv!(F_ext, G_ext)
+
+                # Interior solve: K_int * C_int = G_rhs_int
+                F_int = lu!(K_int)
+                ldiv!(F_int, G_int)
+
+                # wv = (4π²/M) · Gram · G_p_ext (plasma observer only)
+                wv .= (4π^2 / M) .* (Gram * view(G_ext, 1:P, :))
+
+                # Backward-compatible reconstruction: grre/grri in M×2P real layout
+                # Need to convert mode space to physical space and unpack the real and imaginary parts
+                mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
+                @view(grre[1:M, 1:P]) .= real.(temp)
+                @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
+                mul!(temp, exp_mn_basis, view(G_int, 1:P, :))
+                @view(grri[1:M, 1:P]) .= real.(temp)
+                @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
+                mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :))
+                @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
+                @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+                mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :))
+                @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
+                @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+            end
+            println(" Wall Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
         end
-        println(" Wall Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
 
     else
         # ================================================================

From 4925412f8ba1c007bc7107128183aadd4833a45f Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Mon, 16 Mar 2026 11:16:13 -0400
Subject: [PATCH 12/23] VACUUM - WIP - consolidating nowall and wall into one
 branch for galerkin

---
 src/Vacuum/Vacuum.jl | 145 +++++++++++++++++--------------------------
 1 file changed, 57 insertions(+), 88 deletions(-)

diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index e603fe1c..7b058e19 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -102,111 +102,81 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     P = num_modes
     temp = zeros!(pool, ComplexF64, M, P)
 
+    # ================================================================
+    # Galerkin: solve system in P×P mode space. Uses complex basis
+    # Z = C + iS so projected matrices are P×P complex.
+    #
+    # Fused (fuse_projection=true): kernel assembly + Fourier projection
+    # in one pass. The full M×M kernel matrices are never materialized —
+    # instead the P×P projected matrices grad_green_fourier and G_c are
+    # accumulated row by row as kernel values are computed.
+    # Memory:  O(MP + P²)  instead of  O(M²)
+    #
+    # FLOPs:  O(M²P + P³)
+    # ================================================================
     if use_galerkin
         # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve
         Gram = zeros!(pool, ComplexF64, P, P)
         mul!(Gram, exp_mn_basis', exp_mn_basis)
 
-        if wall.nowall
-            # ================================================================
-            # Galerkin (no wall): solve system in P×P mode space. Uses complex basis
-            # Z = C + iS so projected matrices are P×P complex.
-            #
-            # Fused (fuse_projection=true): kernel assembly + Fourier projection
-            # in one pass. The full M×M kernel matrices are never materialized —
-            # instead the P×P projected matrices grad_green_fourier and G_c are
-            # accumulated row by row as kernel values are computed.
-            # Memory:  O(MP + P²)  instead of  O(M²)
-            #
-            # FLOPs:  O(M²P + P³)
-            # ================================================================
-            # Projected kernel matrices [P × P complex]
-            K_ext = zeros!(pool, ComplexF64, P, P)
-            G_ext = zeros!(pool, ComplexF64, P, P)
-            K_int = similar!(pool, K_ext)
-            G_int = similar!(pool, G_ext)
-
-            # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
-            fused_timing = @timed begin
-                kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
-            end
-            println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
-
-            solve_timing = @timed begin
-                # Interior kernel: K_int = -K + 2I → grad_green_fourier_int = 2·Gram - grad_green_fourier
-                K_int .= 2 .* Gram .- K_ext
-                G_int .= G_ext
-
-                # Solve projected BIEs for exterior and interior kernels
-                F = lu!(K_ext)
-                ldiv!(F, G_ext)
-                F = lu!(K_int)
-                ldiv!(F, G_int)
-
-                # wv = (4π²/M) · Gram · green_fourier
-                wv .= (4π^2 / M) .* (Gram * G_ext)
-
-                # Backward-compatible reconstruction: grre/grri = real(Z·c), imag(Z·c) in M×2P real.
-                # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
-                # perhaps make it a complex P * P matrix? Then don't need any of this section
-                mul!(temp, exp_mn_basis, G_ext)
-                @view(grre[1:M, 1:P]) .= real.(temp)
-                @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-                mul!(temp, exp_mn_basis, G_int)
-                @view(grri[1:M, 1:P]) .= real.(temp)
-                @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
-            end
-            println(" Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
-
-        else
-            # ================================================================
-            # Wall Galerkin: both plasma and wall unknowns in (m,n) mode space.
-            # Builds four P×P projected kernel blocks (pp, pw, wp, ww) via
-            # projected_kernel!, assembles a 2P×2P system, and solves directly.
-            # Same exp_mn_basis and Gram for all blocks (same angular grid).
-            # Memory: O(MP + P²), no M² or (2M)² storage.
-            # ================================================================
-
-            # Four projected kernel blocks [P × P complex each]
-            K_ext = zeros!(pool, ComplexF64, 2P, 2P)
-            G_ext = zeros!(pool, ComplexF64, 2P, P)
-            K_int = similar!(pool, K_ext)
-            G_int = similar!(pool, G_ext)
+        # Projected kernel matrices: K is [2P × 2P], G is [2P × P] complex (only the leading P-blocks are solved when there is no wall)
+        K_ext = zeros!(pool, ComplexF64, 2P, 2P)
+        G_ext = zeros!(pool, ComplexF64, 2P, P)
+        K_int = similar!(pool, K_ext)
+        G_int = similar!(pool, G_ext)
 
+        # Fused projected kernel (plasma–plasma block): K_ext[1:P, 1:P] = Z^H K Z, G_ext[1:P, :] = Z^H G Z
+        fused_timing = @timed begin
+            kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
+        end
+        println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
+        if !wall.nowall
             kernel_timing = @timed begin
-                kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
                 kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram)
                 kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram)
                 kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram)
             end
             println(" Wall Galerkin Projected Kernels  TIME=$(round(kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))")
+        end
 
-            solve_timing = @timed begin
-                # Compute interior system: K_int = 2·Gram - K_ext
-                K_int .= -K_ext
-                K_int[1:P, 1:P] .+= 2 .* Gram
+        solve_timing = @timed begin
+            # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext
+            K_int .= -K_ext
+            K_int[1:P, 1:P] .+= 2 .* Gram
+            if !wall.nowall
                 K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram
-                G_int .= G_ext
-
-                # Exterior solve: K_ext * G_ext = G_ext
+            end
+            G_int .= G_ext
+
+            # Solve projected BIEs for exterior and interior kernels
+            if wall.nowall
+                F_ext = lu!(K_ext[1:P, 1:P])
+                ldiv!(F_ext, @view(G_ext[1:P, :]))
+                F_int = lu!(K_int[1:P, 1:P])
+                ldiv!(F_int, @view(G_int[1:P, :]))
+            else
                 F_ext = lu!(K_ext)
                 ldiv!(F_ext, G_ext)
-
-                # Interior solve: K_int * C_int = G_rhs_int
                 F_int = lu!(K_int)
                 ldiv!(F_int, G_int)
+            end
 
-                # wv = (4π²/M) · Gram · G_p_ext (plasma observer only)
-                wv .= (4π^2 / M) .* (Gram * view(G_ext, 1:P, :))
-
-                # Backward-compatible reconstruction: grre/grri in M×2P real layout
-                # Need to convert mode space to physical space and unpack the real and imaginary parts
-                mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
-                @view(grre[1:M, 1:P]) .= real.(temp)
-                @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-                mul!(temp, exp_mn_basis, view(G_int, 1:P, :))
-                @view(grri[1:M, 1:P]) .= real.(temp)
-                @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
+            # wv = (4π²/M) · Gram · green_fourier
+            wv .= (4π^2 / M) .* (Gram * view(G_ext, 1:P, :))
+        end
+        println(" Galerkin Solve  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
+        reconstruct_timing = @timed begin
+            # Backward-compatible reconstruction: grre/grri in M×2P real layout
+            # Need to convert mode space to physical space and unpack the real and imaginary parts
+            # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
+            # perhaps make it a complex P * P matrix? Then don't need any of this section
+            mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
+            @view(grre[1:M, 1:P]) .= real.(temp)
+            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
+            mul!(temp, exp_mn_basis, view(G_int, 1:P, :))
+            @view(grri[1:M, 1:P]) .= real.(temp)
+            @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
+            if !wall.nowall
                 mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :))
                 @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
                 @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
@@ -214,9 +184,8 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
                 @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
                 @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
             end
-            println(" Wall Galerkin Solve + Reconstruct  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
         end
-
+        println(" Reconstruct  TIME=$(round(reconstruct_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(reconstruct_timing.bytes))")
     else
         # ================================================================
         # Collocation approach: solve full physical-space system [M × M]

From 8bda42053fd875dd68b1ac2a183f6304940c5d6d Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Mon, 16 Mar 2026 12:41:15 -0400
Subject: [PATCH 13/23] VACUUM - WIP - optimizing the fused Galerkin code
 (mostly 3D benefits)

---
 src/Vacuum/Kernel2D.jl        |  20 +++--
 src/Vacuum/Kernel3D.jl        |  95 +++++++++++++++-------
 src/Vacuum/ProjectedKernel.jl | 148 +++++++++++++++++++++++-----------
 src/Vacuum/Vacuum.jl          |   5 +-
 4 files changed, 181 insertions(+), 87 deletions(-)

diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl
index eac91b01..b518ff4c 100644
--- a/src/Vacuum/Kernel2D.jl
+++ b/src/Vacuum/Kernel2D.jl
@@ -141,6 +141,9 @@ but grad_greenfunction is not since it fills a different block of the
     d1_spline_x(dx_dtheta_grid, theta_grid)
     d1_spline_z(dz_dtheta_grid, theta_grid)
 
+    # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition)
+    legendre_buf = acquire!(pool, Float64, n + 2)
+
     # Loop through observer points
     for j in 1:mtheta
         # Get observer coordinates
@@ -150,7 +153,7 @@ but grad_greenfunction is not since it fills a different block of the
         # Nonsingular region endpoints are at j±2, so exclude j-1, j, and j+1.
         @inbounds for k in 1:(mtheta-3)
             isrc = mod1(j + 1 + k, mtheta)
-            G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n; gamma_prefactor)
+            G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf; gamma_prefactor)
 
             # Composite Simpson's 1/3 rule weights, excluding singular points
             # Note we set to 4 for even/2 for odd since we index from 1 while the formula assumes indexing from 0
@@ -181,7 +184,7 @@ but grad_greenfunction is not since it fills a different block of the
                 dx_dtheta_gauss = d1_spline_x(theta_gauss0)
                 z_gauss = spline_z(theta_gauss0)
                 dz_dtheta_gauss = d1_spline_z(theta_gauss0)
-                G_n, gradG_n, gradG_0 = green(x_obs, z_obs, x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n; gamma_prefactor)
+                G_n, gradG_n, gradG_0 = green(x_obs, z_obs, x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n, legendre_buf; gamma_prefactor)
 
                 # Get stencil and weight for the Gaussian point
                 s = leftpanel ? stencils_left[ig] : stencils_right[ig]
@@ -639,19 +642,22 @@ according to equations (36)-(42) of Chance 1997. Replaces `green` from Fortran c
   - Implements analytical derivatives from Chance 1997 equations
   - The coupling terms include the Jacobian factor from the coordinate transformation
   - By default uses the 2007 Legendre function implementation (Bulirsch + Gaussian integration)
+
+This method requires a pre-allocated work buffer `legendre::AbstractVector{Float64}` of length `n+2`.
+Callers in tight loops should allocate this buffer once and pass it in to avoid per-call pool acquisition.
 """
-@with_pool pool function green(
+function green(
     x_obs::Float64,
     z_obs::Float64,
     x_source::Float64,
     z_source::Float64,
     dx_dtheta::Float64,
     dz_dtheta::Float64,
-    n::Int;
+    n::Int,
+    legendre::AbstractVector{Float64};
     gamma_prefactor::Float64=2 * sqrt(π) * gamma(0.5 - n),
     uselegacygreenfunction::Bool=false
 )
-
     x_obs2 = x_obs^2
     x_source2 = x_source^2
     x_minus2 = (x_obs - x_source)^2
@@ -670,9 +676,7 @@ according to equations (36)-(42) of Chance 1997. Replaces `green` from Fortran c
     # Argument of Legendre function 𝘴 [Chance Phys. Plasmas 1997 2161 eq. 42]
     s = (x_obs2 + x_source2 + ζ2) / R2
 
-    # Legendre functions for
-    # P⁰ = p0, P¹ = p1, Pⁿ = pn, Pⁿ⁺¹ = pnp1
-    legendre = acquire!(pool, Float64, n + 2)
+    # Legendre functions: P⁰ = p0, P¹ = p1, Pⁿ = pn, Pⁿ⁺¹ = pnp1
     if uselegacygreenfunction
         Pn_minus_half_1997!(legendre, s, n)
     else
diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl
index df28dd45..c158734e 100644
--- a/src/Vacuum/Kernel3D.jl
+++ b/src/Vacuum/Kernel3D.jl
@@ -197,7 +197,7 @@ The single-layer kernel φ is the fundamental solution to Laplace's equation:
 
   - `Float64`: Kernel value φ(x_obs, x_src)
 """
-function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real})
+@fastmath function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real})
     @inbounds begin
         dx = x_obs[1] - x_src[1]
         dy = x_obs[2] - x_src[2]
@@ -208,6 +208,21 @@ function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVect
     return inv(sqrt(r2))
 end
 
+"""
+Scalar-argument single-layer kernel. Avoids view creation in tight loops.
+"""
+@fastmath @inline function laplace_single_layer(
+    ox::Float64, oy::Float64, oz::Float64,
+    sx::Float64, sy::Float64, sz::Float64
+)
+    Δx = ox - sx
+    Δy = oy - sy
+    Δz = oz - sz
+    dist2 = Δx*Δx + Δy*Δy + Δz*Δz
+    # Coincident points (squared distance below 1e-30) contribute zero instead of a singular value
+    return dist2 < 1e-30 ? 0.0 : inv(sqrt(dist2))
+end
+
 """
     laplace_double_layer(x_obs, x_src, n_src) -> Float64
 
@@ -231,7 +246,7 @@ K(x_obs, x_src, n_src) = ∇_{x_src} φ · n_src = (x_obs - x_src) · n_src / |x
 
   - `Float64`: Kernel value K(x_obs, x_src, n_src)
 """
-function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}, n_src::AbstractVector{<:Real})
+@fastmath function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}, n_src::AbstractVector{<:Real})
     @inbounds begin
         dx = x_obs[1] - x_src[1]
         dy = x_obs[2] - x_src[2]
@@ -247,6 +262,24 @@ function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVect
     return (dx*nx + dy*ny + dz*nz) * r3inv
 end
 
+"""
+Scalar-argument double-layer kernel. Avoids view creation in tight loops.
+"""
+@fastmath @inline function laplace_double_layer(
+    ox::Float64, oy::Float64, oz::Float64,
+    sx::Float64, sy::Float64, sz::Float64,
+    nx::Float64, ny::Float64, nz::Float64
+)
+    Δx = ox - sx
+    Δy = oy - sy
+    Δz = oz - sz
+    dist2 = Δx*Δx + Δy*Δy + Δz*Δz
+    dist2 < 1e-30 && return 0.0  # coincident points contribute zero
+    inv_dist = inv(sqrt(dist2))
+    normal_proj = Δx*nx + Δy*ny + Δz*nz  # (x_obs - x_src) · n_src
+    return normal_proj * (inv_dist * inv_dist * inv_dist)
+end
+
 """
     extract_patch!(patch, data, idx_pol_center, idx_tor_center, npol, ntor, PATCH_DIM)
 
@@ -380,12 +413,17 @@ where each entry is φ(x_obs, x_src).
   - `grad_greenfunction`: Double-layer kernel matrix (Nobs × Nsrc) filled in place
 
   - `greenfunction`: Single-layer kernel matrix (Nobs × Nsrc) filled in place
+
   - `observer`: Observer geometry (PlasmaGeometry3D)
+
   - `source`: Source geometry (PlasmaGeometry3D)
+
   - `PATCH_RAD`: Number of points adjacent to source point to treat as singular
 
       + Total patch size in # of gridpoints = (2 * PATCH_RAD + 1) x (2 * PATCH_RAD + 1)
+
   - `RAD_DIM`: Polar radial quadrature order. Angular order = 2 * RAD_DIM
+
   - `INTERP_ORDER`: Lagrange interpolation order
 
       + Must be ≤ (2 * PATCH_RAD + 1)
@@ -446,24 +484,26 @@ function compute_3D_kernel_matrices!(
         # Convert linear index to 2D indices
         i_obs = mod1(idx_obs, observer.mtheta)
         j_obs = (idx_obs - 1) ÷ observer.mtheta + 1
-        r_obs = @view observer.r[idx_obs, :]
+        @inbounds ox = observer.r[idx_obs, 1]
+        @inbounds oy = observer.r[idx_obs, 2]
+        @inbounds oz = observer.r[idx_obs, 3]
 
         # ============================================================
         # FAR FIELD: Trapezoidal rule for nonsingular source points
         # Note: kernels return zero for r_src = r_obs
         # ============================================================
         @inbounds for idx_src in 1:num_points
-            # Evaluate kernels at grid points
-            r_src = @view source.r[idx_src, :]
-            n_src = @view source.normal[idx_src, :]
-            K_single = laplace_single_layer(r_obs, r_src)
-            K_double = laplace_double_layer(r_obs, r_src, n_src)
-
+            sx = source.r[idx_src, 1];
+            sy = source.r[idx_src, 2];
+            sz = source.r[idx_src, 3]
+            nx = source.normal[idx_src, 1];
+            ny = source.normal[idx_src, 2];
+            nz = source.normal[idx_src, 3]
             # Apply weights (periodic trapezoidal rule = constant weights)
+            grad_greenfunction_block[idx_obs, idx_src] = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ
             if populate_greenfunction
-                greenfunction[idx_obs, idx_src] = K_single * dθdζ
+                greenfunction[idx_obs, idx_src] = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ
             end
-            grad_greenfunction_block[idx_obs, idx_src] = K_double * dθdζ
         end
 
         # ============================================================
@@ -484,15 +524,15 @@ function compute_3D_kernel_matrices!(
 
         # Evaluate kernels at polar points with POU weighting
         @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM
-            # Evaluate kernels using recomputed normal (use @view to avoid allocation)
-            r_src = @view r_polar[ir, ia, :]
-            n_src = @view n_polar[ir, ia, :]
-            K_single = laplace_single_layer(r_obs, r_src)
-            K_double = laplace_double_layer(r_obs, r_src, n_src)
-
-            # Apply quadrature weights: area element × POU, where POU contains rdrdθ already
-            M_polar_single[ir, ia] = K_single * Ppou[ir, ia] * dθdζ
-            M_polar_double[ir, ia] = K_double * Ppou[ir, ia] * dθdζ
+            # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already
+            rsx = r_polar[ir, ia, 1];
+            rsy = r_polar[ir, ia, 2];
+            rsz = r_polar[ir, ia, 3]
+            nsx = n_polar[ir, ia, 1];
+            nsy = n_polar[ir, ia, 2];
+            nsz = n_polar[ir, ia, 3]
+            M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ
+            M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ
         end
 
         # Distribute polar singular corrections back to Cartesian grid using sparse matrix
@@ -502,25 +542,22 @@ function compute_3D_kernel_matrices!(
         M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
         M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM)
 
-        # Compute remaining far-field POU contribution and near-field polar quadrature result
-        # We include this region in the far-field trapezoidal rule, so use Gpou = -χ here to get 1-χ
+        # POU correction: read back far-field trapezoidal values instead of re-evaluating kernels.
+        # trap + M_grid + trap*Gpou = trap*(1+Gpou) + M_grid = trap*(1-χ) + M_grid
         @inbounds for j in 1:PATCH_DIM, i in 1:PATCH_DIM
             # Map back to global indices
             idx_pol = periodic_wrap(i_obs - PATCH_RAD + i - 1, source.mtheta)
             idx_tor = periodic_wrap(j_obs - PATCH_RAD + j - 1, source.nzeta)
             idx_src = idx_pol + source.mtheta * (idx_tor - 1)
 
-            # Remainder of far-field contribution on the singular grid: Gpou = -χ
-            r_src = @view source.r[idx_src, :]
-            n_src = @view source.normal[idx_src, :]
-            far_single = laplace_single_layer(r_obs, r_src) * Gpou[i, j] * dθdζ
-            far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[i, j] * dθdζ
+            trap_double = grad_greenfunction_block[idx_obs, idx_src]
+            grad_greenfunction_block[idx_obs, idx_src] = trap_double + M_grid_double[i, j] + trap_double * Gpou[i, j]
 
             # Apply near + far contributions
             if populate_greenfunction
-                greenfunction[idx_obs, idx_src] += M_grid_single[i, j] + far_single
+                trap_single = greenfunction[idx_obs, idx_src]
+                greenfunction[idx_obs, idx_src] = trap_single + M_grid_single[i, j] + trap_single * Gpou[i, j]
             end
-            grad_greenfunction_block[idx_obs, idx_src] += M_grid_double[i, j] + far_double
         end
     end
 
diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl
index ac8e526f..8218739d 100644
--- a/src/Vacuum/ProjectedKernel.jl
+++ b/src/Vacuum/ProjectedKernel.jl
@@ -15,6 +15,32 @@
 # FLOP cost is identical to the two-step approach O(M²P), but memory drops
 # from O(M²) to O(MP + P²).
 
+# ── Helpers for small-P accumulation (avoids BLAS dispatch overhead) ──────────
+
+"""
+Accumulate `proj += w * Zt[:, col]` with SIMD. Replaces BLAS.axpy! for small P.
+"""
+@inline function _accum_row!(proj::AbstractVector{ComplexF64}, w::Float64,
+    Zt::AbstractMatrix{ComplexF64}, col::Int)
+    @inbounds @simd for row in eachindex(proj)  # unchecked: Zt must have a row for every index of proj
+        proj[row] = proj[row] + w * Zt[row, col]
+    end
+end
+
+"""
+Rank-1 update `A += conj(Zt[:, j]) * y^T`. Avoids allocating a conjugated temporary.
+"""
+@inline function _rank1_conj!(A::AbstractMatrix{ComplexF64},
+    Zt::AbstractMatrix{ComplexF64}, j::Int,
+    y::AbstractVector{ComplexF64})
+    @inbounds for col in eachindex(y)
+        scale = y[col]
+        for row in axes(A, 1)  # rows innermost, following Julia's column-major layout
+            A[row, col] = A[row, col] + conj(Zt[row, j]) * scale
+        end
+    end
+end
+
 # ============================================================================
 # 2D fused projected kernel
 # ============================================================================
@@ -81,6 +107,7 @@ Memory: O(MP) instead of O(M²).
 )
     M, P = size(exp_mn_basis)
     Z = exp_mn_basis
+    Zt = Matrix{ComplexF64}(transpose(Z))  # [P × M] for contiguous column access
     mtheta = length(observer.x)
     dtheta = 2π / mtheta
     theta_grid = range(; start=0, length=mtheta, step=dtheta)
@@ -114,6 +141,9 @@ Memory: O(MP) instead of O(M²).
     d1_spline_x(dx_dtheta_grid, theta_grid)
     d1_spline_z(dz_dtheta_grid, theta_grid)
 
+    # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition)
+    legendre_buf = Vector{Float64}(undef, n + 2)
+
     # Per-observer projection vectors (P-length complex): proj_z = (kernel row) · Z
     proj_kz = zeros!(pool, ComplexF64, P)
     proj_gz = zeros!(pool, ComplexF64, P)
@@ -130,17 +160,15 @@ Memory: O(MP) instead of O(M²).
             isrc = mod1(j + 1 + k, mtheta)
             G_n, gradG_n, gradG_0 = green(x_obs, z_obs,
                 source.x[isrc], source.z[isrc],
-                dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n;
+                dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf;
                 gamma_prefactor)
 
             wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2))
 
             if populate_greenfunction
-                w_g = G_n * wsimpson
-                BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz)
+                _accum_row!(proj_gz, G_n * wsimpson, Zt, isrc)
             end
-            w_k = gradG_n * wsimpson
-            BLAS.axpy!(ComplexF64(w_k), @view(Z[isrc, :]), proj_kz)
+            _accum_row!(proj_kz, gradG_n * wsimpson, Zt, isrc)
 
             diag_accum -= gradG_0 * wsimpson
         end
@@ -160,7 +188,7 @@ Memory: O(MP) instead of O(M²).
                 z_gauss = spline_z(theta_gauss0)
                 dz_dtheta_gauss = d1_spline_z(theta_gauss0)
                 G_n, gradG_n, gradG_0 = green(x_obs, z_obs,
-                    x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n;
+                    x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n, legendre_buf;
                     gamma_prefactor)
 
                 s = leftpanel ? stencils_left[ig] : stencils_right[ig]
@@ -171,16 +199,12 @@ Memory: O(MP) instead of O(M²).
                         G_n += log((theta_obs - theta_gauss)^2) / x_obs
                     end
                     @inbounds for stencil_idx in 1:5
-                        w_g = G_n * s[stencil_idx] * wgauss
-                        isrc = sing_idx[stencil_idx]
-                        BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz)
+                        _accum_row!(proj_gz, G_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx])
                     end
                 end
 
                 @inbounds for stencil_idx in 1:5
-                    w_k = gradG_n * s[stencil_idx] * wgauss
-                    isrc = sing_idx[stencil_idx]
-                    BLAS.axpy!(ComplexF64(w_k), @view(Z[isrc, :]), proj_kz)
+                    _accum_row!(proj_kz, gradG_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx])
                 end
 
                 diag_accum -= gradG_0 * wgauss
@@ -190,19 +214,17 @@ Memory: O(MP) instead of O(M²).
         # Analytic singular integral correction [Chance 1997 eq. 75]
         if populate_greenfunction && observer isa PlasmaGeometry
             @inbounds for stencil_idx in 1:5
-                w_g = -log_correction_array[stencil_idx] / x_obs
-                isrc = sing_idx[stencil_idx]
-                BLAS.axpy!(ComplexF64(w_g), @view(Z[isrc, :]), proj_gz)
+                _accum_row!(proj_gz, -log_correction_array[stencil_idx] / x_obs, Zt, sing_idx[stencil_idx])
             end
         end
 
         # Fold diagonal accumulation into projection
-        BLAS.axpy!(ComplexF64(diag_accum), @view(Z[j, :]), proj_kz)
+        _accum_row!(proj_kz, diag_accum, Zt, j)
 
         # ── Rank-1 accumulate: K_c += conj(Z[j,:]) ⊗ proj_kz ──
-        BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_kz, K_c_block)
+        _rank1_conj!(K_c_block, Zt, j, proj_kz)
         if populate_greenfunction
-            BLAS.geru!(ComplexF64(1.0), conj.(@view(Z[j, :])), proj_gz, G_c_block)
+            _rank1_conj!(G_c_block, Zt, j, proj_gz)
         end
     end
 
@@ -259,6 +281,7 @@ function _projected_kernel_3D!(
 )
     M, P = size(exp_mn_basis)
     Z = exp_mn_basis
+    Zt = Matrix{ComplexF64}(transpose(Z))  # [P × M] for contiguous column access
     num_points = observer.mtheta * observer.nzeta
     dθdζ = 4π^2 / num_points
 
@@ -276,15 +299,16 @@ function _projected_kernel_3D!(
     quad_data = get_singular_quadrature(PATCH_RAD, RAD_DIM, INTERP_ORDER)
     (; PATCH_DIM, ANG_DIM, Ppou, Gpou, P2G) = quad_data
 
-    # [M × P] buffers: row idx_obs holds (kernel row idx_obs) · Z
-    KZ = zeros(ComplexF64, M, P)
-    GZ = zeros(ComplexF64, M, P)
+    # [P × M] buffers: column idx_obs holds (kernel row idx_obs) · Z
+    KZt = zeros(ComplexF64, P, M)
+    GZt = zeros(ComplexF64, P, M)
 
-    # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors)
+    # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors + patch mask)
     max_tid = Threads.maxthreadid()
     workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_tid]
     proj_kz_all = [zeros(ComplexF64, P) for _ in 1:max_tid]
     proj_gz_all = [zeros(ComplexF64, P) for _ in 1:max_tid]
+    is_patch_all = [falses(num_points) for _ in 1:max_tid]
 
     Threads.@threads :static for idx_obs in 1:num_points
         tid = Threads.threadid()
@@ -294,24 +318,40 @@ function _projected_kernel_3D!(
 
         proj_kz = proj_kz_all[tid]
         proj_gz = proj_gz_all[tid]
+        is_patch = is_patch_all[tid]
 
         fill!(proj_kz, 0.0)
         fill!(proj_gz, 0.0)
+        fill!(is_patch, false)
 
         i_obs = mod1(idx_obs, observer.mtheta)
         j_obs = (idx_obs - 1) ÷ observer.mtheta + 1
-        r_obs = @view observer.r[idx_obs, :]
+        @inbounds ox = observer.r[idx_obs, 1]
+        @inbounds oy = observer.r[idx_obs, 2]
+        @inbounds oz = observer.r[idx_obs, 3]
+
+        # Mark patch source indices so the far-field loop can skip them
+        @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
+            idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta)
+            idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta)
+            is_patch[idx_pol+source.mtheta*(idx_tor-1)] = true
+        end
 
-        # ── FAR FIELD: Trapezoidal rule ──
+        # ── FAR FIELD: Trapezoidal rule (skip patch — handled in POU correction) ──
         @inbounds for idx_src in 1:num_points
-            r_src = @view source.r[idx_src, :]
-            n_src = @view source.normal[idx_src, :]
-            w_double = laplace_double_layer(r_obs, r_src, n_src) * dθdζ
-            BLAS.axpy!(ComplexF64(w_double), @view(Z[idx_src, :]), proj_kz)
+            is_patch[idx_src] && continue
+            sx = source.r[idx_src, 1];
+            sy = source.r[idx_src, 2];
+            sz = source.r[idx_src, 3]
+            nx = source.normal[idx_src, 1];
+            ny = source.normal[idx_src, 2];
+            nz = source.normal[idx_src, 3]
+            w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ
+            _accum_row!(proj_kz, w_double, Zt, idx_src)
 
             if populate_greenfunction
-                w_single = laplace_single_layer(r_obs, r_src) * dθdζ
-                BLAS.axpy!(ComplexF64(w_single), @view(Z[idx_src, :]), proj_gz)
+                w_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ
+                _accum_row!(proj_gz, w_single, Zt, idx_src)
             end
         end
 
@@ -327,10 +367,14 @@ function _projected_kernel_3D!(
         compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient)
 
         @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM
-            r_src = @view r_polar[ir, ia, :]
-            n_src = @view n_polar[ir, ia, :]
-            M_polar_single[ir, ia] = laplace_single_layer(r_obs, r_src) * Ppou[ir, ia] * dθdζ
-            M_polar_double[ir, ia] = laplace_double_layer(r_obs, r_src, n_src) * Ppou[ir, ia] * dθdζ
+            rsx = r_polar[ir, ia, 1];
+            rsy = r_polar[ir, ia, 2];
+            rsz = r_polar[ir, ia, 3]
+            nsx = n_polar[ir, ia, 1];
+            nsy = n_polar[ir, ia, 2];
+            nsz = n_polar[ir, ia, 3]
+            M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ
+            M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ
         end
 
         mul!(M_grid_single_flat, P2G, vec(M_polar_single))
@@ -338,36 +382,44 @@ function _projected_kernel_3D!(
         M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
         M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM)
 
+        # POU correction: evaluate kernel once with combined weight (1+Gpou) = (1-χ)
+        # since far-field skipped patch points, we include the full trapezoidal + polar here
         @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
             idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta)
             idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta)
             idx_src = idx_pol + source.mtheta * (idx_tor - 1)
 
-            r_src = @view source.r[idx_src, :]
-            n_src = @view source.normal[idx_src, :]
-            far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[ii, jj] * dθdζ
-            w_double = M_grid_double[ii, jj] + far_double
-            BLAS.axpy!(ComplexF64(w_double), @view(Z[idx_src, :]), proj_kz)
+            sx = source.r[idx_src, 1];
+            sy = source.r[idx_src, 2];
+            sz = source.r[idx_src, 3]
+            nx = source.normal[idx_src, 1];
+            ny = source.normal[idx_src, 2];
+            nz = source.normal[idx_src, 3]
+            full_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[ii, jj]) * dθdζ
+            _accum_row!(proj_kz, M_grid_double[ii, jj] + full_double, Zt, idx_src)
 
             if populate_greenfunction
-                far_single = laplace_single_layer(r_obs, r_src) * Gpou[ii, jj] * dθdζ
-                w_single = M_grid_single[ii, jj] + far_single
-                BLAS.axpy!(ComplexF64(w_single), @view(Z[idx_src, :]), proj_gz)
+                full_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * (1.0 + Gpou[ii, jj]) * dθdζ
+                _accum_row!(proj_gz, M_grid_single[ii, jj] + full_single, Zt, idx_src)
             end
         end
 
-        # ── Write projected row to buffer (each idx_obs owns its row) ──
-        @inbounds KZ[idx_obs, :] .= proj_kz
+        # ── Write projected column to buffer (each idx_obs owns its column) ──
+        @inbounds for p in 1:P
+            KZt[p, idx_obs] = proj_kz[p]
+        end
         if populate_greenfunction
-            @inbounds GZ[idx_obs, :] .= proj_gz
+            @inbounds for p in 1:P
+                GZt[p, idx_obs] = proj_gz[p]
+            end
         end
     end
 
-    # ── Assemble P×P projected matrices: K_c = Z^H K Z, G_c = Z^H G Z ──
-    mul!(K_c_block, Z', KZ)
+    # ── Assemble P×P projected matrices: K_c = Z^H * KZt^T, G_c = Z^H * GZt^T ──
+    mul!(K_c_block, Z', transpose(KZt))
     K_c_block ./= 2π
     if populate_greenfunction
-        mul!(G_c_block, Z', GZ)
+        mul!(G_c_block, Z', transpose(GZt))
         G_c_block ./= 2π
     end
 
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 7b058e19..4fb1387c 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -161,8 +161,9 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
                 ldiv!(F_int, G_int)
             end
 
-            # wv = (4π²/M) · Gram · green_fourier
-            wv .= (4π^2 / M) .* (Gram * view(G_ext, 1:P, :))
+            # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G
+            mul!(wv, Gram, view(G_ext, 1:P, :))
+            wv .*= (4π^2 / M)
         end
         println(" Galerkin Solve  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
         reconstruct_timing = @timed begin

From b185e95b6d3e1eed3e31f35f5a38e7d1633f5666 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Mon, 16 Mar 2026 13:44:53 -0400
Subject: [PATCH 14/23] VACUUM - WIP - removing timers, adding benchmark

---
 benchmarks/benchmark_vacuum_galerkin.jl | 399 ++++++++++++++++++++++++
 src/Vacuum/Vacuum.jl                    | 182 +++++------
 2 files changed, 477 insertions(+), 104 deletions(-)
 create mode 100644 benchmarks/benchmark_vacuum_galerkin.jl

diff --git a/benchmarks/benchmark_vacuum_galerkin.jl b/benchmarks/benchmark_vacuum_galerkin.jl
new file mode 100644
index 00000000..358c6240
--- /dev/null
+++ b/benchmarks/benchmark_vacuum_galerkin.jl
@@ -0,0 +1,399 @@
+#!/usr/bin/env julia
+
+using Printf
+using LinearAlgebra
+using TOML
+using Plots
+
+using Pkg
+Pkg.instantiate()
+using BenchmarkTools
+
+using GeneralizedPerturbedEquilibrium
+const GPEC = GeneralizedPerturbedEquilibrium
+
+"""
+    make_wall_settings(example_dir::AbstractString)
+
+Construct `Vacuum.WallShapeSettings` from the `[Wall]` section in `gpec.toml`
+if present; otherwise return default settings.
+"""
+function make_wall_settings(example_dir::AbstractString)
+    inputs = TOML.parsefile(joinpath(example_dir, "gpec.toml"))
+    # Prefer "Wall"; fall back to the legacy capitalized "WALL" section used by some examples
+    for section in ("Wall", "WALL")
+        if haskey(inputs, section)
+            wall_cfg = inputs[section]
+            return GPEC.Vacuum.WallShapeSettings(; (Symbol(k) => v for (k, v) in wall_cfg)...)
+        end
+    end
+    return GPEC.Vacuum.WallShapeSettings()
+end
+
+"""
+    load_equilibrium(example_dir::AbstractString)
+
+Set up the equilibrium specified by `[Equilibrium]` in `gpec.toml` under `example_dir`.
+"""
+function load_equilibrium(example_dir::AbstractString)
+    inputs = TOML.parsefile(joinpath(example_dir, "gpec.toml"))
+    # Explicit throw instead of `@assert`: assertions may be disabled, so they must not guard input validation
+    haskey(inputs, "Equilibrium") || throw(ArgumentError("[Equilibrium] section missing in gpec.toml for $example_dir"))
+    return GPEC.Equilibrium.setup_equilibrium(GPEC.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], example_dir))
+end
+
+"""
+    make_vacuum_input(
+        equil::GPEC.Equilibrium.PlasmaEquilibrium;
+        ψ::Real,
+        mtheta::Int,
+        nzeta::Int,
+        mpert::Int,
+        mlow::Int,
+        npert::Int,
+        nlow::Int,
+        use_galerkin::Bool,
+    ) -> GPEC.Vacuum.VacuumInput
+
+Construct a `VacuumInput` at flux surface `ψ` with the specified resolution and mode set.
+The only parameter that differs between the two algorithms we compare is `use_galerkin`.
+"""
+function make_vacuum_input(
+    equil::GPEC.Equilibrium.PlasmaEquilibrium;
+    ψ::Real,
+    mtheta::Int,
+    nzeta::Int,
+    mpert::Int,
+    mlow::Int,
+    npert::Int,
+    nlow::Int,
+    use_galerkin::Bool
+)
+    surf_r, surf_z, surf_ν = GPEC.Vacuum.extract_plasma_surface_at_psi(equil, float(ψ))
+    # Surface arrays are handed to VacuumInput in reversed order; nzeta_in is fixed to 1
+    return GPEC.Vacuum.VacuumInput(;
+        x=reverse(surf_r),
+        z=reverse(surf_z),
+        ν=reverse(surf_ν),
+        mtheta_in=length(surf_r),
+        nzeta_in=1,
+        mlow,
+        mpert,
+        nlow,
+        npert,
+        mtheta,
+        nzeta,
+        force_wv_symmetry=true,
+        use_galerkin
+    )
+end
+
+"""
+    benchmark_vacuum_2d(
+        example_dir::AbstractString;
+        ψ::Real = 1.0,
+        mtheta_values::AbstractVector{<:Integer} = 16 .* (2 .^ (0:7)),
+        mpert::Int = 32,
+        mlow::Int = 0,
+        npert::Int = 1,
+        nlow::Int = 1,
+    )
+
+Benchmark `compute_vacuum_response` for 2D (nzeta = 1) using the Solovev example in
+`example_dir`. Scans over `mtheta_values` and compares collocation (`use_galerkin=false`)
+against Galerkin (`use_galerkin=true`) for convergence of the `wv` matrix and runtime.
+"""
+function benchmark_vacuum_2d(
+    example_dir::AbstractString;
+    ψ::Real=1.0,
+    mtheta_values::AbstractVector{<:Integer}=16 .* (2 .^ (0:7)),
+    mpert::Int=32,
+    mlow::Int=0,
+    npert::Int=1,
+    nlow::Int=1
+)
+    println("\n===== 2D Vacuum Benchmark (Solovev, $(basename(example_dir))) =====")
+    println("ψ = $(ψ), mtheta ∈ $(collect(mtheta_values)), mpert=$mpert, mlow=$mlow, nlow=$nlow, npert=$npert\n")
+
+    equilibrium = load_equilibrium(example_dir)
+    wall = make_wall_settings(example_dir)
+
+    mtheta_values = collect(mtheta_values)
+    n_res = length(mtheta_values)
+
+    times_colloc = zeros(n_res)
+    times_galerkin = zeros(n_res)
+    errs_colloc = zeros(n_res)
+    errs_galerkin = zeros(n_res)
+
+    # Convergence reference: Galerkin solve at twice the finest scanned poloidal resolution
+    mtheta_ref = 2 * maximum(mtheta_values)
+    println("Computing 2D reference matrices at mtheta_ref = $mtheta_ref")
+
+    ref_input = make_vacuum_input(
+        equilibrium;
+        ψ,
+        mtheta=mtheta_ref,
+        nzeta=1,
+        mpert,
+        mlow,
+        npert,
+        nlow,
+        use_galerkin=true
+    )
+    wv_pooled, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(ref_input, wall)
+    # `compute_vacuum_response` uses AdaptiveArrayPools, so keep an independent copy of the
+    # reference; later pooled allocations could otherwise overwrite its storage.
+    wv_ref = copy(wv_pooled)
+    wv_ref_norm = norm(wv_ref)
+
+    for (idx, mtheta) in pairs(mtheta_values)
+        println("  2D: mtheta = $(rpad(string(mtheta), 5))  (nzeta = 1)")
+
+        # Collocation run: time it, then solve once more to measure the error
+        colloc_input = make_vacuum_input(
+            equilibrium;
+            ψ,
+            mtheta,
+            nzeta=1,
+            mpert,
+            mlow,
+            npert,
+            nlow,
+            use_galerkin=false
+        )
+        elapsed_colloc = @belapsed GPEC.Vacuum.compute_vacuum_response($colloc_input, $wall)
+        wv_colloc, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(colloc_input, wall)
+        times_colloc[idx] = elapsed_colloc
+        errs_colloc[idx] = norm(wv_colloc .- wv_ref) / wv_ref_norm
+
+        # Galerkin run at the same resolution
+        galerkin_input = make_vacuum_input(
+            equilibrium;
+            ψ,
+            mtheta,
+            nzeta=1,
+            mpert,
+            mlow,
+            npert,
+            nlow,
+            use_galerkin=true
+        )
+        elapsed_galerkin = @belapsed GPEC.Vacuum.compute_vacuum_response($galerkin_input, $wall)
+        wv_galerkin, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(galerkin_input, wall)
+        times_galerkin[idx] = elapsed_galerkin
+        errs_galerkin[idx] = norm(wv_galerkin .- wv_ref) / wv_ref_norm
+
+        @printf("    Collocation:  t = %.3f s,  rel‖Δwv‖ = %.3e\n", elapsed_colloc, errs_colloc[idx])
+        @printf("    Galerkin:     t = %.3f s,  rel‖Δwv‖ = %.3e\n", elapsed_galerkin, errs_galerkin[idx])
+    end
+
+    # Side-by-side panels: convergence on the left, runtime on the right
+    fig = plot(; layout=(1, 2), size=(1100, 420))
+
+    # Left panel: relative wv error against mθ
+    plot!(fig[1], mtheta_values, errs_colloc;
+        lw=2, marker=:circle, xscale=:log10, yscale=:log10,
+        label="Collocation", xlabel="mθ", ylabel="rel‖Δwv‖",
+        title="2D Vacuum: wv convergence vs mθ")
+    plot!(fig[1], mtheta_values, errs_galerkin;
+        lw=2, marker=:square, xscale=:log10, yscale=:log10,
+        label="Galerkin")
+
+    # Right panel: measured runtime against mθ
+    plot!(fig[2], mtheta_values, times_colloc;
+        lw=2, marker=:circle, xscale=:log10, yscale=:log10,
+        label="Collocation", xlabel="mθ", ylabel="runtime [s]",
+        title="2D Vacuum: runtime vs mθ")
+    plot!(fig[2], mtheta_values, times_galerkin;
+        lw=2, marker=:square, xscale=:log10, yscale=:log10,
+        label="Galerkin")
+
+    plot!(fig[1]; legend=:bottomleft)
+    plot!(fig[2]; legend=:topleft)
+
+    outpath = joinpath(@__DIR__, "vacuum_galerkin_2d.png")
+    savefig(fig, outpath)
+    println("\n2D results saved to $(outpath)")
+
+    return (; mtheta_values, times_colloc, times_galerkin, errs_colloc, errs_galerkin)
+end
+
+"""
+    benchmark_vacuum_3d(
+        example_dir::AbstractString;
+        ψ::Real = 1.0,
+        mtheta_values::AbstractVector{<:Integer} = 16 .* (2 .^ (0:2)),
+        mpert::Int = 16,
+        mlow::Int = 0,
+        npert::Int = 1,
+        nlow::Int = 1,
+    )
+
+Benchmark `compute_vacuum_response` for 3D (nzeta = mtheta) using the Solovev 3D example
+in `example_dir`. Scans over `mtheta = nzeta` and compares collocation vs Galerkin.
+"""
+function benchmark_vacuum_3d(
+    example_dir::AbstractString;
+    ψ::Real=1.0,
+    mtheta_values::AbstractVector{<:Integer}=16 .* (2 .^ (0:2)),
+    mpert::Int=16,
+    mlow::Int=0,
+    npert::Int=1,
+    nlow::Int=1
+)
+    println("\n===== 3D Vacuum Benchmark (Solovev 3D, $(basename(example_dir))) =====")
+    println("ψ = $(ψ), mtheta = nzeta ∈ $(collect(mtheta_values)), mpert=$mpert, mlow=$mlow, nlow=$nlow, npert=$npert\n")
+
+    equil = load_equilibrium(example_dir)
+    wall_settings = make_wall_settings(example_dir)
+
+    mtheta_values = collect(mtheta_values)
+    nm = length(mtheta_values)
+
+    times_colloc = zeros(Float64, nm)
+    times_galerkin = zeros(Float64, nm)
+    errs_colloc = zeros(Float64, nm)
+    errs_galerkin = zeros(Float64, nm)
+
+    # Reference wv for convergence (highest resolution * 2) – always use galerkin
+    mtheta_ref = maximum(mtheta_values) * 2
+    nzeta_ref = mtheta_ref
+    println("Computing 3D reference matrices at mtheta_ref = nzeta_ref = $mtheta_ref")
+
+    input_ref_galerkin = make_vacuum_input(
+        equil;
+        ψ=ψ,
+        mtheta=mtheta_ref,
+        nzeta=nzeta_ref,
+        mpert=mpert,
+        mlow=mlow,
+        npert=npert,
+        nlow=nlow,
+        use_galerkin=true
+    )
+    wv_ref_galerkin, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_ref_galerkin, wall_settings)
+    # Again, protect the reference matrix from being overwritten by pooled allocations.
+    wv_ref_galerkin = copy(wv_ref_galerkin)
+    ref_norm_galerkin = norm(wv_ref_galerkin)
+
+    for (i, mtheta) in enumerate(mtheta_values)
+        nzeta = mtheta
+        println("  3D: mtheta = $(rpad(string(mtheta), 5)), nzeta = $(rpad(string(nzeta), 5))")
+
+        # Collocation
+        input_colloc = make_vacuum_input(
+            equil;
+            ψ=ψ,
+            mtheta=mtheta,
+            nzeta=nzeta,
+            mpert=mpert,
+            mlow=mlow,
+            npert=npert,
+            nlow=nlow,
+            use_galerkin=false
+        )
+        t_colloc = @belapsed GPEC.Vacuum.compute_vacuum_response($input_colloc, $wall_settings)
+        wv3, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_colloc, wall_settings)
+        errs_colloc[i] = norm(wv3 .- wv_ref_galerkin) / ref_norm_galerkin
+        times_colloc[i] = t_colloc
+
+        # Galerkin
+        input_galerkin = make_vacuum_input(
+            equil;
+            ψ=ψ,
+            mtheta=mtheta,
+            nzeta=nzeta,
+            mpert=mpert,
+            mlow=mlow,
+            npert=npert,
+            nlow=nlow,
+            use_galerkin=true
+        )
+        t_galerkin = @belapsed GPEC.Vacuum.compute_vacuum_response($input_galerkin, $wall_settings)
+        wv3g, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_galerkin, wall_settings)
+        errs_galerkin[i] = norm(wv3g .- wv_ref_galerkin) / ref_norm_galerkin
+        times_galerkin[i] = t_galerkin
+
+        @printf("    Collocation:  t = %.3f s,  rel‖Δwv‖ = %.3e\n", t_colloc, errs_colloc[i])
+        @printf("    Galerkin:     t = %.3f s,  rel‖Δwv‖ = %.3e\n", t_galerkin, errs_galerkin[i])
+    end
+
+    # Two‑pane plot: left = convergence, right = runtime (mtheta = nzeta on x-axis)
+    plt = plot(; layout=(1, 2), size=(1100, 420))
+
+    # Convergence
+    plot!(plt[1], mtheta_values, errs_colloc;
+        lw=2, marker=:circle, xscale=:log10, yscale=:log10,
+        label="Collocation", xlabel="mθ = nzeta", ylabel="rel‖Δwv‖",
+        title="3D Vacuum: wv convergence vs mθ = nzeta")
+    plot!(plt[1], mtheta_values, errs_galerkin;
+        lw=2, marker=:square, xscale=:log10, yscale=:log10,
+        label="Galerkin")
+
+    # Runtime
+    plot!(plt[2], mtheta_values, times_colloc;
+        lw=2, marker=:circle, xscale=:log10, yscale=:log10,
+        label="Collocation", xlabel="mθ = nzeta", ylabel="runtime [s]",
+        title="3D Vacuum: runtime vs mθ = nzeta")
+    plot!(plt[2], mtheta_values, times_galerkin;
+        lw=2, marker=:square, xscale=:log10, yscale=:log10,
+        label="Galerkin")
+
+    plot!(plt[1]; legend=:bottomleft)
+    plot!(plt[2]; legend=:topleft)
+
+    outpath = joinpath(@__DIR__, "vacuum_galerkin_3d.png")
+    savefig(plt, outpath)
+    println("\n3D results saved to $(outpath)")
+
+    return (; mtheta_values, times_colloc, times_galerkin, errs_colloc, errs_galerkin)
+end
+
+"""
+    main()
+
+Entry point when running this file as a script: runs the 2D benchmark, then the 3D benchmark.
+
+Usage (from repository root):
+
+```bash
+julia --project=. benchmarks/benchmark_vacuum_galerkin.jl
+```
+
+Edit the `mtheta_values` and other keyword arguments in the calls below to
+explore different resolution ranges.
+"""
+function main()
+    # 2D Solovev example directory, resolved relative to this file
+    example_2d = joinpath(@__DIR__, "..", "examples", "Solovev_ideal_example")
+
+    # 3D Solovev example directory, resolved relative to this file
+    example_3d = joinpath(@__DIR__, "..", "examples", "Solovev_ideal_example_3D")
+
+    benchmark_vacuum_2d(
+        example_2d;
+        ψ=1.0,
+        mtheta_values=16 .* (2 .^ (0:9)),  # 16, 32, …, 8192 (easily editable)
+        mpert=31,
+        mlow=-15,
+        npert=1,
+        nlow=1
+    )
+
+    benchmark_vacuum_3d(
+        example_3d;
+        ψ=1.0,
+        mtheta_values=16 .* (2 .^ (0:3)), # 16, 32, 64, 128 (easily editable)
+        mpert=31,
+        mlow=-15,
+        npert=1,
+        nlow=1
+    )
+
+    return nothing
+end
+
+if abspath(PROGRAM_FILE) == @__FILE__  # run main() only when this file is the executed script
+    main()
+end
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 4fb1387c..c1c50514 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -125,68 +125,57 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         K_int = similar!(pool, K_ext)
         G_int = similar!(pool, G_ext)
 
-        # Fused projected kernel: grad_green_fourier = Z^H K Z, green_fourier = Z^H G Z
-        fused_timing = @timed begin
-            kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
-        end
-        println(" Fused Projected Kernel  TIME=$(round(fused_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(fused_timing.bytes))")
+        # Fused projected kernel: compute Z^H K Z and Z^H G Z
+        kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
         if !wall.nowall
-            kernel_timing = @timed begin
-                kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram)
-                kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram)
-                kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram)
-            end
-            println(" Wall Galerkin Projected Kernels  TIME=$(round(kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(kernel_timing.bytes))")
+            kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram)
+            kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram)
+            kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram)
         end
 
-        solve_timing = @timed begin
-            # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext
-            K_int .= -K_ext
-            K_int[1:P, 1:P] .+= 2 .* Gram
-            if !wall.nowall
-                K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram
-            end
-            G_int .= G_ext
-
-            # Solve projected BIEs for exterior and interior kernels
-            if wall.nowall
-                F_ext = lu!(K_ext[1:P, 1:P])
-                ldiv!(F_ext, @view(G_ext[1:P, :]))
-                F_int = lu!(K_int[1:P, 1:P])
-                ldiv!(F_int, @view(G_int[1:P, :]))
-            else
-                F_ext = lu!(K_ext)
-                ldiv!(F_ext, G_ext)
-                F_int = lu!(K_int)
-                ldiv!(F_int, G_int)
-            end
-
-            # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G
-            mul!(wv, Gram, view(G_ext, 1:P, :))
-            wv .*= (4π^2 / M)
+        # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext
+        K_int .= -K_ext
+        K_int[1:P, 1:P] .+= 2 .* Gram
+        if !wall.nowall
+            K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram
+        end
+        G_int .= G_ext
+
+        # Solve projected BIEs for exterior and interior kernels
+        if wall.nowall
+            F_ext = lu!(K_ext[1:P, 1:P])
+            ldiv!(F_ext, @view(G_ext[1:P, :]))
+            F_int = lu!(K_int[1:P, 1:P])
+            ldiv!(F_int, @view(G_int[1:P, :]))
+        else
+            F_ext = lu!(K_ext)
+            ldiv!(F_ext, G_ext)
+            F_int = lu!(K_int)
+            ldiv!(F_int, G_int)
         end
-        println(" Galerkin Solve  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
-        reconstruct_timing = @timed begin
-            # Backward-compatible reconstruction: grre/grri in M×2P real layout
-            # Need to convert mode space to physical space and unpack the real and imaginary parts
-            # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
-            # perhaps make it a complex P * P matrix? Then don't need any of this section
-            mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
-            @view(grre[1:M, 1:P]) .= real.(temp)
-            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-            mul!(temp, exp_mn_basis, view(G_int, 1:P, :))
-            @view(grri[1:M, 1:P]) .= real.(temp)
-            @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
-            if !wall.nowall
-                mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :))
-                @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
-                @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
-                mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :))
-                @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
-                @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
-            end
+
+        # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G
+        mul!(wv, Gram, view(G_ext, 1:P, :))
+        wv .*= (4π^2 / M)
+
+        # Backward-compatible reconstruction: grre/grri in M×2P real layout
+        # Need to convert mode space to physical space and unpack the real and imaginary parts
+        # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
+        # perhaps make it a complex P * P matrix? Then don't need any of this section
+        mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
+        @view(grre[1:M, 1:P]) .= real.(temp)
+        @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
+        mul!(temp, exp_mn_basis, view(G_int, 1:P, :))
+        @view(grri[1:M, 1:P]) .= real.(temp)
+        @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
+        if !wall.nowall
+            mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :))
+            @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
+            @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+            mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :))
+            @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
+            @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
         end
-        println(" Reconstruct  TIME=$(round(reconstruct_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(reconstruct_timing.bytes))")
     else
         # ================================================================
         # Collocation approach: solve full physical-space system [M × M]
@@ -196,62 +185,47 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         grad_green = zeros!(pool, num_points_total, num_points_total)
         green_temp = zeros!(pool, num_points_surf, num_points_surf)
 
-        pp_kernel_timing = @timed begin
-            kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
-        end
-        println(" Plasma Kernel  TIME=$(round(pp_kernel_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(pp_kernel_timing.bytes))")
+        kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
 
         # Project plasma→plasma Green's function to mode space: grre[1:M, 1:2P] = real/imag(G*Z)
-        colloc_proj_timing = @timed begin
-            mul!(temp, green_temp, exp_mn_basis)
-            @view(grre[1:M, 1:P]) .= real.(temp)
-            @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-        end
-        println(" Plasma Project  TIME=$(round(colloc_proj_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(colloc_proj_timing.bytes))")
+        mul!(temp, green_temp, exp_mn_basis)
+        @view(grre[1:M, 1:P]) .= real.(temp)
+        @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
 
         if !wall.nowall
-            wall_block_timing = @timed begin
-                # Plasma–Wall block
-                kernel!(grad_green, green_temp, plasma_surf, wall, kparams)
-                # Wall–Wall block
-                kernel!(grad_green, green_temp, wall, wall, kparams)
-                # Wall–Plasma block
-                kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
-                # Project obs=wall, src=plasma block to mode space
-                mul!(temp, green_temp, exp_mn_basis)
-                @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
-                @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
-            end
-            println(" Wall Kernel and Project  TIME=$(round(wall_block_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(wall_block_timing.bytes))")
+            # Plasma–Wall block
+            kernel!(grad_green, green_temp, plasma_surf, wall, kparams)
+            # Wall–Wall block
+            kernel!(grad_green, green_temp, wall, wall, kparams)
+            # Wall–Plasma block
+            kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
+            # Project obs=wall, src=plasma block to mode space
+            mul!(temp, green_temp, exp_mn_basis)
+            @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
+            @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
         end
 
         # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1)
-        solve_timing = @timed begin
-            grri .= grre # start from same as exterior
-            grad_green_interior = similar!(pool, grad_green)
-            grad_green_interior .= grad_green
-
-            # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel
-            F_ext = lu!(grad_green)
-            ldiv!(F_ext, grre)
-
-            # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal
-            grad_green_interior .*= -1
-            for i in 1:num_points_total
-                grad_green_interior[i, i] += 2.0
-            end
-            F_int = lu!(grad_green_interior)
-            ldiv!(F_int, grri)
+        grri .= grre # start from same as exterior
+        grad_green_interior = similar!(pool, grad_green)
+        grad_green_interior .= grad_green
+
+        # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel
+        F_ext = lu!(grad_green)
+        ldiv!(F_ext, grre)
+
+        # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal
+        grad_green_interior .*= -1
+        for i in 1:num_points_total
+            grad_green_interior[i, i] += 2.0
         end
-        println(" Invert and Solve  TIME=$(round(solve_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(solve_timing.bytes))")
+        F_int = lu!(grad_green_interior)
+        ldiv!(F_int, grri)
 
-        wv_timing = @timed begin
-            # wv = (4π²/M) · Z^H · grre_complex  [Chance Phys. Plasmas 2007 052506 eq. 115-118]
-            temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)]))
-            mul!(wv, exp_mn_basis', temp)
-            wv .*= (4π^2 / M)
-        end
-        println(" Compute Wv  TIME=$(round(wv_timing.time; digits=6)) s  ALLOCATIONS=$(Base.format_bytes(wv_timing.bytes))")
+        # wv = (4π²/M) · Z^H · grre_complex  [Chance Phys. Plasmas 2007 052506 eq. 115-118]
+        temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)]))
+        mul!(wv, exp_mn_basis', temp)
+        wv .*= (4π^2 / M)
     end
 
     inputs.force_wv_symmetry && hermitianpart!(wv)

From 0c5ef1c4d66bcb7cdaf38060c0b9dbbd05ba0999 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Mon, 16 Mar 2026 14:16:46 -0400
Subject: [PATCH 15/23] VACUUM - WIP - simplifying some math in Free.jl

---
 src/ForceFreeStates/Free.jl | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/src/ForceFreeStates/Free.jl b/src/ForceFreeStates/Free.jl
index 65455d9e..f0e4a9be 100644
--- a/src/ForceFreeStates/Free.jl
+++ b/src/ForceFreeStates/Free.jl
@@ -28,14 +28,11 @@ and data dumping.
 
     # Compute vacuum response matrix in-place (handles 2D single-n, 2D multi-n block-diagonal, and 3D)
     vac_inputs = Vacuum.VacuumInput(equil, psilim, ctrl.mthvac, ctrl.nzvac, mpert, mlow, npert, nlow; force_wv_symmetry=ctrl.force_wv_symmetry)
-    @time Vacuum.compute_vacuum_response!(vac_data, vac_inputs, wall_settings)
+    Vacuum.compute_vacuum_response!(vac_data, vac_inputs, wall_settings)
 
     # Scale by (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126]
     singfac = vec((mlow:mhigh) .- qlim .* (nlow:nhigh)')
-    @inbounds for ipert in 1:numpert_total
-        @views vac_data.wv[ipert, :] .*= singfac[ipert]
-        @views vac_data.wv[:, ipert] .*= singfac[ipert]
-    end
+    @inbounds @views vac_data.wv .*= singfac .* singfac'
 
     # Compute complex energy eigenvalues and vectors
     vac_data.wt .= wp .+ vac_data.wv
@@ -75,13 +72,11 @@ and data dumping.
     # Compute plasma and vacuum contributions.
     # wpt = wt' * wp * wt  ; wvt = wt' * wv * wt
     mul!(tmp_mat, wp, vac_data.wt)
-    mul!(wpt, adjoint(vac_data.wt), tmp_mat)
+    mul!(wpt, vac_data.wt', tmp_mat)
     mul!(tmp_mat, vac_data.wv, vac_data.wt)
-    mul!(wvt, adjoint(vac_data.wt), tmp_mat)
-    for ipert in 1:numpert_total
-        vac_data.ep[ipert] = wpt[ipert, ipert]
-        vac_data.ev[ipert] = wvt[ipert, ipert]
-    end
+    mul!(wvt, vac_data.wt', tmp_mat)
+    vac_data.ep .= diag(wpt)
+    vac_data.ev .= diag(wvt)
 
     # Normalize eigenvectors based on scaled wt
     coeffs = odet.u[:, :, 1, end] \ (vac_data.wt .* (2π * equil.psio * 1e-3))
@@ -123,8 +118,8 @@ function free_compute_wv_spline(ctrl::ForceFreeStatesControl, equil::Equilibrium
     # TODO: 4 spline points is arbitrary - is there a better way?
     qedge = profiles.q_spline(ctrl.psiedge)
     npsi = max(4, ceil(Int, (intr.qlim - qedge) * intr.nhigh * 4))
-    psi_array = zeros(Float64, npsi + 1)
-    wv_array = zeros(ComplexF64, npsi + 1, intr.numpert_total, intr.numpert_total)
+    psi_array = zeros!(pool, Float64, npsi + 1)
+    wv_array = zeros!(pool, ComplexF64, npsi + 1, intr.numpert_total, intr.numpert_total)
 
     for i in 1:(npsi+1)
         # Space points evenly in q
@@ -143,10 +138,7 @@ function free_compute_wv_spline(ctrl::ForceFreeStatesControl, equil::Equilibrium
 
         # Apply singular factor scaling: (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126]
         singfac = vec((intr.mlow:intr.mhigh) .- qi .* (intr.nlow:intr.nhigh)')
-        @inbounds for ipert in 1:intr.numpert_total
-            @views wv[ipert, :] .*= singfac[ipert]
-            @views wv[:, ipert] .*= singfac[ipert]
-        end
+        @inbounds @views wv .*= singfac .* singfac'
 
         @views wv_array[i, :, :] .= wv
     end

From 8c2a78a62a3ff3699beed50bd9e8b6f857ed2ad2 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Tue, 17 Mar 2026 09:50:05 -0400
Subject: [PATCH 16/23] EXAMPLES - IMPROVEMENT - small cleanups to example
 tomls

---
 examples/DIIID-like_ideal_example/gpec.toml | 20 +++++++++----------
 examples/Solovev_ideal_example/gpec.toml    | 22 ---------------------
 examples/Solovev_ideal_example_3D/gpec.toml | 11 -----------
 3 files changed, 10 insertions(+), 43 deletions(-)

diff --git a/examples/DIIID-like_ideal_example/gpec.toml b/examples/DIIID-like_ideal_example/gpec.toml
index 982f6817..f95e67dc 100644
--- a/examples/DIIID-like_ideal_example/gpec.toml
+++ b/examples/DIIID-like_ideal_example/gpec.toml
@@ -14,16 +14,6 @@ newq0 = 0                               # Override for on-axis safety factor (0
 etol = 1e-7                             # Error tolerance for equilibrium solver
 force_termination = false               # Terminate after equilibrium setup (skip stability calculations)
 
-[Wall]
-shape = "nowall"                        # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath)
-a = 0.2415                              # Distance from plasma (conformal) or shape parameter
-aw = 0.05                               # Half-thickness parameter for Dee-shaped walls
-bw = 1.5                                # Elongation parameter for wall shapes
-cw = 0                                  # Offset of wall center from major radius
-dw = 0.5                                # Triangularity parameter for wall shapes
-tw = 0.05                               # Sharpness of wall corners (try 0.05 as initial value)
-equal_arc_wall = true                   # Equal arc length distribution of nodes on wall
-
 [ForceFreeStates]
 bal_flag = false              # Ideal MHD ballooning criterion for short wavelengths
 mat_flag = true               # Construct coefficient matrices for diagnostic purposes
@@ -63,6 +53,16 @@ save_interval = 3            # Save every Nth ODE step (1=all, 10=every 10th). A
 singfac_min = 1e-4            # Fractional distance from rational q at which ideal jump enforced
 ucrit = 1e4                   # Maximum fraction of solutions allowed before re-normalized
 
+[Wall]
+shape = "nowall"                        # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath)
+a = 0.2415                              # Distance from plasma (conformal) or shape parameter
+aw = 0.05                               # Half-thickness parameter for Dee-shaped walls
+bw = 1.5                                # Elongation parameter for wall shapes
+cw = 0                                  # Offset of wall center from major radius
+dw = 0.5                                # Triangularity parameter for wall shapes
+tw = 0.05                               # Sharpness of wall corners (try 0.05 as initial value)
+equal_arc_wall = true                   # Equal arc length distribution of nodes on wall
+
 [ForcingTerms]
 forcing_data_file = "forcing.dat"       # Path to forcing data file (n, m, complex amplitude)
 forcing_data_format = "ascii"           # Format of forcing data: "ascii" or "hdf5"
diff --git a/examples/Solovev_ideal_example/gpec.toml b/examples/Solovev_ideal_example/gpec.toml
index 0065fde8..77c6d2e0 100644
--- a/examples/Solovev_ideal_example/gpec.toml
+++ b/examples/Solovev_ideal_example/gpec.toml
@@ -14,28 +14,6 @@ newq0 = 0                               # Override for on-axis safety factor (0
 etol = 1e-7                             # Error tolerance for equilibrium solver
 force_termination = false               # Terminate after equilibrium setup (skip stability calculations)
 
-
-[Wall]
-shape = "conformal"                     # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath)
-a = 0.2415                              # Distance from plasma (conformal) or shape parameter
-aw = 0.05                               # Half-thickness parameter for Dee-shaped walls
-bw = 1.5                                # Elongation parameter for wall shapes
-cw = 0                                  # Offset of wall center from major radius
-dw = 0.5                                # Triangularity parameter for wall shapes
-tw = 0.05                               # Sharpness of wall corners (try 0.05 as initial value)
-equal_arc_wall = true                   # Equal arc length distribution of nodes on wall
-
-# [PerturbedEquilibrium]
-# # Uncomment this section to enable perturbed equilibrium calculations
-# forcing_data_file = "forcing.dat"      # Path to forcing data (n, m, real, imag)
-# forcing_data_format = "ascii"          # "ascii" or "hdf5"
-# fixed_boundary = false                 # Fixed boundary flag
-# output_eigenmodes = true               # Output mode fields as b-fields
-# compute_response = true                # Compute plasma response
-# compute_singular_coupling = true       # Compute singular coupling metrics
-# verbose = true                         # Enable verbose logging
-# write_outputs_to_HDF5 = true           # Write outputs to HDF5
-
 [ForceFreeStates]
 bal_flag = false              # Ideal MHD ballooning criterion for short wavelengths
 mat_flag = true               # Construct coefficient matrices for diagnostic purposes
diff --git a/examples/Solovev_ideal_example_3D/gpec.toml b/examples/Solovev_ideal_example_3D/gpec.toml
index 3dd466a0..93d4e995 100644
--- a/examples/Solovev_ideal_example_3D/gpec.toml
+++ b/examples/Solovev_ideal_example_3D/gpec.toml
@@ -63,14 +63,3 @@ cw = 0                                  # Offset of wall center from major radiu
 dw = 0.5                                # Triangularity parameter for wall shapes
 tw = 0.05                               # Sharpness of wall corners (try 0.05 as initial value)
 equal_arc_wall = false                   # Equal arc length distribution of nodes on wall
-
-# [PerturbedEquilibrium]
-# # Uncomment this section to enable perturbed equilibrium calculations
-# forcing_data_file = "forcing.dat"      # Path to forcing data (n, m, real, imag)
-# forcing_data_format = "ascii"          # "ascii" or "hdf5"
-# fixed_boundary = false                 # Fixed boundary flag
-# output_eigenmodes = true               # Output mode fields as b-fields
-# compute_response = true                # Compute plasma response
-# compute_singular_coupling = true       # Compute singular coupling metrics
-# verbose = true                         # Enable verbose logging
-# write_outputs_to_HDF5 = true           # Write outputs to HDF5

From 52cceab9ecc1b319e7f6ba6a0607838792f6115d Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Tue, 17 Mar 2026 13:24:33 -0400
Subject: [PATCH 17/23] VACUUM - WIP - merging the projected kernel functions
 into the regular ones, 2D working

---
 examples/Solovev_ideal_example/gpec.toml |   2 +-
 src/Vacuum/DataTypes.jl                  |  11 +-
 src/Vacuum/Kernel2D.jl                   | 116 ++++++++++++---------
 src/Vacuum/ProjectedKernel.jl            |  48 ++-------
 src/Vacuum/Utilities.jl                  |  25 +++++
 src/Vacuum/Vacuum.jl                     | 126 +++++++++++------------
 6 files changed, 170 insertions(+), 158 deletions(-)

diff --git a/examples/Solovev_ideal_example/gpec.toml b/examples/Solovev_ideal_example/gpec.toml
index 77c6d2e0..108c55bd 100644
--- a/examples/Solovev_ideal_example/gpec.toml
+++ b/examples/Solovev_ideal_example/gpec.toml
@@ -53,7 +53,7 @@ ucrit = 1e3                   # Maximum fraction of solutions allowed before re-
 force_wv_symmetry = true      # Forces vacuum energy matrix symmetry
 save_interval = 3            # Save every Nth ODE step (1=all, 10=every 10th). Always saves near rational surfaces.
 
-[WALL]
+[Wall]
 shape = "conformal"           # String selecting wall shape ["nowall", "conformal", "elliptical", "dee", "mod_dee", "from_file"]
 a = 0.2415                    # The distance of the wall from the plasma in units of major radius (conformal), or minor radius parameter (others).
 aw = 0.05                     # Half-thickness of the wall.
diff --git a/src/Vacuum/DataTypes.jl b/src/Vacuum/DataTypes.jl
index 5522c37b..63d029b4 100644
--- a/src/Vacuum/DataTypes.jl
+++ b/src/Vacuum/DataTypes.jl
@@ -22,10 +22,6 @@ nzeta > 1 for 3D vacuum calculation.
   - `mtheta::Int`: Number of vacuum calculation poloidal grid points
   - `nzeta::Int`: Number of vacuum calculation toroidal grid points (1 for 2D vacuum calculation, > 1 for 3D vacuum calculation)
   - `force_wv_symmetry::Bool`: Boolean flag to enforce symmetry in the vacuum response matrix
-  - `use_galerkin::Bool`: Use Galerkin projection to solve in truncated Fourier space [O(P³)]
-    instead of full collocation [O(M³)]. Applies to both no-wall and wall cases. For the wall
-    case, both plasma and wall unknowns are represented in (m,n) mode space, yielding a 2P×2P
-    system with no M² storage. Defaults to `false`.
 """
 @kwdef struct VacuumInput
     x::Vector{Float64} = Float64[]
@@ -41,7 +37,6 @@ nzeta > 1 for 3D vacuum calculation.
     mtheta::Int = 1
     nzeta::Int = 1
     force_wv_symmetry::Bool = true
-    use_galerkin::Bool = false
 end
 
 """
@@ -81,8 +76,7 @@ function VacuumInput(
     mlow::Int,
     npert::Int,
     nlow::Int;
-    force_wv_symmetry::Bool=true,
-    use_galerkin::Bool=false
+    force_wv_symmetry::Bool=true
 )
     # Extract plasma surface geometry at this psi
     r, z, ν = extract_plasma_surface_at_psi(equil, ψ)
@@ -98,8 +92,7 @@ function VacuumInput(
         npert=npert,
         mtheta=mtheta,
         nzeta=nzeta,
-        force_wv_symmetry=force_wv_symmetry,
-        use_galerkin=true
+        force_wv_symmetry=force_wv_symmetry
     )
 end
 
diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl
index b518ff4c..d4be71b1 100644
--- a/src/Vacuum/Kernel2D.jl
+++ b/src/Vacuum/Kernel2D.jl
@@ -67,17 +67,17 @@ The residue calculation needs to be updated for open walls.**
 
 # Arguments
 
-  - `grad_greenfunction`: Gradient Green's function matrix (output)
-  - `greenfunction`: Green's function matrix (output)
+  - `K`: Fourier-space Gradient Green's function matrix (output)
+  - `G`: Fourier-space Green's function matrix (output)
   - `observer`: Observer geometry struct (PlasmaGeometry or WallGeometry)
   - `source`: Source geometry struct (PlasmaGeometry or WallGeometry)
   - `n`: Toroidal mode number
 
 # Returns
 
-Modifies `grad_greenfunction` and `greenfunction` in place.
-Note that greenfunction is zeroed each time this function is called,
-but grad_greenfunction is not since it fills a different block of the
+Modifies `K` and `G` in place.
+Note that G is zeroed each time this function is called,
+but K is not since it fills a different block of the
 (2 * mtheta, 2 * mtheta) depending on the source/observer.
 
 # Notes
@@ -87,28 +87,26 @@ but grad_greenfunction is not since it fills a different block of the
   - Implements analytical singularity removal [Chance Phys. Plasmas 1997 2161]
 """
 @with_pool pool function compute_2D_kernel_matrices!(
-    grad_greenfunction::AbstractMatrix{Float64},
-    greenfunction::AbstractMatrix{Float64},
+    K::AbstractMatrix{ComplexF64},
+    G::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
-    n::Int
+    n::Int,
+    Z::AbstractMatrix{ComplexF64},
+    Gram::AbstractMatrix{ComplexF64}
 )
 
-    mtheta = length(observer.x)
-    dtheta = 2π / mtheta
-    theta_grid = range(; start=0, length=mtheta, step=dtheta)
+    M, P = size(Z) # M = mtheta, P = num_modes
+    Zt = Matrix{ComplexF64}(transpose(Z))  # [P × M] for contiguous column access
+    dtheta = 2π / M
+    theta_grid = range(; start=0, length=M, step=dtheta)
 
     # Take a view of the corresponding block of the grad_greenfunction
-    col_index = (source isa PlasmaGeometry ? 1 : 2)
-    row_index = (observer isa PlasmaGeometry ? 1 : 2)
-    grad_greenfunction_block = view(
-        grad_greenfunction,
-        ((row_index-1)*mtheta+1):(row_index*mtheta),
-        ((col_index-1)*mtheta+1):(col_index*mtheta)
-    )
+    col_idx = (source isa PlasmaGeometry ? 1 : 2)
+    row_idx = (observer isa PlasmaGeometry ? 1 : 2)
+    K_block = view(K, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P))
+    G_block = view(G, ((row_idx-1)*P+1):(row_idx*P), :)
 
-    # Zero out greenfunction at start of each kernel call
-    fill!(greenfunction, 0.0)
     # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997)
     populate_greenfunction = source isa PlasmaGeometry
 
@@ -119,7 +117,7 @@ but grad_greenfunction is not since it fills a different block of the
     log_correction_array = SVector(log_correction_2, log_correction_1, log_correction_0, log_correction_1, log_correction_2)
 
     # Precompute the n-dependent prefactor 2√π·Γ(1/2-n) [Chance Phys. Plasmas 1997 2161 eq. 40]
-    # This is constant for all source/observer point pairs within this kernel call.
+    # This constant is only computed once for each n
     gamma_prefactor = 2 * sqrt(π) * gamma(0.5 - n)
 
     # Set up periodic splines used for off-grid Gaussian quadrature points
@@ -134,8 +132,8 @@ but grad_greenfunction is not since it fills a different block of the
 
     # Precompute source derivatives on the theta grid once used in Simpson integration
     # The Gaussian singular-panel points are off-grid, so those still use spline evaluation directly.
-    dx_dtheta_grid = acquire!(pool, eltype(source.x), mtheta)
-    dz_dtheta_grid = acquire!(pool, eltype(source.z), mtheta)
+    dx_dtheta_grid = acquire!(pool, eltype(source.x), M)
+    dz_dtheta_grid = acquire!(pool, eltype(source.z), M)
 
     # Call in-place API to avoid allocations
     d1_spline_x(dx_dtheta_grid, theta_grid)
@@ -144,35 +142,49 @@ but grad_greenfunction is not since it fills a different block of the
     # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition)
     legendre_buf = acquire!(pool, Float64, n + 2)
 
+    # Per-observer projection vectors: proj = (kernel row) · Z
+    proj_k = zeros!(pool, ComplexF64, P)
+    proj_g = zeros!(pool, ComplexF64, P)
+
     # Loop through observer points
-    for j in 1:mtheta
+    for j in 1:M
         # Get observer coordinates
         x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j]
 
-        # Perform Simpson integration for nonsingular source points
+        # Zero out projection terms
+        fill!(proj_k, 0.0)
+        fill!(proj_g, 0.0)
+        diag_accum = 0.0
+
+        # ============================================================
+        # FAR FIELD: Simpson integration for nonsingular source points
+        # ============================================================
         # Nonsingular region endpoints are at j±2, so exclude j-1, j, and j+1.
-        @inbounds for k in 1:(mtheta-3)
-            isrc = mod1(j + 1 + k, mtheta)
+        @inbounds for k in 1:(M-3)
+            isrc = mod1(j + 1 + k, M)
             G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf; gamma_prefactor)
 
             # Composite Simpson's 1/3 rule weights, excluding singular points
             # Note we set to 4 for even/2 for odd since we index from 1 while the formula assumes indexing from 0
-            wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2))
+            wsimpson = dtheta / 3 * ((k == 1 || k == M - 3) ? 1 : (iseven(k) ? 4 : 2))
 
-            # Sum contributions to Green's function matrices using Simpson weight
+            # Sum and project contributions to Green's function matrices
             if populate_greenfunction
-                greenfunction[j, isrc] += G_n * wsimpson
+                _accum_row!(proj_g, G_n * wsimpson, Zt, isrc)
             end
-            grad_greenfunction_block[j, isrc] += gradG_n * wsimpson
+            _accum_row!(proj_k, gradG_n * wsimpson, Zt, isrc)
             # Subtract regular integral component of δⱼᵢK⁰ [Chance Phys. Plasmas 1997 2161 eq. 83]
-            grad_greenfunction_block[j, j] -= gradG_0 * wsimpson
+            diag_accum -= gradG_0 * wsimpson
         end
 
-        # Perform Gaussian quadrature for singular points (source = obs point)
+        # ============================================================
+        # NEAR FIELD: Gaussian quadrature with singular correction
+        # ============================================================
         # Indices of the singularity region, [j-2, j-1, j, j+1, j+2] (allocation-free)
         for (offset_idx, offset) in enumerate(-2:2)
-            sing_idx[offset_idx] = mod1(j + offset + mtheta, mtheta)
+            sing_idx[offset_idx] = mod1(j + offset + M, M)
         end
+
         # Integrate region of length 2 * dtheta on left/right of singularity
         for leftpanel in (true, false)
             gauss_mid = theta_obs + (leftpanel ? -dtheta : dtheta)
@@ -197,54 +209,64 @@ but grad_greenfunction is not since it fills a different block of the
                         G_n += log((theta_obs - theta_gauss)^2) / x_obs
                     end
                     @inbounds for stencil_idx in 1:5
-                        greenfunction[j, sing_idx[stencil_idx]] += G_n * s[stencil_idx] * wgauss
+                        _accum_row!(proj_g, G_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx])
                     end
                 end
 
                 # Second type of singularity: 𝒦ⁿ [Chance Phys. Plasmas 1997 2161 eq. 83, 86]
                 @inbounds for stencil_idx in 1:5
-                    grad_greenfunction_block[j, sing_idx[stencil_idx]] += gradG_n * s[stencil_idx] * wgauss
+                    _accum_row!(proj_k, gradG_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx])
                 end
                 # Subtract off the diverging singular n=0 component
-                grad_greenfunction_block[j, j] -= gradG_0 * wgauss
+                diag_accum -= gradG_0 * wgauss
             end
         end
 
         # Subtract off analytic singular integral [Chance Phys. Plasmas 1997 2161 eq. 75] if plasma-plasma block
         if populate_greenfunction && observer isa PlasmaGeometry
             @inbounds for stencil_idx in 1:5
-                greenfunction[j, sing_idx[stencil_idx]] -= log_correction_array[stencil_idx] / x_obs
+                _accum_row!(proj_g, -log_correction_array[stencil_idx] / x_obs, Zt, sing_idx[stencil_idx])
             end
         end
+
+        # Project the n=0 diagonal accumulation
+        _accum_row!(proj_k, diag_accum, Zt, j)
+
+        # ── Rank-1 accumulate: K/G += conj(Z[j,:]) ⋅ proj_k/g ──
+        _rank1_conj!(K_block, Zt, j, proj_k)
+        if populate_greenfunction
+            _rank1_conj!(G_block, Zt, j, proj_g)
+        end
     end
 
     # Normals need to point outward from vacuum region. In VACUUM clockwise θ convention, normal points
     # out of vacuum for wall but inward for plasma, so we multiply by -1 for plasma sources
     if source isa PlasmaGeometry
-        grad_greenfunction_block .*= -1
+        K_block .*= -1
     end
 
     # Add analytic singular integral (second type) to block diagonal [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
+    # The Gram matrix is a result of the projection onto a scalar, Z⋅Zᵀ * residue
     residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
-    @inbounds for i in 1:mtheta
-        grad_greenfunction_block[i, i] += residue
-    end
+    K_block .+= residue .* Gram
 
     # Since we computed 2π𝒢, divide by 2π to get 𝒢
     if populate_greenfunction
-        greenfunction ./= 2π
+        G_block ./= 2π
     end
 end
 
 # Dispatch wrapper for unified 2D/3D vacuum: forwards to 5-arg compute_2D_kernel_matrices! with params.n
 function kernel!(
-    grad_greenfunction::AbstractMatrix{Float64},
-    greenfunction::AbstractMatrix{Float64},
+    K::AbstractMatrix{ComplexF64},
+    G::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
-    params::KernelParams2D
+    params::KernelParams2D,
+    Z::AbstractMatrix{ComplexF64},
+    Gram::AbstractMatrix{ComplexF64}
 )
-    return compute_2D_kernel_matrices!(grad_greenfunction, greenfunction, observer, source, params.n)
+    return compute_2D_kernel_matrices!(K, G, observer, source, params.n, Z, Gram)
 end
 
 #############################################################
diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl
index 8218739d..5c9c5369 100644
--- a/src/Vacuum/ProjectedKernel.jl
+++ b/src/Vacuum/ProjectedKernel.jl
@@ -15,32 +15,6 @@
 # FLOP cost is identical to the two-step approach O(M²P), but memory drops
 # from O(M²) to O(MP + P²).
 
-# ── Helpers for small-P accumulation (avoids BLAS dispatch overhead) ──────────
-
-"""
-Accumulate `proj += w * Zt[:, col]` with SIMD. Replaces BLAS.axpy! for small P.
-"""
-@inline function _accum_row!(proj::AbstractVector{ComplexF64}, w::Float64,
-    Zt::AbstractMatrix{ComplexF64}, col::Int)
-    @inbounds @simd for p in eachindex(proj)
-        proj[p] += w * Zt[p, col]
-    end
-end
-
-"""
-Rank-1 update `A += conj(Zt[:, j]) * y^T`. Avoids allocating a conjugated temporary.
-"""
-@inline function _rank1_conj!(A::AbstractMatrix{ComplexF64},
-    Zt::AbstractMatrix{ComplexF64}, j::Int,
-    y::AbstractVector{ComplexF64})
-    @inbounds for p2 in eachindex(y)
-        y_p2 = y[p2]
-        for p1 in axes(A, 1)
-            A[p1, p2] += conj(Zt[p1, j]) * y_p2
-        end
-    end
-end
-
 # ============================================================================
 # 2D fused projected kernel
 # ============================================================================
@@ -62,17 +36,17 @@ Dispatches to the 2D or 3D implementation based on the geometry/params types.
   - `exp_mn_basis::Matrix{ComplexF64}`: [M × P] complex Fourier basis Z = exp(i(mθ − nζ))
   - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term)
 """
-function kernel!(
-    K_c::AbstractMatrix{ComplexF64},
-    G_c::AbstractMatrix{ComplexF64},
-    observer::Union{PlasmaGeometry,WallGeometry},
-    source::Union{PlasmaGeometry,WallGeometry},
-    params::KernelParams2D,
-    exp_mn_basis::AbstractMatrix{ComplexF64},
-    Gram::AbstractMatrix{ComplexF64}
-)
-    _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram)
-end
+# function kernel!(
+#     K_c::AbstractMatrix{ComplexF64},
+#     G_c::AbstractMatrix{ComplexF64},
+#     observer::Union{PlasmaGeometry,WallGeometry},
+#     source::Union{PlasmaGeometry,WallGeometry},
+#     params::KernelParams2D,
+#     exp_mn_basis::AbstractMatrix{ComplexF64},
+#     Gram::AbstractMatrix{ComplexF64}
+# )
+#     _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram)
+# end
 
 function kernel!(
     K_c::AbstractMatrix{ComplexF64},
diff --git a/src/Vacuum/Utilities.jl b/src/Vacuum/Utilities.jl
index de4d220b..ac69a9e1 100644
--- a/src/Vacuum/Utilities.jl
+++ b/src/Vacuum/Utilities.jl
@@ -152,3 +152,28 @@ Inline function for fast cross product of two 3D vectors at a given index.
         c[idx, 3] = a1*b2 - a2*b1
     end
 end
+
+# ── Helpers for small-P accumulation (avoids BLAS dispatch overhead) ──────────
+"""
+Accumulate `proj += w * Zt[:, col]` with SIMD. Replaces BLAS.axpy! for small P.
+"""
+@inline function _accum_row!(proj::AbstractVector{ComplexF64}, w::Float64,
+    Zt::AbstractMatrix{ComplexF64}, col::Int)
+    @inbounds @simd for p in eachindex(proj)
+        proj[p] += w * Zt[p, col]
+    end
+end
+
+"""
+Rank-1 update `A += conj(Zt[:, j]) * y^T`. Avoids allocating a conjugated temporary.
+"""
+@inline function _rank1_conj!(A::AbstractMatrix{ComplexF64},
+    Zt::AbstractMatrix{ComplexF64}, j::Int,
+    y::AbstractVector{ComplexF64})
+    @inbounds for p2 in eachindex(y)
+        y_p2 = y[p2]
+        for p1 in axes(A, 1)
+            A[p1, p2] += conj(Zt[p1, j]) * y_p2
+        end
+    end
+end
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index c1c50514..7d994531 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -72,7 +72,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     n_override::Union{Nothing,Int}=nothing
 )
 
-    (; mtheta, mpert, mlow, nzeta, npert, nlow, use_galerkin) = inputs
+    (; mtheta, mpert, mlow, nzeta, npert, nlow) = inputs
 
     # Initialize surface geometries
     plasma_surf = nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs)
@@ -114,69 +114,68 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     #
     # FLOPs:  O(M²P + P³)
     # ================================================================
-    if use_galerkin
-        # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve
-        Gram = zeros!(pool, ComplexF64, P, P)
-        mul!(Gram, exp_mn_basis', exp_mn_basis)
-
-        # Projected kernel matrices [P × P complex]
-        K_ext = zeros!(pool, ComplexF64, 2P, 2P)
-        G_ext = zeros!(pool, ComplexF64, 2P, P)
-        K_int = similar!(pool, K_ext)
-        G_int = similar!(pool, G_ext)
-
-        # Fused projected kernel: compute Z^H K Z and Z^H G Z
-        kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
-        if !wall.nowall
-            kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram)
-            kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram)
-            kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram)
-        end
-
-        # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext
-        K_int .= -K_ext
-        K_int[1:P, 1:P] .+= 2 .* Gram
-        if !wall.nowall
-            K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram
-        end
-        G_int .= G_ext
-
-        # Solve projected BIEs for exterior and interior kernels
-        if wall.nowall
-            F_ext = lu!(K_ext[1:P, 1:P])
-            ldiv!(F_ext, @view(G_ext[1:P, :]))
-            F_int = lu!(K_int[1:P, 1:P])
-            ldiv!(F_int, @view(G_int[1:P, :]))
-        else
-            F_ext = lu!(K_ext)
-            ldiv!(F_ext, G_ext)
-            F_int = lu!(K_int)
-            ldiv!(F_int, G_int)
-        end
-
-        # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G
-        mul!(wv, Gram, view(G_ext, 1:P, :))
-        wv .*= (4π^2 / M)
+    # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve
+    Gram = zeros!(pool, ComplexF64, P, P)
+    mul!(Gram, exp_mn_basis', exp_mn_basis)
+
+    # Projected kernel matrices [P × P complex]
+    K_ext = zeros!(pool, ComplexF64, 2P, 2P)
+    G_ext = zeros!(pool, ComplexF64, 2P, P)
+    K_int = similar!(pool, K_ext)
+    G_int = similar!(pool, G_ext)
+
+    # Fused projected kernel: compute Z^H K Z and Z^H G Z
+    kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
+    if !wall.nowall
+        kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram)
+        kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram)
+        kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram)
+    end
 
-        # Backward-compatible reconstruction: grre/grri in M×2P real layout
-        # Need to convert mode space to physical space and unpack the real and imaginary parts
-        # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
-        # perhaps make it a complex P * P matrix? Then don't need any of this section
-        mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
-        @view(grre[1:M, 1:P]) .= real.(temp)
-        @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-        mul!(temp, exp_mn_basis, view(G_int, 1:P, :))
-        @view(grri[1:M, 1:P]) .= real.(temp)
-        @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
-        if !wall.nowall
-            mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :))
-            @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
-            @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
-            mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :))
-            @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
-            @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
-        end
+    # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext
+    K_int .= -K_ext
+    K_int[1:P, 1:P] .+= 2 .* Gram
+    if !wall.nowall
+        K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram
+    end
+    G_int .= G_ext
+
+    # Solve projected BIEs for exterior and interior kernels
+    if wall.nowall
+        F_ext = lu!(K_ext[1:P, 1:P])
+        ldiv!(F_ext, @view(G_ext[1:P, :]))
+        F_int = lu!(K_int[1:P, 1:P])
+        ldiv!(F_int, @view(G_int[1:P, :]))
     else
+        F_ext = lu!(K_ext)
+        ldiv!(F_ext, G_ext)
+        F_int = lu!(K_int)
+        ldiv!(F_int, G_int)
+    end
+
+    # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G
+    mul!(wv, Gram, view(G_ext, 1:P, :))
+    wv .*= (4π^2 / M)
+
+    # Backward-compatible reconstruction: grre/grri in M×2P real layout
+    # Need to convert mode space to physical space and unpack the real and imaginary parts
+    # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
+    # perhaps make it a complex P * P matrix? Then don't need any of this section
+    mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
+    @view(grre[1:M, 1:P]) .= real.(temp)
+    @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
+    mul!(temp, exp_mn_basis, view(G_int, 1:P, :))
+    @view(grri[1:M, 1:P]) .= real.(temp)
+    @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
+    if !wall.nowall
+        mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :))
+        @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
+        @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+        mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :))
+        @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
+        @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+    end
+    """
         # ================================================================
         # Collocation approach: solve full physical-space system [M × M]
         # Handles both no-wall and wall cases.
@@ -226,8 +225,7 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)]))
         mul!(wv, exp_mn_basis', temp)
         wv .*= (4π^2 / M)
-    end
-
+"""
     inputs.force_wv_symmetry && hermitianpart!(wv)
 
     if nzeta > 1 # 3D

From f96030cfa35f075ecabf82c1415b1fdd563866c7 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Tue, 17 Mar 2026 14:49:11 -0400
Subject: [PATCH 18/23] VACUUM - IMPROVEMENT - full implementation of the
 projected kernel in both 2D and 3D. Removed dead code

---
 src/Vacuum/Kernel2D.jl        | 139 +++++++++---
 src/Vacuum/Kernel3D.jl        | 252 +++++++++++++++------
 src/Vacuum/ProjectedKernel.jl | 404 ----------------------------------
 src/Vacuum/Vacuum.jl          |  57 +----
 4 files changed, 295 insertions(+), 557 deletions(-)
 delete mode 100644 src/Vacuum/ProjectedKernel.jl

diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl
index d4be71b1..49772e96 100644
--- a/src/Vacuum/Kernel2D.jl
+++ b/src/Vacuum/Kernel2D.jl
@@ -59,36 +59,106 @@ const GL8_LAGRANGE_STENCILS = precompute_lagrange_stencils(GL8.x)
 # and per-n sinh/cosh cache are defined in PnQuadCache.jl.
 
 """
-    kernel!(grad_greenfunction, greenfunction, observer, source, n)
+    compute_2D_kernel_matrices!(Kc, Gc, observer, source, n, Z, Gram)
 
-Compute kernels of integral equation for Laplace's equation in a torus.
-**WARNING: This kernel only supports closed toroidal walls currently.
-The residue calculation needs to be updated for open walls.**
+Compute the **Fourier/Galerkin-projected** 2D vacuum boundary-integral kernel blocks for
+Laplace’s equation in an axisymmetric torus, **without ever forming the dense**
+`M×M` “point-to-point” kernel matrices.
 
-# Arguments
+This is the fused “evaluate kernel + project” path that the vacuum solver uses:
 
-  - `K`: Fourier-space Gradient Green's function matrix (output)
-  - `G`: Fourier-space Green's function matrix (output)
-  - `observer`: Observer geometry struct (PlasmaGeometry or WallGeometry)
-  - `source`: Source geometry struct (PlasmaGeometry or WallGeometry)
-  - `n`: Toroidal mode number
+    Kc = Zᴴ * K * Z
+    Gc = Zᴴ * G * Z
 
-# Returns
+where:
 
-Modifies `K` and `G` in place.
-Note that G is zeroed each time this function is called,
-but K is not since it fills a different block of the
-(2 * mtheta, 2 * mtheta) depending on the source/observer.
+  - `K` is the **double-layer** kernel (normal derivative of the Green’s function),
+  - `G` is the **single-layer** kernel (Green’s function itself; only needed for plasma-as-source),
+  - `Z ∈ ℂ^{M×P}` is the complex Fourier basis on the poloidal grid,
+    and `Zᴴ` is its conjugate transpose.
 
-# Notes
+Rather than computing a full kernel row `K[j, :]` and then multiplying by `Z`, this routine
+projects **on the fly**:
+
+  - For each observer node `j`, it accumulates the projected row-vector
+    `proj_k = (K[j,:] * weights) · Z` and (optionally) `proj_g = (G[j,:] * weights) · Z`
+    into length-`P` work buffers.
+
+  - It then performs a **rank-1 update** into the appropriate projected block:
+
+        Kc += conj(Z[j, :]) * transpose(proj_k)
+        Gc += conj(Z[j, :]) * transpose(proj_g)
+
+This reduces peak memory from `O(M^2)` to `O(MP + P^2)` while keeping the same
+mathematical discretization.
+
+## Arguments
+
+  - `Kc`: Complex global projected double-layer kernel matrix.
+  - `Gc`: Complex global projected single-layer kernel matrix.
+  - `observer`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`.
+  - `source`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`.
+  - `n`: Integer representing the order of the toroidal Fourier component.
+  - `Z`: Complex Fourier basis sampled on the `θ` grid.
+  - `Gram`: Mode-space Gram matrix for this basis on the discrete grid.
+
+## Block layout
+
+`Kc` and `Gc` are the complex global projected matrices. `Kc` contains four blocks corresponding to
+plasma/wall as observer/source, `Gc` contains two blocks corresponding to plasma/wall as observer.
+This function writes **only one block** to each of `Kc` and `Gc` per call:
+
+  - `Kc_block` is a `P×P` view into `Kc` selected by `(observer isa PlasmaGeometry ? 1 : 2, source isa PlasmaGeometry ? 1 : 2)`.
+  - `Gc_block` is a `P×(2P)` view into `Gc` with the same observer block-row; only the columns
+    corresponding to the source being plasma are populated (when `source isa PlasmaGeometry`).
 
-  - Uses Simpson's rule for integration away from singular points
-  - Uses Gaussian quadrature near singular points for improved accuracy
-  - Implements analytical singularity removal [Chance Phys. Plasmas 1997 2161]
+## Toroidal Green's functions
+
+  - The scalar `G_n` returned by `green(...)` is `2π * 𝒢ⁿ(θ, θ′)` (Chance 1997),
+    the `n`-th toroidal Fourier component of the Laplace Green’s function in axisymmetry.
+  - The scalar `gradG_n` returned by `green(...)` corresponds to the toroidal-mode `n`
+    contribution to the **double-layer** integrand `𝒥 * (∇′𝒢ⁿ · ∇′ℒ)`
+    (Chance 1997), i.e. the normal-derivative factor multiplied by the geometric Jacobian.
+  - `gradG_0` is the `n = 0` piece used for analytic diagonal/singularity bookkeeping.
+
+## Numerical treatment of the singularity
+
+The toroidal Green’s function kernel is weakly singular as `θ′ → θ`. The implementation follows
+Chance *Phys. Plasmas* **4**, 2161 (1997) and uses a mixed strategy:
+
+  - **Far field (nonsingular region)**: composite Simpson’s `1/3` rule on the uniform `θ` grid,
+    skipping the near-singular stencil around `j`.
+  - **Near field (singular panels)**: 8-point Gauss–Legendre quadrature on the two panels
+    of length `2 * dtheta` immediately to the left/right of `θ_j`, using:
+      + periodic cubic splines of `source.x(θ)` and `source.z(θ)` to evaluate geometry at off-grid nodes,
+      + precomputed 5-point Lagrange stencils to map each Gauss node back to the five neighboring
+        discrete source indices `[j-2, j-1, j, j+1, j+2]` *without allocations*,
+      + analytic logarithmic/singular correction terms (`log_correction_array`) for the single-layer
+        kernel when the observer/source block is plasma–plasma (Chance 1997, e.g. eqs. 75, 78),
+      + an analytic diagonal/residue correction for the double-layer kernel (Chance 1997, Table I / residues).
+
+## Performance and allocation avoidance (hot-path optimizations)
+
+This routine is intentionally written to be allocation-light in tight loops:
+
+  - **Precomputed quadrature tables**: `GL8` and `GL8_LAGRANGE_STENCILS` are global constants.
+  - **Hoisted n-dependent constants**: `gamma_prefactor = 2√π * Γ(1/2 - n)` is computed once per call
+    and passed into `green(...)` rather than recomputed per quadrature node.
+  - **Spline derivative batching**: `∂x/∂θ` and `∂z/∂θ` are evaluated on the full grid once (for Simpson),
+    while off-grid Gauss points evaluate splines directly.
+  - **Projection reuse**: the transpose `Zt = transpose(Z)` is materialized once so that `_accum_row!`
+    reads each row of `Z` as a contiguous (column-major) column of `Zt`.
+  - **Rank-1 assembly**: the final projected update uses `conj(Z[j,:]) ⊗ proj_*` via `_rank1_conj!`,
+    avoiding constructing intermediate `P×P` temporaries.
+
+## Caveats / limitations
+
+  - **Closed-wall assumption**: the current residue/diagonal handling is written for closed,
+    periodic toroidal boundaries. Open-wall residue logic is not implemented.
 """
 @with_pool pool function compute_2D_kernel_matrices!(
-    K::AbstractMatrix{ComplexF64},
-    G::AbstractMatrix{ComplexF64},
+    Kc::AbstractMatrix{ComplexF64},
+    Gc::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
     n::Int,
@@ -104,8 +174,8 @@ but K is not since it fills a different block of the
     # Take a view of the corresponding block of the grad_greenfunction
     col_idx = (source isa PlasmaGeometry ? 1 : 2)
     row_idx = (observer isa PlasmaGeometry ? 1 : 2)
-    K_block = view(K, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P))
-    G_block = view(G, ((row_idx-1)*P+1):(row_idx*P), :)
+    Kc_block = view(Kc, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P))
+    Gc_block = view(Gc, ((row_idx-1)*P+1):(row_idx*P), :)
 
     # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997)
     populate_greenfunction = source isa PlasmaGeometry
@@ -233,40 +303,45 @@ but K is not since it fills a different block of the
         _accum_row!(proj_k, diag_accum, Zt, j)
 
         # ── Rank-1 accumulate: K/G += conj(Z[j,:]) ⋅ proj_k/g ──
-        _rank1_conj!(K_block, Zt, j, proj_k)
+        _rank1_conj!(Kc_block, Zt, j, proj_k)
         if populate_greenfunction
-            _rank1_conj!(G_block, Zt, j, proj_g)
+            _rank1_conj!(Gc_block, Zt, j, proj_g)
         end
     end
 
     # Normals need to point outward from vacuum region. In VACUUM clockwise θ convention, normal points
     # out of vacuum for wall but inward for plasma, so we multiply by -1 for plasma sources
     if source isa PlasmaGeometry
-        K_block .*= -1
+        Kc_block .*= -1
     end
 
     # Add analytic singular integral (second type) to block diagonal [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
     # The Gram matrix is a result of the projection onto a scalar, Z⋅Zᵀ * residue
     residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
-    K_block .+= residue .* Gram
+    Kc_block .+= residue .* Gram
 
     # Since we computed 2π𝒢, divide by 2π to get 𝒢
     if populate_greenfunction
-        G_block ./= 2π
+        Gc_block ./= 2π
     end
 end
 
-# Dispatch wrapper for unified 2D/3D vacuum: forwards to 5-arg compute_2D_kernel_matrices! with params.n
+"""
+    kernel!(Kc, Gc, observer, source, params::KernelParams2D, Z, Gram)
+
+Public 2D kernel entry point. This is a thin wrapper that forwards to
+`compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram)`.
+"""
 function kernel!(
-    K::AbstractMatrix{ComplexF64},
-    G::AbstractMatrix{ComplexF64},
+    Kc::AbstractMatrix{ComplexF64},
+    Gc::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
     params::KernelParams2D,
     Z::AbstractMatrix{ComplexF64},
     Gram::AbstractMatrix{ComplexF64}
 )
-    return compute_2D_kernel_matrices!(K, G, observer, source, params.n, Z, Gram)
+    return compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram)
 end
 
 #############################################################
diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl
index c158734e..e066e788 100644
--- a/src/Vacuum/Kernel3D.jl
+++ b/src/Vacuum/Kernel3D.jl
@@ -395,68 +395,128 @@ function KernelWorkspace(PATCH_DIM::Int, RAD_DIM::Int, ANG_DIM::Int)
 end
 
 """
-    compute_3D_kernel_matrices!(grad_greenfunction, greenfunction, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER)
+    compute_3D_kernel_matrices!(K, G, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, Z, Gram)
 
-Compute boundary integral kernel matrices for 3D geometries with the singular correction
-algorithm from [Malhotra Plasma Phys. and Cont. Fusion 2019 024004].
-Uses multi-threading for parallel computation over observer points.
+Compute the **Fourier/Galerkin-projected** 3D vacuum boundary-integral kernel blocks for
+Laplace’s equation, using a high-order singular quadrature / partition-of-unity (POU)
+scheme on a tensor-product `(θ, ζ)` surface grid.
 
-  - Far regions: Rectangle rule with uniform weights (1/N)
-  - Singular regions: Polar quadrature with partition-of-unity blending
+Like the 2D kernel, this routine implements the **fused projection path** used by the vacuum solve:
+it produces the projected operators in mode space **without materializing a dense**
+`N×N` point-space kernel (where `N = mtheta * nzeta`).
 
-grad_greenfunction is the double-layer kernel matrix, where each entry is
-∇_{x_src} φ(x_obs, x_src) · n_src, and greenfunction is the single-layer kernel matrix,
-where each entry is φ(x_obs, x_src).
+## Mathematical object being discretized
 
-# Arguments
+Let `x(θ, ζ) ∈ ℝ^3` be a surface parametrization (plasma or wall surface) with outward
+unit normal `n(θ, ζ)`. The Laplace kernels are:
+
+  - **Single-layer**: `φ(x_obs, x_src) = 1 / |x_obs - x_src|`
+  - **Double-layer**: `∂φ/∂n_src = ∇_{x_src} φ ⋅ n_src = (x_obs - x_src) ⋅ n_src / |x_obs - x_src|^3`
+
+This routine computes the *discrete, projected* operators corresponding to these kernels,
+using a uniform quadrature weight `dθdζ = 4π^2 / N` for the far field and a specialized
+near-field correction for the singular region.
+
+## Arguments and block layout
+
+  - `K`: Complex global projected double-layer kernel matrix (2P×2P).
+  - `G`: Complex global projected single-layer kernel matrix (2P×P).
+  - `observer`: `PlasmaGeometry3D` or `WallGeometry3D` object providing geometry data.
+  - `source`: `PlasmaGeometry3D` or `WallGeometry3D` object providing geometry data.
+  - `PATCH_RAD`: Half-width of the singular patch in grid points. Must satisfy `PATCH_RAD ≤ (min(source.mtheta, source.nzeta) - 1) ÷ 2` to avoid errors.
+  - `RAD_DIM`: Radial quadrature order on the polar grid (angular order is `2*RAD_DIM`).
+  - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`).
+  - `Z`: Complex Fourier basis sampled on the surface grid, shaped `N×P` (`P = number of retained modes`). `Z[idx, :]` contains the basis values at the surface node `idx`.
+  - `Gram`: Mode-space Gram matrix used to add the analytic “identity” term when `typeof(source) == typeof(observer)` (i.e. the same operator block that receives the Green’s-identity diagonal contribution).
+
+This routine fills exactly one `P×P` block view `K_block` (and, when the source is the plasma, the
+matching row block `G_block`), selected by whether observer/source are plasma or wall.
+
+## Numerical treatment of the singularity
+
+The kernel is weakly singular as `x_src → x_obs`. The implementation follows the
+approach used in [Malhotra Journal of Comp. Phys. 2019 108791 eq. 38]:
 
-  - `grad_greenfunction`: Double-layer kernel matrix (Nobs × Nsrc) filled in place
+  - **Far field** (nonsingular sources):
 
-  - `greenfunction`: Single-layer kernel matrix (Nobs × Nsrc) filled in place
+      + Use a uniform trapezoidal/rectangle rule on the `(θ, ζ)` grid.
+      + For each observer point, a square patch of size `PATCH_DIM = 2*PATCH_RAD+1`
+        surrounding the singularity is excluded from the far-field sum.
 
-  - `observer`: Observer geometry (PlasmaGeometry3D)
+  - **Near field** (singular patch):
 
-  - `source`: Source geometry (PlasmaGeometry3D)
+      + Extract a Cartesian `PATCH_DIM×PATCH_DIM` patch of the source geometry around the
+        observer-aligned source index.
+      + Interpolate the patch to a **polar quadrature grid** (`RAD_DIM × ANG_DIM`, with `ANG_DIM=2*RAD_DIM`)
+        using a precomputed sparse interpolation operator `P2G` built from tensor-product
+        Lagrange stencils (`INTERP_ORDER` controls the stencil width).
+      + Evaluate kernels on the polar grid and weight them with a **partition-of-unity**
+        quadrature factor `Ppou` that includes the polar Jacobian factor (roughly `r * dr * dθ`)
+        and a smooth cutoff function `χ(ρ)` that localizes the singular correction.
+      + Map the polar correction back onto the Cartesian patch via `P2G` and blend with the
+        far-field trapezoid contribution using `Gpou`, so the combined weight is effectively
+        `trap*(1-χ) + singular_correction`.
 
-  - `PATCH_RAD`: Number of points adjacent to source point to treat as singular
+## Fused projection and threading
 
-      + Total patch size in # of gridpoints = (2 * PATCH_RAD + 1) x (2 * PATCH_RAD + 1)
+This function is written to be parallel over observer points:
 
-  - `RAD_DIM`: Polar radial quadrature order. Angular order = 2 * RAD_DIM
+  - Each thread owns a `KernelWorkspace` (scratch arrays for patch extraction, polar interpolation,
+    and temporary kernel values), plus per-thread accumulation buffers `proj_k` / `proj_g`
+    (length `P`) and a boolean `is_patch` mask to skip patch indices in the far-field loop.
 
-  - `INTERP_ORDER`: Lagrange interpolation order
+  - For a given observer index `idx_obs`, the code accumulates the **projected row**
+    `(kernel row idx_obs) · Z` directly into `proj_k` / `proj_g` using `_accum_row!`, and then writes
+    these into shared buffers `KZt[:, idx_obs]` and `GZt[:, idx_obs]`. This is race-free because
+    each observer writes to a unique column.
 
-      + Must be ≤ (2 * PATCH_RAD + 1)
+  - After the threaded loop completes, the final `P×P` blocks are assembled efficiently with BLAS:
 
-# Threading
+        Kc = Zᴴ * transpose(KZt)
+        Gc = Zᴴ * transpose(GZt)
 
-This function automatically uses all available threads (`Threads.nthreads()`).
-Start Julia with `julia -t auto` or set `JULIA_NUM_THREADS` to enable multi-threading.
+    implemented as `mul!(K_block, Z', transpose(KZt))` (and similarly for `G_block`).
+
+Normalization by `2π` is applied to match the 2D kernel convention so the downstream “add identity”
+logic is consistent between 2D/3D.
+
+## Important parameters
+
+  - `PATCH_RAD`: half-width of the singular patch in grid points. Must satisfy `PATCH_RAD ≤ (min(source.mtheta, source.nzeta) - 1) ÷ 2` to avoid errors.
+  - `RAD_DIM`: radial quadrature order on the polar grid (angular order is `2*RAD_DIM`).
+  - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`).
+
+## Performance notes / numerical optimizations
+
+  - **Cached quadrature data**: `get_singular_quadrature` memoizes `P2G`, `Gpou`, `Ppou`, etc. for a given
+    `(PATCH_RAD, RAD_DIM, INTERP_ORDER)` triple to avoid expensive rebuilds.
+  - **Allocation control**: all near-field arrays live in thread-local `KernelWorkspace` objects; no per-observer
+    heap allocation is intended in the hot path.
+  - **Scalar kernel evaluation**: the Laplace kernels have scalar-argument overloads to avoid view/slice creation
+    and to enable LLVM to keep values in registers.
 """
 function compute_3D_kernel_matrices!(
-    grad_greenfunction::AbstractMatrix{Float64},
-    greenfunction::AbstractMatrix{Float64},
+    K::AbstractMatrix{ComplexF64},
+    G::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry3D,WallGeometry3D},
     source::Union{PlasmaGeometry3D,WallGeometry3D},
     PATCH_RAD::Int,
     RAD_DIM::Int,
-    INTERP_ORDER::Int
+    INTERP_ORDER::Int,
+    Z::AbstractMatrix{ComplexF64},
+    Gram::AbstractMatrix{ComplexF64}
 )
-    num_points = observer.mtheta * observer.nzeta
-    dθdζ = 4π^2 / (num_points)
+    N, P = size(Z) # N = mtheta * nzeta, P = num_modes
+    dθdζ = 4π^2 / N
+    Zt = Matrix{ComplexF64}(transpose(Z))  # [P × N] for contiguous column access
 
-    # Get block of grad green function matrix
+    # Take a view of the corresponding block of the K and G matrices
     col_index = (source isa PlasmaGeometry3D ? 1 : 2)
     row_index = (observer isa PlasmaGeometry3D ? 1 : 2)
-    grad_greenfunction_block = view(
-        grad_greenfunction,
-        ((row_index-1)*num_points+1):(row_index*num_points),
-        ((col_index-1)*num_points+1):(col_index*num_points)
-    )
+    K_block = view(K, ((row_index-1)*P+1):(row_index*P), ((col_index-1)*P+1):(col_index*P))
+    G_block = view(G, ((row_index-1)*P+1):(row_index*P), :)
 
-    # Zero out green function matrix
-    fill!(greenfunction, 0.0)
-    # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997)
+    # G only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997)
     populate_greenfunction = source isa PlasmaGeometry3D
 
     # Initialize quadrature data
@@ -470,16 +530,30 @@ function compute_3D_kernel_matrices!(
     @assert observer.mtheta ≥ PATCH_DIM "Must have observer.mtheta ≥ PATCH_DIM, got observer.mtheta=$(observer.mtheta), PATCH_DIM=$PATCH_DIM"
     @assert observer.nzeta ≥ PATCH_DIM "Must have observer.nzeta ≥ PATCH_DIM, got observer.nzeta=$(observer.nzeta), PATCH_DIM=$PATCH_DIM"
 
+    # Buffers for the projection: column idx_obs holds (kernel row idx_obs) · Z
+    KZt = zeros(ComplexF64, P, N)
+    GZt = zeros(ComplexF64, P, N)
+
     # Allocate thread-local workspaces (one per thread)
     max_threadid = Threads.maxthreadid()
     workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_threadid]
+    proj_k_all = [zeros(ComplexF64, P) for _ in 1:max_threadid]
+    proj_g_all = [zeros(ComplexF64, P) for _ in 1:max_threadid]
+    is_patch_all = [falses(N) for _ in 1:max_threadid]
 
     # Parallel loop through observer points
-    Threads.@threads for idx_obs in 1:num_points
+    Threads.@threads for idx_obs in 1:N
         # Get thread-local workspace
         ws = workspaces[Threads.threadid()]
         (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar,
             n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws
+        proj_k = proj_k_all[Threads.threadid()]
+        proj_g = proj_g_all[Threads.threadid()]
+        is_patch = is_patch_all[Threads.threadid()]
+
+        fill!(proj_k, 0.0)
+        fill!(proj_g, 0.0)
+        fill!(is_patch, false)
 
         # Convert linear index to 2D indices
         i_obs = mod1(idx_obs, observer.mtheta)
@@ -488,21 +562,36 @@ function compute_3D_kernel_matrices!(
         @inbounds oy = observer.r[idx_obs, 2]
         @inbounds oz = observer.r[idx_obs, 3]
 
+        # Mark patch source indices so the far-field loop can skip them
+        @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
+            idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta)
+            idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta)
+            is_patch[idx_pol+source.mtheta*(idx_tor-1)] = true
+        end
+
         # ============================================================
         # FAR FIELD: Trapezoidal rule for nonsingular source points
         # Note: kernels return zero for r_src = r_obs
         # ============================================================
-        @inbounds for idx_src in 1:num_points
-            sx = source.r[idx_src, 1];
-            sy = source.r[idx_src, 2];
-            sz = source.r[idx_src, 3]
-            nx = source.normal[idx_src, 1];
-            ny = source.normal[idx_src, 2];
-            nz = source.normal[idx_src, 3]
-            # Apply weights (periodic trapezoidal rule = constant weights)
-            grad_greenfunction_block[idx_obs, idx_src] = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ
+        @inbounds for idx_src in 1:N
+            is_patch[idx_src] && continue
+            w_double =
+                laplace_double_layer(
+                    ox,
+                    oy,
+                    oz,
+                    source.r[idx_src, 1],
+                    source.r[idx_src, 2],
+                    source.r[idx_src, 3],
+                    source.normal[idx_src, 1],
+                    source.normal[idx_src, 2],
+                    source.normal[idx_src, 3]
+                ) * dθdζ
+            _accum_row!(proj_k, w_double, Zt, idx_src)
+
             if populate_greenfunction
-                greenfunction[idx_obs, idx_src] = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ
+                w_single = laplace_single_layer(ox, oy, oz, source.r[idx_src, 1], source.r[idx_src, 2], source.r[idx_src, 3]) * dθdζ
+                _accum_row!(proj_g, w_single, Zt, idx_src)
             end
         end
 
@@ -525,11 +614,11 @@ function compute_3D_kernel_matrices!(
         # Evaluate kernels at polar points with POU weighting
         @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM
             # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already
-            rsx = r_polar[ir, ia, 1];
-            rsy = r_polar[ir, ia, 2];
+            rsx = r_polar[ir, ia, 1]
+            rsy = r_polar[ir, ia, 2]
             rsz = r_polar[ir, ia, 3]
-            nsx = n_polar[ir, ia, 1];
-            nsy = n_polar[ir, ia, 2];
+            nsx = n_polar[ir, ia, 1]
+            nsy = n_polar[ir, ia, 2]
             nsz = n_polar[ir, ia, 3]
             M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ
             M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ
@@ -550,49 +639,74 @@ function compute_3D_kernel_matrices!(
             idx_tor = periodic_wrap(j_obs - PATCH_RAD + j - 1, source.nzeta)
             idx_src = idx_pol + source.mtheta * (idx_tor - 1)
 
-            trap_double = grad_greenfunction_block[idx_obs, idx_src]
-            grad_greenfunction_block[idx_obs, idx_src] = trap_double + M_grid_double[i, j] + trap_double * Gpou[i, j]
+            sx = source.r[idx_src, 1]
+            sy = source.r[idx_src, 2]
+            sz = source.r[idx_src, 3]
+            nx = source.normal[idx_src, 1]
+            ny = source.normal[idx_src, 2]
+            nz = source.normal[idx_src, 3]
+
+            far_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[i, j]) * dθdζ
+            _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src)
 
             # Apply near + far contributions
             if populate_greenfunction
-                trap_single = greenfunction[idx_obs, idx_src]
-                greenfunction[idx_obs, idx_src] = trap_single + M_grid_single[i, j] + trap_single * Gpou[i, j]
+                far_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * (1.0 + Gpou[i, j]) * dθdζ
+                _accum_row!(proj_g, M_grid_single[i, j] + far_single, Zt, idx_src)
+            end
+        end
+
+        # ── Write projected column to buffer (each idx_obs owns its column) ──
+        @inbounds for p in 1:P
+            KZt[p, idx_obs] = proj_k[p]
+        end
+        if populate_greenfunction
+            @inbounds for p in 1:P
+                GZt[p, idx_obs] = proj_g[p]
             end
         end
     end
 
     # Use the same normalization as in the 2D kernel so we can just add I to the diagonal
     # This makes the grri logic identical to the 2D kernel.
-    grad_greenfunction_block ./= 2π
-    greenfunction ./= 2π
+    mul!(K_block, Z', transpose(KZt))
+    K_block ./= 2π
+    if populate_greenfunction
+        mul!(G_block, Z', transpose(GZt))
+        G_block ./= 2π
+    end
 
     # Add the term that comes from the volume integral of Green's identity
-    typeof(source) == typeof(observer) && begin
-        for i in 1:num_points
-            grad_greenfunction_block[i, i] += 1.0
-        end
+    if typeof(source) == typeof(observer)
+        K_block .+= Gram
     end
 end
 
 """
-    kernel!(grad_greenfunction, greenfunction, observer, source, params::KernelParams3D)
+    kernel!(Kc, Gc, observer, source, params::KernelParams3D, Z, Gram)
+
+Public 3D kernel entry point. Forwards to:
 
-Dispatch wrapper for 3D kernel that forwards to `compute_3D_kernel_matrices!` with params.
+`compute_3D_kernel_matrices!(Kc, Gc, observer, source, params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER, Z, Gram)`.
 """
 function kernel!(
-    grad_greenfunction::AbstractMatrix{Float64},
-    greenfunction::AbstractMatrix{Float64},
+    Kc::AbstractMatrix{ComplexF64},
+    Gc::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry3D,WallGeometry3D},
     source::Union{PlasmaGeometry3D,WallGeometry3D},
-    params::KernelParams3D
+    params::KernelParams3D,
+    Z::AbstractMatrix{ComplexF64},
+    Gram::AbstractMatrix{ComplexF64}
 )
     return compute_3D_kernel_matrices!(
-        grad_greenfunction,
-        greenfunction,
+        Kc,
+        Gc,
         observer,
         source,
         params.PATCH_RAD,
         params.RAD_DIM,
-        params.INTERP_ORDER
+        params.INTERP_ORDER,
+        Z,
+        Gram
     )
 end
diff --git a/src/Vacuum/ProjectedKernel.jl b/src/Vacuum/ProjectedKernel.jl
deleted file mode 100644
index 5c9c5369..00000000
--- a/src/Vacuum/ProjectedKernel.jl
+++ /dev/null
@@ -1,404 +0,0 @@
-# Fused kernel assembly + Fourier projection for Galerkin vacuum solve.
-#
-# Instead of materializing the full M×M kernel matrices and then projecting,
-# these functions accumulate the P×P projected matrices row by row as the
-# kernel values are computed, reducing memory from O(M²) to O(MP).
-#
-# K_c = Z^H K Z  and  G_c = Z^H G Z
-#
-# where Z = C + iS is the [M × P] complex Fourier basis, K is the double-layer
-# kernel, and G is the single-layer kernel. For each observer point j, the
-# kernel row is projected and accumulated via rank-1 updates:
-#
-#   K_c += conj(Z[j,:]) ⊗ (K[j,:] · Z)
-#
-# FLOP cost is identical to the two-step approach O(M²P), but memory drops
-# from O(M²) to O(MP + P²).
-
-# ============================================================================
-# 2D fused projected kernel
-# ============================================================================
-"""
-    kernel!(K_c, G_c, observer, source, params, exp_mn_basis, Gram)
-
-Compute the Fourier-projected kernel matrices K_c = Z^H K Z and G_c = Z^H G Z
-directly, without materializing the full M×M kernel matrices.
-
-Dispatches to the 2D or 3D implementation based on the geometry/params types.
-
-# Arguments
-
-  - `K_c::Matrix{ComplexF64}`: Output P×P projected double-layer kernel [filled in-place]
-  - `G_c::Matrix{ComplexF64}`: Output P×P projected single-layer kernel [filled in-place]
-  - `observer`: Observer geometry struct
-  - `source`: Source geometry struct
-  - `params`: Kernel parameters (KernelParams2D or KernelParams3D)
-  - `exp_mn_basis::Matrix{ComplexF64}`: [M × P] complex Fourier basis Z = exp(i(mθ − nζ))
-  - `Gram::Matrix{ComplexF64}`: [P × P] Gram matrix Z^H Z (needed for diagonal identity term)
-"""
-# function kernel!(
-#     K_c::AbstractMatrix{ComplexF64},
-#     G_c::AbstractMatrix{ComplexF64},
-#     observer::Union{PlasmaGeometry,WallGeometry},
-#     source::Union{PlasmaGeometry,WallGeometry},
-#     params::KernelParams2D,
-#     exp_mn_basis::AbstractMatrix{ComplexF64},
-#     Gram::AbstractMatrix{ComplexF64}
-# )
-#     _projected_kernel_2D!(K_c, G_c, observer, source, params.n, exp_mn_basis, Gram)
-# end
-
-function kernel!(
-    K_c::AbstractMatrix{ComplexF64},
-    G_c::AbstractMatrix{ComplexF64},
-    observer::Union{PlasmaGeometry3D,WallGeometry3D},
-    source::Union{PlasmaGeometry3D,WallGeometry3D},
-    params::KernelParams3D,
-    exp_mn_basis::AbstractMatrix{ComplexF64},
-    Gram::AbstractMatrix{ComplexF64}
-)
-    _projected_kernel_3D!(K_c, G_c, observer, source,
-        params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER,
-        exp_mn_basis, Gram)
-end
-"""
-    _projected_kernel_2D!(K_c, G_c, observer, source, n, exp_mn_basis, Gram)
-
-Fused 2D kernel assembly + projection. Mirrors the loop structure of
-`compute_2D_kernel_matrices!` but accumulates rank-1 contributions into the
-P×P projected matrices instead of filling the M×M kernel matrices.
-
-Memory: O(MP) instead of O(M²).
-"""
-@with_pool pool function _projected_kernel_2D!(
-    K_c::AbstractMatrix{ComplexF64},
-    G_c::AbstractMatrix{ComplexF64},
-    observer::Union{PlasmaGeometry,WallGeometry},
-    source::Union{PlasmaGeometry,WallGeometry},
-    n::Int,
-    exp_mn_basis::AbstractMatrix{ComplexF64},
-    Gram::AbstractMatrix{ComplexF64}
-)
-    M, P = size(exp_mn_basis)
-    Z = exp_mn_basis
-    Zt = Matrix{ComplexF64}(transpose(Z))  # [P × M] for contiguous column access
-    mtheta = length(observer.x)
-    dtheta = 2π / mtheta
-    theta_grid = range(; start=0, length=mtheta, step=dtheta)
-
-    # Take a view of the corresponding block of the K_c and G_c matrices
-    col_idx = (source isa PlasmaGeometry ? 1 : 2)
-    row_idx = (observer isa PlasmaGeometry ? 1 : 2)
-    K_c_block = view(K_c, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P))
-    G_c_block = view(G_c, ((row_idx-1)*P+1):(row_idx*P), :)
-
-    populate_greenfunction = source isa PlasmaGeometry
-
-    # S₁ᵢ logarithmic correction factors [Chance Phys. Plasmas 1997 2161 eq. 78]
-    log_correction_0 = 16.0 * dtheta * (log(2 * dtheta) - 68.0 / 15.0) / 15.0
-    log_correction_1 = 128.0 * dtheta * (log(2 * dtheta) - 8.0 / 15.0) / 45.0
-    log_correction_2 = 4.0 * dtheta * (7.0 * log(2 * dtheta) - 11.0 / 15.0) / 45.0
-    log_correction_array = SVector(log_correction_2, log_correction_1, log_correction_0, log_correction_1, log_correction_2)
-
-    gamma_prefactor = 2 * sqrt(π) * gamma(0.5 - n)
-
-    spline_x = cubic_interp(theta_grid, source.x; bc=PeriodicBC(; endpoint=:exclusive, period=2π))
-    spline_z = cubic_interp(theta_grid, source.z; bc=PeriodicBC(; endpoint=:exclusive, period=2π))
-    d1_spline_x = deriv1(spline_x)
-    d1_spline_z = deriv1(spline_z)
-
-    stencils_left, stencils_right = GL8_LAGRANGE_STENCILS
-    sing_idx = zeros!(pool, Int, 5)
-
-    dx_dtheta_grid = acquire!(pool, eltype(source.x), mtheta)
-    dz_dtheta_grid = acquire!(pool, eltype(source.z), mtheta)
-    d1_spline_x(dx_dtheta_grid, theta_grid)
-    d1_spline_z(dz_dtheta_grid, theta_grid)
-
-    # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition)
-    legendre_buf = Vector{Float64}(undef, n + 2)
-
-    # Per-observer projection vectors (P-length complex): proj_z = (kernel row) · Z
-    proj_kz = zeros!(pool, ComplexF64, P)
-    proj_gz = zeros!(pool, ComplexF64, P)
-
-    for j in 1:mtheta
-        x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j]
-
-        fill!(proj_kz, 0.0)
-        fill!(proj_gz, 0.0)
-        diag_accum = 0.0
-
-        # ── Simpson integration for nonsingular source points ──
-        @inbounds for k in 1:(mtheta-3)
-            isrc = mod1(j + 1 + k, mtheta)
-            G_n, gradG_n, gradG_0 = green(x_obs, z_obs,
-                source.x[isrc], source.z[isrc],
-                dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf;
-                gamma_prefactor)
-
-            wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2))
-
-            if populate_greenfunction
-                _accum_row!(proj_gz, G_n * wsimpson, Zt, isrc)
-            end
-            _accum_row!(proj_kz, gradG_n * wsimpson, Zt, isrc)
-
-            diag_accum -= gradG_0 * wsimpson
-        end
-
-        # ── Gaussian quadrature for singular points ──
-        for (offset_idx, offset) in enumerate(-2:2)
-            sing_idx[offset_idx] = mod1(j + offset + mtheta, mtheta)
-        end
-
-        for leftpanel in (true, false)
-            gauss_mid = theta_obs + (leftpanel ? -dtheta : dtheta)
-            @inbounds for ig in 1:8
-                theta_gauss = gauss_mid + GL8.x[ig] * dtheta
-                theta_gauss0 = mod(theta_gauss, 2π)
-                x_gauss = spline_x(theta_gauss0)
-                dx_dtheta_gauss = d1_spline_x(theta_gauss0)
-                z_gauss = spline_z(theta_gauss0)
-                dz_dtheta_gauss = d1_spline_z(theta_gauss0)
-                G_n, gradG_n, gradG_0 = green(x_obs, z_obs,
-                    x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n, legendre_buf;
-                    gamma_prefactor)
-
-                s = leftpanel ? stencils_left[ig] : stencils_right[ig]
-                wgauss = GL8.w[ig] * dtheta
-
-                if populate_greenfunction
-                    if observer isa PlasmaGeometry
-                        G_n += log((theta_obs - theta_gauss)^2) / x_obs
-                    end
-                    @inbounds for stencil_idx in 1:5
-                        _accum_row!(proj_gz, G_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx])
-                    end
-                end
-
-                @inbounds for stencil_idx in 1:5
-                    _accum_row!(proj_kz, gradG_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx])
-                end
-
-                diag_accum -= gradG_0 * wgauss
-            end
-        end
-
-        # Analytic singular integral correction [Chance 1997 eq. 75]
-        if populate_greenfunction && observer isa PlasmaGeometry
-            @inbounds for stencil_idx in 1:5
-                _accum_row!(proj_gz, -log_correction_array[stencil_idx] / x_obs, Zt, sing_idx[stencil_idx])
-            end
-        end
-
-        # Fold diagonal accumulation into projection
-        _accum_row!(proj_kz, diag_accum, Zt, j)
-
-        # ── Rank-1 accumulate: K_c += conj(Z[j,:]) ⊗ proj_kz ──
-        _rank1_conj!(K_c_block, Zt, j, proj_kz)
-        if populate_greenfunction
-            _rank1_conj!(G_c_block, Zt, j, proj_gz)
-        end
-    end
-
-    # ── Post-processing (mirrors compute_2D_kernel_matrices!) ──
-
-    # Normals point out of vacuum for wall but inward for plasma → flip sign for plasma source
-    if source isa PlasmaGeometry
-        K_c_block .*= -1
-    end
-
-    # Diagonal residue: K += residue·I  →  K_c += residue·Gram
-    # [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
-    residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
-    if residue != 0.0
-        K_c_block .+= residue .* Gram
-    end
-
-    # 2π𝒢 → 𝒢
-    if populate_greenfunction
-        G_c_block ./= 2π
-    end
-end
-
-
-# ============================================================================
-# 3D fused projected kernel
-# ============================================================================
-
-"""
-    _projected_kernel_3D!(K_c, G_c, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, exp_mn_basis, Gram)
-
-Fused 3D kernel assembly + projection. Mirrors the loop structure of
-`compute_3D_kernel_matrices!` (including multi-threading and BIEST singular correction)
-but writes projected P-vectors to per-observer rows of [M × P] buffers instead of
-filling the M×M kernel matrices. The P×P assembly is done after the parallel loop
-via sequential GEMM calls.
-
-Each observer writes to its own row of the shared buffers, so there are no
-cross-thread accumulation races — the same write pattern as the original
-`compute_3D_kernel_matrices!`.
-
-Memory: O(2MP + P²) instead of O(M²).
-"""
-function _projected_kernel_3D!(
-    K_c::AbstractMatrix{ComplexF64},
-    G_c::AbstractMatrix{ComplexF64},
-    observer::Union{PlasmaGeometry3D,WallGeometry3D},
-    source::Union{PlasmaGeometry3D,WallGeometry3D},
-    PATCH_RAD::Int,
-    RAD_DIM::Int,
-    INTERP_ORDER::Int,
-    exp_mn_basis::AbstractMatrix{ComplexF64},
-    Gram::AbstractMatrix{ComplexF64}
-)
-    M, P = size(exp_mn_basis)
-    Z = exp_mn_basis
-    Zt = Matrix{ComplexF64}(transpose(Z))  # [P × M] for contiguous column access
-    num_points = observer.mtheta * observer.nzeta
-    dθdζ = 4π^2 / num_points
-
-    # Take a view of the corresponding block of the K_c and G_c matrices
-    col_idx = (source isa PlasmaGeometry3D ? 1 : 2)
-    row_idx = (observer isa PlasmaGeometry3D ? 1 : 2)
-    K_c_block = view(K_c, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P))
-    G_c_block = view(G_c, ((row_idx-1)*P+1):(row_idx*P), :)
-    populate_greenfunction = source isa PlasmaGeometry3D
-
-    if PATCH_RAD > (min(source.mtheta, source.nzeta) - 1) ÷ 2
-        @warn "PATCH_RAD clamped in projected kernel" max_PATCH_RAD=(min(source.mtheta, source.nzeta) - 1) ÷ 2
-        PATCH_RAD = (min(source.mtheta, source.nzeta) - 1) ÷ 2
-    end
-    quad_data = get_singular_quadrature(PATCH_RAD, RAD_DIM, INTERP_ORDER)
-    (; PATCH_DIM, ANG_DIM, Ppou, Gpou, P2G) = quad_data
-
-    # [P × M] buffers: column idx_obs holds (kernel row idx_obs) · Z
-    KZt = zeros(ComplexF64, P, M)
-    GZt = zeros(ComplexF64, P, M)
-
-    # Per-thread workspace (kernel scratch arrays + P-length accumulation vectors + patch mask)
-    max_tid = Threads.maxthreadid()
-    workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_tid]
-    proj_kz_all = [zeros(ComplexF64, P) for _ in 1:max_tid]
-    proj_gz_all = [zeros(ComplexF64, P) for _ in 1:max_tid]
-    is_patch_all = [falses(num_points) for _ in 1:max_tid]
-
-    Threads.@threads :static for idx_obs in 1:num_points
-        tid = Threads.threadid()
-        ws = workspaces[tid]
-        (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar,
-            n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws
-
-        proj_kz = proj_kz_all[tid]
-        proj_gz = proj_gz_all[tid]
-        is_patch = is_patch_all[tid]
-
-        fill!(proj_kz, 0.0)
-        fill!(proj_gz, 0.0)
-        fill!(is_patch, false)
-
-        i_obs = mod1(idx_obs, observer.mtheta)
-        j_obs = (idx_obs - 1) ÷ observer.mtheta + 1
-        @inbounds ox = observer.r[idx_obs, 1]
-        @inbounds oy = observer.r[idx_obs, 2]
-        @inbounds oz = observer.r[idx_obs, 3]
-
-        # Mark patch source indices so the far-field loop can skip them
-        @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
-            idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta)
-            idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta)
-            is_patch[idx_pol+source.mtheta*(idx_tor-1)] = true
-        end
-
-        # ── FAR FIELD: Trapezoidal rule (skip patch — handled in POU correction) ──
-        @inbounds for idx_src in 1:num_points
-            is_patch[idx_src] && continue
-            sx = source.r[idx_src, 1];
-            sy = source.r[idx_src, 2];
-            sz = source.r[idx_src, 3]
-            nx = source.normal[idx_src, 1];
-            ny = source.normal[idx_src, 2];
-            nz = source.normal[idx_src, 3]
-            w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ
-            _accum_row!(proj_kz, w_double, Zt, idx_src)
-
-            if populate_greenfunction
-                w_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ
-                _accum_row!(proj_gz, w_single, Zt, idx_src)
-            end
-        end
-
-        # ── NEAR FIELD: Polar quadrature with BIEST singular correction ──
-        extract_patch!(r_patch, source.r, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
-        extract_patch!(dr_dθ_patch, source.dr_dθ, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
-        extract_patch!(dr_dζ_patch, source.dr_dζ, i_obs, j_obs, source.mtheta, source.nzeta, PATCH_DIM)
-
-        interpolate_to_polar!(r_polar, r_patch, P2G)
-        interpolate_to_polar!(dr_dθ_polar, dr_dθ_patch, P2G)
-        interpolate_to_polar!(dr_dζ_polar, dr_dζ_patch, P2G)
-
-        compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient)
-
-        @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM
-            rsx = r_polar[ir, ia, 1];
-            rsy = r_polar[ir, ia, 2];
-            rsz = r_polar[ir, ia, 3]
-            nsx = n_polar[ir, ia, 1];
-            nsy = n_polar[ir, ia, 2];
-            nsz = n_polar[ir, ia, 3]
-            M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ
-            M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ
-        end
-
-        mul!(M_grid_single_flat, P2G, vec(M_polar_single))
-        mul!(M_grid_double_flat, P2G, vec(M_polar_double))
-        M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
-        M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM)
-
-        # POU correction: evaluate kernel once with combined weight (1+Gpou) = (1-χ)
-        # since far-field skipped patch points, we include the full trapezoidal + polar here
-        @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
-            idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta)
-            idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta)
-            idx_src = idx_pol + source.mtheta * (idx_tor - 1)
-
-            sx = source.r[idx_src, 1];
-            sy = source.r[idx_src, 2];
-            sz = source.r[idx_src, 3]
-            nx = source.normal[idx_src, 1];
-            ny = source.normal[idx_src, 2];
-            nz = source.normal[idx_src, 3]
-            full_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[ii, jj]) * dθdζ
-            _accum_row!(proj_kz, M_grid_double[ii, jj] + full_double, Zt, idx_src)
-
-            if populate_greenfunction
-                full_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * (1.0 + Gpou[ii, jj]) * dθdζ
-                _accum_row!(proj_gz, M_grid_single[ii, jj] + full_single, Zt, idx_src)
-            end
-        end
-
-        # ── Write projected column to buffer (each idx_obs owns its column) ──
-        @inbounds for p in 1:P
-            KZt[p, idx_obs] = proj_kz[p]
-        end
-        if populate_greenfunction
-            @inbounds for p in 1:P
-                GZt[p, idx_obs] = proj_gz[p]
-            end
-        end
-    end
-
-    # ── Assemble P×P projected matrices: K_c = Z^H * KZt^T, G_c = Z^H * GZt^T ──
-    mul!(K_c_block, Z', transpose(KZt))
-    K_c_block ./= 2π
-    if populate_greenfunction
-        mul!(G_c_block, Z', transpose(GZt))
-        G_c_block ./= 2π
-    end
-
-    # Diagonal: K += I → K_c += Gram [for same-type source/observer]
-    if typeof(source) == typeof(observer)
-        K_c_block .+= Gram
-    end
-end
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 7d994531..398d0ff5 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -16,7 +16,6 @@ include("DataTypes.jl")
 include("PnQuadCache.jl")
 include("Kernel2D.jl")
 include("Kernel3D.jl")
-include("ProjectedKernel.jl")
 include("Field.jl")
 
 export VacuumInput, WallShapeSettings
@@ -124,9 +123,12 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     K_int = similar!(pool, K_ext)
     G_int = similar!(pool, G_ext)
 
-    # Fused projected kernel: compute Z^H K Z and Z^H G Z
+    # Fused projected kernel: compute Z^H K Z and Z^H G Z for all operator blocks
+    # Plasma-plasma block
     kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
+    # Wall-plasma, plasma-wall, wall-wall blocks
     if !wall.nowall
+        # Wall-plasma, plasma-wall, wall-wall blocks
         kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram)
         kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram)
         kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram)
@@ -175,57 +177,8 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
         @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
         @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
     end
-    """
-        # ================================================================
-        # Collocation approach: solve full physical-space system [M × M]
-        # Handles both no-wall and wall cases.
-        # ================================================================
-        # Full-size kernel matrices
-        grad_green = zeros!(pool, num_points_total, num_points_total)
-        green_temp = zeros!(pool, num_points_surf, num_points_surf)
-
-        kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
-
-        # Project plasma→plasma Green's function to mode space: grre[1:M, 1:2P] = real/imag(G*Z)
-        mul!(temp, green_temp, exp_mn_basis)
-        @view(grre[1:M, 1:P]) .= real.(temp)
-        @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-
-        if !wall.nowall
-            # Plasma–Wall block
-            kernel!(grad_green, green_temp, plasma_surf, wall, kparams)
-            # Wall–Wall block
-            kernel!(grad_green, green_temp, wall, wall, kparams)
-            # Wall–Plasma block
-            kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
-            # Project obs=wall, src=plasma block to mode space
-            mul!(temp, green_temp, exp_mn_basis)
-            @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
-            @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
-        end
-
-        # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1)
-        grri .= grre # start from same as exterior
-        grad_green_interior = similar!(pool, grad_green)
-        grad_green_interior .= grad_green
-
-        # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel
-        F_ext = lu!(grad_green)
-        ldiv!(F_ext, grre)
 
-        # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal
-        grad_green_interior .*= -1
-        for i in 1:num_points_total
-            grad_green_interior[i, i] += 2.0
-        end
-        F_int = lu!(grad_green_interior)
-        ldiv!(F_int, grri)
-
-        # wv = (4π²/M) · Z^H · grre_complex  [Chance Phys. Plasmas 2007 052506 eq. 115-118]
-        temp .= complex.(@view(grre[1:M, 1:P]), @view(grre[1:M, (P+1):(2*P)]))
-        mul!(wv, exp_mn_basis', temp)
-        wv .*= (4π^2 / M)
-"""
+    # Enforce symmetry in the vacuum response matrix if desired
     inputs.force_wv_symmetry && hermitianpart!(wv)
 
     if nzeta > 1 # 3D

From 84542557e147691e05c905fdc4b428f090262ca7 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Tue, 17 Mar 2026 15:09:15 -0400
Subject: [PATCH 19/23] VACUUM - IMPROVEMENT - updating the main docstring

---
 src/Vacuum/Vacuum.jl | 85 ++++++++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 26 deletions(-)

diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 398d0ff5..b2ab97fd 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -23,42 +23,76 @@ export compute_vacuum_response, compute_vacuum_response!, compute_vacuum_field
 export extract_plasma_surface_at_psi
 
 """
-    compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings)
+    _compute_vacuum_response_single!(
+        wv, grri_in, grre_in, plasma_pts, wall_pts,
+        inputs::VacuumInput, wall_settings::WallShapeSettings;
+        n_override=nothing
+    )
 
-Compute the vacuum response matrix and both Green's functions using provided vacuum inputs.
+Compute a single vacuum solve (one coupled 3D solve, or one `n`-slice in 2D) by building and solving
+the boundary integral equation in mode space with an optional conducting wall present and writing out the results:
 
-Single entry point for vacuum calculations.
+  - `wv`: complex vacuum response matrix in straight-fieldline mode space
+  - `grri_in`: interior Green's function sampled on the plasma surface in straight-fieldline mode space (real layout for backward compatibility)
+  - `grre_in`: exterior Green's function sampled on the plasma surface in straight-fieldline mode space (real layout for backward compatibility)
+  - `plasma_pts` / `wall_pts`: output point clouds for downstream plotting/diagnostics
 
-  - For **3D** (`inputs.nzeta > 1`), computes the full coupled response across all (m,n) modes defined
-    by `inputs.(mlow, mpert, nlow, npert)`.
+## Fused kernel assembly + projection
 
-  - For **2D geometry** (`inputs.nzeta == 1`), supports either:
+This routine uses a **newer kernel evaluation path** that never forms dense point-space kernel matrices.
+Instead, it fuses kernel evaluation and Fourier/Galerkin projection into a single pass.
 
-      + **single-n** (`inputs.npert == 1`): computes (m,n) response for `n = inputs.nlow`
-      + **multi-n** (`inputs.npert > 1`): loops over `n = inputs.nlow:(inputs.nlow+inputs.npert-1)` and returns
-        **blocks** of the full response matrices with one block per toroidal mode number.
+The key idea is:
 
-This is the pure Julia implementation that replaces the Fortran `mscvac` function.
-It computes both interior (grri) and exterior (grre) Green's functions for GPEC response calculations.
+  - Assemble and solve the boundary integral equation directly in `P×P` mode space.
 
-# Arguments
+  - Avoid materializing `M×M` (2D) or `N×N` (3D) kernel matrices.
 
-  - `inputs`: `VacuumInput` struct with mode numbers, grid resolution, and boundary info.
-  - `wall_settings::WallShapeSettings`: Wall geometry configuration.
+  - Uses complex basis `Z = C + iS` so projected operators are `P×P` complex.
 
-# Returns
+  - The projected operators are accumulated row-by-row while kernel values are computed.
+
+  - Memory drops from `O(M^2)` (or `O(N^2)`) down to `O(MP + P^2)` (or `O(NP + P^2)`).
+
+  - FLOPs remain dominated by the same scaling as the two-step approach (kernel evaluation + projection),
+    plus an additional `O(P^3)` for the LU factorization/solve in mode space.
+
+  - **Projected matrices**
+
+      + Exterior projected kernel blocks are assembled into `K_ext` and `G_ext`.
+      + Interior operators are formed from the exterior ones using the discrete Green-identity diagonal term:
+        the implementation uses `K_int = 2*Gram - K_ext` for same-type source/observer blocks. This effectively
+        computes the kernel with a negative normal direction without recalculating the kernel.
+
+  - **Solves**
 
-  - `wv`: Complex vacuum response matrix.
+      + If `nowall`, solve the plasma-only `P×P` system.
+      + If a wall is present, solve the coupled `2P×2P` block system.
 
-      + 2D single-n: `mpert × mpert`
-      + 2D multi-n: `(mpert*npert) × (mpert*npert)` (block diagonal)
-      + 3D: `num_modes × num_modes` (full coupled)
+  - **Back-compat outputs**
 
-  - `grri`: Interior Green's function matrix.
+      + Although the solve is performed in mode space, `grri_in` and `grre_in` are reconstructed into the
+        legacy real `M×(2P)` layout for downstream code paths that still expect that shape.
 
-  - `grre`: Exterior Green's function matrix.
+## Arguments
 
-  - `xzpts`: Coordinate array (mtheta×4 for 2D, mtheta*nzeta×4 for 3D) [R_plasma, Z_plasma, R_wall, Z_wall].
+  - **`wv::AbstractMatrix{ComplexF64}`**: output vacuum response matrix (modified in-place)
+  - **`grri_in::AbstractMatrix{Float64}`**: output interior Green's function (modified in-place; real/legacy layout)
+  - **`grre_in::AbstractMatrix{Float64}`**: output exterior Green's function (modified in-place; real/legacy layout)
+  - **`plasma_pts::AbstractMatrix{Float64}`**: plasma surface coordinates (modified in-place)
+  - **`wall_pts::AbstractMatrix{Float64}`**: wall surface coordinates (modified in-place)
+  - **`inputs::VacuumInput`**: mode ranges, grid resolution, and geometry settings
+  - **`wall_settings::WallShapeSettings`**: wall geometry configuration
+  - **`n_override::Union{Nothing,Int}`**: optional toroidal mode number override (only used for 2D)
+
+## 2D vs 3D behavior
+
+  - **3D (`inputs.nzeta > 1`)**: computes the full coupled response across all `(m, n)` modes specified by
+    `inputs.(mlow, mpert, nlow, npert)` in a single call using the 3D kernel method in Kernel3D.jl.
+  - **2D (`inputs.nzeta == 1`)**:
+      + If `inputs.npert == 1`, computes the response for `n = inputs.nlow` using the 2D kernel method in Kernel2D.jl.
+      + If `inputs.npert > 1`, the public driver loops over `n` and calls this function once per `n`,
+        writing block columns into the full output matrices using the 2D kernel method in Kernel2D.jl.
 """
 @with_pool pool function _compute_vacuum_response_single!(
     wv::AbstractMatrix{ComplexF64},
@@ -92,10 +126,6 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     # Active rows for computation (plasma only if no wall, plasma+wall if wall present)
     num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf
 
-    # Views into output Green's function matrices for the active rows/columns
-    grre = @view grre_in[1:num_points_total, :]
-    grri = @view grri_in[1:num_points_total, :]
-
     # Complex buffer for projecting to mode space (G*Z) and back; grre/grri stay real for backwards compatibility
     M = num_points_surf
     P = num_modes
@@ -163,6 +193,9 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     # Need to convert mode space to physical space and unpack the real and imaginary parts
     # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
     # perhaps make it a complex P * P matrix? Then don't need any of this section
+    # Views into output Green's function matrices for the active rows/columns
+    grre = @view grre_in[1:num_points_total, :]
+    grri = @view grri_in[1:num_points_total, :]
     mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
     @view(grre[1:M, 1:P]) .= real.(temp)
     @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)

From c719a6b0d7d104d6e43f37542b52a588b7e87b18 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Tue, 17 Mar 2026 15:26:46 -0400
Subject: [PATCH 20/23] VACUUM - IMPROVEMENT - adding back in the logic that
 only allocates the full matrices if a wall is present

---
 src/Vacuum/Vacuum.jl | 90 +++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 56 deletions(-)

diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index b2ab97fd..400dacc5 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -114,7 +114,7 @@ The key idea is:
     # Compute Fourier basis coefficients
     ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing
     exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν)
-    num_points_surf, num_modes = size(exp_mn_basis)
+    num_points, num_modes = size(exp_mn_basis)
 
     # Create kernel parameters structs used to dispatch to the correct kernel
     # Hardcode these values for now - can expose to the user in the future
@@ -123,33 +123,16 @@ The key idea is:
     INTERP_ORDER = 5
     kparams = nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override)
 
-    # Active rows for computation (plasma only if no wall, plasma+wall if wall present)
-    num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf
-
-    # Complex buffer for projecting to mode space (G*Z) and back; grre/grri stay real for backwards compatibility
-    M = num_points_surf
-    P = num_modes
-    temp = zeros!(pool, ComplexF64, M, P)
-
-    # ================================================================
-    # Galerkin: solve system in P×P mode space. Uses complex basis
-    # Z = C + iS so projected matrices are P×P complex.
-    #
-    # Fused (fuse_projection=true): kernel assembly + Fourier projection
-    # in one pass. The full M×M kernel matrices are never materialized —
-    # instead the P×P projected matrices grad_green_fourier and G_c are
-    # accumulated row by row as kernel values are computed.
-    # Memory:  O(MP + P²)  instead of  O(M²)
-    #
-    # FLOPs:  O(M²P + P³)
-    # ================================================================
+    # Scales kernel matrix sizes by a factor of 2 if a wall is present (don't allocate unless needed)
+    wall_fac = wall.nowall ? 1 : 2
+
     # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve
-    Gram = zeros!(pool, ComplexF64, P, P)
+    Gram = zeros!(pool, ComplexF64, num_modes, num_modes)
     mul!(Gram, exp_mn_basis', exp_mn_basis)
 
     # Projected kernel matrices [P × P complex]
-    K_ext = zeros!(pool, ComplexF64, 2P, 2P)
-    G_ext = zeros!(pool, ComplexF64, 2P, P)
+    K_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, wall_fac * num_modes)
+    G_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, num_modes)
     K_int = similar!(pool, K_ext)
     G_int = similar!(pool, G_ext)
 
@@ -166,54 +149,49 @@ The key idea is:
 
     # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext
     K_int .= -K_ext
-    K_int[1:P, 1:P] .+= 2 .* Gram
+    K_int[1:num_modes, 1:num_modes] .+= 2 .* Gram
     if !wall.nowall
-        K_int[(P+1):(2*P), (P+1):(2*P)] .+= 2 .* Gram
+        K_int[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)] .+= 2 .* Gram
     end
     G_int .= G_ext
 
     # Solve projected BIEs for exterior and interior kernels
-    if wall.nowall
-        F_ext = lu!(K_ext[1:P, 1:P])
-        ldiv!(F_ext, @view(G_ext[1:P, :]))
-        F_int = lu!(K_int[1:P, 1:P])
-        ldiv!(F_int, @view(G_int[1:P, :]))
-    else
-        F_ext = lu!(K_ext)
-        ldiv!(F_ext, G_ext)
-        F_int = lu!(K_int)
-        ldiv!(F_int, G_int)
-    end
+    F_ext = lu!(K_ext)
+    ldiv!(F_ext, G_ext)
+    F_int = lu!(K_int)
+    ldiv!(F_int, G_int)
 
     # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G
-    mul!(wv, Gram, view(G_ext, 1:P, :))
-    wv .*= (4π^2 / M)
+    mul!(wv, Gram, view(G_ext, 1:num_modes, :))
+    wv .*= (4π^2 / num_points)
+
+    # Enforce Hermitian symmetry if desired
+    inputs.force_wv_symmetry && hermitianpart!(wv)
 
     # Backward-compatible reconstruction: grre/grri in M×2P real layout
     # Need to convert mode space to physical space and unpack the real and imaginary parts
     # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
     # perhaps make it a complex P * P matrix? Then don't need any of this section
     # Views into output Green's function matrices for the active rows/columns
-    grre = @view grre_in[1:num_points_total, :]
-    grri = @view grri_in[1:num_points_total, :]
-    mul!(temp, exp_mn_basis, view(G_ext, 1:P, :))
-    @view(grre[1:M, 1:P]) .= real.(temp)
-    @view(grre[1:M, (P+1):(2*P)]) .= imag.(temp)
-    mul!(temp, exp_mn_basis, view(G_int, 1:P, :))
-    @view(grri[1:M, 1:P]) .= real.(temp)
-    @view(grri[1:M, (P+1):(2*P)]) .= imag.(temp)
+    grre = @view grre_in[1:(wall_fac*num_points), :]
+    grri = @view grri_in[1:(wall_fac*num_points), :]
+    temp = zeros!(pool, ComplexF64, num_points, num_modes)
+
+    mul!(temp, exp_mn_basis, view(G_ext, 1:num_modes, :))
+    @view(grre[1:num_points, 1:num_modes]) .= real.(temp)
+    @view(grre[1:num_points, (num_modes+1):(2*num_modes)]) .= imag.(temp)
+    mul!(temp, exp_mn_basis, view(G_int, 1:num_modes, :))
+    @view(grri[1:num_points, 1:num_modes]) .= real.(temp)
+    @view(grri[1:num_points, (num_modes+1):(2*num_modes)]) .= imag.(temp)
     if !wall.nowall
-        mul!(temp, exp_mn_basis, view(G_ext, (P+1):(2*P), :))
-        @view(grre[(M+1):(2*M), 1:P]) .= real.(temp)
-        @view(grre[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
-        mul!(temp, exp_mn_basis, view(G_int, (P+1):(2*P), :))
-        @view(grri[(M+1):(2*M), 1:P]) .= real.(temp)
-        @view(grri[(M+1):(2*M), (P+1):(2*P)]) .= imag.(temp)
+        mul!(temp, exp_mn_basis, view(G_ext, (num_modes+1):(2*num_modes), :))
+        @view(grre[(num_points+1):(2*num_points), 1:num_modes]) .= real.(temp)
+        @view(grre[(num_points+1):(2*num_points), (num_modes+1):(2*num_modes)]) .= imag.(temp)
+        mul!(temp, exp_mn_basis, view(G_int, (num_modes+1):(2*num_modes), :))
+        @view(grri[(num_points+1):(2*num_points), 1:num_modes]) .= real.(temp)
+        @view(grri[(num_points+1):(2*num_points), (num_modes+1):(2*num_modes)]) .= imag.(temp)
     end
 
-    # Enforce symmetry in the vacuum response matrix if desired
-    inputs.force_wv_symmetry && hermitianpart!(wv)
-
     if nzeta > 1 # 3D
         plasma_pts .= plasma_surf.r
         wall_pts .= wall.r

From 4c001f46432b71f3ff997be6c72277d521104445 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Wed, 18 Mar 2026 10:02:02 -0400
Subject: [PATCH 21/23] VACUUM - IMPROVEMENT - using a vector for the diagonal
 of the gram matrix instead of a dense matrix

---
 src/Vacuum/Kernel2D.jl  | 12 +++++++-----
 src/Vacuum/Kernel3D.jl  | 35 +++++++++++++++++------------------
 src/Vacuum/Vacuum.jl    | 26 ++++++++++++++++++++------
 test/runtests_vacuum.jl |  3 ++-
 4 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl
index 49772e96..17748804 100644
--- a/src/Vacuum/Kernel2D.jl
+++ b/src/Vacuum/Kernel2D.jl
@@ -100,7 +100,7 @@ mathematical discretization.
   - `source`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`.
   - `n`: Integer representing the order of the toroidal Fourier component.
   - `Z`: Complex Fourier basis sampled on the `θ` grid.
-  - `Gram`: Mode-space Gram matrix for this basis on the discrete grid.
+  - `Gram`: Diagonal of the mode-space Gram matrix for this basis on the discrete grid.
 
 ## Block layout
 
@@ -163,7 +163,7 @@ This routine is intentionally written to be allocation-light in tight loops:
     source::Union{PlasmaGeometry,WallGeometry},
     n::Int,
     Z::AbstractMatrix{ComplexF64},
-    Gram::AbstractMatrix{ComplexF64}
+    Gram::AbstractVector{ComplexF64}
 )
 
     M, P = size(Z) # M = mtheta, P = num_modes
@@ -316,9 +316,11 @@ This routine is intentionally written to be allocation-light in tight loops:
     end
 
     # Add analytic singular integral (second type) to block diagonal [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
-    # The Gram matrix is a result of the projection onto a scalar, Z⋅Zᵀ * residue
+    # The residue term is diagonal in mode space and is scaled by the Gram diagonal.
     residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
-    Kc_block .+= residue .* Gram
+    @inbounds for p in 1:P
+        Kc_block[p, p] += residue * Gram[p]
+    end
 
     # Since we computed 2π𝒢, divide by 2π to get 𝒢
     if populate_greenfunction
@@ -339,7 +341,7 @@ function kernel!(
     source::Union{PlasmaGeometry,WallGeometry},
     params::KernelParams2D,
     Z::AbstractMatrix{ComplexF64},
-    Gram::AbstractMatrix{ComplexF64}
+    Gram::AbstractVector{ComplexF64}
 )
     return compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram)
 end
diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl
index e066e788..d7abd667 100644
--- a/src/Vacuum/Kernel3D.jl
+++ b/src/Vacuum/Kernel3D.jl
@@ -427,7 +427,7 @@ near-field correction for the singular region.
   - `RAD_DIM`: Radial quadrature order on the polar grid (angular order is `2*RAD_DIM`).
   - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`).
   - `Z`: Complex Fourier basis sampled on the surface grid, shaped `N×P` (`P = number of retained modes`). `Z[idx, :]` contains the basis values at the surface node `idx`.
-  - `Gram`: Mode-space Gram matrix used to add the analytic “identity” term when `typeof(source) == typeof(observer)` (i.e. the same operator block that receives the Green’s-identity diagonal contribution).
+  - `Gram`: Diagonal of the mode-space Gram matrix used to add the analytic “identity” term when `typeof(source) == typeof(observer)` (i.e. the same operator block that receives the Green’s-identity diagonal contribution).
 
 This routine fills exactly one `P×P` block view `Kc_block` (and optionally the corresponding `Gc_block`)
 selected by whether observer/source are plasma or wall.
@@ -504,7 +504,7 @@ function compute_3D_kernel_matrices!(
     RAD_DIM::Int,
     INTERP_ORDER::Int,
     Z::AbstractMatrix{ComplexF64},
-    Gram::AbstractMatrix{ComplexF64}
+    Gram::AbstractVector{ComplexF64}
 )
     N, P = size(Z) # N = mtheta * nzeta, P = num_modes
     dθdζ = 4π^2 / N
@@ -575,22 +575,19 @@ function compute_3D_kernel_matrices!(
         # ============================================================
         @inbounds for idx_src in 1:N
             is_patch[idx_src] && continue
-            w_double =
-                laplace_double_layer(
-                    ox,
-                    oy,
-                    oz,
-                    source.r[idx_src, 1],
-                    source.r[idx_src, 2],
-                    source.r[idx_src, 3],
-                    source.normal[idx_src, 1],
-                    source.normal[idx_src, 2],
-                    source.normal[idx_src, 3]
-                ) * dθdζ
+
+            sx = source.r[idx_src, 1]
+            sy = source.r[idx_src, 2]
+            sz = source.r[idx_src, 3]
+            nx = source.normal[idx_src, 1]
+            ny = source.normal[idx_src, 2]
+            nz = source.normal[idx_src, 3]
+
+            w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ
             _accum_row!(proj_k, w_double, Zt, idx_src)
 
             if populate_greenfunction
-                w_single = laplace_single_layer(ox, oy, oz, source.r[idx_src, 1], source.r[idx_src, 2], source.r[idx_src, 3]) * dθdζ
+                w_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ
                 _accum_row!(proj_g, w_single, Zt, idx_src)
             end
         end
@@ -676,9 +673,11 @@ function compute_3D_kernel_matrices!(
         G_block ./= 2π
     end
 
-    # Add the term that comes from the volume integral of Green's identity
+    # Add the term that comes from the volume integral of Green's identity.
     if typeof(source) == typeof(observer)
-        K_block .+= Gram
+        @inbounds for p in 1:P
+            K_block[p, p] += Gram[p]
+        end
     end
 end
 
@@ -696,7 +695,7 @@ function kernel!(
     source::Union{PlasmaGeometry3D,WallGeometry3D},
     params::KernelParams3D,
     Z::AbstractMatrix{ComplexF64},
-    Gram::AbstractMatrix{ComplexF64}
+    Gram::AbstractVector{ComplexF64}
 )
     return compute_3D_kernel_matrices!(
         Kc,
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 400dacc5..72ad0ad4 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -126,9 +126,16 @@ The key idea is:
     # Scales kernel matrix sizes by a factor of 2 if a wall is present (don't allocate unless needed)
     wall_fac = wall.nowall ? 1 : 2
 
-    # Gram matrix required by projected_kernel! for the diagonal residue and for interior solve
-    Gram = zeros!(pool, ComplexF64, num_modes, num_modes)
-    mul!(Gram, exp_mn_basis', exp_mn_basis)
+    # Gram matrix diagonal for the discrete Fourier basis on the uniform grid.
+    #
+    # For the basis produced by `compute_fourier_coefficients(...)` (complex exponentials sampled on a
+    # uniform grid), the discrete inner products satisfy:
+    #
+    #     ZᴴZ = num_points · I
+    #
+    # up to roundoff, so the Gram matrix is diagonal. Store only the diagonal as a length-P vector.
+    Gram = acquire!(pool, ComplexF64, num_modes)
+    fill!(Gram, ComplexF64(num_points))
 
     # Projected kernel matrices [P × P complex]
     K_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, wall_fac * num_modes)
@@ -149,9 +156,13 @@ The key idea is:
 
     # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext
     K_int .= -K_ext
-    K_int[1:num_modes, 1:num_modes] .+= 2 .* Gram
+    @inbounds for p in 1:num_modes
+        K_int[p, p] += 2 * Gram[p]
+    end
     if !wall.nowall
-        K_int[(num_modes+1):(2*num_modes), (num_modes+1):(2*num_modes)] .+= 2 .* Gram
+        @inbounds for p in 1:num_modes
+            K_int[num_modes+p, num_modes+p] += 2 * Gram[p]
+        end
     end
     G_int .= G_ext
 
@@ -162,7 +173,10 @@ The key idea is:
     ldiv!(F_int, G_int)
 
     # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G
-    mul!(wv, Gram, view(G_ext, 1:num_modes, :))
+    wv .= view(G_ext, 1:num_modes, :)
+    @inbounds for p in 1:num_modes
+        @views wv[p, :] .*= Gram[p]
+    end
     wv .*= (4π^2 / num_points)
 
     # Enforce Hermitian symmetry if desired
diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl
index 472c6c68..3ae4b648 100644
--- a/test/runtests_vacuum.jl
+++ b/test/runtests_vacuum.jl
@@ -1,4 +1,5 @@
 @testset "Vacuum.jl Unit Tests" begin
+    using LinearAlgebra
 
     @testset "Vacuum.jl (2D)" begin
 
@@ -481,7 +482,7 @@
                 ν=plasma_surf.ν
             )
             M, P = size(exp_mn_basis)
-            Gram = exp_mn_basis' * exp_mn_basis
+            Gram = fill(ComplexF64(M), P)
 
             # --- Two-step Galerkin: materialize full kernels then project ---
             grad_green_full = zeros(Float64, 2M, 2M)

From 46f24d9548b4ad10800f7173886dfa16eb1e76d9 Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Wed, 18 Mar 2026 10:17:39 -0400
Subject: [PATCH 22/23] VACUUM - IMPROVEMENT - merging the single and double
 layer kernels to reduce extra computations, reduces runtime by around 10% for
 the 3D solovev example with wall

---
 src/Vacuum/Kernel3D.jl | 61 +++++++++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 12 deletions(-)

diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl
index d7abd667..58a35b3d 100644
--- a/src/Vacuum/Kernel3D.jl
+++ b/src/Vacuum/Kernel3D.jl
@@ -280,6 +280,34 @@ Scalar-argument double-layer kernel. Avoids view creation in tight loops.
     return (dx*nx + dy*ny + dz*nz) * r3inv
 end
 
+"""
+    laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) -> (single, double)
+
+Fused scalar-argument Laplace single-layer and double-layer kernels.
+
+This is the hot-path variant used in the 3D projected-kernel assembly when both kernels are needed
+(`populate_greenfunction == true`). It shares the distance computation (`sqrt(r²)`) so we only pay
+for one `sqrt`/reciprocal pipeline per source/observer pair.
+"""
+@fastmath @inline function laplace_single_double_layer(
+    ox::Float64, oy::Float64, oz::Float64,
+    sx::Float64, sy::Float64, sz::Float64,
+    nx::Float64, ny::Float64, nz::Float64
+)
+    dx = ox - sx
+    dy = oy - sy
+    dz = oz - sz
+    r2 = dx*dx + dy*dy + dz*dz
+    r2 < 1e-30 && return (0.0, 0.0)
+    rinv = inv(sqrt(r2))
+    # single-layer: 1/r
+    single = rinv
+    # double-layer: (Δx·n)/r^3
+    r3inv = rinv * rinv * rinv
+    double = (dx*nx + dy*ny + dz*nz) * r3inv
+    return (single, double)
+end
+
 """
     extract_patch!(patch, data, idx_pol_center, idx_tor_center, npol, ntor, PATCH_DIM)
 
@@ -583,12 +611,13 @@ function compute_3D_kernel_matrices!(
             ny = source.normal[idx_src, 2]
             nz = source.normal[idx_src, 3]
 
-            w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ
-            _accum_row!(proj_k, w_double, Zt, idx_src)
-
             if populate_greenfunction
-                w_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * dθdζ
-                _accum_row!(proj_g, w_single, Zt, idx_src)
+                w_single, w_double = laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz)
+                _accum_row!(proj_k, w_double * dθdζ, Zt, idx_src)
+                _accum_row!(proj_g, w_single * dθdζ, Zt, idx_src)
+            else
+                w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ
+                _accum_row!(proj_k, w_double, Zt, idx_src)
             end
         end
 
@@ -617,8 +646,14 @@ function compute_3D_kernel_matrices!(
             nsx = n_polar[ir, ia, 1]
             nsy = n_polar[ir, ia, 2]
             nsz = n_polar[ir, ia, 3]
-            M_polar_single[ir, ia] = laplace_single_layer(ox, oy, oz, rsx, rsy, rsz) * Ppou[ir, ia] * dθdζ
-            M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ
+            if populate_greenfunction
+                w_single, w_double = laplace_single_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz)
+                M_polar_single[ir, ia] = w_single * Ppou[ir, ia] * dθdζ
+                M_polar_double[ir, ia] = w_double * Ppou[ir, ia] * dθdζ
+            else
+                # Only the double-layer kernel is needed when the source is the wall.
+                M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ
+            end
         end
 
         # Distribute polar singular corrections back to Cartesian grid using sparse matrix
@@ -643,13 +678,15 @@ function compute_3D_kernel_matrices!(
             ny = source.normal[idx_src, 2]
             nz = source.normal[idx_src, 3]
 
-            far_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[i, j]) * dθdζ
-            _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src)
-
-            # Apply near + far contributions
             if populate_greenfunction
-                far_single = laplace_single_layer(ox, oy, oz, sx, sy, sz) * (1.0 + Gpou[i, j]) * dθdζ
+                w_single, w_double = laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz)
+                far_double = w_double * (1.0 + Gpou[i, j]) * dθdζ
+                far_single = w_single * (1.0 + Gpou[i, j]) * dθdζ
+                _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src)
                 _accum_row!(proj_g, M_grid_single[i, j] + far_single, Zt, idx_src)
+            else
+                far_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[i, j]) * dθdζ
+                _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src)
             end
         end
 

From c85493e0983eddc614aa1b35a0d249ae64f44f7b Mon Sep 17 00:00:00 2001
From: Jake Halpern <jhalpern@purdue.edu>
Date: Wed, 18 Mar 2026 11:48:12 -0400
Subject: [PATCH 23/23] VACUUM - IMPROVEMENT - combining kernels into one
 operation, reduces 3D time by around 10%

---
 src/Vacuum/Kernel3D.jl | 204 ++++++++---------------------------------
 1 file changed, 38 insertions(+), 166 deletions(-)

diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl
index 58a35b3d..dead2b31 100644
--- a/src/Vacuum/Kernel3D.jl
+++ b/src/Vacuum/Kernel3D.jl
@@ -177,119 +177,19 @@ function get_singular_quadrature(PATCH_RAD::Int, RAD_DIM::Int, INTERP_ORDER::Int
 end
 
 """
-    laplace_single_layer(x_obs, x_src) -> Float64
+    laplace_kernel(ox, oy, oz, sx, sy, sz, nx, ny, nz) -> (single, double)
 
-Evaluate the Laplace single-layer (FxU) kernel between two 3D points. Returns
-0.0 if the observation point coincides with the source point to avoid singularity.
+Fused scalar-argument Laplace kernels for the 3D vacuum BIE.
 
-The single-layer kernel φ is the fundamental solution to Laplace's equation:
+Returns a tuple `(single, double)` where:
 
-```
-φ(x_obs, x_src) = 1 / |x_obs - x_src|
-```
+  - `single = 1/r` is the single-layer kernel
+  - `double = (Δx⋅n)/r^3` is the double-layer kernel
 
-# Arguments
-
-  - `x_obs`: Observation point (3D Cartesian coordinates, any AbstractVector)
-  - `x_src`: Source point (3D Cartesian coordinates, any AbstractVector)
-
-# Returns
-
-  - `Float64`: Kernel value φ(x_obs, x_src)
-"""
-@fastmath function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real})
-    @inbounds begin
-        dx = x_obs[1] - x_src[1]
-        dy = x_obs[2] - x_src[2]
-        dz = x_obs[3] - x_src[3]
-    end
-    r2 = dx*dx + dy*dy + dz*dz
-    r2 < 1e-30 && return 0.0
-    return inv(sqrt(r2))
-end
-
-"""
-Scalar-argument single-layer kernel. Avoids view creation in tight loops.
+Both kernels share one distance computation (`sqrt(r²)`). `compute_3D_kernel_matrices!` calls this for
+every pair, even when only `double` ends up being used. Returns `(0.0, 0.0)` when `r² < 1e-30`.
 """
-@fastmath @inline function laplace_single_layer(
-    ox::Float64, oy::Float64, oz::Float64,
-    sx::Float64, sy::Float64, sz::Float64
-)
-    dx = ox - sx;
-    dy = oy - sy;
-    dz = oz - sz
-    r2 = dx*dx + dy*dy + dz*dz
-    r2 < 1e-30 && return 0.0
-    return inv(sqrt(r2))
-end
-
-"""
-    laplace_double_layer(x_obs, x_src, n_src) -> Float64
-
-Evaluate the Laplace double-layer (DxU) kernel between a point and a surface element. Returns
-0.0 if the observation point coincides with the source point to avoid singularity. Allocation-free
-scalar arithmetic is used for maximum performance.
-
-The double-layer kernel K is the normal derivative of the fundamental solution:
-
-```
-K(x_obs, x_src, n_src) = ∇_{x_src} φ · n_src = (x_obs - x_src) · n_src / |x_obs - x_src|³
-```
-
-# Arguments
-
-  - `x_obs`: Observation point (3D Cartesian coordinates, any AbstractVector)
-  - `x_src`: Source point on surface (3D Cartesian coordinates, any AbstractVector)
-  - `n_src`: Outward UNIT normal at source point (must be normalized!, any AbstractVector)
-
-# Returns
-
-  - `Float64`: Kernel value K(x_obs, x_src, n_src)
-"""
-@fastmath function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}, n_src::AbstractVector{<:Real})
-    @inbounds begin
-        dx = x_obs[1] - x_src[1]
-        dy = x_obs[2] - x_src[2]
-        dz = x_obs[3] - x_src[3]
-        nx = n_src[1]
-        ny = n_src[2]
-        nz = n_src[3]
-    end
-    r2 = dx*dx + dy*dy + dz*dz
-    r2 < 1e-30 && return 0.0
-    rinv = inv(sqrt(r2))
-    r3inv = rinv * rinv * rinv
-    return (dx*nx + dy*ny + dz*nz) * r3inv
-end
-
-"""
-Scalar-argument double-layer kernel. Avoids view creation in tight loops.
-"""
-@fastmath @inline function laplace_double_layer(
-    ox::Float64, oy::Float64, oz::Float64,
-    sx::Float64, sy::Float64, sz::Float64,
-    nx::Float64, ny::Float64, nz::Float64
-)
-    dx = ox - sx;
-    dy = oy - sy;
-    dz = oz - sz
-    r2 = dx*dx + dy*dy + dz*dz
-    r2 < 1e-30 && return 0.0
-    rinv = inv(sqrt(r2))
-    r3inv = rinv * rinv * rinv
-    return (dx*nx + dy*ny + dz*nz) * r3inv
-end
-
-"""
-    laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) -> (single, double)
-
-Fused scalar-argument Laplace single-layer and double-layer kernels.
-
-This is the hot-path variant used in the 3D projected-kernel assembly when both kernels are needed
-(`populate_greenfunction == true`). It shares the distance computation (`sqrt(r²)`) so we only pay
-for one `sqrt`/reciprocal pipeline per source/observer pair.
-"""
-@fastmath @inline function laplace_single_double_layer(
+@fastmath @inline function laplace_kernel(
     ox::Float64, oy::Float64, oz::Float64,
     sx::Float64, sy::Float64, sz::Float64,
     nx::Float64, ny::Float64, nz::Float64
@@ -342,8 +242,7 @@ end
     interpolate_to_polar!(polar_data, patch, quad_data)
 
 Interpolate Cartesian patch data to polar quadrature points using sparse matrix multiply.
-Overwrites `polar_data` using mul! function arguments, mul!(C, A, B, α, β) -> C where
-C = α * A * B + β * C.
+Overwrites `polar_data` in place via `mul!(C, A, B)`, which stores the product `A * B` in `C`.
 
 # Arguments
 
@@ -352,17 +251,17 @@ C = α * A * B + β * C.
   - `P2G`: Sparse interpolation matrix
 """
 function interpolate_to_polar!(polar_data::Array{Float64,3}, patch::Array{Float64,3}, P2G::SparseMatrixCSC{Float64,Int})
-    # Flatten patch to (Ngrid × dof), apply P2G' to get (Npolar × dof)
     patch_flat = reshape(patch, :, size(patch, 3))
-    mul!(reshape(polar_data, :, size(patch, 3)), P2G', patch_flat, 1.0, 0.0)
+    mul!(reshape(polar_data, :, size(patch, 3)), P2G', patch_flat)
 end
 
 """
-    compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar)
+    compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, normal_orient)
 
 Compute normal vector (= ∂r/∂θ × ∂r/∂ζ) at polar quadrature points from interpolated tangent vectors.
 We already scaled the normals by normal_orient in the geometry construction, so we need to reapply
-that here since we are recomputing the normals from the derivatives.
+that here since we are recomputing the normals from the derivatives. We use inline cross products
+to avoid slice allocation.
 
 # Arguments
 
@@ -372,7 +271,6 @@ that here since we are recomputing the normals from the derivatives.
   - `normal_orient`: Multiplier applied to normals to make them orient out of vacuum region (+1 or -1)
 """
 function compute_polar_normal!(n_polar::Array{Float64,3}, dr_dθ::Array{Float64,3}, dr_dζ::Array{Float64,3}, normal_orient::Int)
-    # Inline cross product to avoid slice allocation
     @inbounds for ia in axes(dr_dθ, 2), ir in axes(dr_dθ, 1)
         n_polar[ir, ia, 1] = dr_dθ[ir, ia, 2] * dr_dζ[ir, ia, 3] - dr_dθ[ir, ia, 3] * dr_dζ[ir, ia, 2]
         n_polar[ir, ia, 2] = dr_dθ[ir, ia, 3] * dr_dζ[ir, ia, 1] - dr_dθ[ir, ia, 1] * dr_dζ[ir, ia, 3]
@@ -586,9 +484,9 @@ function compute_3D_kernel_matrices!(
         # Convert linear index to 2D indices
         i_obs = mod1(idx_obs, observer.mtheta)
         j_obs = (idx_obs - 1) ÷ observer.mtheta + 1
-        @inbounds ox = observer.r[idx_obs, 1]
-        @inbounds oy = observer.r[idx_obs, 2]
-        @inbounds oz = observer.r[idx_obs, 3]
+        ox = observer.r[idx_obs, 1]
+        oy = observer.r[idx_obs, 2]
+        oz = observer.r[idx_obs, 3]
 
         # Mark patch source indices so the far-field loop can skip them
         @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
@@ -604,20 +502,13 @@ function compute_3D_kernel_matrices!(
         @inbounds for idx_src in 1:N
             is_patch[idx_src] && continue
 
-            sx = source.r[idx_src, 1]
-            sy = source.r[idx_src, 2]
-            sz = source.r[idx_src, 3]
-            nx = source.normal[idx_src, 1]
-            ny = source.normal[idx_src, 2]
-            nz = source.normal[idx_src, 3]
+            sr = view(source.r, idx_src, :)
+            sn = view(source.normal, idx_src, :)
 
+            far_single, far_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* dθdζ
+            _accum_row!(proj_k, far_double, Zt, idx_src)
             if populate_greenfunction
-                w_single, w_double = laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz)
-                _accum_row!(proj_k, w_double * dθdζ, Zt, idx_src)
-                _accum_row!(proj_g, w_single * dθdζ, Zt, idx_src)
-            else
-                w_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * dθdζ
-                _accum_row!(proj_k, w_double, Zt, idx_src)
+                _accum_row!(proj_g, far_single, Zt, idx_src)
             end
         end
 
@@ -637,56 +528,37 @@ function compute_3D_kernel_matrices!(
         # Compute normal vectors at polar points from interpolated tangent vectors
         compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient)
 
-        # Evaluate kernels at polar points with POU weighting
+        # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already
         @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM
-            # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already
-            rsx = r_polar[ir, ia, 1]
-            rsy = r_polar[ir, ia, 2]
-            rsz = r_polar[ir, ia, 3]
-            nsx = n_polar[ir, ia, 1]
-            nsy = n_polar[ir, ia, 2]
-            nsz = n_polar[ir, ia, 3]
-            if populate_greenfunction
-                w_single, w_double = laplace_single_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz)
-                M_polar_single[ir, ia] = w_single * Ppou[ir, ia] * dθdζ
-                M_polar_double[ir, ia] = w_double * Ppou[ir, ia] * dθdζ
-            else
-                # Only the double-layer kernel is needed when the source is the wall.
-                M_polar_double[ir, ia] = laplace_double_layer(ox, oy, oz, rsx, rsy, rsz, nsx, nsy, nsz) * Ppou[ir, ia] * dθdζ
-            end
+            sr = view(r_polar, ir, ia, :)
+            sn = view(n_polar, ir, ia, :)
+            w_single, w_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* Ppou[ir, ia] .* dθdζ
+            M_polar_single[ir, ia] = w_single
+            M_polar_double[ir, ia] = w_double
         end
 
         # Distribute polar singular corrections back to Cartesian grid using sparse matrix
         # grid = P2G * polar (maps Npolar → Ngrid)
-        mul!(M_grid_single_flat, P2G, vec(M_polar_single))
         mul!(M_grid_double_flat, P2G, vec(M_polar_double))
-        M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
         M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM)
+        if populate_greenfunction
+            mul!(M_grid_single_flat, P2G, vec(M_polar_single))
+            M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
+        end
 
-        # POU correction: read back far-field trapezoidal values instead of re-evaluating kernels.
-        # trap + M_grid + trap*Gpou = trap*(1+Gpou) + M_grid = trap*(1-χ) + M_grid
+        # POU correction: singular correction + (1 + Gpou) * far-field terms
         @inbounds for j in 1:PATCH_DIM, i in 1:PATCH_DIM
             # Map back to global indices
             idx_pol = periodic_wrap(i_obs - PATCH_RAD + i - 1, source.mtheta)
             idx_tor = periodic_wrap(j_obs - PATCH_RAD + j - 1, source.nzeta)
             idx_src = idx_pol + source.mtheta * (idx_tor - 1)
+            sr = view(source.r, idx_src, :)
+            sn = view(source.normal, idx_src, :)
 
-            sx = source.r[idx_src, 1]
-            sy = source.r[idx_src, 2]
-            sz = source.r[idx_src, 3]
-            nx = source.normal[idx_src, 1]
-            ny = source.normal[idx_src, 2]
-            nz = source.normal[idx_src, 3]
-
+            w_single, w_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* (1.0 + Gpou[i, j]) .* dθdζ
+            _accum_row!(proj_k, M_grid_double[i, j] + w_double, Zt, idx_src)
             if populate_greenfunction
-                w_single, w_double = laplace_single_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz)
-                far_double = w_double * (1.0 + Gpou[i, j]) * dθdζ
-                far_single = w_single * (1.0 + Gpou[i, j]) * dθdζ
-                _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src)
-                _accum_row!(proj_g, M_grid_single[i, j] + far_single, Zt, idx_src)
-            else
-                far_double = laplace_double_layer(ox, oy, oz, sx, sy, sz, nx, ny, nz) * (1.0 + Gpou[i, j]) * dθdζ
-                _accum_row!(proj_k, M_grid_double[i, j] + far_double, Zt, idx_src)
+                _accum_row!(proj_g, M_grid_single[i, j] + w_single, Zt, idx_src)
             end
         end
 
@@ -701,7 +573,7 @@ function compute_3D_kernel_matrices!(
         end
     end
 
-    # Use the same normalization as in the 2D kernel so we can just add I to the diagonal
+    # Use the same normalization as in the 2D kernel so we can just add Gram to the diagonal
     # This makes the grri logic identical to the 2D kernel.
     mul!(K_block, Z', transpose(KZt))
     K_block ./= 2π