diff --git a/benchmarks/benchmark_fourier_transforms.jl b/benchmarks/benchmark_fourier_transforms.jl
index 3a978417..840b63f6 100644
--- a/benchmarks/benchmark_fourier_transforms.jl
+++ b/benchmarks/benchmark_fourier_transforms.jl
@@ -28,10 +28,10 @@ function extract_modes(fft_result, mlow, mhigh, mtheta)
     for (i, m) in enumerate(mlow:mhigh)
         if m >= 0
             # Positive frequencies
-            modes[i] = fft_result[m + 1] / mtheta  # FFT normalization
+            modes[i] = fft_result[m+1] / mtheta  # FFT normalization
         else
             # Negative frequencies (wrap around)
-            modes[i] = fft_result[mtheta + m + 1] / mtheta
+            modes[i] = fft_result[mtheta+m+1] / mtheta
         end
     end
     return modes
@@ -39,10 +39,10 @@ end
 
 # Test configurations
 test_cases = [
-    (name="Small (mtheta=128, mpert=10)",   mtheta=128,  mpert=10,  mlow=-5),
-    (name="Medium (mtheta=256, mpert=20)",  mtheta=256,  mpert=20,  mlow=-10),
-    (name="Large (mtheta=480, mpert=40)",   mtheta=480,  mpert=40,  mlow=-20),
-    (name="Very Large (mtheta=1024, mpert=80)", mtheta=1024, mpert=80, mlow=-40),
+    (name="Small (mtheta=128, mpert=10)", mtheta=128, mpert=10, mlow=-5),
+    (name="Medium (mtheta=256, mpert=20)", mtheta=256, mpert=20, mlow=-10),
+    (name="Large (mtheta=480, mpert=40)", mtheta=480, mpert=40, mlow=-20),
+    (name="Very Large (mtheta=1024, mpert=80)", mtheta=1024, mpert=80, mlow=-40)
 ]
 
 for test in test_cases
@@ -56,7 +56,7 @@ for test in test_cases
     mhigh = mlow + mpert - 1
 
     # Create test data
-    theta = range(0, 2π, length=mtheta+1)[1:end-1]
+    theta = range(0, 2π; length=mtheta+1)[1:(end-1)]
     data = sin.(3 .* theta) .+ 0.5 .* cos.(7 .* theta) .+ 0.2 .* sin.(11 .* theta)
 
     # Initialize FourierTransform
@@ -67,7 +67,8 @@ for test in test_cases
     theta_buffer = zeros(ComplexF64, mtheta)
 
     # Pre-allocate for low-level API
-    cslth, snlth = compute_fourier_coefficients(mtheta, mpert, mlow)
+    exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow)
+    cslth, snlth = real(exp_mn_basis), imag(exp_mn_basis)
     gij = reshape(data, mtheta, 1)  # Matrix form
     gil = zeros(Float64, mtheta, mpert)
 
@@ -99,7 +100,7 @@ for test in test_cases
     # Note: Our transform uses a different normalization and basis
     println("\n--- Accuracy Check ---")
     println("FourierTransform allocating vs in-place: ",
-            @sprintf("%.2e", maximum(abs.(modes_alloc .- modes_buffer))))
+        @sprintf("%.2e", maximum(abs.(modes_alloc .- modes_buffer))))
 
     # Compare magnitudes of modes (since basis might differ)
     println("Mode magnitudes comparison (FourierTransform vs FFTW):")
@@ -129,9 +130,9 @@ for test in test_cases
     full_modes = zeros(ComplexF64, mtheta)
     for (i, m) in enumerate(mlow:mhigh)
         if m >= 0
-            full_modes[m + 1] = modes_test[i]
+            full_modes[m+1] = modes_test[i]
         else
-            full_modes[mtheta + m + 1] = modes_test[i]
+            full_modes[mtheta+m+1] = modes_test[i]
         end
     end
     t6 = @benchmark ifft($full_modes)
@@ -140,21 +141,21 @@ for test in test_cases
     # Accuracy check
     println("\n--- Inverse Accuracy Check ---")
     println("inverse() allocating vs in-place: ",
-            @sprintf("%.2e", maximum(abs.(theta_alloc .- theta_buffer))))
+        @sprintf("%.2e", maximum(abs.(theta_alloc .- theta_buffer))))
     println("Round-trip error (real part):     ",
-            @sprintf("%.2e", maximum(abs.(real.(theta_alloc) .- data))))
+        @sprintf("%.2e", maximum(abs.(real.(theta_alloc) .- data))))
 
     # Performance summary
     println("\n--- Performance Summary ---")
     println(@sprintf("Forward transform speedup (in-place vs allocating): %.2fx",
-            median(t1).time / median(t2).time))
+        median(t1).time / median(t2).time))
     println(@sprintf("Allocations eliminated: %d → %d",
-            t1.allocs, t2.allocs))
+        t1.allocs, t2.allocs))
 
     # Compare to FFTW
     println(@sprintf("\nFourier vs FFTW (forward): %.2fx %s",
-            abs(median(t2).time / median(t3).time),
-            median(t2).time < median(t3).time ? "faster" : "slower"))
+        abs(median(t2).time / median(t3).time),
+        median(t2).time < median(t3).time ? "faster" : "slower"))
     println("Note: FFTW computes full DFT (all N modes), we compute truncated series ($mpert modes)")
 end
 
@@ -169,7 +170,7 @@ mlow = -10
 nbatch = 10  # Transform 10 functions simultaneously
 
 ft = FourierTransform(mtheta, mpert, mlow)
-theta = range(0, 2π, length=mtheta+1)[1:end-1]
+theta = range(0, 2π; length=mtheta+1)[1:(end-1)]
 
 # Create batch data
 data_matrix = zeros(Float64, mtheta, nbatch)
@@ -182,7 +183,7 @@ modes_matrix = zeros(ComplexF64, mpert, nbatch)
 println("\nTransforming $nbatch functions of length $mtheta:")
 
 print("Allocating (loop):  ")
-@btime for i in 1:$nbatch
+@btime for i in 1:($nbatch)
     modes = $ft($data_matrix[:, i])
 end
 
@@ -191,7 +192,7 @@ print("Allocating (matrix):")
 
 print("In-place (loop):    ")
 modes_buffer = zeros(ComplexF64, mpert)
-@btime for i in 1:$nbatch
+@btime for i in 1:($nbatch)
     transform!($modes_buffer, $ft, $data_matrix[:, i])
 end
 
@@ -208,7 +209,8 @@ mpert = 10
 mlow = 1
 
 # Setup for low-level API
-cslth, snlth = compute_fourier_coefficients(mtheta, mpert, mlow)
+exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow)
+cslth, snlth = real(exp_mn_basis), imag(exp_mn_basis)
 gij = randn(mtheta, mtheta)  # Green's function matrix
 gil = zeros(Float64, mtheta, mpert)
 
diff --git a/benchmarks/benchmark_vacuum_galerkin.jl b/benchmarks/benchmark_vacuum_galerkin.jl
new file mode 100644
index 00000000..358c6240
--- /dev/null
+++ b/benchmarks/benchmark_vacuum_galerkin.jl
@@ -0,0 +1,399 @@
+#!/usr/bin/env julia
+
+using Printf
+using LinearAlgebra
+using TOML
+using Plots
+
+using Pkg
+Pkg.instantiate()
+using BenchmarkTools
+
+using GeneralizedPerturbedEquilibrium
+const GPEC = GeneralizedPerturbedEquilibrium
+
+"""
+    make_wall_settings(example_dir::AbstractString)
+
+Construct `Vacuum.WallShapeSettings` from the `[Wall]` section in `gpec.toml`
+if present; otherwise return default settings.
+"""
+function make_wall_settings(example_dir::AbstractString)
+    inputs = TOML.parsefile(joinpath(example_dir, "gpec.toml"))
+    if haskey(inputs, "Wall")
+        return GPEC.Vacuum.WallShapeSettings(; (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+    elseif haskey(inputs, "WALL")
+        # Some examples use legacy capitalized section name
+        return GPEC.Vacuum.WallShapeSettings(; (Symbol(k) => v for (k, v) in inputs["WALL"])...)
+    else
+        return GPEC.Vacuum.WallShapeSettings()
+    end
+end
+
+"""
+    load_equilibrium(example_dir::AbstractString)
+
+Set up the equilibrium specified by `[Equilibrium]` in `gpec.toml` under `example_dir`.
+"""
+function load_equilibrium(example_dir::AbstractString)
+    inputs = TOML.parsefile(joinpath(example_dir, "gpec.toml"))
+    @assert haskey(inputs, "Equilibrium") "[Equilibrium] section missing in gpec.toml for $example_dir"
+    eq_cfg = GPEC.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], example_dir)
+    return GPEC.Equilibrium.setup_equilibrium(eq_cfg)
+end
+
+"""
+    make_vacuum_input(
+        equil::GPEC.Equilibrium.PlasmaEquilibrium;
+        ψ::Real,
+        mtheta::Int,
+        nzeta::Int,
+        mpert::Int,
+        mlow::Int,
+        npert::Int,
+        nlow::Int,
+        use_galerkin::Bool,
+    ) -> GPEC.Vacuum.VacuumInput
+
+Construct a `VacuumInput` at flux surface `ψ` with the specified resolution and mode set.
+The only parameter that differs between the two algorithms we compare is `use_galerkin`.
+"""
+function make_vacuum_input(
+    equil::GPEC.Equilibrium.PlasmaEquilibrium;
+    ψ::Real,
+    mtheta::Int,
+    nzeta::Int,
+    mpert::Int,
+    mlow::Int,
+    npert::Int,
+    nlow::Int,
+    use_galerkin::Bool
+)
+    r, z, ν = GPEC.Vacuum.extract_plasma_surface_at_psi(equil, float(ψ))
+
+    return GPEC.Vacuum.VacuumInput(;
+        x=reverse(r),
+        z=reverse(z),
+        ν=reverse(ν),
+        mtheta_in=length(r),
+        nzeta_in=1,
+        mlow=mlow,
+        mpert=mpert,
+        nlow=nlow,
+        npert=npert,
+        mtheta=mtheta,
+        nzeta=nzeta,
+        force_wv_symmetry=true,
+        use_galerkin=use_galerkin
+    )
+end
+
+"""
+    benchmark_vacuum_2d(
+        example_dir::AbstractString;
+        ψ::Real = 0.95,
+        mtheta_values::AbstractVector{<:Integer} = 16 .* (2 .^ (0:9)),
+        mpert::Int = 32,
+        mlow::Int = 0,
+        npert::Int = 1,
+        nlow::Int = 1,
+    )
+
+Benchmark `compute_vacuum_response` for 2D (nzeta = 1) using the Solovev example in
+`example_dir`. Scans over `mtheta_values` and compares collocation (`use_galerkin=false`)
+against Galerkin (`use_galerkin=true`) for convergence of the `wv` matrix and runtime.
+"""
+function benchmark_vacuum_2d(
+    example_dir::AbstractString;
+    ψ::Real=1.0,
+    mtheta_values::AbstractVector{<:Integer}=16 .* (2 .^ (0:7)),
+    mpert::Int=32,
+    mlow::Int=0,
+    npert::Int=1,
+    nlow::Int=1
+)
+    println("\n===== 2D Vacuum Benchmark (Solovev, $(basename(example_dir))) =====")
+    println("ψ = $(ψ), mtheta ∈ $(collect(mtheta_values)), mpert=$mpert, mlow=$mlow, nlow=$nlow, npert=$npert\n")
+
+    equil = load_equilibrium(example_dir)
+    wall_settings = make_wall_settings(example_dir)
+
+    mtheta_values = collect(mtheta_values)
+    nm = length(mtheta_values)
+
+    times_colloc = zeros(Float64, nm)
+    times_galerkin = zeros(Float64, nm)
+    errs_colloc = zeros(Float64, nm)
+    errs_galerkin = zeros(Float64, nm)
+
+    # Reference wv for convergence (highest resolution * 2) – always use galerkin
+    mtheta_ref = maximum(mtheta_values) * 2
+    println("Computing 2D reference matrices at mtheta_ref = $mtheta_ref")
+
+    input_ref_galerkin = make_vacuum_input(
+        equil;
+        ψ=ψ,
+        mtheta=mtheta_ref,
+        nzeta=1,
+        mpert=mpert,
+        mlow=mlow,
+        npert=npert,
+        nlow=nlow,
+        use_galerkin=true
+    )
+    wv_ref_galerkin, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_ref_galerkin, wall_settings)
+    # IMPORTANT: `compute_vacuum_response` uses AdaptiveArrayPools; take a copy so
+    # later pooled allocations do not overwrite the reference storage.
+    wv_ref_galerkin = copy(wv_ref_galerkin)
+    ref_norm_galerkin = norm(wv_ref_galerkin)
+
+    for (i, mtheta) in enumerate(mtheta_values)
+        println("  2D: mtheta = $(rpad(string(mtheta), 5))  (nzeta = 1)")
+
+        # Collocation
+        input_colloc = make_vacuum_input(
+            equil;
+            ψ=ψ,
+            mtheta=mtheta,
+            nzeta=1,
+            mpert=mpert,
+            mlow=mlow,
+            npert=npert,
+            nlow=nlow,
+            use_galerkin=false
+        )
+        t_colloc = @belapsed GPEC.Vacuum.compute_vacuum_response($input_colloc, $wall_settings)
+        wv, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_colloc, wall_settings)
+        errs_colloc[i] = norm(wv .- wv_ref_galerkin) / ref_norm_galerkin
+        times_colloc[i] = t_colloc
+
+        # Galerkin
+        input_galerkin = make_vacuum_input(
+            equil;
+            ψ=ψ,
+            mtheta=mtheta,
+            nzeta=1,
+            mpert=mpert,
+            mlow=mlow,
+            npert=npert,
+            nlow=nlow,
+            use_galerkin=true
+        )
+        t_galerkin = @belapsed GPEC.Vacuum.compute_vacuum_response($input_galerkin, $wall_settings)
+        wv_g, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_galerkin, wall_settings)
+        errs_galerkin[i] = norm(wv_g .- wv_ref_galerkin) / ref_norm_galerkin
+        times_galerkin[i] = t_galerkin
+
+        @printf("    Collocation:  t = %.3f s,  rel‖Δwv‖ = %.3e\n", t_colloc, errs_colloc[i])
+        @printf("    Galerkin:     t = %.3f s,  rel‖Δwv‖ = %.3e\n", t_galerkin, errs_galerkin[i])
+    end
+
+    # Two‑pane plot: left = convergence, right = runtime
+    plt = plot(; layout=(1, 2), size=(1100, 420))
+
+    # Convergence
+    plot!(plt[1], mtheta_values, errs_colloc;
+        lw=2, marker=:circle, xscale=:log10, yscale=:log10,
+        label="Collocation", xlabel="mθ", ylabel="rel‖Δwv‖",
+        title="2D Vacuum: wv convergence vs mθ")
+    plot!(plt[1], mtheta_values, errs_galerkin;
+        lw=2, marker=:square, xscale=:log10, yscale=:log10,
+        label="Galerkin")
+
+    # Runtime
+    plot!(plt[2], mtheta_values, times_colloc;
+        lw=2, marker=:circle, xscale=:log10, yscale=:log10,
+        label="Collocation", xlabel="mθ", ylabel="runtime [s]",
+        title="2D Vacuum: runtime vs mθ")
+    plot!(plt[2], mtheta_values, times_galerkin;
+        lw=2, marker=:square, xscale=:log10, yscale=:log10,
+        label="Galerkin")
+
+    plot!(plt[1]; legend=:bottomleft)
+    plot!(plt[2]; legend=:topleft)
+
+    outpath = joinpath(@__DIR__, "vacuum_galerkin_2d.png")
+    savefig(plt, outpath)
+    println("\n2D results saved to $(outpath)")
+
+    return (; mtheta_values, times_colloc, times_galerkin, errs_colloc, errs_galerkin)
+end
+
+"""
+    benchmark_vacuum_3d(
+        example_dir::AbstractString;
+        ψ::Real = 0.95,
+        mtheta_values::AbstractVector{<:Integer} = 16 .* (2 .^ (0:3)),
+        mpert::Int = 16,
+        mlow::Int = 0,
+        npert::Int = 1,
+        nlow::Int = 1,
+    )
+
+Benchmark `compute_vacuum_response` for 3D (nzeta = mtheta) using the Solovev 3D example
+in `example_dir`. Scans over `mtheta = nzeta` and compares collocation vs Galerkin.
+"""
+function benchmark_vacuum_3d(
+    example_dir::AbstractString;
+    ψ::Real=1.0,
+    mtheta_values::AbstractVector{<:Integer}=16 .* (2 .^ (0:2)),
+    mpert::Int=16,
+    mlow::Int=0,
+    npert::Int=1,
+    nlow::Int=1
+)
+    println("\n===== 3D Vacuum Benchmark (Solovev 3D, $(basename(example_dir))) =====")
+    println("ψ = $(ψ), mtheta = nzeta ∈ $(collect(mtheta_values)), mpert=$mpert, mlow=$mlow, nlow=$nlow, npert=$npert\n")
+
+    equil = load_equilibrium(example_dir)
+    wall_settings = make_wall_settings(example_dir)
+
+    mtheta_values = collect(mtheta_values)
+    nm = length(mtheta_values)
+
+    times_colloc = zeros(Float64, nm)
+    times_galerkin = zeros(Float64, nm)
+    errs_colloc = zeros(Float64, nm)
+    errs_galerkin = zeros(Float64, nm)
+
+    # Reference wv for convergence (highest resolution * 2) – always use galerkin
+    mtheta_ref = maximum(mtheta_values) * 2
+    nzeta_ref = mtheta_ref
+    println("Computing 3D reference matrices at mtheta_ref = nzeta_ref = $mtheta_ref")
+
+    input_ref_galerkin = make_vacuum_input(
+        equil;
+        ψ=ψ,
+        mtheta=mtheta_ref,
+        nzeta=nzeta_ref,
+        mpert=mpert,
+        mlow=mlow,
+        npert=npert,
+        nlow=nlow,
+        use_galerkin=true
+    )
+    wv_ref_galerkin, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_ref_galerkin, wall_settings)
+    # Again, protect the reference matrix from being overwritten by pooled allocations.
+    wv_ref_galerkin = copy(wv_ref_galerkin)
+    ref_norm_galerkin = norm(wv_ref_galerkin)
+
+    for (i, mtheta) in enumerate(mtheta_values)
+        nzeta = mtheta
+        println("  3D: mtheta = $(rpad(string(mtheta), 5)), nzeta = $(rpad(string(nzeta), 5))")
+
+        # Collocation
+        input_colloc = make_vacuum_input(
+            equil;
+            ψ=ψ,
+            mtheta=mtheta,
+            nzeta=nzeta,
+            mpert=mpert,
+            mlow=mlow,
+            npert=npert,
+            nlow=nlow,
+            use_galerkin=false
+        )
+        t_colloc = @belapsed GPEC.Vacuum.compute_vacuum_response($input_colloc, $wall_settings)
+        wv3, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_colloc, wall_settings)
+        errs_colloc[i] = norm(wv3 .- wv_ref_galerkin) / ref_norm_galerkin
+        times_colloc[i] = t_colloc
+
+        # Galerkin
+        input_galerkin = make_vacuum_input(
+            equil;
+            ψ=ψ,
+            mtheta=mtheta,
+            nzeta=nzeta,
+            mpert=mpert,
+            mlow=mlow,
+            npert=npert,
+            nlow=nlow,
+            use_galerkin=true
+        )
+        t_galerkin = @belapsed GPEC.Vacuum.compute_vacuum_response($input_galerkin, $wall_settings)
+        wv3g, _, _, _, _ = GPEC.Vacuum.compute_vacuum_response(input_galerkin, wall_settings)
+        errs_galerkin[i] = norm(wv3g .- wv_ref_galerkin) / ref_norm_galerkin
+        times_galerkin[i] = t_galerkin
+
+        @printf("    Collocation:  t = %.3f s,  rel‖Δwv‖ = %.3e\n", t_colloc, errs_colloc[i])
+        @printf("    Galerkin:     t = %.3f s,  rel‖Δwv‖ = %.3e\n", t_galerkin, errs_galerkin[i])
+    end
+
+    # Two‑pane plot: left = convergence, right = runtime (mtheta = nzeta on x-axis)
+    plt = plot(; layout=(1, 2), size=(1100, 420))
+
+    # Convergence
+    plot!(plt[1], mtheta_values, errs_colloc;
+        lw=2, marker=:circle, xscale=:log10, yscale=:log10,
+        label="Collocation", xlabel="mθ = nzeta", ylabel="rel‖Δwv‖",
+        title="3D Vacuum: wv convergence vs mθ = nzeta")
+    plot!(plt[1], mtheta_values, errs_galerkin;
+        lw=2, marker=:square, xscale=:log10, yscale=:log10,
+        label="Galerkin")
+
+    # Runtime
+    plot!(plt[2], mtheta_values, times_colloc;
+        lw=2, marker=:circle, xscale=:log10, yscale=:log10,
+        label="Collocation", xlabel="mθ = nzeta", ylabel="runtime [s]",
+        title="3D Vacuum: runtime vs mθ = nzeta")
+    plot!(plt[2], mtheta_values, times_galerkin;
+        lw=2, marker=:square, xscale=:log10, yscale=:log10,
+        label="Galerkin")
+
+    plot!(plt[1]; legend=:bottomleft)
+    plot!(plt[2]; legend=:topleft)
+
+    outpath = joinpath(@__DIR__, "vacuum_galerkin_3d.png")
+    savefig(plt, outpath)
+    println("\n3D results saved to $(outpath)")
+
+    return (; mtheta_values, times_colloc, times_galerkin, errs_colloc, errs_galerkin)
+end
+
+"""
+    main()
+
+Entry point when running this file as a script.
+
+Usage (from repository root):
+
+```bash
+julia --project=. benchmarks/benchmark_vacuum_galerkin.jl
+```
+
+Edit the `mtheta_values` and other keyword arguments in the calls below to
+explore different resolution ranges.
+"""
+function main()
+    # 2D Solovev example
+    example_2d = joinpath(@__DIR__, "..", "examples", "Solovev_ideal_example")
+
+    # 3D Solovev example
+    example_3d = joinpath(@__DIR__, "..", "examples", "Solovev_ideal_example_3D")
+
+    benchmark_vacuum_2d(
+        example_2d;
+        ψ=1.0,
+        mtheta_values=16 .* (2 .^ (0:9)),  # 16 → 8192 (easily editable)
+        mpert=31,
+        mlow=-15,
+        npert=1,
+        nlow=1
+    )
+
+    benchmark_vacuum_3d(
+        example_3d;
+        ψ=1.0,
+        mtheta_values=16 .* (2 .^ (0:3)), # 16 → 128 (easily editable)
+        mpert=31,
+        mlow=-15,
+        npert=1,
+        nlow=1
+    )
+
+    return nothing
+end
+
+if abspath(PROGRAM_FILE) == @__FILE__
+    main()
+end
diff --git a/examples/DIIID-like_ideal_example/gpec.toml b/examples/DIIID-like_ideal_example/gpec.toml
index 982f6817..f95e67dc 100644
--- a/examples/DIIID-like_ideal_example/gpec.toml
+++ b/examples/DIIID-like_ideal_example/gpec.toml
@@ -14,16 +14,6 @@ newq0 = 0                               # Override for on-axis safety factor (0
 etol = 1e-7                             # Error tolerance for equilibrium solver
 force_termination = false               # Terminate after equilibrium setup (skip stability calculations)
 
-[Wall]
-shape = "nowall"                        # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath)
-a = 0.2415                              # Distance from plasma (conformal) or shape parameter
-aw = 0.05                               # Half-thickness parameter for Dee-shaped walls
-bw = 1.5                                # Elongation parameter for wall shapes
-cw = 0                                  # Offset of wall center from major radius
-dw = 0.5                                # Triangularity parameter for wall shapes
-tw = 0.05                               # Sharpness of wall corners (try 0.05 as initial value)
-equal_arc_wall = true                   # Equal arc length distribution of nodes on wall
-
 [ForceFreeStates]
 bal_flag = false              # Ideal MHD ballooning criterion for short wavelengths
 mat_flag = true               # Construct coefficient matrices for diagnostic purposes
@@ -63,6 +53,16 @@ save_interval = 3            # Save every Nth ODE step (1=all, 10=every 10th). A
 singfac_min = 1e-4            # Fractional distance from rational q at which ideal jump enforced
 ucrit = 1e4                   # Maximum fraction of solutions allowed before re-normalized
 
+[Wall]
+shape = "nowall"                        # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath)
+a = 0.2415                              # Distance from plasma (conformal) or shape parameter
+aw = 0.05                               # Half-thickness parameter for Dee-shaped walls
+bw = 1.5                                # Elongation parameter for wall shapes
+cw = 0                                  # Offset of wall center from major radius
+dw = 0.5                                # Triangularity parameter for wall shapes
+tw = 0.05                               # Sharpness of wall corners (try 0.05 as initial value)
+equal_arc_wall = true                   # Equal arc length distribution of nodes on wall
+
 [ForcingTerms]
 forcing_data_file = "forcing.dat"       # Path to forcing data file (n, m, complex amplitude)
 forcing_data_format = "ascii"           # Format of forcing data: "ascii" or "hdf5"
diff --git a/examples/Solovev_ideal_example/gpec.toml b/examples/Solovev_ideal_example/gpec.toml
index 0065fde8..108c55bd 100644
--- a/examples/Solovev_ideal_example/gpec.toml
+++ b/examples/Solovev_ideal_example/gpec.toml
@@ -14,28 +14,6 @@ newq0 = 0                               # Override for on-axis safety factor (0
 etol = 1e-7                             # Error tolerance for equilibrium solver
 force_termination = false               # Terminate after equilibrium setup (skip stability calculations)
 
-
-[Wall]
-shape = "conformal"                     # Wall shape (nowall, conformal, elliptical, dee, mod_dee, filepath)
-a = 0.2415                              # Distance from plasma (conformal) or shape parameter
-aw = 0.05                               # Half-thickness parameter for Dee-shaped walls
-bw = 1.5                                # Elongation parameter for wall shapes
-cw = 0                                  # Offset of wall center from major radius
-dw = 0.5                                # Triangularity parameter for wall shapes
-tw = 0.05                               # Sharpness of wall corners (try 0.05 as initial value)
-equal_arc_wall = true                   # Equal arc length distribution of nodes on wall
-
-# [PerturbedEquilibrium]
-# # Uncomment this section to enable perturbed equilibrium calculations
-# forcing_data_file = "forcing.dat"      # Path to forcing data (n, m, real, imag)
-# forcing_data_format = "ascii"          # "ascii" or "hdf5"
-# fixed_boundary = false                 # Fixed boundary flag
-# output_eigenmodes = true               # Output mode fields as b-fields
-# compute_response = true                # Compute plasma response
-# compute_singular_coupling = true       # Compute singular coupling metrics
-# verbose = true                         # Enable verbose logging
-# write_outputs_to_HDF5 = true           # Write outputs to HDF5
-
 [ForceFreeStates]
 bal_flag = false              # Ideal MHD ballooning criterion for short wavelengths
 mat_flag = true               # Construct coefficient matrices for diagnostic purposes
@@ -75,7 +53,7 @@ ucrit = 1e3                   # Maximum fraction of solutions allowed before re-
 force_wv_symmetry = true      # Forces vacuum energy matrix symmetry
 save_interval = 3            # Save every Nth ODE step (1=all, 10=every 10th). Always saves near rational surfaces.
 
-[WALL]
+[Wall]
 shape = "conformal"           # String selecting wall shape ["nowall", "conformal", "elliptical", "dee", "mod_dee", "from_file"]
 a = 0.2415                    # The distance of the wall from the plasma in units of major radius (conformal), or minor radius parameter (others).
 aw = 0.05                     # Half-thickness of the wall.
diff --git a/examples/Solovev_ideal_example_3D/gpec.toml b/examples/Solovev_ideal_example_3D/gpec.toml
index d9e8b663..93d4e995 100644
--- a/examples/Solovev_ideal_example_3D/gpec.toml
+++ b/examples/Solovev_ideal_example_3D/gpec.toml
@@ -32,8 +32,8 @@ nn_high = 1                   # Largest toroidal mode number to include
 delta_mlow = 8                # Expands lower bound of Fourier harmonics
 delta_mhigh = 8               # Expands upper bound of Fourier harmonics
 delta_mband = 0               # Integration keeps only this wide a band...
-mthvac = 96                  # Number of points used in splines over poloidal angle at plasma-vacuum interface.
-nzvac = 64
+mthvac = 128                  # Number of points used in splines over poloidal angle at plasma-vacuum interface.
+nzvac = 128
 thmax0 = 1                    # Linear multiplier on the automatic choice of theta integration bounds
 
 kin_flag = false              # Kinetic EL equation (default: false)
@@ -63,14 +63,3 @@ cw = 0                                  # Offset of wall center from major radiu
 dw = 0.5                                # Triangularity parameter for wall shapes
 tw = 0.05                               # Sharpness of wall corners (try 0.05 as initial value)
 equal_arc_wall = false                   # Equal arc length distribution of nodes on wall
-
-# [PerturbedEquilibrium]
-# # Uncomment this section to enable perturbed equilibrium calculations
-# forcing_data_file = "forcing.dat"      # Path to forcing data (n, m, real, imag)
-# forcing_data_format = "ascii"          # "ascii" or "hdf5"
-# fixed_boundary = false                 # Fixed boundary flag
-# output_eigenmodes = true               # Output mode fields as b-fields
-# compute_response = true                # Compute plasma response
-# compute_singular_coupling = true       # Compute singular coupling metrics
-# verbose = true                         # Enable verbose logging
-# write_outputs_to_HDF5 = true           # Write outputs to HDF5
diff --git a/examples/Solovev_ideal_example_3D/run_example.jl b/examples/Solovev_ideal_example_3D/run_example.jl
new file mode 100644
index 00000000..fa0a9e53
--- /dev/null
+++ b/examples/Solovev_ideal_example_3D/run_example.jl
@@ -0,0 +1,4 @@
+using Pkg;
+Pkg.activate(joinpath(@__DIR__, "../.."))
+using GeneralizedPerturbedEquilibrium
+GeneralizedPerturbedEquilibrium.main([dirname(@__FILE__)])
diff --git a/src/ForceFreeStates/Free.jl b/src/ForceFreeStates/Free.jl
index 9c8ec89c..f0e4a9be 100644
--- a/src/ForceFreeStates/Free.jl
+++ b/src/ForceFreeStates/Free.jl
@@ -32,10 +32,7 @@ and data dumping.
 
     # Scale by (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126]
     singfac = vec((mlow:mhigh) .- qlim .* (nlow:nhigh)')
-    @inbounds for ipert in 1:numpert_total
-        @views vac_data.wv[ipert, :] .*= singfac[ipert]
-        @views vac_data.wv[:, ipert] .*= singfac[ipert]
-    end
+    @inbounds @views vac_data.wv .*= singfac .* singfac'
 
     # Compute complex energy eigenvalues and vectors
     vac_data.wt .= wp .+ vac_data.wv
@@ -75,13 +72,11 @@ and data dumping.
     # Compute plasma and vacuum contributions.
     # wpt = wt' * wp * wt  ; wvt = wt' * wv * wt
     mul!(tmp_mat, wp, vac_data.wt)
-    mul!(wpt, adjoint(vac_data.wt), tmp_mat)
+    mul!(wpt, vac_data.wt', tmp_mat)
     mul!(tmp_mat, vac_data.wv, vac_data.wt)
-    mul!(wvt, adjoint(vac_data.wt), tmp_mat)
-    for ipert in 1:numpert_total
-        vac_data.ep[ipert] = wpt[ipert, ipert]
-        vac_data.ev[ipert] = wvt[ipert, ipert]
-    end
+    mul!(wvt, vac_data.wt', tmp_mat)
+    vac_data.ep .= diag(wpt)
+    vac_data.ev .= diag(wvt)
 
     # Normalize eigenvectors based on scaled wt
     coeffs = odet.u[:, :, 1, end] \ (vac_data.wt .* (2π * equil.psio * 1e-3))
@@ -123,8 +118,8 @@ function free_compute_wv_spline(ctrl::ForceFreeStatesControl, equil::Equilibrium
     # TODO: 4 spline points is arbitrary - is there a better way?
     qedge = profiles.q_spline(ctrl.psiedge)
     npsi = max(4, ceil(Int, (intr.qlim - qedge) * intr.nhigh * 4))
-    psi_array = zeros(Float64, npsi + 1)
-    wv_array = zeros(ComplexF64, npsi + 1, intr.numpert_total, intr.numpert_total)
+    psi_array = zeros!(pool, Float64, npsi + 1)
+    wv_array = zeros!(pool, ComplexF64, npsi + 1, intr.numpert_total, intr.numpert_total)
 
     for i in 1:(npsi+1)
         # Space points evenly in q
@@ -143,10 +138,7 @@ function free_compute_wv_spline(ctrl::ForceFreeStatesControl, equil::Equilibrium
 
         # Apply singular factor scaling: (m - n*q)(m' - n'*q) [Chance Phys. Plasmas 1997 2161 eq. 126]
         singfac = vec((intr.mlow:intr.mhigh) .- qi .* (intr.nlow:intr.nhigh)')
-        @inbounds for ipert in 1:intr.numpert_total
-            @views wv[ipert, :] .*= singfac[ipert]
-            @views wv[:, ipert] .*= singfac[ipert]
-        end
+        @inbounds @views wv .*= singfac .* singfac'
 
         @views wv_array[i, :, :] .= wv
     end
diff --git a/src/PerturbedEquilibrium/SingularCoupling.jl b/src/PerturbedEquilibrium/SingularCoupling.jl
index b27da960..40a8c1ce 100644
--- a/src/PerturbedEquilibrium/SingularCoupling.jl
+++ b/src/PerturbedEquilibrium/SingularCoupling.jl
@@ -154,7 +154,7 @@ function compute_singular_coupling_metrics!(
             # Compute Green's functions at this surface for this n
             # TODO: This assumes an initial 2D equilibrum, getting 2D Green's functions for independent n
             vac_input = Vacuum.VacuumInput(equil, sing_surf.psifac, mtheta, 1, mpert, mlow, 1, nn)
-            _, grri, grre, _, _ = Vacuum.compute_vacuum_response(vac_input, wall_settings; green_only=true)
+            _, grri, grre, _, _ = Vacuum.compute_vacuum_response(vac_input, wall_settings)
 
             # Store in singular surface struct (overwrites for each n)
             ffs_intr.sing[s].grri = grri
diff --git a/src/Utilities/FourierTransforms.jl b/src/Utilities/FourierTransforms.jl
index 36a53e0b..ae3fe8ce 100644
--- a/src/Utilities/FourierTransforms.jl
+++ b/src/Utilities/FourierTransforms.jl
@@ -66,19 +66,16 @@ just use the n argument. In 3D, we need to compute the basis for all modes and g
 # Returns
 
 - 2D
-  - `cos_mn_basis::Matrix{Float64}`: Cosine coefficients `cos(m*θ - n*ν)` [mtheta, mpert]
-  - `sin_mn_basis::Matrix{Float64}`: Sine coefficients `sin(m*θ - n*ν)` [mtheta, mpert]
+  - `exp_mn_basis::Matrix{ComplexF64}`: Exponential coefficients `exp(i(m*θ - n*ν))` [mtheta, mpert]
 - 3D
-  - `cos_mn_basis::Matrix{Float64}`: Cosine coefficients `cos(m*θ - n*ν - n*ϕ)` [mtheta * nzeta, mpert * npert]
-  - `sin_mn_basis::Matrix{Float64}`: Sine coefficients `sin(m*θ - n*ν - n*ϕ)` [mtheta * nzeta, mpert * npert]
+  - `exp_mn_basis::Matrix{ComplexF64}`: Exponential coefficients `exp(i(m*θ - n*ν - n*ϕ))` [mtheta * nzeta, mpert * npert]
 
 # Notes
 
 The theta and phi grids are uniform: `θᵢ = 2π*i/mtheta` for `i = 0:mtheta-1` and `ϕⱼ = 2π*j/nzeta` for `j = 0:nzeta-1`
 
 When `n=0, ν=0` (default), this reduces to simple harmonic basis:
-- `cos_mn_basis[i,l] = cos(m*θᵢ)`
-- `sin_mn_basis[i,l] = sin(m*θᵢ)`
+- `exp_mn_basis[i,l] = exp(i(m*θᵢ))`
 """
 function compute_fourier_coefficients(
     mtheta::Int,
@@ -100,17 +97,15 @@ function compute_fourier_coefficients(
         @assert length(ν) == mtheta "ν must have length mtheta"
 
         # In 2D, we only use one toroidal mode at a time
-        # Compute sin(mθ - nν) and cos(mθ - nν)
-        sin_mn_basis = sin.((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν)
-        cos_mn_basis = cos.((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν)
+        # Compute exp(i(mθ - nν))
+        exp_mn_basis = exp.(im .* ((mlow .+ (0:(mpert-1))') .* θ_grid .- n_2D .* ν))
     else # 3D
         @assert (n_2D === nothing && ν === nothing) "n_2D and ν should be nothing for 3D"
 
         # In 3D, we need to compute the basis for all modes and grid points
-        # Compute sin(mθ - nζ) and cos(mθ - nζ)
+        # Compute exp(i(mθ - nζ))
         ζ_grid = range(; start=0, length=nzeta, step=2π/nzeta)
-        sin_mn_basis = zeros(mtheta * nzeta, mpert * npert)
-        cos_mn_basis = zeros(mtheta * nzeta, mpert * npert)
+        exp_mn_basis = zeros(ComplexF64, mtheta * nzeta, mpert * npert)
         for idx_n in 1:npert
             n = nlow + idx_n - 1
             n_col_offset = (idx_n - 1) * mpert
@@ -119,16 +114,13 @@ function compute_fourier_coefficients(
                 col = idx_m + n_col_offset
                 for (j, ζ) in enumerate(ζ_grid), (i, θ) in enumerate(θ_grid)
                     idx = i + (j-1)*mtheta
-                    arg = m * θ - n * ζ
-                    s, c = sincos(arg)
-                    cos_mn_basis[idx, col] = c
-                    sin_mn_basis[idx, col] = s
+                    exp_mn_basis[idx, col] = exp(im * (m * θ - n * ζ))
                 end
             end
         end
     end
 
-    return cos_mn_basis, sin_mn_basis
+    return exp_mn_basis
 end
 
 """
@@ -207,8 +199,8 @@ function FourierTransform(
     n::Int=0,
     ν::Vector{Float64}=zeros(Float64, mtheta)
 )
-    cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, 1, 1, 1; n_2D=n, ν=ν)
-    return FourierTransform(mtheta, mpert, mlow, cos_mn_basis, sin_mn_basis)
+    exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, 1, 1, 1; n_2D=n, ν=ν)
+    return FourierTransform(mtheta, mpert, mlow, real(exp_mn_basis), imag(exp_mn_basis))
 end
 
 """
diff --git a/src/Vacuum/Kernel2D.jl b/src/Vacuum/Kernel2D.jl
index eac91b01..17748804 100644
--- a/src/Vacuum/Kernel2D.jl
+++ b/src/Vacuum/Kernel2D.jl
@@ -59,56 +59,124 @@ const GL8_LAGRANGE_STENCILS = precompute_lagrange_stencils(GL8.x)
 # and per-n sinh/cosh cache are defined in PnQuadCache.jl.
 
 """
-    kernel!(grad_greenfunction, greenfunction, observer, source, n)
+    compute_2D_kernel_matrices!(K, G, observer, source, n, Z, Gram)
 
-Compute kernels of integral equation for Laplace's equation in a torus.
-**WARNING: This kernel only supports closed toroidal walls currently.
-The residue calculation needs to be updated for open walls.**
+Compute the **Fourier/Galerkin-projected** 2D vacuum boundary-integral kernel blocks for
+Laplace’s equation in an axisymmetric torus, **without ever forming the dense
+`M×M` “point-to-point” kernel matrices.
 
-# Arguments
+This is the fused “evaluate kernel + project” path that the vacuum solver uses:
 
-  - `grad_greenfunction`: Gradient Green's function matrix (output)
-  - `greenfunction`: Green's function matrix (output)
-  - `observer`: Observer geometry struct (PlasmaGeometry or WallGeometry)
-  - `source`: Source geometry struct (PlasmaGeometry or WallGeometry)
-  - `n`: Toroidal mode number
+    Kc = Zᴴ * K * Z
+    Gc = Zᴴ * G
 
-# Returns
+where:
 
-Modifies `grad_greenfunction` and `greenfunction` in place.
-Note that greenfunction is zeroed each time this function is called,
-but grad_greenfunction is not since it fills a different block of the
-(2 * mtheta, 2 * mtheta) depending on the source/observer.
+  - `K` is the **double-layer** kernel (normal derivative of the Green’s function),
+  - `G` is the **single-layer** kernel (Green’s function itself; only needed for plasma-as-source),
+  - `Z ∈ ℂ^{M×P}` is the complex Fourier basis on the poloidal grid,
+    and `Zᴴ` is its conjugate transpose.
 
-# Notes
+Rather than computing a full kernel row `K[j, :]` and then multiplying by `Z`, this routine
+projects **on the fly**:
+
+  - For each observer node `j`, it accumulates the projected row-vector
+    `proj_k = (K[j,:] * weights) · Z` and (optionally) `proj_g = (G[j,:] * weights) · Z`
+    into length-`P` work buffers.
+
+  - It then performs a **rank-1 update** into the appropriate projected block:
+
+        Kc += conj(Z[j, :])' * proj_k
+        Gc += conj(Z[j, :])' * proj_g
+
+This reduces peak memory from `O(M^2)` to `O(MP + P^2)` while keeping the same
+mathematical discretization.
+
+## Arguments
+
+  - `Kc`: Complex global projected double-layer kernel matrix.
+  - `Gc`: Complex global projected single-layer kernel matrix.
+  - `observer`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`.
+  - `source`: `PlasmaGeometry` or `WallGeometry` object providing `x(θ)` and `z(θ)`.
+  - `n`: Integer representing the order of the toroidal Fourier component.
+  - `Z`: Complex Fourier basis sampled on the `θ` grid.
+  - `Gram`: Diagonal of the mode-space Gram matrix for this basis on the discrete grid.
+
+## Block layout
+
+`Kc` and `Gc` are the complex global projected matrices. `Kc` contains four blocks corresponding to
+plasma/wall as observer/source, `Gc` contains two blocks corresponding to plasma/wall as observer.
+This function writes **only one block** to each of `Kc` and `Gc` per call:
+
+  - `Kc_block` is a `P×P` view into `Kc` selected by `(observer isa PlasmaGeometry ? 1 : 2, source isa PlasmaGeometry ? 1 : 2)`.
+  - `Gc_block` is a `P×(2P)` view into `Gc` with the same observer block-row; only the columns
+    corresponding to the source being plasma are populated (when `source isa PlasmaGeometry`).
+
+## Toroidal Green's functions
+
+  - The scalar `G_n` returned by `green(...)` is `2π * 𝒢ⁿ(θ, θ′)` (Chance 1997),
+    the `n`-th toroidal Fourier component of the Laplace Green’s function in axisymmetry.
+  - The scalar `gradG_n` returned by `green(...)` corresponds to the toroidal-mode `n`
+    contribution to the **double-layer** integrand `𝒥 * (∇′𝒢ⁿ · ∇′ℒ)`
+    (Chance 1997), i.e. the normal-derivative factor multiplied by the geometric Jacobian.
+  - `gradG_0` is the `n = 0` piece used for analytic diagonal/singularity bookkeeping.
+
+## Numerical treatment of the singularity
+
+The toroidal Green’s function kernel is weakly singular as `θ′ → θ`. The implementation follows
+Chance *Phys. Plasmas* **4**, 2161 (1997) and uses a mixed strategy:
+
+  - **Far field (nonsingular region)**: composite Simpson’s `1/3` rule on the uniform `θ` grid,
+    skipping the near-singular stencil around `j`.
+  - **Near field (singular panels)**: 8-point Gauss–Legendre quadrature on the two panels
+    of length `dtheta` immediately to the left/right of `θ_j`, using:
+      + periodic cubic splines of `source.x(θ)` and `source.z(θ)` to evaluate geometry at off-grid nodes,
+      + precomputed 5-point Lagrange stencils to map each Gauss node back to the five neighboring
+        discrete source indices `[j-2, j-1, j, j+1, j+2]` *without allocations*,
+      + analytic logarithmic/singular correction terms (`log_correction_array`) for the single-layer
+        kernel when the observer/source block is plasma–plasma (Chance 1997, e.g. eqs. 75, 78),
+      + an analytic diagonal/residue correction for the double-layer kernel (Chance 1997, Table I / residues).
 
-  - Uses Simpson's rule for integration away from singular points
-  - Uses Gaussian quadrature near singular points for improved accuracy
-  - Implements analytical singularity removal [Chance Phys. Plasmas 1997 2161]
+## Performance and allocation avoidance (hot-path optimizations)
+
+This routine is intentionally written to be allocation-light in tight loops:
+
+  - **Precomputed quadrature tables**: `GL8` and `GL8_LAGRANGE_STENCILS` are global constants.
+  - **Hoisted n-dependent constants**: `gamma_prefactor = 2√π * Γ(1/2 - n)` is computed once per call
+    and passed into `green(...)` rather than recomputed per quadrature node.
+  - **Spline derivative batching**: `∂R/∂θ` and `∂Z/∂θ` are evaluated on the full grid once (for Simpson),
+    while off-grid Gauss points evaluate splines directly.
+  - **Projection reuse**: the transpose `Zt = transpose(Z)` is materialized so that “row-accumulate”
+    operations `_accum_row!` can access `Z` with contiguous column memory.
+  - **Rank-1 assembly**: the final projected update uses `conj(Z[j,:]) ⊗ proj_*` via `_rank1_conj!`,
+    avoiding constructing intermediate `P×P` temporaries.
+
+## Caveats / limitations
+
+  - **Closed-wall assumption**: the current residue/diagonal handling is written for closed,
+    periodic toroidal boundaries. Open-wall residue logic is not implemented.
 """
 @with_pool pool function compute_2D_kernel_matrices!(
-    grad_greenfunction::AbstractMatrix{Float64},
-    greenfunction::AbstractMatrix{Float64},
+    Kc::AbstractMatrix{ComplexF64},
+    Gc::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
-    n::Int
+    n::Int,
+    Z::AbstractMatrix{ComplexF64},
+    Gram::AbstractVector{ComplexF64}
 )
 
-    mtheta = length(observer.x)
-    dtheta = 2π / mtheta
-    theta_grid = range(; start=0, length=mtheta, step=dtheta)
+    M, P = size(Z) # M = mtheta, P = num_modes
+    Zt = Matrix{ComplexF64}(transpose(Z))  # [P × M] for contiguous column access
+    dtheta = 2π / M
+    theta_grid = range(; start=0, length=M, step=dtheta)
 
     # Take a view of the corresponding block of the grad_greenfunction
-    col_index = (source isa PlasmaGeometry ? 1 : 2)
-    row_index = (observer isa PlasmaGeometry ? 1 : 2)
-    grad_greenfunction_block = view(
-        grad_greenfunction,
-        ((row_index-1)*mtheta+1):(row_index*mtheta),
-        ((col_index-1)*mtheta+1):(col_index*mtheta)
-    )
+    col_idx = (source isa PlasmaGeometry ? 1 : 2)
+    row_idx = (observer isa PlasmaGeometry ? 1 : 2)
+    Kc_block = view(Kc, ((row_idx-1)*P+1):(row_idx*P), ((col_idx-1)*P+1):(col_idx*P))
+    Gc_block = view(Gc, ((row_idx-1)*P+1):(row_idx*P), :)
 
-    # Zero out greenfunction at start of each kernel call
-    fill!(greenfunction, 0.0)
     # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997)
     populate_greenfunction = source isa PlasmaGeometry
 
@@ -119,7 +187,7 @@ but grad_greenfunction is not since it fills a different block of the
     log_correction_array = SVector(log_correction_2, log_correction_1, log_correction_0, log_correction_1, log_correction_2)
 
     # Precompute the n-dependent prefactor 2√π·Γ(1/2-n) [Chance Phys. Plasmas 1997 2161 eq. 40]
-    # This is constant for all source/observer point pairs within this kernel call.
+    # This constant is only computed once for each n
     gamma_prefactor = 2 * sqrt(π) * gamma(0.5 - n)
 
     # Set up periodic splines used for off-grid Gaussian quadrature points
@@ -134,42 +202,59 @@ but grad_greenfunction is not since it fills a different block of the
 
     # Precompute source derivatives on the theta grid once used in Simpson integration
     # The Gaussian singular-panel points are off-grid, so those still use spline evaluation directly.
-    dx_dtheta_grid = acquire!(pool, eltype(source.x), mtheta)
-    dz_dtheta_grid = acquire!(pool, eltype(source.z), mtheta)
+    dx_dtheta_grid = acquire!(pool, eltype(source.x), M)
+    dz_dtheta_grid = acquire!(pool, eltype(source.z), M)
 
     # Call in-place API to avoid allocations
     d1_spline_x(dx_dtheta_grid, theta_grid)
     d1_spline_z(dz_dtheta_grid, theta_grid)
 
+    # Pre-allocated Legendre buffer (hoisted out of green() to avoid per-call pool acquisition)
+    legendre_buf = acquire!(pool, Float64, n + 2)
+
+    # Per-observer projection vectors: proj = (kernel row) · Z
+    proj_k = zeros!(pool, ComplexF64, P)
+    proj_g = zeros!(pool, ComplexF64, P)
+
     # Loop through observer points
-    for j in 1:mtheta
+    for j in 1:M
         # Get observer coordinates
         x_obs, z_obs, theta_obs = observer.x[j], observer.z[j], theta_grid[j]
 
-        # Perform Simpson integration for nonsingular source points
+        # Zero out projection terms
+        fill!(proj_k, 0.0)
+        fill!(proj_g, 0.0)
+        diag_accum = 0.0
+
+        # ============================================================
+        # FAR FIELD: Simpson integration for nonsingular source points
+        # ============================================================
         # Nonsingular region endpoints are at j±2, so exclude j-1, j, and j+1.
-        @inbounds for k in 1:(mtheta-3)
-            isrc = mod1(j + 1 + k, mtheta)
-            G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n; gamma_prefactor)
+        @inbounds for k in 1:(M-3)
+            isrc = mod1(j + 1 + k, M)
+            G_n, gradG_n, gradG_0 = green(x_obs, z_obs, source.x[isrc], source.z[isrc], dx_dtheta_grid[isrc], dz_dtheta_grid[isrc], n, legendre_buf; gamma_prefactor)
 
             # Composite Simpson's 1/3 rule weights, excluding singular points
             # Note we set to 4 for even/2 for odd since we index from 1 while the formula assumes indexing from 0
-            wsimpson = dtheta / 3 * ((k == 1 || k == mtheta - 3) ? 1 : (iseven(k) ? 4 : 2))
+            wsimpson = dtheta / 3 * ((k == 1 || k == M - 3) ? 1 : (iseven(k) ? 4 : 2))
 
-            # Sum contributions to Green's function matrices using Simpson weight
+            # Sum and project contributions to Green's function matrices
             if populate_greenfunction
-                greenfunction[j, isrc] += G_n * wsimpson
+                _accum_row!(proj_g, G_n * wsimpson, Zt, isrc)
             end
-            grad_greenfunction_block[j, isrc] += gradG_n * wsimpson
+            _accum_row!(proj_k, gradG_n * wsimpson, Zt, isrc)
             # Subtract regular integral component of δⱼᵢK⁰ [Chance Phys. Plasmas 1997 2161 eq. 83]
-            grad_greenfunction_block[j, j] -= gradG_0 * wsimpson
+            diag_accum -= gradG_0 * wsimpson
         end
 
-        # Perform Gaussian quadrature for singular points (source = obs point)
+        # ============================================================
+        # NEAR FIELD: Gaussian quadrature with singular correction
+        # ============================================================
         # Indices of the singularity region, [j-2, j-1, j, j+1, j+2] (allocation-free)
         for (offset_idx, offset) in enumerate(-2:2)
-            sing_idx[offset_idx] = mod1(j + offset + mtheta, mtheta)
+            sing_idx[offset_idx] = mod1(j + offset + M, M)
         end
+
         # Integrate region of length 2 * dtheta on left/right of singularity
         for leftpanel in (true, false)
             gauss_mid = theta_obs + (leftpanel ? -dtheta : dtheta)
@@ -181,7 +266,7 @@ but grad_greenfunction is not since it fills a different block of the
                 dx_dtheta_gauss = d1_spline_x(theta_gauss0)
                 z_gauss = spline_z(theta_gauss0)
                 dz_dtheta_gauss = d1_spline_z(theta_gauss0)
-                G_n, gradG_n, gradG_0 = green(x_obs, z_obs, x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n; gamma_prefactor)
+                G_n, gradG_n, gradG_0 = green(x_obs, z_obs, x_gauss, z_gauss, dx_dtheta_gauss, dz_dtheta_gauss, n, legendre_buf; gamma_prefactor)
 
                 # Get stencil and weight for the Gaussian point
                 s = leftpanel ? stencils_left[ig] : stencils_right[ig]
@@ -194,54 +279,71 @@ but grad_greenfunction is not since it fills a different block of the
                         G_n += log((theta_obs - theta_gauss)^2) / x_obs
                     end
                     @inbounds for stencil_idx in 1:5
-                        greenfunction[j, sing_idx[stencil_idx]] += G_n * s[stencil_idx] * wgauss
+                        _accum_row!(proj_g, G_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx])
                     end
                 end
 
                 # Second type of singularity: 𝒦ⁿ [Chance Phys. Plasmas 1997 2161 eq. 83, 86]
                 @inbounds for stencil_idx in 1:5
-                    grad_greenfunction_block[j, sing_idx[stencil_idx]] += gradG_n * s[stencil_idx] * wgauss
+                    _accum_row!(proj_k, gradG_n * s[stencil_idx] * wgauss, Zt, sing_idx[stencil_idx])
                 end
                 # Subtract off the diverging singular n=0 component
-                grad_greenfunction_block[j, j] -= gradG_0 * wgauss
+                diag_accum -= gradG_0 * wgauss
             end
         end
 
         # Subtract off analytic singular integral [Chance Phys. Plasmas 1997 2161 eq. 75] if plasma-plasma block
         if populate_greenfunction && observer isa PlasmaGeometry
             @inbounds for stencil_idx in 1:5
-                greenfunction[j, sing_idx[stencil_idx]] -= log_correction_array[stencil_idx] / x_obs
+                _accum_row!(proj_g, -log_correction_array[stencil_idx] / x_obs, Zt, sing_idx[stencil_idx])
             end
         end
+
+        # Project the n=0 diagonal accumulation
+        _accum_row!(proj_k, diag_accum, Zt, j)
+
+        # ── Rank-1 accumulate: K/G += conj(Z[j,:]) ⋅ proj_k/g ──
+        _rank1_conj!(Kc_block, Zt, j, proj_k)
+        if populate_greenfunction
+            _rank1_conj!(Gc_block, Zt, j, proj_g)
+        end
     end
 
     # Normals need to point outward from vacuum region. In VACUUM clockwise θ convention, normal points
     # out of vacuum for wall but inward for plasma, so we multiply by -1 for plasma sources
     if source isa PlasmaGeometry
-        grad_greenfunction_block .*= -1
+        Kc_block .*= -1
     end
 
     # Add analytic singular integral (second type) to block diagonal [Chance Phys. Plasmas 1997 2161 Table I, eq. 69, 89]
+    # The residue term is diagonal in mode space and is scaled by the Gram diagonal.
     residue = (observer isa WallGeometry) ? 0.0 : (source isa PlasmaGeometry ? 2.0 : -2.0)
-    @inbounds for i in 1:mtheta
-        grad_greenfunction_block[i, i] += residue
+    @inbounds for p in 1:P
+        Kc_block[p, p] += residue * Gram[p]
     end
 
     # Since we computed 2π𝒢, divide by 2π to get 𝒢
     if populate_greenfunction
-        greenfunction ./= 2π
+        Gc_block ./= 2π
     end
 end
 
-# Dispatch wrapper for unified 2D/3D vacuum: forwards to 5-arg compute_2D_kernel_matrices! with params.n
+"""
+    kernel!(Kc, Gc, observer, source, params::KernelParams2D, Z, Gram)
+
+Public 2D kernel entry point. This is a thin wrapper that forwards to
+`compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram)`.
+"""
 function kernel!(
-    grad_greenfunction::AbstractMatrix{Float64},
-    greenfunction::AbstractMatrix{Float64},
+    Kc::AbstractMatrix{ComplexF64},
+    Gc::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry,WallGeometry},
     source::Union{PlasmaGeometry,WallGeometry},
-    params::KernelParams2D
+    params::KernelParams2D,
+    Z::AbstractMatrix{ComplexF64},
+    Gram::AbstractVector{ComplexF64}
 )
-    return compute_2D_kernel_matrices!(grad_greenfunction, greenfunction, observer, source, params.n)
+    return compute_2D_kernel_matrices!(Kc, Gc, observer, source, params.n, Z, Gram)
 end
 
 #############################################################
@@ -639,19 +741,22 @@ according to equations (36)-(42) of Chance 1997. Replaces `green` from Fortran c
   - Implements analytical derivatives from Chance 1997 equations
   - The coupling terms include the Jacobian factor from the coordinate transformation
   - By default uses the 2007 Legendre function implementation (Bulirsch + Gaussian integration)
+
+An overload accepting a pre-allocated `legendre_buf::Vector{Float64}` of length `n+2` is available.
+Callers in tight loops should allocate this buffer once and pass it in to avoid per-call pool acquisition.
 """
-@with_pool pool function green(
+function green(
     x_obs::Float64,
     z_obs::Float64,
     x_source::Float64,
     z_source::Float64,
     dx_dtheta::Float64,
     dz_dtheta::Float64,
-    n::Int;
+    n::Int,
+    legendre::AbstractVector{Float64};
     gamma_prefactor::Float64=2 * sqrt(π) * gamma(0.5 - n),
     uselegacygreenfunction::Bool=false
 )
-
     x_obs2 = x_obs^2
     x_source2 = x_source^2
     x_minus2 = (x_obs - x_source)^2
@@ -670,9 +775,7 @@ according to equations (36)-(42) of Chance 1997. Replaces `green` from Fortran c
     # Argument of Legendre function 𝘴 [Chance Phys. Plasmas 1997 2161 eq. 42]
     s = (x_obs2 + x_source2 + ζ2) / R2
 
-    # Legendre functions for
-    # P⁰ = p0, P¹ = p1, Pⁿ = pn, Pⁿ⁺¹ = pnp1
-    legendre = acquire!(pool, Float64, n + 2)
+    # Legendre functions: P⁰ = p0, P¹ = p1, Pⁿ = pn, Pⁿ⁺¹ = pnp1
     if uselegacygreenfunction
         Pn_minus_half_1997!(legendre, s, n)
     else
diff --git a/src/Vacuum/Kernel3D.jl b/src/Vacuum/Kernel3D.jl
index df28dd45..dead2b31 100644
--- a/src/Vacuum/Kernel3D.jl
+++ b/src/Vacuum/Kernel3D.jl
@@ -177,74 +177,35 @@ function get_singular_quadrature(PATCH_RAD::Int, RAD_DIM::Int, INTERP_ORDER::Int
 end
 
 """
-    laplace_single_layer(x_obs, x_src) -> Float64
+    laplace_kernel(ox, oy, oz, sx, sy, sz, nx, ny, nz) -> (single, double)
 
-Evaluate the Laplace single-layer (FxU) kernel between two 3D points. Returns
-0.0 if the observation point coincides with the source point to avoid singularity.
+Fused scalar-argument Laplace kernels for the 3D vacuum BIE.
 
-The single-layer kernel φ is the fundamental solution to Laplace's equation:
+Returns a tuple `(single, double)` where:
 
-```
-φ(x_obs, x_src) = 1 / |x_obs - x_src|
-```
+  - `single = 1/r` is the single-layer kernel
+  - `double = (Δx⋅n)/r^3` is the double-layer kernel
 
-# Arguments
-
-  - `x_obs`: Observation point (3D Cartesian coordinates, any AbstractVector)
-  - `x_src`: Source point (3D Cartesian coordinates, any AbstractVector)
-
-# Returns
-
-  - `Float64`: Kernel value φ(x_obs, x_src)
-"""
-function laplace_single_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real})
-    @inbounds begin
-        dx = x_obs[1] - x_src[1]
-        dy = x_obs[2] - x_src[2]
-        dz = x_obs[3] - x_src[3]
-    end
-    r2 = dx*dx + dy*dy + dz*dz
-    r2 < 1e-30 && return 0.0
-    return inv(sqrt(r2))
-end
-
-"""
-    laplace_double_layer(x_obs, x_src, n_src) -> Float64
-
-Evaluate the Laplace double-layer (DxU) kernel between a point and a surface element. Returns
-0.0 if the observation point coincides with the source point to avoid singularity. Allocation-free
-scalar arithmetic is used for maximum performance.
-
-The double-layer kernel K is the normal derivative of the fundamental solution:
-
-```
-K(x_obs, x_src, n_src) = ∇_{x_src} φ · n_src = (x_obs - x_src) · n_src / |x_obs - x_src|³
-```
-
-# Arguments
-
-  - `x_obs`: Observation point (3D Cartesian coordinates, any AbstractVector)
-  - `x_src`: Source point on surface (3D Cartesian coordinates, any AbstractVector)
-  - `n_src`: Outward UNIT normal at source point (must be normalized!, any AbstractVector)
-
-# Returns
-
-  - `Float64`: Kernel value K(x_obs, x_src, n_src)
+This is used when `compute_3D_kernel_matrices!` needs **both** kernels for the same pair, so the
+distance computation (`sqrt(r²)`) is shared. Returns `(0.0, 0.0)` when `r² < 1e-30`.
 """
-function laplace_double_layer(x_obs::AbstractVector{<:Real}, x_src::AbstractVector{<:Real}, n_src::AbstractVector{<:Real})
-    @inbounds begin
-        dx = x_obs[1] - x_src[1]
-        dy = x_obs[2] - x_src[2]
-        dz = x_obs[3] - x_src[3]
-        nx = n_src[1]
-        ny = n_src[2]
-        nz = n_src[3]
-    end
+@fastmath @inline function laplace_kernel(
+    ox::Float64, oy::Float64, oz::Float64,
+    sx::Float64, sy::Float64, sz::Float64,
+    nx::Float64, ny::Float64, nz::Float64
+)
+    dx = ox - sx
+    dy = oy - sy
+    dz = oz - sz
     r2 = dx*dx + dy*dy + dz*dz
-    r2 < 1e-30 && return 0.0
+    r2 < 1e-30 && return (0.0, 0.0)
     rinv = inv(sqrt(r2))
+    # single-layer: 1/r
+    single = rinv
+    # double-layer: (Δx·n)/r^3
     r3inv = rinv * rinv * rinv
-    return (dx*nx + dy*ny + dz*nz) * r3inv
+    double = (dx*nx + dy*ny + dz*nz) * r3inv
+    return (single, double)
 end
 
 """
@@ -281,8 +242,7 @@ end
     interpolate_to_polar!(polar_data, patch, quad_data)
 
 Interpolate Cartesian patch data to polar quadrature points using sparse matrix multiply.
-Overwrites `polar_data` using mul! function arguments, mul!(C, A, B, α, β) -> C where
-C = α * A * B + β * C.
+Overwrites `polar_data` using mul! function arguments, mul!(C, A, B) -> C where C = A * B.
 
 # Arguments
 
@@ -291,17 +251,17 @@ C = α * A * B + β * C.
   - `P2G`: Sparse interpolation matrix
 """
 function interpolate_to_polar!(polar_data::Array{Float64,3}, patch::Array{Float64,3}, P2G::SparseMatrixCSC{Float64,Int})
-    # Flatten patch to (Ngrid × dof), apply P2G' to get (Npolar × dof)
     patch_flat = reshape(patch, :, size(patch, 3))
-    mul!(reshape(polar_data, :, size(patch, 3)), P2G', patch_flat, 1.0, 0.0)
+    mul!(reshape(polar_data, :, size(patch, 3)), P2G', patch_flat)
 end
 
 """
-    compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar)
+    compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, normal_orient)
 
 Compute normal vector (= ∂r/∂θ × ∂r/∂ζ) at polar quadrature points from interpolated tangent vectors.
 We already scaled the normals by normal_orient in the geometry construction, so we need to reapply
-that here since we are recomputing the normals from the derivatives.
+that here since we are recomputing the normals from the derivatives. We use inline cross products
+to avoid slice allocation.
 
 # Arguments
 
@@ -311,7 +271,6 @@ that here since we are recomputing the normals from the derivatives.
   - `normal_orient`: Multiplier applied to normals to make them orient out of vacuum region (+1 or -1)
 """
 function compute_polar_normal!(n_polar::Array{Float64,3}, dr_dθ::Array{Float64,3}, dr_dζ::Array{Float64,3}, normal_orient::Int)
-    # Inline cross product to avoid slice allocation
     @inbounds for ia in axes(dr_dθ, 2), ir in axes(dr_dθ, 1)
         n_polar[ir, ia, 1] = dr_dθ[ir, ia, 2] * dr_dζ[ir, ia, 3] - dr_dθ[ir, ia, 3] * dr_dζ[ir, ia, 2]
         n_polar[ir, ia, 2] = dr_dθ[ir, ia, 3] * dr_dζ[ir, ia, 1] - dr_dθ[ir, ia, 1] * dr_dζ[ir, ia, 3]
@@ -362,63 +321,128 @@ function KernelWorkspace(PATCH_DIM::Int, RAD_DIM::Int, ANG_DIM::Int)
 end
 
 """
-    compute_3D_kernel_matrices!(grad_greenfunction, greenfunction, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER)
+    compute_3D_kernel_matrices!(K, G, observer, source, PATCH_RAD, RAD_DIM, INTERP_ORDER, Z, Gram)
 
-Compute boundary integral kernel matrices for 3D geometries with the singular correction
-algorithm from [Malhotra Plasma Phys. and Cont. Fusion 2019 024004].
-Uses multi-threading for parallel computation over observer points.
+Compute the **Fourier/Galerkin-projected** 3D vacuum boundary-integral kernel blocks for
+Laplace’s equation, using a high-order singular quadrature / partition-of-unity (POU)
+scheme on a tensor-product `(θ, ζ)` surface grid.
 
-  - Far regions: Rectangle rule with uniform weights (1/N)
-  - Singular regions: Polar quadrature with partition-of-unity blending
+Like the 2D kernel, this routine implements the **fused projection path** used by the vacuum solve:
+it produces the projected operators in mode space **without materializing a dense**
+`N×N` point-space kernel (where `N = mtheta * nzeta`).
 
-grad_greenfunction is the double-layer kernel matrix, where each entry is
-∇_{x_src} φ(x_obs, x_src) · n_src, and greenfunction is the single-layer kernel matrix,
-where each entry is φ(x_obs, x_src).
+## Mathematical object being discretized
 
-# Arguments
+Let `x(θ, ζ) ∈ ℝ^3` be a surface parametrization (plasma or wall surface) with outward
+unit normal `n(θ, ζ)`. The Laplace kernels are:
+
+  - **Single-layer**: `φ(x_obs, x_src) = 1 / |x_obs - x_src|`
+  - **Double-layer**: `∂φ/∂n_src = ∇_{x_src} φ ⋅ n_src = (x_obs - x_src) ⋅ n_src / |x_obs - x_src|^3`
+
+This routine computes the *discrete, projected* operators corresponding to these kernels,
+using a uniform quadrature weight `dθdζ = 4π^2 / N` for the far field and a specialized
+near-field correction for the singular region.
+
+## Arguments and block layout
+
+  - `Kc`: Complex global projected double-layer kernel matrix (2P×2P).
+  - `Gc`: Complex global projected single-layer kernel matrix (2P×P).
+  - `observer`: `PlasmaGeometry3D` or `WallGeometry3D` object providing geometry data.
+  - `source`: `PlasmaGeometry3D` or `WallGeometry3D` object providing geometry data.
+  - `PATCH_RAD`: Half-width of the singular patch in grid points. Must satisfy `PATCH_RAD ≤ (min(source.mtheta, source.nzeta) - 1) ÷ 2` to avoid errors.
+  - `RAD_DIM`: Radial quadrature order on the polar grid (angular order is `2*RAD_DIM`).
+  - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`).
+  - `Z`: Complex Fourier basis sampled on the surface grid, shaped `N×P` (`P = number of retained modes`). `Z[idx, :]` contains the basis values at the surface node `idx`.
+  - `Gram`: Diagonal of the mode-space Gram matrix used to add the analytic “identity” term when `typeof(source) == typeof(observer)` (i.e. the same operator block that receives the Green’s-identity diagonal contribution).
+
+This routine fills exactly one `P×P` block view `Kc_block` (and optionally the corresponding `Gc_block`)
+selected by whether observer/source are plasma or wall.
 
-  - `grad_greenfunction`: Double-layer kernel matrix (Nobs × Nsrc) filled in place
+## Numerical treatment of the singularity
 
-  - `greenfunction`: Single-layer kernel matrix (Nobs × Nsrc) filled in place
-  - `observer`: Observer geometry (PlasmaGeometry3D)
-  - `source`: Source geometry (PlasmaGeometry3D)
-  - `PATCH_RAD`: Number of points adjacent to source point to treat as singular
+The kernel is weakly singular as `x_src → x_obs`. The implementation follows the
+approach used in [Malhotra Journal of Comp. Phys. 2019 108791 eq. 38]
 
-      + Total patch size in # of gridpoints = (2 * PATCH_RAD + 1) x (2 * PATCH_RAD + 1)
-  - `RAD_DIM`: Polar radial quadrature order. Angular order = 2 * RAD_DIM
-  - `INTERP_ORDER`: Lagrange interpolation order
+  - **Far field** (nonsingular sources):
 
-      + Must be ≤ (2 * PATCH_RAD + 1)
+      + Use a uniform trapezoidal/rectangle rule on the `(θ, ζ)` grid.
+      + For each observer point, a square patch of size `PATCH_DIM = 2*PATCH_RAD+1`
+        surrounding the singularity is excluded from the far-field sum.
 
-# Threading
+  - **Near field** (singular patch):
 
-This function automatically uses all available threads (`Threads.nthreads()`).
-Start Julia with `julia -t auto` or set `JULIA_NUM_THREADS` to enable multi-threading.
+      + Extract a Cartesian `PATCH_DIM×PATCH_DIM` patch of the source geometry around the
+        observer-aligned source index.
+      + Interpolate the patch to a **polar quadrature grid** (`RAD_DIM × ANG_DIM`, with `ANG_DIM=2*RAD_DIM`)
+        using a precomputed sparse interpolation operator `P2G` built from tensor-product
+        Lagrange stencils (`INTERP_ORDER` controls the stencil width).
+      + Evaluate kernels on the polar grid and weight them with a **partition-of-unity**
+        quadrature factor `Ppou` that includes the polar Jacobian factor (roughly `r * dr * dθ`)
+        and a smooth cutoff function `χ(ρ)` that localizes the singular correction.
+      + Map the polar correction back onto the Cartesian patch via `P2G` and blend with the
+        far-field trapezoid contribution using `Gpou`, so the combined weight is effectively
+        `trap*(1-χ) + singular_correction`.
+
+## Fused projection and threading
+
+This function is written to be parallel over observer points:
+
+  - Each thread owns a `KernelWorkspace` (scratch arrays for patch extraction, polar interpolation,
+    and temporary kernel values), plus per-thread accumulation buffers `proj_k` / `proj_g`
+    (length `P`) and a boolean `is_patch` mask to skip patch indices in the far-field loop.
+
+  - For a given observer index `idx_obs`, the code accumulates the **projected row**
+    `(kernel row idx_obs) · Z` directly into `proj_k` / `proj_g` using `_accum_row!`, and then writes
+    these into shared buffers `KZt[:, idx_obs]` and `GZt[:, idx_obs]`. This is race-free because
+    each observer writes to a unique column.
+
+  - After the threaded loop completes, the final `P×P` blocks are assembled efficiently with BLAS:
+
+        Kc = Zᴴ * (KZt)'
+        Gc = Zᴴ * (GZt)'
+
+    implemented as `mul!(Kc_block, Z', transpose(KZt))` (and similarly for `Gc`).
+
+Normalization by `2π` is applied to match the 2D kernel convention so the downstream “add identity”
+logic is consistent between 2D/3D.
+
+## Important parameters
+
+  - `PATCH_RAD`: half-width of the singular patch in grid points. Must satisfy `PATCH_RAD ≤ (min(source.mtheta, source.nzeta) - 1) ÷ 2` to avoid errors.
+  - `RAD_DIM`: radial quadrature order on the polar grid (angular order is `2*RAD_DIM`).
+  - `INTERP_ORDER`: Lagrange interpolation order used to build `P2G` (must satisfy `INTERP_ORDER ≤ 2*PATCH_RAD+1`).
+
+## Performance notes / numerical optimizations
+
+  - **Cached quadrature data**: `get_singular_quadrature` memoizes `P2G`, `Gpou`, `Ppou`, etc. for a given
+    `(PATCH_RAD, RAD_DIM, INTERP_ORDER)` triple to avoid expensive rebuilds.
+  - **Allocation control**: all near-field arrays live in thread-local `KernelWorkspace` objects; no per-observer
+    heap allocation is intended in the hot path.
+  - **Scalar kernel evaluation**: the Laplace kernels have scalar-argument overloads to avoid view/slice creation
+    and to enable LLVM to keep values in registers.
 """
 function compute_3D_kernel_matrices!(
-    grad_greenfunction::AbstractMatrix{Float64},
-    greenfunction::AbstractMatrix{Float64},
+    K::AbstractMatrix{ComplexF64},
+    G::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry3D,WallGeometry3D},
     source::Union{PlasmaGeometry3D,WallGeometry3D},
     PATCH_RAD::Int,
     RAD_DIM::Int,
-    INTERP_ORDER::Int
+    INTERP_ORDER::Int,
+    Z::AbstractMatrix{ComplexF64},
+    Gram::AbstractVector{ComplexF64}
 )
-    num_points = observer.mtheta * observer.nzeta
-    dθdζ = 4π^2 / (num_points)
+    N, P = size(Z) # N = mtheta * nzeta, P = num_modes
+    dθdζ = 4π^2 / N
+    Zt = Matrix{ComplexF64}(transpose(Z))  # [P × M] for contiguous column access
 
-    # Get block of grad green function matrix
+    # Take a view of the corresponding block of the K and G matrices
     col_index = (source isa PlasmaGeometry3D ? 1 : 2)
     row_index = (observer isa PlasmaGeometry3D ? 1 : 2)
-    grad_greenfunction_block = view(
-        grad_greenfunction,
-        ((row_index-1)*num_points+1):(row_index*num_points),
-        ((col_index-1)*num_points+1):(col_index*num_points)
-    )
+    K_block = view(K, ((row_index-1)*P+1):(row_index*P), ((col_index-1)*P+1):(col_index*P))
+    G_block = view(G, ((row_index-1)*P+1):(row_index*P), :)
 
-    # Zero out green function matrix
-    fill!(greenfunction, 0.0)
-    # 𝒢ⁿ only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997)
+    # G only needed for plasma as source term (RHS of eqs. 26/27 in Chance 1997)
     populate_greenfunction = source isa PlasmaGeometry3D
 
     # Initialize quadrature data
@@ -432,38 +456,60 @@ function compute_3D_kernel_matrices!(
     @assert observer.mtheta ≥ PATCH_DIM "Must have observer.mtheta ≥ PATCH_DIM, got observer.mtheta=$(observer.mtheta), PATCH_DIM=$PATCH_DIM"
     @assert observer.nzeta ≥ PATCH_DIM "Must have observer.nzeta ≥ PATCH_DIM, got observer.nzeta=$(observer.nzeta), PATCH_DIM=$PATCH_DIM"
 
+    # Buffers for the projection: column idx_obs holds (kernel row idx_obs) · Z
+    KZt = zeros(ComplexF64, P, N)
+    GZt = zeros(ComplexF64, P, N)
+
     # Allocate thread-local workspaces (one per thread)
     max_threadid = Threads.maxthreadid()
     workspaces = [KernelWorkspace(PATCH_DIM, RAD_DIM, ANG_DIM) for _ in 1:max_threadid]
+    proj_k_all = [zeros(ComplexF64, P) for _ in 1:max_threadid]
+    proj_g_all = [zeros(ComplexF64, P) for _ in 1:max_threadid]
+    is_patch_all = [falses(N) for _ in 1:max_threadid]
 
     # Parallel loop through observer points
-    Threads.@threads for idx_obs in 1:num_points
+    Threads.@threads for idx_obs in 1:N
         # Get thread-local workspace
         ws = workspaces[Threads.threadid()]
         (; r_patch, dr_dθ_patch, dr_dζ_patch, r_polar, dr_dθ_polar, dr_dζ_polar,
             n_polar, M_polar_single, M_polar_double, M_grid_single_flat, M_grid_double_flat) = ws
+        proj_k = proj_k_all[Threads.threadid()]
+        proj_g = proj_g_all[Threads.threadid()]
+        is_patch = is_patch_all[Threads.threadid()]
+
+        fill!(proj_k, 0.0)
+        fill!(proj_g, 0.0)
+        fill!(is_patch, false)
 
         # Convert linear index to 2D indices
         i_obs = mod1(idx_obs, observer.mtheta)
         j_obs = (idx_obs - 1) ÷ observer.mtheta + 1
-        r_obs = @view observer.r[idx_obs, :]
+        ox = observer.r[idx_obs, 1]
+        oy = observer.r[idx_obs, 2]
+        oz = observer.r[idx_obs, 3]
+
+        # Mark patch source indices so the far-field loop can skip them
+        @inbounds for jj in 1:PATCH_DIM, ii in 1:PATCH_DIM
+            idx_pol = periodic_wrap(i_obs - PATCH_RAD + ii - 1, source.mtheta)
+            idx_tor = periodic_wrap(j_obs - PATCH_RAD + jj - 1, source.nzeta)
+            is_patch[idx_pol+source.mtheta*(idx_tor-1)] = true
+        end
 
         # ============================================================
         # FAR FIELD: Trapezoidal rule for nonsingular source points
         # Note: kernels return zero for r_src = r_obs
         # ============================================================
-        @inbounds for idx_src in 1:num_points
-            # Evaluate kernels at grid points
-            r_src = @view source.r[idx_src, :]
-            n_src = @view source.normal[idx_src, :]
-            K_single = laplace_single_layer(r_obs, r_src)
-            K_double = laplace_double_layer(r_obs, r_src, n_src)
-
-            # Apply weights (periodic trapezoidal rule = constant weights)
+        @inbounds for idx_src in 1:N
+            is_patch[idx_src] && continue
+
+            sr = view(source.r, idx_src, :)
+            sn = view(source.normal, idx_src, :)
+
+            far_single, far_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* dθdζ
+            _accum_row!(proj_k, far_double, Zt, idx_src)
             if populate_greenfunction
-                greenfunction[idx_obs, idx_src] = K_single * dθdζ
+                _accum_row!(proj_g, far_single, Zt, idx_src)
             end
-            grad_greenfunction_block[idx_obs, idx_src] = K_double * dθdζ
         end
 
         # ============================================================
@@ -482,80 +528,93 @@ function compute_3D_kernel_matrices!(
         # Compute normal vectors at polar points from interpolated tangent vectors
         compute_polar_normal!(n_polar, dr_dθ_polar, dr_dζ_polar, source.normal_orient)
 
-        # Evaluate kernels at polar points with POU weighting
+        # Evaluate kernels and apply quadrature weights: area element × POU, where POU contains rdrdθ already
         @inbounds for ia in 1:ANG_DIM, ir in 1:RAD_DIM
-            # Evaluate kernels using recomputed normal (use @view to avoid allocation)
-            r_src = @view r_polar[ir, ia, :]
-            n_src = @view n_polar[ir, ia, :]
-            K_single = laplace_single_layer(r_obs, r_src)
-            K_double = laplace_double_layer(r_obs, r_src, n_src)
-
-            # Apply quadrature weights: area element × POU, where POU contains rdrdθ already
-            M_polar_single[ir, ia] = K_single * Ppou[ir, ia] * dθdζ
-            M_polar_double[ir, ia] = K_double * Ppou[ir, ia] * dθdζ
+            sr = view(r_polar, ir, ia, :)
+            sn = view(n_polar, ir, ia, :)
+            w_single, w_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* Ppou[ir, ia] .* dθdζ
+            M_polar_single[ir, ia] = w_single
+            M_polar_double[ir, ia] = w_double
         end
 
         # Distribute polar singular corrections back to Cartesian grid using sparse matrix
         # grid = P2G * polar (maps Npolar → Ngrid)
-        mul!(M_grid_single_flat, P2G, vec(M_polar_single))
         mul!(M_grid_double_flat, P2G, vec(M_polar_double))
-        M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
         M_grid_double = reshape(M_grid_double_flat, PATCH_DIM, PATCH_DIM)
+        if populate_greenfunction
+            mul!(M_grid_single_flat, P2G, vec(M_polar_single))
+            M_grid_single = reshape(M_grid_single_flat, PATCH_DIM, PATCH_DIM)
+        end
 
-        # Compute remaining far-field POU contribution and near-field polar quadrature result
-        # We include this region in the far-field trapezoidal rule, so use Gpou = -χ here to get 1-χ
+        # POU correction: singular correction + (1 + Gpou) * far-field terms
         @inbounds for j in 1:PATCH_DIM, i in 1:PATCH_DIM
             # Map back to global indices
             idx_pol = periodic_wrap(i_obs - PATCH_RAD + i - 1, source.mtheta)
             idx_tor = periodic_wrap(j_obs - PATCH_RAD + j - 1, source.nzeta)
             idx_src = idx_pol + source.mtheta * (idx_tor - 1)
+            sr = view(source.r, idx_src, :)
+            sn = view(source.normal, idx_src, :)
 
-            # Remainder of far-field contribution on the singular grid: Gpou = -χ
-            r_src = @view source.r[idx_src, :]
-            n_src = @view source.normal[idx_src, :]
-            far_single = laplace_single_layer(r_obs, r_src) * Gpou[i, j] * dθdζ
-            far_double = laplace_double_layer(r_obs, r_src, n_src) * Gpou[i, j] * dθdζ
-
-            # Apply near + far contributions
+            w_single, w_double = laplace_kernel(ox, oy, oz, sr[1], sr[2], sr[3], sn[1], sn[2], sn[3]) .* (1.0 + Gpou[i, j]) .* dθdζ
+            _accum_row!(proj_k, M_grid_double[i, j] + w_double, Zt, idx_src)
             if populate_greenfunction
-                greenfunction[idx_obs, idx_src] += M_grid_single[i, j] + far_single
+                _accum_row!(proj_g, M_grid_single[i, j] + w_single, Zt, idx_src)
+            end
+        end
+
+        # ── Write projected column to buffer (each idx_obs owns its column) ──
+        @inbounds for p in 1:P
+            KZt[p, idx_obs] = proj_k[p]
+        end
+        if populate_greenfunction
+            @inbounds for p in 1:P
+                GZt[p, idx_obs] = proj_g[p]
             end
-            grad_greenfunction_block[idx_obs, idx_src] += M_grid_double[i, j] + far_double
         end
     end
 
-    # Use the same normalization as in the 2D kernel so we can just add I to the diagonal
+    # Use the same normalization as in the 2D kernel so we can just add Gram to the diagonal
     # This makes the grri logic identical to the 2D kernel.
-    grad_greenfunction_block ./= 2π
-    greenfunction ./= 2π
+    mul!(K_block, Z', transpose(KZt))
+    K_block ./= 2π
+    if populate_greenfunction
+        mul!(G_block, Z', transpose(GZt))
+        G_block ./= 2π
+    end
 
-    # Add the term that comes from the volume integral of Green's identity
-    typeof(source) == typeof(observer) && begin
-        for i in 1:num_points
-            grad_greenfunction_block[i, i] += 1.0
+    # Add the term that comes from the volume integral of Green's identity.
+    if typeof(source) == typeof(observer)
+        @inbounds for p in 1:P
+            K_block[p, p] += Gram[p]
         end
     end
 end
 
 """
-    kernel!(grad_greenfunction, greenfunction, observer, source, params::KernelParams3D)
+    kernel!(Kc, Gc, observer, source, params::KernelParams3D, Z, Gram)
+
+Public 3D kernel entry point. Forwards to:
 
-Dispatch wrapper for 3D kernel that forwards to `compute_3D_kernel_matrices!` with params.
+`compute_3D_kernel_matrices!(Kc, Gc, observer, source, params.PATCH_RAD, params.RAD_DIM, params.INTERP_ORDER, Z, Gram)`.
 """
 function kernel!(
-    grad_greenfunction::AbstractMatrix{Float64},
-    greenfunction::AbstractMatrix{Float64},
+    Kc::AbstractMatrix{ComplexF64},
+    Gc::AbstractMatrix{ComplexF64},
     observer::Union{PlasmaGeometry3D,WallGeometry3D},
     source::Union{PlasmaGeometry3D,WallGeometry3D},
-    params::KernelParams3D
+    params::KernelParams3D,
+    Z::AbstractMatrix{ComplexF64},
+    Gram::AbstractVector{ComplexF64}
 )
     return compute_3D_kernel_matrices!(
-        grad_greenfunction,
-        greenfunction,
+        Kc,
+        Gc,
         observer,
         source,
         params.PATCH_RAD,
         params.RAD_DIM,
-        params.INTERP_ORDER
+        params.INTERP_ORDER,
+        Z,
+        Gram
     )
 end
diff --git a/src/Vacuum/Utilities.jl b/src/Vacuum/Utilities.jl
index de4d220b..ac69a9e1 100644
--- a/src/Vacuum/Utilities.jl
+++ b/src/Vacuum/Utilities.jl
@@ -152,3 +152,28 @@ Inline function for fast cross product of two 3D vectors at a given index.
         c[idx, 3] = a1*b2 - a2*b1
     end
 end
+
+# ── Helpers for small-P accumulation (avoids BLAS dispatch overhead) ──────────
+"""
+Accumulate `proj += w * Zt[:, col]` with SIMD. Replaces BLAS.axpy! for small P.
+"""
+@inline function _accum_row!(proj::AbstractVector{ComplexF64}, w::Float64,
+    Zt::AbstractMatrix{ComplexF64}, col::Int)
+    @inbounds @simd for p in eachindex(proj)
+        proj[p] += w * Zt[p, col]
+    end
+end
+
+"""
+Rank-1 update `A += conj(Zt[:, j]) * y^T`. Avoids allocating a conjugated temporary.
+"""
+@inline function _rank1_conj!(A::AbstractMatrix{ComplexF64},
+    Zt::AbstractMatrix{ComplexF64}, j::Int,
+    y::AbstractVector{ComplexF64})
+    @inbounds for p2 in eachindex(y)
+        y_p2 = y[p2]
+        for p1 in axes(A, 1)
+            A[p1, p2] += conj(Zt[p1, j]) * y_p2
+        end
+    end
+end
diff --git a/src/Vacuum/Vacuum.jl b/src/Vacuum/Vacuum.jl
index 2f6335d6..72ad0ad4 100644
--- a/src/Vacuum/Vacuum.jl
+++ b/src/Vacuum/Vacuum.jl
@@ -9,7 +9,7 @@ using AdaptiveArrayPools
 
 # Import parent modules
 import ..Equilibrium
-using ..Utilities.FourierTransforms: compute_fourier_coefficients, fourier_transform!, fourier_inverse_transform!
+using ..Utilities.FourierTransforms: compute_fourier_coefficients
 
 include("Utilities.jl")
 include("DataTypes.jl")
@@ -23,44 +23,76 @@ export compute_vacuum_response, compute_vacuum_response!, compute_vacuum_field
 export extract_plasma_surface_at_psi
 
 """
-    compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings;
-        green_only=false)
+    _compute_vacuum_response_single!(
+        wv, grri_in, grre_in, plasma_pts, wall_pts,
+        inputs::VacuumInput, wall_settings::WallShapeSettings;
+        n_override=nothing
+    )
 
-Compute the vacuum response matrix and both Green's functions using provided vacuum inputs.
+Compute a single vacuum solve (one coupled 3D solve, or one `n`-slice in 2D) by building and solving
+the boundary integral equation in mode space with an optional conducting wall present and writing out the results:
 
-Single entry point for vacuum calculations.
+  - `wv`: complex vacuum response matrix in straight-fieldline mode space
+  - `grri_in`: interior Green's function sampled on the plasma surface in straight-fieldline mode space (real layout for backward compatibility)
+  - `grre_in`: exterior Green's function sampled on the plasma surface in straight-fieldline mode space (real layout for backward compatibility)
+  - `plasma_pts` / `wall_pts`: output point clouds for downstream plotting/diagnostics
 
-  - For **3D** (`inputs.nzeta > 1`), computes the full coupled response across all (m,n) modes defined
-    by `inputs.(mlow, mpert, nlow, npert)`.
+## Fused kernel assembly + projection
 
-  - For **2D geometry** (`inputs.nzeta == 1`), supports either:
+This routine uses a **newer kernel evaluation path** that never forms dense point-space kernel matrices.
+Instead, it fuses kernel evaluation and Fourier/Galerkin projection into a single pass.
 
-      + **single-n** (`inputs.npert == 1`): computes (m,n) response for `n = inputs.nlow`
-      + **multi-n** (`inputs.npert > 1`): loops over `n = inputs.nlow:(inputs.nlow+inputs.npert-1)` and returns
-        **blocks** of the full response matrices with one block per toroidal mode number.
+The key idea is:
 
-This is the pure Julia implementation that replaces the Fortran `mscvac` function.
-It computes both interior (grri) and exterior (grre) Green's functions for GPEC response calculations.
+  - Assemble and solve the boundary integral equation directly in `P×P` mode space.
 
-# Arguments
+  - Avoid materializing `M×M` (2D) or `N×N` (3D) kernel matrices.
 
-  - `inputs`: `VacuumInput` struct with mode numbers, grid resolution, and boundary info.
-  - `wall_settings::WallShapeSettings`: Wall geometry configuration.
-  - `green_only`: If true, skip building the response matrix `wv` and return zeros for `wv` and `xzpts`.
+  - Uses complex basis `Z = C + iS` so projected operators are `P×P` complex.
 
-# Returns
+  - The projected operators are accumulated row-by-row while kernel values are computed.
+
+  - Memory drops from `O(M^2)` (or `O(N^2)`) down to `O(MP + P^2)` (or `O(NP + P^2)`).
+
+  - FLOPs remain dominated by the same scaling as the two-step approach (kernel evaluation + projection),
+    plus an additional `O(P^3)` for the LU factorization/solve in mode space.
+
+  - **Projected matrices**
+
+      + Exterior projected kernel blocks are assembled into `K_ext` and `G_ext`.
+      + Interior operators are formed from the exterior ones using the discrete Green-identity diagonal term:
+        the implementation uses `K_int = 2*Gram - K_ext` for same-type source/observer blocks. This effectively
+        computes the kernel with an negative normal direction without recalculating the kernel.
 
-  - `wv`: Complex vacuum response matrix.
+  - **Solves**
 
-      + 2D single-n: `mpert × mpert`
-      + 2D multi-n: `(mpert*npert) × (mpert*npert)` (block diagonal)
-      + 3D: `num_modes × num_modes` (full coupled)
+      + If `nowall`, solve the plasma-only `P×P` system.
+      + If a wall is present, solve the coupled `2P×2P` block system.
 
-  - `grri`: Interior Green's function matrix.
+  - **Back-compat outputs**
 
-  - `grre`: Exterior Green's function matrix.
+      + Although the solve is performed in mode space, `grri_in` and `grre_in` are reconstructed into the
+        legacy real `M×(2P)` layout for downstream code paths that still expect that shape.
 
-  - `xzpts`: Coordinate array (mtheta×4 for 2D, mtheta*nzeta×4 for 3D) [R_plasma, Z_plasma, R_wall, Z_wall].
+## Arguments
+
+  - **`wv::AbstractMatrix{ComplexF64}`**: output vacuum response matrix (modified in-place)
+  - **`grri_in::AbstractMatrix{Float64}`**: output interior Green's function (modified in-place; real/legacy layout)
+  - **`grre_in::AbstractMatrix{Float64}`**: output exterior Green's function (modified in-place; real/legacy layout)
+  - **`plasma_pts::AbstractMatrix{Float64}`**: plasma surface coordinates (modified in-place)
+  - **`wall_pts::AbstractMatrix{Float64}`**: wall surface coordinates (modified in-place)
+  - **`inputs::VacuumInput`**: mode ranges, grid resolution, and geometry settings
+  - **`wall_settings::WallShapeSettings`**: wall geometry configuration
+  - **`n_override::Union{Nothing,Int}`**: optional toroidal mode number override (only used for 2D)
+
+## 2D vs 3D behavior
+
+  - **3D (`inputs.nzeta > 1`)**: computes the full coupled response across all `(m, n)` modes specified by
+    `inputs.(mlow, mpert, nlow, npert)` in a single call using the 3D kernel method in Kernel3D.jl.
+  - **2D (`inputs.nzeta == 1`)**:
+      + If `inputs.npert == 1`, computes the response for `n = inputs.nlow` using the 2D kernel method in Kernel2D.jl.
+      + If `inputs.npert > 1`, the public driver loops over `n` and calls this function once per `n`,
+        writing block columns into the full output matrices using the 2D kernel method in Kernel2D.jl.
 """
 @with_pool pool function _compute_vacuum_response_single!(
     wv::AbstractMatrix{ComplexF64},
@@ -70,109 +102,127 @@ It computes both interior (grri) and exterior (grre) Green's functions for GPEC
     wall_pts::AbstractMatrix{Float64},
     inputs::VacuumInput,
     wall_settings::WallShapeSettings;
-    n_override::Union{Nothing,Int}=nothing,
-    green_only::Bool=false
+    n_override::Union{Nothing,Int}=nothing
 )
 
+    (; mtheta, mpert, mlow, nzeta, npert, nlow) = inputs
+
     # Initialize surface geometries
-    plasma_surf = inputs.nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs)
-    wall = inputs.nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings)
+    plasma_surf = nzeta > 1 ? PlasmaGeometry3D(inputs) : PlasmaGeometry(inputs)
+    wall = nzeta > 1 ? WallGeometry3D(inputs, wall_settings) : WallGeometry(inputs, plasma_surf, wall_settings)
 
     # Compute Fourier basis coefficients
     ν = hasproperty(plasma_surf, :ν) ? plasma_surf.ν : nothing
-    cos_mn_basis, sin_mn_basis = compute_fourier_coefficients(inputs.mtheta, inputs.mpert, inputs.mlow, inputs.nzeta, inputs.npert, inputs.nlow; n_2D=n_override, ν=ν)
-    num_points_surf, num_modes = size(cos_mn_basis)
+    exp_mn_basis = compute_fourier_coefficients(mtheta, mpert, mlow, nzeta, npert, nlow; n_2D=n_override, ν=ν)
+    num_points, num_modes = size(exp_mn_basis)
 
     # Create kernel parameters structs used to dispatch to the correct kernel
-    if inputs.nzeta > 1
-        # Hardcode these values for now - can expose to the user in the future
-        kparams = KernelParams3D(11, 20, 5)
-    else
-        kparams = KernelParams2D(n_override)
+    # Hardcode these values for now - can expose to the user in the future
+    PATCH_RAD = 11
+    RAD_DIM = 20
+    INTERP_ORDER = 5
+    kparams = nzeta > 1 ? KernelParams3D(PATCH_RAD, RAD_DIM, INTERP_ORDER) : KernelParams2D(n_override)
+
+    # Scales kernel matrix sizes by a factor of 2 if a wall is present (don't allocate unless needed)
+    wall_fac = wall.nowall ? 1 : 2
+
+    # Gram matrix diagonal for the discrete Fourier basis on the uniform grid.
+    #
+    # For the basis produced by `compute_fourier_coefficients(...)` (complex exponentials sampled on a
+    # uniform grid), the discrete inner products satisfy:
+    #
+    #     ZᴴZ = num_points · I
+    #
+    # up to roundoff, so the Gram matrix is diagonal. Store only the diagonal as a length-P vector.
+    Gram = acquire!(pool, ComplexF64, num_modes)
+    fill!(Gram, ComplexF64(num_points))
+
+    # Projected kernel matrices [P × P complex]
+    K_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, wall_fac * num_modes)
+    G_ext = zeros!(pool, ComplexF64, wall_fac * num_modes, num_modes)
+    K_int = similar!(pool, K_ext)
+    G_int = similar!(pool, G_ext)
+
+    # Fused projected kernel: compute Z^H K Z and Z^H G Z for all operator blocks
+    # Plasma-plasma block
+    kernel!(K_ext, G_ext, plasma_surf, plasma_surf, kparams, exp_mn_basis, Gram)
+    # Wall-plasma, plasma-wall, wall-wall blocks
+    if !wall.nowall
+        # Wall-plasma, plasma-wall, wall-wall blocks
+        kernel!(K_ext, G_ext, plasma_surf, wall, kparams, exp_mn_basis, Gram)
+        kernel!(K_ext, G_ext, wall, plasma_surf, kparams, exp_mn_basis, Gram)
+        kernel!(K_ext, G_ext, wall, wall, kparams, exp_mn_basis, Gram)
     end
 
-    # Active rows for computation (plasma only if no wall, plasma+wall if wall present)
-    num_points_total = wall.nowall ? num_points_surf : 2 * num_points_surf
+    # Interior kernel in real space: K_int = 2I - K_ext → Fourier transformed: K_int = 2·Gram - K_ext
+    K_int .= -K_ext
+    @inbounds for p in 1:num_modes
+        K_int[p, p] += 2 * Gram[p]
+    end
+    if !wall.nowall
+        @inbounds for p in 1:num_modes
+            K_int[num_modes+p, num_modes+p] += 2 * Gram[p]
+        end
+    end
+    G_int .= G_ext
+
+    # Solve projected BIEs for exterior and interior kernels
+    F_ext = lu!(K_ext)
+    ldiv!(F_ext, G_ext)
+    F_int = lu!(K_int)
+    ldiv!(F_int, G_int)
+
+    # Construct the vacuum response matrix: wv = (4π²/M) · Gram · G
+    wv .= view(G_ext, 1:num_modes, :)
+    @inbounds for p in 1:num_modes
+        @views wv[p, :] .*= Gram[p]
+    end
+    wv .*= (4π^2 / num_points)
 
-    # Local work arrays
-    grad_green = zeros!(pool, num_points_total, num_points_total)
-    green_temp = zeros!(pool, num_points_surf, num_points_surf)
+    # Enforce Hermitian symmetry if desired
+    inputs.force_wv_symmetry && hermitianpart!(wv)
 
+    # Backward-compatible reconstruction: grre/grri in M×2P real layout
+    # Need to convert mode space to physical space and unpack the real and imaginary parts
+    # TODO: propagate complex M * P grri/grre matrices to perturbed equilibrium code
+    # perhaps make it a complex P * P matrix? Then don't need any of this section
     # Views into output Green's function matrices for the active rows/columns
-    grre = @view grre_in[1:num_points_total, :]
-    grri = @view grri_in[1:num_points_total, :]
-
-    # Plasma–Plasma block
-    kernel!(grad_green, green_temp, plasma_surf, plasma_surf, kparams)
-
-    # Fourier transform obs=plasma, src=plasma block
-    fourier_transform!(grre, green_temp, cos_mn_basis)
-    fourier_transform!(grre, green_temp, sin_mn_basis; col_offset=num_modes)
-
+    grre = @view grre_in[1:(wall_fac*num_points), :]
+    grri = @view grri_in[1:(wall_fac*num_points), :]
+    temp = zeros!(pool, ComplexF64, num_points, num_modes)
+
+    mul!(temp, exp_mn_basis, view(G_ext, 1:num_modes, :))
+    @view(grre[1:num_points, 1:num_modes]) .= real.(temp)
+    @view(grre[1:num_points, (num_modes+1):(2*num_modes)]) .= imag.(temp)
+    mul!(temp, exp_mn_basis, view(G_int, 1:num_modes, :))
+    @view(grri[1:num_points, 1:num_modes]) .= real.(temp)
+    @view(grri[1:num_points, (num_modes+1):(2*num_modes)]) .= imag.(temp)
     if !wall.nowall
-        # Plasma–Wall block
-        kernel!(grad_green, green_temp, plasma_surf, wall, kparams)
-        # Wall–Wall block
-        kernel!(grad_green, green_temp, wall, wall, kparams)
-        # Wall–Plasma block
-        kernel!(grad_green, green_temp, wall, plasma_surf, kparams)
-        # Fourier transform obs=wall, src=plasma block
-        fourier_transform!(grre, green_temp, cos_mn_basis; row_offset=num_points_surf)
-        fourier_transform!(grre, green_temp, sin_mn_basis; row_offset=num_points_surf, col_offset=num_modes)
+        mul!(temp, exp_mn_basis, view(G_ext, (num_modes+1):(2*num_modes), :))
+        @view(grre[(num_points+1):(2*num_points), 1:num_modes]) .= real.(temp)
+        @view(grre[(num_points+1):(2*num_points), (num_modes+1):(2*num_modes)]) .= imag.(temp)
+        mul!(temp, exp_mn_basis, view(G_int, (num_modes+1):(2*num_modes), :))
+        @view(grri[(num_points+1):(2*num_points), 1:num_modes]) .= real.(temp)
+        @view(grri[(num_points+1):(2*num_points), (num_modes+1):(2*num_modes)]) .= imag.(temp)
     end
 
-    # Compute both Green's functions: exterior (kernelsign=+1) then interior (kernelsign=-1)
-    grri .= grre # start from same as exterior
-    grad_green_interior = similar!(pool, grad_green)
-    grad_green_interior .= grad_green
-
-    # Solve exterior first, overwriting grad_green to save memory since we already have the interior kernel
-    F_ext = lu!(grad_green)
-    ldiv!(F_ext, grre)
-
-    # Interior flips the sign of the normal, but not the diagonal terms, so we multiply by -1 and add 2I to the diagonal
-    grad_green_interior .*= -1
-    for i in 1:num_points_total
-        grad_green_interior[i, i] += 2.0
-    end
-    F_int = lu!(grad_green_interior)
-    ldiv!(F_int, grri)
-
-    # Always initialise wv to zero so that green_only keeps it zeroed
-    if !green_only
-        # Perform inverse Fourier transforms to get response matrix components [Chance Phys. Plasmas 2007 052506 eq. 115-118]
-        arr, aii, ari, air = ntuple(_ -> zeros(num_modes, num_modes), 4)
-        fourier_inverse_transform!(arr, grre, cos_mn_basis)
-        fourier_inverse_transform!(aii, grre, sin_mn_basis; col_offset=num_modes)
-        fourier_inverse_transform!(ari, grre, sin_mn_basis)
-        fourier_inverse_transform!(air, grre, cos_mn_basis; col_offset=num_modes)
-
-        # Final form of vacuum response matrix [Chance Phys. Plasmas 2007 052506 eq. 114]
-        wv .= complex.(arr .+ aii, air .- ari)
-        inputs.force_wv_symmetry && hermitianpart!(wv)
-
-        # Fill coordinate arrays
-        if inputs.nzeta > 1 # 3D
-            plasma_pts .= plasma_surf.r
-            wall_pts .= wall.r
-        else # 2D
-            @views begin
-                plasma_pts[:, 1] .= plasma_surf.x
-                plasma_pts[:, 2] .= 0.0
-                plasma_pts[:, 3] .= plasma_surf.z
-                wall_pts[:, 1] .= wall.x
-                wall_pts[:, 2] .= 0.0
-                wall_pts[:, 3] .= wall.z
-            end
+    if nzeta > 1 # 3D
+        plasma_pts .= plasma_surf.r
+        wall_pts .= wall.r
+    else # 2D
+        @views begin
+            plasma_pts[:, 1] .= plasma_surf.x
+            plasma_pts[:, 2] .= 0.0
+            plasma_pts[:, 3] .= plasma_surf.z
+            wall_pts[:, 1] .= wall.x
+            wall_pts[:, 2] .= 0.0
+            wall_pts[:, 3] .= wall.z
         end
     end
 end
 
 """
-    compute_vacuum_response(
-        inputs::VacuumInput,
-        wall_settings::WallShapeSettings;
-        green_only=false)
+    compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings)
 
 Allocate and return the vacuum response matrix and Green's functions for the given
 vacuum inputs.
@@ -182,7 +232,7 @@ implementation. For performance‑critical paths that already own preallocated s
 (e.g. `ForceFreeStates.VacuumData`), prefer the in‑place method to avoid extra
 heap allocations.
 """
-@with_pool pool function compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings; green_only::Bool=false)
+@with_pool pool function compute_vacuum_response(inputs::VacuumInput, wall_settings::WallShapeSettings)
 
     # Allocate storage for the vacuum response matrix and Green's functions
     numpoints = inputs.mtheta * inputs.nzeta
@@ -195,17 +245,13 @@ heap allocations.
         wall_pts=zeros!(pool, numpoints, 3)
     )
 
-    compute_vacuum_response!(vac, inputs, wall_settings; green_only=green_only)
+    compute_vacuum_response!(vac, inputs, wall_settings)
 
     return vac.wv, vac.grri, vac.grre, vac.plasma_pts, vac.wall_pts
 end
 
 """
-    compute_vacuum_response!(
-        vac_data,
-        inputs::VacuumInput,
-        wall_settings::WallShapeSettings;
-        green_only=false)
+    compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings)
 
 In-place variant that computes the vacuum response and directly populates the arrays
 stored in `vac_data`.
@@ -222,7 +268,7 @@ compatible sizes:
 This is designed to work with `ForceFreeStates.VacuumData` but does not depend on
 its concrete type (duck-typed on field names only).
 """
-function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings; green_only::Bool=false)
+function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::WallShapeSettings)
 
     mpert = inputs.mpert
     npert = inputs.npert
@@ -237,8 +283,7 @@ function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::
             vac_data.plasma_pts,
             vac_data.wall_pts,
             inputs,
-            wall_settings;
-            green_only=green_only
+            wall_settings
         )
     else
         # 2D vacuum: fill diagonal blocks of the response matrix
@@ -262,8 +307,7 @@ function compute_vacuum_response!(vac_data, inputs::VacuumInput, wall_settings::
                 vac_data.wall_pts,
                 inputs,
                 wall_settings;
-                n_override=n,
-                green_only=green_only
+                n_override=n
             )
         end
     end
diff --git a/test/runtests_vacuum.jl b/test/runtests_vacuum.jl
index debb8299..3ae4b648 100644
--- a/test/runtests_vacuum.jl
+++ b/test/runtests_vacuum.jl
@@ -1,4 +1,5 @@
 @testset "Vacuum.jl Unit Tests" begin
+    using LinearAlgebra
 
     @testset "Vacuum.jl (2D)" begin
 
@@ -445,6 +446,120 @@
                 @test size(plasma_pts) == (16, 3)
             end
         end
+
+        # -------------------------------------------------------------------------
+        @testset "fused vs two-step Galerkin (2D, nowall)" begin
+            # Small case where both Galerkin paths are cheap: compare K_c, G_c
+            # assembled via the full M×M kernel + projection against the fused
+            # projected kernels from the unified `kernel!` API.
+            inputs = VacuumInput(
+                mtheta_in=17,
+                nzeta_in=1,
+                x=collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=17))),
+                z=collect(0.3 .* sin.(range(0, 2π, length=17))),
+                ν=zeros(17),
+                mlow=1,
+                mpert=2,
+                nlow=1,
+                npert=1,
+                nzeta=1,
+                mtheta=32
+            )
+            wall_settings = WallShapeSettings(shape="nowall")
+
+            plasma_surf = GeneralizedPerturbedEquilibrium.Vacuum.PlasmaGeometry(inputs)
+            kparams = GeneralizedPerturbedEquilibrium.Vacuum.KernelParams2D(inputs.nlow)
+
+            # Fourier basis on the surface grid
+            exp_mn_basis = GeneralizedPerturbedEquilibrium.Utilities.FourierTransforms.compute_fourier_coefficients(
+                inputs.mtheta,
+                inputs.mpert,
+                inputs.mlow,
+                inputs.nzeta,
+                inputs.npert,
+                inputs.nlow;
+                n_2D=inputs.nlow,
+                ν=plasma_surf.ν
+            )
+            M, P = size(exp_mn_basis)
+            Gram = fill(ComplexF64(M), P)
+
+            # --- Two-step Galerkin: materialize full kernels then project ---
+            grad_green_full = zeros(Float64, 2M, 2M)
+            green_full = zeros(Float64, M, M)
+            GeneralizedPerturbedEquilibrium.Vacuum.kernel!(
+                grad_green_full,
+                green_full,
+                plasma_surf,
+                plasma_surf,
+                kparams
+            )
+
+            # Exterior projected kernels from full matrices: K_c = Z^H K Z, G_c = Z^H G Z
+            K_c_two = zeros(ComplexF64, P, P)
+            G_c_two = zeros(ComplexF64, P, P)
+            tmp = zeros(ComplexF64, M, P)
+
+            grad_pp = @view grad_green_full[1:M, 1:M]
+            mul!(tmp, grad_pp, exp_mn_basis)
+            mul!(K_c_two, exp_mn_basis', tmp)
+            mul!(tmp, green_full, exp_mn_basis)
+            mul!(G_c_two, exp_mn_basis', tmp)
+
+            # --- Fused Galerkin via unified kernel! ---
+            K_c_fused = zeros(ComplexF64, P, P)
+            G_c_fused = zeros(ComplexF64, P, P)
+            GeneralizedPerturbedEquilibrium.Vacuum.projected_kernel!(
+                K_c_fused,
+                G_c_fused,
+                plasma_surf,
+                plasma_surf,
+                kparams,
+                exp_mn_basis,
+                Gram
+            )
+
+            @test isapprox(K_c_fused, K_c_two; rtol=1e-10, atol=1e-12)
+            @test isapprox(G_c_fused, G_c_two; rtol=1e-10, atol=1e-12)
+        end
+
+        # -------------------------------------------------------------------------
+        @testset "wall Galerkin vs collocation (2D, conformal)" begin
+            mtheta_eq = 17
+            mtheta = 128
+            mpert = 3
+            boundary_x = collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=mtheta_eq)))
+            boundary_z = collect(0.3 .* sin.(range(0, 2π, length=mtheta_eq)))
+
+            inputs_colloc = VacuumInput(
+                mtheta_in=mtheta_eq, nzeta_in=1,
+                x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq),
+                mlow=1, mpert=mpert, nlow=1, npert=1,
+                nzeta=1, mtheta=mtheta,
+                use_galerkin=false
+            )
+            inputs_galerkin = VacuumInput(
+                mtheta_in=mtheta_eq, nzeta_in=1,
+                x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq),
+                mlow=1, mpert=mpert, nlow=1, npert=1,
+                nzeta=1, mtheta=mtheta,
+                use_galerkin=true
+            )
+
+            wall_settings = WallShapeSettings(shape="conformal", a=0.5)
+
+            wv_c, grri_c, grre_c, _, _ = compute_vacuum_response(inputs_colloc, wall_settings)
+            wv_g, grri_g, grre_g, _, _ = compute_vacuum_response(inputs_galerkin, wall_settings)
+
+            M = mtheta
+            P = mpert
+
+            @test isapprox(wv_g, wv_c; rtol=1e-8)
+            @test isapprox(grre_g[1:M, 1:(2*P)], grre_c[1:M, 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grri_g[1:M, 1:(2*P)], grri_c[1:M, 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grre_g[(M+1):(2*M), 1:(2*P)], grre_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grri_g[(M+1):(2*M), 1:(2*P)], grri_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8)
+        end
     end
 
     # -------------------------------------------------------------------------
@@ -631,19 +746,43 @@
             @test isapprox(wv, wv', rtol=1e-12)
         end
 
-        @testset "compute_vacuum_response 3D green_only" begin
-            inputs = _make_3d_inputs(mtheta=32, nzeta=32, mtheta_eq=17)
-            wall_settings = WallShapeSettings(shape="nowall")
-            wv, grri, grre, plasma_pts, wall_pts = compute_vacuum_response(inputs, wall_settings; green_only=true)
+        @testset "wall Galerkin vs collocation (3D, conformal)" begin
+            mtheta_eq = 17
+            mtheta = 32
+            nzeta = 32
+            mpert = 2
+            npert = 2
+            boundary_x = collect(1.7 .+ 0.3 .* cos.(range(0, 2π, length=mtheta_eq)))
+            boundary_z = collect(0.3 .* sin.(range(0, 2π, length=mtheta_eq)))
+
+            inputs_colloc = VacuumInput(
+                mtheta_in=mtheta_eq, nzeta_in=1,
+                x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq),
+                mlow=1, mpert=mpert, nlow=0, npert=npert,
+                nzeta=nzeta, mtheta=mtheta,
+                use_galerkin=false
+            )
+            inputs_galerkin = VacuumInput(
+                mtheta_in=mtheta_eq, nzeta_in=1,
+                x=boundary_x, z=boundary_z, ν=zeros(mtheta_eq),
+                mlow=1, mpert=mpert, nlow=0, npert=npert,
+                nzeta=nzeta, mtheta=mtheta,
+                use_galerkin=true
+            )
 
-            numpoints = inputs.mtheta * inputs.nzeta
-            num_modes = inputs.mpert * inputs.npert
-            @test size(wv) == (num_modes, num_modes)
-            @test all(wv .== 0)
-            @test size(grri) == (2 * numpoints, 2 * num_modes)
-            @test size(grre) == (2 * numpoints, 2 * num_modes)
-            @test all(isfinite, grri)
-            @test all(isfinite, grre)
+            wall_settings = WallShapeSettings(shape="conformal", a=0.3)
+
+            wv_c, grri_c, grre_c, _, _ = compute_vacuum_response(inputs_colloc, wall_settings)
+            wv_g, grri_g, grre_g, _, _ = compute_vacuum_response(inputs_galerkin, wall_settings)
+
+            M = mtheta * nzeta
+            P = mpert * npert
+
+            @test isapprox(wv_g, wv_c; rtol=1e-8)
+            @test isapprox(grre_g[1:M, 1:(2*P)], grre_c[1:M, 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grri_g[1:M, 1:(2*P)], grri_c[1:M, 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grre_g[(M+1):(2*M), 1:(2*P)], grre_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8)
+            @test isapprox(grri_g[(M+1):(2*M), 1:(2*P)], grri_c[(M+1):(2*M), 1:(2*P)]; rtol=1e-8)
         end
 
         @testset "Kernel3D laplace_single_layer" begin