diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl
index 5aabbf7ac6..0632e3d3d8 100644
--- a/src/solvers/dgsem_p4est/dg_3d.jl
+++ b/src/solvers/dgsem_p4est/dg_3d.jl
@@ -10,9 +10,14 @@
 function create_cache(mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations,
                       mortar_l2::LobattoLegendreMortarL2, uEltype)
     # TODO: Taal compare performance of different types
-    fstar_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2),
-                                        nnodes(mortar_l2), 4)
-                      for _ in 1:Threads.nthreads()]
+    fstar_primary_threaded = [Array{uEltype, 4}(undef, nvariables(equations),
+                                                nnodes(mortar_l2),
+                                                nnodes(mortar_l2), 4)
+                              for _ in 1:Threads.nthreads()]
+    fstar_secondary_threaded = [Array{uEltype, 4}(undef, nvariables(equations),
+                                                  nnodes(mortar_l2),
+                                                  nnodes(mortar_l2), 4)
+                                for _ in 1:Threads.nthreads()]
 
     fstar_tmp_threaded = [Array{uEltype, 3}(undef, nvariables(equations),
                                             nnodes(mortar_l2), nnodes(mortar_l2))
@@ -21,7 +26,7 @@ function create_cache(mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations,
                                     nnodes(mortar_l2))
                   for _ in 1:Threads.nthreads()]
 
-    (; fstar_threaded, fstar_tmp_threaded, u_threaded)
+    (; fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded, u_threaded)
 end
 
 #     index_to_start_step_3d(index::Symbol, index_range)
@@ -521,12 +526,13 @@ function calc_mortar_flux!(surface_flux_values,
                            surface_integral, dg::DG, cache)
     @unpack neighbor_ids, node_indices = cache.mortars
     @unpack contravariant_vectors = cache.elements
-    @unpack fstar_threaded, fstar_tmp_threaded = cache
+    @unpack fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded = cache
     index_range = eachnode(dg)
 
     @threaded for mortar in eachmortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar = fstar_threaded[Threads.threadid()]
+        fstar_primary = fstar_primary_threaded[Threads.threadid()]
+        fstar_secondary = fstar_secondary_threaded[Threads.threadid()]
         fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
 
         # Get index information on the small elements
@@ -555,7 +561,8 @@ function calc_mortar_flux!(surface_flux_values,
                                                             i_small, j_small, k_small,
                                                             element)
 
-                    calc_mortar_flux!(fstar, mesh, nonconservative_terms, equations,
+                    calc_mortar_flux!(fstar_primary, fstar_secondary, mesh,
+                                      nonconservative_terms, equations,
                                       surface_integral, dg, cache,
                                       mortar, position, normal_direction,
                                       i, j)
@@ -581,14 +588,15 @@ function calc_mortar_flux!(surface_flux_values,
         # "mortar_fluxes_to_elements!" instead.
         mortar_fluxes_to_elements!(surface_flux_values,
                                    mesh, equations, mortar_l2, dg, cache,
-                                   mortar, fstar, u_buffer, fstar_tmp)
+                                   mortar, fstar_primary, fstar_secondary, u_buffer,
+                                   fstar_tmp)
     end
 
     return nothing
 end
 
 # Inlined version of the mortar flux computation on small elements for conservation fluxes
-@inline function calc_mortar_flux!(fstar,
+@inline function calc_mortar_flux!(fstar_primary, fstar_secondary,
                                    mesh::Union{P4estMesh{3}, T8codeMesh{3}},
                                    nonconservative_terms::False, equations,
                                    surface_integral, dg::DG, cache,
@@ -603,13 +611,15 @@ end
     flux = surface_flux(u_ll, u_rr, normal_direction, equations)
 
     # Copy flux to buffer
-    set_node_vars!(fstar, flux, equations, dg, i_node_index, j_node_index,
+    set_node_vars!(fstar_primary, flux, equations, dg, i_node_index, j_node_index,
+                   position_index)
+    set_node_vars!(fstar_secondary, flux, equations, dg, i_node_index, j_node_index,
                    position_index)
 end
 
 # Inlined version of the mortar flux computation on small elements for conservation fluxes
 # with nonconservative terms
-@inline function calc_mortar_flux!(fstar,
+@inline function calc_mortar_flux!(fstar_primary, fstar_secondary,
                                    mesh::Union{P4estMesh{3}, T8codeMesh{3}},
                                    nonconservative_terms::True, equations,
                                    surface_integral, dg::DG, cache,
@@ -627,11 +637,19 @@ end
     # Compute nonconservative flux and add it to the flux scaled by a factor of 0.5 based on
     # the interpretation of global SBP operators coupled discontinuously via
     # central fluxes/SATs
-    noncons = nonconservative_flux(u_ll, u_rr, normal_direction, equations)
-    flux_plus_noncons = flux + 0.5f0 * noncons
+    noncons_primary = nonconservative_flux(u_ll, u_rr, normal_direction, equations)
+    noncons_secondary = nonconservative_flux(u_rr, u_ll, normal_direction, equations)
+    flux_plus_noncons_primary = flux + 0.5f0 * noncons_primary
+    flux_plus_noncons_secondary = flux + 0.5f0 * noncons_secondary
 
     # Copy to buffer
-    set_node_vars!(fstar, flux_plus_noncons, equations, dg, i_node_index, j_node_index,
+    set_node_vars!(fstar_primary, flux_plus_noncons_primary, equations, dg,
+                   i_node_index,
+                   j_node_index,
+                   position_index)
+    set_node_vars!(fstar_secondary, flux_plus_noncons_secondary, equations, dg,
+                   i_node_index,
+                   j_node_index,
                    position_index)
 end
 
@@ -639,8 +657,8 @@ end
                                             mesh::Union{P4estMesh{3}, T8codeMesh{3}},
                                             equations,
                                             mortar_l2::LobattoLegendreMortarL2,
-                                            dg::DGSEM, cache, mortar, fstar, u_buffer,
-                                            fstar_tmp)
+                                            dg::DGSEM, cache, mortar, fstar_primary,
+                                            fstar_secondary, u_buffer, fstar_tmp)
     @unpack neighbor_ids, node_indices = cache.mortars
     index_range = eachnode(dg)
 
@@ -652,8 +670,10 @@ end
         element = neighbor_ids[position, mortar]
         for j in eachnode(dg), i in eachnode(dg)
             for v in eachvariable(equations)
-                surface_flux_values[v, i, j, small_direction, element] = fstar[v, i, j,
-                                                                               position]
+                surface_flux_values[v, i, j, small_direction, element] = fstar_primary[v,
+                                                                                       i,
+                                                                                       j,
+                                                                                       position]
             end
         end
     end
@@ -661,19 +681,19 @@ end
     # Project small fluxes to large element.
     multiply_dimensionwise!(u_buffer,
                             mortar_l2.reverse_lower, mortar_l2.reverse_lower,
-                            view(fstar, .., 1),
+                            view(fstar_secondary, .., 1),
                             fstar_tmp)
     add_multiply_dimensionwise!(u_buffer,
                                 mortar_l2.reverse_upper, mortar_l2.reverse_lower,
-                                view(fstar, .., 2),
+                                view(fstar_secondary, .., 2),
                                 fstar_tmp)
     add_multiply_dimensionwise!(u_buffer,
                                 mortar_l2.reverse_lower, mortar_l2.reverse_upper,
-                                view(fstar, .., 3),
+                                view(fstar_secondary, .., 3),
                                 fstar_tmp)
     add_multiply_dimensionwise!(u_buffer,
                                 mortar_l2.reverse_upper, mortar_l2.reverse_upper,
-                                view(fstar, .., 4),
+                                view(fstar_secondary, .., 4),
                                 fstar_tmp)
 
     # The flux is calculated in the outward direction of the small elements,
diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
index 3f286ca01f..b1204e5a15 100644
--- a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
+++ b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
@@ -271,7 +271,8 @@ end
                                             mesh::P4estMesh{3},
                                             equations::AbstractEquationsParabolic,
                                             mortar_l2::LobattoLegendreMortarL2,
-                                            dg::DGSEM, cache, mortar, fstar, u_buffer,
+                                            dg::DGSEM, cache, mortar, fstar_primary,
+                                            fstar_secondary, u_buffer,
                                             fstar_tmp)
     @unpack neighbor_ids, node_indices = cache.mortars
     index_range = eachnode(dg)
@@ -283,8 +284,10 @@ end
         element = neighbor_ids[position, mortar]
         for j in eachnode(dg), i in eachnode(dg)
             for v in eachvariable(equations)
-                surface_flux_values[v, i, j, small_direction, element] = fstar[v, i, j,
-                                                                               position]
+                surface_flux_values[v, i, j, small_direction, element] = fstar_primary[v,
+                                                                                       i,
+                                                                                       j,
+                                                                                       position]
             end
         end
     end
@@ -292,19 +295,19 @@ end
     # Project small fluxes to large element.
     multiply_dimensionwise!(u_buffer,
                             mortar_l2.reverse_lower, mortar_l2.reverse_lower,
-                            view(fstar, .., 1),
+                            view(fstar_secondary, .., 1),
                             fstar_tmp)
     add_multiply_dimensionwise!(u_buffer,
                                 mortar_l2.reverse_upper, mortar_l2.reverse_lower,
-                                view(fstar, .., 2),
+                                view(fstar_secondary, .., 2),
                                 fstar_tmp)
     add_multiply_dimensionwise!(u_buffer,
                                 mortar_l2.reverse_lower, mortar_l2.reverse_upper,
-                                view(fstar, .., 3),
+                                view(fstar_secondary, .., 3),
                                 fstar_tmp)
     add_multiply_dimensionwise!(u_buffer,
                                 mortar_l2.reverse_upper, mortar_l2.reverse_upper,
-                                view(fstar, .., 4),
+                                view(fstar_secondary, .., 4),
                                 fstar_tmp)
 
     # The flux is calculated in the outward direction of the small elements,
@@ -788,12 +791,12 @@ function calc_mortar_flux_divergence!(surface_flux_values,
                                       surface_integral, dg::DG, cache)
     @unpack neighbor_ids, node_indices = cache.mortars
     @unpack contravariant_vectors = cache.elements
-    @unpack fstar_threaded, fstar_tmp_threaded = cache
+    @unpack fstar_primary_threaded, fstar_tmp_threaded = cache
     index_range = eachnode(dg)
 
     @threaded for mortar in eachmortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar = fstar_threaded[Threads.threadid()]
+        fstar = fstar_primary_threaded[Threads.threadid()]
         fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
 
         # Get index information on the small elements
@@ -842,7 +845,7 @@ function calc_mortar_flux_divergence!(surface_flux_values,
         # this reuses the hyperbolic version of `mortar_fluxes_to_elements!`
         mortar_fluxes_to_elements!(surface_flux_values,
                                    mesh, equations, mortar_l2, dg, cache,
-                                   mortar, fstar, u_buffer, fstar_tmp)
+                                   mortar, fstar, fstar, u_buffer, fstar_tmp)
     end
 
     return nothing
@@ -851,7 +854,7 @@ end
 # NOTE: Use analogy to "calc_mortar_flux!" for hyperbolic eqs with no nonconservative terms.
 # Reasoning: "calc_interface_flux!" for parabolic part is implemented as the version for
 # hyperbolic terms with conserved terms only, i.e., no nonconservative terms.
-@inline function calc_mortar_flux!(fstar,
+@inline function calc_mortar_flux!(fstar_primary, fstar_secondary,
                                    mesh::P4estMesh{3},
                                    nonconservative_terms::False,
                                    equations::AbstractEquationsParabolic,
@@ -867,7 +870,9 @@ end
     # TODO: parabolic; only BR1 at the moment
     flux_ = 0.5f0 * (u_ll + u_rr)
     # Copy flux to buffer
-    set_node_vars!(fstar, flux_, equations, dg, i_node_index, j_node_index,
+    set_node_vars!(fstar_primary, flux_, equations, dg, i_node_index, j_node_index,
+                   position_index)
+    set_node_vars!(fstar_secondary, flux_, equations, dg, i_node_index, j_node_index,
                    position_index)
 end
 
diff --git a/src/solvers/dgsem_p4est/dg_3d_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parallel.jl
index 635c8dc795..3daca10e82 100644
--- a/src/solvers/dgsem_p4est/dg_3d_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_3d_parallel.jl
@@ -384,12 +384,12 @@ function calc_mpi_mortar_flux!(surface_flux_values,
                                surface_integral, dg::DG, cache)
     @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars
     @unpack contravariant_vectors = cache.elements
-    @unpack fstar_threaded, fstar_tmp_threaded = cache
+    @unpack fstar_primary_threaded, fstar_tmp_threaded = cache
     index_range = eachnode(dg)
 
     @threaded for mortar in eachmpimortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar = fstar_threaded[Threads.threadid()]
+        fstar = fstar_primary_threaded[Threads.threadid()]
         fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
 
         # Get index information on the small elements
diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index 5cec16300f..3d2db528a1 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -547,16 +547,28 @@ end
 @trixi_testset "elixir_mhd_alfven_wave_nonconforming.jl" begin
     @test_trixi_include(joinpath(EXAMPLES_DIR,
                                  "elixir_mhd_alfven_wave_nonconforming.jl"),
-                        l2=[0.00019018725889431733, 0.0006523517707148006,
-                            0.0002401595437705759, 0.0007796920661427565,
-                            0.0007095787460334334, 0.0006558819731628876,
-                            0.0003565026134076906, 0.0007904654548841712,
-                            9.437300326448332e-7],
-                        linf=[0.0012482306861187897, 0.006408776208178299,
-                            0.0016845452099629663, 0.0068711236542984555,
-                            0.004626581522263695, 0.006614624811393632,
-                            0.0030068344747734566, 0.008277825749754025,
-                            1.3475027166309006e-5],
+                        l2=[
+                            0.0001788543743594658,
+                            0.000624334205581902,
+                            0.00022892869974368887,
+                            0.0007223464581156573,
+                            0.0006651366626523314,
+                            0.0006287275014743352,
+                            0.000344484339916008,
+                            0.0007179788287557142,
+                            8.632896980651243e-7
+                        ],
+                        linf=[
+                            0.0010730565632763867,
+                            0.004596749809344033,
+                            0.0013235269262853733,
+                            0.00468874234888117,
+                            0.004719267084104306,
+                            0.004228339352211896,
+                            0.0037503625505571625,
+                            0.005104176909383168,
+                            9.738081186490818e-6
+                        ],
                         tspan=(0.0, 0.25),
                         coverage_override=(trees_per_dimension = (1, 1, 1),))
     # Ensure that we do not have excessive memory allocations
@@ -571,16 +583,28 @@ end
 
 @trixi_testset "elixir_mhd_shockcapturing_amr.jl" begin
     @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_mhd_shockcapturing_amr.jl"),
-                        l2=[0.006297229188267704, 0.006436347763092648,
-                            0.0071091348227321095, 0.00652953798427642,
-                            0.0206148702828057, 0.005561406556411695,
-                            0.007570747563696005, 0.005571060186513173,
-                            3.888176398720913e-6],
-                        linf=[0.20904050630623572, 0.1863002690612441,
-                            0.2347653795205547, 0.19430178062881898,
-                            0.6858488630270272, 0.15169972127018583,
-                            0.22431157058134898, 0.16823638722404644,
-                            0.0005208971463830214],
+                        l2=[
+                            0.0062973565893792004,
+                            0.006436273914579104,
+                            0.007112703307027178,
+                            0.006529650167358523,
+                            0.020607452343745017,
+                            0.005560993001492338,
+                            0.007576418168749763,
+                            0.0055721349394598635,
+                            3.8269125984310296e-6
+                        ],
+                        linf=[
+                            0.2090718196650192,
+                            0.1863884052971854,
+                            0.23475479927204168,
+                            0.19460789763442982,
+                            0.6859816363887359,
+                            0.15171474186273914,
+                            0.22404690260234983,
+                            0.16808957604979002,
+                            0.0005083795485317637
+                        ],
                         tspan=(0.0, 0.04),
                         coverage_override=(maxiters = 6, initial_refinement_level = 1,
                                            base_level = 1, max_level = 2))
@@ -597,26 +621,26 @@ end
 @trixi_testset "elixir_mhd_amr_entropy_bounded.jl" begin
     @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_mhd_amr_entropy_bounded.jl"),
                         l2=[
-                            0.005430176785094096,
-                            0.006185803468926062,
-                            0.012158513265762224,
-                            0.006185144232789619,
-                            0.03509140423905665,
-                            0.004968215426326584,
-                            0.006553519141867704,
-                            0.005008885124643863,
-                            5.165777182726578e-6
+                            0.005430006338127661,
+                            0.006186402899876596,
+                            0.012171513410597289,
+                            0.006181479343504159,
+                            0.035068817354117605,
+                            0.004967715666538709,
+                            0.006592173316509503,
+                            0.0050151140388451105,
+                            5.146547644807638e-6
                         ],
                         linf=[
-                            0.1864317840224794,
-                            0.2041246899193812,
-                            0.36992946717578445,
-                            0.2327158690965257,
-                            1.0368624176126007,
-                            0.1846308291826353,
-                            0.2062255411778191,
-                            0.18955666546331185,
-                            0.0005208969502913304
+                            0.18655204102670386,
+                            0.20397573777286138,
+                            0.3700839435299759,
+                            0.23329319876321034,
+                            1.0348619438460904,
+                            0.18462694496595722,
+                            0.20648634653698617,
+                            0.18947822281424997,
+                            0.0005083794158781671
                         ],
                         tspan=(0.0, 0.04),
                         coverage_override=(maxiters = 6, initial_refinement_level = 1,