If the overhead and workload invocation codegen don't match, then benchmark times will be either under- or over-estimated.
Context dotnet/runtime#86033 (comment)
This is the "good" case where the JIT isn't messing up the workload invoke codegen (which would only make the problem worse).
public delegate System.Single OverheadDelegate();
private void OverheadActionUnroll(System.Int64 invokeCount)
{
for (System.Int64 i = 0; i < invokeCount; i++)
{
consumer.Consume(overheadDelegate());
public delegate System.Numerics.Vector3 WorkloadDelegate();
private void WorkloadActionUnroll(System.Int64 invokeCount)
{
for (System.Int64 i = 0; i < invokeCount; i++)
{
consumer.Consume(workloadDelegate().X);
;; overhead
mov rbp, gword ptr [rsi+38H]
mov rax, gword ptr [rsi+28H]
mov rcx, gword ptr [rax+08H]
call [rax+18H]BenchmarkDotNet.Autogenerated.Runnable_0+OverheadDelegate:Invoke():float:this
vmovss dword ptr [rbp+48H], xmm0
;; workload
mov rbp, gword ptr [rsi+38H]
mov rax, gword ptr [rsi+30H]
mov rcx, gword ptr [rax+08H]
lea rdx, [rsp+110H]
call [rax+18H]BenchmarkDotNet.Autogenerated.Runnable_0+WorkloadDelegate:Invoke():System.Numerics.Vector3:this
vmovss xmm0, dword ptr [rsp+110H]
vmovss dword ptr [rbp+48H], xmm0
This mismatch may be restricted to certain return types like Vector3; I haven't seen it elsewhere.
If the overhead and workload invocation codegen don't match, then benchmark times will be either under- or over-estimated.
Context dotnet/runtime#86033 (comment)
This is the "good" case where the JIT isn't messing up the workload invoke codegen (which would only make the problem worse).
This mismatch may be restricted to certain return types like Vector3; I haven't seen it elsewhere.