diff --git a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
new file mode 100644
index 000000000..640f48cf6
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
@@ -0,0 +1,468 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityAccelerator.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends;
+using ILGPU.Backends.IL;
+using ILGPU.Backends.Velocity;
+using ILGPU.Resources;
+using ILGPU.Runtime.CPU;
+using System;
+using System.Diagnostics;
+using System.Reflection;
+using System.Reflection.Emit;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+ ///
+ /// A SIMD-enabled CPU-based accelerator.
+ ///
+ public sealed class VelocityAccelerator : Accelerator
+ {
+ #region Static
+
+ /// <summary>
+ /// Cached reflection handle of the instance method <see cref="Run"/>; the generated
+ /// kernel launchers emit a call to it in order to start a kernel.
+ /// </summary>
+ private static readonly MethodInfo RunMethodInfo = typeof(VelocityAccelerator)
+     .GetMethod(
+         nameof(Run),
+         BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic);
+
+ #endregion
+
+ #region Instance
+
+ // One worker object per emulated multiprocessor of the device
+ private readonly VelocityMultiprocessor[] multiprocessors;
+ // Binary semaphore that serializes kernel launches
+ private readonly SemaphoreSlim taskConcurrencyLimit = new SemaphoreSlim(1);
+ // Rendezvous point of the launching thread and the multiprocessors
+ private readonly Barrier multiprocessorBarrier;
+
+ /// <summary>
+ /// Constructs a new Velocity accelerator.
+ /// </summary>
+ /// <param name="context">The ILGPU context.</param>
+ /// <param name="device">The Velocity device.</param>
+ /// <param name="threadPriority">
+ /// The thread priority of the execution threads.
+ /// </param>
+ /// <exception cref="NotSupportedException">
+ /// Thrown if the device does not operate in little endian mode.
+ /// </exception>
+ internal VelocityAccelerator(
+     Context context,
+     VelocityDevice device,
+     ThreadPriority threadPriority)
+     : base(context, device)
+ {
+     if (!device.IsLittleEndian)
+     {
+         throw new NotSupportedException(
+             RuntimeErrorMessages.VelocityLittleEndian);
+     }
+
+     int processorCount = device.NumMultiprocessors;
+     multiprocessors = new VelocityMultiprocessor[processorCount];
+     // All multiprocessors plus the launching thread meet at the barrier
+     multiprocessorBarrier = new Barrier(processorCount + 1);
+
+     // These values are read by the multiprocessors created below
+     ThreadPriority = threadPriority;
+     MaxLocalMemoryPerThread = device.MaxLocalMemoryPerThread;
+     NumThreads = device.WarpSize * processorCount;
+
+     // Create every multiprocessor and subscribe to its completion callback
+     for (int index = 0; index < processorCount; ++index)
+     {
+         var processor = new VelocityMultiprocessor(this, index);
+         processor.ProcessingCompleted += OnProcessingCompleted;
+         multiprocessors[index] = processor;
+     }
+
+     // Initialize the underlying Velocity backend
+     var backend = new VelocityBackend(
+         context,
+         new CPUCapabilityContext(),
+         WarpSize,
+         new VelocityArgumentMapper(context));
+     Init(backend);
+ }
+
+ #endregion
+
+ #region Properties
+
+ /// <summary>
+ /// Returns the Velocity backend of this accelerator.
+ /// </summary>
+ /// <remarks>
+ /// Hides the base property and converts it with an <c>as</c> cast, so the result is
+ /// null if the base backend is not a <see cref="VelocityBackend"/>.
+ /// </remarks>
+ internal new VelocityBackend Backend =>
+ base.Backend as VelocityBackend;
+
+ /// <summary>
+ /// Returns the thread priority that was passed to the constructor.
+ /// </summary>
+ public ThreadPriority ThreadPriority { get; }
+
+ /// <summary>
+ /// Returns the maximum local memory per thread in bytes (taken from the device).
+ /// </summary>
+ public int MaxLocalMemoryPerThread { get; }
+
+ /// <summary>
+ /// Returns the maximum number of parallel threads, computed in the constructor as
+ /// the device warp size times the number of multiprocessors.
+ /// </summary>
+ public int NumThreads { get; }
+
+ #endregion
+
+ #region Launch Methods
+
+ /// <summary>
+ /// Main internal run method to launch loaded kernels. Splits the grid into
+ /// contiguous chunks, hands one chunk to each active multiprocessor and blocks
+ /// until all of them have signaled completion.
+ /// </summary>
+ /// <param name="userKernelConfig">The user-defined kernel config.</param>
+ /// <param name="runtimeKernelConfig">
+ /// The actual runtime kernel config to be used for launching.
+ /// </param>
+ /// <param name="kernelHandler">The kernel entry point handler.</param>
+ /// <param name="velocityParameters">
+ /// The parameters object that is forwarded to every multiprocessor.
+ /// </param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void Run(
+     KernelConfig userKernelConfig,
+     RuntimeKernelConfig runtimeKernelConfig,
+     VelocityKernelEntryPoint kernelHandler,
+     VelocityParameters velocityParameters)
+ {
+     // Avoid concurrent executions of kernels.. we have to wait for the current
+     // kernel to finish first
+     taskConcurrencyLimit.Wait();
+     try
+     {
+         int groupSize = runtimeKernelConfig.GroupDim.Size;
+         int gridSize = runtimeKernelConfig.GridDim.Size;
+
+         // An empty grid has no work to distribute (and would otherwise lead to
+         // a division by zero below)
+         if (gridSize < 1)
+             return;
+
+         // Distribute the workload. Grids smaller than the number of
+         // multiprocessors are processed by a single multiprocessor instead of
+         // yielding zero active multiprocessors.
+         int numActiveMPs = Math.Max(
+             1,
+             Math.Min(gridSize / NumMultiprocessors, NumMultiprocessors));
+         int chunkSizePerMP = IntrinsicMath.DivRoundUp(gridSize, numActiveMPs);
+
+         // Only the active multiprocessors and this thread take part in the
+         // barrier of this launch; idle multiprocessors never signal, so leaving
+         // them registered would block SignalAndWait forever.
+         int requiredParticipants = numActiveMPs + 1;
+         int registeredParticipants = multiprocessorBarrier.ParticipantCount;
+         if (registeredParticipants > requiredParticipants)
+         {
+             multiprocessorBarrier.RemoveParticipants(
+                 registeredParticipants - requiredParticipants);
+         }
+         else if (registeredParticipants < requiredParticipants)
+         {
+             multiprocessorBarrier.AddParticipants(
+                 requiredParticipants - registeredParticipants);
+         }
+
+         // Start the multiprocessor journey
+         for (int i = 0; i < numActiveMPs; ++i)
+         {
+             // Both interval bounds are passed on as computed here; the end index
+             // is the last index of the chunk, clamped to the grid size
+             int startIndex = i * chunkSizePerMP;
+             int endIndex = Math.Min(startIndex + chunkSizePerMP, gridSize) - 1;
+             multiprocessors[i].Run(
+                 kernelHandler,
+                 startIndex,
+                 endIndex,
+                 gridSize,
+                 userKernelConfig.GridDim.Size,
+                 groupSize,
+                 velocityParameters);
+         }
+
+         // Wait for all active multiprocessors to finish
+         multiprocessorBarrier.SignalAndWait();
+     }
+     finally
+     {
+         taskConcurrencyLimit.Release();
+     }
+ }
+
+ /// <summary>
+ /// Completion callback of a multiprocessor: joins the launch barrier on behalf of
+ /// the calling multiprocessor thread.
+ /// </summary>
+ /// <param name="processor">The multiprocessor that finished (unused).</param>
+ private void OnProcessingCompleted(VelocityMultiprocessor processor)
+ {
+     multiprocessorBarrier.SignalAndWait();
+ }
+
+ /// <summary>
+ /// Generates a dynamic kernel-launcher method that will be just-in-time compiled
+ /// during the first invocation. Using the generated launcher lowers the overhead
+ /// for kernel launching dramatically, since unnecessary operations (like boxing)
+ /// can be avoided.
+ /// </summary>
+ /// <param name="kernel">The kernel to generate a launcher for.</param>
+ /// <param name="customGroupSize">
+ /// The custom group size for the launching operation.
+ /// </param>
+ /// <returns>The generated launcher method.</returns>
+ /// <exception cref="NotSupportedException">
+ /// Thrown if the kernel entry point has by-ref parameters.
+ /// </exception>
+ private MethodInfo GenerateKernelLauncherMethod(
+     VelocityCompiledKernel kernel,
+     int customGroupSize)
+ {
+     var entryPoint = kernel.EntryPoint;
+     AdjustAndVerifyKernelGroupSize(ref customGroupSize, entryPoint);
+
+     // By-ref kernel parameters are not supported by this accelerator
+     if (entryPoint.HasByRefParameters)
+     {
+         throw new NotSupportedException(
+             ErrorMessages.NotSupportedByRefKernelParameters);
+     }
+
+     // Declare a new launcher method
+     using var scopedLock = entryPoint.CreateLauncherMethod(
+         Context.RuntimeSystem,
+         out var launcher);
+     var emitter = new ILEmitter(launcher.ILGenerator);
+
+     // Map all arguments to an argument structure containing mapped views
+     var argumentMapper = Backend.ArgumentMapper;
+     var (structLocal, _) = argumentMapper.Map(emitter, entryPoint);
+
+     // Keep the kernel instance argument in a local for later use
+     var velocityKernel = emitter.DeclareLocal(typeof(VelocityKernel));
+     KernelLauncherBuilder.EmitLoadKernelArgument<VelocityKernel, ILEmitter>(
+         Kernel.KernelInstanceParamIdx, emitter);
+     emitter.Emit(LocalOperation.Store, velocityKernel);
+
+     // Create an instance of the custom parameters type
+     var parametersInstance = emitter.DeclareLocal(kernel.ParametersType);
+     {
+         // Push all kernel arguments as constructor arguments
+         var parameters = entryPoint.Parameters;
+         for (int i = 0, e = parameters.Count; i < e; ++i)
+         {
+             // Load the current argument onto the stack
+             emitter.Emit(ArgumentOperation.Load, i + Kernel.KernelParameterOffset);
+             if (parameters.IsByRef(i))
+                 emitter.Emit(OpCodes.Ldobj, parameters[i]);
+         }
+
+         // Create the parameters object
+         emitter.EmitNewObject(kernel.ParametersTypeConstructor);
+
+         // Store the parameters object
+         emitter.Emit(LocalOperation.Store, parametersInstance);
+     }
+
+     // Load the accelerator instance. Run is an instance method, so its 'this'
+     // reference has to be on the evaluation stack below all call arguments.
+     emitter.Emit(LocalOperation.Load, velocityKernel);
+     emitter.EmitCall(
+         typeof(VelocityKernel)
+             .GetProperty(
+                 nameof(VelocityKernel.VelocityAccelerator),
+                 BindingFlags.Public | BindingFlags.Instance)
+             .GetGetMethod(false));
+
+     // Load custom user dimension
+     KernelLauncherBuilder.EmitLoadKernelConfig(
+         entryPoint,
+         emitter,
+         Kernel.KernelParamDimensionIdx,
+         MaxGridSize,
+         MaxGroupSize);
+
+     // Load dimensions
+     KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
+         entryPoint,
+         emitter,
+         Kernel.KernelParamDimensionIdx,
+         MaxGridSize,
+         MaxGroupSize,
+         customGroupSize);
+
+     // Load the kernel delegate
+     emitter.Emit(LocalOperation.Load, velocityKernel);
+     emitter.EmitCall(VelocityKernel.GetKernelExecutionDelegate);
+
+     // Load the parameters object
+     emitter.Emit(LocalOperation.Load, parametersInstance);
+
+     // Launch kernel execution
+     emitter.EmitCall(RunMethodInfo);
+
+     // End of launch method
+     emitter.Emit(OpCodes.Ret);
+     emitter.Finish();
+
+     return launcher.Finish();
+ }
+
+ #endregion
+
+ /// <summary>
+ /// Creates an accelerator extension by asking the given provider for its
+ /// Velocity-specific extension.
+ /// </summary>
+ /// <param name="provider">The extension provider.</param>
+ /// <returns>The extension created by the provider.</returns>
+ public override TExtension CreateExtension<
+     TExtension,
+     TExtensionProvider>(TExtensionProvider provider)
+ {
+     return provider.CreateVelocityExtension(this);
+ }
+
+ /// <summary>
+ /// Allocates a raw buffer backed by a <see cref="VelocityMemoryBuffer"/>.
+ /// </summary>
+ /// <param name="length">The number of elements.</param>
+ /// <param name="elementSize">The size of a single element in bytes.</param>
+ /// <returns>The allocated buffer.</returns>
+ protected override MemoryBuffer AllocateRawInternal(
+     long length,
+     int elementSize)
+ {
+     return new VelocityMemoryBuffer(this, length, elementSize);
+ }
+
+ /// <summary>
+ /// Loads the given kernel by generating a launcher method for it and wrapping
+ /// both in a <see cref="VelocityKernel"/>.
+ /// </summary>
+ /// <param name="kernel">The kernel to load.</param>
+ /// <param name="customGroupSize">The custom group size.</param>
+ /// <returns>The loaded kernel.</returns>
+ /// <exception cref="ArgumentNullException">
+ /// Thrown if <paramref name="kernel"/> is null.
+ /// </exception>
+ /// <exception cref="NotSupportedException">
+ /// Thrown if the kernel is not a <see cref="VelocityCompiledKernel"/>.
+ /// </exception>
+ private Kernel LoadKernel(CompiledKernel kernel, int customGroupSize)
+ {
+     if (kernel is null)
+         throw new ArgumentNullException(nameof(kernel));
+
+     // Only kernels compiled by the Velocity backend can be loaded
+     if (!(kernel is VelocityCompiledKernel velocityKernel))
+     {
+         throw new NotSupportedException(
+             RuntimeErrorMessages.NotSupportedKernel);
+     }
+
+     MethodInfo launcher = GenerateKernelLauncherMethod(
+         velocityKernel,
+         customGroupSize);
+     return new VelocityKernel(this, velocityKernel, launcher);
+ }
+
+ /// <summary>
+ /// Loads a default kernel, i.e. a kernel without a custom group size (0).
+ /// </summary>
+ /// <param name="kernel">The kernel to load.</param>
+ /// <returns>The loaded kernel.</returns>
+ protected override Kernel LoadKernelInternal(CompiledKernel kernel)
+ {
+     const int NoCustomGroupSize = 0;
+     return LoadKernel(kernel, NoCustomGroupSize);
+ }
+
+ /// <summary>
+ /// Loads an implicitly grouped kernel.
+ /// </summary>
+ /// <param name="kernel">The kernel to load.</param>
+ /// <param name="customGroupSize">The custom group size (must not be negative).</param>
+ /// <param name="kernelInfo">
+ /// Receives kernel information derived from the compiled kernel and the custom
+ /// group size.
+ /// </param>
+ /// <returns>The loaded kernel.</returns>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// Thrown if <paramref name="customGroupSize"/> is negative.
+ /// </exception>
+ /// <exception cref="ArgumentNullException">
+ /// Thrown if <paramref name="kernel"/> is null.
+ /// </exception>
+ protected override Kernel LoadImplicitlyGroupedKernelInternal(
+     CompiledKernel kernel,
+     int customGroupSize,
+     out KernelInfo kernelInfo)
+ {
+     if (customGroupSize < 0)
+         throw new ArgumentOutOfRangeException(nameof(customGroupSize));
+
+     // Validate the kernel before kernel.Info is dereferenced below
+     if (kernel is null)
+         throw new ArgumentNullException(nameof(kernel));
+
+     kernelInfo = KernelInfo.CreateFrom(
+         kernel.Info,
+         customGroupSize,
+         null);
+     return LoadKernel(kernel, customGroupSize);
+ }
+
+ /// <summary>
+ /// Loads an auto grouped kernel that uses the warp size as group size.
+ /// </summary>
+ /// <param name="kernel">The kernel to load.</param>
+ /// <param name="kernelInfo">
+ /// Receives the kernel information built from the warp size and the number of
+ /// threads divided by the warp size.
+ /// </param>
+ /// <returns>The loaded kernel.</returns>
+ protected override Kernel LoadAutoGroupedKernelInternal(
+     CompiledKernel kernel,
+     out KernelInfo kernelInfo)
+ {
+     // Load first: an unsupported kernel throws before any info is produced
+     Kernel loadedKernel = LoadKernel(kernel, WarpSize);
+     int groupsPerGrid = NumThreads / WarpSize;
+     kernelInfo = new KernelInfo(WarpSize, groupsPerGrid);
+     return loadedKernel;
+ }
+
+ /// <summary>
+ /// Creates a new <see cref="VelocityStream"/> associated with this accelerator.
+ /// </summary>
+ /// <returns>The created stream.</returns>
+ protected override AcceleratorStream CreateStreamInternal()
+ {
+     return new VelocityStream(this);
+ }
+
+ /// <summary>
+ /// Does not perform any operation.
+ /// </summary>
+ protected override void SynchronizeInternal() { }
+
+ /// <summary>
+ /// Does not perform any operation.
+ /// </summary>
+ protected override void OnBind() { }
+
+ /// <summary>
+ /// Does not perform any operation.
+ /// </summary>
+ protected override void OnUnbind() { }
+
+ #region Peer Access
+
+ /// <summary>
+ /// Returns true if the other accelerator is a CPU or a Velocity accelerator.
+ /// </summary>
+ /// <param name="otherAccelerator">The other accelerator.</param>
+ /// <returns>True for CPU and Velocity accelerators, false otherwise.</returns>
+ protected override bool CanAccessPeerInternal(Accelerator otherAccelerator)
+ {
+     switch (otherAccelerator)
+     {
+         case CPUAccelerator _:
+         case VelocityAccelerator _:
+             return true;
+         default:
+             return false;
+     }
+ }
+
+ /// <summary>
+ /// Verifies that peer access to the other accelerator is possible; no further
+ /// action is taken.
+ /// </summary>
+ /// <param name="otherAccelerator">The other accelerator.</param>
+ /// <exception cref="InvalidOperationException">
+ /// Thrown if <see cref="CanAccessPeerInternal"/> rejects the other accelerator.
+ /// </exception>
+ protected override void EnablePeerAccessInternal(Accelerator otherAccelerator)
+ {
+     if (CanAccessPeerInternal(otherAccelerator))
+         return;
+
+     throw new InvalidOperationException(
+         RuntimeErrorMessages.CannotEnablePeerAccessToOtherAccelerator);
+ }
+
+ /// <summary>
+ /// Only asserts (in debug builds) that the other accelerator is a valid peer.
+ /// </summary>
+ /// <param name="otherAccelerator">The other accelerator.</param>
+ protected override void DisablePeerAccessInternal(
+     Accelerator otherAccelerator)
+ {
+     bool isValidPeer = CanAccessPeerInternal(otherAccelerator);
+     Debug.Assert(isValidPeer, "Invalid EnablePeerAccess method");
+ }
+
+ #endregion
+
+ #region Occupancy
+
+ /// <summary>
+ /// Estimates the occupancy for the given kernel: 0 if the group size exceeds the
+ /// maximum group size, the number of multiprocessors otherwise.
+ /// </summary>
+ /// <param name="kernel">The kernel (must be a Velocity kernel).</param>
+ /// <param name="groupSize">The desired group size.</param>
+ /// <param name="dynamicSharedMemorySizeInBytes">Not used.</param>
+ /// <returns>The estimated value as described above.</returns>
+ /// <exception cref="NotSupportedException">
+ /// Thrown if the kernel is not a <see cref="VelocityKernel"/>.
+ /// </exception>
+ protected override int EstimateMaxActiveGroupsPerMultiprocessorInternal(
+     Kernel kernel,
+     int groupSize,
+     int dynamicSharedMemorySizeInBytes)
+ {
+     if (!(kernel is VelocityKernel))
+         throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+
+     if (groupSize > MaxGroupSize.Size)
+         return 0;
+     return NumMultiprocessors;
+ }
+
+ /// <summary>
+ /// Estimates a group size for the given kernel. The shared-memory callback is
+ /// not consulted.
+ /// </summary>
+ /// <param name="kernel">The kernel (must be a Velocity kernel).</param>
+ /// <param name="computeSharedMemorySize">Not used by this implementation.</param>
+ /// <param name="maxGroupSize">The maximum group size requested by the caller.</param>
+ /// <param name="minGridSize">Receives the number of threads of this accelerator.</param>
+ /// <returns>
+ /// The smaller value of <paramref name="maxGroupSize"/> and the maximum group size
+ /// of this accelerator.
+ /// </returns>
+ /// <exception cref="NotSupportedException">
+ /// Thrown if the kernel is not a <see cref="VelocityKernel"/>.
+ /// </exception>
+ protected override int EstimateGroupSizeInternal(
+ Kernel kernel,
+ Func computeSharedMemorySize,
+ int maxGroupSize,
+ out int minGridSize)
+ {
+ if (!(kernel is VelocityKernel))
+ throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+
+ // Estimation
+ minGridSize = NumThreads;
+ return Math.Min(maxGroupSize, MaxGroupSize.Size);
+ }
+
+ /// <summary>
+ /// Estimates a group size for the given kernel and a fixed amount of dynamic
+ /// shared memory. This implementation always estimates a group size of 1.
+ /// </summary>
+ /// <param name="kernel">The kernel (must be a Velocity kernel).</param>
+ /// <param name="dynamicSharedMemorySizeInBytes">Not used.</param>
+ /// <param name="maxGroupSize">Not used.</param>
+ /// <param name="minGridSize">Receives the number of threads of this accelerator.</param>
+ /// <returns>Always 1.</returns>
+ /// <exception cref="NotSupportedException">
+ /// Thrown if the kernel is not a <see cref="VelocityKernel"/>.
+ /// </exception>
+ protected override int EstimateGroupSizeInternal(
+     Kernel kernel,
+     int dynamicSharedMemorySizeInBytes,
+     int maxGroupSize,
+     out int minGridSize)
+ {
+     if (kernel is VelocityKernel)
+     {
+         // Estimation
+         minGridSize = NumThreads;
+         return 1;
+     }
+
+     throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+ }
+
+ #endregion
+
+ #region Page Lock Scope
+
+ /// <summary>
+ /// Page locking is not supported: writes a trace message and returns a
+ /// <c>NullPageLockScope</c> for the given pinned memory.
+ /// </summary>
+ /// <param name="pinned">The pinned host pointer.</param>
+ /// <param name="numElements">The number of elements.</param>
+ /// <returns>The created (null) page-lock scope.</returns>
+ protected override PageLockScope CreatePageLockFromPinnedInternal(
+ IntPtr pinned,
+ long numElements)
+ {
+ Trace.WriteLine(RuntimeErrorMessages.NotSupportedPageLock);
+ return new NullPageLockScope(this, pinned, numElements);
+ }
+
+ #endregion
+
+ #region IDisposable
+
+ /// <summary>
+ /// Dispose all managed resources allocated by this Velocity accelerator instance.
+ /// </summary>
+ /// <param name="disposing">
+ /// True if managed resources should be released; nothing is done otherwise.
+ /// </param>
+ protected override void DisposeAccelerator_SyncRoot(bool disposing)
+ {
+     if (!disposing)
+         return;
+
+     // Wait for a kernel launch that may still be in flight
+     taskConcurrencyLimit.Wait();
+
+     // Dispose all multiprocessors
+     foreach (var multiprocessor in multiprocessors)
+         multiprocessor.Dispose();
+
+     // Dispose the synchronization primitives; the barrier is released after the
+     // multiprocessors since their completion callbacks signal it
+     multiprocessorBarrier.Dispose();
+     taskConcurrencyLimit.Dispose();
+ }
+
+ #endregion
+
+ }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs
new file mode 100644
index 000000000..e6a44ac7e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs
@@ -0,0 +1,74 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityContextExtensions.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends;
+using ILGPU.Backends.PTX;
+using ILGPU.Resources;
+using System;
+
+namespace ILGPU.Runtime.Velocity
+{
+ /// <summary>
+ /// Velocity specific context extensions.
+ /// </summary>
+ public static class VelocityContextExtensions
+ {
+     #region Builder
+
+     /// <summary>
+     /// Enables the Velocity device.
+     /// </summary>
+     /// <param name="builder">The builder instance.</param>
+     /// <param name="maxSharedMemoryPerGroup">
+     /// The maximum number bytes of shared memory per group.
+     /// </param>
+     /// <returns>The updated builder instance.</returns>
+     /// <exception cref="NotSupportedException">
+     /// Thrown if the current runtime platform is not a 64-bit platform.
+     /// </exception>
+     public static Context.Builder Velocity(
+         this Context.Builder builder,
+         int maxSharedMemoryPerGroup = VelocityDevice.MinSharedMemoryPerGroup)
+     {
+         if (!Backend.RuntimePlatform.Is64Bit())
+         {
+             throw new NotSupportedException(string.Format(
+                 RuntimeErrorMessages.VelocityPlatform64,
+                 Backend.RuntimePlatform));
+         }
+
+         // Forward the requested shared-memory size instead of silently falling
+         // back to the default device configuration
+         builder.DeviceRegistry.Register(
+             new VelocityDevice(maxSharedMemoryPerGroup));
+         return builder;
+     }
+
+     #endregion
+
+     #region Context
+
+     /// <summary>
+     /// Gets a registered Velocity device (the one at index 0).
+     /// </summary>
+     /// <param name="context">The ILGPU context.</param>
+     /// <returns>The registered Velocity device.</returns>
+     public static VelocityDevice GetVelocityDevice(this Context context) =>
+         context.GetDevice<VelocityDevice>(0);
+
+     /// <summary>
+     /// Creates a new Velocity accelerator.
+     /// </summary>
+     /// <param name="context">The ILGPU context.</param>
+     /// <returns>The created Velocity accelerator.</returns>
+     public static VelocityAccelerator CreateVelocityAccelerator(
+         this Context context) =>
+         context.GetVelocityDevice().CreateVelocityAccelerator(context);
+
+     #endregion
+ }
+
+}
+
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
new file mode 100644
index 000000000..7456444e2
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
@@ -0,0 +1,140 @@
+using System;
+using System.Numerics;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+ ///
+ /// Represents a software-emulated velocity device for high-performance execution of
+ /// tasks on the CPU using vectorization.
+ ///
+ public sealed class VelocityDevice : Device
+ {
+ #region Constants
+
+ /// <summary>
+ /// The minimum (and default) amount of shared memory per group in bytes
+ /// (1 &lt;&lt; 20 bytes, i.e. 1024k). Smaller values are rejected by the constructor.
+ /// </summary>
+ public const int MinSharedMemoryPerGroup = 1 << 20;
+
+ #endregion
+
+ #region Instance
+
+ /// <summary>
+ /// Creates a new velocity device with the default amount of shared memory per
+ /// group (refer to <see cref="MinSharedMemoryPerGroup"/> for more
+ /// information about the default size).
+ /// </summary>
+ public VelocityDevice()
+ : this(MinSharedMemoryPerGroup)
+ { }
+
+ /// <summary>
+ /// Creates a new velocity device using the given amount of shared memory (the
+ /// minimum amount is <see cref="MinSharedMemoryPerGroup"/> per group).
+ /// </summary>
+ /// <param name="maxSharedMemoryPerGroup">
+ /// The maximum amount of shared memory per group in bytes.
+ /// </param>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// Thrown if the given amount is smaller than the minimum amount.
+ /// </exception>
+ public VelocityDevice(int maxSharedMemoryPerGroup)
+ {
+     if (MinSharedMemoryPerGroup > maxSharedMemoryPerGroup)
+     {
+         throw new ArgumentOutOfRangeException(
+             nameof(maxSharedMemoryPerGroup));
+     }
+
+     // Identification and vector widths
+     Name = nameof(VelocityAccelerator);
+     WarpSize = VelocityWarp32.RawVectorLength;
+     MinWarpSize = VelocityWarp64.RawVectorLength;
+
+     // A group consists of a single warp; one multiprocessor per logical CPU
+     MaxNumThreadsPerMultiprocessor = WarpSize;
+     MaxNumThreadsPerGroup = MaxNumThreadsPerMultiprocessor;
+     NumMultiprocessors = Environment.ProcessorCount;
+     MaxGroupSize = new Index3D(MaxNumThreadsPerGroup, 1, 1);
+
+     // Memory limits
+     MemorySize = long.MaxValue;
+     MaxGridSize = new Index3D(int.MaxValue, ushort.MaxValue, ushort.MaxValue);
+     MaxSharedMemoryPerGroup = maxSharedMemoryPerGroup;
+     MaxConstantMemory = int.MaxValue;
+     NumThreads = MaxNumThreads;
+
+     // Get the endian type from the global BitConverter class
+     IsLittleEndian = BitConverter.IsLittleEndian;
+
+     // Allocate a sufficient amount of local memory per thread equal to
+     // the maximum number of shared memory per group in bytes
+     MaxLocalMemoryPerThread = maxSharedMemoryPerGroup;
+ }
+
+ #endregion
+
+ #region Properties
+
+ /// <summary>
+ /// Returns the minimum warp size of this device (the raw vector length of
+ /// <c>VelocityWarp64</c>, as assigned in the constructor).
+ /// </summary>
+ public int MinWarpSize { get; }
+
+ /// <summary>
+ /// Returns the number of threads (initialized from <c>MaxNumThreads</c>).
+ /// </summary>
+ public int NumThreads { get; }
+
+ /// <summary>
+ /// Returns true if this device operates in little endian mode.
+ /// </summary>
+ public bool IsLittleEndian { get; }
+
+ /// <summary>
+ /// Returns the maximum local memory per thread in bytes.
+ /// </summary>
+ public int MaxLocalMemoryPerThread { get; }
+
+ #endregion
+
+ #region Methods
+
+ /// <summary>
+ /// Creates a Velocity accelerator for this device.
+ /// </summary>
+ /// <param name="context">The ILGPU context.</param>
+ /// <returns>The created accelerator.</returns>
+ public override Accelerator CreateAccelerator(Context context)
+ {
+     return CreateVelocityAccelerator(context);
+ }
+
+ /// <summary>
+ /// Creates a new Velocity accelerator whose execution threads use
+ /// <see cref="ThreadPriority.Normal"/>.
+ /// </summary>
+ /// <param name="context">The ILGPU context.</param>
+ /// <returns>The created Velocity accelerator.</returns>
+ public VelocityAccelerator CreateVelocityAccelerator(
+     Context context)
+ {
+     const ThreadPriority DefaultPriority = ThreadPriority.Normal;
+     return CreateVelocityAccelerator(context, DefaultPriority);
+ }
+
+ /// <summary>
+ /// Creates a new Velocity accelerator whose execution threads use the given
+ /// thread priority.
+ /// </summary>
+ /// <param name="context">The ILGPU context.</param>
+ /// <param name="threadPriority">
+ /// The thread priority of the execution threads.
+ /// </param>
+ /// <returns>The created Velocity accelerator.</returns>
+ public VelocityAccelerator CreateVelocityAccelerator(
+     Context context,
+     ThreadPriority threadPriority)
+ {
+     return new VelocityAccelerator(context, this, threadPriority);
+ }
+
+ #endregion
+
+ #region Object
+
+ /// <summary>
+ /// Two Velocity devices are equal if they have the same amount of shared memory
+ /// per group and the base class considers them equal.
+ /// </summary>
+ /// <param name="obj">The object to compare with.</param>
+ /// <returns>True if both devices are considered equal.</returns>
+ public override bool Equals(object obj)
+ {
+     if (!(obj is VelocityDevice other))
+         return false;
+     if (other.MaxSharedMemoryPerGroup != MaxSharedMemoryPerGroup)
+         return false;
+     return base.Equals(obj);
+ }
+
+ /// <summary>
+ /// Combines the base hash code with the amount of shared memory per group.
+ /// </summary>
+ /// <returns>The hash code of this device.</returns>
+ public override int GetHashCode()
+ {
+     int baseHash = base.GetHashCode();
+     return baseHash ^ MaxSharedMemoryPerGroup;
+ }
+
+ #endregion
+ }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs
new file mode 100644
index 000000000..41a6c2d0e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs
@@ -0,0 +1,80 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityKernel.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends.Velocity;
+using System.Reflection;
+
+namespace ILGPU.Runtime.Velocity
+{
+ /// <summary>
+ /// Represents a single Velocity kernel.
+ /// </summary>
+ public sealed class VelocityKernel : Kernel
+ {
+     #region Static
+
+     /// <summary>
+     /// Reflection handle of the <see cref="KernelEntryPoint"/> property getter.
+     /// </summary>
+     internal static readonly MethodInfo GetKernelExecutionDelegate =
+         typeof(VelocityKernel)
+             .GetProperty(
+                 nameof(KernelEntryPoint),
+                 BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic)
+             .GetGetMethod(nonPublic: true);
+
+     #endregion
+
+     #region Instance
+
+     /// <summary>
+     /// Creates a Velocity kernel from a compiled kernel and builds its kernel
+     /// entry point delegate.
+     /// </summary>
+     /// <param name="accelerator">The associated accelerator.</param>
+     /// <param name="kernel">The source kernel.</param>
+     /// <param name="launcher">The launcher method for the given kernel.</param>
+     internal VelocityKernel(
+         VelocityAccelerator accelerator,
+         VelocityCompiledKernel kernel,
+         MethodInfo launcher)
+         : base(accelerator, kernel, launcher) =>
+         KernelEntryPoint = kernel.CreateKernelEntryPoint();
+
+     #endregion
+
+     #region Properties
+
+     /// <summary>
+     /// Returns the associated Velocity accelerator.
+     /// </summary>
+     public VelocityAccelerator VelocityAccelerator
+     {
+         get { return Accelerator as VelocityAccelerator; }
+     }
+
+     /// <summary>
+     /// The main kernel entry point function to be called from each velocity
+     /// multiprocessor during execution.
+     /// </summary>
+     internal VelocityKernelEntryPoint KernelEntryPoint { get; }
+
+     #endregion
+
+     #region IDisposable
+
+     /// <summary>
+     /// Does not perform any operation.
+     /// </summary>
+     protected override void DisposeAcceleratorObject(bool disposing)
+     {
+     }
+
+     #endregion
+ }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs
new file mode 100644
index 000000000..e9c640a0e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs
@@ -0,0 +1,151 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2017-2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityMemoryBuffer.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Runtime.CPU;
+using ILGPU.Util;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+ ///
+ /// A memory buffer that lives in CPU space.
+ ///
+ public class VelocityMemoryBuffer : MemoryBuffer
+ {
+ #region Instance
+
+ /// <summary>
+ /// Allocates a native CPU buffer that is large enough for all elements and
+ /// whose start address is aligned to one full warp of elements.
+ /// </summary>
+ /// <param name="accelerator">
+ /// The parent accelerator; its warp size determines the alignment.
+ /// </param>
+ /// <param name="length">The length of this source in elements.</param>
+ /// <param name="elementSize">The element size in bytes.</param>
+ /// <exception cref="OverflowException">
+ /// Thrown if the required size in bytes does not fit into a 64-bit integer.
+ /// </exception>
+ internal VelocityMemoryBuffer(
+     Accelerator accelerator,
+     long length,
+     int elementSize)
+     : base(accelerator, length, elementSize)
+ {
+     // Ensure that all element accesses will be properly aligned: one full
+     // warp of elements
+     long alignment = Math.Max(1L, (long)elementSize * accelerator.WarpSize);
+
+     // The payload size in BYTES (length is given in elements)
+     long lengthInBytes = checked(length * elementSize);
+
+     // Pad the payload to a multiple of the alignment and reserve additional
+     // room for moving the start address to the next aligned address
+     long remainder = lengthInBytes % alignment;
+     long tailPadding = remainder == 0 ? 0 : alignment - remainder;
+     long paddedLength = checked(lengthInBytes + tailPadding + alignment);
+
+     // Allocate resources
+     NativeBufferPtr = Marshal.AllocHGlobal(new IntPtr(paddedLength));
+
+     // Derive the aligned pointer from the actual address of the allocation
+     long address = NativeBufferPtr.ToInt64();
+     long misalignment = address % alignment;
+     if (misalignment < 0)
+         misalignment += alignment;
+     long alignmentOffset = misalignment == 0 ? 0 : alignment - misalignment;
+     NativePtr = new IntPtr(address + alignmentOffset);
+ }
+
+ #endregion
+
+ #region Properties
+
+ /// <summary>
+ /// Returns the natively allocated underlying buffer pointer which may not be
+ /// aligned in all cases. This is the pointer that is freed on disposal.
+ /// </summary>
+ public IntPtr NativeBufferPtr { get; private set; }
+
+ #endregion
+
+ #region Methods
+
+ /// <summary>
+ /// Sets all bytes of the target view to the given value using
+ /// <c>CPUMemoryBuffer.CPUMemSet</c>. The stream is not used.
+ /// </summary>
+ /// <param name="stream">The stream (unused).</param>
+ /// <param name="value">The byte value to write.</param>
+ /// <param name="targetView">The view to fill.</param>
+ protected internal override void MemSet(
+ AcceleratorStream stream,
+ byte value,
+ in ArrayView targetView) =>
+ CPUMemoryBuffer.CPUMemSet(
+ targetView.LoadEffectiveAddressAsPtr(),
+ value,
+ 0L,
+ targetView.LengthInBytes);
+
+ /// <summary>
+ /// Copies from the source view into the target view by delegating to
+ /// <c>CPUMemoryBuffer.CPUCopyFrom</c>.
+ /// </summary>
+ /// <param name="stream">The stream to use.</param>
+ /// <param name="sourceView">The source view.</param>
+ /// <param name="targetView">The target view.</param>
+ protected internal override void CopyFrom(
+ AcceleratorStream stream,
+ in ArrayView sourceView,
+ in ArrayView targetView) =>
+ CPUMemoryBuffer.CPUCopyFrom(stream, sourceView, targetView);
+
+ /// <summary>
+ /// Copies from the source view into the target view by delegating to
+ /// <c>CPUMemoryBuffer.CPUCopyTo</c>.
+ /// </summary>
+ /// <param name="stream">The stream to use.</param>
+ /// <param name="sourceView">The source view.</param>
+ /// <param name="targetView">The target view.</param>
+ protected internal override void CopyTo(
+ AcceleratorStream stream,
+ in ArrayView sourceView,
+ in ArrayView targetView) =>
+ CPUMemoryBuffer.CPUCopyTo(stream, sourceView, targetView);
+
+ #endregion
+
+ #region IDisposable
+
+ /// <summary>
+ /// Frees the native allocation and clears both pointers.
+ /// </summary>
+ /// <param name="disposing">Not evaluated; the memory is always released.</param>
+ protected override void DisposeAcceleratorObject(bool disposing)
+ {
+     Marshal.FreeHGlobal(NativeBufferPtr);
+
+     // Clear the raw pointer first, then the pointer derived from it
+     NativePtr = NativeBufferPtr = IntPtr.Zero;
+ }
+
+ #endregion
+
+ }
+
+ /// <summary>
+ /// A simple bump allocator on top of a <see cref="VelocityMemoryBuffer"/>: chunks
+ /// are handed out one after another until <see cref="Reset"/> rewinds the pool.
+ /// </summary>
+ sealed class VelocityMemoryBufferPool : VelocityMemoryBuffer
+ {
+     #region Instance
+
+     // Offset of the first unused byte of the pool
+     private volatile int sharedMemoryOffset;
+     private readonly int warpSize;
+     // Total size of the pool in bytes (the element size of the pool is 1)
+     private readonly int capacity;
+
+     /// <summary>
+     /// Creates a new pool of the given size.
+     /// </summary>
+     /// <param name="accelerator">The parent accelerator.</param>
+     /// <param name="size">The size of the pool in bytes.</param>
+     public VelocityMemoryBufferPool(
+         VelocityAccelerator accelerator,
+         int size)
+         : base(accelerator, size, 1)
+     {
+         warpSize = accelerator.WarpSize;
+         capacity = size;
+     }
+
+     #endregion
+
+     #region Methods
+
+     /// <summary>
+     /// Rewinds the pool so that the next allocation starts at offset 0 again.
+     /// </summary>
+     public void Reset() =>
+         Interlocked.Exchange(ref sharedMemoryOffset, 0);
+
+     /// <summary>
+     /// Gets a chunk of memory of a certain type. The chunk provides room for
+     /// <paramref name="length"/> elements for every lane of a warp.
+     /// </summary>
+     /// <param name="length">The number of elements per lane.</param>
+     /// <typeparam name="T">The element type to allocate.</typeparam>
+     /// <returns>A view pointing to the right chunk of pool memory.</returns>
+     /// <exception cref="InvalidOperationException">
+     /// Thrown if the pool does not have enough memory left.
+     /// </exception>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public ArrayView<T> Allocate<T>(int length)
+         where T : unmanaged
+     {
+         int totalElementSize = length * Interop.SizeOf<T>() * warpSize;
+         int alignment = Interop.ComputeAlignmentOffset(
+             sharedMemoryOffset,
+             totalElementSize);
+
+         // Reserve the padding AND the chunk itself; reserving the padding only
+         // would make all subsequent allocations alias this chunk
+         int endOffset = Interlocked.Add(
+             ref sharedMemoryOffset,
+             alignment + totalElementSize);
+         if (endOffset < 0 || endOffset > capacity)
+         {
+             throw new InvalidOperationException(
+                 "The Velocity memory pool does not have enough memory left");
+         }
+
+         // NOTE(review): as in the original code, the start offset of the chunk
+         // is passed as the index of the view - confirm the unit expected by
+         // the ArrayView constructor
+         int startOffset = endOffset - totalElementSize;
+         return new ArrayView<T>(this, startOffset, length * warpSize);
+     }
+
+     #endregion
+ }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs b/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs
new file mode 100644
index 000000000..98d40de2e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs
@@ -0,0 +1,402 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityMultiprocessor.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Util;
+using System;
+using System.Collections.Immutable;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+ /// <summary>
+ /// Represents a single velocity kernel processing delegate.
+ /// </summary>
+ /// <param name="globalStartIndex">The start index within the thread grid.</param>
+ /// <param name="globalEndIndex">The end index within the thread grid.</param>
+ /// <param name="parameters">The current parameters.</param>
+ delegate void VelocityKernelEntryPoint(
+ int globalStartIndex,
+ int globalEndIndex,
+ VelocityParameters parameters);
+
+ ///
+ /// A single velocity multiprocessor consisting of a single processing thread and
+ /// a runtime context.
+ ///
+ sealed class VelocityMultiprocessor : DisposeBase
+ {
+ #region Static
+
+ /// <summary>
+ /// All kernel handler types required to launch a kernel delegate on this MP.
+ /// The list mirrors the parameter list of <see cref="VelocityKernelEntryPoint"/>:
+ /// two integers followed by the parameters object.
+ /// </summary>
+ public static readonly ImmutableArray KernelHandlerTypes =
+ ImmutableArray.Create(
+ typeof(int),
+ typeof(int),
+ typeof(VelocityParameters));
+
+ /// <summary>
+ /// The multiprocessor that is bound to the current thread (one value per
+ /// thread; assigned by the processing thread when it starts).
+ /// </summary>
+ [ThreadStatic]
+ private static VelocityMultiprocessor current;
+
+ /// <summary>
+ /// Returns the parent velocity multiprocessor for the current thread.
+ /// </summary>
+ /// <returns>
+ /// The parent multiprocessor for the current thread, or null if no
+ /// multiprocessor has been assigned on this thread.
+ /// </returns>
+ public static VelocityMultiprocessor GetCurrent()
+ {
+     return current;
+ }
+
+ /// <summary>
+ /// Allocates a chunk of shared memory from the pool of the multiprocessor that
+ /// is bound to the current thread.
+ /// </summary>
+ /// <param name="length">The number of elements to allocate.</param>
+ /// <returns>A velocity warp made of shared-memory pointers.</returns>
+ public static VelocityWarp64 GetSharedMemory(int length)
+ where T : unmanaged
+ {
+ var sharedMemoryView = GetCurrent().GetSharedMemoryFromPool(length);
+ // Broadcast the base address of the chunk to all lanes ...
+ long intPtr = sharedMemoryView.LoadEffectiveAddressAsPtr().ToInt64();
+ var addresses = VelocityWarp64.GetConstI(intPtr);
+ // ... and add the lane index vector to it.
+ // NOTE(review): the lane index is added unscaled - confirm that this is the
+ // intended per-lane offset.
+ return addresses.AddU(VelocityWarp64.LaneIndexVector);
+ }
+
+ /// <summary>
+ /// Allocates a chunk of local memory from the pool of the multiprocessor that
+ /// is bound to the current thread.
+ /// </summary>
+ /// <param name="length">The number of elements to allocate.</param>
+ /// <returns>A velocity warp made of local-memory pointers.</returns>
+ public static VelocityWarp64 GetLocalMemory(int length)
+ where T : unmanaged
+ {
+ // Despite its name, this view refers to the LOCAL memory pool
+ var sharedMemoryView = GetCurrent().GetLocalMemoryFromPool(length);
+ // Broadcast the base address of the chunk to all lanes ...
+ long intPtr = sharedMemoryView.LoadEffectiveAddressAsPtr().ToInt64();
+ var addresses = VelocityWarp64.GetConstI(intPtr);
+ // ... and add the lane index vector to it.
+ // NOTE(review): the lane index is added unscaled - confirm that this is the
+ // intended per-lane offset.
+ return addresses.AddU(VelocityWarp64.LaneIndexVector);
+ }
+
+ /// <summary>
+ /// Returns the current grid indices for all warp lanes of the multiprocessor
+ /// that is bound to the current thread.
+ /// </summary>
+ /// <returns>A velocity warp made of grid indices.</returns>
+ public static VelocityWarp32 GetCurrentGridIdx()
+ {
+     var multiprocessor = GetCurrent();
+     return multiprocessor.GridIdx;
+ }
+
+ /// <summary>
+ /// Sets the current grid index of the multiprocessor that is bound to the
+ /// current thread by advancing to the given offset.
+ /// </summary>
+ /// <param name="offset">The offset to advance to.</param>
+ public static void SetCurrentGridIdx(int offset)
+ {
+     var multiprocessor = GetCurrent();
+     multiprocessor.ResetGridIndex(offset);
+ }
+
+ /// <summary>
+ /// Returns the current grid dimension, broadcast to all warp lanes, of the
+ /// multiprocessor that is bound to the current thread.
+ /// </summary>
+ /// <returns>A velocity warp made of the current grid dimension.</returns>
+ public static VelocityWarp32 GetCurrentGridDim()
+ {
+     int gridDimension = GetCurrent().GridDim;
+     return VelocityWarp32.GetConstI(gridDimension);
+ }
+
+ /// <summary>
+ /// Returns the current user defined grid dimension (which may not be a multiple
+ /// of the group size used on this multiprocessor), broadcast to all warp lanes.
+ /// </summary>
+ /// <returns>A velocity warp made of the current user grid dimension.</returns>
+ public static VelocityWarp32 GetCurrentUserGridDim()
+ {
+     int userGridDimension = GetCurrent().UserGridDim;
+     return VelocityWarp32.GetConstI(userGridDimension);
+ }
+
+ /// <summary>
+ /// Returns the current group dimension, broadcast to all warp lanes, of the
+ /// multiprocessor that is bound to the current thread.
+ /// </summary>
+ /// <returns>A velocity warp made of the current group dimension.</returns>
+ public static VelocityWarp32 GetCurrentGroupDim()
+ {
+     int groupDimension = GetCurrent().GroupDim;
+     return VelocityWarp32.GetConstI(groupDimension);
+ }
+
+ /// <summary>
+ /// Returns a handle to the generic method definition of
+ /// <c>GetSharedMemory</c>.
+ /// </summary>
+ /// <remarks>
+ /// The reflection name of a generic method does not carry an arity suffix (only
+ /// generic type names do), so the method is looked up by its plain name.
+ /// </remarks>
+ public static readonly MethodInfo GetSharedMemoryMethodInfo =
+     typeof(VelocityMultiprocessor).GetMethod(
+         nameof(GetSharedMemory),
+         BindingFlags.Public | BindingFlags.Static);
+
+ /// <summary>
+ /// Returns a handle to the generic method definition of
+ /// <c>GetLocalMemory</c>.
+ /// </summary>
+ /// <remarks>
+ /// The reflection name of a generic method does not carry an arity suffix (only
+ /// generic type names do), so the method is looked up by its plain name.
+ /// </remarks>
+ public static readonly MethodInfo GetLocalMemoryMethodInfo =
+     typeof(VelocityMultiprocessor).GetMethod(
+         nameof(GetLocalMemory),
+         BindingFlags.Public | BindingFlags.Static);
+
+ /// <summary>
+ /// Returns a handle to the <see cref="GetCurrentGridIdx"/> method.
+ /// </summary>
+ public static readonly MethodInfo GetCurrentGridIdxMethodInfo =
+ typeof(VelocityMultiprocessor).GetMethod(
+ nameof(GetCurrentGridIdx),
+ BindingFlags.Public | BindingFlags.Static);
+
+ /// <summary>
+ /// Returns a handle to the <see cref="SetCurrentGridIdx"/> method.
+ /// </summary>
+ public static readonly MethodInfo SetCurrentGridIdxMethodInfo =
+ typeof(VelocityMultiprocessor).GetMethod(
+ nameof(SetCurrentGridIdx),
+ BindingFlags.Public | BindingFlags.Static);
+
+ /// <summary>
+ /// Returns a handle to the <see cref="GetCurrentGridDim"/> method.
+ /// </summary>
+ public static readonly MethodInfo GetCurrentGridDimMethodInfo =
+ typeof(VelocityMultiprocessor).GetMethod(
+ nameof(GetCurrentGridDim),
+ BindingFlags.Public | BindingFlags.Static);
+
+ /// <summary>
+ /// Returns a handle to the <see cref="GetCurrentUserGridDim"/> method.
+ /// </summary>
+ public static readonly MethodInfo GetCurrentUserGridDimMethodInfo =
+ typeof(VelocityMultiprocessor).GetMethod(
+ nameof(GetCurrentUserGridDim),
+ BindingFlags.Public | BindingFlags.Static);
+
+ /// <summary>
+ /// Returns a handle to the <see cref="GetCurrentGroupDim"/> method.
+ /// </summary>
+ public static readonly MethodInfo GetCurrentGroupDimMethodInfo =
+ typeof(VelocityMultiprocessor).GetMethod(
+ nameof(GetCurrentGroupDim),
+ BindingFlags.Public | BindingFlags.Static);
+
+ #endregion
+
+ #region Events
+
+ /// <summary>
+ /// Will be raised once a chunk of a scheduled thread grid has been completed.
+ /// The processing thread invokes it without a null check, so a handler has to be
+ /// attached before the first chunk is scheduled.
+ /// </summary>
+ public Action ProcessingCompleted;
+
+ #endregion
+
+ #region Instance
+
+ // Thread data: the processing thread and the semaphore that wakes it up once
+ // work has been scheduled (or the multiprocessor is shutting down)
+ private readonly Thread runtimeThread;
+ private readonly SemaphoreSlim startProcessingSema;
+
+ // Context data: the memory pools that back shared- and local-memory allocations
+ private readonly VelocityMemoryBufferPool sharedMemoryPool;
+ private readonly VelocityMemoryBufferPool localMemoryPool;
+
+ // Runtime data: written by Run on the scheduling thread and read by the
+ // processing thread in DoWork
+ private volatile VelocityKernelEntryPoint kernelHandler;
+ private volatile int startIndexRange;
+ private volatile int endIndexRange;
+ private volatile VelocityParameters kernelParameters;
+ // Cleared by Dispose to make the processing thread leave its loop
+ private volatile bool running = true;
+
+ /// <summary>
+ /// Initializes a new velocity multiprocessor and starts its processing thread.
+ /// </summary>
+ /// <param name="accelerator">The parent velocity accelerator.</param>
+ /// <param name="processorIndex">The current processor index.</param>
+ internal VelocityMultiprocessor(
+     VelocityAccelerator accelerator,
+     int processorIndex)
+ {
+     runtimeThread = new Thread(DoWork)
+     {
+         Priority = accelerator.ThreadPriority,
+         IsBackground = true,
+         Name = $"ILGPU_{accelerator.InstanceId}_Velocity_{processorIndex}"
+     };
+     startProcessingSema = new SemaphoreSlim(0);
+     sharedMemoryPool = new VelocityMemoryBufferPool(
+         accelerator,
+         accelerator.MaxSharedMemoryPerGroup);
+     localMemoryPool = new VelocityMemoryBufferPool(
+         accelerator,
+         accelerator.MaxLocalMemoryPerThread);
+     WarpSize = accelerator.WarpSize;
+     ProcessorIndex = processorIndex;
+
+     // Start the processing thread once all fields have been initialized. The
+     // thread immediately blocks on the semaphore until work is scheduled;
+     // without starting it, no kernel is ever executed and Dispose fails when
+     // joining a thread that has never been started.
+     runtimeThread.Start();
+ }
+
+ #endregion
+
+ #region Properties
+
+ /// <summary>
+ /// Returns the current warp size (taken from the parent accelerator).
+ /// </summary>
+ public int WarpSize { get; }
+
+ /// <summary>
+ /// Returns the multiprocessor index.
+ /// </summary>
+ public int ProcessorIndex { get; }
+
+ /// <summary>
+ /// Returns the precomputed grid indices for all lanes in the current
+ /// multiprocessor.
+ /// </summary>
+ public VelocityWarp32 GridIdx { get; private set; }
+
+ /// <summary>
+ /// Returns the current grid dimension (updated by every call to Run).
+ /// </summary>
+ public int GridDim { get; private set; }
+
+ /// <summary>
+ /// Returns the current user grid dimension (updated by every call to Run).
+ /// </summary>
+ public int UserGridDim { get; private set; }
+
+ /// <summary>
+ /// Returns the current group dimension (updated by every call to Run).
+ /// </summary>
+ public int GroupDim { get; private set; }
+
+ #endregion
+
+ #region Methods
+
+ /// <summary>
+ /// Resets the current grid index: every lane receives its lane index plus a
+ /// base index derived from the warp size, the processor index and the offset.
+ /// </summary>
+ /// <param name="offset">The offset that is added to the base index.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ResetGridIndex(int offset)
+ {
+     // The index of the first lane of this multiprocessor
+     int firstLaneIndex = WarpSize * ProcessorIndex + offset;
+     var laneIndices = VelocityWarp32.LaneIndexVector;
+     var baseIndices = VelocityWarp32.GetConstI(firstLaneIndex);
+     GridIdx = laneIndices.AddI(baseIndices);
+ }
+
+ /// <summary>
+ /// Gets a chunk of shared memory of a certain type from the shared-memory pool
+ /// of this multiprocessor.
+ /// </summary>
+ /// <param name="length">The number of elements.</param>
+ /// <returns>A view pointing to the right chunk of shared memory.</returns>
+ public ArrayView GetSharedMemoryFromPool(int length)
+ where T : unmanaged =>
+ sharedMemoryPool.Allocate(length);
+
+ /// <summary>
+ /// Gets a chunk of local memory of a certain type from the local-memory pool of
+ /// this multiprocessor.
+ /// </summary>
+ /// <param name="length">The number of elements.</param>
+ /// <returns>A view pointing to the right chunk of local memory.</returns>
+ public ArrayView GetLocalMemoryFromPool(int length)
+ where T : unmanaged =>
+ localMemoryPool.Allocate(length);
+
+ /// <summary>
+ /// Dispatches a new kernel execution: publishes the work description, resets
+ /// both memory pools and wakes up the processing thread. The method does not
+ /// wait for the kernel to finish.
+ /// </summary>
+ /// <param name="handler">The kernel handler delegate.</param>
+ /// <param name="startIndex">The start interval index.</param>
+ /// <param name="endIndex">The end interval index.</param>
+ /// <param name="gridDimension">The current grid dimension.</param>
+ /// <param name="userGridDimension">The grid dimension defined by the user.</param>
+ /// <param name="groupDimension">The current group dimension.</param>
+ /// <param name="parameters">All kernel parameters.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public void Run(
+ VelocityKernelEntryPoint handler,
+ int startIndex,
+ int endIndex,
+ int gridDimension,
+ int userGridDimension,
+ int groupDimension,
+ VelocityParameters parameters)
+ {
+ GridDim = gridDimension;
+ UserGridDim = userGridDimension;
+ GroupDim = groupDimension;
+
+ // Note that we do not have to invoke
+ // ResetGridIndex(offset: 0);
+ // here, as this method will be automatically invoked by each Velocity kernel
+
+ // Schedule this operation
+ kernelHandler = handler;
+ startIndexRange = startIndex;
+ endIndexRange = endIndex;
+ kernelParameters = parameters;
+ // Memory handed out during the previous launch can be reused from now on
+ sharedMemoryPool.Reset();
+ localMemoryPool.Reset();
+
+ // Ensure visibility of all changes to other threads
+ Thread.MemoryBarrier();
+
+ // Launch the processing task
+ startProcessingSema.Release();
+ }
+
+ /// <summary>
+ /// The main processing thread of this multiprocessor: waits for scheduled work,
+ /// executes the kernel handler and reports completion, until the running flag
+ /// has been cleared.
+ /// </summary>
+ private void DoWork()
+ {
+ // Assign the current multiprocessor to this instance
+ current = this;
+
+ // Process all tasks
+ while (true)
+ {
+ // Wait for the next task to arrive
+ startProcessingSema.Wait();
+
+ // Break the loop if we are shutting down
+ if (!running)
+ break;
+
+ // Launch the actual kernel method
+ kernelHandler(startIndexRange, endIndexRange, kernelParameters);
+
+ // Signal the main thread that the processing has been completed. Note
+ // that we avoid any null checks at this point
+ ProcessingCompleted(this);
+ }
+ }
+
+ #endregion
+
+ #region IDisposable
+
+ /// <summary>
+ /// Waits for the processing thread to shutdown and disposes all internal thread
+ /// objects.
+ /// </summary>
+ /// <param name="disposing">
+ /// True if managed resources should be released.
+ /// </param>
+ protected override void Dispose(bool disposing)
+ {
+     if (disposing)
+         ShutdownAndRelease();
+     base.Dispose(disposing);
+ }
+
+ /// <summary>
+ /// Stops the processing thread and disposes the semaphore and both pools.
+ /// </summary>
+ private void ShutdownAndRelease()
+ {
+     // Clear the flag first, then wake up the thread so that it can observe the
+     // cleared flag and leave its loop
+     running = false;
+     startProcessingSema.Release();
+     runtimeThread.Join();
+
+     startProcessingSema.Dispose();
+     sharedMemoryPool.Dispose();
+     localMemoryPool.Dispose();
+ }
+
+ #endregion
+ }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs
new file mode 100644
index 000000000..bd38b034c
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs
@@ -0,0 +1,27 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityParameters.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// The common abstract base class of all Velocity parameter types.
+    /// </summary>
+    internal abstract class VelocityParameters
+    {
+        /// <summary>
+        /// Creates the base instance; there is no state to initialize at the moment.
+        /// NOTE(review): presumably kept public for subclasses created elsewhere;
+        /// confirm how it is looked up before restricting its visibility.
+        /// </summary>
+        public VelocityParameters() { }
+    }
+}
+
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityStream.cs b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs
new file mode 100644
index 000000000..9adfe3cc9
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs
@@ -0,0 +1,74 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityStream.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Runtime.CPU;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// The accelerator stream implementation of the Velocity runtime.
+    /// </summary>
+    internal sealed class VelocityStream : AcceleratorStream
+    {
+        #region Static
+
+        /// <summary>
+        /// The default stream; built by the constructor that takes no accelerator.
+        /// </summary>
+        internal static readonly VelocityStream Default = new VelocityStream();
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Creates a new Velocity stream without passing an accelerator to the base.
+        /// </summary>
+        private VelocityStream() { }
+
+        /// <summary>
+        /// Creates a new Velocity stream for the given accelerator.
+        /// </summary>
+        /// <param name="accelerator">The associated accelerator.</param>
+        internal VelocityStream(Accelerator accelerator) : base(accelerator) { }
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// Intentionally empty; synchronizing this stream performs no operation.
+        /// </summary>
+        public override void Synchronize() { }
+
+        /// <inheritdoc/>
+        protected override unsafe ProfilingMarker AddProfilingMarkerInternal()
+        {
+            using (Accelerator.BindScoped())
+            {
+                return new CPUProfilingMarker(Accelerator);
+            }
+        }
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Intentionally empty; this class declares no resources of its own.
+        /// </summary>
+        protected override void DisposeAcceleratorObject(bool disposing) { }
+
+        #endregion
+    }
+}
+
+