diff --git a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
new file mode 100644
index 000000000..640f48cf6
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
@@ -0,0 +1,468 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityAccelerator.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends;
+using ILGPU.Backends.IL;
+using ILGPU.Backends.Velocity;
+using ILGPU.Resources;
+using ILGPU.Runtime.CPU;
+using System;
+using System.Diagnostics;
+using System.Reflection;
+using System.Reflection.Emit;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// A SIMD-enabled CPU-based accelerator.
+    /// </summary>
+    public sealed class VelocityAccelerator : Accelerator
+    {
+        #region Static
+
+        /// <summary>
+        /// The reflected <see cref="Run(KernelConfig, RuntimeKernelConfig,
+        /// VelocityKernelEntryPoint, VelocityParameters)"/> method that is called by
+        /// the generated kernel launcher methods to launch kernels.
+        /// </summary>
+        private static readonly MethodInfo RunMethodInfo =
+            typeof(VelocityAccelerator).GetMethod(
+                nameof(Run),
+                BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
+
+        #endregion
+
+        #region Instance
+
+        private readonly VelocityMultiprocessor[] multiprocessors;
+        private readonly SemaphoreSlim taskConcurrencyLimit = new SemaphoreSlim(1);
+        private readonly Barrier multiprocessorBarrier;
+
+        /// <summary>
+        /// Creates a Velocity accelerator that operates on the given device.
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <param name="device">The Velocity device.</param>
+        /// <param name="threadPriority">
+        /// The thread priority of the execution threads.
+        /// </param>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the device does not operate in little endian mode.
+        /// </exception>
+        internal VelocityAccelerator(
+            Context context,
+            VelocityDevice device,
+            ThreadPriority threadPriority)
+            : base(context, device)
+        {
+            // Devices that are not little endian are rejected
+            if (!device.IsLittleEndian)
+            {
+                throw new NotSupportedException(
+                    RuntimeErrorMessages.VelocityLittleEndian);
+            }
+
+            int numMPs = device.NumMultiprocessors;
+
+            // The launching thread participates in the barrier in addition to all
+            // multiprocessors
+            multiprocessors = new VelocityMultiprocessor[numMPs];
+            multiprocessorBarrier = new Barrier(numMPs + 1);
+
+            // Publish the basic accelerator configuration
+            ThreadPriority = threadPriority;
+            MaxLocalMemoryPerThread = device.MaxLocalMemoryPerThread;
+            NumThreads = device.WarpSize * numMPs;
+
+            // Create all multiprocessors and register the shared completion handler
+            Action<VelocityMultiprocessor> completionHandler = OnProcessingCompleted;
+            for (int mpIndex = 0; mpIndex < numMPs; ++mpIndex)
+            {
+                var processor = new VelocityMultiprocessor(this, mpIndex);
+                processor.ProcessingCompleted += completionHandler;
+                multiprocessors[mpIndex] = processor;
+            }
+
+            // Setup the Velocity backend this accelerator relies on
+            var backend = new VelocityBackend<ILEmitter>(
+                context,
+                new CPUCapabilityContext(),
+                WarpSize,
+                new VelocityArgumentMapper(context));
+            Init(backend);
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the Velocity backend of this accelerator. The base backend is
+        /// converted via an <c>as</c> cast, so the result is null if the base backend
+        /// is not a Velocity backend.
+        /// </summary>
+        internal new VelocityBackend<ILEmitter> Backend =>
+            base.Backend as VelocityBackend<ILEmitter>;
+
+        /// <summary>
+        /// Returns the thread priority that has been passed to the constructor.
+        /// </summary>
+        public ThreadPriority ThreadPriority { get; }
+
+        /// <summary>
+        /// Returns the maximum local memory per thread in bytes (taken from the
+        /// associated Velocity device).
+        /// </summary>
+        public int MaxLocalMemoryPerThread { get; }
+
+        /// <summary>
+        /// Returns the maximum number of parallel threads, which is the warp size of
+        /// the device multiplied by its number of multiprocessors.
+        /// </summary>
+        public int NumThreads { get; }
+
+        #endregion
+
+        #region Launch Methods
+
+        /// <summary>
+        /// Main internal run method to launch loaded kernels. The grid is split into
+        /// contiguous chunks that are handed to the active multiprocessors; the
+        /// method returns after the multiprocessor barrier has been passed.
+        /// </summary>
+        /// <param name="userKernelConfig">The user-defined kernel config.</param>
+        /// <param name="runtimeKernelConfig">
+        /// The actual runtime kernel config to be used for launching.
+        /// </param>
+        /// <param name="kernelHandler">The kernel entry point handler.</param>
+        /// <param name="velocityParameters">
+        /// The kernel parameters that are passed on to each active multiprocessor.
+        /// </param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal void Run(
+            KernelConfig userKernelConfig,
+            RuntimeKernelConfig runtimeKernelConfig,
+            VelocityKernelEntryPoint kernelHandler,
+            VelocityParameters velocityParameters)
+        {
+            // Avoid concurrent executions of kernels.. we have to wait for the current
+            // kernel to finish first
+            taskConcurrencyLimit.Wait();
+            try
+            {
+                int groupSize = runtimeKernelConfig.GroupDim.Size;
+                int gridSize = runtimeKernelConfig.GridDim.Size;
+
+                // An empty grid does not contain any work to distribute
+                if (gridSize < 1)
+                    return;
+
+                // Distribute the workload. At least one multiprocessor has to be
+                // active: grids with fewer groups than multiprocessors would
+                // otherwise yield zero active multiprocessors and a division by zero
+                // while computing the chunk size.
+                int numActiveMPs = Math.Max(
+                    1,
+                    Math.Min(gridSize / NumMultiprocessors, NumMultiprocessors));
+                int chunkSizePerMP = IntrinsicMath.DivRoundUp(gridSize, numActiveMPs);
+
+                // Start the multiprocessor journey
+                for (int i = 0; i < numActiveMPs; ++i)
+                {
+                    // The end index is inclusive and clamped to the last grid index
+                    int startIndex = i * chunkSizePerMP;
+                    int endIndex = Math.Min(startIndex + chunkSizePerMP, gridSize) - 1;
+                    multiprocessors[i].Run(
+                        kernelHandler,
+                        startIndex,
+                        endIndex,
+                        gridSize,
+                        userKernelConfig.GridDim.Size,
+                        groupSize,
+                        velocityParameters);
+                }
+
+                // Wait for all multiprocessors to finish
+                // NOTE(review): the barrier has been created with one participant per
+                // multiprocessor plus the launching thread. Confirm that inactive
+                // multiprocessors signal the barrier as well when fewer than all
+                // multiprocessors have been started above.
+                multiprocessorBarrier.SignalAndWait();
+            }
+            finally
+            {
+                taskConcurrencyLimit.Release();
+            }
+        }
+
+        /// <summary>
+        /// Handler that is registered with the ProcessingCompleted event of each
+        /// multiprocessor. It signals the multiprocessor barrier and waits for the
+        /// remaining barrier participants.
+        /// </summary>
+        /// <param name="processor">The reporting multiprocessor (unused).</param>
+        private void OnProcessingCompleted(VelocityMultiprocessor processor) =>
+            multiprocessorBarrier.SignalAndWait();
+
+        /// <summary>
+        /// Generates a dynamic kernel-launcher method that will be just-in-time compiled
+        /// during the first invocation. Using the generated launcher lowers the overhead
+        /// for kernel launching dramatically, since unnecessary operations (like boxing)
+        /// can be avoided.
+        /// </summary>
+        /// <param name="kernel">The kernel to generate a launcher for.</param>
+        /// <param name="customGroupSize">
+        /// The custom group size for the launching operation.
+        /// </param>
+        /// <returns>The generated launcher method.</returns>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the kernel entry point has by-ref parameters.
+        /// </exception>
+        private MethodInfo GenerateKernelLauncherMethod(
+            VelocityCompiledKernel kernel,
+            int customGroupSize)
+        {
+            var entryPoint = kernel.EntryPoint;
+            AdjustAndVerifyKernelGroupSize(ref customGroupSize, entryPoint);
+
+            // By-ref kernel parameters are not supported and will be rejected
+            if (entryPoint.HasByRefParameters)
+            {
+                throw new NotSupportedException(
+                    ErrorMessages.NotSupportedByRefKernelParameters);
+            }
+
+            // Declare a new launcher method
+            using var scopedLock = entryPoint.CreateLauncherMethod(
+                Context.RuntimeSystem,
+                out var launcher);
+            var emitter = new ILEmitter(launcher.ILGenerator);
+
+            // Map all arguments to an argument structure containing mapped views
+            // NOTE(review): the returned local is not used in the remainder of this
+            // method - confirm whether the mapping is still required here.
+            var argumentMapper = Backend.ArgumentMapper;
+            var (structLocal, _) = argumentMapper.Map(emitter, entryPoint);
+
+            // Load the kernel instance argument and keep it in a local
+            var velocityKernel = emitter.DeclareLocal(typeof(VelocityKernel));
+            KernelLauncherBuilder.EmitLoadKernelArgument<VelocityKernel, ILEmitter>(
+                Kernel.KernelInstanceParamIdx, emitter);
+            emitter.Emit(LocalOperation.Store, velocityKernel);
+
+            // Create an instance of the custom parameters type
+            var parametersInstance = emitter.DeclareLocal(kernel.ParametersType);
+            {
+                // Assign parameters
+                var parameters = entryPoint.Parameters;
+                for (int i = 0, e = parameters.Count; i < e; ++i)
+                {
+                    // Load the current argument onto the stack
+                    emitter.Emit(ArgumentOperation.Load, i + Kernel.KernelParameterOffset);
+                    // NOTE(review): entry points with by-ref parameters have been
+                    // rejected above, so this branch looks unreachable - confirm.
+                    if (parameters.IsByRef(i))
+                        emitter.Emit(OpCodes.Ldobj, parameters[i]);
+                }
+
+                // Create the parameters object using all loaded arguments
+                emitter.EmitNewObject(kernel.ParametersTypeConstructor);
+
+                // Store the parameters object
+                emitter.Emit(LocalOperation.Store, parametersInstance);
+            }
+
+            // NOTE(review): Run is an instance method, but no code in this method
+            // visibly loads the accelerator instance before the call arguments.
+            // Confirm that one of the helpers used here emits this load.
+
+            // Load custom user dimension
+            KernelLauncherBuilder.EmitLoadKernelConfig(
+                entryPoint,
+                emitter,
+                Kernel.KernelParamDimensionIdx,
+                MaxGridSize,
+                MaxGroupSize);
+
+            // Load dimensions
+            KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
+                entryPoint,
+                emitter,
+                Kernel.KernelParamDimensionIdx,
+                MaxGridSize,
+                MaxGroupSize,
+                customGroupSize);
+
+            // Load the kernel delegate
+            emitter.Emit(LocalOperation.Load, velocityKernel);
+            emitter.EmitCall(VelocityKernel.GetKernelExecutionDelegate);
+
+            // Load the parameters object
+            emitter.Emit(LocalOperation.Load, parametersInstance);
+
+            // Launch kernel execution
+            emitter.EmitCall(RunMethodInfo);
+
+            // End of launch method
+            emitter.Emit(OpCodes.Ret);
+            emitter.Finish();
+
+            return launcher.Finish();
+        }
+
+        #endregion
+
+        /// <summary>
+        /// Creates a Velocity specific extension using the given provider.
+        /// </summary>
+        /// <param name="provider">The extension provider.</param>
+        /// <returns>The extension created by the provider.</returns>
+        public override TExtension CreateExtension<
+            TExtension,
+            TExtensionProvider>(TExtensionProvider provider)
+        {
+            return provider.CreateVelocityExtension(this);
+        }
+
+        /// <summary>
+        /// Allocates a new <see cref="VelocityMemoryBuffer"/> on this accelerator.
+        /// </summary>
+        /// <param name="length">The buffer length.</param>
+        /// <param name="elementSize">The size of a single element.</param>
+        /// <returns>The allocated memory buffer.</returns>
+        protected override MemoryBuffer AllocateRawInternal(
+            long length,
+            int elementSize)
+        {
+            var buffer = new VelocityMemoryBuffer(this, length, elementSize);
+            return buffer;
+        }
+
+        /// <summary>
+        /// Loads the given compiled kernel by generating a launcher method for it.
+        /// </summary>
+        /// <param name="kernel">The kernel to load.</param>
+        /// <param name="customGroupSize">The custom group size.</param>
+        /// <returns>The loaded kernel</returns>
+        /// <exception cref="ArgumentNullException">
+        /// Thrown if the given kernel is null.
+        /// </exception>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the given kernel is not a Velocity kernel.
+        /// </exception>
+        private Kernel LoadKernel(CompiledKernel kernel, int customGroupSize)
+        {
+            switch (kernel)
+            {
+                case null:
+                    throw new ArgumentNullException(nameof(kernel));
+                case VelocityCompiledKernel velocityKernel:
+                    // The launcher is generated before the kernel object is created
+                    var launcher = GenerateKernelLauncherMethod(
+                        velocityKernel,
+                        customGroupSize);
+                    return new VelocityKernel(this, velocityKernel, launcher);
+                default:
+                    throw new NotSupportedException(
+                        RuntimeErrorMessages.NotSupportedKernel);
+            }
+        }
+
+        /// <summary>
+        /// Loads a default kernel without a custom group size.
+        /// </summary>
+        protected override Kernel LoadKernelInternal(CompiledKernel kernel)
+        {
+            const int NoCustomGroupSize = 0;
+            return LoadKernel(kernel, NoCustomGroupSize);
+        }
+
+        /// <summary>
+        /// Loads an implicitly grouped kernel using the given custom group size.
+        /// </summary>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// Thrown if the custom group size is negative.
+        /// </exception>
+        protected override Kernel LoadImplicitlyGroupedKernelInternal(
+            CompiledKernel kernel,
+            int customGroupSize,
+            out KernelInfo kernelInfo)
+        {
+            if (customGroupSize >= 0)
+            {
+                // Derive the kernel information from the compiled kernel first
+                kernelInfo = KernelInfo.CreateFrom(kernel.Info, customGroupSize, null);
+                return LoadKernel(kernel, customGroupSize);
+            }
+
+            throw new ArgumentOutOfRangeException(nameof(customGroupSize));
+        }
+
+        /// <summary>
+        /// Loads an auto grouped kernel that uses the warp size as its group size.
+        /// </summary>
+        protected override Kernel LoadAutoGroupedKernelInternal(
+            CompiledKernel kernel,
+            out KernelInfo kernelInfo)
+        {
+            int groupSize = WarpSize;
+            Kernel loadedKernel = LoadKernel(kernel, groupSize);
+
+            // The kernel info is only created once loading has succeeded
+            kernelInfo = new KernelInfo(groupSize, NumThreads / groupSize);
+            return loadedKernel;
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="VelocityStream"/> that is associated with this
+        /// accelerator.
+        /// </summary>
+        protected override AcceleratorStream CreateStreamInternal() =>
+            new VelocityStream(this);
+
+        /// <summary>
+        /// Does not perform any operation.
+        /// </summary>
+        protected override void SynchronizeInternal() { }
+
+        /// <summary>
+        /// Does not perform any operation.
+        /// </summary>
+        protected override void OnBind() { }
+
+        /// <summary>
+        /// Does not perform any operation.
+        /// </summary>
+        protected override void OnUnbind() { }
+
+        #region Peer Access
+
+        /// <summary>
+        /// Returns true if the other accelerator is a CPU or a Velocity accelerator.
+        /// </summary>
+        /// <param name="otherAccelerator">The other accelerator.</param>
+        protected override bool CanAccessPeerInternal(Accelerator otherAccelerator)
+        {
+            switch (otherAccelerator)
+            {
+                case CPUAccelerator _:
+                case VelocityAccelerator _:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        /// <summary>
+        /// Verifies that peer access to the other accelerator is possible.
+        /// </summary>
+        /// <param name="otherAccelerator">The other accelerator.</param>
+        /// <exception cref="InvalidOperationException">
+        /// Thrown if the other accelerator cannot be accessed.
+        /// </exception>
+        protected override void EnablePeerAccessInternal(Accelerator otherAccelerator)
+        {
+            if (CanAccessPeerInternal(otherAccelerator))
+                return;
+
+            throw new InvalidOperationException(
+                RuntimeErrorMessages.CannotEnablePeerAccessToOtherAccelerator);
+        }
+
+        /// <summary>
+        /// Asserts (in debug builds) that the other accelerator can be accessed.
+        /// </summary>
+        /// <param name="otherAccelerator">The other accelerator.</param>
+        protected override void DisablePeerAccessInternal(
+            Accelerator otherAccelerator)
+        {
+            Debug.Assert(
+                CanAccessPeerInternal(otherAccelerator),
+                "Invalid EnablePeerAccess method");
+        }
+
+        #endregion
+
+        #region Occupancy
+
+        /// <summary>
+        /// Estimates the number of active groups for the given Velocity kernel.
+        /// </summary>
+        /// <param name="kernel">The kernel (must be a Velocity kernel).</param>
+        /// <param name="groupSize">The desired group size.</param>
+        /// <param name="dynamicSharedMemorySizeInBytes">Unused.</param>
+        /// <returns>
+        /// Zero if the group size exceeds the maximum group size, the number of
+        /// multiprocessors otherwise.
+        /// </returns>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the kernel is not a Velocity kernel.
+        /// </exception>
+        protected override int EstimateMaxActiveGroupsPerMultiprocessorInternal(
+            Kernel kernel,
+            int groupSize,
+            int dynamicSharedMemorySizeInBytes)
+        {
+            if (!(kernel is VelocityKernel))
+                throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+
+            if (groupSize > MaxGroupSize.Size)
+                return 0;
+            return NumMultiprocessors;
+        }
+
+        /// <summary>
+        /// Estimates a group size for the given Velocity kernel. The shared memory
+        /// size function is not evaluated.
+        /// </summary>
+        /// <param name="kernel">The kernel (must be a Velocity kernel).</param>
+        /// <param name="computeSharedMemorySize">Unused.</param>
+        /// <param name="maxGroupSize">The maximum group size requested.</param>
+        /// <param name="minGridSize">Receives the number of threads.</param>
+        /// <returns>
+        /// The smaller value of the requested and the supported maximum group size.
+        /// </returns>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the kernel is not a Velocity kernel.
+        /// </exception>
+        protected override int EstimateGroupSizeInternal(
+            Kernel kernel,
+            Func<int, int> computeSharedMemorySize,
+            int maxGroupSize,
+            out int minGridSize)
+        {
+            if (kernel is VelocityKernel)
+            {
+                minGridSize = NumThreads;
+                return Math.Min(maxGroupSize, MaxGroupSize.Size);
+            }
+
+            throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+        }
+
+        /// <summary>
+        /// Estimates a group size for the given Velocity kernel. The dynamic shared
+        /// memory size is not taken into account.
+        /// </summary>
+        /// <param name="kernel">The kernel (must be a Velocity kernel).</param>
+        /// <param name="dynamicSharedMemorySizeInBytes">Unused.</param>
+        /// <param name="maxGroupSize">Unused.</param>
+        /// <param name="minGridSize">Receives the number of threads.</param>
+        /// <returns>Always one.</returns>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the kernel is not a Velocity kernel.
+        /// </exception>
+        protected override int EstimateGroupSizeInternal(
+            Kernel kernel,
+            int dynamicSharedMemorySizeInBytes,
+            int maxGroupSize,
+            out int minGridSize)
+        {
+            if (kernel is VelocityKernel)
+            {
+                minGridSize = NumThreads;
+                return 1;
+            }
+
+            throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+        }
+
+        #endregion
+
+        #region Page Lock Scope
+
+        /// <summary>
+        /// Writes a trace message and returns a null page lock scope for the given
+        /// pinned memory.
+        /// </summary>
+        /// <param name="pinned">The pinned memory pointer.</param>
+        /// <param name="numElements">The number of elements.</param>
+        /// <returns>A null page lock scope.</returns>
+        protected override PageLockScope<T> CreatePageLockFromPinnedInternal<T>(
+            IntPtr pinned,
+            long numElements)
+        {
+            Trace.WriteLine(RuntimeErrorMessages.NotSupportedPageLock);
+
+            var nullScope = new NullPageLockScope<T>(this, pinned, numElements);
+            return nullScope;
+        }
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Dispose all managed resources allocated by this Velocity accelerator
+        /// instance.
+        /// </summary>
+        /// <param name="disposing">
+        /// True to dispose the managed resources; nothing happens otherwise.
+        /// </param>
+        protected override void DisposeAccelerator_SyncRoot(bool disposing)
+        {
+            if (!disposing)
+                return;
+
+            // Wait for a kernel launch that is still running (see Run)
+            taskConcurrencyLimit.Wait();
+
+            // Dispose all multiprocessors
+            foreach (var multiprocessor in multiprocessors)
+                multiprocessor.Dispose();
+
+            // Dispose all synchronization primitives; the barrier is released after
+            // all multiprocessors have been disposed
+            multiprocessorBarrier.Dispose();
+            taskConcurrencyLimit.Dispose();
+        }
+
+        #endregion
+
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs
new file mode 100644
index 000000000..e6a44ac7e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs
@@ -0,0 +1,74 @@
+﻿// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityContextExtensions.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends;
+using ILGPU.Backends.PTX;
+using ILGPU.Resources;
+using System;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// Velocity specific context extensions.
+    /// </summary>
+    public static class VelocityContextExtensions
+    {
+        #region Builder
+
+        /// <summary>
+        /// Enables the Velocity device and registers it with the given builder.
+        /// </summary>
+        /// <param name="builder">The builder instance.</param>
+        /// <param name="maxSharedMemoryPerGroup">
+        /// The maximum number of bytes of shared memory per group.
+        /// </param>
+        /// <returns>The updated builder instance.</returns>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the current runtime platform is not a 64-bit platform.
+        /// </exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// Thrown if <paramref name="maxSharedMemoryPerGroup"/> is smaller than
+        /// <see cref="VelocityDevice.MinSharedMemoryPerGroup"/>.
+        /// </exception>
+        public static Context.Builder Velocity(
+            this Context.Builder builder,
+            int maxSharedMemoryPerGroup = VelocityDevice.MinSharedMemoryPerGroup)
+        {
+            if (!Backend.RuntimePlatform.Is64Bit())
+            {
+                throw new NotSupportedException(string.Format(
+                    RuntimeErrorMessages.VelocityPlatform64,
+                    Backend.RuntimePlatform));
+            }
+
+            // Forward the requested amount of shared memory to the device instead of
+            // silently falling back to the default amount
+            builder.DeviceRegistry.Register(new VelocityDevice(maxSharedMemoryPerGroup));
+            return builder;
+        }
+
+        #endregion
+
+        #region Context
+
+        /// <summary>
+        /// Gets a registered Velocity device.
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <returns>The registered Velocity device.</returns>
+        public static VelocityDevice GetVelocityDevice(this Context context) =>
+            context.GetDevice<VelocityDevice>(0);
+
+        /// <summary>
+        /// Creates a new Velocity accelerator.
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <returns>The created Velocity accelerator.</returns>
+        public static VelocityAccelerator CreateVelocityAccelerator(
+            this Context context) =>
+            context.GetVelocityDevice().CreateVelocityAccelerator(context);
+
+        #endregion
+    }
+
+}
+
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
new file mode 100644
index 000000000..7456444e2
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
@@ -0,0 +1,140 @@
+using System;
+using System.Numerics;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// Represents a software-emulated velocity device for high-performance execution of
+    /// tasks on the CPU using vectorization.
+    /// </summary>
+    public sealed class VelocityDevice : Device
+    {
+        #region Constants
+
+        /// <summary>
+        /// The minimum amount of shared memory per group in bytes (1024k). This
+        /// value is also used as the default amount of shared memory.
+        /// </summary>
+        public const int MinSharedMemoryPerGroup = 1 << 20;
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Creates a new velocity device with the default amount of shared memory per
+        /// group (refer to <see cref="MinSharedMemoryPerGroup"/> for more
+        /// information about the default size).
+        /// </summary>
+        public VelocityDevice()
+            : this(MinSharedMemoryPerGroup)
+        { }
+
+        /// <summary>
+        /// Creates a new velocity device using the given amount of shared memory (min
+        /// amount is <see cref="MinSharedMemoryPerGroup"/> per group).
+        /// </summary>
+        /// <param name="maxSharedMemoryPerGroup">
+        /// The maximum amount of shared memory per group in bytes.
+        /// </param>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// Thrown if the given amount of shared memory is smaller than
+        /// <see cref="MinSharedMemoryPerGroup"/>.
+        /// </exception>
+        public VelocityDevice(int maxSharedMemoryPerGroup)
+        {
+            if (maxSharedMemoryPerGroup < MinSharedMemoryPerGroup)
+                throw new ArgumentOutOfRangeException(nameof(maxSharedMemoryPerGroup));
+
+            Name = nameof(VelocityAccelerator);
+            WarpSize = VelocityWarp32.RawVectorLength;
+            MinWarpSize = VelocityWarp64.RawVectorLength;
+            // A group and a multiprocessor hold exactly one warp of threads
+            MaxNumThreadsPerGroup = MaxNumThreadsPerMultiprocessor = WarpSize;
+            // One multiprocessor per logical processor of the current machine
+            NumMultiprocessors = Environment.ProcessorCount;
+            MaxGroupSize = new Index3D(
+                MaxNumThreadsPerGroup,
+                1,
+                1);
+
+            MemorySize = long.MaxValue;
+            MaxGridSize = new Index3D(int.MaxValue, ushort.MaxValue, ushort.MaxValue);
+            MaxSharedMemoryPerGroup = maxSharedMemoryPerGroup;
+            MaxConstantMemory = int.MaxValue;
+            // NOTE(review): presumably MaxNumThreads is derived from the values
+            // assigned above, which would make the order of assignments relevant
+            NumThreads = MaxNumThreads;
+
+            // Get the endian type from the global BitConverter class
+            IsLittleEndian = BitConverter.IsLittleEndian;
+
+            // Allocate a sufficient amount of local memory per thread equal to
+            // the maximum number of shared memory per group in bytes
+            MaxLocalMemoryPerThread = maxSharedMemoryPerGroup;
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the minimum warp size of this device.
+        /// </summary>
+        public int MinWarpSize { get; }
+
+        /// <summary>
+        /// Returns the number of threads.
+        /// </summary>
+        public int NumThreads { get; }
+
+        /// <summary>
+        /// Returns true if this device operates in little endian mode.
+        /// </summary>
+        public bool IsLittleEndian { get; }
+
+        /// <summary>
+        /// Returns the maximum local memory per thread in bytes.
+        /// </summary>
+        public int MaxLocalMemoryPerThread { get; }
+
+        #endregion
+
+        #region Methods
+
+        /// <inheritdoc/>
+        public override Accelerator CreateAccelerator(Context context) =>
+            CreateVelocityAccelerator(context);
+
+        /// <summary>
+        /// Creates a new Velocity accelerator using the default thread priority
+        /// (<see cref="ThreadPriority.Normal"/>).
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <returns>The created Velocity accelerator.</returns>
+        public VelocityAccelerator CreateVelocityAccelerator(
+            Context context) =>
+            CreateVelocityAccelerator(context, ThreadPriority.Normal);
+
+        /// <summary>
+        /// Creates a new Velocity accelerator using the given thread priority.
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <param name="threadPriority">
+        /// The thread priority of the execution threads.
+        /// </param>
+        /// <returns>The created Velocity accelerator.</returns>
+        public VelocityAccelerator CreateVelocityAccelerator(
+            Context context,
+            ThreadPriority threadPriority) =>
+            new VelocityAccelerator(context, this, threadPriority);
+
+        #endregion
+
+        #region Object
+
+        /// <summary>
+        /// Returns true if the given object is a Velocity device with the same
+        /// amount of shared memory per group that is also equal to this device
+        /// according to the base implementation.
+        /// </summary>
+        public override bool Equals(object obj) =>
+            obj is VelocityDevice device &&
+            device.MaxSharedMemoryPerGroup == MaxSharedMemoryPerGroup &&
+            base.Equals(obj);
+
+        /// <summary>
+        /// Combines the base hash code with the amount of shared memory per group.
+        /// </summary>
+        public override int GetHashCode() => base.GetHashCode() ^ MaxSharedMemoryPerGroup;
+
+        #endregion
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs
new file mode 100644
index 000000000..41a6c2d0e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs
@@ -0,0 +1,80 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityKernel.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends.Velocity;
+using System.Reflection;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// Represents a single Velocity kernel.
+    /// </summary>
+    public sealed class VelocityKernel : Kernel
+    {
+        #region Static
+
+        /// <summary>
+        /// Represents the <see cref="KernelEntryPoint"/> property getter. It is
+        /// resolved via reflection, including non-public accessors.
+        /// </summary>
+        internal static readonly MethodInfo GetKernelExecutionDelegate =
+            typeof(VelocityKernel).GetProperty(
+                nameof(KernelEntryPoint),
+                BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance)
+            .GetGetMethod(true);
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Loads a compiled kernel into the given Velocity accelerator and creates
+        /// the kernel entry point from the compiled kernel.
+        /// </summary>
+        /// <param name="accelerator">The associated accelerator.</param>
+        /// <param name="kernel">The source kernel.</param>
+        /// <param name="launcher">The launcher method for the given kernel.</param>
+        internal VelocityKernel(
+            VelocityAccelerator accelerator,
+            VelocityCompiledKernel kernel,
+            MethodInfo launcher)
+            : base(accelerator, kernel, launcher)
+        {
+            KernelEntryPoint = kernel.CreateKernelEntryPoint();
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the associated Velocity accelerator (null if the parent
+        /// accelerator is not a Velocity accelerator).
+        /// </summary>
+        public VelocityAccelerator VelocityAccelerator =>
+            Accelerator as VelocityAccelerator;
+
+        /// <summary>
+        /// The main kernel entry point function to be called from each velocity
+        /// multiprocessor during execution.
+        /// </summary>
+        internal VelocityKernelEntryPoint KernelEntryPoint { get; }
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Does not perform any operation.
+        /// </summary>
+        protected override void DisposeAcceleratorObject(bool disposing) { }
+
+        #endregion
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs
new file mode 100644
index 000000000..e9c640a0e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs
@@ -0,0 +1,151 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                        Copyright (c) 2017-2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityMemoryBuffer.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Runtime.CPU;
+using ILGPU.Util;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// A memory buffer that lives in CPU space.
+    /// </summary>
+    public class VelocityMemoryBuffer : MemoryBuffer
+    {
+        #region Instance
+
+        /// <summary>
+        /// Initializes this array view source on the CPU by allocating a native
+        /// memory block that is large enough to hold all elements plus padding.
+        /// </summary>
+        /// <param name="accelerator">The parent accelerator (if any).</param>
+        /// <param name="length">The length of this source.</param>
+        /// <param name="elementSize">The element size.</param>
+        internal VelocityMemoryBuffer(
+            Accelerator accelerator,
+            long length,
+            int elementSize)
+            : base(accelerator, length, elementSize)
+        {
+            // The native allocation has to cover all elements in bytes and not just
+            // the number of elements
+            long lengthInBytes = length * elementSize;
+
+            // Ensure that all element accesses will be properly aligned
+            // NOTE(review): the offset is derived from the buffer size and not from
+            // the allocated address - confirm that this yields the intended alignment
+            int alignmentOffset = Interop.ComputeAlignmentOffset(
+                lengthInBytes,
+                elementSize * accelerator.WarpSize);
+            // Pad the length to ensure a valid buffer size
+            long paddedLength = lengthInBytes + alignmentOffset;
+
+            // Allocate resources and assign pointers
+            NativeBufferPtr = Marshal.AllocHGlobal(new IntPtr(paddedLength));
+            NativePtr = NativeBufferPtr + alignmentOffset;
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the natively allocated underlying buffer pointer which may not be
+        /// aligned in all cases.
+        /// </summary>
+        public IntPtr NativeBufferPtr { get; private set; }
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// Sets all bytes of the target view to the given value using the CPU
+        /// implementation.
+        /// </summary>
+        protected internal override void MemSet(
+            AcceleratorStream stream,
+            byte value,
+            in ArrayView<byte> targetView) =>
+            CPUMemoryBuffer.CPUMemSet(
+                targetView.LoadEffectiveAddressAsPtr(),
+                value,
+                0L,
+                targetView.LengthInBytes);
+
+        /// <summary>
+        /// Copies from the source view into the target view using the CPU
+        /// implementation.
+        /// </summary>
+        protected internal override void CopyFrom(
+            AcceleratorStream stream,
+            in ArrayView<byte> sourceView,
+            in ArrayView<byte> targetView) =>
+            CPUMemoryBuffer.CPUCopyFrom(stream, sourceView, targetView);
+
+        /// <summary>
+        /// Copies the source view to the target view using the CPU implementation.
+        /// </summary>
+        protected internal override void CopyTo(
+            AcceleratorStream stream,
+            in ArrayView<byte> sourceView,
+            in ArrayView<byte> targetView) =>
+            CPUMemoryBuffer.CPUCopyTo(stream, sourceView, targetView);
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Disposes the underlying memory buffer and resets all native pointers.
+        /// </summary>
+        protected override void DisposeAcceleratorObject(bool disposing)
+        {
+            Marshal.FreeHGlobal(NativeBufferPtr);
+            NativeBufferPtr = IntPtr.Zero;
+            NativePtr = IntPtr.Zero;
+        }
+
+        #endregion
+
+    }
+
+    /// <summary>
+    /// A Velocity memory buffer with an element size of one that hands out chunks
+    /// of its memory via <see cref="Allocate{T}(int)"/>.
+    /// </summary>
+    sealed class VelocityMemoryBufferPool : VelocityMemoryBuffer
+    {
+        #region Instance
+
+        // The current offset into the pool; set back to zero by Reset
+        private volatile int sharedMemoryOffset;
+        // The warp size of the parent accelerator
+        private readonly int warpSize;
+
+        /// <summary>
+        /// Creates a new pool of the given size.
+        /// </summary>
+        /// <param name="accelerator">The parent accelerator.</param>
+        /// <param name="size">The size of the pool.</param>
+        public VelocityMemoryBufferPool(
+            VelocityAccelerator accelerator,
+            int size)
+            : base(accelerator, size, 1)
+        {
+            warpSize = accelerator.WarpSize;
+        }
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// Resets the current pool offset to zero.
+        /// </summary>
+        public void Reset() =>
+            Interlocked.Exchange(ref sharedMemoryOffset, 0);
+
+        /// <summary>
+        /// Gets a chunk of memory of a certain type.
+        /// </summary>
+        /// <param name="length">The number of elements.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A view pointing to the right chunk of shared memory.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ArrayView<T> Allocate<T>(int length)
+            where T : unmanaged
+        {
+            // The requested length is multiplied by the warp size
+            int totalElementSize = length * Interop.SizeOf<T>() * warpSize;
+            // NOTE(review): the offset is read here and updated separately below,
+            // so concurrent callers may compute the same value - confirm that
+            // callers are serialized.
+            int alignment = Interop.ComputeAlignmentOffset(
+                sharedMemoryOffset,
+                totalElementSize);
+            // NOTE(review): the offset is advanced by the alignment value only and
+            // not by the size of the returned chunk - confirm that subsequent
+            // allocations cannot overlap.
+            int newOffset = Interlocked.Add(ref sharedMemoryOffset, alignment);
+            // NOTE(review): verify whether the view constructor expects this offset
+            // in bytes or in elements of T.
+            return new ArrayView<T>(this, newOffset, length * warpSize);
+        }
+
+        #endregion
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs b/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs
new file mode 100644
index 000000000..98d40de2e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs
@@ -0,0 +1,402 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityMultiprocessor.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Util;
+using System;
+using System.Collections.Immutable;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// Represents a single velocity kernel processing delegate.
+    /// </summary>
+    /// <remarks>
+    /// A <see cref="VelocityMultiprocessor"/> invokes this delegate on its processing
+    /// thread using the index range and the parameters it has been dispatched with.
+    /// </remarks>
+    /// <param name="globalStartIndex">The start index within the thread grid.</param>
+    /// <param name="globalEndIndex">The end index within the thread grid.</param>
+    /// <param name="parameters">The current parameters.</param>
+    delegate void VelocityKernelEntryPoint(
+        int globalStartIndex,
+        int globalEndIndex,
+        VelocityParameters parameters);
+
+    /// <summary>
+    /// A single velocity multiprocessor consisting of a single processing thread and
+    /// a runtime context.
+    /// </summary>
+    /// <remarks>
+    /// The processing thread is started by the constructor and blocks on an internal
+    /// semaphore until <see cref="Run"/> schedules work or the instance is disposed.
+    /// </remarks>
+    sealed class VelocityMultiprocessor : DisposeBase
+    {
+        #region Static
+
+        /// <summary>
+        /// All kernel handler types required to launch a kernel delegate on this MP.
+        /// The types match the parameters of <see cref="VelocityKernelEntryPoint"/>.
+        /// </summary>
+        public static readonly ImmutableArray<Type> KernelHandlerTypes =
+            ImmutableArray.Create(
+                typeof(int),
+                typeof(int),
+                typeof(VelocityParameters));
+
+        /// <summary>
+        /// Stores the current velocity multiprocessor. This field is assigned by the
+        /// processing thread of each multiprocessor when it begins to run.
+        /// </summary>
+        [ThreadStatic]
+        private static VelocityMultiprocessor current;
+
+        /// <summary>
+        /// Returns the parent velocity multiprocessor for the current thread.
+        /// </summary>
+        /// <returns>The parent multiprocessor for the current thread.</returns>
+        public static VelocityMultiprocessor GetCurrent() => current;
+
+        /// <summary>
+        /// Allocates a chunk of shared memory.
+        /// </summary>
+        /// <param name="length">The number of elements.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A velocity warp made of shared-memory pointers.</returns>
+        public static VelocityWarp64 GetSharedMemory<T>(int length)
+            where T : unmanaged
+        {
+            var sharedMemoryView = GetCurrent().GetSharedMemoryFromPool<T>(length);
+            long intPtr = sharedMemoryView.LoadEffectiveAddressAsPtr().ToInt64();
+            var addresses = VelocityWarp64.GetConstI(intPtr);
+            return addresses.AddU(VelocityWarp64.LaneIndexVector);
+        }
+
+        /// <summary>
+        /// Allocates a chunk of local memory.
+        /// </summary>
+        /// <param name="length">The number of elements.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A velocity warp made of local-memory pointers.</returns>
+        public static VelocityWarp64 GetLocalMemory<T>(int length)
+            where T : unmanaged
+        {
+            var localMemoryView = GetCurrent().GetLocalMemoryFromPool<T>(length);
+            long intPtr = localMemoryView.LoadEffectiveAddressAsPtr().ToInt64();
+            var addresses = VelocityWarp64.GetConstI(intPtr);
+            return addresses.AddU(VelocityWarp64.LaneIndexVector);
+        }
+
+        /// <summary>
+        /// Returns the current grid indices for all warp lanes associated with this
+        /// multiprocessor.
+        /// </summary>
+        /// <returns>A velocity warp made of grid indices.</returns>
+        public static VelocityWarp32 GetCurrentGridIdx() => GetCurrent().GridIdx;
+
+        /// <summary>
+        /// Sets the current grid index by advancing to the given offset.
+        /// </summary>
+        /// <param name="offset">The offset to add to the base grid index.</param>
+        public static void SetCurrentGridIdx(int offset) =>
+            GetCurrent().ResetGridIndex(offset);
+
+        /// <summary>
+        /// Returns the current grid dimension for all warp lanes associated with this
+        /// multiprocessor.
+        /// </summary>
+        /// <returns>A velocity warp made of the current grid dimension.</returns>
+        public static VelocityWarp32 GetCurrentGridDim() =>
+            VelocityWarp32.GetConstI(GetCurrent().GridDim);
+
+        /// <summary>
+        /// Returns the current user defined grid dimension which may not be a multiple
+        /// of the group size used on this multiprocessor.
+        /// </summary>
+        /// <returns>A velocity warp made of the current user grid dimension.</returns>
+        public static VelocityWarp32 GetCurrentUserGridDim() =>
+            VelocityWarp32.GetConstI(GetCurrent().UserGridDim);
+
+        /// <summary>
+        /// Returns the current group dimension for all warp lanes associated with this
+        /// multiprocessor.
+        /// </summary>
+        /// <returns>A velocity warp made of the current group dimension.</returns>
+        public static VelocityWarp32 GetCurrentGroupDim() =>
+            VelocityWarp32.GetConstI(GetCurrent().GroupDim);
+
+        // Note that reflection method names do not carry a generic arity suffix (in
+        // contrast to generic type names). Generic method definitions are therefore
+        // resolved via their plain names below.
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetSharedMemory{T}"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetSharedMemoryMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetSharedMemory),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetLocalMemory{T}"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetLocalMemoryMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetLocalMemory),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetCurrentGridIdx"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetCurrentGridIdxMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetCurrentGridIdx),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="SetCurrentGridIdx"/> method.
+        /// </summary>
+        public static readonly MethodInfo SetCurrentGridIdxMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(SetCurrentGridIdx),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetCurrentGridDim"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetCurrentGridDimMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetCurrentGridDim),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetCurrentUserGridDim"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetCurrentUserGridDimMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetCurrentUserGridDim),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetCurrentGroupDim"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetCurrentGroupDimMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetCurrentGroupDim),
+                BindingFlags.Public | BindingFlags.Static);
+
+        #endregion
+
+        #region Events
+
+        /// <summary>
+        /// Will be raised once a chunk of a scheduled thread grid has been completed.
+        /// The processing thread invokes this delegate without a null check; it needs
+        /// to be assigned before the first call to <see cref="Run"/>.
+        /// </summary>
+        public Action<VelocityMultiprocessor> ProcessingCompleted;
+
+        #endregion
+
+        #region Instance
+
+        // Thread data
+        private readonly Thread runtimeThread;
+        private readonly SemaphoreSlim startProcessingSema;
+
+        // Context data
+        private readonly VelocityMemoryBufferPool sharedMemoryPool;
+        private readonly VelocityMemoryBufferPool localMemoryPool;
+
+        // Runtime data
+        private volatile VelocityKernelEntryPoint kernelHandler;
+        private volatile int startIndexRange;
+        private volatile int endIndexRange;
+        private volatile VelocityParameters kernelParameters;
+        private volatile bool running = true;
+
+        /// <summary>
+        /// Initializes a new velocity multiprocessor and starts its processing thread.
+        /// </summary>
+        /// <param name="accelerator">The parent velocity accelerator.</param>
+        /// <param name="processorIndex">The current processor index.</param>
+        internal VelocityMultiprocessor(
+            VelocityAccelerator accelerator,
+            int processorIndex)
+        {
+            runtimeThread = new Thread(DoWork)
+            {
+                Priority = accelerator.ThreadPriority,
+                IsBackground = true,
+                Name = $"ILGPU_{accelerator.InstanceId}_Velocity_{processorIndex}"
+            };
+            startProcessingSema = new SemaphoreSlim(0);
+            sharedMemoryPool = new VelocityMemoryBufferPool(
+                accelerator,
+                accelerator.MaxSharedMemoryPerGroup);
+            localMemoryPool = new VelocityMemoryBufferPool(
+                accelerator,
+                accelerator.MaxLocalMemoryPerThread);
+            WarpSize = accelerator.WarpSize;
+            ProcessorIndex = processorIndex;
+
+            // Start the processing thread once all fields have been initialized. The
+            // thread immediately blocks on the semaphore until work is scheduled.
+            // Without this call no scheduled kernel would ever be processed and the
+            // Join operation during disposal would fail on an unstarted thread.
+            runtimeThread.Start();
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the current warp size.
+        /// </summary>
+        public int WarpSize { get; }
+
+        /// <summary>
+        /// Returns the multiprocessor index.
+        /// </summary>
+        public int ProcessorIndex { get; }
+
+        /// <summary>
+        /// Returns the precomputed grid indices for all lanes in the current
+        /// multiprocessor.
+        /// </summary>
+        public VelocityWarp32 GridIdx { get; private set; }
+
+        /// <summary>
+        /// Returns the current grid dimension.
+        /// </summary>
+        public int GridDim { get; private set; }
+
+        /// <summary>
+        /// Returns the current user grid dimension.
+        /// </summary>
+        public int UserGridDim { get; private set; }
+
+        /// <summary>
+        /// Returns the current group dimension.
+        /// </summary>
+        public int GroupDim { get; private set; }
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// Resets the current grid index.
+        /// </summary>
+        /// <param name="offset">The offset to add to the base grid index.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ResetGridIndex(int offset)
+        {
+            // Compute the initial grid indices based on our processor index and the
+            // warp size
+            int baseIndex = WarpSize * ProcessorIndex + offset;
+            GridIdx =
+                VelocityWarp32.LaneIndexVector.AddI(
+                VelocityWarp32.GetConstI(baseIndex));
+        }
+
+        /// <summary>
+        /// Gets a chunk of shared memory of a certain type.
+        /// </summary>
+        /// <param name="length">The number of elements.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A view pointing to the right chunk of shared memory.</returns>
+        public ArrayView<T> GetSharedMemoryFromPool<T>(int length)
+            where T : unmanaged =>
+            sharedMemoryPool.Allocate<T>(length);
+
+        /// <summary>
+        /// Gets a chunk of local memory of a certain type.
+        /// </summary>
+        /// <param name="length">The number of elements.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A view pointing to the right chunk of local memory.</returns>
+        public ArrayView<T> GetLocalMemoryFromPool<T>(int length)
+            where T : unmanaged =>
+            localMemoryPool.Allocate<T>(length);
+
+        /// <summary>
+        /// Dispatches a new kernel execution.
+        /// </summary>
+        /// <param name="handler">The kernel handler delegate.</param>
+        /// <param name="startIndex">The start interval index.</param>
+        /// <param name="endIndex">The end interval index.</param>
+        /// <param name="gridDimension">The current grid dimension.</param>
+        /// <param name="userGridDimension">The current user grid dimension.</param>
+        /// <param name="groupDimension">The current group dimension.</param>
+        /// <param name="parameters">All kernel parameters.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void Run(
+            VelocityKernelEntryPoint handler,
+            int startIndex,
+            int endIndex,
+            int gridDimension,
+            int userGridDimension,
+            int groupDimension,
+            VelocityParameters parameters)
+        {
+            GridDim = gridDimension;
+            UserGridDim = userGridDimension;
+            GroupDim = groupDimension;
+
+            // Note that we do not have to invoke
+            // ResetGridIndex(offset: 0);
+            // here, as this method will be automatically invoked by each Velocity kernel
+
+            // Schedule this operation
+            kernelHandler = handler;
+            startIndexRange = startIndex;
+            endIndexRange = endIndex;
+            kernelParameters = parameters;
+            sharedMemoryPool.Reset();
+            localMemoryPool.Reset();
+
+            // Ensure visibility of all changes to other threads
+            Thread.MemoryBarrier();
+
+            // Launch the processing task
+            startProcessingSema.Release();
+        }
+
+        /// <summary>
+        /// The main processing thread of this multiprocessor.
+        /// </summary>
+        private void DoWork()
+        {
+            // Assign the current multiprocessor to this instance
+            current = this;
+
+            // Process all tasks
+            while (true)
+            {
+                // Wait for the next task to arrive
+                startProcessingSema.Wait();
+
+                // Break the loop if we are shutting down
+                if (!running)
+                    break;
+
+                // Launch the actual kernel method
+                kernelHandler(startIndexRange, endIndexRange, kernelParameters);
+
+                // Signal the main thread that the processing has been completed. Note
+                // that we avoid any null checks at this point
+                ProcessingCompleted(this);
+            }
+        }
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Waits for the processing thread to shutdown and disposes all internal thread
+        /// objects.
+        /// </summary>
+        protected override void Dispose(bool disposing)
+        {
+            if (disposing)
+            {
+                // Wake up the processing thread to let it observe the shutdown flag
+                running = false;
+                startProcessingSema.Release();
+                runtimeThread.Join();
+
+                startProcessingSema.Dispose();
+                sharedMemoryPool.Dispose();
+                localMemoryPool.Dispose();
+            }
+            base.Dispose(disposing);
+        }
+
+        #endregion
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs
new file mode 100644
index 000000000..bd38b034c
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs
@@ -0,0 +1,27 @@
+﻿// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityParameters.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// The abstract root type of all velocity parameter containers.
+    /// </summary>
+    abstract class VelocityParameters
+    {
+        /// <summary>
+        /// Constructs a new parameters instance. No state is initialized at the moment.
+        /// </summary>
+        public VelocityParameters() { }
+    }
+}
+
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityStream.cs b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs
new file mode 100644
index 000000000..9adfe3cc9
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs
@@ -0,0 +1,74 @@
+﻿// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityStream.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Runtime.CPU;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// The accelerator stream type used by the velocity runtime.
+    /// </summary>
+    sealed class VelocityStream : AcceleratorStream
+    {
+        #region Static
+
+        /// <summary>
+        /// The shared default stream instance.
+        /// </summary>
+        internal static readonly VelocityStream Default = new VelocityStream();
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Constructs a new Velocity stream using the parameterless base constructor.
+        /// </summary>
+        private VelocityStream()
+            : base()
+        { }
+
+        /// <summary>
+        /// Constructs a new Velocity stream that belongs to the given accelerator.
+        /// </summary>
+        /// <param name="accelerator">The associated accelerator.</param>
+        internal VelocityStream(Accelerator accelerator)
+            : base(accelerator)
+        { }
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// This implementation is empty and returns immediately.
+        /// </summary>
+        public override void Synchronize()
+        {
+        }
+
+        /// <inheritdoc/>
+        protected unsafe override ProfilingMarker AddProfilingMarkerInternal()
+        {
+            // Keep the accelerator bound while the marker is being created
+            using (var scopedBinding = Accelerator.BindScoped())
+            {
+                return new CPUProfilingMarker(Accelerator);
+            }
+        }
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// This implementation is empty since there is nothing to dispose.
+        /// </summary>
+        protected override void DisposeAcceleratorObject(bool disposing)
+        {
+        }
+
+        #endregion
+    }
+}
+
+