diff --git a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
new file mode 100644
index 000000000..640f48cf6
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
@@ -0,0 +1,468 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityAccelerator.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends;
+using ILGPU.Backends.IL;
+using ILGPU.Backends.Velocity;
+using ILGPU.Resources;
+using ILGPU.Runtime.CPU;
+using System;
+using System.Diagnostics;
+using System.Reflection;
+using System.Reflection.Emit;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// A SIMD-enabled CPU-based accelerator.
+    /// </summary>
+    public sealed class VelocityAccelerator : Accelerator
+    {
+        #region Static
+
+        /// <summary>
+        /// The internal method to launch kernels. It is resolved via reflection so
+        /// that dynamically generated launchers can emit a call to it.
+        /// </summary>
+        private static readonly MethodInfo RunMethodInfo =
+            typeof(VelocityAccelerator).GetMethod(
+                nameof(Run),
+                BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
+
+        #endregion
+
+        #region Instance
+
+        // One multiprocessor per device multiprocessor (filled by the constructor)
+        private readonly VelocityMultiprocessor[] multiprocessors;
+        // Binary semaphore that serializes kernel launches
+        private readonly SemaphoreSlim taskConcurrencyLimit = new SemaphoreSlim(1);
+        // Rendezvous point between the launching thread and the multiprocessors
+        private readonly Barrier multiprocessorBarrier;
+
+        /// <summary>
+        /// Constructs a new Velocity accelerator.
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <param name="device">The Velocity device.</param>
+        /// <param name="threadPriority">
+        /// The thread priority of the execution threads.
+        /// </param>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the given device does not operate in little-endian mode.
+        /// </exception>
+        internal VelocityAccelerator(
+            Context context,
+            VelocityDevice device,
+            ThreadPriority threadPriority)
+            : base(context, device)
+        {
+            if (!device.IsLittleEndian)
+            {
+                throw new NotSupportedException(
+                    RuntimeErrorMessages.VelocityLittleEndian);
+            }
+
+            multiprocessors = new VelocityMultiprocessor[device.NumMultiprocessors];
+            // All multiprocessors plus the launching thread take part in the barrier
+            multiprocessorBarrier = new Barrier(device.NumMultiprocessors + 1);
+            ThreadPriority = threadPriority;
+            MaxLocalMemoryPerThread = device.MaxLocalMemoryPerThread;
+            NumThreads = device.WarpSize * device.NumMultiprocessors;
+
+            // Initialize all multiprocessors (sharing a single completion delegate)
+            Action<VelocityMultiprocessor> processingCompleted = OnProcessingCompleted;
+            for (int i = 0; i < device.NumMultiprocessors; ++i)
+            {
+                var multiProcessor = new VelocityMultiprocessor(this, i);
+                multiProcessor.ProcessingCompleted += processingCompleted;
+                multiprocessors[i] = multiProcessor;
+            }
+
+            // Init the underlying Velocity backend
+            Init(new VelocityBackend(
+                context,
+                new CPUCapabilityContext(),
+                WarpSize,
+                new VelocityArgumentMapper(context)));
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the Velocity backend of this accelerator.
+        /// </summary>
+        internal new VelocityBackend Backend =>
+            base.Backend as VelocityBackend;
+
+        /// <summary>
+        /// Returns the current thread priority.
+        /// </summary>
+        public ThreadPriority ThreadPriority { get; }
+
+        /// <summary>
+        /// Returns the maximum local memory per thread in bytes.
+        /// </summary>
+        public int MaxLocalMemoryPerThread { get; }
+
+        /// <summary>
+        /// Returns the maximum number of parallel threads.
+        /// </summary>
+        public int NumThreads { get; }
+
+        #endregion
+
+        #region Launch Methods
+
+        /// <summary>
+        /// Main internal run method to launch loaded kernels.
+        /// </summary>
+        /// <param name="userKernelConfig">The user-defined kernel config.</param>
+        /// <param name="runtimeKernelConfig">
+        /// The actual runtime kernel config to be used for launching.
+        /// </param>
+        /// <param name="kernelHandler">The kernel entry point handler.</param>
+        /// <param name="velocityParameters">
+        /// The parameters object that is forwarded to each multiprocessor.
+        /// </param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal void Run(
+            KernelConfig userKernelConfig,
+            RuntimeKernelConfig runtimeKernelConfig,
+            VelocityKernelEntryPoint kernelHandler,
+            VelocityParameters velocityParameters)
+        {
+            // Avoid concurrent executions of kernels.. we have to wait for the current
+            // kernel to finish first
+            taskConcurrencyLimit.Wait();
+            try
+            {
+                // Distribute the workload
+                int groupSize = runtimeKernelConfig.GroupDim.Size;
+                int gridSize = runtimeKernelConfig.GridDim.Size;
+
+                // Nothing to do for an empty grid (this also avoids a division by
+                // zero below)
+                if (gridSize < 1)
+                    return;
+
+                // Use at least one multiprocessor: the integer division yields 0 for
+                // grids that are smaller than the number of multiprocessors
+                int numActiveMPs = Math.Max(
+                    1,
+                    Math.Min(gridSize / NumMultiprocessors, NumMultiprocessors));
+                int chunkSizePerMP = IntrinsicMath.DivRoundUp(gridSize, numActiveMPs);
+
+                // The barrier has been created for all multiprocessors plus the
+                // launching thread. Idle multiprocessors never signal it, so they
+                // must not be counted as participants during this launch
+                int numIdleMPs = NumMultiprocessors - numActiveMPs;
+                if (numIdleMPs > 0)
+                    multiprocessorBarrier.RemoveParticipants(numIdleMPs);
+                try
+                {
+                    // Start the multiprocessor journey
+                    for (int i = 0; i < numActiveMPs; ++i)
+                    {
+                        int startIndex = i * chunkSizePerMP;
+                        // The end index is inclusive
+                        int endIndex =
+                            Math.Min(startIndex + chunkSizePerMP, gridSize) - 1;
+                        multiprocessors[i].Run(
+                            kernelHandler,
+                            startIndex,
+                            endIndex,
+                            gridSize,
+                            userKernelConfig.GridDim.Size,
+                            groupSize,
+                            velocityParameters);
+                    }
+
+                    // Wait for all active multiprocessors to finish
+                    multiprocessorBarrier.SignalAndWait();
+                }
+                finally
+                {
+                    // Restore the original participant count for the next launch
+                    if (numIdleMPs > 0)
+                        multiprocessorBarrier.AddParticipants(numIdleMPs);
+                }
+            }
+            finally
+            {
+                taskConcurrencyLimit.Release();
+            }
+        }
+
+        /// <summary>
+        /// Signals the launch barrier once a multiprocessor has finished its chunk.
+        /// </summary>
+        /// <param name="processor">The multiprocessor that has finished.</param>
+        private void OnProcessingCompleted(VelocityMultiprocessor processor) =>
+            multiprocessorBarrier.SignalAndWait();
+
+        /// <summary>
+        /// Generates a dynamic kernel-launcher method that will be just-in-time compiled
+        /// during the first invocation. Using the generated launcher lowers the overhead
+        /// for kernel launching dramatically, since unnecessary operations (like boxing)
+        /// can be avoided.
+        /// </summary>
+        /// <param name="kernel">The kernel to generate a launcher for.</param>
+        /// <param name="customGroupSize">
+        /// The custom group size for the launching operation.
+        /// </param>
+        /// <returns>The generated launcher method.</returns>
+        private MethodInfo GenerateKernelLauncherMethod(
+            VelocityCompiledKernel kernel,
+            int customGroupSize)
+        {
+            var entryPoint = kernel.EntryPoint;
+            AdjustAndVerifyKernelGroupSize(ref customGroupSize, entryPoint);
+
+            // By-ref kernel parameters are not supported by this launcher
+            if (entryPoint.HasByRefParameters)
+            {
+                throw new NotSupportedException(
+                    ErrorMessages.NotSupportedByRefKernelParameters);
+            }
+
+            // Declare a new launcher method
+            using var scopedLock = entryPoint.CreateLauncherMethod(
+                Context.RuntimeSystem,
+                out var launcher);
+            var emitter = new ILEmitter(launcher.ILGenerator);
+
+            // Map all arguments to an argument structure containing mapped views
+            var argumentMapper = Backend.ArgumentMapper;
+            var (structLocal, _) = argumentMapper.Map(emitter, entryPoint);
+
+            // Cache the (casted) kernel instance in a local
+            var velocityKernel = emitter.DeclareLocal(typeof(VelocityKernel));
+            KernelLauncherBuilder.EmitLoadKernelArgument<VelocityKernel, ILEmitter>(
+                Kernel.KernelInstanceParamIdx, emitter);
+            emitter.Emit(LocalOperation.Store, velocityKernel);
+
+            // Create an instance of the custom parameters type
+            var parametersInstance = emitter.DeclareLocal(kernel.ParametersType);
+            {
+                // Assign parameters
+                var parameters = entryPoint.Parameters;
+                for (int i = 0, e = parameters.Count; i < e; ++i)
+                {
+                    // Load the current argument onto the stack
+                    emitter.Emit(
+                        ArgumentOperation.Load,
+                        i + Kernel.KernelParameterOffset);
+                    if (parameters.IsByRef(i))
+                        emitter.Emit(OpCodes.Ldobj, parameters[i]);
+                }
+
+                // Create new task object
+                emitter.EmitNewObject(kernel.ParametersTypeConstructor);
+
+                // Store task
+                emitter.Emit(LocalOperation.Store, parametersInstance);
+            }
+
+            // Load the accelerator instance: Run is an instance method and requires
+            // its target object to be pushed before all other arguments
+            emitter.Emit(LocalOperation.Load, velocityKernel);
+            emitter.EmitCall(
+                typeof(VelocityKernel)
+                    .GetProperty(nameof(VelocityKernel.VelocityAccelerator))
+                    .GetGetMethod(false));
+
+            // Load custom user dimension
+            KernelLauncherBuilder.EmitLoadKernelConfig(
+                entryPoint,
+                emitter,
+                Kernel.KernelParamDimensionIdx,
+                MaxGridSize,
+                MaxGroupSize);
+
+            // Load dimensions
+            KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
+                entryPoint,
+                emitter,
+                Kernel.KernelParamDimensionIdx,
+                MaxGridSize,
+                MaxGroupSize,
+                customGroupSize);
+
+            // Load the kernel delegate
+            emitter.Emit(LocalOperation.Load, velocityKernel);
+            emitter.EmitCall(VelocityKernel.GetKernelExecutionDelegate);
+
+            // Load the parameters object
+            emitter.Emit(LocalOperation.Load, parametersInstance);
+
+            // Launch kernel execution
+            emitter.EmitCall(RunMethodInfo);
+
+            // End of launch method
+            emitter.Emit(OpCodes.Ret);
+            emitter.Finish();
+
+            return launcher.Finish();
+        }
+
+        #endregion
+
+        /// <inheritdoc/>
+        public override TExtension CreateExtension<
+            TExtension,
+            TExtensionProvider>(TExtensionProvider provider) =>
+            provider.CreateVelocityExtension(this);
+
+        /// <inheritdoc/>
+        protected override MemoryBuffer AllocateRawInternal(
+            long length,
+            int elementSize) =>
+            new VelocityMemoryBuffer(this, length, elementSize);
+
+        /// <summary>
+        /// Loads the given kernel.
+        /// </summary>
+        /// <param name="kernel">The kernel to load.</param>
+        /// <param name="customGroupSize">The custom group size.</param>
+        /// <returns>The loaded kernel</returns>
+        /// <exception cref="ArgumentNullException">
+        /// Thrown if <paramref name="kernel"/> is null.
+        /// </exception>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the kernel has not been compiled by the Velocity backend.
+        /// </exception>
+        private Kernel LoadKernel(CompiledKernel kernel, int customGroupSize)
+        {
+            if (kernel is null)
+                throw new ArgumentNullException(nameof(kernel));
+            var compiledKernel = kernel as VelocityCompiledKernel;
+            if (compiledKernel is null)
+            {
+                throw new NotSupportedException(
+                    RuntimeErrorMessages.NotSupportedKernel);
+            }
+
+            var launcherMethod = GenerateKernelLauncherMethod(
+                compiledKernel,
+                customGroupSize);
+            return new VelocityKernel(
+                this,
+                compiledKernel,
+                launcherMethod);
+        }
+
+        /// <summary>
+        /// Loads a default kernel.
+        /// </summary>
+        protected override Kernel LoadKernelInternal(CompiledKernel kernel) =>
+            LoadKernel(kernel, 0);
+
+        /// <summary>
+        /// Loads an implicitly grouped kernel.
+        /// </summary>
+        protected override Kernel LoadImplicitlyGroupedKernelInternal(
+            CompiledKernel kernel,
+            int customGroupSize,
+            out KernelInfo kernelInfo)
+        {
+            if (customGroupSize < 0)
+                throw new ArgumentOutOfRangeException(nameof(customGroupSize));
+            kernelInfo = KernelInfo.CreateFrom(
+                kernel.Info,
+                customGroupSize,
+                null);
+            return LoadKernel(kernel, customGroupSize);
+        }
+
+        /// <summary>
+        /// Loads an auto grouped kernel.
+        /// </summary>
+        protected override Kernel LoadAutoGroupedKernelInternal(
+            CompiledKernel kernel,
+            out KernelInfo kernelInfo)
+        {
+            // Auto-grouped kernels always use one warp per group
+            var result = LoadKernel(kernel, WarpSize);
+            kernelInfo = new KernelInfo(WarpSize, NumThreads / WarpSize);
+            return result;
+        }
+
+        /// <inheritdoc/>
+        protected override AcceleratorStream CreateStreamInternal() =>
+            new VelocityStream(this);
+
+        /// <inheritdoc/>
+        protected override void SynchronizeInternal() { }
+
+        /// <inheritdoc/>
+        protected override void OnBind() { }
+
+        /// <inheritdoc/>
+        protected override void OnUnbind() { }
+
+        #region Peer Access
+
+        /// <inheritdoc/>
+        protected override bool CanAccessPeerInternal(Accelerator otherAccelerator) =>
+            otherAccelerator is CPUAccelerator ||
+            otherAccelerator is VelocityAccelerator;
+
+        /// <inheritdoc/>
+        protected override void EnablePeerAccessInternal(Accelerator otherAccelerator)
+        {
+            if (!CanAccessPeerInternal(otherAccelerator))
+            {
+                throw new InvalidOperationException(
+                    RuntimeErrorMessages.CannotEnablePeerAccessToOtherAccelerator);
+            }
+        }
+
+        /// <inheritdoc/>
+        protected override void DisablePeerAccessInternal(
+            Accelerator otherAccelerator) =>
+            Debug.Assert(
+                CanAccessPeerInternal(otherAccelerator),
+                "Invalid DisablePeerAccess method");
+
+        #endregion
+
+        #region Occupancy
+
+        /// <inheritdoc/>
+        protected override int EstimateMaxActiveGroupsPerMultiprocessorInternal(
+            Kernel kernel,
+            int groupSize,
+            int dynamicSharedMemorySizeInBytes) =>
+            kernel is VelocityKernel
+            ? groupSize > MaxGroupSize.Size ? 0 : NumMultiprocessors
+            : throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+
+        /// <inheritdoc/>
+        protected override int EstimateGroupSizeInternal(
+            Kernel kernel,
+            Func<int, int> computeSharedMemorySize,
+            int maxGroupSize,
+            out int minGridSize)
+        {
+            if (!(kernel is VelocityKernel))
+                throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+
+            // Estimation
+            // NOTE(review): a maxGroupSize of 0 yields a group size of 0 here, while
+            // the overload below always returns 1 - confirm which one is intended
+            minGridSize = NumThreads;
+            return Math.Min(maxGroupSize, MaxGroupSize.Size);
+        }
+
+        /// <inheritdoc/>
+        protected override int EstimateGroupSizeInternal(
+            Kernel kernel,
+            int dynamicSharedMemorySizeInBytes,
+            int maxGroupSize,
+            out int minGridSize)
+        {
+            if (!(kernel is VelocityKernel))
+                throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel);
+
+            // Estimation
+            minGridSize = NumThreads;
+            return 1;
+        }
+
+        #endregion
+
+        #region Page Lock Scope
+
+        /// <inheritdoc/>
+        protected override PageLockScope CreatePageLockFromPinnedInternal(
+            IntPtr pinned,
+            long numElements)
+        {
+            // Page locking is not supported: emit a trace message and return a scope
+            // that does not lock anything
+            Trace.WriteLine(RuntimeErrorMessages.NotSupportedPageLock);
+            return new NullPageLockScope(this, pinned, numElements);
+        }
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Dispose all managed resources allocated by this CPU accelerator instance.
+        /// </summary>
+        protected override void DisposeAccelerator_SyncRoot(bool disposing)
+        {
+            if (!disposing)
+                return;
+
+            // Wait for a pending kernel launch to finish
+            taskConcurrencyLimit.Wait();
+
+            // Dispose all multiprocessors
+            foreach (var multiprocessor in multiprocessors)
+                multiprocessor.Dispose();
+
+            // Dispose barriers and semaphores
+            multiprocessorBarrier.Dispose();
+            taskConcurrencyLimit.Dispose();
+        }
+
+        #endregion
+
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs
new file mode 100644
index 000000000..e6a44ac7e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs
@@ -0,0 +1,74 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityContextExtensions.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends;
+using ILGPU.Backends.PTX;
+using ILGPU.Resources;
+using System;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// Velocity specific context extensions.
+    /// </summary>
+    public static class VelocityContextExtensions
+    {
+        #region Builder
+
+        /// <summary>
+        /// Enables all velocity devices.
+        /// </summary>
+        /// <param name="builder">The builder instance.</param>
+        /// <param name="maxSharedMemoryPerGroup">
+        /// The maximum number bytes of shared memory per group.
+        /// </param>
+        /// <returns>The updated builder instance.</returns>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the current runtime platform is not a 64-bit platform.
+        /// </exception>
+        public static Context.Builder Velocity(
+            this Context.Builder builder,
+            int maxSharedMemoryPerGroup = VelocityDevice.MinSharedMemoryPerGroup)
+        {
+            if (!Backend.RuntimePlatform.Is64Bit())
+            {
+                throw new NotSupportedException(string.Format(
+                    RuntimeErrorMessages.VelocityPlatform64,
+                    Backend.RuntimePlatform));
+            }
+
+            // Forward the requested amount of shared memory to the device
+            builder.DeviceRegistry.Register(
+                new VelocityDevice(maxSharedMemoryPerGroup));
+            return builder;
+        }
+
+        #endregion
+
+        #region Context
+
+        /// <summary>
+        /// Gets a registered Velocity device.
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <returns>The registered Velocity device.</returns>
+        public static VelocityDevice GetVelocityDevice(this Context context) =>
+            context.GetDevice<VelocityDevice>(0);
+
+        /// <summary>
+        /// Creates a new Velocity accelerator.
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <returns>The created Velocity accelerator.</returns>
+        public static VelocityAccelerator CreateVelocityAccelerator(
+            this Context context) =>
+            context.GetVelocityDevice().CreateVelocityAccelerator(context);
+
+        #endregion
+    }
+
+}
+
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
new file mode 100644
index 000000000..7456444e2
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
@@ -0,0 +1,140 @@
+using System;
+using System.Numerics;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// Represents a software-emulated velocity device for high-performance execution of
+    /// tasks on the CPU using vectorization.
+    /// </summary>
+    public sealed class VelocityDevice : Device
+    {
+        #region Constants
+
+        /// <summary>
+        /// The minimum (and default) amount of shared memory per group in bytes
+        /// (1024k).
+        /// </summary>
+        public const int MinSharedMemoryPerGroup = 1 << 20;
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Creates a new velocity device with the default amount of shared memory per
+        /// group (refer to <see cref="MinSharedMemoryPerGroup"/> for more
+        /// information about the default size).
+        /// </summary>
+        public VelocityDevice()
+            : this(MinSharedMemoryPerGroup)
+        { }
+
+        /// <summary>
+        /// Creates a new velocity device using the given amount of shared memory (min
+        /// amount is <see cref="MinSharedMemoryPerGroup"/> per group).
+        /// </summary>
+        /// <param name="maxSharedMemoryPerGroup">
+        /// The maximum amount of shared memory per group in bytes.
+        /// </param>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// Thrown if <paramref name="maxSharedMemoryPerGroup"/> is smaller than
+        /// <see cref="MinSharedMemoryPerGroup"/>.
+        /// </exception>
+        public VelocityDevice(int maxSharedMemoryPerGroup)
+        {
+            if (maxSharedMemoryPerGroup < MinSharedMemoryPerGroup)
+                throw new ArgumentOutOfRangeException(nameof(maxSharedMemoryPerGroup));
+
+            Name = nameof(VelocityAccelerator);
+            WarpSize = VelocityWarp32.RawVectorLength;
+            MinWarpSize = VelocityWarp64.RawVectorLength;
+            // A group and a multiprocessor both hold exactly one warp
+            MaxNumThreadsPerGroup = MaxNumThreadsPerMultiprocessor = WarpSize;
+            NumMultiprocessors = Environment.ProcessorCount;
+            MaxGroupSize = new Index3D(
+                MaxNumThreadsPerGroup,
+                1,
+                1);
+
+            MemorySize = long.MaxValue;
+            MaxGridSize = new Index3D(int.MaxValue, ushort.MaxValue, ushort.MaxValue);
+            MaxSharedMemoryPerGroup = maxSharedMemoryPerGroup;
+            MaxConstantMemory = int.MaxValue;
+            NumThreads = MaxNumThreads;
+
+            // Get the endian type from the global BitConverter class
+            IsLittleEndian = BitConverter.IsLittleEndian;
+
+            // Allocate a sufficient amount of local memory per thread equal to
+            // the maximum number of shared memory per group in bytes
+            MaxLocalMemoryPerThread = maxSharedMemoryPerGroup;
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the minimum warp size of this device.
+        /// </summary>
+        public int MinWarpSize { get; }
+
+        /// <summary>
+        /// Returns the number of threads.
+        /// </summary>
+        public int NumThreads { get; }
+
+        /// <summary>
+        /// Returns true if this device operates in little endian mode.
+        /// </summary>
+        public bool IsLittleEndian { get; }
+
+        /// <summary>
+        /// Returns the maximum local memory per thread in bytes.
+        /// </summary>
+        public int MaxLocalMemoryPerThread { get; }
+
+        #endregion
+
+        #region Methods
+
+        /// <inheritdoc/>
+        public override Accelerator CreateAccelerator(Context context) =>
+            CreateVelocityAccelerator(context);
+
+        /// <summary>
+        /// Creates a new Velocity accelerator using the default thread priority
+        /// (<see cref="ThreadPriority.Normal"/>).
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <returns>The created Velocity accelerator.</returns>
+        public VelocityAccelerator CreateVelocityAccelerator(
+            Context context) =>
+            CreateVelocityAccelerator(context, ThreadPriority.Normal);
+
+        /// <summary>
+        /// Creates a new Velocity accelerator using the given thread priority.
+        /// </summary>
+        /// <param name="context">The ILGPU context.</param>
+        /// <param name="threadPriority">
+        /// The thread priority of the execution threads.
+        /// </param>
+        /// <returns>The created Velocity accelerator.</returns>
+        public VelocityAccelerator CreateVelocityAccelerator(
+            Context context,
+            ThreadPriority threadPriority) =>
+            new VelocityAccelerator(context, this, threadPriority);
+
+        #endregion
+
+        #region Object
+
+        /// <summary>
+        /// Returns true if the given object is a Velocity device with the same amount
+        /// of shared memory per group and the base implementation considers both
+        /// objects equal.
+        /// </summary>
+        public override bool Equals(object obj) =>
+            obj is VelocityDevice device &&
+            device.MaxSharedMemoryPerGroup == MaxSharedMemoryPerGroup &&
+            base.Equals(obj);
+
+        /// <summary>
+        /// Combines the base hash code with the amount of shared memory per group.
+        /// </summary>
+        public override int GetHashCode() => base.GetHashCode() ^ MaxSharedMemoryPerGroup;
+
+        #endregion
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs
new file mode 100644
index 000000000..41a6c2d0e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs
@@ -0,0 +1,80 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityKernel.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends.Velocity;
+using System.Reflection;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// Represents a single Velocity kernel.
+    /// </summary>
+    public sealed class VelocityKernel : Kernel
+    {
+        #region Static
+
+        /// <summary>
+        /// Represents the <see cref="KernelEntryPoint"/> property getter.
+        /// </summary>
+        internal static readonly MethodInfo GetKernelExecutionDelegate =
+            typeof(VelocityKernel).GetProperty(
+                nameof(KernelEntryPoint),
+                BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance)
+            .GetGetMethod(true);
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Wraps a compiled Velocity kernel and creates its kernel entry point
+        /// delegate.
+        /// </summary>
+        /// <param name="accelerator">The associated accelerator.</param>
+        /// <param name="kernel">The source kernel.</param>
+        /// <param name="launcher">The launcher method for the given kernel.</param>
+        internal VelocityKernel(
+            VelocityAccelerator accelerator,
+            VelocityCompiledKernel kernel,
+            MethodInfo launcher)
+            : base(accelerator, kernel, launcher)
+        {
+            KernelEntryPoint = kernel.CreateKernelEntryPoint();
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the associated Velocity runtime.
+        /// </summary>
+        public VelocityAccelerator VelocityAccelerator =>
+            Accelerator as VelocityAccelerator;
+
+        /// <summary>
+        /// The main kernel entry point function to be called from each velocity
+        /// multiprocessor during execution.
+        /// </summary>
+        internal VelocityKernelEntryPoint KernelEntryPoint { get; }
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Does not perform any operation.
+        /// </summary>
+        protected override void DisposeAcceleratorObject(bool disposing) { }
+
+        #endregion
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs
new file mode 100644
index 000000000..e9c640a0e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs
@@ -0,0 +1,151 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                        Copyright (c) 2017-2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityMemoryBuffer.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Runtime.CPU;
+using ILGPU.Util;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// A memory buffer that lives in CPU space.
+    /// </summary>
+    public class VelocityMemoryBuffer : MemoryBuffer
+    {
+        #region Instance
+
+        /// <summary>
+        /// Initializes this array view source on the CPU.
+        /// </summary>
+        /// <param name="accelerator">The parent accelerator (if any).</param>
+        /// <param name="length">The length of this source.</param>
+        /// <param name="elementSize">The element size.</param>
+        internal VelocityMemoryBuffer(
+            Accelerator accelerator,
+            long length,
+            int elementSize)
+            : base(accelerator, length, elementSize)
+        {
+            // Ensure that all element accesses will be properly aligned
+            int alignment = elementSize * accelerator.WarpSize;
+
+            // The buffer has to hold length * elementSize bytes. Reserve additional
+            // padding in order to shift the start of the buffer to the next aligned
+            // address
+            long paddedLengthInBytes = length * elementSize + alignment;
+
+            // Allocate resources and assign pointers. Note that the alignment offset
+            // has to be derived from the actual address (not from the length).
+            // NOTE(review): assumes that Interop.ComputeAlignmentOffset returns the
+            // number of bytes to add in order to reach the next aligned offset
+            NativeBufferPtr = Marshal.AllocHGlobal(new IntPtr(paddedLengthInBytes));
+            int alignmentOffset = Interop.ComputeAlignmentOffset(
+                NativeBufferPtr.ToInt64(),
+                alignment);
+            NativePtr = NativeBufferPtr + alignmentOffset;
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the natively allocated underlying buffer pointer which may not be
+        /// aligned in all cases.
+        /// </summary>
+        public IntPtr NativeBufferPtr { get; private set; }
+
+        #endregion
+
+        #region Methods
+
+        /// <inheritdoc/>
+        protected internal override void MemSet(
+            AcceleratorStream stream,
+            byte value,
+            in ArrayView<byte> targetView) =>
+            CPUMemoryBuffer.CPUMemSet(
+                targetView.LoadEffectiveAddressAsPtr(),
+                value,
+                0L,
+                targetView.LengthInBytes);
+
+        /// <inheritdoc/>
+        protected internal override void CopyFrom(
+            AcceleratorStream stream,
+            in ArrayView<byte> sourceView,
+            in ArrayView<byte> targetView) =>
+            CPUMemoryBuffer.CPUCopyFrom(stream, sourceView, targetView);
+
+        /// <inheritdoc/>
+        protected internal override void CopyTo(
+            AcceleratorStream stream,
+            in ArrayView<byte> sourceView,
+            in ArrayView<byte> targetView) =>
+            CPUMemoryBuffer.CPUCopyTo(stream, sourceView, targetView);
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Disposes the underlying memory buffer.
+        /// </summary>
+        protected override void DisposeAcceleratorObject(bool disposing)
+        {
+            // Free the originally allocated (unaligned) pointer and reset both
+            // pointers
+            Marshal.FreeHGlobal(NativeBufferPtr);
+            NativeBufferPtr = IntPtr.Zero;
+            NativePtr = IntPtr.Zero;
+        }
+
+        #endregion
+
+    }
+
+    /// <summary>
+    /// A memory buffer that hands out chunks of its memory via
+    /// <see cref="Allocate{T}(int)"/> until it is reset.
+    /// </summary>
+    sealed class VelocityMemoryBufferPool : VelocityMemoryBuffer
+    {
+        #region Instance
+
+        // The current allocation offset in bytes
+        private volatile int sharedMemoryOffset;
+        private readonly int warpSize;
+
+        /// <summary>
+        /// Creates a new pool of the given size.
+        /// </summary>
+        /// <param name="accelerator">The parent accelerator.</param>
+        /// <param name="size">The size of the pool in bytes.</param>
+        public VelocityMemoryBufferPool(
+            VelocityAccelerator accelerator,
+            int size)
+            : base(accelerator, size, 1)
+        {
+            warpSize = accelerator.WarpSize;
+        }
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// Resets the allocation offset to the beginning of the pool.
+        /// </summary>
+        public void Reset() =>
+            Interlocked.Exchange(ref sharedMemoryOffset, 0);
+
+        /// <summary>
+        /// Gets a chunk of memory of a certain type.
+        /// </summary>
+        /// <param name="length">The number of elements.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A view pointing to the right chunk of shared memory.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ArrayView<T> Allocate<T>(int length)
+            where T : unmanaged
+        {
+            int elementSize = Interop.SizeOf<T>();
+            int totalElementSize = length * elementSize * warpSize;
+            int alignment = Interop.ComputeAlignmentOffset(
+                sharedMemoryOffset,
+                totalElementSize);
+
+            // Reserve the alignment padding and the chunk itself. Advancing the
+            // offset by the padding only would hand out overlapping chunks
+            int endOffset = Interlocked.Add(
+                ref sharedMemoryOffset,
+                alignment + totalElementSize);
+            int startOffset = endOffset - totalElementSize;
+
+            // The start offset is measured in bytes and is aligned to a multiple of
+            // the chunk size, whereas views are indexed in elements of type T.
+            // NOTE(review): the pool capacity is not checked here - confirm that
+            // callers never request more memory than the pool provides
+            return new ArrayView<T>(
+                this,
+                startOffset / elementSize,
+                length * warpSize);
+        }
+
+        #endregion
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs b/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs
new file mode 100644
index 000000000..98d40de2e
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs
@@ -0,0 +1,402 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2022 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: VelocityMultiprocessor.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Util;
+using System;
+using System.Collections.Immutable;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// Represents a single velocity kernel processing delegate.
+    /// </summary>
+    /// <param name="globalStartIndex">The start index within the thread grid.</param>
+    /// <param name="globalEndIndex">The end index within the thread grid.</param>
+    /// <param name="parameters">The current parameters.</param>
+    delegate void VelocityKernelEntryPoint(
+        int globalStartIndex,
+        int globalEndIndex,
+        VelocityParameters parameters);
+
+    /// <summary>
+    /// A single velocity multiprocessor consisting of a single processing thread and
+    /// a runtime context.
+    /// </summary>
+    sealed class VelocityMultiprocessor : DisposeBase
+    {
+        #region Static
+
+        /// <summary>
+        /// All kernel handler types required to launch a kernel delegate on this MP.
+        /// </summary>
+        public static readonly ImmutableArray<Type> KernelHandlerTypes =
+            ImmutableArray.Create(
+                typeof(int),
+                typeof(int),
+                typeof(VelocityParameters));
+
+        /// <summary>
+        /// Stores the current velocity multiprocessor.
+        /// </summary>
+        [ThreadStatic]
+        private static VelocityMultiprocessor current;
+
+        /// <summary>
+        /// Returns the parent velocity multiprocessor for the current thread.
+        /// </summary>
+        /// <returns>The parent multiprocessor for the current thread.</returns>
+        public static VelocityMultiprocessor GetCurrent() => current;
+
+        /// <summary>
+        /// Allocates a chunk of shared memory.
+        /// </summary>
+        /// <param name="length">The number of elements to allocate.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A velocity warp made of shared-memory pointers.</returns>
+        public static VelocityWarp64 GetSharedMemory<T>(int length)
+            where T : unmanaged
+        {
+            var sharedMemoryView = GetCurrent().GetSharedMemoryFromPool<T>(length);
+            long intPtr = sharedMemoryView.LoadEffectiveAddressAsPtr().ToInt64();
+            var addresses = VelocityWarp64.GetConstI(intPtr);
+            // NOTE(review): the lane index is added without scaling it by the
+            // element size - confirm that this is the intended per-lane layout
+            return addresses.AddU(VelocityWarp64.LaneIndexVector);
+        }
+
+        /// <summary>
+        /// Allocates a chunk of local memory.
+        /// </summary>
+        /// <param name="length">The number of elements to allocate.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A velocity warp made of local-memory pointers.</returns>
+        public static VelocityWarp64 GetLocalMemory<T>(int length)
+            where T : unmanaged
+        {
+            var localMemoryView = GetCurrent().GetLocalMemoryFromPool<T>(length);
+            long intPtr = localMemoryView.LoadEffectiveAddressAsPtr().ToInt64();
+            var addresses = VelocityWarp64.GetConstI(intPtr);
+            // NOTE(review): the lane index is added without scaling it by the
+            // element size - confirm that this is the intended per-lane layout
+            return addresses.AddU(VelocityWarp64.LaneIndexVector);
+        }
+
+        /// <summary>
+        /// Returns the current grid indices for all warp lanes associated with this
+        /// multiprocessor.
+        /// </summary>
+        /// <returns>A velocity warp made of grid indices.</returns>
+        public static VelocityWarp32 GetCurrentGridIdx() => GetCurrent().GridIdx;
+
+        /// <summary>
+        /// Sets the current grid index by advancing to the given offset.
+        /// </summary>
+        /// <param name="offset">The offset to advance to.</param>
+        public static void SetCurrentGridIdx(int offset) =>
+            GetCurrent().ResetGridIndex(offset);
+
+        /// <summary>
+        /// Returns the current grid dimension for all warp lanes associated with this
+        /// multiprocessor.
+        /// </summary>
+        /// <returns>A velocity warp made of the current grid dimension.</returns>
+        public static VelocityWarp32 GetCurrentGridDim() =>
+            VelocityWarp32.GetConstI(GetCurrent().GridDim);
+
+        /// <summary>
+        /// Returns the current user defined grid dimension which may not be a multiple
+        /// of the group size used on this multiprocessor.
+        /// </summary>
+        /// <returns>A velocity warp made of the current user grid dimension.</returns>
+        public static VelocityWarp32 GetCurrentUserGridDim() =>
+            VelocityWarp32.GetConstI(GetCurrent().UserGridDim);
+
+        /// <summary>
+        /// Returns the current group dimension for all warp lanes associated with this
+        /// multiprocessor.
+        /// </summary>
+        /// <returns>A velocity warp made of the current group dimension.</returns>
+        public static VelocityWarp32 GetCurrentGroupDim() =>
+            VelocityWarp32.GetConstI(GetCurrent().GroupDim);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetSharedMemory{T}(int)"/> method.
+        /// </summary>
+        /// <remarks>
+        /// Names of generic methods do not carry an arity suffix (in contrast to
+        /// generic types), so the lookup has to use the plain method name.
+        /// </remarks>
+        public static readonly MethodInfo GetSharedMemoryMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetSharedMemory),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetLocalMemory{T}(int)"/> method.
+        /// </summary>
+        /// <remarks>
+        /// Names of generic methods do not carry an arity suffix (in contrast to
+        /// generic types), so the lookup has to use the plain method name.
+        /// </remarks>
+        public static readonly MethodInfo GetLocalMemoryMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetLocalMemory),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetCurrentGridIdx"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetCurrentGridIdxMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetCurrentGridIdx),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="SetCurrentGridIdx(int)"/> method.
+        /// </summary>
+        public static readonly MethodInfo SetCurrentGridIdxMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(SetCurrentGridIdx),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetCurrentGridDim"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetCurrentGridDimMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetCurrentGridDim),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetCurrentUserGridDim"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetCurrentUserGridDimMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetCurrentUserGridDim),
+                BindingFlags.Public | BindingFlags.Static);
+
+        /// <summary>
+        /// Returns a handle to the <see cref="GetCurrentGroupDim"/> method.
+        /// </summary>
+        public static readonly MethodInfo GetCurrentGroupDimMethodInfo =
+            typeof(VelocityMultiprocessor).GetMethod(
+                nameof(GetCurrentGroupDim),
+                BindingFlags.Public | BindingFlags.Static);
+
+        #endregion
+
+        #region Events
+
+        /// <summary>
+        /// Will be raised once a chunk of a scheduled thread grid has been completed.
+        /// </summary>
+        public Action<VelocityMultiprocessor> ProcessingCompleted;
+
+        #endregion
+
+        #region Instance
+
+        // Thread data
+        private readonly Thread runtimeThread;
+        private readonly SemaphoreSlim startProcessingSema;
+
+        // Context data
+        private readonly VelocityMemoryBufferPool sharedMemoryPool;
+        private readonly VelocityMemoryBufferPool localMemoryPool;
+
+        // Runtime data
+        private volatile VelocityKernelEntryPoint kernelHandler;
+        private volatile int startIndexRange;
+        private volatile int endIndexRange;
+        private volatile VelocityParameters kernelParameters;
+        private volatile bool running = true;
+
+        /// <summary>
+        /// Initializes a new velocity multiprocessor.
+        /// </summary>
+        /// <param name="accelerator">The parent velocity accelerator.</param>
+        /// <param name="processorIndex">The current processor index.</param>
+        internal VelocityMultiprocessor(
+            VelocityAccelerator accelerator,
+            int processorIndex)
+        {
+            runtimeThread = new Thread(DoWork)
+            {
+                Priority = accelerator.ThreadPriority,
+                IsBackground = true,
+                Name = $"ILGPU_{accelerator.InstanceId}_Velocity_{processorIndex}"
+            };
+            startProcessingSema = new SemaphoreSlim(0);
+            sharedMemoryPool = new VelocityMemoryBufferPool(
+                accelerator,
+                accelerator.MaxSharedMemoryPerGroup);
+            localMemoryPool = new VelocityMemoryBufferPool(
+                accelerator,
+                accelerator.MaxLocalMemoryPerThread);
+            WarpSize = accelerator.WarpSize;
+            ProcessorIndex = processorIndex;
+
+            // Start the processing thread once all members have been initialized.
+            // The thread immediately blocks on the semaphore until the first kernel
+            // has been scheduled. Without starting it, no kernel would ever be
+            // processed and joining the thread during disposal would fail
+            runtimeThread.Start();
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the current warp size.
+        /// </summary>
+        public int WarpSize { get; }
+
+        /// <summary>
+        /// Returns the multiprocessor index.
+        /// </summary>
+        public int ProcessorIndex { get; }
+
+        /// <summary>
+        /// Returns the precomputed grid indices for all lanes in the current
+        /// multiprocessor.
+        /// </summary>
+        public VelocityWarp32 GridIdx { get; private set; }
+
+        /// <summary>
+        /// Returns the current grid dimension.
+        /// </summary>
+        public int GridDim { get; private set; }
+
+        /// <summary>
+        /// Returns the current user grid dimension.
+        /// </summary>
+        public int UserGridDim { get; private set; }
+
+        /// <summary>
+        /// Returns the current group dimension.
+        /// </summary>
+        public int GroupDim { get; private set; }
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// Resets the current grid index.
+        /// </summary>
+        /// <param name="offset">The offset to add to the base index.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ResetGridIndex(int offset)
+        {
+            // Compute the initial grid indices based on our processor index and the
+            // warp size
+            int baseIndex = WarpSize * ProcessorIndex + offset;
+            GridIdx =
+                VelocityWarp32.LaneIndexVector.AddI(
+                    VelocityWarp32.GetConstI(baseIndex));
+        }
+
+        /// <summary>
+        /// Gets a chunk of shared memory of a certain type.
+        /// </summary>
+        /// <param name="length">The number of elements.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A view pointing to the right chunk of shared memory.</returns>
+        public ArrayView<T> GetSharedMemoryFromPool<T>(int length)
+            where T : unmanaged =>
+            sharedMemoryPool.Allocate<T>(length);
+
+        /// <summary>
+        /// Gets a chunk of local memory of a certain type.
+        /// </summary>
+        /// <param name="length">The number of elements.</param>
+        /// <typeparam name="T">The element type to allocate.</typeparam>
+        /// <returns>A view pointing to the right chunk of local memory.</returns>
+        public ArrayView<T> GetLocalMemoryFromPool<T>(int length)
+            where T : unmanaged =>
+            localMemoryPool.Allocate<T>(length);
+
+        /// <summary>
+        /// Dispatches a new kernel execution.
+        /// </summary>
+        /// <param name="handler">The kernel handler delegate.</param>
+        /// <param name="startIndex">The start interval index.</param>
+        /// <param name="endIndex">The end interval index.</param>
+        /// <param name="gridDimension">The current grid dimension.</param>
+        /// <param name="userGridDimension">The current user grid dimension.</param>
+        /// <param name="groupDimension">The current group dimension.</param>
+        /// <param name="parameters">All kernel parameters.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void Run(
+            VelocityKernelEntryPoint handler,
+            int startIndex,
+            int endIndex,
+            int gridDimension,
+            int userGridDimension,
+            int groupDimension,
+            VelocityParameters parameters)
+        {
+            // Describe the work item; the worker thread reads these fields only
+            // after the semaphore at the end of this method has been released
+            kernelHandler = handler;
+            kernelParameters = parameters;
+            startIndexRange = startIndex;
+            endIndexRange = endIndex;
+
+            // Publish the launch dimensions. ResetGridIndex(offset: 0) is not called
+            // here on purpose, since every Velocity kernel invokes it on its own
+            GridDim = gridDimension;
+            UserGridDim = userGridDimension;
+            GroupDim = groupDimension;
+
+            // Start the launch with fresh memory pools
+            sharedMemoryPool.Reset();
+            localMemoryPool.Reset();
+
+            // Make all writes above visible to other threads before waking the
+            // worker thread
+            Thread.MemoryBarrier();
+            startProcessingSema.Release();
+        }
+
+        /// <summary>
+        /// Entry point of the worker thread: executes scheduled kernels until this
+        /// multiprocessor is shut down.
+        /// </summary>
+        private void DoWork()
+        {
+            // Register this instance as the current multiprocessor
+            current = this;
+
+            // Blocks until the semaphore has been released and reports whether
+            // there is a task to process (true) or a shutdown is pending (false)
+            bool WaitForNextTask()
+            {
+                startProcessingSema.Wait();
+                return running;
+            }
+
+            while (WaitForNextTask())
+            {
+                // Execute the scheduled kernel on the assigned index range
+                kernelHandler(startIndexRange, endIndexRange, kernelParameters);
+
+                // Report completion to the main thread; the delegate is invoked
+                // without a null check on purpose
+                ProcessingCompleted(this);
+            }
+        }
+
+        #endregion
+
+        #region IDisposable
+
+        ///
+        /// Waits for the processing thread to shutdown and disposes all internal thread
+        /// objects.
+        ///
+        /// <param name="disposing">
+        /// When true, the worker thread is stopped and the owned semaphore and memory
+        /// pools are disposed.
+        /// </param>
+        protected override void Dispose(bool disposing)
+        {
+            if (disposing)
+            {
+                // Clear the flag first and then wake the worker thread, so that it
+                // observes the shutdown request and leaves its processing loop
+                running = false;
+                startProcessingSema.Release();
+                runtimeThread.Join();
+
+                // The worker thread has terminated; release the objects it used
+                startProcessingSema.Dispose();
+                sharedMemoryPool.Dispose();
+                localMemoryPool.Dispose();
+            }
+            base.Dispose(disposing);
+        }
+
+        #endregion
+    }
+}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs
new file mode 100644
index 000000000..bd38b034c
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs
@@ -0,0 +1,27 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityParameters.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+namespace ILGPU.Runtime.Velocity
+{
+    /// <summary>
+    /// The base class for all velocity parameters.
+    /// </summary>
+    abstract class VelocityParameters
+    {
+        /// <summary>
+        /// Constructs a new parameters instance; does nothing at the moment.
+        /// </summary>
+        public VelocityParameters()
+        {
+        }
+    }
+}
+
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityStream.cs b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs
new file mode 100644
index 000000000..9adfe3cc9
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs
@@ -0,0 +1,74 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2022 ILGPU Project
+// www.ilgpu.net
+//
+// File: VelocityStream.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Runtime.CPU;
+
+namespace ILGPU.Runtime.Velocity
+{
+    ///
+    /// Represents a velocity stream.
+    ///
+    sealed class VelocityStream : AcceleratorStream
+    {
+        #region Static
+
+        /// <summary>
+        /// The default stream instance, created via the parameterless constructor.
+        /// </summary>
+        internal static readonly VelocityStream Default = new VelocityStream();
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Creates the stream that backs <see cref="Default"/>.
+        /// </summary>
+        private VelocityStream() { }
+
+        /// <summary>
+        /// Creates a new Velocity stream for the given accelerator.
+        /// </summary>
+        /// <param name="accelerator">The associated accelerator.</param>
+        internal VelocityStream(Accelerator accelerator) : base(accelerator) { }
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// Returns immediately without performing any operation.
+        /// </summary>
+        public override void Synchronize() { }
+
+        /// <inheritdoc/>
+        protected override ProfilingMarker AddProfilingMarkerInternal()
+        {
+            using (Accelerator.BindScoped())
+            {
+                return new CPUProfilingMarker(Accelerator);
+            }
+        }
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Returns immediately; this stream owns nothing that requires disposal.
+        /// </summary>
+        protected override void DisposeAcceleratorObject(bool disposing) { }
+
+        #endregion
+    }
+}
+
+