diff --git a/.gitignore b/.gitignore index ea20fd949..dae0ab9c4 100644 --- a/.gitignore +++ b/.gitignore @@ -249,6 +249,9 @@ ModelManifest.xml # macOS .DS_Store +# VSCode +.vscode/ + # Ignore specific template outputs Src/ILGPU/AtomicFunctions.cs Src/ILGPU/Backends/PTX/PTXIntrinsics.Generated.cs @@ -274,6 +277,7 @@ Src/ILGPU/Runtime/Cuda/LibDevice.cs Src/ILGPU/Runtime/KernelLoaders.cs Src/ILGPU/Runtime/MemoryBuffers.cs Src/ILGPU/Runtime/PageLockedArrays.Generated.cs +Src/ILGPU/Runtime/Velocity/VelocityWarps.cs Src/ILGPU/Static/ArithmeticEnums.cs Src/ILGPU/Static/CapabilityContext.cs Src/ILGPU/Static/DllImports.cs @@ -334,6 +338,7 @@ Src/ILGPU.Tests/.test.runsettings Src/ILGPU.Tests.CPU/Configurations.cs Src/ILGPU.Tests.Cuda/Configurations.cs Src/ILGPU.Tests.OpenCL/Configurations.cs +Src/ILGPU.Tests.Velocity/Configurations.cs # Generated test source files (Algorithms) Src/ILGPU.Algorithms.Tests/Generic/ConfigurationBase.cs @@ -362,3 +367,4 @@ Src/ILGPU.Algorithms.Tests/XMathTests.Trig.cs Src/ILGPU.Algorithms.Tests.CPU/Configurations.cs Src/ILGPU.Algorithms.Tests.Cuda/Configurations.cs Src/ILGPU.Algorithms.Tests.OpenCL/Configurations.cs + diff --git a/Src/ILGPU.Tests.Velocity/Configurations.tt b/Src/ILGPU.Tests.Velocity/Configurations.tt new file mode 100644 index 000000000..3290ac580 --- /dev/null +++ b/Src/ILGPU.Tests.Velocity/Configurations.tt @@ -0,0 +1,51 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: Configurations.tt/Configurations.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. 
+// --------------------------------------------------------------------------------------- + +<#@ template debug="false" hostspecific="true" language="C#" #> +<#@ include file="../ILGPU.Tests/Generic/ConfigurationBase.tt" #> +<#@ assembly name="System.Core" #> +<#@ import namespace="System.IO" #> +using Xunit; +using Xunit.Abstractions; + +<# +var configurationFile = Host.ResolvePath("../ILGPU.Tests/Configurations.txt"); +var configurations = TestConfig.Parse(configurationFile); +#> +namespace ILGPU.Tests.Velocity +{ +<# foreach (var (test, level, collection) in configurations) { #> +<# var name = $"Velocity{test}_{level}"; #> + [Collection("VelocityContextCollection<#= collection #>")] + public sealed partial class <#= name #> : <#= test #> + { + public <#= name #>( + ITestOutputHelper output, + VelocityTestContext<#= collection #> testContext) + : base(output, testContext) + { } + } + +<# } #> +<# foreach (var (config, level) in TestConfig.AllConfigurations) { #> + public class VelocityTestContext<#= config #> : VelocityTestContext + { + public VelocityTestContext<#= config #>() + : base(OptimizationLevel.<#= level #>) + { } + } + + [CollectionDefinition("VelocityContextCollection<#= config #>")] + public class VelocityContextCollection<#= config #> : + ICollectionFixture> { } + +<# } #> +} \ No newline at end of file diff --git a/Src/ILGPU.Tests.Velocity/ILGPU.Tests.Velocity.csproj b/Src/ILGPU.Tests.Velocity/ILGPU.Tests.Velocity.csproj new file mode 100644 index 000000000..ad568c5fa --- /dev/null +++ b/Src/ILGPU.Tests.Velocity/ILGPU.Tests.Velocity.csproj @@ -0,0 +1,57 @@ + + + + $(LibraryUnitTestTargetFrameworks) + false + + + + $(MSBuildProjectDirectory)\..\ILGPU.Tests\.test.runsettings + + + + true + AllEnabledByDefault + + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + + True + True + Configurations.tt + + + + + + TextTemplatingFileGenerator + Configurations.cs + + + + + + True + True + 
Configurations.tt + + + diff --git a/Src/ILGPU.Tests.Velocity/TestContext.cs b/Src/ILGPU.Tests.Velocity/TestContext.cs new file mode 100644 index 000000000..880e2358d --- /dev/null +++ b/Src/ILGPU.Tests.Velocity/TestContext.cs @@ -0,0 +1,44 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: TestContext.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Runtime.Velocity; +using System; + +namespace ILGPU.Tests.Velocity +{ + /// + /// An abstract test context for Velocity accelerators. + /// + public abstract class VelocityTestContext : TestContext + { + /// + /// Creates a new test context instance. + /// + /// The optimization level to use. + /// The context preparation handler. + protected VelocityTestContext( + OptimizationLevel optimizationLevel, + Action prepareContext) + : base( + optimizationLevel, + builder => prepareContext(builder.Velocity()), + context => context.CreateVelocityAccelerator()) + { } + + /// + /// Creates a new test context instance. + /// + /// The optimization level to use. 
+ protected VelocityTestContext(OptimizationLevel optimizationLevel) + : this(optimizationLevel, _ => { }) + { } + } +} diff --git a/Src/ILGPU.Tests/GroupOperations.cs b/Src/ILGPU.Tests/GroupOperations.cs index e62bd340f..fcc551dee 100644 --- a/Src/ILGPU.Tests/GroupOperations.cs +++ b/Src/ILGPU.Tests/GroupOperations.cs @@ -32,13 +32,18 @@ internal static void GroupDimensionKernel(ArrayView1D data) data[2] = Group.DimZ; } - [Theory] + [SkippableTheory] [InlineData(1, 0, 0)] [InlineData(0, 1, 0)] [InlineData(0, 0, 1)] [KernelMethod(nameof(GroupDimensionKernel))] public void GroupDimension1D(int xMask, int yMask, int zMask) { + Skip.If( + xMask >= Accelerator.MaxGroupSize.X || + yMask >= Accelerator.MaxGroupSize.Y || + zMask >= Accelerator.MaxGroupSize.Z); + for (int i = 2; i <= Math.Min(8, Accelerator.MaxNumThreadsPerGroup); i <<= 1) { using var buffer = Accelerator.Allocate1D(3); @@ -61,13 +66,18 @@ public void GroupDimension1D(int xMask, int yMask, int zMask) } } - [Theory] + [SkippableTheory] [InlineData(1, 1, 0)] [InlineData(0, 1, 1)] [InlineData(1, 0, 1)] [KernelMethod(nameof(GroupDimensionKernel))] public void GroupDimension2D(int xMask, int yMask, int zMask) { + Skip.If( + xMask >= Accelerator.MaxGroupSize.X || + yMask >= Accelerator.MaxGroupSize.Y || + zMask >= Accelerator.MaxGroupSize.Z); + var end = (int)Math.Sqrt(Accelerator.MaxNumThreadsPerGroup); for (int i = 2; i <= end; i <<= 1) { @@ -121,6 +131,7 @@ internal static void GroupBarrierKernel(ArrayView1D data) } [SkippableTheory] + [InlineData(2)] [InlineData(32)] [InlineData(256)] [InlineData(1024)] @@ -129,7 +140,7 @@ public void GroupBarrier(int length) { Skip.If(length > Accelerator.MaxNumThreadsPerGroup); - for (int i = 1; i <= Accelerator.MaxNumThreadsPerGroup; i <<= 1) + for (int i = 2; i <= Accelerator.MaxNumThreadsPerGroup; i <<= 1) { using var buffer = Accelerator.Allocate1D(length * i); var extent = new KernelConfig( @@ -151,6 +162,7 @@ internal static void GroupBarrierAndKernel( } 
[SkippableTheory] + [InlineData(2)] [InlineData(32)] [InlineData(256)] [InlineData(1024)] @@ -184,6 +196,7 @@ internal static void GroupBarrierOrKernel( } [SkippableTheory] + [InlineData(2)] [InlineData(32)] [InlineData(256)] [InlineData(1024)] @@ -219,6 +232,7 @@ internal static void GroupBarrierPopCountKernel( } [SkippableTheory] + [InlineData(2)] [InlineData(32)] [InlineData(256)] [InlineData(1024)] @@ -250,6 +264,7 @@ internal static void GroupBroadcastKernel( } [SkippableTheory] + [InlineData(2)] [InlineData(32)] [InlineData(256)] [InlineData(1024)] @@ -283,6 +298,7 @@ internal static void GroupDivergentControlFlowKernel( } [SkippableTheory] + [InlineData(2)] [InlineData(32)] [InlineData(256)] [InlineData(1024)] diff --git a/Src/ILGPU.Tests/KernelEntryPoints.cs b/Src/ILGPU.Tests/KernelEntryPoints.cs index f86ea7325..e4ad240c3 100644 --- a/Src/ILGPU.Tests/KernelEntryPoints.cs +++ b/Src/ILGPU.Tests/KernelEntryPoints.cs @@ -65,12 +65,14 @@ internal static void Index2EntryPointKernel( output[linearIndex] = linearIndex; } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(513)] [KernelMethod(nameof(Index2EntryPointKernel))] public void Index2EntryPoint(int length) { + Skip.If(length > Accelerator.MaxGroupSize.Y); + var extent = new Index2D(length, length); using var buffer = Accelerator.Allocate1D(extent.Size); Execute(extent, buffer.View, extent); @@ -88,12 +90,16 @@ internal static void Index3EntryPointKernel( output[linearIndex] = linearIndex; } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(257)] [KernelMethod(nameof(Index3EntryPointKernel))] public void Index3EntryPoint(int length) { + Skip.If( + length > Accelerator.MaxGroupSize.Y || + length > Accelerator.MaxGroupSize.Z); + var extent = new Index3D(length, length, length); using var buffer = Accelerator.Allocate1D(extent.Size); Execute(extent, buffer.View, extent); @@ -147,12 +153,14 @@ internal static void GroupedIndex2EntryPointKernel( output[idx] = idx; } - [Theory] + 
[SkippableTheory] [InlineData(33)] [InlineData(129)] [KernelMethod(nameof(GroupedIndex2EntryPointKernel))] public void GroupedIndex2EntryPoint(int length) { + Skip.If(length > Accelerator.MaxGroupSize.Y); + var end = (int)Math.Sqrt(Accelerator.MaxNumThreadsPerGroup); for (int i = 1; i <= end; i <<= 1) { @@ -197,11 +205,15 @@ internal static void GroupedIndex3EntryPointKernel( output[idx] = idx; } - [Theory] + [SkippableTheory] [InlineData(33)] [KernelMethod(nameof(GroupedIndex3EntryPointKernel))] public void GroupedIndex3EntryPoint(int length) { + Skip.If( + length > Accelerator.MaxGroupSize.Y || + length > Accelerator.MaxGroupSize.Z); + var end = (int)Math.Pow(Accelerator.MaxNumThreadsPerGroup, 1.0 / 3.0); for (int i = 1; i <= end; i <<= 1) { @@ -301,11 +313,13 @@ public void NonCapturingLambdaIndex1EntryPoint(int length) Verify(buffer.View, expected); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(513)] public void NonCapturingLambdaIndex2EntryPoint(int length) { + Skip.If(length > Accelerator.MaxGroupSize.Y); + Action, Index2D> kernel = (index, output, extent) => { @@ -321,11 +335,15 @@ public void NonCapturingLambdaIndex2EntryPoint(int length) Verify(buffer.View, expected); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(257)] public void NonCapturingLambdaIndex3EntryPoint(int length) { + Skip.If( + length > Accelerator.MaxGroupSize.Y || + length > Accelerator.MaxGroupSize.Z); + Action, Index3D> kernel = (index, output, extent) => { @@ -358,11 +376,13 @@ public void InstanceMethodIndex1EntryPoint(int length) Verify(buffer.View, expected); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(513)] public void InstanceMethodIndex2EntryPoint(int length) { + Skip.If(length > Accelerator.MaxGroupSize.Y); + var instanceHost = new InstaceHost(); Action, Index2D> kernel = instanceHost.InstanceKernel; @@ -376,11 +396,15 @@ public void InstanceMethodIndex2EntryPoint(int length) Verify(buffer.View, expected); } - [Theory] + 
[SkippableTheory] [InlineData(33)] [InlineData(257)] public void InstanceMethodIndex3EntryPoint(int length) { + Skip.If( + length > Accelerator.MaxGroupSize.Y || + length > Accelerator.MaxGroupSize.Z); + var instanceHost = new InstaceHost(); Action, Index3D> kernel = instanceHost.InstanceKernel; @@ -413,11 +437,13 @@ public void StaticPropertyCapturingLambdaIndex1EntryPoint(int length) Verify(buffer.View, expected); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(513)] public void StaticPropertyCapturingLambdaIndex2EntryPoint(int length) { + Skip.If(length > Accelerator.MaxGroupSize.Y); + Action, Index2D> kernel = (index, output, extent) => { @@ -434,11 +460,15 @@ public void StaticPropertyCapturingLambdaIndex2EntryPoint(int length) Verify(buffer.View, expected); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(257)] public void StaticPropertyCapturingLambdaIndex3EntryPoint(int length) { + Skip.If( + length > Accelerator.MaxGroupSize.Y || + length > Accelerator.MaxGroupSize.Z); + Action, Index3D> kernel = (index, output, extent) => { @@ -472,11 +502,13 @@ public void LocalCapturingLambdaIndex1EntryPoint(int length) Execute(kernel.Method, new Index1D((int)buffer.Length), buffer.View)); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(513)] public void LocalCapturingLambdaIndex2EntryPoint(int length) { + Skip.If(length > Accelerator.MaxGroupSize.Y); + var capturedVariable = 1; Action, Index2D> kernel = (index, output, extent) => @@ -491,11 +523,15 @@ public void LocalCapturingLambdaIndex2EntryPoint(int length) Execute(kernel.Method, extent, buffer.View, extent)); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(257)] public void LocalCapturingLambdaIndex3EntryPoint(int length) { + Skip.If( + length > Accelerator.MaxGroupSize.Y || + length > Accelerator.MaxGroupSize.Z); + var capturedVariable = 1; Action, Index3D> kernel = (index, output, extent) => @@ -548,11 +584,13 @@ public void 
StaticFieldCapturingLambdaIndex1EntryPoint(int length) VerifyStaticFieldCapturingLambdaException(e); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(513)] public void StaticFieldCapturingLambdaIndex2EntryPoint(int length) { + Skip.If(length > Accelerator.MaxGroupSize.Y); + Action, Index2D> kernel = (index, output, extent) => { @@ -567,11 +605,15 @@ public void StaticFieldCapturingLambdaIndex2EntryPoint(int length) VerifyStaticFieldCapturingLambdaException(e); } - [Theory] + [SkippableTheory] [InlineData(33)] [InlineData(257)] public void StaticFieldCapturingLambdaIndex3EntryPoint(int length) { + Skip.If( + length > Accelerator.MaxGroupSize.Y || + length > Accelerator.MaxGroupSize.Z); + Action, Index3D> kernel = (index, output, extent) => { diff --git a/Src/ILGPU.Tests/MemoryBufferOperations.tt b/Src/ILGPU.Tests/MemoryBufferOperations.tt index 70479dcb9..2f21d6e39 100644 --- a/Src/ILGPU.Tests/MemoryBufferOperations.tt +++ b/Src/ILGPU.Tests/MemoryBufferOperations.tt @@ -261,13 +261,15 @@ namespace ILGPU.Tests } <# foreach (var type in copyTypes) { #> - [Theory] + [SkippableTheory] <# foreach (var length in lengths) { #> [InlineData(<#= length #>)] <# } #> [KernelMethod(nameof(Copy2D_Kernel))] public void Copy2D_<#= type.Name #>(int length) { + Skip.If(length > Accelerator.MaxGroupSize.Y); + var extent = new LongIndex2D(length, length); Func> converter = c => (<#= type.Type #>)c; var expected = InitializeArray2D(length, converter); @@ -331,13 +333,17 @@ namespace ILGPU.Tests } <# foreach (var type in copyTypes) { #> - [Theory] + [SkippableTheory] <# foreach (var length in lengths) { #> [InlineData(<#= length #>)] <# } #> [KernelMethod(nameof(Copy3D_Kernel))] public void Copy3D_<#= type.Name #>(int length) { + Skip.If( + length > Accelerator.MaxGroupSize.Y || + length > Accelerator.MaxGroupSize.Z); + var extent = new LongIndex3D(length, length, length); Func> converter = c => (<#= type.Type #>)c; var expected = InitializeArray3D(length, converter); diff 
--git a/Src/ILGPU/AtomicFunctions.tt b/Src/ILGPU/AtomicFunctions.tt index 244599173..8b4e8f4f3 100644 --- a/Src/ILGPU/AtomicFunctions.tt +++ b/Src/ILGPU/AtomicFunctions.tt @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2016-2021 ILGPU Project +// Copyright (c) 2016-2022 ILGPU Project // www.ilgpu.net // // File: AtomicFunctions.tt/AtomicFunctions.cs @@ -17,8 +17,6 @@ <#@ import namespace="System.Collections.Generic" #> <#@ output extension=".cs" #> <# -var atomicSignedIntAndFloatTypes = AtomicSignedIntTypes. - Concat(AtomicFloatTypes).ToArray(); var atomicUnsignedIntAndFloatTypes = AtomicUnsignedIntTypes. Concat(AtomicFloatTypes).ToArray(); #> @@ -28,6 +26,8 @@ using System; using System.Runtime.CompilerServices; using System.Threading; +// disable: max_line_length + namespace ILGPU { namespace AtomicOperations @@ -49,7 +49,7 @@ namespace ILGPU /// The expected comparison value. /// The target value. /// The old value. - public readonly <#= type.Type #> CompareExchange( + public <#= type.Type #> CompareExchange( ref <#= type.Type #> target, <#= type.Type #> compare, <#= type.Type #> value) => @@ -61,7 +61,7 @@ namespace ILGPU /// The left operand. /// The right operand. /// True, if both operands represent the same value. 
- public readonly bool IsSame( + public bool IsSame( <#= type.Type #> left, <#= type.Type #> right) => left == right; } @@ -91,9 +91,9 @@ namespace ILGPU <# } #> <# foreach (var type in atomicUnsignedIntAndFloatTypes) { #> - struct Add<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> + readonly struct Add<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> { - public readonly <#= type.Type #> Operation( + public <#= type.Type #> Operation( <#= type.Type #> current, <#= type.Type #> value) => current + value; @@ -150,9 +150,9 @@ namespace ILGPU #region Max <# foreach (var type in AtomicNumericTypes) { #> - struct Max<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> + readonly struct Max<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> { - public readonly <#= type.Type #> Operation( + public <#= type.Type #> Operation( <#= type.Type #> current, <#= type.Type #> value) => IntrinsicMath.Max(current, value); @@ -188,9 +188,9 @@ namespace ILGPU #region Min <# foreach (var type in AtomicNumericTypes) { #> - struct Min<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> + readonly struct Min<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> { - public readonly <#= type.Type #> Operation( + public <#= type.Type #> Operation( <#= type.Type #> current, <#= type.Type #> value) => IntrinsicMath.Min(current, value); @@ -226,9 +226,9 @@ namespace ILGPU #region And <# foreach (var type in AtomicIntTypes) { #> - struct And<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> + readonly struct And<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> { - public readonly <#= type.Type #> Operation( + public <#= type.Type #> Operation( <#= type.Type #> current, <#= type.Type #> value) => current & value; @@ -265,9 +265,9 @@ namespace ILGPU #region Or <# foreach (var type in AtomicIntTypes) { #> - struct Or<#= type.Name #> : 
AtomicOperations.IAtomicOperation<<#= type.Type #>> + readonly struct Or<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> { - public readonly <#= type.Type #> Operation( + public <#= type.Type #> Operation( <#= type.Type #> current, <#= type.Type #> value) => current | value; @@ -304,9 +304,9 @@ namespace ILGPU #region Xor <# foreach (var type in AtomicIntTypes) { #> - struct Xor<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> + readonly struct Xor<#= type.Name #> : AtomicOperations.IAtomicOperation<<#= type.Type #>> { - public readonly <#= type.Type #> Operation( + public <#= type.Type #> Operation( <#= type.Type #> current, <#= type.Type #> value) => current ^ value; diff --git a/Src/ILGPU/Backends/Backend.cs b/Src/ILGPU/Backends/Backend.cs index cc0c8eb01..998473046 100644 --- a/Src/ILGPU/Backends/Backend.cs +++ b/Src/ILGPU/Backends/Backend.cs @@ -66,6 +66,11 @@ public enum BackendType /// IL, + /// + /// A Velocity backend. + /// + Velocity, + /// /// A PTX backend. /// @@ -123,7 +128,7 @@ public void OptimizedKernelContext( /// /// Represents the current kernel context in scope of a backend instance. /// - protected readonly ref struct BackendContext + protected internal readonly ref struct BackendContext { #region Nested Types @@ -696,7 +701,7 @@ public CompiledKernel Compile( entry, backendContext, specialization); - if (entryPoint.IsImplictlyGrouped && + if (entryPoint.IsImplicitlyGrouped && backendContext.SharedMemorySpecification.HasSharedMemory) { throw new NotSupportedException( diff --git a/Src/ILGPU/Backends/EntryPoints/ArgumentMapper.cs b/Src/ILGPU/Backends/EntryPoints/ArgumentMapper.cs index 56b39080d..259e33b30 100644 --- a/Src/ILGPU/Backends/EntryPoints/ArgumentMapper.cs +++ b/Src/ILGPU/Backends/EntryPoints/ArgumentMapper.cs @@ -35,11 +35,6 @@ public abstract class ArgumentMapper : ICache { #region Constants - /// - /// The internal prefix name for all runtime fields. 
- /// - private const string FieldPrefixName = "Field"; - /// /// The intrinsic kernel length parameter field name. /// @@ -520,7 +515,7 @@ void MapViewArgument( private static string GetFieldName(int index) { Debug.Assert(index >= 0, "Invalid field index"); - return FieldPrefixName + index; + return StructureType.GetFieldName(index); } #endregion diff --git a/Src/ILGPU/Backends/EntryPoints/EntryPoint.cs b/Src/ILGPU/Backends/EntryPoints/EntryPoint.cs index af34e9286..9c874d6bb 100644 --- a/Src/ILGPU/Backends/EntryPoints/EntryPoint.cs +++ b/Src/ILGPU/Backends/EntryPoints/EntryPoint.cs @@ -81,7 +81,7 @@ public EntryPoint( /// /// Returns true if the entry point represents an implicitly grouped kernel. /// - public bool IsImplictlyGrouped => !IsExplicitlyGrouped; + public bool IsImplicitlyGrouped => !IsExplicitlyGrouped; /// /// Returns the index type of the index parameter. diff --git a/Src/ILGPU/Backends/IBackendCodeGenerator.cs b/Src/ILGPU/Backends/IBackendCodeGenerator.cs index 4f3101660..248ce73de 100644 --- a/Src/ILGPU/Backends/IBackendCodeGenerator.cs +++ b/Src/ILGPU/Backends/IBackendCodeGenerator.cs @@ -317,6 +317,12 @@ public interface IBackendCodeGenerator /// The node. void GenerateCode(DebugAssertOperation debug); + /// + /// Generates code for the output writer. + /// + /// The node. 
+ void GenerateCode(WriteToOutput writeToOutput); + // Terminators /// @@ -594,7 +600,7 @@ public void Visit(DebugAssertOperation debug) => /// public void Visit(WriteToOutput writeToOutput) => - throw new InvalidCodeGenerationException(); + CodeGenerator.GenerateCode(writeToOutput); /// public void Visit(ReturnTerminator returnTerminator) => diff --git a/Src/ILGPU/Backends/IL/DefaultILBackend.cs b/Src/ILGPU/Backends/IL/DefaultILBackend.cs index 239f0656b..dd0ecedf5 100644 --- a/Src/ILGPU/Backends/IL/DefaultILBackend.cs +++ b/Src/ILGPU/Backends/IL/DefaultILBackend.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2018-2021 ILGPU Project +// Copyright (c) 2018-2023 ILGPU Project // www.ilgpu.net // // File: DefaultILBackend.cs @@ -51,7 +51,7 @@ protected override void GenerateCode( if (entryPoint.MethodInfo.IsNotCapturingLambda()) emitter.Emit(OpCodes.Ldnull); - if (entryPoint.IsImplictlyGrouped) + if (entryPoint.IsImplicitlyGrouped) { // Load index emitter.Emit(LocalOperation.Load, index); diff --git a/Src/ILGPU/Backends/IL/ILEmitter.cs b/Src/ILGPU/Backends/IL/ILEmitter.cs index 050e3ec5f..93fb92207 100644 --- a/Src/ILGPU/Backends/IL/ILEmitter.cs +++ b/Src/ILGPU/Backends/IL/ILEmitter.cs @@ -234,6 +234,12 @@ public interface IILEmitter /// The jump targets. void EmitSwitch(ILLabel[] labels); + /// + /// Emits code to write something to the console. + /// + /// The message to write. + void EmitWriteLine(string message); + /// /// Finishes the code generation process. 
/// @@ -405,6 +411,10 @@ public void EmitSwitch(params ILLabel[] labels) Generator.Emit(OpCodes.Switch, switchLabels); } + /// + public void EmitWriteLine(string message) => + Generator.EmitWriteLine(message); + /// public void Finish() { } @@ -635,6 +645,10 @@ public void EmitSwitch(params ILLabel[] labels) } } + /// + public void EmitWriteLine(string message) => + Writer.WriteLine($" => Write('{message}')"); + /// public void Finish() { @@ -717,6 +731,9 @@ public void EmitConstant(double constant) { } /// public void EmitSwitch(params ILLabel[] labels) { } + /// + public void EmitWriteLine(string message) { } + /// public void Finish() { } diff --git a/Src/ILGPU/Backends/IL/ILEmitterExtensions.cs b/Src/ILGPU/Backends/IL/ILEmitterExtensions.cs index d48d23542..3f044260e 100644 --- a/Src/ILGPU/Backends/IL/ILEmitterExtensions.cs +++ b/Src/ILGPU/Backends/IL/ILEmitterExtensions.cs @@ -9,6 +9,8 @@ // Source License. See LICENSE.txt for details. // --------------------------------------------------------------------------------------- +using ILGPU.IR.Types; +using ILGPU.IR.Values; using System; using System.Reflection; using System.Reflection.Emit; @@ -20,6 +22,8 @@ namespace ILGPU.Backends.IL /// public static class ILEmitterExtensions { + #region Static Instance + private static readonly MethodInfo GetHashCodeInfo = typeof(object).GetMethod( nameof(object.GetHashCode), BindingFlags.Public | BindingFlags.Instance); @@ -27,6 +31,189 @@ public static class ILEmitterExtensions nameof(object.Equals), BindingFlags.Public | BindingFlags.Instance); + /// + /// Caches all constant op codes. + /// + private static readonly OpCode[] ConstantOpCodes = + new OpCode[] + { + OpCodes.Ldc_I4_M1, + OpCodes.Ldc_I4_0, + OpCodes.Ldc_I4_1, + OpCodes.Ldc_I4_2, + OpCodes.Ldc_I4_3, + OpCodes.Ldc_I4_4, + OpCodes.Ldc_I4_5, + OpCodes.Ldc_I4_6, + OpCodes.Ldc_I4_7, + OpCodes.Ldc_I4_8, + }; + + /// + /// Stores the constructor of the type. 
+ /// + private static readonly ConstructorInfo HalfConstructor = + typeof(Half).GetConstructor( + BindingFlags.NonPublic | BindingFlags.CreateInstance, + null, + new Type[] { typeof(ushort) }, + null); + + #endregion + + #region Methods + + /// + /// Emits optimized code to load an integer constant. + /// + public static void LoadIntegerConstant( + this TILEmitter emitter, + int constant) + where TILEmitter : IILEmitter + { + if (constant >= -1 && constant < ConstantOpCodes.Length) + emitter.Emit(ConstantOpCodes[constant + 1]); + else + emitter.EmitConstant(constant); + } + + /// + /// Calls a compatible shuffle method. + /// + public static void LoadConstant( + this TILEmitter emitter, + PrimitiveValue value, + ref ILLocal? temporaryHalf) + where TILEmitter : IILEmitter + { + switch (value.BasicValueType) + { + case BasicValueType.Int1: + if (value.Int1Value) + emitter.Emit(OpCodes.Ldc_I4_0); + else + emitter.Emit(OpCodes.Ldc_I4_1); + break; + case BasicValueType.Int16: + emitter.LoadIntegerConstant(value.Int16Value); + break; + case BasicValueType.Int32: + emitter.LoadIntegerConstant(value.Int32Value); + break; + case BasicValueType.Int64: + emitter.EmitConstant(value.Int64Value); + break; + case BasicValueType.Float16: + // Allocate a temporary variable and invoke the half constructor + temporaryHalf ??= emitter.DeclareLocal(typeof(Half)); + emitter.Emit(LocalOperation.LoadAddress, temporaryHalf.Value); + emitter.EmitConstant(value.Float16Value.RawValue); + emitter.EmitNewObject(HalfConstructor); + emitter.Emit(LocalOperation.Load, temporaryHalf.Value); + break; + case BasicValueType.Float32: + emitter.EmitConstant(value.Float32Value); + break; + case BasicValueType.Float64: + emitter.EmitConstant(value.Float64Value); + break; + default: + throw new NotSupportedIntrinsicException( + value.BasicValueType.ToString()); + } + } + + /// + /// Loads an object from an address in memory. + /// + /// The emitter instance. + /// The manged type to load. 
+ public static void LoadObject( + this TILEmitter emitter, + Type typeToLoad) + where TILEmitter : IILEmitter => + emitter.Emit(OpCodes.Ldobj, typeToLoad); + + /// + /// Generates code that loads a null value. + /// + public static ILLocal LoadNull( + this TILEmitter emitter, + Type type) + where TILEmitter : IILEmitter + { + var nullLocal = emitter.DeclareLocal(type); + // Check whether the given value is a reference type + if (type.IsClass) + { + // Emit a null reference + emitter.Emit(OpCodes.Ldnull, type); + emitter.Emit(LocalOperation.Store, nullLocal); + } + else + { + // Emit a new local variable that is initialized with null + emitter.Emit(LocalOperation.LoadAddress, nullLocal); + emitter.Emit(OpCodes.Initobj, type); + } + return nullLocal; + } + + /// + /// Gets managed field info from a pre-defined converted structure type. + /// + /// The managed structure type. + /// The internal field index. + /// The corresponding field info. + public static FieldInfo GetFieldInfo(Type type, int fieldIndex) + { + var fieldName = StructureType.GetFieldName(fieldIndex); + return type.GetField(fieldName); + } + + /// + /// Emits code to load a field. + /// + public static void LoadField( + this TILEmitter emitter, + Type type, + int fieldIndex) + where TILEmitter : IILEmitter + { + var fieldInfo = GetFieldInfo(type, fieldIndex); + emitter.Emit(OpCodes.Ldfld, fieldInfo); + } + + /// + /// Emits code to load the address of a field. + /// + public static void LoadFieldAddress( + this TILEmitter emitter, + Type type, + int fieldIndex) + where TILEmitter : IILEmitter + { + var fieldInfo = GetFieldInfo(type, fieldIndex); + emitter.Emit(OpCodes.Ldflda, fieldInfo); + } + + /// + /// Emits code to store a value to a field. 
+ /// + public static void StoreField( + this TILEmitter emitter, + Type type, + int fieldIndex) + where TILEmitter : IILEmitter + { + var fieldInfo = GetFieldInfo(type, fieldIndex); + emitter.Emit(OpCodes.Stfld, fieldInfo); + } + + #endregion + + #region Hash Code and Equals + /// /// Generates hash code and equals functions for the given fields. /// @@ -197,5 +384,7 @@ public static MethodInfo GenerateEquals( return equals; } + + #endregion } } diff --git a/Src/ILGPU/Backends/IL/Transformations/ILAcceleratorSpecializer.cs b/Src/ILGPU/Backends/IL/Transformations/ILAcceleratorSpecializer.cs index c4922f5d1..ef22fed5c 100644 --- a/Src/ILGPU/Backends/IL/Transformations/ILAcceleratorSpecializer.cs +++ b/Src/ILGPU/Backends/IL/Transformations/ILAcceleratorSpecializer.cs @@ -20,30 +20,52 @@ namespace ILGPU.Backends.IL.Transformations /// /// The IL accelerator specializer. /// - public sealed class ILAcceleratorSpecializer : AcceleratorSpecializer + public class ILAcceleratorSpecializer : AcceleratorSpecializer { #region Instance /// /// Constructs a new IL accelerator specializer. /// + /// The current accelerator type. /// The actual pointer type to use. /// The warp size to use. /// True, if the assertions are enabled. /// True, if the IO is enabled. - public ILAcceleratorSpecializer( + internal ILAcceleratorSpecializer( + AcceleratorType acceleratorType, PrimitiveType pointerType, int warpSize, bool enableAssertions, bool enableIOOperations) : base( - AcceleratorType.CPU, + acceleratorType, warpSize, pointerType, enableAssertions, enableIOOperations) { } + /// + /// Constructs a new IL accelerator specializer. + /// + /// The actual pointer type to use. + /// The warp size to use. + /// True, if the assertions are enabled. + /// True, if the IO is enabled. 
+ public ILAcceleratorSpecializer( + PrimitiveType pointerType, + int warpSize, + bool enableAssertions, + bool enableIOOperations) + : this( + AcceleratorType.CPU, + pointerType, + warpSize, + enableAssertions, + enableIOOperations) + { } + #endregion #region Methods diff --git a/Src/ILGPU/Backends/OpenCL/CLCodeGenerator.Values.cs b/Src/ILGPU/Backends/OpenCL/CLCodeGenerator.Values.cs index fbc810a47..cd49c38f7 100644 --- a/Src/ILGPU/Backends/OpenCL/CLCodeGenerator.Values.cs +++ b/Src/ILGPU/Backends/OpenCL/CLCodeGenerator.Values.cs @@ -793,6 +793,11 @@ public void GenerateCode(DebugAssertOperation debug) => // Invalid debug node -> should have been removed debug.Assert(false); + /// + public void GenerateCode(WriteToOutput writeToOutput) => + // Invalid write node -> should have been removed + writeToOutput.Assert(false); + /// public void GenerateCode(LanguageEmitValue value) => // Ignore PTX instructions. diff --git a/Src/ILGPU/Backends/PTX/PTXArgumentMapper.cs b/Src/ILGPU/Backends/PTX/PTXArgumentMapper.cs index e22b9d294..7141d36af 100644 --- a/Src/ILGPU/Backends/PTX/PTXArgumentMapper.cs +++ b/Src/ILGPU/Backends/PTX/PTXArgumentMapper.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2018-2021 ILGPU Project +// Copyright (c) 2018-2023 ILGPU Project // www.ilgpu.net // // File: PTXArgumentMapper.cs @@ -22,7 +22,7 @@ namespace ILGPU.Backends.PTX /// Constructs mappings for PTX kernels. /// /// Members of this class are not thread safe. 
- public sealed class PTXArgumentMapper : ViewArgumentMapper + public class PTXArgumentMapper : ViewArgumentMapper { #region Nested Types @@ -48,7 +48,7 @@ public MappingHandler(EntryPoint entryPoint) public bool CanMapKernelLength(out Type indexType) { indexType = EntryPoint.KernelIndexType; - return EntryPoint.IsImplictlyGrouped; + return EntryPoint.IsImplicitlyGrouped; } public void MapKernelLength( @@ -57,7 +57,7 @@ public void MapKernelLength( where TILEmitter : struct, IILEmitter where TTarget : struct, ITarget { - Debug.Assert(EntryPoint.IsImplictlyGrouped); + Debug.Assert(EntryPoint.IsImplicitlyGrouped); var argumentSource = new ArgumentSource( kernelLengthTarget.TargetType, diff --git a/Src/ILGPU/Backends/PTX/PTXCodeGenerator.Values.cs b/Src/ILGPU/Backends/PTX/PTXCodeGenerator.Values.cs index 7972709a8..8460a22c3 100644 --- a/Src/ILGPU/Backends/PTX/PTXCodeGenerator.Values.cs +++ b/Src/ILGPU/Backends/PTX/PTXCodeGenerator.Values.cs @@ -1227,7 +1227,13 @@ public void GenerateCode(SubWarpShuffle shuffle) /// public void GenerateCode(DebugAssertOperation debug) => - Debug.Assert(false, "Invalid debug node -> should have been removed"); + // Invalid debug node -> should have been removed + debug.Assert(false); + + /// + public void GenerateCode(WriteToOutput writeToOutput) => + // Invalid write node -> should have been removed + writeToOutput.Assert(false); /// [SuppressMessage( diff --git a/Src/ILGPU/Backends/PhiBindings.cs b/Src/ILGPU/Backends/PhiBindings.cs index c55db6be5..2c9f96009 100644 --- a/Src/ILGPU/Backends/PhiBindings.cs +++ b/Src/ILGPU/Backends/PhiBindings.cs @@ -14,6 +14,7 @@ using ILGPU.IR.Analyses.ControlFlowDirection; using ILGPU.IR.Analyses.TraversalOrders; using ILGPU.IR.Values; +using ILGPU.Util; using System; using System.Collections; using System.Collections.Generic; @@ -298,6 +299,8 @@ public static PhiBindings Create( { var mapping = collection.CreateMap(new InfoProvider()); + var allPhiValues = InlineList.Create( + collection.Count); 
foreach (var block in collection) { // Resolve phis @@ -307,6 +310,9 @@ public static PhiBindings Create( // Map all phi arguments foreach (var phi in phis) { + // Remember the current phi value + allPhiValues.Add(phi); + // Allocate phi for further processing allocator.Allocate(block, phi); @@ -331,7 +337,10 @@ public static PhiBindings Create( info.IntermediatePhis.Count); } - return new PhiBindings(mapping, maxNumIntermediatePhis); + return new PhiBindings( + mapping, + maxNumIntermediatePhis, + ref allPhiValues); } #endregion @@ -339,6 +348,7 @@ public static PhiBindings Create( #region Instance private readonly BasicBlockMap phiMapping; + private readonly InlineList allPhiValues; /// /// Constructs new phi bindings. @@ -347,11 +357,17 @@ public static PhiBindings Create( /// /// The maximum number of intermediate phi values. /// + /// The list of all phi values. private PhiBindings( in BasicBlockMap mapping, - int maxNumIntermediatePhis) + int maxNumIntermediatePhis, + ref InlineList phiValues) { phiMapping = mapping; + + allPhiValues = InlineList.Empty; + phiValues.MoveTo(ref allPhiValues); + MaxNumIntermediatePhis = maxNumIntermediatePhis; } @@ -364,6 +380,11 @@ private PhiBindings( /// public int MaxNumIntermediatePhis { get; } + /// + /// Returns a span including all phi values. + /// + public ReadOnlySpan PhiValues => allPhiValues; + #endregion #region Methods @@ -375,7 +396,7 @@ private PhiBindings( /// The resolved bindings (if any) /// True, if phi bindings could be resolved. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public readonly bool TryGetBindings( + public bool TryGetBindings( BasicBlock block, out PhiBindingCollection bindings) { diff --git a/Src/ILGPU/Backends/Velocity/VelocityArgumentMapper.cs b/Src/ILGPU/Backends/Velocity/VelocityArgumentMapper.cs new file mode 100644 index 000000000..baf13f1b8 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityArgumentMapper.cs @@ -0,0 +1,34 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityArgumentMapper.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.PTX; + +namespace ILGPU.Backends.Velocity +{ + /// + /// Constructs mappings Velocity kernels. + /// + /// The current velocity backend uses the PTX argument mapper. + sealed class VelocityArgumentMapper : PTXArgumentMapper + { + #region Instance + + /// + /// Constructs a new IL argument mapper. + /// + /// The current context. + public VelocityArgumentMapper(Context context) + : base(context) + { } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityBackend.cs b/Src/ILGPU/Backends/Velocity/VelocityBackend.cs new file mode 100644 index 000000000..bd0f7343e --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityBackend.cs @@ -0,0 +1,200 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityBackend.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. 
See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Backends.IL; +using ILGPU.Backends.IL.Transformations; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Transformations; +using ILGPU.Runtime; +using ILGPU.Runtime.Velocity; + +namespace ILGPU.Backends.Velocity +{ + class VelocityBackend : + CodeGeneratorBackend< + VelocityBackend.Handler, + VelocityCodeGenerator.GeneratorArgs, + VelocityCodeGenerator, + object> + where TILEmitter : struct, IILEmitter + where TVerifier : IVelocityWarpVerifier, new() + { + #region Nested Types + + /// + /// Represents the handler delegate type of custom code-generation handlers. + /// + /// The current backend. + /// The current emitter. + /// The value to generate code for. + public delegate void Handler( + VelocityBackend backend, + in TILEmitter emitter, + Value value); + + #endregion + + #region Instance + + /// + /// Constructs a new Velocity backend. + /// + /// The context to use. + /// The supported capabilities. + /// The current warp size. + /// The argument mapper to use. 
+ public VelocityBackend( + Context context, + CapabilityContext capabilities, + int warpSize, + VelocityArgumentMapper argumentMapper) + : base( + context, + capabilities, + BackendType.Velocity, + argumentMapper) + { + WarpSize = warpSize; + Instructions = new VelocityInstructions(); + TypeGenerator = new VelocityTypeGenerator(context.RuntimeSystem, warpSize); + + InitIntrinsicProvider(); + InitializeKernelTransformers(builder => + { + var transformerBuilder = Transformer.CreateBuilder( + TransformerConfiguration.Empty); + transformerBuilder.AddBackendOptimizations( + new ILAcceleratorSpecializer( + AcceleratorType.Velocity, + PointerType, + warpSize, + Context.Properties.EnableAssertions, + Context.Properties.EnableIOOperations), + context.Properties.InliningMode, + context.Properties.OptimizationLevel); + builder.Add(transformerBuilder.ToTransformer()); + }); + } + + #endregion + + #region Properties + + /// + /// Returns the current warp size to be used. + /// + public int WarpSize { get; } + + /// + /// Returns the current instructions map. + /// + internal VelocityInstructions Instructions { get; } + + /// + /// Returns the current type generator. + /// + internal VelocityTypeGenerator TypeGenerator { get; } + + /// + /// Returns the associated . 
+ /// + public new VelocityArgumentMapper ArgumentMapper => + base.ArgumentMapper as VelocityArgumentMapper; + + #endregion + + protected override object CreateKernelBuilder( + EntryPoint entryPoint, + in BackendContext backendContext, + in KernelSpecialization specialization, + out VelocityCodeGenerator.GeneratorArgs data) + { + // Create a new generation module + var module = new VelocityGenerationModule( + Context.RuntimeSystem, + Instructions, + TypeGenerator, + backendContext, + entryPoint); + data = new VelocityCodeGenerator.GeneratorArgs( + Instructions, + module, + WarpSize, + entryPoint); + return null; + } + + protected override VelocityCodeGenerator + CreateFunctionCodeGenerator( + Method method, + Allocas allocas, + VelocityCodeGenerator.GeneratorArgs data) => + new VelocityFunctionGenerator(data, method, allocas); + + protected override VelocityCodeGenerator + CreateKernelCodeGenerator( + in AllocaKindInformation sharedAllocations, + Method method, + Allocas allocas, + VelocityCodeGenerator.GeneratorArgs data) => + new VelocityKernelFunctionGenerator( + data, + method, + allocas); + + protected override CompiledKernel CreateKernel( + EntryPoint entryPoint, + CompiledKernel.KernelInfo kernelInfo, + object builder, + VelocityCodeGenerator.GeneratorArgs data) + { + using var module = data.Module; + return new VelocityCompiledKernel( + Context, + entryPoint, + module.KernelMethod, + module.ParametersType, + module.ParametersTypeConstructor, + module.ParameterFields, + module.SharedAllocationSize); + } + } + + sealed class VelocityBackend : + VelocityBackend + where TILEmitter : struct, IILEmitter + { + #region Instance + + /// + /// Constructs a new Velocity backend. + /// + /// The context to use. + /// The supported capabilities. + /// The current warp size. + /// The argument mapper to use. 
+ public VelocityBackend( + Context context, + CapabilityContext capabilities, + int warpSize, + VelocityArgumentMapper argumentMapper) + : base( + context, + capabilities, + warpSize, + argumentMapper) + { } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.IO.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.IO.cs new file mode 100644 index 000000000..65cdb6034 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.IO.cs @@ -0,0 +1,100 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.IO.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.IR.Values; +using ILGPU.Resources; +using ILGPU.Runtime.Velocity; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public void GenerateCode(Load load) + { + var mask = blockMasks[load.BasicBlock]; + var source = GetLocal(load.Source); + Instructions.CreateLoad( + Emitter, + mask, + source, + load.Type, + TypeGenerator); + Store(load); + } + + /// + /// Generates code to store primitive values and pointers from memory while using + /// the given mask to differentiate between active and inactive lanes. 
+ /// + private void GenerateNonStructureStore(TypeNode typeNode) + { + var basicValueType = typeNode switch + { + PrimitiveType primitiveType => primitiveType.BasicValueType, + PaddingType paddingType => paddingType.BasicValueType, + PointerType _ => BasicValueType.Int64, + _ => throw typeNode.GetNotSupportedException( + ErrorMessages.NotSupportedType, typeNode) + }; + + Emitter.EmitCall(Instructions.GetIOOperation( + basicValueType, + WarpSize).Store); + } + + /// + public void GenerateCode(Store store) + { + var mask = GetBlockMask(store.BasicBlock); + var target = GetLocal(store.Target); + var value = GetLocal(store.Value); + var type = store.Value.Type; + + if (type is StructureType structureType) + { + // Iterate over all fields and store them + var vectorizedType = GetVectorizedType(type); + foreach (var (fieldType, fieldAccess) in structureType) + { + // Load the current field value + Emitter.Emit(LocalOperation.LoadAddress, value); + Emitter.LoadField(vectorizedType, fieldAccess.Index); + + // Adjust the target offset + long fieldOffset = structureType.GetOffset(fieldAccess); + Emitter.EmitConstant(fieldOffset); + Emitter.EmitCall(Instructions.GetConstValueOperation64( + VelocityWarpOperationMode.I)); + Emitter.Emit(LocalOperation.Load, target); + Emitter.EmitCall(Instructions.GetBinaryOperation64( + BinaryArithmeticKind.Add, + VelocityWarpOperationMode.U)); + + // Store the field into memory + Emitter.Emit(LocalOperation.Load, mask); + GenerateNonStructureStore(fieldType); + } + } + else + { + Emitter.Emit(LocalOperation.Load, value); + Emitter.Emit(LocalOperation.Load, target); + Emitter.Emit(LocalOperation.Load, mask); + + GenerateNonStructureStore(type); + } + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Terminators.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Terminators.cs new file mode 100644 index 000000000..cdbda7232 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Terminators.cs @@ -0,0 
+1,144 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.Terminators.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR.Values; +using ILGPU.Runtime.Velocity; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public abstract void GenerateCode(ReturnTerminator returnTerminator); + + /// + public void GenerateCode(UnconditionalBranch branch) + { + // Create a branch if required + var branchBuilder = CreateBranchBuilder(branch.BasicBlock); + branchBuilder.RecordBranchTarget(branch.Target, () => + { + // Pass the current mask + Emitter.Emit(LocalOperation.Load, GetBlockMask(branch.BasicBlock)); + }); + branchBuilder.EmitBranch(); + } + + /// + public void GenerateCode(IfBranch branch) + { + // Get current mask + var currentMask = GetBlockMask(branch.BasicBlock); + + // Load condition and convert it into a lane mask + Load(branch.Condition); + Emitter.EmitCall(Instructions.ToMaskOperation32); + + var tempMask = Emitter.DeclareLocal(typeof(VelocityLaneMask)); + Emitter.Emit(LocalOperation.Store, tempMask); + + // Create a new branch builder + var branchBuilder = CreateBranchBuilder(branch.BasicBlock); + + // Adjust the true mask + branchBuilder.RecordBranchTarget(branch.TrueTarget, () => + { + // Intersect with the current mask + Emitter.Emit(LocalOperation.Load, tempMask); + IntersectWithMask(currentMask); + }); + + // Intersect negated with the current mask + branchBuilder.RecordBranchTarget(branch.FalseTarget, () => + { + // Adjust the current mask + Emitter.Emit(LocalOperation.Load, tempMask); + Emitter.EmitCall(Instructions.NegateLaneMask); + 
IntersectWithMask(currentMask); + }); + + // Emit branch (if required) + branchBuilder.EmitBranch(); + } + + /// + public void GenerateCode(SwitchBranch branch) + { + // Get current mask + var currentMask = GetBlockMask(branch.BasicBlock); + + // Create a new branch builder + var branchBuilder = CreateBranchBuilder(branch.BasicBlock); + + // Check lower bounds: case < 0 + Load(branch.Condition); + Emitter.EmitConstant(0); + Emitter.EmitCall(Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.GetCompareOperation32( + CompareKind.LessThan, + VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.ToMaskOperation32); + + // Check upper bounds: case >= num cases + Load(branch.Condition); + Emitter.EmitConstant(branch.NumCasesWithoutDefault); + Emitter.EmitCall(Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.GetCompareOperation32( + CompareKind.GreaterEqual, + VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.ToMaskOperation32); + + // Store unified branch mask + Emitter.EmitCall(Instructions.UnifyLanesMask); + IntersectWithMask(currentMask); + + var outOfBoundsMask = Emitter.DeclareLocal(typeof(VelocityLaneMask)); + Emitter.Emit(LocalOperation.Store, outOfBoundsMask); + + // Record branch to the default block + branchBuilder.RecordBranchTarget(branch.DefaultBlock, () => + { + Emitter.Emit(LocalOperation.Load, outOfBoundsMask); + }); + + // Adjust masks for each target + for (int i = 0; i < branch.NumCasesWithoutDefault; ++i) + { + // Check whether the conditional selector is equal to the current case + Load(branch.Condition); + Emitter.EmitConstant(i); + Emitter.EmitCall(Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.GetCompareOperation32( + CompareKind.Equal, + VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.ToMaskOperation32); + + // Store the current mask + var 
currentCaseMask = Emitter.DeclareLocal(typeof(VelocityLaneMask)); + IntersectWithMask(currentMask); + Emitter.Emit(LocalOperation.Store, currentCaseMask); + + // Record branch + branchBuilder.RecordBranchTarget(branch.GetCaseTarget(i), () => + { + Emitter.Emit(LocalOperation.Load, currentCaseMask); + }); + } + + // Emit branch if necessary + branchBuilder.EmitBranch(); + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs new file mode 100644 index 000000000..57839778c --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs @@ -0,0 +1,219 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.Threads.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR.Values; +using ILGPU.Runtime.Velocity; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public void GenerateCode(GenericAtomic atomic) + { + // Load the target and the value + Load(atomic.Target); + Load(atomic.Value); + Emitter.Emit(LocalOperation.Load, GetBlockMask(atomic.BasicBlock)); + + // Get the appropriate atomic operation + var warpMode = atomic.ArithmeticBasicValueType.GetWarpMode(); + var operation = atomic.IsTreatedAs32Bit() + ? 
Instructions.GetAtomicOperation32(atomic.Kind, warpMode) + : Instructions.GetAtomicOperation64(atomic.Kind, warpMode); + + // Call the operation implementation + Emitter.EmitCall(operation); + + // Check whether we actually need the result + if (!atomic.Uses.HasAny) + Emitter.Emit(OpCodes.Pop); + else + Store(atomic); + } + + /// + public void GenerateCode(AtomicCAS atomicCAS) + { + // Load the target, the compare value and the value + Load(atomicCAS.Target); + Load(atomicCAS.Value); + Load(atomicCAS.CompareValue); + Emitter.Emit(LocalOperation.Load, GetBlockMask(atomicCAS.BasicBlock)); + + // Get the appropriate atomic operation + var operation = atomicCAS.IsTreatedAs32Bit() + ? Instructions.AtomicCompareExchangeOperation32 + : Instructions.AtomicCompareExchangeOperation64; + + // Call the operation implementation + Emitter.EmitCall(operation); + Store(atomicCAS); + } + + /// + public void GenerateCode(GridIndexValue value) + { + switch (value.Dimension) + { + case DeviceConstantDimension3D.X: + Emitter.EmitCall(VelocityMultiprocessor.GetCurrentGridIdxMethodInfo); + break; + case DeviceConstantDimension3D.Y: + case DeviceConstantDimension3D.Z: + Emitter.LoadIntegerConstant(0); + Emitter.EmitCall(Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.I)); + break; + } + Store(value); + } + + /// + public void GenerateCode(GroupIndexValue value) + { + switch (value.Dimension) + { + case DeviceConstantDimension3D.X: + Emitter.EmitCall(Instructions.LaneIndexVectorOperation32); + break; + case DeviceConstantDimension3D.Y: + case DeviceConstantDimension3D.Z: + Emitter.LoadIntegerConstant(0); + ToWarpValue(is32Bit: true, VelocityWarpOperationMode.I); + break; + } + Store(value); + } + + /// + public void GenerateCode(GridDimensionValue value) + { + switch (value.Dimension) + { + case DeviceConstantDimension3D.X: + Emitter.EmitCall(VelocityMultiprocessor.GetCurrentGridDimMethodInfo); + break; + case DeviceConstantDimension3D.Y: + case 
DeviceConstantDimension3D.Z: + Emitter.LoadIntegerConstant(1); + ToWarpValue(is32Bit: true, VelocityWarpOperationMode.I); + break; + } + Store(value); + } + + /// + public void GenerateCode(GroupDimensionValue value) + { + switch (value.Dimension) + { + case DeviceConstantDimension3D.X: + Emitter.EmitCall(VelocityMultiprocessor.GetCurrentGroupDimMethodInfo); + break; + case DeviceConstantDimension3D.Y: + case DeviceConstantDimension3D.Z: + Emitter.LoadIntegerConstant(1); + ToWarpValue(is32Bit: true, VelocityWarpOperationMode.I); + break; + } + Store(value); + } + + /// + public void GenerateCode(WarpSizeValue value) + { + Emitter.EmitConstant(WarpSize); + Emitter.EmitCall( + Instructions.GetConstValueOperation32(VelocityWarpOperationMode.I)); + Store(value); + } + + /// + public void GenerateCode(LaneIdxValue value) + { + Emitter.EmitCall(Instructions.LaneIndexVectorOperation32); + Store(value); + } + + /// + public void GenerateCode(PredicateBarrier barrier) + { + // Load predicate + Load(barrier.Predicate); + Emitter.Emit(LocalOperation.Load, GetBlockMask(barrier.BasicBlock)); + + // Load and call predicate operation + var operation = + Instructions.GetGroupPredicateBarrierOperation32(barrier.Kind); + Emitter.EmitCall(operation); + Store(barrier); + } + + /// + public void GenerateCode(Barrier barrier) => + Instructions.CallMemoryBarrier(Emitter); + + /// + public void GenerateCode(Broadcast broadcast) + { + // Load the source variable + Load(broadcast.Variable); + Load(broadcast.Origin); + + // Get the appropriate broadcast operation + var operation = broadcast.Kind == BroadcastKind.WarpLevel + ? 
Instructions.GetWarpBroadcastOperation32(VerifierType) + : Instructions.GetGroupBroadcastOperation32(VerifierType); + + // Emit the warp or group operation + Emitter.EmitCall(operation); + Store(broadcast); + } + + /// + public void GenerateCode(WarpShuffle shuffle) + { + // Load the source variable and the origin + Load(shuffle.Variable); + Load(shuffle.Origin); + + // Get the appropriate broadcast operation + var operation = Instructions.GetWarpShuffleOperation32( + shuffle.Kind, + VerifierType); + + // Emit the shuffle operation + Emitter.EmitCall(operation); + Store(shuffle); + } + + /// + public void GenerateCode(SubWarpShuffle shuffle) + { + // Load the source variable, the origin, and the sub-warp width + Load(shuffle.Variable); + Load(shuffle.Origin); + Load(shuffle.Width); + + // Get the appropriate broadcast operation + var operation = Instructions.GetSubWarpShuffleOperation32( + shuffle.Kind, + VerifierType); + + // Emit the shuffle operation + Emitter.EmitCall(operation); + Store(shuffle); + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs new file mode 100644 index 000000000..658069265 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs @@ -0,0 +1,556 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.Values.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. 
See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.IR.Values; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public void GenerateCode(MethodCall methodCall) + { + // Load the current execution mask + Emitter.Emit(LocalOperation.Load, blockMasks[methodCall.BasicBlock]); + + // Load all arguments onto the evaluation stack + foreach (Value arg in methodCall) + Load(arg); + + // Call the module method + var method = Module[methodCall.Target]; + Emitter.EmitCall(method); + + if (!methodCall.Target.IsVoid) + Store(methodCall); + } + + /// + public void GenerateCode(Parameter parameter) + { + // Parameters have been bound in the beginning and do not need to be + // processed here + } + + /// + public void GenerateCode(PhiValue phiValue) + { + // Phi values need to be allocated in the beginning and do not need to be + // handled here + } + + /// + public void GenerateCode(UnaryArithmeticValue value) + { + Load(value.Value); + + // Determine the current warp mode and its bitness + var warpMode = value.GetWarpMode(); + var method = value.IsTreatedAs32Bit() + ? 
Instructions.GetUnaryOperation32(value.Kind, warpMode) + : Instructions.GetUnaryOperation64(value.Kind, warpMode); + Emitter.EmitCall(method); + Store(value); + } + + /// + public void GenerateCode(BinaryArithmeticValue value) + { + Load(value.Left); + Load(value.Right); + + // Check for operation types + switch (value.Kind) + { + case BinaryArithmeticKind.Shl: + case BinaryArithmeticKind.Shr: + // We need to convert the rhs operations to int64 + if (!value.IsTreatedAs32Bit()) + { + Emitter.EmitCall(Instructions.GetConvertWidenOperation32( + VelocityWarpOperationMode.I)); + } + break; + } + + // Determine the current warp mode and its bitness + var warpMode = value.GetWarpMode(); + var method = value.IsTreatedAs32Bit() + ? Instructions.GetBinaryOperation32(value.Kind, warpMode) + : Instructions.GetBinaryOperation64(value.Kind, warpMode); + Emitter.EmitCall(method); + Store(value); + } + + /// + public void GenerateCode(TernaryArithmeticValue value) + { + Load(value.First); + Load(value.Second); + Load(value.Third); + + // Determine the current warp mode and its bitness + var warpMode = value.GetWarpMode(); + var method = value.IsTreatedAs32Bit() + ? Instructions.GetTernaryOperation32(value.Kind, warpMode) + : Instructions.GetTernaryOperation64(value.Kind, warpMode); + Emitter.EmitCall(method); + Store(value); + } + + /// + public void GenerateCode(CompareValue value) + { + Load(value.Left); + Load(value.Right); + + // Determine the current warp mode and its bitness + var warpMode = value.GetWarpMode(); + var method = value.CompareType.GetBasicValueType().IsTreatedAs32Bit() + ? 
Instructions.GetCompareOperation32(value.Kind, warpMode) + : Instructions.GetCompareOperation64(value.Kind, warpMode); + Emitter.EmitCall(method); + Store(value); + } + + /// + public void GenerateCode(ConvertValue value) + { + // Check to which value we have to convert the current value + var sourceMode = value.SourceType.GetWarpMode(); + var targetMode = value.TargetType.GetWarpMode(); + + // Load source + Load(value.Value); + + // Check whether have to expand or to narrow the current values on the stack + var sourceBasicValueType = value.SourceType.GetBasicValueType(); + bool sourceIs32Bit = sourceBasicValueType.IsTreatedAs32Bit(); + bool targetIs32Bit = value.IsTreatedAs32Bit(); + + if (sourceIs32Bit) + { + // The source value lives in the 32bit warp world + + // Check whether we have to widen first + if (targetIs32Bit) + { + // Use the local conversion functionality + Emitter.EmitCall(Instructions.GetSoftwareConvertOperation32( + value.SourceType, + value.TargetType, + WarpSize)); + } + else + { + // Use the local conversion mechanism in 32bit mode + ArithmeticBasicValueType targetType32; + if (sourceBasicValueType.IsFloat()) + { + // Ensure 32bit float compatibility + targetType32 = ArithmeticBasicValueType.Float32; + } + else + { + // Extent types to 32bit only while preserving the sign + targetType32 = value.TargetType.ForceTo32Bit(); + if (targetType32.IsFloat()) + { + targetType32 = value.IsSourceUnsigned + ? 
ArithmeticBasicValueType.UInt32 + : ArithmeticBasicValueType.Int32; + } + } + Emitter.EmitCall(Instructions.GetSoftwareConvertOperation32( + value.SourceType, + targetType32, + WarpSize)); + + // Widen first + Emitter.EmitCall(Instructions.GetConvertWidenOperation32( + sourceMode)); + + // Ensure valid data types in 64bit world + Emitter.EmitCall(Instructions.GetAcceleratedConvertOperation64( + sourceMode, + targetMode)); + } + } + else + { + // The source value lives in the 64bit warp world + + // Convert the values according to the 64bit type information + Emitter.EmitCall(Instructions.GetAcceleratedConvertOperation64( + sourceMode, + targetMode)); + + // We have to enter the 32bit world + if (targetIs32Bit) + { + // Narrow to 32bit world + Emitter.EmitCall(Instructions.GetConvertNarrowOperation64( + targetMode)); + + // Convert the remaining parts + Emitter.EmitCall(Instructions.GetSoftwareConvertOperation32( + value.TargetType.ForceTo32Bit(), + value.TargetType, + WarpSize)); + } + } + + Store(value); + } + + /// + public void GenerateCode(FloatAsIntCast value) + { + // Do nothing as this does not change any register contents + var valueLocal = GetLocal(value.Value); + Alias(value, valueLocal); + } + + /// + public void GenerateCode(IntAsFloatCast value) + { + // Do nothing as this does not change any register contents + var valueLocal = GetLocal(value.Value); + Alias(value, valueLocal); + } + + /// + /// Emits a new merge operation working on arbitrary values. + /// + private ILLocal? 
EmitMerge( + Value value, + Func loadLeft, + Func loadRight, + Action loadCondition, + Action merge32, + Action merge64, + Func getTempLocal) + { + // Merges values based on predicate masks + void MergeLocal(BasicValueType basicValueType) + { + loadCondition(); + if (basicValueType.IsTreatedAs32Bit()) + { + // Merge 32bit values + merge32(); + } + else + { + // Expand condition to 64bit vector if required + Emitter.EmitCall(Instructions.GetConvertWidenOperation32( + VelocityWarpOperationMode.I)); + + // Merge 64bit values + merge64(); + } + } + + // Merge the actual values from all lanes + if (value.Type is StructureType structureType) + { + var targetType = TypeGenerator.GetVectorizedType(structureType); + var target = getTempLocal(targetType); + + // Iterate over all field elements + foreach (var (fieldType, fieldAccess) in structureType) + { + // Load arguments + Emitter.Emit(LocalOperation.LoadAddress, target); + + var leftType = loadLeft(); + Emitter.LoadField(leftType, fieldAccess.Index); + var rightType = loadRight(); + Emitter.LoadField(rightType, fieldAccess.Index); + + // Merge + MergeLocal(fieldType.BasicValueType); + + // Store field + Emitter.StoreField(targetType, fieldAccess.Index); + } + + return target; + } + else + { + // A direct merge is possible + loadLeft(); + loadRight(); + MergeLocal(value.BasicValueType); + return null; + } + } + + /// + public void GenerateCode(Predicate predicate) + { + // Load true and false values in reverse order to match API spec + var falseLocal = GetLocal(predicate.FalseValue); + var trueLocal = GetLocal(predicate.TrueValue); + + // Emit the merge + var local = EmitMerge(predicate, + () => + { + Emitter.Emit(LocalOperation.Load, falseLocal); + return falseLocal.VariableType; + }, + () => + { + Emitter.Emit(LocalOperation.Load, trueLocal); + return trueLocal.VariableType; + }, + () => Load(predicate.Condition), + () => Emitter.EmitCall(Instructions.MergeOperation32), + () => 
Emitter.EmitCall(Instructions.MergeOperation64), + type => Emitter.DeclareLocal(type)); + + // Bind value result + if (local.HasValue) + Alias(predicate, local.Value); + else + Store(predicate); + } + + /// + public void GenerateCode(Alloca alloca) + { + // All allocations have already been processed in the beginning. + } + + /// + public void GenerateCode(MemoryBarrier barrier) => + Instructions.CallMemoryBarrier(Emitter); + + /// + public void GenerateCode(PrimitiveValue value) + { + switch (value.BasicValueType) + { + case BasicValueType.Int1: + Emitter.Emit(value.Int1Value ? OpCodes.Ldc_I4_M1 : OpCodes.Ldc_I4_0); + Emitter.EmitCall( + Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.U)); + break; + case BasicValueType.Int8: + Emitter.LoadIntegerConstant(value.Int8Value); + Emitter.EmitCall( + Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.U)); + break; + case BasicValueType.Int16: + Emitter.LoadIntegerConstant(value.Int16Value); + Emitter.EmitCall( + Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.U)); + break; + case BasicValueType.Int32: + Emitter.LoadIntegerConstant(value.Int32Value); + Emitter.EmitCall( + Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.U)); + break; + case BasicValueType.Int64: + Emitter.EmitConstant(value.Int64Value); + Emitter.EmitCall( + Instructions.GetConstValueOperation64( + VelocityWarpOperationMode.U)); + break; + case BasicValueType.Float16: + Emitter.EmitConstant(value.Float16Value.RawValue); + Emitter.EmitCall( + Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.U)); + break; + case BasicValueType.Float32: + Emitter.EmitConstant(value.Float32Value); + Emitter.EmitCall( + Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.F)); + break; + case BasicValueType.Float64: + Emitter.EmitConstant(value.Float64Value); + Emitter.EmitCall( + Instructions.GetConstValueOperation64( + VelocityWarpOperationMode.F)); + break; + default: + 
throw new NotSupportedIntrinsicException( + value.BasicValueType.ToString()); + } + Store(value); + } + + /// + public void GenerateCode(StringValue value) + { + Emitter.EmitConstant(value.String); + Store(value); + } + + /// + public void GenerateCode(NullValue value) + { + // Check whether we have already loaded a null value + if (!nullLocals.TryGetValue(value.Type, out var local)) + { + // If not... load the value + local = Emitter.LoadNull(GetVectorizedType(value.Type)); + nullLocals.Add(value.Type, local); + } + Alias(value, local); + } + + /// + public void GenerateCode(StructureValue value) + { + // Generate a local variable that contains the type + var managedType = GetVectorizedType(value.Type); + var local = Emitter.LoadNull(managedType); + + // Insert all fields + for (int i = 0, e = value.Count; i < e; ++i) + { + Emitter.Emit(LocalOperation.LoadAddress, local); + Load(value[i]); + Emitter.StoreField(managedType, i); + } + + Alias(value, local); + } + + /// + public void GenerateCode(GetField value) + { + // Check the result type of the operation + if (!value.FieldSpan.HasSpan) + { + // Extract the primitive value from the structure + LoadRefAndType(value.ObjectValue, out var objectType); + Emitter.LoadField(objectType, value.FieldSpan.Index); + + // Store field value + Store(value); + } + else + { + // The result is a new structure value + var newObjectType = GetVectorizedType(value.Type); + var local = Emitter.DeclareLocal(newObjectType); + // Extract all fields from the structure + int span = value.FieldSpan.Span; + for (int i = 0; i < span; ++i) + { + Emitter.Emit(LocalOperation.LoadAddress, local); + + LoadRefAndType(value.ObjectValue, out var objectType); + Emitter.LoadField( + objectType, + i + value.FieldSpan.Index); + + Emitter.StoreField(newObjectType, i); + } + + // Bind the current value + Alias(value, local); + } + } + + /// + public void GenerateCode(SetField value) + { + var mask = GetBlockMask(value.BasicBlock); + + // The result 
operation will be another structure instance + LoadVectorized(value.ObjectValue, out var type); + // var vectorized = GetVectorizedType(value.ObjectValue.Type); + var local = Emitter.DeclareLocal(type); + + // Copy object instance + Emitter.Emit(LocalOperation.Store, local); + + var structureType = value.ObjectValue.Type.As(value); + for (int i = 0, e = value.FieldSpan.Span; i < e; ++i) + { + int fieldOffset = value.FieldSpan.Index + i; + + // Load the source value + Emitter.Emit(LocalOperation.LoadAddress, local); + Emitter.Emit(OpCodes.Dup); + Emitter.LoadField(type, fieldOffset); + + // Load the target value to store + if (e > 1) + { + LoadRef(value.Value); + Emitter.LoadField(type, i); + } + else + { + // Load the whole value + Load(value.Value); + } + + // Load the mask + Emitter.Emit(LocalOperation.Load, mask); + + // Merge data + var mergeMode = structureType[i].BasicValueType.IsTreatedAs32Bit() + ? Instructions.MergeWithMaskOperation32 + : Instructions.MergeWithMaskOperation64; + Emitter.EmitCall(mergeMode); + + // Store merged value + Emitter.StoreField(type, fieldOffset); + } + + Alias(value, local); + } + + /// + public void GenerateCode(DebugAssertOperation debug) + { + // If the mask is active emit a failed debug assertion + var blockMask = GetBlockMask(debug.BasicBlock); + Emitter.Emit(LocalOperation.Load, blockMask); + + // Load the debug condition + Load(debug.Condition); + + // Load the debug error message + string errorMessage = debug.Message.Resolve() is StringValue stringValue + ? 
debug.Location.FormatErrorMessage(stringValue.String) + : "Assertion failed"; + Emitter.EmitConstant(errorMessage); + + // Call our assertion method + Instructions.CallAssert(Emitter); + } + + /// + public void GenerateCode(WriteToOutput output) => + throw new NotSupportedIntrinsicException(); + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Views.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Views.cs new file mode 100644 index 000000000..fdf92cea2 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Views.cs @@ -0,0 +1,132 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.Views.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.IR.Values; +using ILGPU.Resources; +using ILGPU.Runtime.Velocity; +using System.Runtime.InteropServices; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public void GenerateCode(IntAsPointerCast cast) + { + // Load the integer information + Load(cast.Value); + + // Check whether we have to convert it to a 64bit value + if (cast.SourceType.BasicValueType.IsTreatedAs32Bit()) + { + // Convert it to a 64bit pointer + Emitter.EmitCall(Instructions.GetConvertWidenOperation32( + VelocityWarpOperationMode.U)); + } + + // The integer can now be interpreted as pointer + Store(cast); + } + + /// + public void GenerateCode(PointerAsIntCast cast) => + Alias(cast, GetLocal(cast.Value)); + + /// + public void GenerateCode(PointerCast cast) => + Alias(cast, GetLocal(cast.Value)); + + /// + public void GenerateCode(AddressSpaceCast value) => + Alias(value, GetLocal(value.Value)); + + /// 
+ public void GenerateCode(LoadElementAddress value) + { + // Load the raw element offset to multiply + Load(value.Offset); + + // Widen the source address if necessary + if (value.Is32BitAccess) + { + Emitter.EmitCall( + Instructions.GetConvertWidenOperation32(VelocityWarpOperationMode.I)); + } + + // Load the source type information and the element size to multiply + var sourceType = value.Source.Type.As(value); + Emitter.EmitConstant((long)sourceType.ElementType.Size); + ToWarpValue(is32Bit: false, VelocityWarpOperationMode.U); + + // Load the source vector to add + Load(value.Source); + + // Perform the actual offset computation + var madOperation = Instructions.GetTernaryOperation64( + TernaryArithmeticKind.MultiplyAdd, + VelocityWarpOperationMode.U); + Emitter.EmitCall(madOperation); + Store(value); + } + + /// + public void GenerateCode(LoadFieldAddress value) + { + // Compute the actual field offset based on the vectorized type + long offset = value.StructureType.GetOffset(value.FieldSpan.Access.Index); + + // If this results in an actual byte offset... 
add it + if (offset != 0L) + { + // Load the source addresses + Load(value.Source); + + // Load constant + Emitter.EmitConstant(offset); + ToWarpValue(is32Bit: false, VelocityWarpOperationMode.U); + + // Adjust address + Emitter.EmitCall(Instructions.GetBinaryOperation64( + BinaryArithmeticKind.Add, + VelocityWarpOperationMode.U)); + + // Store the newly computed offset + Store(value); + } + else + { + Alias(value, GetLocal(value.Source)); + } + } + + /// + public void GenerateCode(AlignTo value) => + // Not implemented at the moment as we do not make use of bulk-vector loads + // and stores at the moment + Alias(value, GetLocal(value.Source)); + + /// + public void GenerateCode(AsAligned value) => + // Not implemented at the moment as we do not make use of bulk-vector loads + // and stores at the moment + Alias(value, GetLocal(value.Source)); + + /// + public void GenerateCode(DynamicMemoryLengthValue value) => + throw value.GetNotSupportedException(ErrorMessages + .NotSupportedDynamicSharedMemoryAllocations); + + /// + public void GenerateCode(LanguageEmitValue value) { } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs new file mode 100644 index 000000000..6419b9544 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs @@ -0,0 +1,764 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. 
See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Analyses.ControlFlowDirection; +using ILGPU.IR.Analyses.TraversalOrders; +using ILGPU.IR.Types; +using ILGPU.IR.Values; +using ILGPU.Resources; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Reflection; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + /// + /// Generates vectorized MSIL instructions out of IR values. + /// + /// The IL emitter type. + /// The view generator type. + /// The code needs to be prepared for this code generator. + abstract partial class VelocityCodeGenerator : + IBackendCodeGenerator + where TILEmitter : struct, IILEmitter + where TVerifier : IVelocityWarpVerifier, new() + { + #region Constants + + public const int MaskParameterIndex = 0; + + #endregion + + #region Nested Types + + /// + /// Represents a specialized phi binding allocator. + /// + private readonly struct PhiBindingAllocator : IPhiBindingAllocator + { + /// + /// Constructs a new phi binding allocator. + /// + /// The parent code generator. + public PhiBindingAllocator( + VelocityCodeGenerator parent) + { + Parent = parent; + } + + /// + /// Returns the parent code generator. + /// + public VelocityCodeGenerator Parent { get; } + + /// + /// Does not perform any operation. + /// + public void Process(BasicBlock block, Phis phis) { } + + /// + /// Allocates a new phi value in the dominator block. + /// + public void Allocate(BasicBlock block, PhiValue phiValue) => + Parent.Declare(phiValue); + } + + /// + /// Helps building branches by taking branch targets and masks into account. 
+ /// + private struct BranchBuilder + { + private readonly VelocityCodeGenerator codeGenerator; + + private readonly bool isBackEdgeBlock; + private InlineList<(BasicBlock BasicBlock, Action PassMask)> headers; + + public BranchBuilder( + VelocityCodeGenerator parent, + BasicBlock currentBlock) + { + codeGenerator = parent; + isBackEdgeBlock = parent.backEdges.Contains(currentBlock); + headers = InlineList<(BasicBlock, Action)>.Create(2); + } + + /// + /// Returns the current emitter. + /// + public TILEmitter Emitter => codeGenerator.Emitter; + + /// + /// Returns the header map. + /// + public BasicBlockMap< + BasicBlockCollection> Headers => + codeGenerator.headersBodyMap; + + /// + /// Returns the parent dominators. + /// + public Dominators Dominators => codeGenerator.dominators; + + /// + /// Returns the parent instructions + /// + public VelocityInstructions Instructions => codeGenerator.Instructions; + public bool NeedsBranch => headers.Count > 0; + + /// + /// Records a branch target. + /// + /// The target block to branch to. + /// The pass mask action. + public void RecordBranchTarget(BasicBlock target, Action passMask) + { + // Check for a jump backwards + if (isBackEdgeBlock && Headers.Contains(target)) + { + // We need to intersect with the mask of the target block + headers.Add((target, passMask)); + } + else + { + // Pass the current mask + passMask(); + + // We are branching forwards and need to pass the mask while unifying + // all lanes + codeGenerator.UnifyWithMaskOf(target); + } + } + + /// + /// Emits a branch if required. 
+ /// + public void EmitBranch() + { + // If we don't need a branch, we can safely return here + if (!NeedsBranch) + return; + + // Intersect with all target masks + foreach (var (target, passMask) in headers) + { + passMask(); + codeGenerator.IntersectWithMaskOf(target); + } + + // TODO: Find the "top-most" header + // TODO: support for multiple header jumps + headers[0].BasicBlock.Assert(headers.Count == 1); + + // Disable all lanes in all loop bodies + foreach (var (header, _) in headers) + { + foreach (var block in Headers[header]) + { + if (block == header) + continue; + + var blockMask = codeGenerator.GetBlockMask(block); + codeGenerator.DisableAllLanes(blockMask); + } + } + + // Check for any active lane and jump in the case a lane requires + // further processing + var (targetHeader, _) = headers[0]; + Emitter.Emit( + LocalOperation.Load, + codeGenerator.GetBlockMask(targetHeader)); + Emitter.EmitCall(Instructions.MaskHasActiveLanes); + + // Branch to the actual loop header + var blockLabel = codeGenerator.blockLookup[targetHeader]; + Emitter.Emit(OpCodes.Brtrue, blockLabel); + } + } + + public readonly struct GeneratorArgs + { + internal GeneratorArgs( + VelocityInstructions instructions, + VelocityGenerationModule module, + int warpSize, + EntryPoint entryPoint) + { + Instructions = instructions; + Module = module; + WarpSize = warpSize; + EntryPoint = entryPoint; + } + + /// + /// Returns the current instruction instance. + /// + public VelocityInstructions Instructions { get; } + + /// + /// Returns the current generation module. + /// + public VelocityGenerationModule Module { get; } + + /// + /// Returns the current warp size to be used. + /// + public int WarpSize { get; } + + /// + /// Returns the current entry point. + /// + public EntryPoint EntryPoint { get; } + } + + #endregion + + #region Static + + /// + /// The current verifier type. 
+ /// + private static readonly Type VerifierType = typeof(TVerifier); + + #endregion + + #region Instance + + /// + /// Maps blocks to their input masks. + /// + private readonly BasicBlockMap blockMasks; + + /// + /// Maps blocks to labels. + /// + private readonly Dictionary blockLookup = + new Dictionary(); + + /// + /// The set of all back edge source blocks. + /// + private readonly BasicBlockSet backEdges; + + /// + /// The set of all loop headers. + /// + private readonly BasicBlockMap< + BasicBlockCollection> headersBodyMap; + + /// + /// The current dominators. + /// + private readonly Dominators dominators; + + private readonly Dictionary locals = + new Dictionary(); + + /// + /// Temporary locals for initialization. + /// + private readonly Dictionary nullLocals = + new Dictionary(); + + /// + /// Constructs a new IL code generator. + /// + /// The generator args to use. + /// The current method to generate code for. + /// All allocations of the current method. + protected VelocityCodeGenerator( + in GeneratorArgs args, + Method method, + Allocas allocas) + { + Instructions = args.Instructions; + Module = args.Module; + WarpSize = args.WarpSize; + + // Creates a new IL emitter + Method = method; + Allocas = allocas; + Emitter = (TILEmitter)Activator.CreateInstance( + typeof(TILEmitter), + Module.GetILGenerator(method)); + + blockMasks = method.Blocks.CreateMap(); + headersBodyMap = method.Blocks.CreateMap< + BasicBlockCollection>(); + backEdges = method.Blocks.CreateSet(); + + // Determine CFG, dominators and loops + var cfg = method.Blocks.CreateCFG(); + dominators = cfg.CreateDominators(); + var loops = cfg.CreateLoops(); + foreach (var loop in loops) + { + // Determine all body blocks + var bodyBlocks = loop.ComputeOrderedBlocks(0); + + // Register all loop headers + foreach (var header in loop.Headers) + headersBodyMap.Add(header, bodyBlocks); + + // Register all back edges + foreach (var backEdge in loop.BackEdges) + backEdges.Add(backEdge); + } + 
+ // Allocate local masks and initialize all of them + foreach (var block in method.Blocks) + { + // Create a local variable to store the entry mask for this block + var blockMask = Emitter.DeclareLocal(typeof(VelocityLaneMask)); + blockMasks[block] = blockMask; + + // Declare a label for each block + blockLookup[block] = Emitter.DeclareLabel(); + } + } + + #endregion + + #region Properties + + /// + /// Returns the current instruction instance. + /// + public VelocityInstructions Instructions { get; } + + /// + /// Returns the current generation module. + /// + public VelocityGenerationModule Module { get; } + + /// + /// Returns the current warp size to be used. + /// + public int WarpSize { get; } + + /// + /// Returns the current type generator being used. + /// + public VelocityTypeGenerator TypeGenerator => Module.TypeGenerator; + + /// + /// Returns the current method. + /// + public Method Method { get; } + + /// + /// Returns all allocations. + /// + public Allocas Allocas { get; } + + /// + /// Returns the current emitter. + /// + public TILEmitter Emitter { get; } + + #endregion + + #region IBackendCodeGenerator + + /// + /// Perform no operation. + /// + public void GenerateHeader(object builder) + { + // We do not need to generate any headers + } + + /// + /// Generates an MSIL runtime method. + /// + public abstract void GenerateCode(); + + /// + /// Perform no operation. + /// + public void GenerateConstants(object builder) + { + // We do not need to emit any constants + } + + /// + public void Merge(object builder) + { + // We do not need to perform any action + } + + #endregion + + #region Methods + + /// + /// Resets the block mask for the given block to all lanes. + /// + protected void EnableAllLanes(ILLocal local) + { + Emitter.Emit(OpCodes.Ldsfld, Instructions.AllLanesMask); + Emitter.Emit(LocalOperation.Store, local); + } + + /// + /// Resets the block mask for the given block to no lanes at all. 
+ /// + protected void DisableAllLanes(ILLocal local) + { + Emitter.Emit(OpCodes.Ldsfld, Instructions.NoLanesMask); + Emitter.Emit(LocalOperation.Store, local); + } + + /// + /// Returns the block mask for the given basic block. + /// + /// The block to lookup. + /// The block mask to use. + protected ILLocal GetBlockMask(BasicBlock block) => blockMasks[block]; + + private BranchBuilder CreateBranchBuilder(BasicBlock current) => + new(this, current); + + /// + /// Intersects the current mask with the mask on the top of the stack. + /// + private void IntersectWithMaskOf(BasicBlock current) + { + // Intersect with the current mask + var currentMask = GetBlockMask(current); + IntersectWithMask(currentMask); + Emitter.Emit(LocalOperation.Store, currentMask); + } + + /// + /// Intersects the current mask with the mask on the top of the stack. + /// + private void IntersectWithMask(ILLocal current) + { + // Intersect with the current mask + Emitter.Emit(LocalOperation.Load, current); + Emitter.EmitCall(Instructions.IntersectLanesMask); + } + + /// + /// Unifies the target mask with the mask on the top of the stack and stores + /// the result. + /// + private void UnifyWithMaskOf(BasicBlock target) + { + var targetMask = blockMasks[target]; + UnifyWithMask(targetMask); + } + + /// + /// Unifies the target mask with the mask on the top of the stack and stores + /// the result. + /// + private void UnifyWithMask(ILLocal targetMask) + { + Emitter.Emit(LocalOperation.Load, targetMask); + Emitter.EmitCall(Instructions.UnifyLanesMask); + Emitter.Emit(LocalOperation.Store, targetMask); + } + + /// + /// Disables all internal lanes. + /// + private void DisableAllLanes() + { + foreach (var (basicBlock, blockMask) in blockMasks) + { + if (basicBlock == Method.EntryBlock) + continue; + DisableAllLanes(blockMask); + } + } + + /// + /// Generates code for all blocks. 
+ /// + protected void GenerateCodeInternal() + { + Method.DumpToConsole(); + + // Setup phi values + var bindingAllocator = new PhiBindingAllocator(this); + var phiBindings = PhiBindings.Create(Method.Blocks, bindingAllocator); + var intermediatePhis = new Dictionary( + phiBindings.MaxNumIntermediatePhis); + + // Init all possible phi values + foreach (var phiValue in phiBindings.PhiValues) + { + var nullValue = Emitter.LoadNull(GetVectorizedType(phiValue.PhiType)); + Emitter.Emit(LocalOperation.Load, nullValue); + Emitter.Emit(LocalOperation.Store, GetLocal(phiValue)); + } + + // Init all allocations + BindAllocations(); + + // Disable all lanes + DisableAllLanes(); + + // Emit code for each block + foreach (var block in Method.Blocks) + { + // Mark the current label + Emitter.MarkLabel(blockLookup[block]); + + // Generate code for all values + foreach (var value in block) + this.GenerateCodeFor(value); + + // Wire phi nodes + if (phiBindings.TryGetBindings(block, out var bindings)) + { + // Assign all phi values + BindPhis(bindings, intermediatePhis, block); + } + + // Build terminator + this.GenerateCodeFor(block.Terminator); + + // Reset all intermediate phis + intermediatePhis.Clear(); + } + } + + /// + /// Binds all shared and local memory allocations. 
+ /// + private void BindAllocations() + { + // Bind shared allocations + foreach (var allocation in Allocas.SharedAllocations) + { + var allocationMethod = VelocityMultiprocessor.GetSharedMemoryMethodInfo + .MakeGenericMethod(new Type[] + { + TypeGenerator.GetLinearizedScalarType(allocation.ElementType) + }); + Emitter.LoadIntegerConstant(allocation.ArraySize); + Emitter.EmitCall(allocationMethod); + Store(allocation.Alloca); + } + + // Bind local allocations + foreach (var allocation in Allocas.LocalAllocations) + { + var allocationMethod = VelocityMultiprocessor.GetLocalMemoryMethodInfo + .MakeGenericMethod(new Type[] + { + TypeGenerator.GetLinearizedScalarType(allocation.ElementType) + }); + Emitter.LoadIntegerConstant(allocation.ArraySize); + Emitter.EmitCall(allocationMethod); + Store(allocation.Alloca); + } + + // Dynamic shared memory allocations are not supported at the moment + if (Allocas.DynamicSharedAllocations.Length > 0) + { + throw Method.GetNotSupportedException( + ErrorMessages.NotSupportedDynamicSharedMemoryAllocations); + } + } + + /// + /// Binds all phi values of the current block. + /// + private void BindPhis( + PhiBindings.PhiBindingCollection bindings, + Dictionary intermediatePhis, + BasicBlock block) + { + foreach (var (phiValue, value) in bindings) + { + // Check for an intermediate phi value + if (bindings.IsIntermediate(phiValue)) + { + // Declare a new intermediate local variable + var intermediateLocal = DeclareVectorizedTemporary(phiValue.PhiType); + intermediatePhis.Add(phiValue, intermediateLocal); + + // Move this phi value into a temporary register for reuse + Load(phiValue); + Emitter.Emit(LocalOperation.Store, intermediateLocal); + } + + // Determine the source value from which we need to copy from + var sourceLocal = intermediatePhis + .TryGetValue(value, out var tempLocal) + ? 
tempLocal + : GetLocal(value); + + // Move contents while merging our information + var phiLocal = GetLocal(phiValue); + var phiBlockMask = blockMasks[block]; + var intermediateTempLocal = EmitMerge(phiValue, + () => + { + Emitter.Emit(LocalOperation.Load, phiLocal); + return phiLocal.VariableType; + }, + () => + { + Emitter.Emit(LocalOperation.Load, sourceLocal); + return sourceLocal.VariableType; + }, + () => Emitter.Emit(LocalOperation.Load, phiBlockMask), + () => Emitter.EmitCall(Instructions.MergeWithMaskOperation32), + () => Emitter.EmitCall(Instructions.MergeWithMaskOperation64), + _ => phiLocal); + // Store the value to the phi local explicitly + if (!intermediateTempLocal.HasValue) + Emitter.Emit(LocalOperation.Store, phiLocal); + } + } + + /// + /// Converts the value on the top of the stack to a full-featured velocity warp + /// vector either consisting of 32bit or 64bit values. + /// + /// + /// True, if the current value is considered a 32bit value. + /// + /// The current operation mode. + public void ToWarpValue(bool is32Bit, VelocityWarpOperationMode mode) + { + // Determine whether the current value on the stack is a 32bit value or not + var operation = is32Bit + ? Instructions.GetConstValueOperation32(mode) + : Instructions.GetConstValueOperation64(mode); + Emitter.EmitCall(operation); + } + + /// + /// Loads a local variable that has been associated with the given value. + /// + /// The value to load. + /// The loaded variable. + private ILLocal GetLocal(Value value) + { + // Load the local + value.Assert(locals.ContainsKey(value)); + return locals[value]; + } + + /// + /// Loads the given value onto the evaluation stack. + /// + /// The value to load. + public void Load(Value value) + { + var local = GetLocal(value); + Emitter.Emit(LocalOperation.Load, local); + // Note that we assume that all locals have already been converted to + // their vector counterparts + } + + /// + /// Loads the given value onto the evaluation stack. 
+ /// + /// The value to load. + /// The loaded managed type. + public void LoadVectorized(Value value, out Type type) + { + Load(value); + type = GetVectorizedType(value.Type); + } + + /// + /// Loads a reference to the given value onto the evaluation stack. + /// + /// The value to load. + public void LoadRef(Value value) + { + // Load address of local variable + var local = GetLocal(value); + Emitter.Emit(LocalOperation.LoadAddress, local); + } + + /// + /// Loads a reference to the given value onto the evaluation stack. + /// + /// The value to load. + /// The loaded managed type. + public void LoadRefAndType(Value value, out Type type) + { + LoadRef(value); + type = GetVectorizedType(value.Type); + } + + /// + /// Declares a new phi value. + /// + /// The phi value to declare. + public void Declare(PhiValue phiValue) + { + var local = DeclareVectorizedTemporary(phiValue.PhiType); + locals.Add(phiValue, local); + } + + /// + /// Declares a new vectorized temporary variable. + /// + /// The type of the variable to allocate. + /// The allocated variable. + public ILLocal DeclareVectorizedTemporary(TypeNode typeNode) => + Emitter.DeclareLocal(GetVectorizedType(typeNode)); + + /// + /// Stores the given value by popping its value from the evaluation stack. + /// + /// The value to store. + public void Store(Value value) + { + value.Assert(!locals.ContainsKey(value)); + if (!value.Uses.HasAny) + return; + + var local = Emitter.DeclareLocal(GetVectorizedType(value.Type)); + locals.Add(value, local); + Emitter.Emit(LocalOperation.Store, local); + } + + /// + /// Aliases the given value with the specified local. + /// + /// The value to register an alias for. + /// The local variable alias. + public void Alias(Value value, ILLocal local) + { + value.Assert(!locals.ContainsKey(value)); + locals.Add(value, local); + } + + /// + /// Loads the vectorized managed type that corresponds to the given IR type. 
+ /// + /// The IR type to convert + /// The vectorized managed type. + private Type GetVectorizedType(TypeNode type) => + TypeGenerator.GetVectorizedType(type); + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCompiledKernel.cs b/Src/ILGPU/Backends/Velocity/VelocityCompiledKernel.cs new file mode 100644 index 000000000..39fd383c9 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCompiledKernel.cs @@ -0,0 +1,103 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCompiledKernel.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Runtime.Velocity; +using System; +using System.Collections.Immutable; +using System.Reflection; + +namespace ILGPU.Backends.Velocity +{ + /// + /// Represents a compiled kernel in vectorized MSIL form. + /// + public sealed class VelocityCompiledKernel : CompiledKernel + { + #region Instance + + /// + /// Constructs a new IL compiled kernel. + /// + /// The associated context. + /// The entry point. + /// The main kernel method. + /// The custom parameters type. + /// + /// The type constructor of the parameters type. + /// + /// + /// Mapping of kernel parameter indices to parameter fields. + /// + /// + /// The amount of statically allocated bytes of shared memory. 
+ /// + internal VelocityCompiledKernel( + Context context, + EntryPoint entryPoint, + MethodInfo kernelMethod, + Type parametersType, + ConstructorInfo parametersTypeConstructor, + ImmutableArray parameterFields, + int allocatedSharedMemorySize) + : base(context, entryPoint, null) + { + KernelMethod = kernelMethod; + ParametersType = parametersType; + ParameterFields = parameterFields; + ParametersTypeConstructor = parametersTypeConstructor; + AllocatedSharedMemorySize = allocatedSharedMemorySize; + } + + #endregion + + #region Properties + + /// + /// Returns the main kernel method. + /// + public MethodInfo KernelMethod { get; } + + /// + /// Returns the custom parameter store type to dispatch the kernel. + /// + internal Type ParametersType { get; } + + /// + /// Returns the type constructor to instantiate the custom parameters type. + /// + internal ConstructorInfo ParametersTypeConstructor { get; } + + /// + /// Returns a mapping of kernel parameter indices to parameter field.s + /// + internal ImmutableArray ParameterFields { get; } + + /// + /// Returns the size of statically allocated shared memory in bytes. + /// + public int AllocatedSharedMemorySize { get; } + + #endregion + + #region Methods + + /// + /// Creates a new kernel entry point to be used with this kernel module. + /// + /// A kernel entry point delegate. 
+ internal VelocityKernelEntryPoint CreateKernelEntryPoint() => + KernelMethod.CreateDelegate(); + + #endregion + } +} + diff --git a/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs new file mode 100644 index 000000000..7911db1db --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs @@ -0,0 +1,114 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityFunctionGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Values; +using ILGPU.Runtime.Velocity; +using System.Net.Http.Headers; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + /// + /// A generator for non primary Velocity functions. + /// + /// The IL emitter type. + /// The view generator type. + sealed class VelocityFunctionGenerator : + VelocityCodeGenerator + where TILEmitter : struct, IILEmitter + where TVerifier : IVelocityWarpVerifier, new() + { + /// + /// The internal return label. + /// + private readonly ILLabel returnLabel; + + /// + /// The internal return-value local (if any). + /// + private readonly ILLocal? returnLocal; + + /// + /// Creates a new Velocity function generator. + /// + /// The generator args to use. + /// The current method to generate code for. + /// All allocations of the current method. + public VelocityFunctionGenerator( + in GeneratorArgs args, + Method method, + Allocas allocas) + : base(args, method, allocas) + { + returnLabel = Emitter.DeclareLabel(); + returnLocal = method.IsVoid + ? 
null + : Emitter.DeclareLocal( + TypeGenerator.GetVectorizedType(method.ReturnType)); + } + + /// + /// Generates Velocity code for this function. + /// + public override void GenerateCode() + { + // Bind the mask parameter + Emitter.Emit(ArgumentOperation.Load, MaskParameterIndex); + Emitter.Emit(LocalOperation.Store, GetBlockMask(Method.EntryBlock)); + + // Bind all remaining parameters + for (int i = 0; i < Method.NumParameters; ++i) + { + var parameterType = Method.Parameters[i].ParameterType; + var parameterLocal = DeclareVectorizedTemporary(parameterType); + + Emitter.Emit(ArgumentOperation.Load, i + 1); + Emitter.Emit(LocalOperation.Store, parameterLocal); + + Alias(Method.Parameters[i], parameterLocal); + } + + // Emit the remaining code + GenerateCodeInternal(); + + // Emit the actual return part + Emitter.MarkLabel(returnLabel); + if (returnLocal.HasValue) + Emitter.Emit(LocalOperation.Load, returnLocal.Value); + Emitter.Emit(OpCodes.Ret); + } + + /// + public override void GenerateCode(ReturnTerminator returnTerminator) + { + // Note that this automatically returns a vectorized version + // of all return values + + // Jump to the next block in case all lanes have been disabled + Emitter.Emit( + LocalOperation.Load, + GetBlockMask(returnTerminator.BasicBlock)); + Emitter.EmitCall(Instructions.AreAllLanesActive); + + // In case not all lanes have completed processing, we will have to skip + // the actual return statement here + if (returnLocal.HasValue) + { + Load(returnTerminator.ReturnValue); + Emitter.Emit(LocalOperation.Store, returnLocal.Value); + } + Emitter.Emit(OpCodes.Brtrue, returnLabel); + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs b/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs new file mode 100644 index 000000000..00d2aea31 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs @@ -0,0 +1,374 @@ +// 
--------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityGenerationModule.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.Resources; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Reflection; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + /// + /// A kernel module generator for managed velocity kernel types. + /// + sealed class VelocityGenerationModule : DisposeBase + { + #region Static + + /// + /// Builds a complete parameter-type class wrapper that takes all scalar kernel + /// arguments as constructor arguments and converts them into ready-to-load + /// vectorized versions that are in turn stored as class fields. 
+ /// + private static Type BuildParametersType( + RuntimeSystem runtimeSystem, + VelocityInstructions instructions, + VelocityTypeGenerator typeGenerator, + in Backend.BackendContext backendContext, + EntryPoint entryPoint, + out ConstructorInfo constructor, + out ImmutableArray parameterFields) + { + // Build a new parameter passing type + using var parametersLock = runtimeSystem.DefineRuntimeClass( + typeof(VelocityParameters), + out var typeBuilder); + + var kernelMethod = backendContext.KernelMethod; + int numParameters = + kernelMethod.Parameters.Count - + entryPoint.KernelIndexParameterOffset; + var nativeParameterTypes = new TypeNode[numParameters]; + var constructorParameterTypes = new Type[nativeParameterTypes.Length]; + var constructorLocalTypes = new Type[nativeParameterTypes.Length]; + var builtFields = new FieldInfo[numParameters]; + for (int i = 0; i < numParameters; ++i) + { + // Determine the scalar parameter type and remember it + int parameterIndex = i + entryPoint.KernelIndexParameterOffset; + var parameterType = kernelMethod.Parameters[parameterIndex].ParameterType; + nativeParameterTypes[i] = parameterType; + constructorLocalTypes[i] = + typeGenerator.GetLinearizedScalarType(parameterType); + constructorParameterTypes[i] = typeof(void*); + + // Convert the parameter type and declare a new field + var vectorizedType = typeGenerator.GetVectorizedType(parameterType); + builtFields[i] = typeBuilder.DefineField( + StructureType.GetFieldName(i), + vectorizedType, + FieldAttributes.Public); + } + + // Build a constructor that converts all parameters into their vectorized + // representation + var constructorBuilder = typeBuilder.DefineConstructor( + MethodAttributes.Public, + CallingConventions.Standard, + constructorParameterTypes); + { + // Create a new constructor IL emitter + var emitter = new ILEmitter(constructorBuilder.GetILGenerator()); + + // Load each argument passed to the constructor and convert it into its + // vectorized form via 
specialized convert operations + for (int i = 0; i < constructorParameterTypes.Length; ++i) + { + // Convert the current argument into a temporary local to load from + var loadLocal = emitter.DeclareLocal(constructorLocalTypes[i]); + emitter.Emit(ArgumentOperation.Load, i + 1); + + // Load object via direct memory operations from pinned memory + emitter.Emit(OpCodes.Ldobj, constructorLocalTypes[i]); + emitter.Emit(LocalOperation.Store, loadLocal); + + // Load a vectorized version + emitter.Emit(OpCodes.Ldarg_0); + BuildParameterLoad( + emitter, + loadLocal, + nativeParameterTypes[i], + instructions, + typeGenerator); + + // Store vectorized version + emitter.Emit(OpCodes.Stfld, builtFields[i]); + } + + // Return + emitter.Emit(OpCodes.Ret); + } + + // Build the parameter type and determine the parameter mapping + var result = typeBuilder.CreateType(); + var parameterMapping = + ImmutableArray.CreateBuilder(numParameters); + for (int i = 0; i < numParameters; ++i) + { + var fieldInfo = ILEmitterExtensions.GetFieldInfo(result, i); + parameterMapping.Add(fieldInfo); + } + parameterFields = parameterMapping.MoveToImmutable(); + constructor = result.GetConstructor(constructorParameterTypes); + return result; + } + + /// + /// Builds a vectorized kernel parameter load for arbitrary types. 
+ /// + private static void BuildParameterLoad( + in TILEmitter emitter, + ILLocal source, + TypeNode typeNode, + VelocityInstructions instructions, + VelocityTypeGenerator typeGenerator) + where TILEmitter : struct, IILEmitter + { + if (typeNode is StructureType structureType) + { + var vectorizedType = typeGenerator.GetVectorizedType(structureType); + var temporary = emitter.DeclareLocal(vectorizedType); + + // Fill the temporary structure instance with values + foreach (var (fieldType, fieldAccess) in structureType) + { + // Load the target variable address + emitter.Emit(LocalOperation.LoadAddress, temporary); + + // Load the input value + emitter.Emit(LocalOperation.Load, source); + emitter.LoadField(source.VariableType, fieldAccess.Index); + + // Load the converted field type + BuildScalarParameterLoad( + emitter, + fieldType, + instructions); + + // Store it into out structure field + emitter.StoreField(vectorizedType, fieldAccess.Index); + } + + emitter.Emit(LocalOperation.Load, temporary); + } + else + { + // Load input argument value + emitter.Emit(LocalOperation.Load, source); + + // Load the scalar parameter + BuildScalarParameterLoad( + emitter, + typeNode, + instructions); + } + } + + /// + /// Builds a vectorized kernel parameter load for scalar types. 
+ /// + private static void BuildScalarParameterLoad( + in TILEmitter emitter, + TypeNode typeNode, + VelocityInstructions instructions) + where TILEmitter : struct, IILEmitter + { + var basicValueType = typeNode switch + { + PrimitiveType primitiveType => primitiveType.BasicValueType, + PaddingType paddingType => paddingType.BasicValueType, + PointerType _ => BasicValueType.Int64, + _ => // Not supported type conversions + throw typeNode.GetNotSupportedException( + ErrorMessages.NotSupportedType, + typeNode) + }; + + // Convert value on top of the evaluation stack without sign extension + var mode = VelocityWarpOperationMode.F; + if (basicValueType.IsInt()) + { + // Expand type + emitter.Emit(basicValueType.IsTreatedAs32Bit() + ? OpCodes.Conv_U4 + : OpCodes.Conv_U8); + + mode = VelocityWarpOperationMode.U; + } + else + { + // Convert half to 32bit float + if (basicValueType == BasicValueType.Float16) + emitter.EmitCall(instructions.FromHalfMethod); + } + + // Load the values onto the evaluation stack + var load = basicValueType.IsTreatedAs32Bit() + ? 
instructions.GetConstValueOperation32(mode) + : instructions.GetConstValueOperation64(mode); + emitter.EmitCall(load); + } + + #endregion + + #region Instance + + private readonly Dictionary methodMapping; + + public VelocityGenerationModule( + RuntimeSystem runtimeSystem, + VelocityInstructions instructions, + VelocityTypeGenerator typeGenerator, + in Backend.BackendContext backendContext, + EntryPoint entryPoint) + { + methodMapping = new Dictionary( + backendContext.Count); + TypeGenerator = typeGenerator; + + // Create the parameter passing type + ParametersType = BuildParametersType( + runtimeSystem, + instructions, + typeGenerator, + backendContext, + entryPoint, + out var constructorInfo, + out var parameterFields); + ParametersTypeConstructor = constructorInfo; + ParameterFields = parameterFields; + + // Declare all methods + DeclareMethod(runtimeSystem, backendContext.KernelMethod); + foreach (var (method, _) in backendContext) + DeclareMethod(runtimeSystem, method); + + // Get the kernel method + KernelMethod = this[backendContext.KernelMethod]; + + // Setup shared memory information + SharedAllocationSize = backendContext.SharedAllocations.TotalSize; + } + + #endregion + + #region Properties + + /// + /// Returns the current type generator being used. + /// + public VelocityTypeGenerator TypeGenerator { get; } + + /// + /// Returns the kernel method. + /// + public MethodInfo KernelMethod { get; } + + /// + /// Gets the method builder that is associated with the given method. + /// + /// The method to get the managed method for. + public MethodInfo this[Method method] => methodMapping[method].Method; + + /// + /// Returns the class type to store all parameter values to. + /// + public Type ParametersType { get; } + + /// + /// Returns the constructor to build a new parameters type instance. + /// + public ConstructorInfo ParametersTypeConstructor { get; } + + /// + /// Returns all parameter fields to store the actual parameter data into. 
+ /// + public ImmutableArray ParameterFields { get; } + + /// + /// The total amount of bytes residing in shared memory. + /// + public int SharedAllocationSize { get; } + + #endregion + + #region Methods + + /// + /// Declares the given method. + /// + private void DeclareMethod(RuntimeSystem runtimeSystem, Method method) + { + // Convert the method signature + var returnType = TypeGenerator.GetVectorizedType(method.ReturnType); + int parameterOffset = method.HasFlags(MethodFlags.EntryPoint) ? 1 : 0; + int parameterAddition = method.HasFlags(MethodFlags.EntryPoint) ? 0 : 1; + var parameterTypes = new Type[ + method.NumParameters - parameterOffset + parameterAddition]; + + // The first parameter is the current mask (if it is not an entry point) + if (parameterOffset > 0) + { + // This is our main method + parameterTypes = VelocityMultiprocessor.KernelHandlerTypes.ToArray(); + } + else + { + // Convert all parameter types + parameterTypes[0] = typeof(VelocityLaneMask); + for (int i = 0; i < method.NumParameters; ++i) + { + var parameterType = method.Parameters[i].ParameterType; + parameterTypes[i + parameterAddition] = + TypeGenerator.GetVectorizedType(parameterType); + } + } + + // Define a new method stub + using var scopedLock = runtimeSystem.DefineRuntimeMethod( + returnType, + parameterTypes, + out var methodBuilder); + methodMapping.Add(method, methodBuilder); + } + + /// + /// Gets the IL generator that is associated with the method. + /// + public ILGenerator GetILGenerator(Method method) => + methodMapping[method].ILGenerator; + + #endregion + + #region IDisposable + + /// + /// Frees the current scoped locked. 
+ /// + protected override void Dispose(bool disposing) + { + foreach (var (_, builder) in methodMapping) + builder.Finish(); + base.Dispose(disposing); + } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityInstructions.cs b/Src/ILGPU/Backends/Velocity/VelocityInstructions.cs new file mode 100644 index 000000000..d45b17c6c --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityInstructions.cs @@ -0,0 +1,364 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityInstructions.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.IR.Values; +using ILGPU.Resources; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System; +using System.Diagnostics; +using System.Reflection; +using System.Reflection.Emit; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace ILGPU.Backends.Velocity +{ + sealed class VelocityInstructions : VelocityOperations + { + #region Static + + /// + /// Implements a debug assertion failure. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void DebugAssert( + VelocityLaneMask laneMask, + VelocityWarp32 condition, + string message) + { + // Check for failed lanes + var conditionMask = VelocityWarp32.ToMask(condition); + var failedConditionMask = VelocityLaneMask.Negate(conditionMask); + var assertionMask = VelocityLaneMask.Intersect(laneMask, failedConditionMask); + if (!assertionMask.HasAny) + return; + Debug.WriteLine(message); + Debug.Fail(message); + } + + /// + /// Dumps the given velocity warp. 
+ /// + public static void Dump32(VelocityWarp32 warp) => + Console.WriteLine(warp.ToString()); + + /// + /// Dumps the given velocity warp. + /// + public static void Dump64(VelocityWarp64 warp) => + Console.WriteLine(warp.ToString()); + + /// + /// Inspects the given value by dumping meta-level information. + /// + public static void InspectValue(T value) + { + var type = value.GetType(); + var fields = type.GetFields(BindingFlags.Public | BindingFlags.Instance); + foreach (var field in fields) + { + var fieldValue = field.GetValue(value); + Console.WriteLine(fieldValue.ToString()); + } + Console.WriteLine(value.ToString()); + } + + #endregion + + #region Instance + + /// + /// The memory barrier method. + /// + private readonly MethodInfo memoryBarrierMethod; + + /// + /// The default failed assertion method. + /// + private readonly MethodInfo assertMethod; + + /// + /// The generic write method. + /// + private readonly MethodInfo writeMethod; + + /// + /// Inspects a generic value. + /// + private readonly MethodInfo inspectValueMethod; + + /// + /// Dumps a 32bit velocity warp. + /// + private readonly MethodInfo dump32Method; + + /// + /// Dumps a 64bit velocity warp. + /// + private readonly MethodInfo dump64Method; + + /// + /// Initializes all general runtime methods. + /// + public VelocityInstructions() + { + memoryBarrierMethod = GetMethod( + typeof(Interlocked), + nameof(Interlocked.MemoryBarrier)); + + assertMethod = GetMethod( + typeof(VelocityInstructions), + nameof(DebugAssert)); + writeMethod = GetMethod( + typeof(Interop), + nameof(Interop.WriteImplementation)); + inspectValueMethod = GetMethod( + typeof(VelocityInstructions), + nameof(InspectValue)); + dump32Method = GetMethod( + typeof(VelocityInstructions), + nameof(Dump32)); + dump64Method = GetMethod( + typeof(VelocityInstructions), + nameof(Dump64)); + } + + #endregion + + #region Methods + + /// + /// Gets load and store operations for the given basic value type. 
The IO + /// operations automatically convert from vectorized types into scalar types + /// while taking an additional mask parameter and the current warp size into + /// account. + /// + /// The basic value type. + /// The current warp size. + /// Load and store operations for the given basic value type. + public (MethodInfo Load, MethodInfo Store) GetIOOperation( + BasicValueType basicValueType, + int warpSize) => + basicValueType.IsTreatedAs32Bit() + ? GetIOOperation32(basicValueType, warpSize) + : GetIOOperation64(basicValueType, warpSize); + + /// + /// Creates code to load primitive values and pointers from memory while using + /// the given mask to differentiate between active and inactive lanes. + /// + private void CreateNonStructureLoad( + in TILEmitter emitter, + ILLocal mask, + TypeNode typeNode, + int warpSize) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(LocalOperation.Load, mask); + + switch (typeNode) + { + case PrimitiveType primitiveType: + var basicValueType = primitiveType.BasicValueType; + var operations = GetIOOperation(basicValueType, warpSize); + emitter.EmitCall(operations.Load); + break; + case PointerType _: + emitter.EmitCall(GetIOOperation64( + BasicValueType.Int64, + warpSize).Load); + break; + default: + throw typeNode.GetNotSupportedException( + ErrorMessages.NotSupportedType); + } + } + + /// + /// Creates a sequence of load instructions to load a vectorized value via + /// specialized IO operations. 
+ /// + public void CreateLoad( + in TILEmitter emitter, + ILLocal mask, + ILLocal source, + TypeNode typeNode, + VelocityTypeGenerator typeGenerator) + where TILEmitter : struct, IILEmitter + { + if (typeNode is StructureType structureType) + { + // Allocate a new temporary allocation to fill all fields + var vectorizedType = typeGenerator.GetVectorizedType(structureType); + var temporary = emitter.LoadNull(vectorizedType); + + // Fill the temporary structure instance with values + foreach (var (fieldType, fieldAccess) in structureType) + { + // Load the variable address + emitter.Emit(LocalOperation.LoadAddress, temporary); + + // Adjust the actual source address based on offsets in the type + // definition + // Adjust the target offset + long fieldOffset = structureType.GetOffset(fieldAccess); + emitter.EmitConstant(fieldOffset); + emitter.EmitCall(GetConstValueOperation64( + VelocityWarpOperationMode.I)); + emitter.Emit(LocalOperation.Load, source); + emitter.EmitCall(GetBinaryOperation64( + BinaryArithmeticKind.Add, + VelocityWarpOperationMode.U)); + + // Load the converted field type + CreateNonStructureLoad( + emitter, + mask, + fieldType, + typeGenerator.WarpSize); + + // Store it into out structure field + emitter.StoreField(vectorizedType, fieldAccess.Index); + } + + // Load local variable onto the stack containing all required information + emitter.Emit(LocalOperation.Load, temporary); + } + else + { + // Load the type directly + emitter.Emit(LocalOperation.Load, source); + CreateNonStructureLoad( + emitter, + mask, + typeNode, + typeGenerator.WarpSize); + } + } + + /// + /// Calls a typed method that is able to reinterpret the given value. + /// + public void CallMemoryBarrier(in TILEmitter emitter) + where TILEmitter : struct, IILEmitter => + emitter.EmitCall(memoryBarrierMethod); + + /// + /// Calls a method that triggers an assertion check. 
        /// </summary>
        public void CallAssert<TILEmitter>(in TILEmitter emitter)
            where TILEmitter : struct, IILEmitter =>
            emitter.EmitCall(assertMethod);

        /// <summary>
        /// Emits a call to the generic inspection method instantiated for the given
        /// managed type, dumping meta-level information about the value on top of
        /// the evaluation stack.
        /// </summary>
        /// <param name="emitter">The current IL emitter.</param>
        /// <param name="type">The managed type of the value to inspect.</param>
        public void CallInspect<TILEmitter>(in TILEmitter emitter, Type type)
            where TILEmitter : struct, IILEmitter =>
            emitter.EmitCall(inspectValueMethod.MakeGenericMethod(new Type[] { type }));

        /// <summary>
        /// Dumps the warp value on top of the evaluation stack without consuming it
        /// (the value is duplicated before the dump call).
        /// </summary>
        /// <param name="emitter">The current IL emitter.</param>
        /// <param name="is32Bit">True, if the warp on the stack is a 32bit warp.</param>
        /// <param name="mode">
        /// The current warp operation mode. NOTE(review): currently unused here —
        /// the dump method is selected by is32Bit only; confirm whether mode-aware
        /// formatting was intended.
        /// </param>
        public void CallDump<TILEmitter>(
            in TILEmitter emitter,
            bool is32Bit,
            VelocityWarpOperationMode mode)
            where TILEmitter : struct, IILEmitter
        {
            // Duplicate the warp so the dump call does not consume the operand
            emitter.Emit(OpCodes.Dup);
            if (is32Bit)
                emitter.EmitCall(dump32Method);
            else
                emitter.EmitCall(dump64Method);
        }

        #endregion
    }

    /// <summary>
    /// Helper extensions that classify value types by bit width and determine the
    /// warp-operation mode used by Velocity warp instructions.
    /// </summary>
    static class VelocityInstructionsHelpers
    {
        /// <summary>
        /// Returns true if the given value type is actually a 32bit value type.
        /// </summary>
        public static bool Is32Bit(this BasicValueType valueType) =>
            valueType switch
            {
                BasicValueType.Int32 => true,
                BasicValueType.Float32 => true,
                _ => false,
            };

        /// <summary>
        /// Returns true if the given value type is interpreted as a 32bit value type.
        /// </summary>
        public static bool IsTreatedAs32Bit(this BasicValueType valueType) =>
            valueType switch
            {
                BasicValueType.Float64 => false,
                BasicValueType.Int64 => false,
                // All remaining (smaller) types are widened into 32bit warps
                _ => true,
            };

        /// <summary>
        /// Returns true if the given value is interpreted as a 32bit value type.
        /// </summary>
        public static bool IsTreatedAs32Bit(this Value value) =>
            value.BasicValueType.IsTreatedAs32Bit();

        /// <summary>
        /// Returns true if the given value is interpreted as a 32bit value type.
        /// </summary>
        public static bool IsTreatedAs32Bit(this ArithmeticValue value) =>
            value.ArithmeticBasicValueType switch
            {
                ArithmeticBasicValueType.Float64 => false,
                ArithmeticBasicValueType.Int64 => false,
                ArithmeticBasicValueType.UInt64 => false,
                _ => true,
            };

        /// <summary>
        /// Determines the current warp-operation mode for the given arithmetic basic
        /// value type.
        /// </summary>
        public static VelocityWarpOperationMode GetWarpMode(
            this ArithmeticBasicValueType valueType) =>
            valueType switch
            {
                ArithmeticBasicValueType.UInt1 => VelocityWarpOperationMode.U,
                ArithmeticBasicValueType.UInt8 => VelocityWarpOperationMode.U,
                ArithmeticBasicValueType.UInt16 => VelocityWarpOperationMode.U,
                ArithmeticBasicValueType.UInt32 => VelocityWarpOperationMode.U,
                ArithmeticBasicValueType.UInt64 => VelocityWarpOperationMode.U,

                ArithmeticBasicValueType.Int8 => VelocityWarpOperationMode.I,
                ArithmeticBasicValueType.Int16 => VelocityWarpOperationMode.I,
                ArithmeticBasicValueType.Int32 => VelocityWarpOperationMode.I,
                ArithmeticBasicValueType.Int64 => VelocityWarpOperationMode.I,

                // Float16 values are widened and handled as 32bit float warps
                ArithmeticBasicValueType.Float16 => VelocityWarpOperationMode.F,
                ArithmeticBasicValueType.Float32 => VelocityWarpOperationMode.F,
                ArithmeticBasicValueType.Float64 => VelocityWarpOperationMode.D,
                _ => throw new NotSupportedException()
            };

        /// <summary>
        /// Determines the current warp-operation mode for the given value.
        /// </summary>
        public static VelocityWarpOperationMode GetWarpMode(this ArithmeticValue value) =>
            value.ArithmeticBasicValueType.GetWarpMode();

        /// <summary>
        /// Determines the current warp-operation mode for the given value.
+ /// + public static VelocityWarpOperationMode GetWarpMode(this CompareValue value) => + value.CompareType.GetWarpMode(); + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs new file mode 100644 index 000000000..a66732c7e --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs @@ -0,0 +1,208 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityKernelFunctionGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Values; +using ILGPU.Runtime.Velocity; +using System; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + /// + /// A generator for primary Velocity kernels. + /// + /// The IL emitter type. + /// The view generator type. + sealed class VelocityKernelFunctionGenerator : + VelocityCodeGenerator + where TILEmitter : struct, IILEmitter + where TVerifier : IVelocityWarpVerifier, new() + { + #region Constants + + public const int GlobalStartParameterIndex = 0; + public const int GlobalEndParameterIndex = 1; + public const int GlobalParametersIndex = 2; + + #endregion + + private readonly ILLabel nextMarker; + + /// + /// Creates a new Velocity kernel generator. + /// + /// The generator args to use. + /// The current method to generate code for. + /// All allocations of the current method. 
+ public VelocityKernelFunctionGenerator( + in GeneratorArgs args, + Method method, + Allocas allocas) + : base(args, method, allocas) + { + EntryPoint = args.EntryPoint; + ParametersType = args.Module.ParametersType; + + // Generate an next marker to jump to when the kernel function returns + nextMarker = Emitter.DeclareLabel(); + } + + /// + /// Returns the current entry point. + /// + public EntryPoint EntryPoint { get; } + + /// + /// Returns the current parameters type. + /// + public Type ParametersType { get; } + + /// + /// Generates Velocity code for this kernel. + /// + public override void GenerateCode() + { + // Extract all arguments of the actual parameters object + var parametersLocal = Emitter.DeclareLocal(ParametersType); + Emitter.Emit(ArgumentOperation.Load, GlobalParametersIndex); + Emitter.Emit(OpCodes.Castclass, ParametersType); + Emitter.Emit(LocalOperation.Store, parametersLocal); + + // Load all parameters by mapping them to local variables + for ( + int i = EntryPoint.KernelIndexParameterOffset; + i < Method.NumParameters; + ++i) + { + var parameterType = Method.Parameters[i].ParameterType; + var parameterLocal = DeclareVectorizedTemporary(parameterType); + + Emitter.Emit(LocalOperation.Load, parametersLocal); + Emitter.LoadField( + ParametersType, + i - EntryPoint.KernelIndexParameterOffset); + Emitter.Emit(LocalOperation.Store, parameterLocal); + + Alias(Method.Parameters[i], parameterLocal); + } + + // Load the actual counters and start the processing loop + var headerMarker = Emitter.DeclareLabel(); + var exitMarker = Emitter.DeclareLabel(); + + // Create initial offset variable + var offsetVariable = Emitter.DeclareLocal(typeof(int)); + Emitter.Emit(ArgumentOperation.Load, GlobalStartParameterIndex); + Emitter.Emit(LocalOperation.Store, offsetVariable); + + // Create cached group size information + var groupDim = Emitter.DeclareLocal(typeof(int)); + Emitter.EmitCall(VelocityMultiprocessor.GetCurrentGroupDimScalarMethodInfo); + 
Emitter.Emit(LocalOperation.Store, groupDim); + + // Bind the current implicitly grouped kernel index (if any) + ILLocal? offsetVector = null; + if (EntryPoint.IsImplicitlyGrouped) + { + offsetVector = Emitter.DeclareLocal(typeof(VelocityWarp32)); + Alias(Method.Parameters[0], offsetVector.Value); + } + + // Create the loop header + Emitter.MarkLabel(headerMarker); + + // Perform range check + Emitter.Emit(LocalOperation.Load, offsetVariable); + Emitter.Emit(ArgumentOperation.Load, GlobalEndParameterIndex); + Emitter.Emit(OpCodes.Clt); + Emitter.Emit(OpCodes.Brfalse, exitMarker); + + // The actual loop body + { + // Adjust linear index + Emitter.Emit(LocalOperation.Load, offsetVariable); + Emitter.EmitCall(VelocityMultiprocessor.SetCurrentLinearIdxMethod); + + // Check whether the current linear index allows us to activate certain + // lanes that are smaller than the global end parameter index + Emitter.EmitCall(VelocityMultiprocessor.GetCurrentLinearIdxMethod); + + // Check whether we need to bind an internal offset vector + if (offsetVector.HasValue) + { + Emitter.Emit(OpCodes.Dup); + Emitter.Emit(LocalOperation.Store, offsetVector.Value); + } + + // Perform range check + Emitter.Emit(ArgumentOperation.Load, GlobalEndParameterIndex); + Emitter.EmitCall(Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.GetCompareOperation32( + CompareKind.LessThan, + VelocityWarpOperationMode.I)); + + // Get local group sizes and perform range check for all lanes + Emitter.EmitCall(Instructions.LaneIndexVectorOperation32); + Emitter.Emit(LocalOperation.Load, groupDim); + Emitter.EmitCall(Instructions.GetConstValueOperation32( + VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.GetCompareOperation32( + CompareKind.LessThan, + VelocityWarpOperationMode.I)); + + // Convert into a single lane mask and store the converted mask + Emitter.EmitCall(Instructions.GetBinaryOperation32( + BinaryArithmeticKind.And, + 
VelocityWarpOperationMode.I)); + Emitter.EmitCall(Instructions.ToMaskOperation32); + Emitter.Emit(LocalOperation.Store, GetBlockMask(Method.EntryBlock)); + + // Emit the actual kernel code + GenerateCodeInternal(); + } + + // Increment the current offset by adding the current warp size + Emitter.MarkLabel(nextMarker); + + Emitter.Emit(LocalOperation.Load, offsetVariable); + Emitter.Emit(LocalOperation.Load, groupDim); + Emitter.Emit(OpCodes.Add); + Emitter.Emit(LocalOperation.Store, offsetVariable); + + // Branch back to the header + Emitter.Emit(OpCodes.Br, headerMarker); + + // Emit the exit marker + Emitter.MarkLabel(exitMarker); + + // Return + Emitter.Emit(OpCodes.Ret); + } + + /// + public override void GenerateCode(ReturnTerminator returnTerminator) + { + returnTerminator.Assert(returnTerminator.IsVoidReturn); + + // Jump to the next block in case all lanes have been disabled + Emitter.Emit( + LocalOperation.Load, + GetBlockMask(returnTerminator.BasicBlock)); + Emitter.EmitCall(Instructions.AreAllLanesActive); + Emitter.Emit(OpCodes.Brtrue, nextMarker); + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityTypeGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityTypeGenerator.cs new file mode 100644 index 000000000..eee9fb63c --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityTypeGenerator.cs @@ -0,0 +1,355 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityTypeGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. 
See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.IR.Types; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System; +using System.Collections.Generic; +using System.Reflection; +using System.Reflection.Emit; +using System.Threading; + +namespace ILGPU.Backends.IL +{ + /// + /// A type generator for managed IL types. + /// + sealed class VelocityTypeGenerator : DisposeBase + { + #region Static + + /// + /// Maps basic types to vectorized basic types. + /// + private static readonly Type[] VectorizedBasicTypeMapping = new Type[] + { + null, // None + + typeof(VelocityWarp32), // Int1 + typeof(VelocityWarp32), // Int8 + typeof(VelocityWarp32), // Int16 + typeof(VelocityWarp32), // Int32 + typeof(VelocityWarp64), // Int64 + + typeof(VelocityWarp32), // Float16 + typeof(VelocityWarp32), // Float32 + typeof(VelocityWarp64), // Float64 + }; + + /// + /// Gets a vectorized type corresponding to the given basic value type. + /// + public static Type GetVectorizedBasicType(BasicValueType basicValueType) => + VectorizedBasicTypeMapping[(int)basicValueType]; + + /// + /// Returns the default structure type implementation reflecting the basic + /// type hierarchy. 
+ /// + private static Type LoadStructureType( + StructureType structureType, + VelocityTypeGenerator parent, + in TTypeProvider typeProvider) + where TTypeProvider : IExtendedTypeProvider + { + using var scopedLock = parent.RuntimeSystem.DefineRuntimeStruct( + explicitLayout: typeProvider.UsesExplicitOffsets, + out var typeBuilder); + int index = 0; + foreach (var (type, fieldAccess) in structureType) + { + var field = typeBuilder.DefineField( + StructureType.GetFieldName(index++), + type.LoadManagedType(typeProvider), + FieldAttributes.Public); + + int offset = structureType.GetOffset(fieldAccess.Index); + typeProvider.SetOffset(field, offset); + } + + return typeBuilder.CreateType(); + } + + #endregion + + #region Nested Types + + private interface IExtendedTypeProvider : IManagedTypeProvider + { + /// + /// Returns true if this provider requires explicit offsets. + /// + bool UsesExplicitOffsets { get; } + + /// + /// Sets an explicit field offset. + /// + void SetOffset(FieldBuilder fieldBuilder, int offset); + } + + /// + /// Provides linearized scalar versions of given scalar managed types. + /// + private readonly struct LinearScalarTypeProvider : IExtendedTypeProvider + { + private readonly VelocityTypeGenerator parent; + private readonly TypeNode.ScalarManagedTypeProvider scalarProvider; + + /// + /// Creates a new instance of the scalar type provider. + /// + /// The parent IL type generator. + public LinearScalarTypeProvider(VelocityTypeGenerator typeGenerator) + { + parent = typeGenerator; + scalarProvider = new TypeNode.ScalarManagedTypeProvider(); + } + + /// + /// Returns the default managed type for the given primitive one. + /// + public Type GetPrimitiveType(PrimitiveType primitiveType) => + scalarProvider.GetPrimitiveType(primitiveType); + + /// + /// Returns the default managed array type for the given array type. 
+ /// + public Type GetArrayType(ArrayType arrayType) => + scalarProvider.GetArrayType(arrayType); + + /// + /// Returns a specialized pointer implementation. + /// + public Type GetPointerType(PointerType pointerType) => + scalarProvider.GetPointerType(pointerType); + + /// + /// Returns a specialized pointer-view implementation. + /// + public Type GetViewType(ViewType viewType) => + scalarProvider.GetViewType(viewType); + + /// + /// Returns the default structure type implementation reflecting the basic + /// type hierarchy. + /// + public Type GetStructureType(StructureType structureType) => + LoadStructureType(structureType, parent, this); + + /// + /// Returns true. + /// + public bool UsesExplicitOffsets => true; + + /// + /// Sets the current field offset. + /// + public void SetOffset(FieldBuilder fieldBuilder, int offset) => + fieldBuilder.SetOffset(offset); + } + + /// + /// Provides vectorized versions of given scalar managed types. + /// + private readonly struct VectorizedTypeProvider : IExtendedTypeProvider + { + private readonly VelocityTypeGenerator parent; + + /// + /// Creates a new instance of the vectorized type provider. + /// + /// The parent IL type generator. + public VectorizedTypeProvider(VelocityTypeGenerator typeGenerator) + { + parent = typeGenerator; + } + + /// + /// Returns the default managed type for the given primitive one. + /// + public Type GetPrimitiveType(PrimitiveType primitiveType) => + GetVectorizedBasicType(primitiveType.BasicValueType); + + /// + /// Returns the default managed array type for the given array type. + /// + public Type GetArrayType(ArrayType arrayType) => arrayType.LoadManagedType(); + + /// + /// Returns a specialized pointer implementation. + /// + public Type GetPointerType(PointerType pointerType) => + GetVectorizedBasicType(BasicValueType.Int64); + + /// + /// Returns a specialized pointer-view implementation. 
+ /// + public Type GetViewType(ViewType viewType) => + PointerViews.ViewImplementation.GetImplementationType( + viewType.ElementType.LoadManagedType()); + + /// + /// Returns the default structure type implementation reflecting the basic + /// type hierarchy. + /// + public Type GetStructureType(StructureType structureType) => + LoadStructureType(structureType, parent, this); + + /// + /// Returns false. + /// + public bool UsesExplicitOffsets => false; + + /// + /// Does not do anything. + /// + public void SetOffset(FieldBuilder fieldBuilder, int offset) { } + } + + #endregion + + #region Static + + /// + /// Gets or creates a new managed type using the given type provider instance. + /// + private static Type GetOrCreateType( + ReaderWriterLockSlim readerWriterLock, + Dictionary typeMapping, + TypeNode typeNode, + TTypeProvider typeProvider, + Func typeSelector, + Func typeBinder) + where TTypeProvider : IManagedTypeProvider + { + // Synchronize all accesses below using a read/write scope + using var readWriteScope = readerWriterLock.EnterUpgradeableReadScope(); + + if (typeMapping.TryGetValue(typeNode, out var mappedType)) + { + var selected = typeSelector(mappedType.Linear, mappedType.Vectorized); + if (selected != null) + return selected; + } + + // Get a new type instance + using var writeScope = readWriteScope.EnterWriteScope(); + var newMappedType = typeNode.LoadManagedType(typeProvider); + mappedType = typeBinder( + mappedType.Linear, + mappedType.Vectorized, + newMappedType); + typeMapping[typeNode] = mappedType; + + return typeSelector(mappedType.Linear, mappedType.Vectorized); + } + + #endregion + + #region Instance + + private readonly ReaderWriterLockSlim readerWriterLock = + new ReaderWriterLockSlim(LockRecursionPolicy.SupportsRecursion); + private readonly Dictionary + typeMapping = new Dictionary(); + + /// + /// Constructs a new IL type generator. + /// + /// The parent runtime system. + /// The current warp size. 
+ public VelocityTypeGenerator(RuntimeSystem runtimeSystem, int warpSize) + { + RuntimeSystem = runtimeSystem; + WarpSize = warpSize; + } + + #endregion + + #region Properties + + /// + /// Returns the parent runtime system. + /// + public RuntimeSystem RuntimeSystem { get; } + + /// + /// Returns the current warp size. + /// + public int WarpSize { get; } + + #endregion + + #region Methods + + /// + /// Gets or creates a linearized managed type for the given IR type. + /// + /// The type to build a vectorized type for. + /// + /// The linearized scalar managed type that corresponds to the given IR type. + /// + public Type GetLinearizedScalarType(TypeNode typeNode) + { + // Check for primitive types without locking + if (typeNode is PrimitiveType || typeNode is PaddingType) + return typeNode.LoadManagedType(); + + // Get or create a new type + return GetOrCreateType( + readerWriterLock, + typeMapping, + typeNode, + new LinearScalarTypeProvider(this), + (linear, _) => linear, + (_, vectorized, newLinear) => (newLinear, vectorized)); + } + + /// + /// Gets or creates a vectorized managed type for the given IR type. + /// + /// The type to build a vectorized type for. + /// + /// The vectorized managed type that corresponds to the given IR type. 
+ /// + public Type GetVectorizedType(TypeNode typeNode) + { + // Check for primitive types without locking + if (typeNode is PrimitiveType || typeNode is PaddingType) + return GetVectorizedBasicType(typeNode.BasicValueType); + + // Get or create a new type + return GetOrCreateType( + readerWriterLock, + typeMapping, + typeNode, + new VectorizedTypeProvider(this), + (_, vectorized) => vectorized, + (linear, _, newVectorized) => (linear, newVectorized)); + } + + #endregion + + #region IDisposable + + /// + protected override void Dispose(bool disposing) + { + if (disposing) + readerWriterLock.Dispose(); + base.Dispose(disposing); + } + + #endregion + } +} + diff --git a/Src/ILGPU/ILGPU.csproj b/Src/ILGPU/ILGPU.csproj index 8149d59de..24a37b3f5 100644 --- a/Src/ILGPU/ILGPU.csproj +++ b/Src/ILGPU/ILGPU.csproj @@ -184,6 +184,10 @@ TextTemplatingFileGenerator CudaInstructionSet.Generated.cs + + TextTemplatingFileGenerator + VelocityWarps.cs + @@ -347,6 +351,9 @@ True PrimitiveDataBlocks.tt + + VelocityWarps.tt + diff --git a/Src/ILGPU/IR/Construction/Terminators.cs b/Src/ILGPU/IR/Construction/Terminators.cs index 5aaeec7a8..074c98af4 100644 --- a/Src/ILGPU/IR/Construction/Terminators.cs +++ b/Src/ILGPU/IR/Construction/Terminators.cs @@ -87,13 +87,20 @@ public Branch CreateIfBranch( Value condition, BasicBlock trueTarget, BasicBlock falseTarget, - IfBranchFlags flags) => - CreateTerminator(new IfBranch( + IfBranchFlags flags) + { + // Simplify unnecessary if branches and fold them to unconditional branches + if (trueTarget == falseTarget) + return CreateBranch(location, trueTarget); + + // Create an if branch in all other cases + return CreateTerminator(new IfBranch( GetInitializer(location), condition, trueTarget, falseTarget, flags)); + } /// /// Creates a switch terminator builder. 
diff --git a/Src/ILGPU/IR/Types/ArrayType.cs b/Src/ILGPU/IR/Types/ArrayType.cs index 8dd6e042f..d52f2bc39 100644 --- a/Src/ILGPU/IR/Types/ArrayType.cs +++ b/Src/ILGPU/IR/Types/ArrayType.cs @@ -64,8 +64,16 @@ internal ArrayType( /// /// Creates a managed array type. /// - protected override Type GetManagedType() => - ElementType.LoadManagedType().MakeArrayType(NumDimensions); + internal Type GetDefaultManagedType(TTypeProvider typeProvider) + where TTypeProvider : IManagedTypeProvider => + ElementType.LoadManagedType(typeProvider).MakeArrayType(NumDimensions); + + /// + /// Creates a managed array type. + /// + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeProvider.GetArrayType(this); #endregion diff --git a/Src/ILGPU/IR/Types/HandleType.cs b/Src/ILGPU/IR/Types/HandleType.cs index 1ae2a986d..99d082c4b 100644 --- a/Src/ILGPU/IR/Types/HandleType.cs +++ b/Src/ILGPU/IR/Types/HandleType.cs @@ -35,7 +35,9 @@ internal HandleType(IRTypeContext typeContext) /// /// Creates an object type. /// - protected override Type GetManagedType() => typeof(object); + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeof(object); #endregion diff --git a/Src/ILGPU/IR/Types/PaddingType.cs b/Src/ILGPU/IR/Types/PaddingType.cs index 9612a5e99..eda3aeba2 100644 --- a/Src/ILGPU/IR/Types/PaddingType.cs +++ b/Src/ILGPU/IR/Types/PaddingType.cs @@ -58,7 +58,9 @@ internal PaddingType(IRTypeContext typeContext, PrimitiveType primitiveType) /// /// Returns the corresponding managed basic value type. 
/// - protected override Type GetManagedType() => BasicValueType.GetManagedType(); + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeProvider.GetPrimitiveType(PrimitiveType); #endregion diff --git a/Src/ILGPU/IR/Types/PointerTypes.cs b/Src/ILGPU/IR/Types/PointerTypes.cs index 7e86feabf..0718b7597 100644 --- a/Src/ILGPU/IR/Types/PointerTypes.cs +++ b/Src/ILGPU/IR/Types/PointerTypes.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2018-2022 ILGPU Project +// Copyright (c) 2018-2023 ILGPU Project // www.ilgpu.net // // File: PointerTypes.cs @@ -194,11 +194,20 @@ internal PointerType( #region Methods + /// + /// Creates a default managed view type. + /// + internal Type GetDefaultManagedPointerType( + TTypeProvider typeProvider) + where TTypeProvider : IManagedTypeProvider => + ElementType.LoadManagedType(typeProvider).MakePointerType(); + /// /// Creates a managed pointer type. /// - protected override Type GetManagedType() => - ElementType.LoadManagedType().MakePointerType(); + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeProvider.GetPointerType(this); #endregion @@ -253,11 +262,20 @@ internal ViewType( #region Methods /// - /// Creates a managed view type. + /// Creates a default managed view type. /// - protected override Type GetManagedType() => + internal Type GetDefaultManagedViewType( + TTypeProvider typeProvider) + where TTypeProvider : IManagedTypeProvider => typeof(ArrayView<>).MakeGenericType( - ElementType.LoadManagedType()); + ElementType.LoadManagedType(typeProvider)); + + /// + /// Creates a managed view type. 
+ /// + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeProvider.GetViewType(this); #endregion diff --git a/Src/ILGPU/IR/Types/PrimitiveTypes.cs b/Src/ILGPU/IR/Types/PrimitiveTypes.cs index 649ff5ed8..4db2c927d 100644 --- a/Src/ILGPU/IR/Types/PrimitiveTypes.cs +++ b/Src/ILGPU/IR/Types/PrimitiveTypes.cs @@ -12,6 +12,7 @@ using ILGPU.Util; using System; using System.Collections.Immutable; +using System.Runtime.CompilerServices; namespace ILGPU.IR.Types { @@ -118,7 +119,9 @@ internal PrimitiveType(IRTypeContext typeContext, BasicValueType basicValueType) /// /// Returns the corresponding managed basic value type. /// - protected override Type GetManagedType() => BasicValueType.GetManagedType(); + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeProvider.GetPrimitiveType(this); #endregion @@ -169,7 +172,9 @@ internal StringType(IRTypeContext typeContext) /// /// Returns the corresponding managed basic value type. /// - protected override Type GetManagedType() => typeof(string); + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeof(string); #endregion diff --git a/Src/ILGPU/IR/Types/StructureType.cs b/Src/ILGPU/IR/Types/StructureType.cs index 30e9558d2..32ec82640 100644 --- a/Src/ILGPU/IR/Types/StructureType.cs +++ b/Src/ILGPU/IR/Types/StructureType.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2018-2022 ILGPU Project +// Copyright (c) 2018-2023 ILGPU Project // www.ilgpu.net // // File: StructureType.cs @@ -680,6 +680,14 @@ typeNode is StructureType structureType ? structureType.NumFields : 1; + /// + /// Gets the field name of a managed structure type. + /// + /// The field index. + /// The managed field name within a structure type. 
+ public static string GetFieldName(int fieldIndex) => + "Field" + fieldIndex; + #endregion #region Instance @@ -905,7 +913,8 @@ private void SliceRecursive( /// /// Creates a managed type that corresponds to this structure type. /// - protected override Type GetManagedType() + internal Type GetDefaultManagedType(TTypeProvider typeProvider) + where TTypeProvider : IManagedTypeProvider { using var scopedLock = RuntimeSystem.DefineRuntimeStruct( out var typeBuilder); @@ -913,14 +922,21 @@ protected override Type GetManagedType() foreach (var type in DirectFields) { typeBuilder.DefineField( - "Field" + index++, - type.LoadManagedType(), + GetFieldName(index++), + type.LoadManagedType(typeProvider), FieldAttributes.Public); } return typeBuilder.CreateType(); } + /// + /// Creates a managed type that corresponds to this structure type. + /// + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeProvider.GetStructureType(this); + #endregion #region IEnumerable diff --git a/Src/ILGPU/IR/Types/TypeNode.cs b/Src/ILGPU/IR/Types/TypeNode.cs index 0c1887759..961f29a2d 100644 --- a/Src/ILGPU/IR/Types/TypeNode.cs +++ b/Src/ILGPU/IR/Types/TypeNode.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2018-2022 ILGPU Project +// Copyright (c) 2018-2023 ILGPU Project // www.ilgpu.net // // File: TypeNode.cs @@ -10,6 +10,7 @@ // --------------------------------------------------------------------------------------- using ILGPU.Resources; +using ILGPU.Util; using System; using System.Runtime.CompilerServices; @@ -61,7 +62,50 @@ public interface ITypeNode : INode /// /// The type representation in the managed world. /// - Type LoadManagedType(); + /// The type provider to use. + Type LoadManagedType(TTypeProvider typeProvider) + where TTypeProvider : IManagedTypeProvider; + } + + /// + /// An abstract type provider to convert IR types to managed types. 
+ /// + public interface IManagedTypeProvider + { + /// + /// Gets the managed type for the given primitive type. + /// + /// The current primitive type. + /// The managed primitive representation. + Type GetPrimitiveType(PrimitiveType primitiveType); + + /// + /// Converts the given view type to a managed array representation. + /// + /// The current array type. + /// The managed array representation. + Type GetArrayType(ArrayType arrayType); + + /// + /// Converts the given view type to a managed pointer representation. + /// + /// The current pointer type. + /// The managed pointer representation. + Type GetPointerType(PointerType pointerType); + + /// + /// Converts the given view type to a managed view representation. + /// + /// The current view type. + /// The managed view representation. + Type GetViewType(ViewType viewType); + + /// + /// Converts the given structure type to a managed view representation. + /// + /// The current structure type. + /// The managed structure representation. + Type GetStructureType(StructureType structureType); } /// @@ -96,6 +140,47 @@ public static int Align(int offset, int fieldAlignment) => #endregion + #region Nested Types + + /// + /// A simple loop-back type provider. + /// + public readonly struct ScalarManagedTypeProvider : IManagedTypeProvider + { + /// + /// Returns the default managed type for the given primitive one. + /// + public Type GetPrimitiveType(PrimitiveType primitiveType) => + primitiveType.BasicValueType.GetManagedType(); + + /// + /// Returns the default managed array type for the given array type. + /// + public Type GetArrayType(ArrayType arrayType) => + arrayType.GetDefaultManagedType(this); + + /// + /// Returns the default pointer type implementation. + /// + public Type GetPointerType(PointerType pointerType) => + pointerType.GetDefaultManagedPointerType(this); + + /// + /// Returns the default view type implementation. 
+ /// + public Type GetViewType(ViewType viewType) => + viewType.GetDefaultManagedViewType(this); + + /// + /// Returns the default structure type implementation reflecting the basic + /// type hierarchy. + /// + public Type GetStructureType(StructureType structureType) => + structureType.GetDefaultManagedType(this); + } + + #endregion + #region Instance /// @@ -126,7 +211,7 @@ protected TypeNode(IRTypeContext typeContext) public IRTypeContext TypeContext { get; } /// - /// Returns the urrent runtime system. + /// Returns the current runtime system. /// public RuntimeSystem RuntimeSystem => TypeContext.RuntimeSystem; @@ -222,10 +307,20 @@ protected TypeNode(IRTypeContext typeContext) #region Methods + /// + /// The type representation in the managed world by using the default type + /// provider instance that emits scalar managed types. + /// + public Type LoadManagedType() => + managedType ??= LoadManagedType(new ScalarManagedTypeProvider()); + /// /// The type representation in the managed world. /// - public Type LoadManagedType() => managedType ??= GetManagedType(); + public Type LoadManagedType( + TTypeProvider typeProvider) + where TTypeProvider : IManagedTypeProvider => + GetManagedType(typeProvider); /// /// Returns true if the given flags are set. @@ -245,7 +340,8 @@ public bool HasFlags(TypeFlags typeFlags) => /// Creates a managed type that corresponds to this IR type. /// /// The created managed type. - protected abstract Type GetManagedType(); + protected abstract Type GetManagedType(TTypeProvider typeProvider) + where TTypeProvider : IManagedTypeProvider; /// /// Converts the current type to the given type . diff --git a/Src/ILGPU/IR/Types/VoidType.cs b/Src/ILGPU/IR/Types/VoidType.cs index ce31f0aee..df6b63fdc 100644 --- a/Src/ILGPU/IR/Types/VoidType.cs +++ b/Src/ILGPU/IR/Types/VoidType.cs @@ -42,7 +42,9 @@ internal VoidType(IRTypeContext typeContext) /// /// Returns the void type. 
/// - protected override Type GetManagedType() => typeof(void); + protected override Type GetManagedType( + TTypeProvider typeProvider) => + typeof(void); #endregion diff --git a/Src/ILGPU/Interop.cs b/Src/ILGPU/Interop.cs index dab8bef6f..524cec5be 100644 --- a/Src/ILGPU/Interop.cs +++ b/Src/ILGPU/Interop.cs @@ -246,7 +246,7 @@ internal static string GetWriteLineFormat(string format) /// /// The expression format to write. /// All elements to write in string format. - private static void WriteImplementation( + internal static void WriteImplementation( string format, params string[] elements) => Console.Write(format, elements); diff --git a/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs b/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs index 1d5610f81..066d3aee7 100644 --- a/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs +++ b/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs @@ -1,7 +1,6 @@ -//------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ // // This code was generated by a tool. -// Runtime Version:4.0.30319.42000 // // Changes to this file may cause incorrect behavior and will be lost if // the code is regenerated. @@ -14,12 +13,10 @@ namespace ILGPU.Resources { /// /// A strongly-typed resource class, for looking up localized strings, etc. + /// This class was generated by MSBuild using the GenerateResource task. + /// To add or remove a member, edit your .resx file then rerun MSBuild. /// - // This class was auto-generated by the StronglyTypedResourceBuilder - // class via a tool like ResGen or Visual Studio. - // To add or remove a member, edit your .ResX file then rerun ResGen - // with the /str option, or rebuild your VS project. 
- [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "17.0.0.0")] + [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.Build.Tasks.StronglyTypedResourceBuilder", "15.1.0.0")] [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] internal class RuntimeErrorMessages { @@ -428,5 +425,23 @@ internal static string UnknownParentAccelerator { return ResourceManager.GetString("UnknownParentAccelerator", resourceCulture); } } + + /// + /// Looks up a localized string similar to The Velocity accelerator supports little-endian machines only. + /// + internal static string VelocityLittleEndian { + get { + return ResourceManager.GetString("VelocityLittleEndian", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Velocity accelerator requires 64-bit application ({0} not supported). Ensure Prefer32Bit is set to 'false'. + /// + internal static string VelocityPlatform64 { + get { + return ResourceManager.GetString("VelocityPlatform64", resourceCulture); + } + } } } diff --git a/Src/ILGPU/Resources/RuntimeErrorMessages.resx b/Src/ILGPU/Resources/RuntimeErrorMessages.resx index eb4c0cfbd..6674fc3d5 100644 --- a/Src/ILGPU/Resources/RuntimeErrorMessages.resx +++ b/Src/ILGPU/Resources/RuntimeErrorMessages.resx @@ -1,17 +1,17 @@  - @@ -240,4 +240,10 @@ Unknown parent accelerator + + Velocity accelerator requires 64-bit application ({0} not supported). Ensure Prefer32Bit is set to 'false' + + + The Velocity accelerator supports little-endian machines only + \ No newline at end of file diff --git a/Src/ILGPU/Runtime/Accelerator.cs b/Src/ILGPU/Runtime/Accelerator.cs index cc2b4c33e..879e0cb43 100644 --- a/Src/ILGPU/Runtime/Accelerator.cs +++ b/Src/ILGPU/Runtime/Accelerator.cs @@ -30,6 +30,11 @@ public enum AcceleratorType : int /// CPU, + /// + /// Represents a SIMD CPU performance accelerator. 
+ /// + Velocity, + /// /// Represents a Cuda accelerator. /// diff --git a/Src/ILGPU/Runtime/CPU/CPUAccelerator.cs b/Src/ILGPU/Runtime/CPU/CPUAccelerator.cs index 62d75ba2e..c54d57b4a 100644 --- a/Src/ILGPU/Runtime/CPU/CPUAccelerator.cs +++ b/Src/ILGPU/Runtime/CPU/CPUAccelerator.cs @@ -12,6 +12,7 @@ using ILGPU.Backends; using ILGPU.Backends.IL; using ILGPU.Resources; +using ILGPU.Runtime.Velocity; using System; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; @@ -250,12 +251,13 @@ protected override void OnUnbind() { } /// protected override bool CanAccessPeerInternal(Accelerator otherAccelerator) => - otherAccelerator as CPUAccelerator != null; + otherAccelerator is CPUAccelerator || + otherAccelerator is VelocityAccelerator; /// protected override void EnablePeerAccessInternal(Accelerator otherAccelerator) { - if (otherAccelerator as CPUAccelerator == null) + if (!CanAccessPeerInternal(otherAccelerator)) { throw new InvalidOperationException( RuntimeErrorMessages.CannotEnablePeerAccessToOtherAccelerator); @@ -266,7 +268,7 @@ protected override void EnablePeerAccessInternal(Accelerator otherAccelerator) protected override void DisablePeerAccessInternal( Accelerator otherAccelerator) => Debug.Assert( - otherAccelerator is CPUAccelerator, + CanAccessPeerInternal(otherAccelerator), "Invalid EnablePeerAccess method"); #endregion @@ -482,7 +484,7 @@ protected override int EstimateGroupSizeInternal( #region Page Lock Scope /// - protected unsafe override PageLockScope CreatePageLockFromPinnedInternal( + protected override PageLockScope CreatePageLockFromPinnedInternal( IntPtr pinned, long numElements) { diff --git a/Src/ILGPU/Runtime/CPU/CPUMemoryBuffer.cs b/Src/ILGPU/Runtime/CPU/CPUMemoryBuffer.cs index 9ce5c268d..86e9cb4b4 100644 --- a/Src/ILGPU/Runtime/CPU/CPUMemoryBuffer.cs +++ b/Src/ILGPU/Runtime/CPU/CPUMemoryBuffer.cs @@ -91,8 +91,12 @@ public static void CPUCopyFrom( in ArrayView targetView) where T : unmanaged { - if 
(targetView.GetAcceleratorType() != AcceleratorType.CPU) + switch (targetView.GetAcceleratorType()) { + case AcceleratorType.CPU: + case AcceleratorType.Velocity: + break; + default: throw new NotSupportedException( RuntimeErrorMessages.NotSupportedTargetAccelerator); } @@ -106,6 +110,7 @@ public static void CPUCopyFrom( switch (sourceView.GetAcceleratorType()) { case AcceleratorType.CPU: + case AcceleratorType.Velocity: // Copy from CPU to CPU CPUCopyToCPU( ref sourceView.LoadEffectiveAddress(), @@ -156,10 +161,14 @@ public static void CPUCopyTo( in ArrayView targetView) where T : unmanaged { - if (sourceView.GetAcceleratorType() != AcceleratorType.CPU) + switch (sourceView.GetAcceleratorType()) { - throw new NotSupportedException( - RuntimeErrorMessages.NotSupportedTargetAccelerator); + case AcceleratorType.CPU: + case AcceleratorType.Velocity: + break; + default: + throw new NotSupportedException( + RuntimeErrorMessages.NotSupportedTargetAccelerator); } if (targetView.Length > sourceView.Length) throw new ArgumentOutOfRangeException(nameof(sourceView)); @@ -171,6 +180,7 @@ public static void CPUCopyTo( switch (targetView.GetAcceleratorType()) { case AcceleratorType.CPU: + case AcceleratorType.Velocity: // Copy from CPU to CPU CPUCopyToCPU( ref sourceView.LoadEffectiveAddress(), @@ -222,11 +232,14 @@ public static void CPUCopy( in ArrayView targetView) where T : unmanaged { - if (sourceView.GetAcceleratorType() == AcceleratorType.CPU) + if (sourceView.GetAcceleratorType() == AcceleratorType.CPU || + sourceView.GetAcceleratorType() == AcceleratorType.Velocity) { CPUCopyTo(stream, sourceView, targetView); } - else if (targetView.GetAcceleratorType() == AcceleratorType.CPU) + else if ( + targetView.GetAcceleratorType() == AcceleratorType.CPU || + targetView.GetAcceleratorType() == AcceleratorType.Velocity) { CPUCopyFrom(stream, sourceView, targetView); } diff --git a/Src/ILGPU/Runtime/CPU/CPURuntimeContext.cs index
032ae6945..a94538d8b 100644 --- a/Src/ILGPU/Runtime/CPU/CPURuntimeContext.cs +++ b/Src/ILGPU/Runtime/CPU/CPURuntimeContext.cs @@ -214,7 +214,7 @@ protected void ReleaseLock( where TParent : IParent { // If we are the main thread, release the lock by issuing an atomic - // exchange operation in order to be visible by another AquireLock + // exchange operation in order to be visible by another AcquireLock // operation that might be executed in the future. if (isMainThread) Interlocked.Exchange(ref memoryLock, 0); diff --git a/Src/ILGPU/Runtime/Device.cs b/Src/ILGPU/Runtime/Device.cs index 4517450cf..344fa8153 100644 --- a/Src/ILGPU/Runtime/Device.cs +++ b/Src/ILGPU/Runtime/Device.cs @@ -152,7 +152,7 @@ protected Device() public int MaxNumThreadsPerGroup { get; protected set; } /// - /// Returns the maximum number of shared memory per thread group in bytes. + /// Returns the maximum shared memory per thread group in bytes. /// public int MaxSharedMemoryPerGroup { get; protected set; } diff --git a/Src/ILGPU/Runtime/IAcceleratorExtensionProvider.cs b/Src/ILGPU/Runtime/IAcceleratorExtensionProvider.cs index a54a352e7..fe6477a4d 100644 --- a/Src/ILGPU/Runtime/IAcceleratorExtensionProvider.cs +++ b/Src/ILGPU/Runtime/IAcceleratorExtensionProvider.cs @@ -12,6 +12,7 @@ using ILGPU.Runtime.CPU; using ILGPU.Runtime.Cuda; using ILGPU.Runtime.OpenCL; +using ILGPU.Runtime.Velocity; namespace ILGPU.Runtime { @@ -28,6 +29,13 @@ public interface IAcceleratorExtensionProvider /// The created extension. TExtension CreateCPUExtension(CPUAccelerator accelerator); + /// + /// Creates an extension for a Velocity accelerator. + /// + /// The target accelerator. + /// The created extension. + TExtension CreateVelocityExtension(VelocityAccelerator accelerator); + /// /// Creates an extension for a Cuda accelerator. 
/// diff --git a/Src/ILGPU/Runtime/KernelLauncherBuilder.cs b/Src/ILGPU/Runtime/KernelLauncherBuilder.cs index 461f8ba8d..a1d41cd4c 100644 --- a/Src/ILGPU/Runtime/KernelLauncherBuilder.cs +++ b/Src/ILGPU/Runtime/KernelLauncherBuilder.cs @@ -123,7 +123,7 @@ public static void EmitLoadKernelConfig( int customGroupSize = 0) where TEmitter : struct, IILEmitter { - if (entryPoint.IsImplictlyGrouped) + if (entryPoint.IsImplicitlyGrouped) { Debug.Assert(customGroupSize >= 0, "Invalid custom group size"); diff --git a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs new file mode 100644 index 000000000..b1e4a85e6 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs @@ -0,0 +1,512 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityAccelerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends; +using ILGPU.Backends.IL; +using ILGPU.Backends.Velocity; +using ILGPU.Resources; +using ILGPU.Runtime.CPU; +using System; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Reflection.Emit; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// A SIMD-enabled CPU-based accelerator. + /// + public sealed class VelocityAccelerator : Accelerator + { + #region Static + + /// + /// The internal run method to launch kernels. 
+ /// + private static readonly MethodInfo RunMethodInfo = + typeof(VelocityAccelerator).GetMethod( + nameof(Run), + BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + + #endregion + + #region Instance + + private readonly VelocityMultiprocessor[] multiprocessors; + + [SuppressMessage( + "Microsoft.Usage", + "CA2213: Disposable fields should be disposed", + Justification = "This is disposed in DisposeAccelerator_SyncRoot")] + private readonly SemaphoreSlim taskConcurrencyLimit = new SemaphoreSlim(1); + + [SuppressMessage( + "Microsoft.Usage", + "CA2213: Disposable fields should be disposed", + Justification = "This is disposed in DisposeAccelerator_SyncRoot")] + private readonly Barrier multiprocessorBarrier; + + /// + /// Constructs a new Velocity accelerator. + /// + /// The ILGPU context. + /// The Velocity device. + /// + /// The thread priority of the execution threads. + /// + internal VelocityAccelerator( + Context context, + VelocityDevice device, + ThreadPriority threadPriority) + : base(context, device) + { + if (!device.IsLittleEndian) + { + throw new NotSupportedException( + RuntimeErrorMessages.VelocityLittleEndian); + } + + NativePtr = new IntPtr(2); + DefaultStream = CreateStreamInternal(); + + multiprocessors = new VelocityMultiprocessor[device.NumMultiprocessors]; + multiprocessorBarrier = new Barrier(device.NumMultiprocessors + 1); + ThreadPriority = threadPriority; + MaxLocalMemoryPerThread = device.MaxLocalMemoryPerThread; + NumThreads = device.WarpSize * device.NumMultiprocessors; + + // Initialize all multiprocessors + Action processingCompleted = OnProcessingCompleted; + for (int i = 0; i < device.NumMultiprocessors; ++i) + { + var multiProcessor = new VelocityMultiprocessor(this, i); + multiProcessor.ProcessingCompleted += processingCompleted; + multiprocessors[i] = multiProcessor; + } + + // Init the underlying Velocity backend + Init(new VelocityBackend( + context, + new CPUCapabilityContext(), + WarpSize, + new 
VelocityArgumentMapper(context))); + } + + #endregion + + #region Properties + + /// + /// Returns the Velocity backend of this accelerator. + /// + internal new VelocityBackend Backend => + base.Backend as VelocityBackend; + + /// + /// Returns the current thread priority. + /// + public ThreadPriority ThreadPriority { get; } + + /// + /// Returns the maximum local memory per thread in bytes. + /// + public int MaxLocalMemoryPerThread { get; } + + /// + /// Returns the maximum number of parallel threads. + /// + public int NumThreads { get; } + + #endregion + + #region Launch Methods + + /// + /// Main internal run method to launch loaded kernels. + /// + /// The user-defined kernel config. + /// + /// The actual runtime kernel config to be used for launching. + /// + /// The kernel entry point handler. + /// + /// The current velocity kernel parameters. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void Run( + KernelConfig userKernelConfig, + RuntimeKernelConfig runtimeKernelConfig, + VelocityKernelEntryPoint kernelHandler, + VelocityParameters velocityParameters) + { + // Avoid concurrent executions of kernels.. 
we have to wait for the current + // kernel to finish first + taskConcurrencyLimit.Wait(); + + // Determine actual thread-grid sizes + int gridSize = runtimeKernelConfig.GridDim.Size; + int groupSize = runtimeKernelConfig.GroupDim.Size; + Debug.Assert(groupSize <= WarpSize, "Invalid group size"); + + // Distribute the workload + int numActiveMPs = Math.Min( + IntrinsicMath.DivRoundUp(gridSize, NumMultiprocessors), + NumMultiprocessors); + + // Compute the chunk size per multiprocessor and adjust it to be a multiple + // of the warp size to avoid holes in the processing grid + int chunkSizePerMP = IntrinsicMath.DivRoundUp(gridSize, numActiveMPs); + chunkSizePerMP *= groupSize; + + // Setup multiprocessor barrier + int upperBound = numActiveMPs + 1; + int participantCount = multiprocessorBarrier.ParticipantCount; + if (participantCount > upperBound) + multiprocessorBarrier.RemoveParticipants(participantCount - upperBound); + else if (participantCount < upperBound) + multiprocessorBarrier.AddParticipants(upperBound - participantCount); + + try + { + // Start the multiprocessor journey + int totalGridSize = Math.Min( + gridSize * groupSize, + (int)userKernelConfig.Size); + for (int i = 0; i < numActiveMPs; ++i) + { + int startIndex = i * chunkSizePerMP; + int endIndex = Math.Min(startIndex + chunkSizePerMP, totalGridSize); + multiprocessors[i].Run( + kernelHandler, + startIndex, + endIndex, + gridSize, + groupSize, + velocityParameters); + } + + // Wait for all multiprocessors to finish + multiprocessorBarrier.SignalAndWait(); + } + finally + { + taskConcurrencyLimit.Release(); + } + } + + /// + /// Once processing is completed in the scope of each multiprocessor, + /// + /// + private void OnProcessingCompleted(VelocityMultiprocessor processor) => + multiprocessorBarrier.SignalAndWait(); + + /// + /// Generates a dynamic kernel-launcher method that will be just-in-time compiled + /// during the first invocation. 
Using the generated launcher lowers the overhead + /// for kernel launching dramatically, since unnecessary operations (like boxing) + /// can be avoided. + /// + /// The kernel to generate a launcher for. + /// + /// The custom group size for the launching operation. + /// + /// The generated launcher method. + private MethodInfo GenerateKernelLauncherMethod( + VelocityCompiledKernel kernel, + int customGroupSize) + { + var entryPoint = kernel.EntryPoint; + AdjustAndVerifyKernelGroupSize(ref customGroupSize, entryPoint); + + // Add support for by ref parameters + if (entryPoint.HasByRefParameters) + { + throw new NotSupportedException( + ErrorMessages.NotSupportedByRefKernelParameters); + } + + // Declare a new launcher method + using var scopedLock = entryPoint.CreateLauncherMethod( + Context.RuntimeSystem, + out var launcher); + var emitter = new ILEmitter(launcher.ILGenerator); + + // Map all arguments to an argument structure containing mapped views + var argumentMapper = Backend.ArgumentMapper; + var (structLocal, _) = argumentMapper.Map(emitter, entryPoint); + + var velocityKernel = emitter.DeclareLocal(typeof(VelocityKernel)); + KernelLauncherBuilder.EmitLoadKernelArgument( + Kernel.KernelInstanceParamIdx, emitter); + emitter.Emit(LocalOperation.Store, velocityKernel); + + // Create an instance of the custom parameters type + var parametersInstance = emitter.DeclarePinnedLocal(kernel.ParametersType); + emitter.Emit(OpCodes.Ldnull); + emitter.Emit(LocalOperation.Store, parametersInstance); + { + // Assign parameters + var parameters = entryPoint.Parameters; + for (int i = 0, e = parameters.Count; i < e; ++i) + { + // Load native address onto stack + emitter.Emit(LocalOperation.LoadAddress, structLocal); + emitter.LoadFieldAddress(structLocal.VariableType, i); + emitter.Emit(OpCodes.Conv_I); + } + + // Create new task object + emitter.EmitNewObject(kernel.ParametersTypeConstructor); + + // Store task + emitter.Emit(LocalOperation.Store, parametersInstance); + 
} + + // Load the kernel delegate + emitter.Emit(LocalOperation.Load, velocityKernel); + emitter.EmitCall(VelocityKernel.GetVelocityAccelerator); + + // Load custom user dimension + KernelLauncherBuilder.EmitLoadKernelConfig( + entryPoint, + emitter, + Kernel.KernelParamDimensionIdx, + MaxGridSize, + MaxGroupSize); + + // Load dimensions + KernelLauncherBuilder.EmitLoadRuntimeKernelConfig( + entryPoint, + emitter, + Kernel.KernelParamDimensionIdx, + MaxGridSize, + MaxGroupSize, + customGroupSize); + + // Load the kernel delegate + emitter.Emit(LocalOperation.Load, velocityKernel); + emitter.EmitCall(VelocityKernel.GetKernelExecutionDelegate); + + // Load the parameters object + emitter.Emit(LocalOperation.Load, parametersInstance); + + // Launch kernel execution + emitter.EmitCall(RunMethodInfo); + + // End of launch method + emitter.Emit(OpCodes.Ret); + emitter.Finish(); + + return launcher.Finish(); + } + + #endregion + + /// + public override TExtension CreateExtension< + TExtension, + TExtensionProvider>(TExtensionProvider provider) => + provider.CreateVelocityExtension(this); + + /// + protected override MemoryBuffer AllocateRawInternal( + long length, + int elementSize) => + new VelocityMemoryBuffer(this, length, elementSize); + + /// + /// Loads the given kernel. + /// + /// The kernel to load. + /// The custom group size. + /// The loaded kernel + private Kernel LoadKernel(CompiledKernel kernel, int customGroupSize) + { + if (kernel is null) + throw new ArgumentNullException(nameof(kernel)); + if (!(kernel is VelocityCompiledKernel compiledKernel)) + { + throw new NotSupportedException( + RuntimeErrorMessages.NotSupportedKernel); + } + + var launcherMethod = GenerateKernelLauncherMethod( + compiledKernel, + customGroupSize); + return new VelocityKernel( + this, + compiledKernel, + launcherMethod); + } + + /// + /// Loads a default kernel. 
+ /// + protected override Kernel LoadKernelInternal(CompiledKernel kernel) => + LoadKernel(kernel, 0); + + /// + /// Loads an implicitly grouped kernel. + /// + protected override Kernel LoadImplicitlyGroupedKernelInternal( + CompiledKernel kernel, + int customGroupSize, + out KernelInfo kernelInfo) + { + if (customGroupSize < 0) + throw new ArgumentOutOfRangeException(nameof(customGroupSize)); + kernelInfo = KernelInfo.CreateFrom( + kernel.Info, + customGroupSize, + null); + return LoadKernel(kernel, customGroupSize); + } + + /// + /// Loads an auto grouped kernel. + /// + protected override Kernel LoadAutoGroupedKernelInternal( + CompiledKernel kernel, + out KernelInfo kernelInfo) + { + var result = LoadKernel(kernel, WarpSize); + kernelInfo = new KernelInfo(WarpSize, NumThreads / WarpSize); + return result; + } + + /// + protected override AcceleratorStream CreateStreamInternal() => + new VelocityStream(this); + + /// + protected override void SynchronizeInternal() { } + + /// + protected override void OnBind() { } + + /// + protected override void OnUnbind() { } + + #region Peer Access + + /// + protected override bool CanAccessPeerInternal(Accelerator otherAccelerator) => + otherAccelerator is CPUAccelerator || + otherAccelerator is VelocityAccelerator; + + /// + protected override void EnablePeerAccessInternal(Accelerator otherAccelerator) + { + if (!CanAccessPeerInternal(otherAccelerator)) + { + throw new InvalidOperationException( + RuntimeErrorMessages.CannotEnablePeerAccessToOtherAccelerator); + } + } + + /// + protected override void DisablePeerAccessInternal( + Accelerator otherAccelerator) => + Debug.Assert( + CanAccessPeerInternal(otherAccelerator), + "Invalid EnablePeerAccess method"); + + #endregion + + #region Occupancy + + /// + protected override int EstimateMaxActiveGroupsPerMultiprocessorInternal( + Kernel kernel, + int groupSize, + int dynamicSharedMemorySizeInBytes) => + kernel is VelocityKernel + ? groupSize > MaxGroupSize.Size ? 
0 : NumMultiprocessors + : throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel); + + /// + protected override int EstimateGroupSizeInternal( + Kernel kernel, + Func computeSharedMemorySize, + int maxGroupSize, + out int minGridSize) + { + if (!(kernel is VelocityKernel)) + throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel); + + // Estimation + minGridSize = NumThreads; + return Math.Min(maxGroupSize, MaxGroupSize.Size); + } + + /// + protected override int EstimateGroupSizeInternal( + Kernel kernel, + int dynamicSharedMemorySizeInBytes, + int maxGroupSize, + out int minGridSize) + { + if (!(kernel is VelocityKernel)) + throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel); + + // Estimation + minGridSize = NumThreads; + return 1; + } + + #endregion + + #region Page Lock Scope + + /// + protected override PageLockScope CreatePageLockFromPinnedInternal( + IntPtr pinned, + long numElements) + { + Trace.WriteLine(RuntimeErrorMessages.NotSupportedPageLock); + return new NullPageLockScope(this, pinned, numElements); + } + + #endregion + + #region IDisposable + + /// + /// Dispose all managed resources allocated by this CPU accelerator instance. 
+ /// + protected override void DisposeAccelerator_SyncRoot(bool disposing) + { + if (!disposing) + return; + + // Dispose task engine + taskConcurrencyLimit.Wait(); + + // Dispose all multiprocessors + foreach (var multiprocessor in multiprocessors) + multiprocessor.Dispose(); + + // Dispose barriers + taskConcurrencyLimit.Dispose(); + multiprocessorBarrier.Dispose(); + } + + #endregion + + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs new file mode 100644 index 000000000..cd3872c2c --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs @@ -0,0 +1,73 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityContextExtensions.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends; +using ILGPU.Resources; +using System; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Cuda specific context extensions. + /// + public static class VelocityContextExtensions + { + #region Builder + + /// + /// Enables all velocity devices. + /// + /// The builder instance. + /// + /// The maximum number bytes of shared memory per group. + /// + /// The updated builder instance. 
+ public static Context.Builder Velocity( + this Context.Builder builder, + int maxSharedMemoryPerGroup = VelocityDevice.MinSharedMemoryPerGroup) + { + if (!Backend.RuntimePlatform.Is64Bit()) + { + throw new NotSupportedException(string.Format( + RuntimeErrorMessages.VelocityPlatform64, + Backend.RuntimePlatform)); + } + + builder.DeviceRegistry.Register(new VelocityDevice()); + return builder; + } + + #endregion + + #region Context + + /// + /// Gets a registered Velocity device. + /// + /// The ILGPU context. + /// The registered Velocity device. + public static VelocityDevice GetVelocityDevice(this Context context) => + context.GetDevice(0); + + /// + /// Creates a new Velocity accelerator. + /// + /// The ILGPU context. + /// The created Velocity accelerator. + public static VelocityAccelerator CreateVelocityAccelerator( + this Context context) => + context.GetVelocityDevice().CreateVelocityAccelerator(context); + + #endregion + } + +} + diff --git a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs new file mode 100644 index 000000000..d0126dad6 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs @@ -0,0 +1,165 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityDevice.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using System; +using System.Threading; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a software-emulated velocity device for high-performance execution of + /// tasks on the CPU using vectorization. 
+ /// + [DeviceType(AcceleratorType.Velocity)] + public sealed class VelocityDevice : Device + { + #region Constants + + /// + /// The default maximum amount of shared memory in bytes (1024k). + /// + public const int MinSharedMemoryPerGroup = 1 << 20; + + #endregion + + #region Instance + + /// + /// Creates a new velocity device with the default amount of shared memory per + /// group (refer to for more + /// information about the default size). + /// + public VelocityDevice() + : this(Environment.ProcessorCount) + { } + + /// + /// Creates a new velocity device with the default amount of shared memory per + /// group (refer to for more + /// information about the default size). + /// + public VelocityDevice(int numMultiprocessors) + : this(numMultiprocessors, MinSharedMemoryPerGroup) + { } + + /// + /// Creates a new velocity device using the given amount of shared memory (min + /// amount is per group). + /// + /// + /// The maximum amount of shared memory per group in bytes. + /// + /// + /// The number of multiprocessors to use. 
+ /// + public VelocityDevice(int numMultiprocessors, int maxSharedMemoryPerGroup) + { + if (numMultiprocessors < 1) + throw new ArgumentOutOfRangeException(nameof(numMultiprocessors)); + if (maxSharedMemoryPerGroup < MinSharedMemoryPerGroup) + throw new ArgumentOutOfRangeException(nameof(maxSharedMemoryPerGroup)); + + Name = nameof(VelocityAccelerator); + WarpSize = VelocityWarp32.RawVectorLength; + MinWarpSize = VelocityWarp64.RawVectorLength; + MaxNumThreadsPerGroup = MaxNumThreadsPerMultiprocessor = WarpSize; + NumMultiprocessors = numMultiprocessors; + MaxGroupSize = new Index3D( + MaxNumThreadsPerGroup, + 1, + 1); + + MemorySize = long.MaxValue; + MaxGridSize = new Index3D(int.MaxValue, ushort.MaxValue, ushort.MaxValue); + MaxSharedMemoryPerGroup = maxSharedMemoryPerGroup; + MaxConstantMemory = int.MaxValue; + NumThreads = MaxNumThreads; + + // Get the endian type from the global BitConverter class + IsLittleEndian = BitConverter.IsLittleEndian; + + // Allocate a sufficient amount of local memory per thread equal to + // the maximum number of shared memory per group in bytes + MaxLocalMemoryPerThread = maxSharedMemoryPerGroup; + } + + #endregion + + #region Properties + + /// + /// Returns the minimum warp size of this device. + /// + public int MinWarpSize { get; } + + /// + /// Returns the number of threads. + /// + public int NumThreads { get; } + + /// + /// Returns true if this device operates in little endian mode. + /// + public bool IsLittleEndian { get; } + + /// + /// Returns the maximum local memory per thread in bytes. + /// + public int MaxLocalMemoryPerThread { get; } + + #endregion + + #region Methods + + /// + public override Accelerator CreateAccelerator(Context context) => + CreateVelocityAccelerator(context); + + /// + /// Creates a new performance CPU accelerator using and the default thread + /// priority. + /// + /// The ILGPU context. + /// The created CPU accelerator. 
+ public VelocityAccelerator CreateVelocityAccelerator( + Context context) => + CreateVelocityAccelerator(context, ThreadPriority.Normal); + + /// + /// Creates a new performance CPU accelerator using and the default thread + /// priority. + /// + /// The ILGPU context. + /// + /// The thread priority of the execution threads. + /// + /// The created CPU accelerator. + public VelocityAccelerator CreateVelocityAccelerator( + Context context, + ThreadPriority threadPriority) => + new VelocityAccelerator(context, this, threadPriority); + + #endregion + + #region Object + + /// + public override bool Equals(object obj) => + obj is VelocityDevice device && + device.MaxSharedMemoryPerGroup == MaxSharedMemoryPerGroup && + base.Equals(obj); + + /// + public override int GetHashCode() => base.GetHashCode() ^ MaxSharedMemoryPerGroup; + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs new file mode 100644 index 000000000..4db0817d9 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs @@ -0,0 +1,88 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityKernel.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.Velocity; +using System.Reflection; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a single Velocity kernel. + /// + public sealed class VelocityKernel : Kernel + { + #region Static + + /// + /// Represents the property getter. 
+ /// + internal static readonly MethodInfo GetVelocityAccelerator = + typeof(VelocityKernel).GetProperty( + nameof(VelocityAccelerator), + BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance) + .GetGetMethod(true); + + /// + /// Represents the property getter. + /// + internal static readonly MethodInfo GetKernelExecutionDelegate = + typeof(VelocityKernel).GetProperty( + nameof(KernelEntryPoint), + BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance) + .GetGetMethod(true); + + #endregion + + #region Instance + + /// + /// Loads a compiled kernel into the given Cuda context as kernel program. + /// + /// The associated accelerator. + /// The source kernel. + /// The launcher method for the given kernel. + internal VelocityKernel( + VelocityAccelerator accelerator, + VelocityCompiledKernel kernel, + MethodInfo launcher) + : base(accelerator, kernel, launcher) + { + KernelEntryPoint = kernel.CreateKernelEntryPoint(); + } + + #endregion + + #region Properties + + /// + /// Returns the associated Velocity runtime. + /// + public VelocityAccelerator VelocityAccelerator => + Accelerator as VelocityAccelerator; + + /// + /// The main kernel entry point function to be called from each velocity + /// multiprocessor during execution. + /// + internal VelocityKernelEntryPoint KernelEntryPoint { get; } + + #endregion + + #region IDisposable + + /// + /// Does not perform any operation. 
+ /// + protected override void DisposeAcceleratorObject(bool disposing) { } + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityLaneMask.cs b/Src/ILGPU/Runtime/Velocity/VelocityLaneMask.cs new file mode 100644 index 000000000..942f3c5a8 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityLaneMask.cs @@ -0,0 +1,376 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityLaneMask.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using System; +using System.Diagnostics; +using System.Linq; +using System.Reflection; +using System.Runtime.CompilerServices; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// A mask for all active lanes stored in the general purpose register bank. + /// + readonly struct VelocityLaneMask : IEquatable + { + #region Static + + public static readonly MethodInfo DumpMethod = typeof(VelocityLaneMask).GetMethod( + nameof(Dump), + BindingFlags.Public | BindingFlags.Static); + + /// + /// Specifies the maximum number of lanes per mask. + /// + public const int MaxNumberOfLanes = sizeof(uint) * 8; + + /// + /// Represents the maximum lanes mask. + /// + private static readonly uint MaxNumberOfLanesMask = + uint.MaxValue >> (MaxNumberOfLanes - VelocityWarp32.Length); + + /// + /// Represents a mask in which all lanes are active. + /// + public static readonly VelocityLaneMask All = new VelocityLaneMask(uint.MaxValue); + + /// + /// Represents a mask in which all lanes are inactive. + /// + public static readonly VelocityLaneMask None = new VelocityLaneMask(0); + + /// + /// Dumps the given lane mask to the console output. + /// + /// The lane mask to output. 
+ public static void Dump(VelocityLaneMask mask) => + Console.WriteLine(mask.ToString()); + + /// + /// Verifies the given lane index. + /// + /// The lane index to verify. + private static void VerifyLaneIndex(int laneIndex) => + Debug.Assert( + laneIndex >= 0 && laneIndex < MaxNumberOfLanes, + "Lane index out of range"); + + /// + /// Gets a raw lane mask for the given lane index. + /// + /// The lane index. + /// A raw activity lane mask for the given lane index. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint GetLaneIndexMask(int laneIndex) => 1U << laneIndex; + + /// + /// Gets a new lane mask for the given lane index. + /// + /// The lane index. + /// Non-zero to activate the lane, zero otherwise. + /// The created activity lane mask. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityLaneMask Get(int laneIndex, uint value) => + new VelocityLaneMask(value != 0U ? GetLaneIndexMask(laneIndex) : 0); + + /// + /// Gets a new lane mask for the given lane index. + /// + /// The lane index. + /// Non-zero to activate the lane, zero otherwise. + /// The created activity lane mask. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityLaneMask Get(int laneIndex, ulong value) => + new VelocityLaneMask(value != 0UL ? GetLaneIndexMask(laneIndex) : 0); + + /// + /// Gets a new lane mask for the given lane index. + /// + /// The lane index. + /// The created activity lane mask. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityLaneMask Get(int laneIndex) => + new VelocityLaneMask(GetLaneIndexMask(laneIndex)); + + /// + /// Unifies two lane masks. + /// + /// The left lane mask. + /// The right lane mask. + /// + /// A unified lane mask containing active lanes from both masks. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityLaneMask Unify( + VelocityLaneMask left, + VelocityLaneMask right) => + new VelocityLaneMask(left.mask | right.mask); + + /// + /// Intersects two lane masks. + /// + /// The left lane mask. + /// The right lane mask. + /// + /// An intersected lane mask containing only active lanes that are active in the + /// left and the right masks. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityLaneMask Intersect( + VelocityLaneMask left, + VelocityLaneMask right) => + new VelocityLaneMask(left.mask & right.mask); + + /// + /// Negates the given lane mask. + /// + /// The lane mask to negate. + /// The negated version of the input lane mask. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityLaneMask Negate(VelocityLaneMask mask) + { + uint negatedMask = ~mask.mask & MaxNumberOfLanesMask; + return new VelocityLaneMask(negatedMask); + } + + /// + /// Returns true if the given lane mask has at least one active lane. + /// + /// The lane mask to text. + /// True if the given lane mask has at least one active lane. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool HasActiveLanes(VelocityLaneMask mask) => mask.HasAny; + + /// + /// Returns true if all lanes in the given mask are considered active. + /// + /// The lane mask to text. + /// + /// True true if all lanes in the given mask are considered active. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool AreAllLanesActive(VelocityLaneMask mask) => + mask.Count == VelocityWarp32.Length; + + #endregion + + #region Instance + + private readonly uint mask; + + /// + /// Constructs a new lane mask based on the given raw mask. + /// + /// The raw lane mask. + internal VelocityLaneMask(uint rawMask) + { + mask = rawMask; + } + + #endregion + + #region Properties + + /// + /// Returns the number of active lanes in this mask. 
+ /// + public int Count => IntrinsicMath.PopCount(mask); + + /// + /// Returns true if this mask contains at least one active lane. + /// + public bool HasAny => (mask & MaxNumberOfLanesMask) != 0; + + #endregion + + #region Methods + + /// + /// Returns a raw activity mask for the specified lane. + /// + /// The lane index to get the mask for. + /// The raw activity mask. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint GetActivityMaskI(int laneIndex) + { + VerifyLaneIndex(laneIndex); + return IsActive(laneIndex) ? uint.MaxValue : 0; + } + + /// + /// Returns a raw activity mask for the specified lane. + /// + /// The lane index to get the mask for. + /// The raw activity mask. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ulong GetActivityMaskL(int laneIndex) + { + VerifyLaneIndex(laneIndex); + return IsActive(laneIndex) ? ulong.MaxValue : 0L; + } + + /// + /// Returns true if the specified lane is active. + /// + /// The lane index to test. + /// True if the specified lane is active.. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool IsActive(int laneIndex) + { + VerifyLaneIndex(laneIndex); + return (mask & GetLaneIndexMask(laneIndex)) != 0; + } + + /// + /// Disables the specified lane. + /// + /// The lane index to disable. + /// The updated lane mask. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityLaneMask Disable(int laneIndex) + { + VerifyLaneIndex(laneIndex); + return new VelocityLaneMask(mask & ~GetLaneIndexMask(laneIndex)); + } + + /// + /// Enables the specified lane. + /// + /// The lane index to enable. + /// The updated lane mask. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityLaneMask Enable(int laneIndex) + { + VerifyLaneIndex(laneIndex); + return new VelocityLaneMask(mask | GetLaneIndexMask(laneIndex)); + } + + #endregion + + #region IEquatable + + /// + /// Returns true if both masks are equal. + /// + /// The other mask to compare to. 
+ /// True if both masks are equal. + public bool Equals(VelocityLaneMask other) => mask == other.mask; + + #endregion + + #region Object + + /// + /// Returns true if the current mask is equal to the given object. + /// + /// The other object to compare to. + /// True if both objects represent the same mask. + public override bool Equals(object other) => + other is VelocityLaneMask otherMask && Equals(otherMask); + + /// + /// Returns true the hash code of this mask. + /// + /// The hash code of this mask. + public override int GetHashCode() => mask.GetHashCode(); + + /// + /// Returns the bit string representation of the current mask. + /// + public override string ToString() + { + var baseArray = Convert.ToString(mask, 2).ToCharArray(); + Array.Reverse(baseArray); + return new string(baseArray); + } + + #endregion + + #region Operators + + /// + /// Unifies two lane masks. + /// + /// The left lane mask. + /// The right lane mask. + /// + /// A unified lane mask containing active lanes from both masks. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityLaneMask operator |( + VelocityLaneMask left, + VelocityLaneMask right) => + Unify(left, right); + + #endregion + } + + partial class VelocityOperations + { + #region Lane Masks + + /// + /// Initializes all lane mask emitters. + /// + private void InitVelocityLaneMaskEmitter() + { + var type = typeof(VelocityLaneMask); + NoLanesMask = GetField(type, nameof(VelocityLaneMask.None)); + AllLanesMask = GetField(type, nameof(VelocityLaneMask.All)); + UnifyLanesMask = GetMethod(type, nameof(VelocityLaneMask.Unify)); + IntersectLanesMask = GetMethod(type, nameof(VelocityLaneMask.Intersect)); + NegateLaneMask = GetMethod(type, nameof(VelocityLaneMask.Negate)); + MaskHasActiveLanes = GetMethod(type, nameof(VelocityLaneMask.HasActiveLanes)); + AreAllLanesActive = GetMethod( + type, + nameof(VelocityLaneMask.AreAllLanesActive)); + } + + /// + /// Returns the no-lane mask getter method. 
+ /// + public FieldInfo NoLanesMask { get; private set; } + + /// + /// Returns the all-lane mask getter method. + /// + public FieldInfo AllLanesMask { get; private set; } + + /// + /// Returns the unify masks method. + /// + public MethodInfo UnifyLanesMask { get; private set; } + + /// + /// Returns the intersect masks method. + /// + public MethodInfo IntersectLanesMask { get; private set; } + + /// + /// Returns the negate masks method. + /// + public MethodInfo NegateLaneMask { get; private set; } + + /// + /// Returns the method to test whether a given lane mask has active lanes. + /// + public MethodInfo MaskHasActiveLanes { get; private set; } + + /// + /// Returns the method to test whether all lanes are active. + /// + public MethodInfo AreAllLanesActive { get; private set; } + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs new file mode 100644 index 000000000..a46433faf --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs @@ -0,0 +1,155 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityMemoryBuffer.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Runtime.CPU; +using ILGPU.Util; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// A memory buffer that lives in CPU space. + /// + public class VelocityMemoryBuffer : MemoryBuffer + { + #region Instance + + /// + /// Initializes this array view source on the CPU. + /// + /// The parent accelerator (if any). + /// The length of this source. 
+ /// The element size. + internal VelocityMemoryBuffer( + Accelerator accelerator, + long length, + int elementSize) + : base(accelerator, length, elementSize) + { + // Ensure that all element accesses will be properly aligned + long nativeLength = length * elementSize; + int alignmentOffset = Interop.ComputeAlignmentOffset( + nativeLength, + elementSize * accelerator.WarpSize); + // Pad the length to ensure a valid buffer size + long paddedLength = nativeLength + alignmentOffset; + + // Allocate resources and assign pointers + NativeBufferPtr = Marshal.AllocHGlobal(new IntPtr(paddedLength)); + NativePtr = NativeBufferPtr + alignmentOffset; + } + + #endregion + + #region Properties + + /// + /// Returns the natively allocated underlying buffer pointer which may not be + /// aligned in all cases. + /// + public IntPtr NativeBufferPtr { get; private set; } + + #endregion + + #region Methods + + /// + protected internal override void MemSet( + AcceleratorStream stream, + byte value, + in ArrayView targetView) => + CPUMemoryBuffer.CPUMemSet( + targetView.LoadEffectiveAddressAsPtr(), + value, + 0L, + targetView.LengthInBytes); + + /// + protected internal override void CopyFrom( + AcceleratorStream stream, + in ArrayView sourceView, + in ArrayView targetView) => + CPUMemoryBuffer.CPUCopyFrom(stream, sourceView, targetView); + + /// + protected internal override void CopyTo( + AcceleratorStream stream, + in ArrayView sourceView, + in ArrayView targetView) => + CPUMemoryBuffer.CPUCopyTo(stream, sourceView, targetView); + + #endregion + + #region IDisposable + + /// + /// Disposes the underlying memory buffer. 
+ /// + protected override void DisposeAcceleratorObject(bool disposing) + { + Marshal.FreeHGlobal(NativeBufferPtr); + NativeBufferPtr = IntPtr.Zero; + NativePtr = IntPtr.Zero; + } + + #endregion + + } + + sealed class VelocityMemoryBufferPool : VelocityMemoryBuffer + { + #region Instance + + private int sharedMemoryOffset; + private readonly int warpSize; + + public VelocityMemoryBufferPool( + VelocityAccelerator accelerator, + int size) + : base(accelerator, size, 1) + { + warpSize = accelerator.WarpSize; + } + + #endregion + + #region Methods + + /// + /// Resets the internal shared memory offset. + /// + public void Reset() => sharedMemoryOffset = 0; + + /// + /// Gets a chunk of memory of a certain type. + /// + /// The number of elements. + /// The element type to allocate. + /// A view pointing to the right chunk of shared memory. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ArrayView Allocate(int length) + where T : unmanaged + { + int totalElementSize = length * Interop.SizeOf(); + int alignment = Interop.ComputeAlignmentOffset( + sharedMemoryOffset, + totalElementSize); + int newOffset = sharedMemoryOffset + alignment; + sharedMemoryOffset += alignment + totalElementSize; + return new ArrayView(this, newOffset, length); + } + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs b/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs new file mode 100644 index 000000000..6e803ca1f --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityMultiprocessor.cs @@ -0,0 +1,425 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityMultiprocessor.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. 
See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Util; +using System; +using System.Collections.Immutable; +using System.Numerics; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a single velocity kernel processing delegate. + /// + /// The start index within the thread grid. + /// The end index within the thread grid. + /// The current parameters. + delegate void VelocityKernelEntryPoint( + int globalStartIndex, + int globalEndIndex, + VelocityParameters parameters); + + /// + /// A single velocity multiprocessor consisting of a single processing thread and + /// a runtime context. + /// + sealed class VelocityMultiprocessor : DisposeBase + { + #region Static + + /// + /// All kernel handler types required to launch a kernel delegate on this MP. + /// + public static readonly ImmutableArray KernelHandlerTypes = + ImmutableArray.Create( + typeof(int), + typeof(int), + typeof(VelocityParameters)); + + /// + /// Stores the current velocity multiprocessor. + /// + [ThreadStatic] + private static VelocityMultiprocessor current; + + /// + /// Returns the parent velocity multiprocessor for the current thread. + /// + /// The parent multiprocessor for the current thread. + public static VelocityMultiprocessor GetCurrent() => current; + + /// + /// Allocates a chunk of shared memory. + /// + /// A velocity warp made of shared-memory pointers. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 GetSharedMemory(int length) + where T : unmanaged + { + var currentProcessor = GetCurrent(); + var sharedMemoryView = currentProcessor.GetSharedMemoryFromPool(length); + long intPtr = sharedMemoryView.LoadEffectiveAddressAsPtr().ToInt64(); + return VelocityWarp64.GetConstI(intPtr); + } + + /// + /// Allocates a chunk of local memory. 
+ /// + /// A velocity warp made of local-memory pointers. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 GetLocalMemory(int length) + where T : unmanaged + { + var currentProcessor = GetCurrent(); + var localMemoryView = currentProcessor.GetLocalMemoryFromPool( + length * currentProcessor.WarpSize); + + long intPtr = localMemoryView.LoadEffectiveAddressAsPtr().ToInt64(); + var addresses = VelocityWarp64.GetConstI(intPtr); + var offsets = VelocityWarp64.GetConstI(Interop.SizeOf()); + return offsets.MultiplyAddU( + VelocityWarp64.LaneIndexVector, + addresses); + } + + /// + /// Returns the current linear thread indices for all warp lanes. + /// + /// A velocity warp made of grid indices. + public static VelocityWarp32 GetCurrentLinearIdx() => GetCurrent().LinearIdx; + + /// + /// Returns the current linear thread indices for all warp lanes. + /// + /// A velocity warp made of grid indices. + public static void SetCurrentLinearIdx(int linearIndex) => + GetCurrent().ResetLinearIndex(linearIndex); + + /// + /// Returns the current grid indices for all warp lanes associated with this + /// multiprocessor. + /// + /// A velocity warp made of grid indices. + public static VelocityWarp32 GetCurrentGridIdx() => + VelocityWarp32.GetConstI(GetCurrent().GridIdx); + + /// + /// Returns the current grid dimension for all warp lanes associated with this + /// multiprocessor. + /// + /// A velocity warp made of the current grid dimension. + public static VelocityWarp32 GetCurrentGridDim() => + VelocityWarp32.GetConstI(GetCurrent().GridDim); + + /// + /// Returns the current group dimension for all warp lanes associated with this + /// multiprocessor. + /// + /// A velocity warp made of the current group dimension. + public static VelocityWarp32 GetCurrentGroupDim() => + VelocityWarp32.GetConstI(GetCurrentGroupDimScalar()); + + /// + /// Returns the current group dimension for all warp lanes associated with this + /// multiprocessor. 
+ /// + /// The current group dimension. + public static int GetCurrentGroupDimScalar() => GetCurrent().GroupDim; + + /// + /// Represents a handle to the method. + /// + public static readonly MethodInfo GetSharedMemoryMethodInfo = + typeof(VelocityMultiprocessor).GetMethod( + nameof(GetSharedMemory), + BindingFlags.Public | BindingFlags.Static); + + /// + /// Represents a handle to the method. + /// + public static readonly MethodInfo GetLocalMemoryMethodInfo = + typeof(VelocityMultiprocessor).GetMethod( + nameof(GetLocalMemory), + BindingFlags.Public | BindingFlags.Static); + + /// + /// Represents a handle to the method. + /// + public static readonly MethodInfo GetCurrentLinearIdxMethod = + typeof(VelocityMultiprocessor).GetMethod( + nameof(GetCurrentLinearIdx), + BindingFlags.Public | BindingFlags.Static); + + /// + /// Represents a handle to the method. + /// + public static readonly MethodInfo SetCurrentLinearIdxMethod = + typeof(VelocityMultiprocessor).GetMethod( + nameof(SetCurrentLinearIdx), + BindingFlags.Public | BindingFlags.Static); + + /// + /// Represents a handle to the method. + /// + public static readonly MethodInfo GetCurrentGridIdxMethodInfo = + typeof(VelocityMultiprocessor).GetMethod( + nameof(GetCurrentGridIdx), + BindingFlags.Public | BindingFlags.Static); + + /// + /// Represents a handle to the method. + /// + public static readonly MethodInfo GetCurrentGridDimMethodInfo = + typeof(VelocityMultiprocessor).GetMethod( + nameof(GetCurrentGridDim), + BindingFlags.Public | BindingFlags.Static); + + /// + /// Represents a handle to the method. + /// + public static readonly MethodInfo GetCurrentGroupDimMethodInfo = + typeof(VelocityMultiprocessor).GetMethod( + nameof(GetCurrentGroupDim), + BindingFlags.Public | BindingFlags.Static); + + /// + /// Represents a handle to the method. 
+ /// + public static readonly MethodInfo GetCurrentGroupDimScalarMethodInfo = + typeof(VelocityMultiprocessor).GetMethod( + nameof(GetCurrentGroupDimScalar), + BindingFlags.Public | BindingFlags.Static); + + #endregion + + #region Events + + /// + /// Will be raised once a chunk of a scheduled thread grid has been completed. + /// + public Action ProcessingCompleted; + + #endregion + + #region Instance + + // Thread data + private readonly Thread runtimeThread; + private readonly SemaphoreSlim startProcessingSema; + + // Context data + private readonly VelocityMemoryBufferPool sharedMemoryPool; + private readonly VelocityMemoryBufferPool localMemoryPool; + + // Runtime data + private volatile VelocityKernelEntryPoint kernelHandler; + private volatile int startIndexRange; + private volatile int endIndexRange; + private volatile VelocityParameters kernelParameters; + private volatile bool running = true; + + /// + /// Initializes a new velocity multiprocessor. + /// + /// The parent velocity accelerator. + /// The current processor index. + internal VelocityMultiprocessor( + VelocityAccelerator accelerator, + int processorIndex) + { + runtimeThread = new Thread(DoWork) + { + Priority = accelerator.ThreadPriority, + IsBackground = true, + Name = $"ILGPU_{accelerator.InstanceId}_Velocity_{processorIndex}" + }; + + startProcessingSema = new SemaphoreSlim(0); + sharedMemoryPool = new VelocityMemoryBufferPool( + accelerator, + accelerator.MaxSharedMemoryPerGroup); + localMemoryPool = new VelocityMemoryBufferPool( + accelerator, + accelerator.MaxLocalMemoryPerThread); + WarpSize = accelerator.WarpSize; + ProcessorIndex = processorIndex; + + runtimeThread.Start(); + } + + #endregion + + #region Properties + + /// + /// Returns the current warp size. + /// + public int WarpSize { get; } + + /// + /// Returns the multiprocessor index. 
+ /// + public int ProcessorIndex { get; } + + /// + /// Returns the precomputed grid indices for all lanes in the current + /// multiprocessor. + /// + public VelocityWarp32 LinearIdx { get; private set; } + + /// + /// Returns the precomputed grid indices for all lanes in the current + /// multiprocessor. + /// + public int GridIdx { get; private set; } + + /// + /// Returns the current grid dimension. + /// + public int GridDim { get; private set; } + + /// + /// Returns the current group dimension. + /// + public int GroupDim { get; private set; } + + #endregion + + #region Methods + + /// + /// Resets the current linear index. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ResetLinearIndex(int linearIndex) + { + LinearIdx = + VelocityWarp32.LaneIndexVector.AddU( + VelocityWarp32.GetConstI(linearIndex)); + GridIdx = linearIndex / GroupDim; + } + + /// + /// Gets a chunk of shared memory of a certain type. + /// + /// The number of elements. + /// The element type to allocate. + /// A view pointing to the right chunk of shared memory. + public ArrayView GetSharedMemoryFromPool(int length) + where T : unmanaged => + sharedMemoryPool.Allocate(length); + + /// + /// Gets a chunk of local memory of a certain type. + /// + /// The number of elements. + /// The element type to allocate. + /// A view pointing to the right chunk of local memory. + public ArrayView GetLocalMemoryFromPool(int length) + where T : unmanaged => + localMemoryPool.Allocate(length); + + /// + /// Dispatches a new kernel execution. + /// + /// The kernel handler delegate. + /// The start interval index. + /// The end interval index. + /// The current grid dimension. + /// The current group dimension. + /// All kernel parameters. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Run( + VelocityKernelEntryPoint handler, + int startIndex, + int endIndex, + int gridDimension, + int groupDimension, + VelocityParameters parameters) + { + GridDim = gridDimension; + GroupDim = groupDimension; + + // Note that we do not have to invoke + // ResetGridIndex(offset: ...); + // here, as this method will be automatically invoked by each Velocity kernel + + // Schedule this operation + kernelHandler = handler; + startIndexRange = startIndex; + endIndexRange = endIndex; + kernelParameters = parameters; + sharedMemoryPool.Reset(); + localMemoryPool.Reset(); + + // Ensure visibility of all changes to other threads + Thread.MemoryBarrier(); + + // Launch the processing task + startProcessingSema.Release(); + } + + /// + /// The main processing thread of this multiprocessor. + /// + private void DoWork() + { + // Assign the current multiprocessor to this instance + current = this; + + // Process all tasks + while (true) + { + // Wait for the next task to arrive + startProcessingSema.Wait(); + + // Break the loop if we are shutting down + if (!running) + break; + + // Launch the actual kernel method + kernelHandler(startIndexRange, endIndexRange, kernelParameters); + + // Signal the main thread that the processing has been completed. Note + // that we avoid any null checks at this point + ProcessingCompleted(this); + } + } + + #endregion + + #region IDisposable + + /// + /// Waits for the processing thread to shutdown and disposes all internal thread + /// objects. 
+ /// + protected override void Dispose(bool disposing) + { + if (disposing) + { + running = false; + startProcessingSema.Release(); + runtimeThread.Join(); + + startProcessingSema.Dispose(); + sharedMemoryPool.Dispose(); + localMemoryPool.Dispose(); + } + base.Dispose(disposing); + } + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs new file mode 100644 index 000000000..f438e9afe --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs @@ -0,0 +1,27 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityParameters.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +namespace ILGPU.Runtime.Velocity +{ + /// + /// The base class for all velocity parameters. + /// + abstract class VelocityParameters + { + /// + /// Does nothing at the moment + /// + public VelocityParameters() + { + } + } +} + diff --git a/Src/ILGPU/Runtime/Velocity/VelocityStream.cs b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs new file mode 100644 index 000000000..7b2fdf700 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs @@ -0,0 +1,74 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityStream.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.Runtime.CPU; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a velocity stream. 
+ /// + sealed class VelocityStream : AcceleratorStream + { + #region Static + + /// + /// The default instance. + /// + internal static readonly VelocityStream Default = new VelocityStream(); + + #endregion + + #region Instance + + /// + /// Constructs a new Velocity stream. + /// + private VelocityStream() : base() { } + + /// + /// Constructs a new Velocity stream. + /// + /// The associated accelerator. + internal VelocityStream(Accelerator accelerator) + : base(accelerator) + { } + + #endregion + + #region Methods + + /// + /// Does not perform any operation. + /// + public override void Synchronize() { } + + /// + protected unsafe override ProfilingMarker AddProfilingMarkerInternal() + { + using var binding = Accelerator.BindScoped(); + return new CPUProfilingMarker(Accelerator); + } + + #endregion + + #region IDisposable + + /// + /// Does not perform any operation. + /// + protected override void DisposeAcceleratorObject(bool disposing) { } + + #endregion + } +} + + diff --git a/Src/ILGPU/Runtime/Velocity/VelocityWarp32.Operations.cs b/Src/ILGPU/Runtime/Velocity/VelocityWarp32.Operations.cs new file mode 100644 index 000000000..cf1c73342 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityWarp32.Operations.cs @@ -0,0 +1,677 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityWarp32.Operations.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. 
See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using ILGPU.IR.Values; +using System; +using System.Collections.Generic; +using System.Numerics; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; +using static ILGPU.Runtime.Velocity.VelocityWarpOperations32; +using Arm64Intrinsics = System.Runtime.Intrinsics.Arm.AdvSimd.Arm64; + +namespace ILGPU.Runtime.Velocity +{ + partial struct VelocityWarp32 + { + #region Unary Operations + + public VelocityWarp32 NegI() => -As(); + + public VelocityWarp32 NegU() => ~As(); + + public VelocityWarp32 NegF() => ~As(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 Not() + { + // Special implementation for ARM + if (IsVector128 && AdvSimd.IsSupported) + return AdvSimd.Not(As().AsVector128()).AsVector(); + + return Vector.OnesComplement(As()); + } + + public VelocityWarp32 AbsU() => this; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 AbsI() => Vector.Abs(As()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 AbsF() => Vector.Abs(As()); + + private readonly struct PopCScalarOperation : IScalarIOperation + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int Apply(int index, int value) => IntrinsicMath.PopCount(value); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 PopC() + { + // Special implementation for ARM + if (IsVector128 && Arm64Intrinsics.IsSupported) + { + // Determine the pop count per lane + var popCountPerByte = AdvSimd.PopCount(As().AsVector128()); + var popCountPerLane = + Arm64Intrinsics.AddPairwise( + Arm64Intrinsics.AddPairwise( + popCountPerByte, + Vector128.Zero), + Vector128.Zero); + // Distribute the pop-count values to all lanes + var lower = AdvSimd.VectorTableLookup( + 
popCountPerLane, + First2BytesToIntAdvSimd.AsByte()); + var upper = AdvSimd.VectorTableLookup( + popCountPerLane, + Second2BytesToIntAdvSimd.AsByte()); + return Vector128.Create(lower, upper).AsInt32().AsVector(); + } + + return this.ApplyScalarIOperation(new PopCScalarOperation()); + } + + private readonly struct RcpFScalarOperation : IScalarFOperation + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public float Apply(int index, float value) => 1.0f / value; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 RcpF() + { + if (IsVector128) + { + // Special implementation for ARM + if (AdvSimd.IsSupported) + { + return AdvSimd.ReciprocalEstimate(As().AsVector128()) + .AsVector(); + } + + // Special implementation for X86 + if (Sse.IsSupported) + { + return Sse.Reciprocal(As().AsVector128()) + .AsVector(); + } + } + + // Special implementation for X86 + if (IsVector256 && Avx.IsSupported) + { + return Avx.Reciprocal(As().AsVector256()) + .AsVector(); + } + + return this.ApplyScalarFOperation(new RcpFScalarOperation()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 IsNotNanF() => + Vector.Equals(As(), As()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 IsNanF() => IsNotNanF().Not(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 SqrtF() => Vector.SquareRoot(As()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 RsqrtF() + { + // Special implementation for ARM + if (IsVector128 && AdvSimd.IsSupported) + { + return AdvSimd.ReciprocalSquareRootEstimate( + As().AsVector128()) + .AsVector(); + } + return SqrtF().RcpF(); + } + + #endregion + + #region Binary Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 And(VelocityWarp32 other) => + Vector.BitwiseAnd(warpData, other.warpData); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 
Or(VelocityWarp32 other) => + Vector.BitwiseOr(warpData, other.warpData); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 AndNot(VelocityWarp32 other) => + Vector.AndNot(warpData, other.warpData); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 Xor(VelocityWarp32 other) => + Or(other).AndNot(And(other)); + + #endregion + + #region Ternary Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 MultiplyAddI(VelocityWarp32 second, VelocityWarp32 third) + { + // Special implementation for ARM + if (IsVector128 && AdvSimd.IsSupported) + { + var sourceVec = As().AsVector128(); + var secondVec = second.As().AsVector128(); + var thirdVec = second.As().AsVector128(); + + return AdvSimd.MultiplyAdd(thirdVec, sourceVec, secondVec).AsVector(); + } + + return this.MulI(second).AddI(third); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 MultiplyAddU(VelocityWarp32 second, VelocityWarp32 third) + { + // Special implementation for ARM + if (IsVector128 && AdvSimd.IsSupported) + { + var sourceVec = As().AsVector128(); + var secondVec = second.As().AsVector128(); + var thirdVec = second.As().AsVector128(); + + return AdvSimd.MultiplyAdd(thirdVec, sourceVec, secondVec).AsVector(); + } + + return this.MulU(second).AddU(third); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 MultiplyAddF(VelocityWarp32 second, VelocityWarp32 third) + { + if (IsVector128) + { + var sourceVec = As().AsVector128(); + var secondVec = second.As().AsVector128(); + var thirdVec = second.As().AsVector128(); + + // Special implementation for ARM + if (AdvSimd.IsSupported) + AdvSimd.FusedMultiplyAdd(thirdVec, sourceVec, secondVec); + + // Special implementation for X86 + if (Fma.IsSupported) + return Fma.MultiplyAdd(sourceVec, secondVec, thirdVec).AsVector(); + } + + // Special implementation for X86 + if (IsVector256 && Fma.IsSupported) + { + var 
sourceVec = As().AsVector256(); + var secondVec = second.As().AsVector256(); + var thirdVec = second.As().AsVector256(); + return Fma.MultiplyAdd(sourceVec, secondVec, thirdVec).AsVector(); + } + + return this.Mul(second).Add(third); + } + + #endregion + } + + partial class VelocityWarpOperations32 + { + #region General Operations + + /// + /// Dumps the given warp to the default console output. + /// + public static void Dump(this VelocityWarp32 warp) => + Console.WriteLine(warp.ToString()); + + /// + /// Converts the given half into its raw format. + /// + public static float FromHalf(Half half) => half; + + /// + /// Implements a lane index vector for a 32bit warp. + /// + public static VelocityWarp32 GetLaneIndexVector() => + VelocityWarp32.LaneIndexVector; + + #endregion + + #region Group Operations + + /// + /// Implements a barrier pop-count operation for a 32bit warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 GroupBarrierPopCount( + VelocityWarp32 warp, + VelocityLaneMask mask) => + warp.BarrierPopCount(mask); + + /// + /// Implements a logical and barrier and operation for a 32bit warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 GroupBarrierAnd( + VelocityWarp32 warp, + VelocityLaneMask mask) => + warp.BarrierAnd(mask); + + /// + /// Implements a logical barrier or operation for a 32bit warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 GroupBarrierOr( + VelocityWarp32 warp, + VelocityLaneMask mask) => + warp.BarrierOr(mask); + + /// + /// Implements a logical group broadcast for a 32bit warp. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 GroupBroadcast( + VelocityWarp32 warp, + VelocityWarp32 sourceLanes) + where TVerifier : struct, IVelocityWarpVerifier => + warp.Broadcast(sourceLanes); + + #endregion + + #region Warp Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 WarpBroadcast( + VelocityWarp32 warp, + VelocityWarp32 sourceLanes) + where TVerifier : struct, IVelocityWarpVerifier => + GroupBroadcast(warp, sourceLanes); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 WarpShuffle( + VelocityWarp32 warp, + VelocityWarp32 sourceLanes) + where TVerifier : struct, IVelocityWarpVerifier => + warp.Shuffle(sourceLanes); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 WarpShuffleDown( + VelocityWarp32 warp, + VelocityWarp32 delta) + where TVerifier : struct, IVelocityWarpVerifier => + warp.ShuffleDown(delta, VelocityWarp32.LengthVector); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 WarpShuffleUp( + VelocityWarp32 warp, + VelocityWarp32 delta) + where TVerifier : struct, IVelocityWarpVerifier => + warp.ShuffleUp(delta, VelocityWarp32.LengthVector); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 WarpShuffleXor( + VelocityWarp32 warp, + VelocityWarp32 delta) + where TVerifier : struct, IVelocityWarpVerifier => + warp.ShuffleXor(delta, VelocityWarp32.LengthVector); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 SubWarpShuffleDown( + VelocityWarp32 warp, + VelocityWarp32 delta, + VelocityWarp32 width) + where TVerifier : struct, IVelocityWarpVerifier => + warp.ShuffleDown(delta, width); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 SubWarpShuffleUp( + VelocityWarp32 warp, + VelocityWarp32 delta, + VelocityWarp32 width) + where TVerifier : 
struct, IVelocityWarpVerifier => + warp.ShuffleUp(delta, width); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 SubWarpShuffleXor( + VelocityWarp32 warp, + VelocityWarp32 delta, + VelocityWarp32 width) + where TVerifier : struct, IVelocityWarpVerifier => + warp.ShuffleXor(delta, width); + + #endregion + + #region Merge Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 Merge( + this VelocityWarp32 left, + VelocityWarp32 right, + VelocityWarp32 rightMask) => + Vector.ConditionalSelect( + rightMask.As(), + right.As(), + left.As()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 MergeWithMask( + this VelocityWarp32 left, + VelocityWarp32 right, + VelocityLaneMask rightMask) + { + var maskVector = VelocityWarp32.FromMask(rightMask); + return Merge(left, right, maskVector); + } + + #endregion + + #region Convert Operations + + /// + /// Does not perform a conversion. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertNop(this VelocityWarp32 value) => value; + + /// + /// Converts the given 32bit integer warp to a 32bit integer warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertIToI(this VelocityWarp32 value) => + ConvertNop(value); + + /// + /// Converts the given 32bit unsigned integer warp to a 32bit unsigned integer + /// warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertUToU(this VelocityWarp32 value) => + ConvertNop(value); + + /// + /// Converts the given 32bit float warp to a 32bit float warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertFToF(this VelocityWarp32 value) => + ConvertNop(value); + + /// + /// Converts the given 32bit integer warp to a 32bit unsigned integer warp. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertIToU(this VelocityWarp32 value) => + ConvertNop(value); + + /// + /// Converts the given 32bit unsigned integer warp to a 32bit integer warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertUToI(this VelocityWarp32 value) => + ConvertNop(value); + + /// + /// Converts the given 32bit integer warp to a 32bit float warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertIToF(this VelocityWarp32 value) + { + var rawVector = Vector.ConvertToSingle(value.As()); + return new VelocityWarp32(rawVector); + } + + /// + /// Converts the given 32bit unsigned integer warp to a 32bit float warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertUToF(this VelocityWarp32 value) + { + var rawVector = Vector.ConvertToSingle(value.As()); + return new VelocityWarp32(rawVector); + } + + /// + /// Converts the given 32bit float warp to a 32bit integer warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertFToI(this VelocityWarp32 value) + { + var rawVector = Vector.ConvertToInt32(value.As()); + return new VelocityWarp32(rawVector); + } + + /// + /// Converts the given 32bit float warp to a 32bit unsigned integer warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 ConvertFToU(this VelocityWarp32 value) + { + var rawVector = Vector.ConvertToUInt32(value.As()); + return new VelocityWarp32(rawVector); + } + + /// + /// Widens the given warp to a 64bit long warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 WidenI(this VelocityWarp32 warp) + { + Vector.Widen(warp.As(), out var low, out var high); + return new VelocityWarp64(low, high); + } + + /// + /// Widens the given warp to a 64bit ulong warp. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 WidenU(this VelocityWarp32 warp) + { + Vector.Widen(warp.As(), out var low, out var high); + return new VelocityWarp64(low, high); + } + + /// + /// Widens the given warp to a 64bit double warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 WidenF(this VelocityWarp32 warp) + { + Vector.Widen(warp.As(), out var low, out var high); + return new VelocityWarp64(low, high); + } + + #endregion + + #region Binary Operations + + public static VelocityWarp32 ComputeRemI( + this VelocityWarp32 left, + VelocityWarp32 right) => + SubI(left, MulU(DivI(left, right), right)); + + public static VelocityWarp32 ComputeRemU( + this VelocityWarp32 left, + VelocityWarp32 right) => + SubU(left, MulU(DivU(left, right), right)); + + public static VelocityWarp32 ComputeRemF( + this VelocityWarp32 left, + VelocityWarp32 right) => + Sub(left, Abs(Mul(Div(left, right), right))); + + #endregion + } + + partial class VelocityOperations + { + #region General 32bit Operations + + private readonly MethodInfo[] convertWidenOperations32 = new MethodInfo[3] + { + GetMethod( + typeof(VelocityWarpOperations32), + nameof(VelocityWarpOperations32.WidenI)), + GetMethod( + typeof(VelocityWarpOperations32), + nameof(VelocityWarpOperations32.WidenU)), + GetMethod( + typeof(VelocityWarpOperations32), + nameof(VelocityWarpOperations32.WidenF)), + }; + private readonly MethodInfo[] broadcastOperations32 = new MethodInfo[2]; + + /// + /// Initializes special 32bit warp operations. + /// + /// The metadata operation type. 
+ private void InitVelocityOperations32(Type operationType32) + { + broadcastOperations32[0] = GetMethod( + operationType32, + nameof(GroupBroadcast)); + broadcastOperations32[1] = GetMethod( + operationType32, + nameof(WarpBroadcast)); + + DumpMethod32 = GetMethod( + operationType32, + nameof(VelocityWarpOperations32.Dump)); + FromHalfMethod = GetMethod( + operationType32, + nameof(FromHalf)); + LaneIndexVectorOperation32 = GetMethod( + operationType32, + nameof(GetLaneIndexVector)); + + MergeOperation32 = GetMethod( + operationType32, + nameof(VelocityWarpOperations32.Merge)); + MergeWithMaskOperation32 = GetMethod( + operationType32, + nameof(VelocityWarpOperations32.MergeWithMask)); + + FromMaskOperation32 = GetMethod( + typeof(VelocityWarp32), + nameof(VelocityWarp32.FromMask)); + ToMaskOperation32 = GetMethod( + typeof(VelocityWarp32), + nameof(VelocityWarp32.ToMask)); + + InitWarpOperations32(operationType32); + InitGroupOperations32(operationType32); + } + + public MethodInfo DumpMethod32 { get; private set; } + public MethodInfo FromHalfMethod { get; private set; } + + public MethodInfo LaneIndexVectorOperation32 { get; private set; } + + public MethodInfo MergeOperation32 { get; private set; } + public MethodInfo MergeWithMaskOperation32 { get; private set; } + + public MethodInfo FromMaskOperation32 { get; private set; } + public MethodInfo ToMaskOperation32 { get; private set; } + + public MethodInfo GetConvertWidenOperation32(VelocityWarpOperationMode mode) => + convertWidenOperations32[(int)mode]; + + #endregion + + #region Constant Values + + private readonly MethodInfo[] constValueOperations32 = new MethodInfo[] + { + GetMethod(typeof(VelocityWarp32), nameof(VelocityWarp32.GetConstI)), + GetMethod(typeof(VelocityWarp32), nameof(VelocityWarp32.GetConstU)), + GetMethod(typeof(VelocityWarp32), nameof(VelocityWarp32.GetConstF)), + }; + + public MethodInfo GetConstValueOperation32(VelocityWarpOperationMode mode) => + constValueOperations32[(int)mode]; + 
+ #endregion + + #region Warp Operations + + private readonly Dictionary warpOperations32 = + new Dictionary(); + private readonly Dictionary subWarpOperations32 = + new Dictionary(); + + private void InitWarpOperations32(Type operationType32) + { + warpOperations32.Add(ShuffleKind.Generic, + GetMethod(operationType32, nameof(WarpShuffle))); + warpOperations32.Add(ShuffleKind.Down, + GetMethod(operationType32, nameof(WarpShuffleDown))); + warpOperations32.Add(ShuffleKind.Up, + GetMethod(operationType32, nameof(WarpShuffleUp))); + warpOperations32.Add(ShuffleKind.Xor, + GetMethod(operationType32, nameof(WarpShuffleXor))); + + subWarpOperations32.Add(ShuffleKind.Down, + GetMethod(operationType32, nameof(SubWarpShuffleDown))); + subWarpOperations32.Add(ShuffleKind.Up, + GetMethod(operationType32, nameof(SubWarpShuffleUp))); + subWarpOperations32.Add(ShuffleKind.Xor, + GetMethod(operationType32, nameof(SubWarpShuffleXor))); + } + + public MethodInfo GetWarpShuffleOperation32( + ShuffleKind kind, + Type warpVerifier) => + warpOperations32[kind].MakeGenericMethod(warpVerifier); + public MethodInfo GetSubWarpShuffleOperation32( + ShuffleKind kind, + Type warpVerifier) => + subWarpOperations32[kind].MakeGenericMethod(warpVerifier); + + public MethodInfo GetWarpBroadcastOperation32(Type warpVerifier) => + broadcastOperations32[1].MakeGenericMethod(warpVerifier); + + #endregion + + #region Group Operations + + private readonly Dictionary< + PredicateBarrierKind, + MethodInfo> groupPredicateBarrierOperations32 = + new Dictionary(); + + private void InitGroupOperations32(Type operationType32) + { + groupPredicateBarrierOperations32.Add(PredicateBarrierKind.PopCount, + GetMethod(operationType32, nameof(GroupBarrierPopCount))); + groupPredicateBarrierOperations32.Add(PredicateBarrierKind.And, + GetMethod(operationType32, nameof(GroupBarrierAnd))); + groupPredicateBarrierOperations32.Add(PredicateBarrierKind.Or, + GetMethod(operationType32, nameof(GroupBarrierOr))); + } + + 
public MethodInfo GetGroupBroadcastOperation32(Type warpVerifier) => + broadcastOperations32[0].MakeGenericMethod(warpVerifier); + + public MethodInfo GetGroupPredicateBarrierOperation32( + PredicateBarrierKind kind) => + groupPredicateBarrierOperations32[kind]; + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityWarp32.cs b/Src/ILGPU/Runtime/Velocity/VelocityWarp32.cs new file mode 100644 index 000000000..f358ec028 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityWarp32.cs @@ -0,0 +1,746 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityWarp32.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using static ILGPU.Runtime.Velocity.VelocityWarpOperations32; +using static ILGPU.Runtime.Velocity.VelocityWarpVerifier; +using Arm64Intrinsics = System.Runtime.Intrinsics.Arm.AdvSimd.Arm64; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// A velocity runtime warp based on 32-bit unsigned integer values. + /// + internal readonly partial struct VelocityWarp32 : IEquatable + { + #region Runtime Helper Methods + + /// + /// The constant short vector. + /// + public static VelocityWarp32 GetConstI(int value) => new Vector(value); + + /// + /// The constant short vector. + /// + public static VelocityWarp32 GetConstU(uint value) => new Vector(value); + + /// + /// The constant float vector. + /// + public static VelocityWarp32 GetConstF(float value) => new Vector(value); + + #endregion + + #region Static + + /// + /// Represents the raw vector length for a single sub-warp element. 
+ /// + public static readonly int RawVectorLength = Vector.Count; + + /// + /// Represents the vector length for the whole warp element. + /// + public static readonly int Length = RawVectorLength; + + /// + /// The length vector. + /// + public static readonly Vector LengthVector = + new Vector((uint)RawVectorLength); + + /// + /// Creates a new lane index vector. + /// + private static Vector CreateLaneIndexVector() + { + Span indices = stackalloc uint[RawVectorLength]; + for (int i = 0; i < indices.Length; ++i) + indices[i] = (uint)i; + return new Vector(indices); + } + + /// + /// The static lane index vector. + /// + public static readonly VelocityWarp32 LaneIndexVector = + CreateLaneIndexVector(); + + /// + /// True if this vector is a 128 bit vector. + /// + public static readonly bool IsVector128 = + Vector.Count == Vector128.Count; + + /// + /// True if this vector is a 256 bit vector. + /// + public static readonly bool IsVector256 = + Vector.Count == Vector256.Count; + + /// + /// Constructs a velocity warp from a given lane mask. + /// + /// The lane mask. + /// The constructed velocity warp. 
+ [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public static VelocityWarp32 FromMask(VelocityLaneMask mask) + { + if (IsVector128) + { + // We know that there will be 4 lanes + return Vector128.Create( + mask.GetActivityMaskI(0), + mask.GetActivityMaskI(1), + mask.GetActivityMaskI(2), + mask.GetActivityMaskI(3)).AsVector(); + } + + if (IsVector256) + { + // We know that there will be 8 lanes + return Vector256.Create( + mask.GetActivityMaskI(0), + mask.GetActivityMaskI(1), + mask.GetActivityMaskI(2), + mask.GetActivityMaskI(3), + mask.GetActivityMaskI(4), + mask.GetActivityMaskI(5), + mask.GetActivityMaskI(6), + mask.GetActivityMaskI(7)).AsVector(); + } + + // Use generic stack-based method + Span target = stackalloc uint[RawVectorLength]; + for (int index = 0; index < target.Length; ++index) + target[index] = mask.GetActivityMaskI(index); + return new Vector(target); + } + + /// + /// Converts the given velocity warp into a lane mask. + /// + /// The warp to convert into a lane mask. + /// The converted lane mask. 
+ [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public static VelocityLaneMask ToMask(VelocityWarp32 warp) + { + var warpRawVec = warp.warpData; + if (IsVector128) + { + var warpVec = warpRawVec.AsVector128(); + var mask0 = VelocityLaneMask.Get(0, warpVec.GetElement(0)); + var mask1 = VelocityLaneMask.Get(1, warpVec.GetElement(1)); + var mask2 = VelocityLaneMask.Get(2, warpVec.GetElement(2)); + var mask3 = VelocityLaneMask.Get(3, warpVec.GetElement(3)); + return mask0 | mask1 | mask2 | mask3; + } + + if (IsVector256) + { + // We know that there will be 8 lanes + var warpVec = warpRawVec.AsVector256(); + var mask0 = VelocityLaneMask.Get(0, warpVec.GetElement(0)); + var mask1 = VelocityLaneMask.Get(1, warpVec.GetElement(1)); + var mask2 = VelocityLaneMask.Get(2, warpVec.GetElement(2)); + var mask3 = VelocityLaneMask.Get(3, warpVec.GetElement(3)); + var mask4 = VelocityLaneMask.Get(4, warpVec.GetElement(4)); + var mask5 = VelocityLaneMask.Get(5, warpVec.GetElement(5)); + var mask6 = VelocityLaneMask.Get(6, warpVec.GetElement(6)); + var mask7 = VelocityLaneMask.Get(7, warpVec.GetElement(7)); + return mask0 | mask1 | mask2 | mask3 | mask4 | mask5 | mask6 | mask7; + } + + // Use generic method + var mask = VelocityLaneMask.None; + for (int index = 0; index < RawVectorLength; ++index) + mask |= VelocityLaneMask.Get(index, warpRawVec[index]); + return mask; + } + + #region AdvSimd + + private static readonly Vector64 FirstByteTableLowerAdvSimd = + Vector64.Create((byte)0, 0, 0, 0, 4, 4, 4, 4); + + private static readonly Vector64 FirstByteTableUpperAdvSimd = + Vector64.Create((byte)8, 8, 8, 8, 12, 12, 12, 12); + + private static readonly Vector64 ShuffleOffsetVec64AdvSimd = + Vector64.Create((byte)0, 1, 2, 3, 0, 1, 2, 3); + + private static readonly Vector64 First2BytesToIntAdvSimd = + Vector64.Create(0, -1, -1, -1, 1, -1, -1, -1); + + private static readonly Vector64 Second2BytesToIntAdvSimd = + Vector64.Create(2, -1, 
-1, -1, 3, -1, -1, -1); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 ComputeShuffleConfigAdvSimd( + Vector128 indices, + Vector128 width) + { + var div = Arm64Intrinsics.Divide( + AdvSimd.ConvertToSingle(LaneIndexVector.As().AsVector128()), + AdvSimd.ConvertToSingle(width)); + var offsets = AdvSimd.Multiply( + AdvSimd.ConvertToUInt32RoundToZero(div), + width); + var lessThan = AdvSimd.CompareLessThan(indices, offsets); + var greaterThanOrEqual = AdvSimd.CompareGreaterThanOrEqual( + indices, + AdvSimd.Add(offsets, width)); + var selectionMask = AdvSimd.Or(lessThan, greaterThanOrEqual); + return AdvSimd.BitwiseSelect( + selectionMask, + LaneIndexVector.As().AsVector128(), + indices); + } + + #endregion + + #region Generic + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector ComputeShuffleConfigGeneric( + Vector indices, + Vector width) + { + var offsets = LaneIndexVector.warpData / width * width; + var lessThan = Vector.LessThan(indices, offsets); + var greaterThanOrEqual = Vector.GreaterThanOrEqual( + indices, + offsets + width); + var selectionMask = lessThan | greaterThanOrEqual; + return Vector.ConditionalSelect( + selectionMask, + LaneIndexVector.warpData, + indices); + } + + #endregion + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector ComputeShuffleConfig( + Vector indices, + Vector width) => + IsVector128 && Arm64Intrinsics.IsSupported + ? ComputeShuffleConfigAdvSimd( + indices.AsVector128(), + width.AsVector128()) + .AsVector() + : ComputeShuffleConfigGeneric(indices, width); + + #endregion + + #region Instance + + private readonly Vector warpData; + + /// + /// Creates a new warp instance using the given data vectors. + /// + public unsafe VelocityWarp32(uint* data) + : this(new ReadOnlySpan(data, Length)) + { } + + /// + /// Creates a new warp instance using the given data vectors. 
+ /// + public unsafe VelocityWarp32(ReadOnlySpan data) + : this(new Vector(data)) + { } + + /// + /// Creates a new warp instance using the given data vector. + /// + /// The input data vector. + public VelocityWarp32(Vector data) + { + warpData = data; + } + + /// + /// Creates a new warp instance using the given data vector. + /// + /// The input data vector. + public VelocityWarp32(Vector data) + : this(data.As()) + { } + + /// + /// Creates a new warp instance using the given data vector. + /// + /// The input data vector. + public VelocityWarp32(Vector data) + : this(data.As()) + { } + + #endregion + + #region Properties + + /// + /// Returns the value of the specified lane using specialized implementations. + /// + public uint this[int laneIndex] + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get + { + if (IsVector128) + return warpData.AsVector128().GetElement(laneIndex); + if (IsVector256) + return warpData.AsVector256().GetElement(laneIndex); + return warpData[laneIndex]; + } + } + + /// + /// Returns the value of the specified lane using specialized implementations. + /// + public uint this[uint laneIndex] => this[(int)laneIndex]; + + #endregion + + #region Methods + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public float GetElementF(int laneIndex) => Interop.IntAsFloat(this[laneIndex]); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Vector As() where T : struct => warpData.As(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 Mask(VelocityWarp32 mask) => warpData & mask.warpData; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 ToWarp64() + { + Vector.Widen(warpData, out var lower, out var upper); + return new VelocityWarp64(lower, upper); + } + + #endregion + + #region Barrier + + /// + /// Uses a comparison and a horizontal sum to compute the number of barrier + /// participants for which the predicate evaluated to true. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 BarrierPopCount(VelocityLaneMask mask) + { + var greaterThan = ~Vector.Equals( + warpData.As(), + Vector.Zero); + var masked = Vector.BitwiseAnd(greaterThan, FromMask(mask).As()); + return new Vector((uint)-Vector.Sum(masked)); + } + + /// + /// Uses a comparison check with a warp-length vector to determine which lanes + /// evaluated to true. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 BarrierAndFromPopCount(VelocityLaneMask mask) + { + var totalCount = GetConstI(mask.Count); + return Vector.Equals(warpData, totalCount.As()); + } + + /// + /// Uses a comparison check with a warp-length vector to determine which lanes + /// evaluated to true. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 BarrierAnd(VelocityLaneMask mask) => + BarrierPopCount(mask).BarrierAndFromPopCount(mask); + + /// + /// Uses a comparison check with a zero-data vector to determine which lanes + /// evaluated to true. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 BarrierOrFromPopCount() => + ~Vector.Equals(warpData, Vector.Zero); + + /// + /// Uses a comparison check with a zero-data vector to determine which lanes + /// evaluated to true. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 BarrierOr(VelocityLaneMask mask) => + BarrierPopCount(mask).BarrierOrFromPopCount(); + + #endregion + + #region Shuffle + + /// + /// Extracts the value from the lane given by the first value of the source + /// lane vector and broadcasts the value to all other lanes. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 Broadcast(VelocityWarp32 sourceLanes) + where TVerifier : struct, IVelocityWarpVerifier + { + GetVerifier().VerifyBroadcast(sourceLanes); + return Broadcast(sourceLanes[0]); + } + + /// + /// Extracts the value from the given lane and broadcasts the value to all + /// other lanes. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 Broadcast(uint sourceLane) + where TVerifier : struct, IVelocityWarpVerifier + { + GetVerifier().VerifyBroadcast(sourceLane); + // Specialized implementation on ARM + if (IsVector128 && AdvSimd.IsSupported) + { + var indices = AdvSimd.Add( + Vector64.Create((byte)(sourceLane * 4)), + ShuffleOffsetVec64AdvSimd); + + // Shuffle elements + var shuffledData = AdvSimd.VectorTableLookup( + warpData.AsVector128().AsByte(), + indices).AsUInt32(); + return Vector128.Create(shuffledData, shuffledData).AsVector(); + } + // Use a generic broadcast operation instead + return new Vector(this[sourceLane]); + } + + private readonly struct ScalarShuffleOperation : IScalarIOperation + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int Apply(int index, int value) => + value >= 0 && value < RawVectorLength ? value : index; + } + + /// + /// Shuffles values using optimized implementations to improve performance. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private Vector Shuffle(Vector sourceLanes) + { + // Use a special implementation on ARM + if (IsVector128 && AdvSimd.IsSupported) + { + var dataVec = warpData.AsVector128(); + var sourceVec = sourceLanes.AsVector128(); + + var source = AdvSimd.Multiply(sourceVec, LengthVector.AsVector128()) + .AsByte(); + var source1 = AdvSimd.VectorTableLookup( + source, + FirstByteTableLowerAdvSimd); + var shiftDelta1 = AdvSimd.Add(source1, ShuffleOffsetVec64AdvSimd); + var source2 = AdvSimd.VectorTableLookup( + source, + FirstByteTableUpperAdvSimd); + var shiftDelta2 = AdvSimd.Add(source2, ShuffleOffsetVec64AdvSimd); + + var firstPart = AdvSimd.VectorTableLookup(dataVec.AsByte(), shiftDelta1); + var secondPart = AdvSimd.VectorTableLookup(dataVec.AsByte(), shiftDelta2); + return Vector128 + .Create(firstPart.AsUInt32(), secondPart.AsUInt32()) + .AsVector(); + } + + return this.ApplyScalarIOperation(new ScalarShuffleOperation()).As(); + } + + /// + /// Shuffles values defined by the given source lanes using temporary stack + /// memory to shuffle generic vectors. + /// + public VelocityWarp32 Shuffle(VelocityWarp32 sourceLanes) + where TVerifier : struct, IVelocityWarpVerifier + { + GetVerifier().VerifyShuffle(sourceLanes); + return Shuffle(sourceLanes.warpData); + } + + /// + /// Uses the internal to determine a + /// shuffle configuration using the down delta and uses the + /// method to reorder all values. + /// + /// + /// Note that all values of and + /// are assumed to be identical. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ShuffleDown( + VelocityWarp32 delta, + VelocityWarp32 width) + where TVerifier : struct, IVelocityWarpVerifier + { + GetVerifier().VerifyShuffleDown(delta, width); + + var indices = LaneIndexVector.warpData + delta.warpData; + var offsets = ComputeShuffleConfig(indices, width.warpData); + return Shuffle(offsets); + } + + /// + /// Shuffles all lanes using down deltas by expanding the values of + /// and values to full-size + /// warp vectors. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ShuffleDown(uint delta, uint width) + where TVerifier : struct, IVelocityWarpVerifier => + ShuffleDown( + new Vector(delta), + new Vector(width)); + + /// + /// Uses the internal to determine a + /// shuffle configuration using the up delta and uses the + /// method to reorder all values. + /// + /// + /// Note that all values of and + /// are assumed to be identical. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ShuffleUp( + VelocityWarp32 delta, + VelocityWarp32 width) + where TVerifier : struct, IVelocityWarpVerifier + { + GetVerifier().VerifyShuffleUp(delta, width); + + var indices = LaneIndexVector.warpData - delta.warpData; + var offsets = ComputeShuffleConfig(indices, width.warpData); + return Shuffle(offsets); + } + + /// + /// Shuffles all lanes using up deltas by expanding the values of + /// and values to full-size + /// warp vectors. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ShuffleUp(uint delta, uint width) + where TVerifier : struct, IVelocityWarpVerifier => + ShuffleUp( + new Vector(delta), + new Vector(width)); + + /// + /// Uses the internal to determine a + /// shuffle configuration using the xor-mask delta and uses the + /// method to reorder all values. + /// + /// + /// Note that all values of and + /// are assumed to be identical. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ShuffleXor( + VelocityWarp32 mask, + VelocityWarp32 width) + where TVerifier : struct, IVelocityWarpVerifier + { + GetVerifier().VerifyShuffleXor(mask, width); + + var indices = LaneIndexVector.warpData ^ mask.warpData; + var offsets = ComputeShuffleConfig(indices, width.warpData); + return Shuffle(offsets); + } + + /// + /// Shuffles all lanes using xor-masks by expanding the values of + /// and values to full-size + /// warp vectors. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ShuffleXor(uint mask, uint width) + where TVerifier : struct, IVelocityWarpVerifier => + ShuffleXor( + new Vector(mask), + new Vector(width)); + + #endregion + + #region Atomics + + internal interface IAtomicOperation + where T : unmanaged + { + T Atomic(ref T target, T value); + } + + private readonly struct AtomicScalarOperation : IScalarUOperation + where T : unmanaged + where TOperation : struct, IAtomicOperation + { + private readonly VelocityWarp64 target; + private readonly VelocityLaneMask mask; + + public AtomicScalarOperation( + VelocityWarp64 targetWarp, + VelocityLaneMask warpMask) + { + target = targetWarp; + mask = warpMask; + } + + [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public unsafe uint Apply(int index, uint value) + { + ulong targetAddress = target[index]; + ref T managedRef = ref Unsafe.AsRef((void*)targetAddress); + + TOperation op = default; + T convertedValue = Unsafe.As(ref value); + + if (!mask.IsActive(index)) + return 0U; + T result = op.Atomic(ref managedRef, convertedValue); + return Unsafe.As(ref result); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal VelocityWarp32 Atomic( + VelocityWarp64 target, + VelocityLaneMask mask) + where T : unmanaged + where TOperation : struct, IAtomicOperation => + this.ApplyScalarUOperation( + new 
AtomicScalarOperation(target, mask)); + + private readonly struct AtomicCompareExchangeOperation : IScalarUOperation + { + private readonly VelocityWarp64 target; + private readonly VelocityWarp32 compare; + private readonly VelocityLaneMask mask; + + public AtomicCompareExchangeOperation( + VelocityWarp64 targetWarp, + VelocityWarp32 compareWarp, + VelocityLaneMask warpMask) + { + target = targetWarp; + compare = compareWarp; + mask = warpMask; + } + + [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public unsafe uint Apply(int index, uint value) + { + ulong targetAddress = target[index]; + ref uint managedRef = ref Unsafe.AsRef((void*)targetAddress); + uint compareVal = compare[index]; + + if (!mask.IsActive(index)) + return compareVal; + return ILGPU.Atomic.CompareExchange( + ref managedRef, + compareVal, + value); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 AtomicCompareExchange( + VelocityWarp64 target, + VelocityWarp32 compare, + VelocityLaneMask mask) => + this.ApplyScalarUOperation( + new AtomicCompareExchangeOperation(target, compare, mask)); + + #endregion + + #region IEquatable + + /// + /// Returns true if the given warp is equal to the current one in terms of its + /// lane values. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool Equals(VelocityWarp32 other) => + other.warpData.Equals(warpData); + + #endregion + + #region Object + + /// + /// Returns true if the given object is equal to the current one in terms of + /// its lane values. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override bool Equals(object obj) => + obj is VelocityWarp32 other && Equals(other); + + /// + /// Returns the hash code of the underlying vector data. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override int GetHashCode() => warpData.GetHashCode(); + + /// + /// Returns the string representation of the underlying warp data. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override string ToString() => warpData.ToString(); + + #endregion + + #region Operators + + /// + /// Converts a generic vector into a generic warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator VelocityWarp32(Vector data) => + new VelocityWarp32(data.As()); + + /// + /// Converts a generic vector into a generic warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator VelocityWarp32(Vector data) => + new VelocityWarp32(data); + + /// + /// Converts a generic vector into a generic warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator VelocityWarp32(Vector data) => + new VelocityWarp32(data.As()); + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityWarp64.Operations.cs b/Src/ILGPU/Runtime/Velocity/VelocityWarp64.Operations.cs new file mode 100644 index 000000000..d68f82382 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityWarp64.Operations.cs @@ -0,0 +1,404 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityWarp64.Operations.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. 
See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using System; +using System.Numerics; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; +using static ILGPU.Runtime.Velocity.VelocityWarpOperations64; + +namespace ILGPU.Runtime.Velocity +{ + partial struct VelocityWarp64 + { + #region Unary Operations + + public VelocityWarp64 NegI() => + new VelocityWarp64(-LowerAs(), -UpperAs()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 NegU() => + new VelocityWarp64(~LowerAs(), ~UpperAs()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 NegF() => + new VelocityWarp64(~LowerAs(), ~UpperAs()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 Not() + { + // Special implementation for ARM + if (IsVector128 && AdvSimd.IsSupported) + { + return new VelocityWarp64( + AdvSimd.Not(LowerAs().AsVector128()).AsVector(), + AdvSimd.Not(UpperAs().AsVector128()).AsVector()); + } + + return new VelocityWarp64( + Vector.OnesComplement(LowerAs()), + Vector.OnesComplement(UpperAs())); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 AbsI() => + new VelocityWarp64( + Vector.Abs(LowerAs()), + Vector.Abs(UpperAs())); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 AbsF() => + new VelocityWarp64( + Vector.Abs(LowerAs()), + Vector.Abs(UpperAs())); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 IsNotNanF() => + new VelocityWarp64( + Vector.Equals(LowerAs(), LowerAs()), + Vector.Equals(UpperAs(), UpperAs())); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 IsNanF() => IsNotNanF().Not(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 IsNanF32() => 
IsNanF().NarrowI(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 SqrtF() => + new VelocityWarp64( + Vector.SquareRoot(LowerAs()), + Vector.SquareRoot(UpperAs())); + + #endregion + + #region Binary Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 And(VelocityWarp64 other) => + new VelocityWarp64( + Vector.BitwiseAnd(lowerData, other.lowerData), + Vector.BitwiseAnd(upperData, other.upperData)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 Or(VelocityWarp64 other) => + new VelocityWarp64( + Vector.BitwiseOr(lowerData, other.lowerData), + Vector.BitwiseOr(upperData, other.upperData)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 AndNot(VelocityWarp64 other) => + new VelocityWarp64( + Vector.AndNot(lowerData, other.lowerData), + Vector.AndNot(upperData, other.upperData)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 Xor(VelocityWarp64 other) => + Or(other).AndNot(And(other)); + + #endregion + + #region Ternary Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 MultiplyAddI(VelocityWarp64 second, VelocityWarp64 third) => + this.MulI(second).AddI(third); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 MultiplyAddU(VelocityWarp64 second, VelocityWarp64 third) => + this.MulU(second).AddU(third); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 MultiplyAddF(VelocityWarp64 second, VelocityWarp64 third) + { + // Special implementation for X86 + if (IsVector128 && Fma.IsSupported) + { + return new VelocityWarp64( + Fma.MultiplyAdd( + LowerAs().AsVector128(), + second.LowerAs().AsVector128(), + second.LowerAs().AsVector128()).AsVector(), + Fma.MultiplyAdd( + UpperAs().AsVector128(), + second.UpperAs().AsVector128(), + second.UpperAs().AsVector128()).AsVector()); + } + + // Special implementation for X86 + 
if (IsVector256 && Fma.IsSupported) + { + return new VelocityWarp64( + Fma.MultiplyAdd( + LowerAs().AsVector256(), + second.LowerAs().AsVector256(), + second.LowerAs().AsVector256()).AsVector(), + Fma.MultiplyAdd( + UpperAs().AsVector256(), + second.UpperAs().AsVector256(), + second.UpperAs().AsVector256()).AsVector()); + } + + return this.Mul(second).Add(third); + } + + #endregion + } + + partial class VelocityWarpOperations64 + { + #region General Operations + + /// + /// Dumps the given warp to the default console output. + /// + public static void Dump(this VelocityWarp64 warp) => + Console.WriteLine(warp.ToString()); + + public static VelocityWarp64 GetLaneIndexVector() => + VelocityWarp64.LaneIndexVector; + + #endregion + + #region Merge Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 Merge( + this VelocityWarp64 left, + VelocityWarp64 right, + VelocityWarp64 rightMask) => + new VelocityWarp64( + Vector.ConditionalSelect( + rightMask.LowerAs(), + right.LowerAs(), + left.LowerAs()), + Vector.ConditionalSelect( + rightMask.UpperAs(), + right.UpperAs(), + left.UpperAs())); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 MergeWithMask( + this VelocityWarp64 left, + VelocityWarp64 right, + VelocityLaneMask rightMask) + { + var maskVector = VelocityWarp64.FromMask(rightMask); + return Merge(left, right, maskVector); + } + + #endregion + + #region Convert Operations + + /// + /// Does not perform a conversion. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertNop(this VelocityWarp64 value) => value; + + /// + /// Converts the given 64bit integer warp to a 64bit integer warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertIToI(this VelocityWarp64 value) => + ConvertNop(value); + + /// + /// Converts the given 64bit unsigned integer warp to a 64bit unsigned integer + /// warp. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertUToU(this VelocityWarp64 value) => + ConvertNop(value); + + /// + /// Converts the given 64bit float warp to a 64bit float warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertFToF(this VelocityWarp64 value) => + ConvertNop(value); + + + /// + /// Converts the given 64bit integer warp to a 64bit unsigned integer warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertIToU(this VelocityWarp64 value) => + ConvertNop(value); + + /// + /// Converts the given 64bit unsigned integer warp to a 64bit integer warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertUToI(this VelocityWarp64 value) => + ConvertNop(value); + + /// + /// Converts the given 64bit integer warp to a 64bit float warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertIToF(this VelocityWarp64 value) + { + var lower = Vector.ConvertToDouble(value.LowerAs()); + var upper = Vector.ConvertToDouble(value.UpperAs()); + return new VelocityWarp64(lower, upper); + } + + /// + /// Converts the given 64bit unsigned integer warp to a 64bit float warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertUToF(this VelocityWarp64 value) + { + var lower = Vector.ConvertToDouble(value.LowerAs()); + var upper = Vector.ConvertToDouble(value.UpperAs()); + return new VelocityWarp64(lower, upper); + } + + /// + /// Converts the given 64bit float warp to a 64bit integer warp. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertFToI(this VelocityWarp64 value) + { + var lower = Vector.ConvertToInt64(value.LowerAs()); + var upper = Vector.ConvertToInt64(value.UpperAs()); + return new VelocityWarp64(lower, upper); + } + + /// + /// Converts the given 64bit float warp to a 64bit unsigned integer warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 ConvertFToU(this VelocityWarp64 value) + { + var lower = Vector.ConvertToUInt64(value.LowerAs()); + var upper = Vector.ConvertToUInt64(value.UpperAs()); + return new VelocityWarp64(lower, upper); + } + + /// + /// Narrows the given warp to a 32bit int warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 NarrowI(this VelocityWarp64 warp) => + Vector.Narrow(warp.LowerAs(), warp.UpperAs()); + + /// + /// Narrows the given warp to a 32bit uint warp. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 NarrowU(this VelocityWarp64 warp) => + Vector.Narrow(warp.LowerAs(), warp.UpperAs()); + + /// + /// Narrows the given warp to a 32bit float uint warp. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 NarrowF(this VelocityWarp64 warp) => + Vector.Narrow(warp.LowerAs(), warp.UpperAs()); + + #endregion + + #region Binary Operations + + public static VelocityWarp64 ComputeRemI( + this VelocityWarp64 left, + VelocityWarp64 right) => + SubI(left, MulU(DivI(left, right), right)); + + public static VelocityWarp64 ComputeRemU( + this VelocityWarp64 left, + VelocityWarp64 right) => + SubU(left, MulU(DivU(left, right), right)); + + public static VelocityWarp64 ComputeRemF( + this VelocityWarp64 left, + VelocityWarp64 right) => + Sub(left, Abs(Mul(Div(left, right), right))); + + #endregion + } + + partial class VelocityOperations + { + #region General 64bit Operations + + private readonly MethodInfo[] convertNarrowOperations64 = new MethodInfo[3] + { + GetMethod( + typeof(VelocityWarpOperations64), + nameof(VelocityWarpOperations64.NarrowI)), + GetMethod( + typeof(VelocityWarpOperations64), + nameof(VelocityWarpOperations64.NarrowU)), + GetMethod( + typeof(VelocityWarpOperations64), + nameof(VelocityWarpOperations64.NarrowF)), + }; + + private void InitVelocityOperations64(Type operationType64) + { + DumpMethod64 = GetMethod( + operationType64, + nameof(VelocityWarpOperations64.Dump)); + LaneIndexVectorOperation64 = GetMethod( + operationType64, + nameof(GetLaneIndexVector)); + + MergeOperation64 = GetMethod( + operationType64, + nameof(VelocityWarpOperations64.Merge)); + MergeWithMaskOperation64 = GetMethod( + operationType64, + nameof(VelocityWarpOperations64.MergeWithMask)); + + FromMaskOperation64 = GetMethod( + typeof(VelocityWarp64), + nameof(VelocityWarp64.FromMask)); + ToMaskOperation64 = GetMethod( + typeof(VelocityWarp64), + nameof(VelocityWarp64.ToMask)); + } + + public MethodInfo DumpMethod64 { get; private set; } + public MethodInfo LaneIndexVectorOperation64 { get; private set; } + + public MethodInfo MergeOperation64 { get; private set; } + public MethodInfo 
MergeWithMaskOperation64 { get; private set; } + + public MethodInfo FromMaskOperation64 { get; private set; } + public MethodInfo ToMaskOperation64 { get; private set; } + + public MethodInfo GetConvertNarrowOperation64(VelocityWarpOperationMode mode) => + convertNarrowOperations64[(int)mode]; + + #endregion + + #region Constant Values + + private readonly MethodInfo[] constValueOperations64 = new MethodInfo[] + { + GetMethod(typeof(VelocityWarp64), nameof(VelocityWarp64.GetConstI)), + GetMethod(typeof(VelocityWarp64), nameof(VelocityWarp64.GetConstU)), + GetMethod(typeof(VelocityWarp64), nameof(VelocityWarp64.GetConstF)), + }; + + public MethodInfo GetConstValueOperation64(VelocityWarpOperationMode mode) => + constValueOperations64[(int)mode]; + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityWarp64.cs b/Src/ILGPU/Runtime/Velocity/VelocityWarp64.cs new file mode 100644 index 000000000..ded3e9709 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityWarp64.cs @@ -0,0 +1,479 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityWarp64.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using static ILGPU.Runtime.Velocity.VelocityWarpOperations64; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a 64bit-wide extra-wide warp. + /// + internal readonly partial struct VelocityWarp64 : IEquatable + { + #region Runtime Constants + + /// + /// The constant short vector. 
+ /// + public static VelocityWarp64 GetConstI(long value) => new VelocityWarp64( + new Vector(value), + new Vector(value)); + + /// + /// The constant short vector. + /// + public static VelocityWarp64 GetConstU(ulong value) => new VelocityWarp64( + new Vector(value), + new Vector(value)); + + /// + /// The constant float vector. + /// + public static VelocityWarp64 GetConstF(double value) => new VelocityWarp64( + new Vector(value), + new Vector(value)); + + #endregion + + #region Static + + /// + /// True if this vector is a 128 bit vector. + /// + public static readonly bool IsVector128 = + Vector.Count == Vector128.Count; + + /// + /// True if this vector is a 256 bit vector. + /// + public static readonly bool IsVector256 = + Vector.Count == Vector256.Count; + + /// + /// Represents the raw vector length for a single sub-warp element. + /// + public static readonly int RawVectorLength = Vector.Count; + + /// + /// Represents the number of elements of this warp. + /// + public static readonly int Length = RawVectorLength * 2; + + private static VelocityWarp64 CreateLaneIndexVector() + { + Span indices = stackalloc ulong[Length]; + for (int i = 0; i < indices.Length; ++i) + indices[i] = (ulong)i; + return new VelocityWarp64( + new Vector(indices[..RawVectorLength]), + new Vector(indices.Slice(RawVectorLength, RawVectorLength))); + } + + /// + /// The static lane index vector. + /// + public static readonly VelocityWarp64 LaneIndexVector = + CreateLaneIndexVector(); + + /// + /// Constructs a velocity warp from a given lane mask. + /// + /// The lane mask. + /// The constructed velocity warp. 
+ [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public static VelocityWarp64 FromMask(VelocityLaneMask mask) + { + if (IsVector128) + { + // We know that there will be 4 lanes + return new VelocityWarp64( + Vector128.Create( + mask.GetActivityMaskL(0), + mask.GetActivityMaskL(1)).AsVector(), + Vector128.Create( + mask.GetActivityMaskL(2), + mask.GetActivityMaskL(3)).AsVector()); + } + + if (IsVector256) + { + // We know that there will be 8 lanes + return new VelocityWarp64( + Vector256.Create( + mask.GetActivityMaskL(0), + mask.GetActivityMaskL(1), + mask.GetActivityMaskL(2), + mask.GetActivityMaskL(3)).AsVector(), + Vector256.Create( + mask.GetActivityMaskL(4), + mask.GetActivityMaskL(5), + mask.GetActivityMaskL(6), + mask.GetActivityMaskL(7)).AsVector()); + } + + // Use generic stack-based method + Span target = stackalloc ulong[Length]; + for (int index = 0; index < target.Length; ++index) + target[index] = mask.GetActivityMaskL(index); + return new VelocityWarp64( + new Vector(target[..RawVectorLength]), + new Vector(target.Slice(RawVectorLength, RawVectorLength))); + } + + /// + /// Converts the given velocity warp into a lane mask. + /// + /// The warp to convert into a lane mask. + /// The converted lane mask. 
+ [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public static VelocityLaneMask ToMask(VelocityWarp64 warp) + { + var lowerRawVec = warp.lowerData; + var upperRawVec = warp.upperData; + if (IsVector128) + { + var lowerVec = lowerRawVec.AsVector128(); + var upperVec = upperRawVec.AsVector128(); + + var mask0 = VelocityLaneMask.Get(0, lowerVec.GetElement(0)); + var mask1 = VelocityLaneMask.Get(1, lowerVec.GetElement(1)); + var mask2 = VelocityLaneMask.Get(2, upperVec.GetElement(0)); + var mask3 = VelocityLaneMask.Get(3, upperVec.GetElement(1)); + return mask0 | mask1 | mask2 | mask3; + } + + if (IsVector256) + { + // We know that there will be 8 lanes + var lowerVec = lowerRawVec.AsVector256(); + var upperVec = upperRawVec.AsVector256(); + + var mask0 = VelocityLaneMask.Get(0, lowerVec.GetElement(0)); + var mask1 = VelocityLaneMask.Get(1, lowerVec.GetElement(1)); + var mask2 = VelocityLaneMask.Get(2, lowerVec.GetElement(2)); + var mask3 = VelocityLaneMask.Get(3, lowerVec.GetElement(3)); + + var mask4 = VelocityLaneMask.Get(0, upperVec.GetElement(0)); + var mask5 = VelocityLaneMask.Get(1, upperVec.GetElement(1)); + var mask6 = VelocityLaneMask.Get(2, upperVec.GetElement(2)); + var mask7 = VelocityLaneMask.Get(3, upperVec.GetElement(3)); + return mask0 | mask1 | mask2 | mask3 | mask4 | mask5 | mask6 | mask7; + } + + // Use generic method + var mask = VelocityLaneMask.None; + for (int index = 0; index < RawVectorLength; ++index) + mask |= VelocityLaneMask.Get(index, warp.lowerData[index]); + for (int index = 0; index < RawVectorLength; ++index) + { + mask |= VelocityLaneMask.Get( + index + RawVectorLength, + warp.upperData[index]); + } + return mask; + } + + #endregion + + #region Instance + + private readonly Vector lowerData; + private readonly Vector upperData; + + /// + /// Creates a new warp instance using the given data vectors. 
+ /// + public unsafe VelocityWarp64(ulong* data) + : this(new ReadOnlySpan(data, Length)) + { } + + /// + /// Creates a new warp instance using the given data vectors. + /// + public VelocityWarp64(ReadOnlySpan data) + : this( + new Vector(data[..RawVectorLength]), + new Vector(data.Slice(RawVectorLength, RawVectorLength))) + { } + + /// + /// Creates a new warp instance using the given data vectors. + /// + public VelocityWarp64(ReadOnlySpan data) + : this( + new Vector(data[..RawVectorLength]), + new Vector(data.Slice(RawVectorLength, RawVectorLength))) + { } + + /// + /// Creates a new warp instance using the given data vectors. + /// + public VelocityWarp64(Vector lower, Vector upper) + { + lowerData = lower; + upperData = upper; + } + + /// + /// Creates a new warp instance using the given data vectors. + /// + public VelocityWarp64(Vector lower, Vector upper) + : this(lower.As(), upper.As()) + { } + + /// + /// Creates a new warp instance using the given data vectors. + /// + public VelocityWarp64(Vector lower, Vector upper) + : this(lower.As(), upper.As()) + { } + + #endregion + + #region Properties + + /// + /// Returns the value of the specified lane using specialized implementations. + /// + public ulong this[int laneIndex] + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => laneIndex < RawVectorLength + ? 
GetLowerElement(laneIndex) + : GetUpperElement(laneIndex - RawVectorLength); + } + + #endregion + + #region Methods + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe T LoadFromPtr(int laneIndex) + where T : unmanaged + { + void* ptr = GetElementPtr(laneIndex); + return Unsafe.Read(ptr); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe void StoreToPtr(int laneIndex, T value) + where T : unmanaged + { + void* ptr = GetElementPtr(laneIndex); + Unsafe.Write(ptr, value); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public double GetElementF(int laneIndex) => + laneIndex < RawVectorLength + ? GetLowerElementF(laneIndex) + : GetUpperElementF(laneIndex - RawVectorLength); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe void* GetElementPtr(int laneIndex) => + laneIndex < RawVectorLength + ? GetLowerElementPtr(laneIndex) + : GetUpperElementPtr(laneIndex - RawVectorLength); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ulong GetLowerElement(int laneIndex) + { + if (IsVector128) + return lowerData.AsVector128().GetElement(laneIndex); + if (IsVector256) + return lowerData.AsVector256().GetElement(laneIndex); + return lowerData[laneIndex]; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public double GetLowerElementF(int laneIndex) => + Interop.IntAsFloat(GetLowerElement(laneIndex)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe void* GetLowerElementPtr(int laneIndex) => + new IntPtr((long)GetLowerElement(laneIndex)).ToPointer(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ulong GetUpperElement(int laneIndex) + { + if (IsVector128) + return upperData.AsVector128().GetElement(laneIndex); + if (IsVector256) + return upperData.AsVector256().GetElement(laneIndex); + return upperData[laneIndex]; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public double GetUpperElementF(int laneIndex) => + 
Interop.IntAsFloat(GetUpperElement(laneIndex)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe void* GetUpperElementPtr(int laneIndex) => + new IntPtr((long)GetUpperElement(laneIndex)).ToPointer(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Vector LowerAs() where T : struct => lowerData.As(); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Vector UpperAs() where T : struct => upperData.As(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 Mask(VelocityWarp64 mask) => + new VelocityWarp64( + lowerData & mask.lowerData, + upperData & mask.upperData); + + #endregion + + #region Atomics + + internal interface IAtomicOperation + where T : unmanaged + { + T Atomic(ref T target, T value); + } + + private readonly struct AtomicScalarOperation : IScalarUOperation + where T : unmanaged + where TOperation : struct, IAtomicOperation + { + private readonly VelocityWarp64 target; + private readonly VelocityLaneMask mask; + + public AtomicScalarOperation( + VelocityWarp64 targetWarp, + VelocityLaneMask warpMask) + { + target = targetWarp; + mask = warpMask; + } + + [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public unsafe ulong Apply(int index, ulong value) + { + ulong targetAddress = target[index]; + ref T managedRef = ref Unsafe.AsRef((void*)targetAddress); + + TOperation op = default; + T convertedValue = Unsafe.As(ref value); + + if (!mask.IsActive(index)) + return 0UL; + T result = op.Atomic(ref managedRef, convertedValue); + return Unsafe.As(ref result); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal VelocityWarp64 Atomic( + VelocityWarp64 target, + VelocityLaneMask mask) + where T : unmanaged + where TOperation : struct, IAtomicOperation => + this.ApplyScalarUOperation( + new AtomicScalarOperation(target, mask)); + + private readonly struct AtomicCompareExchangeOperation : IScalarUOperation + { + 
private readonly VelocityWarp64 target; + private readonly VelocityWarp64 compare; + private readonly VelocityLaneMask mask; + + public AtomicCompareExchangeOperation( + VelocityWarp64 targetWarp, + VelocityWarp64 compareWarp, + VelocityLaneMask warpMask) + { + target = targetWarp; + compare = compareWarp; + mask = warpMask; + } + + [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public unsafe ulong Apply(int index, ulong value) + { + ulong targetAddress = target[index]; + ref ulong managedRef = ref Unsafe.AsRef((void*)targetAddress); + ulong compareVal = compare[index]; + + if (!mask.IsActive(index)) + return compareVal; + return ILGPU.Atomic.CompareExchange( + ref managedRef, + compare[index], + value); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 AtomicCompareExchange( + VelocityWarp64 target, + VelocityWarp64 compare, + VelocityLaneMask mask) => + this.ApplyScalarUOperation( + new AtomicCompareExchangeOperation(target, compare, mask)); + + #endregion + + #region IEquatable + + /// + /// Returns true if the given warp is equal to the current one in terms of its + /// lane values. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool Equals(VelocityWarp64 other) => + lowerData.Equals(other.lowerData) && upperData.Equals(other.upperData); + + #endregion + + #region Object + + /// + /// Returns true if the given object is equal to the current one in terms of + /// its lane values. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override bool Equals(object obj) => + obj is VelocityWarp64 other && Equals(other); + + /// + /// Returns the hash code of the underlying vector data. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override int GetHashCode() => +#if NETSTANDARD2_1_OR_GREATER || NET5_0_OR_GREATER + HashCode.Combine(lowerData.GetHashCode(), upperData.GetHashCode()); +#else + lowerData.GetHashCode() ^ upperData.GetHashCode(); +#endif + + /// + /// Returns the string representation of the underlying warp data. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override string ToString() => $"({lowerData}, {upperData})"; + + #endregion + + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityWarpVerifier.cs b/Src/ILGPU/Runtime/Velocity/VelocityWarpVerifier.cs new file mode 100644 index 000000000..040c14c05 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityWarpVerifier.cs @@ -0,0 +1,99 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityWarpVerifier.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details +// --------------------------------------------------------------------------------------- + +using System.Runtime.CompilerServices; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a sanity checker for warp operations. + /// + internal interface IVelocityWarpVerifier + { + /// + /// Verifies broadcast operations. + /// + /// The source lane vector. + void VerifyBroadcast(VelocityWarp32 sourceLane); + + /// + /// Verifies broadcast operations. + /// + /// The uniform source lane value. + void VerifyBroadcast(uint sourceLane); + + /// + /// Verifies shuffle operations. + /// + /// The source lanes vector. + void VerifyShuffle(VelocityWarp32 sourceLanes); + + /// + /// Verifies shuffle down operations. + /// + /// The delta lanes vector. + /// The width vector. 
+ void VerifyShuffleDown(VelocityWarp32 delta, VelocityWarp32 width); + + /// + /// Verifies shuffle up operations. + /// + /// The delta lanes vector. + /// The width vector. + void VerifyShuffleUp(VelocityWarp32 delta, VelocityWarp32 width); + + /// + /// Verifies shuffle xor operations. + /// + /// The delta lanes vector. + /// The width vector. + void VerifyShuffleXor(VelocityWarp32 delta, VelocityWarp32 width); + } + + /// + /// Default implementations for the interface. + /// + static class VelocityWarpVerifier + { + public static TVerifier GetVerifier() + where TVerifier : struct, IVelocityWarpVerifier => default; + + /// + /// Does not verify any values. + /// + public readonly struct Disabled : IVelocityWarpVerifier + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void VerifyBroadcast(VelocityWarp32 sourceLane) + { } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void VerifyBroadcast(uint sourceLane) + { } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void VerifyShuffle(VelocityWarp32 sourceLanes) + { } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void VerifyShuffleDown(VelocityWarp32 delta, VelocityWarp32 width) + { } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void VerifyShuffleUp(VelocityWarp32 delta, VelocityWarp32 width) + { } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void VerifyShuffleXor(VelocityWarp32 delta, VelocityWarp32 width) + { } + } + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityWarps.tt b/Src/ILGPU/Runtime/Velocity/VelocityWarps.tt new file mode 100644 index 000000000..05bbf6bbb --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityWarps.tt @@ -0,0 +1,1703 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityWarps.tt/VelocityWarps.cs +// +// This file is part of ILGPU and is distributed 
under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +<#@ template debug="false" hostspecific="true" language="C#" #> +<#@ include file="../../Static/TypeInformation.ttinclude" #> +<#@ assembly name="System.Core" #> +<#@ import namespace="System.Linq" #> +<#@ import namespace="System.Text" #> +<#@ import namespace="System.Collections.Generic" #> +<#@ output extension=".cs" #> +<# +string rootPath = Host.ResolvePath("../../Static"); +var unaryOps = GetUnaryMathOps(rootPath); +var binaryOps = GetBinaryMathOps(rootPath); +var ternaryOps = GetTernaryMathOps(rootPath); + +var implementationTypes32 = new (MathOpFlags Flags, string Prefix, string TypeName, string ImplTypeName)[] +{ + (MathOpFlags.Ints, "I", "int", "Int32"), + (MathOpFlags.BoolsAndInts, "U", "uint", "UInt32"), + (MathOpFlags.Floats, "F", "float", "Float") +}; +var implementationTypes64 = new (MathOpFlags Flags, string Prefix, string TypeName, string ImplTypeName)[] +{ + (MathOpFlags.Ints, "I", "long", "Int64"), + (MathOpFlags.BoolsAndInts, "U", "ulong", "UInt64"), + (MathOpFlags.Floats, "F", "double", "Double") +}; +var compareOperations = new (string Kind, string Op, string PostOp)[] +{ + ("Equal", "Equals", ""), + ("NotEqual", "Equals", ".Not()"), + ("LessThan", "LessThan", ""), + ("LessEqual", "LessThanOrEqual", ""), + ("GreaterThan", "GreaterThan", ""), + ("GreaterEqual", "GreaterThanOrEqual", "") +}; +var atomicTypes = new (string WarpName, int BitWidth, string ImplTypeName, TypeInformation[] Types)[] +{ + ("VelocityWarp32", 32, "VelocityWarpOperations32", AtomicNumericTypes32), + ("VelocityWarp64", 64, "VelocityWarpOperations64", AtomicNumericTypes64) +}; +var convTypeMultipliers = new int[] { 4, 8 }; +var warp32ConvTypes = new TypeInformation[] +{ + SignedIntTypes[0], + SignedIntTypes[1], + SignedIntTypes[2], + UnsignedIntTypes[0], + UnsignedIntTypes[1], + 
UnsignedIntTypes[2], + FloatTypes[0], + FloatTypes[1], +}; +var warp64ConvTypes = new TypeInformation[] +{ + SignedIntTypes[3], + UnsignedIntTypes[3], + FloatTypes[2], +}; +var warp32IOTypes = new TypeInformation[] +{ + UnsignedIntTypes[0], + UnsignedIntTypes[1], + UnsignedIntTypes[2], + FloatTypes[0], + FloatTypes[1], +}; +var warp64IOTypes = new TypeInformation[] +{ + UnsignedIntTypes[3], + FloatTypes[2], +}; +#> +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Numerics; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using ILGPU.IR.Values; + +// disable: max_line_length + +namespace ILGPU.Runtime.Velocity +{ + // Operation implementations + + enum VelocityWarpOperationMode + { + I, + U, + F, + D = F, + } + + static partial class VelocityWarpOperations32 + { + #region Scalar Operations + +<# foreach (var (_, typeName, implType, _) in implementationTypes32) { #> + + /// + /// A scalar operation working on <#= implType#>s. 
+ /// + internal interface IScalar<#= typeName #>Operation + { + <#= implType #> Apply(int index, <#= implType #> value); + } + + [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public static VelocityWarp32 ApplyScalar<#= typeName #>Operation( + this VelocityWarp32 warp, + TOperation operation) + where TOperation : struct, IScalar<#= typeName #>Operation + { + var source = warp.As<<#= implType #>>(); + if (VelocityWarp32.IsVector128) + { + // Use specialized Vector128 implementation + var sourceVec = source.AsVector128(); + return Vector128.Create( + operation.Apply(0, sourceVec.GetElement(0)), + operation.Apply(1, sourceVec.GetElement(1)), + operation.Apply(2, sourceVec.GetElement(2)), + operation.Apply(3, sourceVec.GetElement(3))) + .As<<#= implType #>, uint>() + .AsVector(); + } + else if (VelocityWarp32.IsVector256) + { + // Use specialized Vector256 implementation + var sourceVec = source.AsVector256(); + return Vector256.Create( + operation.Apply(0, sourceVec.GetElement(0)), + operation.Apply(1, sourceVec.GetElement(1)), + operation.Apply(2, sourceVec.GetElement(2)), + operation.Apply(3, sourceVec.GetElement(3)), + operation.Apply(4, sourceVec.GetElement(4)), + operation.Apply(5, sourceVec.GetElement(5)), + operation.Apply(6, sourceVec.GetElement(6)), + operation.Apply(7, sourceVec.GetElement(7))) + .As<<#= implType #>, uint>() + .AsVector(); + } + else + { + // Allocate memory locally on the stack + Span<<#= implType #>> sources = + stackalloc <#= implType #>[VelocityWarp32.RawVectorLength]; + Span<<#= implType #>> target = + stackalloc <#= implType #>[VelocityWarp32.RawVectorLength]; + + // Load data into temporary memory and operate on that + source.CopyTo(sources); + for (int i = 0; i < target.Length; ++i) + target[i] = operation.Apply(i, sources[i]); + return new Vector<<#= implType #>>(target) + .As<<#= implType #>, uint>(); + } + } + + /// + /// A binary scalar operation working on <#= implType#>s. 
+ /// + internal interface IBinaryScalar<#= typeName #>Operation + { + <#= implType #> Apply(int index, <#= implType #> left, <#= implType #> right); + } + + [MethodImpl( + MethodImplOptions.AggressiveInlining | + MethodImplOptions.AggressiveOptimization)] + public static VelocityWarp32 ApplyBinaryScalar<#= typeName #>Operation( + VelocityWarp32 left, + VelocityWarp32 right, + TOperation operation) + where TOperation : struct, IBinaryScalar<#= typeName #>Operation + { + var leftSource = left.As<<#= implType #>>(); + var rightSource = right.As<<#= implType #>>(); + if (VelocityWarp32.IsVector128) + { + // Use specialized Vector128 implementation + var leftVec = leftSource.AsVector128(); + var rightVec = rightSource.AsVector128(); + return Vector128.Create( + operation.Apply(0, leftVec.GetElement(0), rightVec.GetElement(0)), + operation.Apply(1, leftVec.GetElement(1), rightVec.GetElement(1)), + operation.Apply(2, leftVec.GetElement(2), rightVec.GetElement(2)), + operation.Apply(3, leftVec.GetElement(3), rightVec.GetElement(3))) + .As<<#= implType #>, uint>() + .AsVector(); + } + else if (VelocityWarp32.IsVector256) + { + // Use specialized Vector256 implementation + var leftVec = leftSource.AsVector256(); + var rightVec = rightSource.AsVector256(); + return Vector256.Create( + operation.Apply(0, leftVec.GetElement(0), rightVec.GetElement(0)), + operation.Apply(1, leftVec.GetElement(1), rightVec.GetElement(1)), + operation.Apply(2, leftVec.GetElement(2), rightVec.GetElement(2)), + operation.Apply(3, leftVec.GetElement(3), rightVec.GetElement(3)), + operation.Apply(4, leftVec.GetElement(4), rightVec.GetElement(4)), + operation.Apply(5, leftVec.GetElement(5), rightVec.GetElement(5)), + operation.Apply(6, leftVec.GetElement(6), rightVec.GetElement(6)), + operation.Apply(7, leftVec.GetElement(7), rightVec.GetElement(7))) + .As<<#= implType #>, uint>() + .AsVector(); + } + else + { + // Allocate memory locally on the stack + Span<<#= implType #>> leftSources = + stackalloc 
<#= implType #>[VelocityWarp32.RawVectorLength]; + Span<<#= implType #>> rightSources = + stackalloc <#= implType #>[VelocityWarp32.RawVectorLength]; + Span<<#= implType #>> target = + stackalloc <#= implType #>[VelocityWarp32.RawVectorLength]; + + // Load data into temporary memory and operate on that + leftSource.CopyTo(leftSources); + rightSource.CopyTo(rightSources); + for (int i = 0; i < target.Length; ++i) + target[i] = operation.Apply(i, leftSource[i], rightSource[i]); + + return new Vector<<#= implType #>>(target) + .As<<#= implType #>, uint>(); + } + } + +<# } #> + + #endregion + + #region Unary Operations + +<# foreach (var op in unaryOps) { #> +<# var config = op.GetVelocity32(); #> +<# foreach (var (flags, suffix, typeName, _) in + implementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? string.Empty + : suffix; #> +<# string interfaceSuffix = op.IsPredicate ? "U" : suffix; #> +<# string interfaceTypeName = op.IsPredicate ? "uint" : typeName; #> +<# if (config.SoftwareEmulation) { #> + private readonly struct <#= op.Name #><#= implSuffix #>Impl : + IScalar<#= interfaceSuffix #>Operation + { + public <#= interfaceTypeName #> Apply(int _, <#= interfaceTypeName #> value) => +<# if (op.IsPredicate) { #> + <#= op.GetOpOrCall(false, "Interop.IntAsFloat(value)") #> ? 
uint.MaxValue : 0; +<# } else { #> + (<#= typeName #>)<#= op.GetOpOrCall(false, "value") #>; +<# } #> + } + +<# } #> + public static VelocityWarp32 <#= op.Name #><#= implSuffix #>( + this VelocityWarp32 warp) => +<# if (config.SoftwareEmulation) { #> + warp.ApplyScalar<#= interfaceSuffix #>Operation( + new <#= op.Name #><#= implSuffix #>Impl()); +<# } else { #> + <#= config.GetImplementation( + "As", + suffix, + typeName, + "warp") #>; +<# } #> + +<# } #> +<# } #> + + #endregion + + #region Binary Operations + +<# foreach (var op in binaryOps) { #> +<# var config = op.GetVelocity32(); #> +<# foreach (var (flags, suffix, typeName, _) in + implementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? string.Empty + : suffix; #> +<# if (config.SoftwareEmulation) { #> + private readonly struct <#= op.Name #><#= implSuffix #>Impl : + IBinaryScalar<#= suffix #>Operation + { + public <#= typeName #> Apply(int _, <#= typeName #> left, <#= typeName #> right) => + (<#= typeName #>)<#= op.GetOpOrCall(false, "left", "right") #>; + } + +<# } #> + public static VelocityWarp32 <#= op.Name #><#= implSuffix #>( + this VelocityWarp32 left, + VelocityWarp32 right) => +<# if (config.SoftwareEmulation) { #> + ApplyBinaryScalar<#= suffix #>Operation( + left, + right, + new <#= op.Name #><#= implSuffix #>Impl()); +<# } else { #> + <#= config.GetImplementation( + "As", + suffix, + typeName, + "left", + "right") #>; +<# } #> + +<# } #> +<# } #> + + #endregion + + #region Ternary Operations + +<# foreach (var op in ternaryOps) { #> +<# var config = op.GetVelocity32(); #> +<# foreach (var (flags, suffix, typeName, _) in + implementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? 
string.Empty + : suffix; #> + public static VelocityWarp32 <#= op.Name #><#= implSuffix #>( + this VelocityWarp32 first, + VelocityWarp32 second, + VelocityWarp32 third) => + <#= config.GetImplementation( + "As", + suffix, + typeName, + "first", + "second", + "third") #>; + +<# } #> +<# } #> + #endregion + + #region Compare Operations + +<# foreach (var (kind, op, postOp) in compareOperations) { #> + public static VelocityWarp32 Compare<#= kind #>I( + VelocityWarp32 left, + VelocityWarp32 right) => + new VelocityWarp32(Vector.<#= op #>(left.As(), right.As())) + <#= postOp #>; + + public static VelocityWarp32 Compare<#= kind #>U( + VelocityWarp32 left, + VelocityWarp32 right) => + new VelocityWarp32(Vector.<#= op #>(left.As(), right.As())) + <#= postOp #>; + + public static VelocityWarp32 Compare<#= kind #>F( + VelocityWarp32 left, + VelocityWarp32 right) => + new VelocityWarp32(Vector.<#= op #>(left.As(), right.As())) + <#= postOp #>; + +<# } #> + + #endregion + + #region Convert Operations + +<# foreach (var sourceType in warp32ConvTypes) { #> +<# foreach (var targetType in warp32ConvTypes) { #> +<# string suffix = targetType.Kind == TypeInformationKind.SignedInt + ? "I" : targetType.Kind == TypeInformationKind.UnsignedInt ? 
"U" : "F"; #> +<# foreach (int multiplier in convTypeMultipliers) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 Convert<#= sourceType.Name #>To<#= targetType.Name #>_<#= multiplier #>( + this VelocityWarp32 value) + { +<# if (sourceType.GetBasicValueType() == targetType.GetBasicValueType()) { #> + return value; +<# } else { #> + var data = new Data.Warp32_<#= sourceType.Name #>_<#= multiplier #>(value); + var targetData = data.ConvertTo<#= targetType.Name #>(); + return targetData.ToWarp<#= suffix #>(); +<# } #> + } + +<# } #> +<# } #> +<# } #> + + #endregion + } + + static partial class VelocityWarpOperations64 + { + #region Static Properties + + public static readonly int Length = VelocityWarp64.Length; + public static readonly int RawLength = VelocityWarp64.RawVectorLength; + + #endregion + + #region Scalar Operations + +<# foreach (var (_, typeName, implType, _) in implementationTypes64) { #> + /// + /// A scalar operation working on <#= implType #>s. 
+        /// </summary>
+        internal interface IScalar<#= typeName #>Operation
+        {
+            <#= implType #> Apply(int index, <#= implType #> value);
+        }
+
+        /// <summary>
+        /// Applies the given scalar operation to all lanes of the given warp.
+        /// </summary>
+        [MethodImpl(
+            MethodImplOptions.AggressiveInlining |
+            MethodImplOptions.AggressiveOptimization)]
+        internal static VelocityWarp64 ApplyScalar<#= typeName #>Operation<TOperation>(
+            this VelocityWarp64 warp,
+            TOperation operation)
+            where TOperation : struct, IScalar<#= typeName #>Operation
+        {
+            Vector<<#= implType #>> lowerResult;
+            Vector<<#= implType #>> upperResult;
+
+            if (VelocityWarp64.IsVector128)
+            {
+                // Use specialized Vector128 implementation
+                var lowerVec = warp.LowerAs<<#= implType #>>().AsVector128();
+                lowerResult = Vector128.Create(
+                    operation.Apply(0, lowerVec.GetElement(0)),
+                    operation.Apply(1, lowerVec.GetElement(1)))
+                    .AsVector();
+
+                var upperVec = warp.UpperAs<<#= implType #>>().AsVector128();
+                upperResult = Vector128.Create(
+                    operation.Apply(2, upperVec.GetElement(0)),
+                    operation.Apply(3, upperVec.GetElement(1)))
+                    .AsVector();
+            }
+            else if (VelocityWarp64.IsVector256)
+            {
+                // Use specialized Vector256 implementation
+                var lowerVec = warp.LowerAs<<#= implType #>>().AsVector256();
+                lowerResult = Vector256.Create(
+                    operation.Apply(0, lowerVec.GetElement(0)),
+                    operation.Apply(1, lowerVec.GetElement(1)),
+                    operation.Apply(2, lowerVec.GetElement(2)),
+                    operation.Apply(3, lowerVec.GetElement(3)))
+                    .AsVector();
+
+                var upperVec = warp.UpperAs<<#= implType #>>().AsVector256();
+                upperResult = Vector256.Create(
+                    operation.Apply(4, upperVec.GetElement(0)),
+                    operation.Apply(5, upperVec.GetElement(1)),
+                    operation.Apply(6, upperVec.GetElement(2)),
+                    operation.Apply(7, upperVec.GetElement(3)))
+                    .AsVector();
+            }
+            else
+            {
+                // Allocate memory locally on the stack
+                Span<<#= implType #>> target = stackalloc <#= implType #>[Length];
+
+                // Fill target data from both chunks. Note that the upper chunk
+                // occupies [RawLength..Length]; an empty slice here would make
+                // Vector<T>.CopyTo throw an ArgumentException at runtime.
+                warp.LowerAs<<#= implType #>>().CopyTo(target[0..RawLength]);
+                warp.UpperAs<<#= implType #>>().CopyTo(target[RawLength..Length]);
+
+                // Process data in temporary memory
+                for (int i = 0; i < Length; ++i)
+                    target[i] = operation.Apply(i, target[i]);
+
+                lowerResult = new Vector<<#= implType #>>(target[0..RawLength]);
+                upperResult = new Vector<<#= implType #>>(target[RawLength..Length]);
+            }
+
+            // Assemble the actual 64bit velocity warp
+            return new VelocityWarp64(
+                lowerResult.As<<#= implType #>, ulong>(),
+                upperResult.As<<#= implType #>, ulong>());
+        }
+
+        /// <summary>
+        /// A binary scalar operation working on <#= implType #>s.
+        /// </summary>
+        internal interface IBinaryScalar<#= typeName #>Operation
+        {
+            <#= implType #> Apply(int index, <#= implType #> left, <#= implType #> right);
+        }
+
+        /// <summary>
+        /// Applies the given binary scalar operation to all lanes of both warps.
+        /// </summary>
+        [MethodImpl(
+            MethodImplOptions.AggressiveInlining |
+            MethodImplOptions.AggressiveOptimization)]
+        internal static VelocityWarp64 ApplyBinaryScalar<#= typeName #>Operation<TOperation>(
+            this VelocityWarp64 left,
+            VelocityWarp64 right,
+            TOperation operation)
+            where TOperation : struct, IBinaryScalar<#= typeName #>Operation
+        {
+            Vector<<#= implType #>> lowerResult;
+            Vector<<#= implType #>> upperResult;
+
+            if (VelocityWarp64.IsVector128)
+            {
+                // Use specialized Vector128 implementation
+                var lowerLeft = left.LowerAs<<#= implType #>>().AsVector128();
+                var lowerRight = right.LowerAs<<#= implType #>>().AsVector128();
+                lowerResult = Vector128.Create(
+                    operation.Apply(0, lowerLeft.GetElement(0), lowerRight.GetElement(0)),
+                    operation.Apply(1, lowerLeft.GetElement(1), lowerRight.GetElement(1)))
+                    .AsVector();
+
+                var upperLeft = left.UpperAs<<#= implType #>>().AsVector128();
+                var upperRight = right.UpperAs<<#= implType #>>().AsVector128();
+                upperResult = Vector128.Create(
+                    operation.Apply(2, upperLeft.GetElement(0), upperRight.GetElement(0)),
+                    operation.Apply(3, upperLeft.GetElement(1), upperRight.GetElement(1)))
+                    .AsVector();
+            }
+            else if (VelocityWarp64.IsVector256)
+            {
+                // Use specialized Vector256 implementation
+                var lowerLeft = left.LowerAs<<#= implType #>>().AsVector256();
+                var lowerRight = right.LowerAs<<#= implType #>>().AsVector256();
+                lowerResult = Vector256.Create(
+                    operation.Apply(0, lowerLeft.GetElement(0), lowerRight.GetElement(0)),
+                    operation.Apply(1, lowerLeft.GetElement(1), lowerRight.GetElement(1)),
+                    operation.Apply(2, lowerLeft.GetElement(2), lowerRight.GetElement(2)),
+                    operation.Apply(3, lowerLeft.GetElement(3), lowerRight.GetElement(3)))
+                    .AsVector();
+
+                var upperLeft = left.UpperAs<<#= implType #>>().AsVector256();
+                var upperRight = right.UpperAs<<#= implType #>>().AsVector256();
+                upperResult = Vector256.Create(
+                    operation.Apply(4, upperLeft.GetElement(0), upperRight.GetElement(0)),
+                    operation.Apply(5, upperLeft.GetElement(1), upperRight.GetElement(1)),
+                    operation.Apply(6, upperLeft.GetElement(2), upperRight.GetElement(2)),
+                    operation.Apply(7, upperLeft.GetElement(3), upperRight.GetElement(3)))
+                    .AsVector();
+            }
+            else
+            {
+                // Allocate memory locally on the stack
+                Span<<#= implType #>> tempLeft = stackalloc <#= implType #>[Length];
+                Span<<#= implType #>> target = stackalloc <#= implType #>[Length];
+
+                // Fill target data from both chunks (upper halves live in
+                // [RawLength..Length]; an empty slice would throw in CopyTo)
+                left.LowerAs<<#= implType #>>().CopyTo(tempLeft[0..RawLength]);
+                left.UpperAs<<#= implType #>>().CopyTo(tempLeft[RawLength..Length]);
+                right.LowerAs<<#= implType #>>().CopyTo(target[0..RawLength]);
+                right.UpperAs<<#= implType #>>().CopyTo(target[RawLength..Length]);
+
+                // Process data in temporary memory
+                for (int i = 0; i < Length; ++i)
+                    target[i] = operation.Apply(i, tempLeft[i], target[i]);
+
+                lowerResult = new Vector<<#= implType #>>(target[0..RawLength]);
+                upperResult = new Vector<<#= implType #>>(target[RawLength..Length]);
+            }
+
+            // Assemble the actual 64bit velocity warp
+            return new VelocityWarp64(
+                lowerResult.As<<#= implType #>, ulong>(),
+                upperResult.As<<#= implType #>, ulong>());
+        }
+
+<# } #>
+
+        #endregion
+
+        #region Unary Operations
+
+<# foreach (var op in unaryOps) { #>
+<# var config = op.GetVelocity64(); #>
+<# foreach (var (flags, suffix, typeName, _) in
implementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? string.Empty + : suffix; #> +<# string interfaceSuffix = op.IsPredicate ? "U" : suffix; #> +<# string interfaceTypeName = op.IsPredicate ? "ulong" : typeName; #> +<# if (config.SoftwareEmulation) { #> + private readonly struct <#= op.Name #><#= implSuffix #>Impl : + IScalar<#= interfaceSuffix #>Operation + { + public <#= interfaceTypeName #> Apply(int _, <#= interfaceTypeName #> value) => +<# if (op.IsPredicate) { #> + <#= op.GetOpOrCall(false, "Interop.IntAsFloat(value)") #> ? ulong.MaxValue : 0UL; +<# } else { #> + (<#= typeName #>)<#= op.GetOpOrCall(false, "value") #>; +<# } #> + } + +<# } #> + public static VelocityWarp<#= config.ReturnAsWarp32 ? "32" : "64" #> <#= op.Name #><#= implSuffix #>( + this VelocityWarp64 warp) => +<# if (config.SoftwareEmulation) { #> + warp.ApplyScalar<#= interfaceSuffix #>Operation( + new <#= op.Name #><#= implSuffix #>Impl()) +<# if (config.ReturnAsWarp32) { #> + .NarrowI() +<# } #>; +<# } else { #> + <#= config.GetImplementation( + "As", + suffix, + typeName, + "warp") #>; +<# } #> + +<# } #> +<# } #> + + #endregion + + #region Binary Operations + +<# foreach (var op in binaryOps) { #> +<# var config = op.GetVelocity64(); #> +<# foreach (var (flags, suffix, typeName, _) in + implementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? 
string.Empty + : suffix; #> +<# if (config.SoftwareEmulation) { #> + private readonly struct <#= op.Name #><#= implSuffix #>Impl : + IBinaryScalar<#= suffix #>Operation + { + public <#= typeName #> Apply(int _, <#= typeName #> left, <#= typeName #> right) => + (<#= typeName #>)<#= op.GetOpOrCall(false, "left", "right") #>; + } + +<# } #> + public static VelocityWarp64 <#= op.Name #><#= implSuffix #>( + this VelocityWarp64 left, + VelocityWarp64 right) => +<# if (config.SoftwareEmulation) { #> + ApplyBinaryScalar<#= suffix #>Operation( + left, + right, + new <#= op.Name #><#= implSuffix #>Impl()); +<# } else if (config.RequiresSpecialization) { #> + new VelocityWarp64( + <#= config.GetImplementation( + "LowerAs", + suffix, + typeName, + "left", + "right") #>, + <#= config.GetImplementation( + "UpperAs", + suffix, + typeName, + "left", + "right") #>); +<# } else { #> + <#= config.GetImplementation( + null, + suffix, + typeName, + "left", + "right") #>; +<# } #> + +<# } #> +<# } #> + #endregion + + #region Ternary Operations + +<# foreach (var op in ternaryOps) { #> +<# var config = op.GetVelocity64(); #> +<# foreach (var (flags, suffix, typeName, _) in + implementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? 
string.Empty + : suffix; #> + public static VelocityWarp64 <#= op.Name #><#= implSuffix #>( + this VelocityWarp64 first, + VelocityWarp64 second, + VelocityWarp64 third) => + <#= config.GetImplementation( + "As", + suffix, + typeName, + "first", + "second", + "third") #>; + +<# } #> +<# } #> + + #endregion + + #region Compare Operations + +<# foreach (var (kind, op, postOp) in compareOperations) { #> + public static VelocityWarp64 Compare<#= kind #>I( + VelocityWarp64 left, + VelocityWarp64 right) => + new VelocityWarp64( + Vector.<#= op #>(left.LowerAs(), right.LowerAs()), + Vector.<#= op #>(left.UpperAs(), right.UpperAs())) + <#= postOp #>; + + public static VelocityWarp64 Compare<#= kind #>U( + VelocityWarp64 left, + VelocityWarp64 right) => + new VelocityWarp64( + Vector.<#= op #>(left.LowerAs(), right.LowerAs()), + Vector.<#= op #>(left.UpperAs(), right.UpperAs())) + <#= postOp #>; + + public static VelocityWarp64 Compare<#= kind #>F( + VelocityWarp64 left, + VelocityWarp64 right) => + new VelocityWarp64( + Vector.<#= op #>(left.LowerAs(), right.LowerAs()), + Vector.<#= op #>(left.UpperAs(), right.UpperAs())) + <#= postOp #>; + +<# } #> + + #endregion + + #region Convert Operations + +<# foreach (var sourceType in warp64ConvTypes) { #> +<# foreach (var targetType in warp64ConvTypes) { #> +<# string suffix = targetType.Kind == TypeInformationKind.SignedInt + ? "I" : targetType.Kind == TypeInformationKind.UnsignedInt ? 
"U" : "F"; #> +<# foreach (int multiplier in convTypeMultipliers) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 Convert<#= sourceType.Name #>To<#= targetType.Name #>_<#= multiplier #>( + this VelocityWarp64 value) + { +<# if (sourceType.GetBasicValueType() == targetType.GetBasicValueType()) { #> + return value; +<# } else { #> + var data = new Data.Warp64_<#= sourceType.Name #>_<#= multiplier #>(value); + var targetData = data.ConvertTo<#= targetType.Name #>(); + return targetData.ToWarp<#= suffix #>(); +<# } #> + } + +<# } #> +<# } #> +<# } #> + +<# foreach (var (_, prefix, _, implTypeName) in implementationTypes64) { #> +<# foreach (var (_, targetPrefix, _, targetImplTypeName) in implementationTypes64) { #> +<# if (prefix == targetPrefix) continue; #> +<# foreach (int multiplier in convTypeMultipliers) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 Convert<#= prefix #>To<#= targetPrefix #>_<#= multiplier #>(this VelocityWarp64 value) + { + var data = new Data.Warp64_<#= implTypeName #>_<#= multiplier #>(value); + var targetData = data.ConvertTo<#= targetImplTypeName #>(); + return targetData.ToWarp<#= targetPrefix == "F" ? "I" : targetPrefix #>(); + } + +<# } #> +<# } #> +<# } #> + + #endregion + + #region Load & Store Operations + +<# foreach (var type in warp32IOTypes) { #> +<# foreach (int multiplier in convTypeMultipliers) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp32 Load32_<#= type.Name #>_<#= multiplier #>( + this VelocityWarp64 source, + VelocityLaneMask mask) + { + var data = new Data.Warp32_<#= type.Name #>_<#= multiplier #>( +<# for (int i = 0; i < multiplier; ++i) { #> + mask.IsActive(<#= i #>) + ? source.LoadFromPtr<<#= type.Type #>>(<#= i #>) + : default<#= i + 1 < multiplier ? "," : string.Empty #> +<# } #> + ); + return data.<#= type.IsSignedInt ? 
"ToWarpI" : "ToWarpU" #>(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Store32_<#= type.Name #>_<#= multiplier #>( + this VelocityWarp32 value, + VelocityWarp64 target, + VelocityLaneMask mask) + { + var data = new Data.Warp32_<#= type.Name #>_<#= multiplier #>(value); +<# for (int i = 0; i < multiplier; ++i) { #> + if (mask.IsActive(<#= i #>)) + target.StoreToPtr(<#= i #>, data.Field<#= i #>); +<# } #> + } + +<# } #> +<# }#> + +<# foreach (var type in warp64IOTypes) { #> +<# foreach (int multiplier in convTypeMultipliers) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VelocityWarp64 Load64_<#= type.Name #>_<#= multiplier #>( + this VelocityWarp64 source, + VelocityLaneMask mask) + { + var data = new Data.Warp64_<#= type.Name #>_<#= multiplier #>( +<# for (int i = 0; i < multiplier; ++i) { #> + mask.IsActive(<#= i #>) + ? source.LoadFromPtr<<#= type.Type #>>(<#= i #>) + : default<#= i + 1 < multiplier ? "," : string.Empty #> +<# } #> + ); + return data.<#= type.IsSignedInt ? 
"ToWarpI" : "ToWarpU" #>(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Store64_<#= type.Name #>_<#= multiplier #>( + this VelocityWarp64 value, + VelocityWarp64 target, + VelocityLaneMask mask) + { + var data = new Data.Warp64_<#= type.Name #>_<#= multiplier #>(value); +<# for (int i = 0; i < multiplier; ++i) { #> + if (mask.IsActive(<#= i #>)) + target.StoreToPtr(<#= i #>, data.Field<#= i #>); +<# } #> + } + +<# } #> +<# }#> + + #endregion + } + +<# foreach (var (warpName, _, implTypeName, types) in atomicTypes) { #> + partial class <#= implTypeName #> + { + public static <#= warpName #> AtomicCompareExchange( + VelocityWarp64 target, + <#= warpName #> compare, + <#= warpName #> value, + VelocityLaneMask mask) => + value.AtomicCompareExchange(target, compare, mask); + +<# foreach (var atomicType in types) { #> +<# string suffix = atomicType.Name[0].ToString(); #> + private readonly struct AtomicExchange<#= suffix #>Impl : + <#= warpName #>.IAtomicOperation<<#= atomicType.Type #>> + { + public <#= atomicType.Type #> Atomic(ref <#= atomicType.Type #> target, <#= atomicType.Type #> value) => + ILGPU.Atomic.Exchange(ref target, value); + } + + public static <#= warpName #> AtomicExchange<#= suffix #>( + VelocityWarp64 target, + <#= warpName #> value, + VelocityLaneMask mask) => + value.Atomic<<#= atomicType.Type #>, AtomicExchange<#= suffix #>Impl>(target, mask); + + private readonly struct AtomicAdd<#= suffix #>Impl : + <#= warpName #>.IAtomicOperation<<#= atomicType.Type #>> + { + public <#= atomicType.Type #> Atomic(ref <#= atomicType.Type #> target, <#= atomicType.Type #> value) => + ILGPU.Atomic.Add(ref target, value); + } + + public static <#= warpName #> AtomicAdd<#= suffix #>( + VelocityWarp64 target, + <#= warpName #> value, + VelocityLaneMask mask) => + value.Atomic<<#= atomicType.Type #>, AtomicAdd<#= suffix #>Impl>(target, mask); + + private readonly struct AtomicMax<#= suffix #>Impl : + <#= warpName 
#>.IAtomicOperation<<#= atomicType.Type #>> + { + public <#= atomicType.Type #> Atomic(ref <#= atomicType.Type #> target, <#= atomicType.Type #> value) => + ILGPU.Atomic.Max(ref target, value); + } + + public static <#= warpName #> AtomicMax<#= suffix #>( + VelocityWarp64 target, + <#= warpName #> value, + VelocityLaneMask mask) => + value.Atomic<<#= atomicType.Type #>, AtomicMax<#= suffix #>Impl>(target, mask); + + private readonly struct AtomicMin<#= suffix #>Impl : + <#= warpName #>.IAtomicOperation<<#= atomicType.Type #>> + { + public <#= atomicType.Type #> Atomic(ref <#= atomicType.Type #> target, <#= atomicType.Type #> value) => + ILGPU.Atomic.Min(ref target, value); + } + + public static <#= warpName #> AtomicMin<#= suffix #>( + VelocityWarp64 target, + <#= warpName #> value, + VelocityLaneMask mask) => + value.Atomic<<#= atomicType.Type #>, AtomicMin<#= suffix #>Impl>(target, mask); + +<# if (atomicType.IsInt) { #> + private readonly struct AtomicAnd<#= suffix #>Impl : + <#= warpName #>.IAtomicOperation<<#= atomicType.Type #>> + { + public <#= atomicType.Type #> Atomic(ref <#= atomicType.Type #> target, <#= atomicType.Type #> value) => + ILGPU.Atomic.And(ref target, value); + } + + public static <#= warpName #> AtomicAnd<#= suffix #>( + VelocityWarp64 target, + <#= warpName #> value, + VelocityLaneMask mask) => + value.Atomic<<#= atomicType.Type #>, AtomicAnd<#= suffix #>Impl>(target, mask); + + private readonly struct AtomicOr<#= suffix #>Impl : + <#= warpName #>.IAtomicOperation<<#= atomicType.Type #>> + { + public <#= atomicType.Type #> Atomic(ref <#= atomicType.Type #> target, <#= atomicType.Type #> value) => + ILGPU.Atomic.Or(ref target, value); + } + + public static <#= warpName #> AtomicOr<#= suffix #>( + VelocityWarp64 target, + <#= warpName #> value, + VelocityLaneMask mask) => + value.Atomic<<#= atomicType.Type #>, AtomicOr<#= suffix #>Impl>(target, mask); + + private readonly struct AtomicXor<#= suffix #>Impl : + <#= warpName 
#>.IAtomicOperation<<#= atomicType.Type #>> + { + public <#= atomicType.Type #> Atomic(ref <#= atomicType.Type #> target, <#= atomicType.Type #> value) => + ILGPU.Atomic.Xor(ref target, value); + } + + public static <#= warpName #> AtomicXor<#= suffix #>( + VelocityWarp64 target, + <#= warpName #> value, + VelocityLaneMask mask) => + value.Atomic<<#= atomicType.Type #>, AtomicXor<#= suffix #>Impl>(target, mask); + +<# } #> +<# } #> + } +<# } #> + + partial class VelocityOperations + { + #region Misc + + /// + /// The internal binding flags to access all methods and properties. + /// + protected const BindingFlags Flags = + BindingFlags.Public | + BindingFlags.NonPublic | + BindingFlags.Static; + + protected static FieldInfo GetField(Type type, string name) => + type.GetField(name, Flags); + + protected static PropertyInfo GetProperty(Type type, string name) => + type.GetProperty(name, Flags); + + protected static MethodInfo GetMethod(Type type, string name) => + type.GetMethod(name, Flags); + + public VelocityOperations() + { + var operationType32 = typeof(VelocityWarpOperations32); + var operationType64 = typeof(VelocityWarpOperations64); + + InitUnaryOperations(operationType32, operationType64); + InitBinaryOperations(operationType32, operationType64); + InitTernaryOperations(operationType32, operationType64); + + InitCompareOperations(operationType32, operationType64); + InitConvertOperations(operationType32, operationType64); + InitAtomicOperations(operationType32, operationType64); + InitIOOperations(operationType64); + + InitVelocityOperations32(operationType32); + InitVelocityOperations64(operationType64); + InitVelocityLaneMaskEmitter(); + } + + #endregion + + #region Unary Operations + + private readonly Dictionary<(UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> unaryOperations32 = + new Dictionary<(UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>(<#= unaryOps.Length * 3 #>); + private readonly Dictionary<(UnaryArithmeticKind, 
VelocityWarpOperationMode), MethodInfo> unaryOperations64 = + new Dictionary<(UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>(<#= unaryOps.Length * 3 #>); + + public MethodInfo GetUnaryOperation32(UnaryArithmeticKind kind, VelocityWarpOperationMode mode) => + unaryOperations32[(kind, mode)]; + public MethodInfo GetUnaryOperation64(UnaryArithmeticKind kind, VelocityWarpOperationMode mode) => + unaryOperations64[(kind, mode)]; + + private void InitUnaryOperations(Type operationType32, Type operationType64) + { +<# foreach (var op in unaryOps) { #> +<# foreach (var (flags, suffix, _, _) in + implementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? string.Empty + : suffix; #> + unaryOperations32.Add( + (UnaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType32, "<#= op.Name #><#= implSuffix #>")); +<# } #> +<# } #> + +<# foreach (var op in unaryOps) { #> +<# foreach (var (flags, suffix, _, _) in + implementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? 
string.Empty + : suffix; #> + unaryOperations64.Add( + (UnaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType64, "<#= op.Name #><#= implSuffix #>")); +<# } #> +<# } #> + } + + #endregion + + #region Binary Operations + + private readonly Dictionary<(BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> binaryOperations32 = + new Dictionary<(BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>(<#= binaryOps.Length * 3 #>); + private readonly Dictionary<(BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> binaryOperations64 = + new Dictionary<(BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>(<#= binaryOps.Length * 3 #>); + + public MethodInfo GetBinaryOperation32(BinaryArithmeticKind kind, VelocityWarpOperationMode mode) => + binaryOperations32[(kind, mode)]; + public MethodInfo GetBinaryOperation64(BinaryArithmeticKind kind, VelocityWarpOperationMode mode) => + binaryOperations64[(kind, mode)]; + + private void InitBinaryOperations(Type operationType32, Type operationType64) + { +<# foreach (var op in binaryOps) { #> +<# foreach (var (flags, suffix, _, _) in + implementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? string.Empty + : suffix; #> + binaryOperations32.Add( + (BinaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType32, "<#= op.Name #><#= implSuffix #>")); +<# } #> +<# } #> + +<# foreach (var op in binaryOps) { #> +<# foreach (var (flags, suffix, _, _) in + implementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? 
string.Empty + : suffix; #> + binaryOperations64.Add( + (BinaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType64, "<#= op.Name #><#= implSuffix #>")); +<# } #> +<# } #> + } + + #endregion + + #region Ternary Operations + + private readonly Dictionary<(TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> ternaryOperations32 = + new Dictionary<(TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>(<#= ternaryOps.Length * 3 #>); + private readonly Dictionary<(TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> ternaryOperations64 = + new Dictionary<(TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>(<#= ternaryOps.Length * 3 #>); + + public MethodInfo GetTernaryOperation32(TernaryArithmeticKind kind, VelocityWarpOperationMode mode) => + ternaryOperations32[(kind, mode)]; + public MethodInfo GetTernaryOperation64(TernaryArithmeticKind kind, VelocityWarpOperationMode mode) => + ternaryOperations64[(kind, mode)]; + + private void InitTernaryOperations(Type operationType32, Type operationType64) + { +<# foreach (var op in ternaryOps) { #> +<# foreach (var (flags, suffix, _, _) in + implementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? string.Empty + : suffix; #> + ternaryOperations32.Add( + (TernaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType32, "<#= op.Name #><#= implSuffix #>")); +<# } #> +<# } #> + +<# foreach (var op in ternaryOps) { #> +<# foreach (var (flags, suffix, _, _) in + implementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# string implSuffix = op.HasFloats && (flags & MathOpFlags.Floats) != 0 + ? 
string.Empty + : suffix; #> + ternaryOperations64.Add( + (TernaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType64, "<#= op.Name #><#= implSuffix #>")); +<# } #> +<# } #> + } + + #endregion + + #region Compare Operations + + private readonly Dictionary<(CompareKind, VelocityWarpOperationMode), MethodInfo> compareOperations32 = + new Dictionary<(CompareKind, VelocityWarpOperationMode), MethodInfo>(<#= compareOperations.Length * 3 #>); + private readonly Dictionary<(CompareKind, VelocityWarpOperationMode), MethodInfo> compareOperations64 = + new Dictionary<(CompareKind, VelocityWarpOperationMode), MethodInfo>(<#= compareOperations.Length * 3 #>); + + public MethodInfo GetCompareOperation32(CompareKind kind, VelocityWarpOperationMode mode) => + compareOperations32[(kind, mode)]; + public MethodInfo GetCompareOperation64(CompareKind kind, VelocityWarpOperationMode mode) => + compareOperations64[(kind, mode)]; + + private void InitCompareOperations(Type operationType32, Type operationType64) + { +<# foreach (var (kind, op, postOp) in compareOperations) { #> +<# foreach (var (_, suffix, _, _) in implementationTypes32) { #> + compareOperations32.Add( + (CompareKind.<#= kind #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType32, "Compare<#= kind #><#= suffix #>")); +<# } #> +<# foreach (var (_, suffix, _, _) in implementationTypes64) { #> + compareOperations64.Add( + (CompareKind.<#= kind #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType64, "Compare<#= kind #><#= suffix #>")); +<# } #> +<# } #> + } + + #endregion + + #region Convert Operations + + private readonly Dictionary<(ArithmeticBasicValueType, ArithmeticBasicValueType, int), MethodInfo> softwareConvertOperations32 = + new Dictionary<(ArithmeticBasicValueType, ArithmeticBasicValueType, int), MethodInfo>( + <#= (warp32ConvTypes.Length * warp32ConvTypes.Length) * convTypeMultipliers.Length #>); + private readonly 
Dictionary<(ArithmeticBasicValueType, ArithmeticBasicValueType, int), MethodInfo> softwareConvertOperations64 = + new Dictionary<(ArithmeticBasicValueType, ArithmeticBasicValueType, int), MethodInfo>( + <#= (warp64ConvTypes.Length * warp64ConvTypes.Length) * convTypeMultipliers.Length #>); + + private readonly Dictionary<(VelocityWarpOperationMode, VelocityWarpOperationMode), MethodInfo> convertOperations32 = + new Dictionary<(VelocityWarpOperationMode, VelocityWarpOperationMode), MethodInfo>( + <#= (implementationTypes32.Length * implementationTypes32.Length - 1 ) #>); + private readonly Dictionary<(VelocityWarpOperationMode, VelocityWarpOperationMode), MethodInfo> convertOperations64 = + new Dictionary<(VelocityWarpOperationMode, VelocityWarpOperationMode), MethodInfo>( + <#= (implementationTypes64.Length * implementationTypes64.Length - 1) #>); + + public MethodInfo GetSoftwareConvertOperation32( + ArithmeticBasicValueType source, + ArithmeticBasicValueType target, + int multiplier) => + softwareConvertOperations32[(source, target, multiplier)]; + public MethodInfo GetSoftwareConvertOperation64( + ArithmeticBasicValueType source, + ArithmeticBasicValueType target, + int multiplier) => + softwareConvertOperations64[(source, target, multiplier)]; + + public MethodInfo GetAcceleratedConvertOperation32( + VelocityWarpOperationMode source, + VelocityWarpOperationMode target) => + convertOperations32[(source, target)]; + public MethodInfo GetAcceleratedConvertOperation64( + VelocityWarpOperationMode source, + VelocityWarpOperationMode target) => + convertOperations64[(source, target)]; + + private void InitConvertOperations(Type operationType32, Type operationType64) + { +<# foreach (var source in warp32ConvTypes) { #> +<# foreach (var target in warp32ConvTypes) { #> +<# foreach (int multiplier in convTypeMultipliers) { #> + softwareConvertOperations32.Add(( + ArithmeticBasicValueType.<#= source.GetArithmeticBasicValueType() #>, + ArithmeticBasicValueType.<#= 
target.GetArithmeticBasicValueType() #>, + <#= multiplier #>), + GetMethod(operationType32, "Convert<#= source.Name #>To<#= target.Name #>_<#= multiplier #>")); +<# } #> +<# } #> +<# } #> + +<# foreach (var source in warp64ConvTypes) { #> +<# foreach (var target in warp64ConvTypes) { #> +<# foreach (int multiplier in convTypeMultipliers) { #> + softwareConvertOperations64.Add(( + ArithmeticBasicValueType.<#= source.GetArithmeticBasicValueType() #>, + ArithmeticBasicValueType.<#= target.GetArithmeticBasicValueType() #>, + <#= multiplier #>), + GetMethod(operationType64, "Convert<#= source.Name #>To<#= target.Name #>_<#= multiplier #>")); +<# } #> +<# } #> +<# } #> + +<# foreach (var (_, prefix, _, implTypeName) in implementationTypes32) { #> +<# foreach (var (_, targetPrefix, _, targetImplTypeName) in implementationTypes32) { #> + convertOperations32.Add(( + VelocityWarpOperationMode.<#= prefix #>, + VelocityWarpOperationMode.<#= targetPrefix #>), + GetMethod(operationType32, "Convert<#= prefix #>To<#= targetPrefix #>")); +<# } #> +<# } #> + +<# foreach (var (_, prefix, _, implTypeName) in implementationTypes64) { #> +<# foreach (var (_, targetPrefix, _, targetImplTypeName) in implementationTypes64) { #> + convertOperations64.Add(( + VelocityWarpOperationMode.<#= prefix #>, + VelocityWarpOperationMode.<#= targetPrefix #>), + GetMethod(operationType64, "Convert<#= prefix #>To<#= targetPrefix #>")); +<# } #> +<# } #> + } + + #endregion + + #region Load & Store Operations + + private readonly Dictionary<(BasicValueType, int), (MethodInfo Load, MethodInfo Store)> ioOperations32 = + new Dictionary<(BasicValueType, int), (MethodInfo, MethodInfo)>( + <#= convTypeMultipliers.Length * warp32IOTypes.Length #>); + private readonly Dictionary<(BasicValueType, int), (MethodInfo Load, MethodInfo Store)> ioOperations64 = + new Dictionary<(BasicValueType, int), (MethodInfo, MethodInfo)>( + <#= convTypeMultipliers.Length * warp64IOTypes.Length #>); + + public (MethodInfo Load, 
MethodInfo Store) GetIOOperation32( + BasicValueType type, + int multiplier) => + ioOperations32[(type, multiplier)]; + public (MethodInfo Load, MethodInfo Store) GetIOOperation64( + BasicValueType type, + int multiplier) => + ioOperations64[(type, multiplier)]; + + private void InitIOOperations(Type operationType64) + { +<# foreach (int multiplier in convTypeMultipliers) { #> +<# foreach (var type in warp32IOTypes) { #> + ioOperations32.Add( + (BasicValueType.<#= type.GetBasicValueType() #>, <#= multiplier #>), + (GetMethod(operationType64, "Load32_<#= type.Name #>_<#= multiplier #>"), + GetMethod(operationType64, "Store32_<#= type.Name #>_<#= multiplier #>"))); +<# } #> + +<# foreach (var type in warp64IOTypes) { #> + ioOperations64.Add( + (BasicValueType.<#= type.GetBasicValueType() #>, <#= multiplier #>), + (GetMethod(operationType64, "Load64_<#= type.Name #>_<#= multiplier #>"), + GetMethod(operationType64, "Store64_<#= type.Name #>_<#= multiplier #>"))); +<# } #> +<# }#> + } + + #endregion + + #region Atomic Operations + + private readonly MethodInfo[] atomicCompareExchangeOperations = new MethodInfo[] + { + GetMethod(typeof(VelocityWarpOperations32), "AtomicCompareExchange"), + GetMethod(typeof(VelocityWarpOperations64), "AtomicCompareExchange") + }; + + private readonly Dictionary<(AtomicKind, VelocityWarpOperationMode), MethodInfo> atomicOperations32 = + new Dictionary<(AtomicKind, VelocityWarpOperationMode), MethodInfo>( + <#= atomicTypes.Length * 7 #>); + private readonly Dictionary<(AtomicKind, VelocityWarpOperationMode), MethodInfo> atomicOperations64 = + new Dictionary<(AtomicKind, VelocityWarpOperationMode), MethodInfo>( + <#= atomicTypes.Length * 7 #>); + + public MethodInfo AtomicCompareExchangeOperation32 => + atomicCompareExchangeOperations[0]; + public MethodInfo AtomicCompareExchangeOperation64 => + atomicCompareExchangeOperations[1]; + + public MethodInfo GetAtomicOperation32( + AtomicKind kind, + VelocityWarpOperationMode mode) => + 
atomicOperations32[(kind, mode)]; + public MethodInfo GetAtomicOperation64( + AtomicKind kind, + VelocityWarpOperationMode mode) => + atomicOperations64[(kind, mode)]; + + private void InitAtomicOperations(Type operationType32, Type operationType64) + { +<# foreach (var (warpName, bitWidth, implTypeName, types) in atomicTypes) { #> +<# foreach (var atomicType in types) { #> +<# string suffix = atomicType.Name[0].ToString(); #> +<# var operations = new string[] { "Exchange", "Add", "Max", "Min" }; #> +<# var intOperations = new string[] { "And", "Or", "Xor" }; #> +<# var operationsToEmit = atomicType.IsInt ? operations.Concat(intOperations) : operations; #> +<# foreach (var operation in operationsToEmit) { #> + atomicOperations<#= bitWidth #>.Add( + (AtomicKind.<#= operation #>, VelocityWarpOperationMode.<#= suffix #>), + GetMethod(operationType<#= bitWidth #>, "Atomic<#= operation #><#= suffix #>")); +<# } #> +<# } #> +<# } #> + } + + #endregion + } + + namespace Data + { +<# foreach (var type in warp32ConvTypes) { #> +<# foreach (int multiplier in convTypeMultipliers) { #> +<# string suffix = type.IsFloat16 ? ".RawValue" : ""; #> +<# string getElement = type.IsIntOrFloat16 ? "warp[{0}]" : "warp.GetElementF({0})"; #> +<# string setElementU = type.IsIntOrFloat16 + ? "(uint)Field{0}" + suffix + : "Interop.FloatAsInt((float)Field{0})"; #> +<# string setElementI = type.IsIntOrFloat16 + ? "(int)Field{0}" + suffix + : "(int)Interop.FloatAsInt((float)Field{0})"; #> +<# string setElementF = type.IsInt? 
"(float)Field{0}" : "Field{0}" + suffix; #> + readonly struct Warp32_<#= type.Name #>_<#= multiplier #> + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Warp32_<#= type.Name #>_<#= multiplier #>(VelocityWarp32 warp) + { +<# for (int i = 0; i < multiplier; ++i) { #> +<# if (type.IsFloat16) { #> + Field<#= i #> = new <#= type.Type #>((ushort)<#= string.Format(getElement, i) #>); +<# } else { #> + Field<#= i #> = (<#= type.Type #>)<#= string.Format(getElement, i) #>; +<# } #> +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Warp32_<#= type.Name #>_<#= multiplier #>( +<# for (int i = 0; i < multiplier; ++i) { #> + <#= type.Type #> field<#= i #><#= i + 1 < multiplier ? "," : string.Empty #> +<# } #> + ) + { +<# for (int i = 0; i < multiplier; ++i) { #> + Field<#= i #> = field<#= i #>; +<# } #> + } + +<# for (int i = 0; i < multiplier; ++i) { #> + public readonly <#= type.Type #> Field<#= i #>; +<# } #> + +<# foreach (var targetType in warp32ConvTypes) { #> +<# if (type == targetType) continue; #> + public Warp32_<#= targetType.Name #>_<#= multiplier #> ConvertTo<#= targetType.Name #>() => + new Warp32_<#= targetType.Name #>_<#= multiplier #>( +<# for (int i = 0; i < multiplier; ++i) { #> + (<#= targetType.Type #>)Field<#= i #><#= i + 1 < multiplier ? 
"," : string.Empty #> +<# } #> + ); + +<# } #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ToWarpU() + { +<# if (multiplier > 4) { #> + if (VelocityWarp32.IsVector128) +<# } #> + { + return Vector128.Create( + <#= string.Format(setElementU, 0) #>, + <#= string.Format(setElementU, 1) #>, + <#= string.Format(setElementU, 2) #>, + <#= string.Format(setElementU, 3) #>) + .AsVector(); + } + +<# if (multiplier > 4) { #> +<# if (multiplier > 8) { #> + if (VelocityWarp32.IsVector256) +<# } #> + { + return Vector256.Create( + <#= string.Format(setElementU, 0) #>, + <#= string.Format(setElementU, 1) #>, + <#= string.Format(setElementU, 2) #>, + <#= string.Format(setElementU, 3) #>, + <#= string.Format(setElementU, 4) #>, + <#= string.Format(setElementU, 5) #>, + <#= string.Format(setElementU, 6) #>, + <#= string.Format(setElementU, 7) #>) + .AsVector(); + } +<# } #> + +<# if (multiplier > 8) { #> + Span results = stackalloc uint[<#= multiplier #>]; +<# for (int i = 0; i < 4; ++i) { #> + results[<#= i #>] = <#= string.Format(setElementU, i) #>; +<# } #> + return new Vector(results); +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ToWarpI() + { +<# if (multiplier > 4) { #> + if (VelocityWarp32.IsVector128) +<# } #> + { + return Vector128.Create( + <#= string.Format(setElementI, 0) #>, + <#= string.Format(setElementI, 1) #>, + <#= string.Format(setElementI, 2) #>, + <#= string.Format(setElementI, 3) #>) + .AsVector(); + } + +<# if (multiplier > 4) { #> +<# if (multiplier > 8) { #> + if (VelocityWarp32.IsVector256) +<# } #> + { + return Vector256.Create( + <#= string.Format(setElementI, 0) #>, + <#= string.Format(setElementI, 1) #>, + <#= string.Format(setElementI, 2) #>, + <#= string.Format(setElementI, 3) #>, + <#= string.Format(setElementI, 4) #>, + <#= string.Format(setElementI, 5) #>, + <#= string.Format(setElementI, 6) #>, + <#= string.Format(setElementI, 7) #>) + .AsVector(); + } +<# } #> + +<# 
if (multiplier > 8) { #> + Span results = stackalloc int[<#= multiplier #>]; +<# for (int i = 0; i < 4; ++i) { #> + results[<#= i #>] = <#= string.Format(setElementI, i) #>; +<# } #> + return new Vector(results); +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp32 ToWarpF() + { +<# if (multiplier > 4) { #> + if (VelocityWarp32.IsVector128) +<# } #> + { + return Vector128.Create( + <#= string.Format(setElementF, 0) #>, + <#= string.Format(setElementF, 1) #>, + <#= string.Format(setElementF, 2) #>, + <#= string.Format(setElementF, 3) #>) + .AsVector(); + } + +<# if (multiplier > 4) { #> +<# if (multiplier > 8) { #> + if (VelocityWarp32.IsVector256) +<# } #> + { + return Vector256.Create( + <#= string.Format(setElementF, 0) #>, + <#= string.Format(setElementF, 1) #>, + <#= string.Format(setElementF, 2) #>, + <#= string.Format(setElementF, 3) #>, + <#= string.Format(setElementF, 4) #>, + <#= string.Format(setElementF, 5) #>, + <#= string.Format(setElementF, 6) #>, + <#= string.Format(setElementF, 7) #>) + .AsVector(); + } +<# } #> + +<# if (multiplier > 8) { #> + Span results = stackalloc float[<#= multiplier #>]; +<# for (int i = 0; i < 4; ++i) { #> + results[<#= i #>] = <#= string.Format(setElementF, i) #>; +<# } #> + return new Vector(results); +<# } #> + } + } +<# } #> +<# }#> + +<# foreach (var type in warp64ConvTypes) { #> +<# foreach (int multiplier in convTypeMultipliers) { #> +<# string getElement = type.IsInt ? "warp[{0}]" : "warp.GetElementF({0})"; #> +<# string setElementU = type.IsInt ? "(ulong)Field{0}" : "Interop.FloatAsInt(Field{0})"; #> +<# string setElementI = type.IsInt ? "(long)Field{0}" : "(long)Interop.FloatAsInt(Field{0})"; #> +<# string setElementF = type.IsInt ? 
"(double)Field{0}" : "Field{0}"; #> + readonly struct Warp64_<#= type.Name #>_<#= multiplier #> + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Warp64_<#= type.Name #>_<#= multiplier #>(VelocityWarp64 warp) + { +<# for (int i = 0; i < multiplier; ++i) { #> + Field<#= i #> = (<#= type.Type #>)<#= string.Format(getElement, i) #>; +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Warp64_<#= type.Name #>_<#= multiplier #>( +<# for (int i = 0; i < multiplier; ++i) { #> + <#= type.Type #> field<#= i #><#= i + 1 < multiplier ? "," : string.Empty #> +<# } #> + ) + { +<# for (int i = 0; i < multiplier; ++i) { #> + Field<#= i #> = field<#= i #>; +<# } #> + } + +<# for (int i = 0; i < multiplier; ++i) { #> + public readonly <#= type.Type #> Field<#= i #>; +<# } #> + +<# foreach (var targetType in warp64ConvTypes) { #> +<# if (type == targetType) continue; #> + public Warp64_<#= targetType.Name #>_<#= multiplier #> ConvertTo<#= targetType.Name #>() => + new Warp64_<#= targetType.Name #>_<#= multiplier #>( +<# for (int i = 0; i < multiplier; ++i) { #> + (<#= targetType.Type #>)Field<#= i #><#= i + 1 < multiplier ? 
"," : string.Empty #> +<# } #> + ); + +<# } #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 ToWarpU() + { +<# if (multiplier > 4) { #> + if (VelocityWarp64.IsVector128) +<# } #> + { + return new VelocityWarp64( + Vector128.Create( + <#= string.Format(setElementU, 0) #>, + <#= string.Format(setElementU, 1) #>) + .AsVector(), + Vector128.Create( + <#= string.Format(setElementU, 2) #>, + <#= string.Format(setElementU, 3) #>) + .AsVector()); + } + +<# if (multiplier > 4) { #> +<# if (multiplier > 8) { #> + if (VelocityWarp32.IsVector256) +<# } #> + { + return new VelocityWarp64( + Vector256.Create( + <#= string.Format(setElementU, 0) #>, + <#= string.Format(setElementU, 1) #>, + <#= string.Format(setElementU, 2) #>, + <#= string.Format(setElementU, 3) #>) + .AsVector(), + Vector256.Create( + <#= string.Format(setElementU, 4) #>, + <#= string.Format(setElementU, 5) #>, + <#= string.Format(setElementU, 6) #>, + <#= string.Format(setElementU, 7) #>) + .AsVector()); + } +<# } #> + +<# if (multiplier > 8) { #> + Span results = stackalloc ulong[<#= multiplier #>]; +<# for (int i = 0; i < 4; ++i) { #> + results[<#= i #>] = <#= string.Format(setElementU, i) #>; +<# } #> + return new VelocityWarp64( + new Vector(results[0..<#= multiplier / 2 #>]), + new Vector(results[<#= multiplier / 2 #>..<#= multiplier / 2 #>])); +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 ToWarpI() + { +<# if (multiplier > 4) { #> + if (VelocityWarp64.IsVector128) +<# } #> + { + return new VelocityWarp64( + Vector128.Create( + <#= string.Format(setElementI, 0) #>, + <#= string.Format(setElementI, 1) #>) + .AsVector(), + Vector128.Create( + <#= string.Format(setElementI, 2) #>, + <#= string.Format(setElementI, 3) #>) + .AsVector()); + } + +<# if (multiplier > 4) { #> +<# if (multiplier > 8) { #> + if (VelocityWarp32.IsVector256) +<# } #> + { + return new VelocityWarp64( + Vector256.Create( + <#= string.Format(setElementI, 0) #>, 
+ <#= string.Format(setElementI, 1) #>, + <#= string.Format(setElementI, 2) #>, + <#= string.Format(setElementI, 3) #>) + .AsVector(), + Vector256.Create( + <#= string.Format(setElementI, 4) #>, + <#= string.Format(setElementI, 5) #>, + <#= string.Format(setElementI, 6) #>, + <#= string.Format(setElementI, 7) #>) + .AsVector()); + } +<# } #> + +<# if (multiplier > 8) { #> + Span results = stackalloc long[<#= multiplier #>]; +<# for (int i = 0; i < 4; ++i) { #> + results[<#= i #>] = <#= string.Format(setElementI, i) #>; +<# } #> + return new VelocityWarp64( + new Vector(results[0..<#= multiplier / 2 #>]), + new Vector(results[<#= multiplier / 2 #>..<#= multiplier / 2 #>])); +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VelocityWarp64 ToWarpF() + { +<# if (multiplier > 4) { #> + if (VelocityWarp64.IsVector128) +<# } #> + { + return new VelocityWarp64( + Vector128.Create( + <#= string.Format(setElementF, 0) #>, + <#= string.Format(setElementF, 1) #>) + .AsVector(), + Vector128.Create( + <#= string.Format(setElementF, 2) #>, + <#= string.Format(setElementF, 3) #>) + .AsVector()); + } + +<# if (multiplier > 4) { #> +<# if (multiplier > 8) { #> + if (VelocityWarp32.IsVector256) +<# } #> + { + return new VelocityWarp64( + Vector256.Create( + <#= string.Format(setElementF, 0) #>, + <#= string.Format(setElementF, 1) #>, + <#= string.Format(setElementF, 2) #>, + <#= string.Format(setElementF, 3) #>) + .AsVector(), + Vector256.Create( + <#= string.Format(setElementF, 4) #>, + <#= string.Format(setElementF, 5) #>, + <#= string.Format(setElementF, 6) #>, + <#= string.Format(setElementF, 7) #>) + .AsVector()); + } +<# } #> + +<# if (multiplier > 8) { #> + Span results = stackalloc double[<#= multiplier #>]; +<# for (int i = 0; i < 4; ++i) { #> + results[<#= i #>] = <#= string.Format(setElementF, i) #>; +<# } #> + return new VelocityWarp64( + new Vector(results[0..<#= multiplier / 2 #>]), + new Vector(results[<#= multiplier / 2 #>..<#= multiplier / 
2 #>])); +<# } #> + } + } +<# } #> +<# }#> + } +} \ No newline at end of file diff --git a/Src/ILGPU/RuntimeSystem.cs b/Src/ILGPU/RuntimeSystem.cs index 8b951c2da..6321e4ab7 100644 --- a/Src/ILGPU/RuntimeSystem.cs +++ b/Src/ILGPU/RuntimeSystem.cs @@ -110,7 +110,7 @@ public MethodEmitter( /// /// Returns the associated method builder. /// - private DynamicMethod Method { get; } + internal DynamicMethod Method { get; } /// /// Returns the internal IL generator. diff --git a/Src/ILGPU/Static/BinaryMathOperations.xml b/Src/ILGPU/Static/BinaryMathOperations.xml index 37b2b0fc0..533b06d7e 100644 --- a/Src/ILGPU/Static/BinaryMathOperations.xml +++ b/Src/ILGPU/Static/BinaryMathOperations.xml @@ -12,6 +12,7 @@ Add + {Value0}[{Type}] + {Value1}[{Type}] The - operation. @@ -38,6 +39,7 @@ Add + {Value0}[{Type}] - {Value1}[{Type}] The * operation. @@ -68,6 +70,7 @@ Mul + {Value0}[{Type}] * {Value1}[{Type}] The / operation. @@ -104,6 +107,7 @@ BinaryArithmeticKind.Shr) + {Value0}[{Type}] / {Value1}[{Type}] The % operation. @@ -113,6 +117,7 @@ {Value0} % {Value1} + ComputeRem{T}({Value0}, {Value1}) @@ -134,6 +139,7 @@ {Value0} + {Value0}.And({Value1}) The logical or operation. @@ -154,11 +160,13 @@ {Value1} + {Value0}.Or({Value1}) The logical xor operation. BoolsAndInts {Value0} ^ {Value1} + {Value0}.Xor({Value1}) The shift left operation. @@ -172,6 +180,7 @@ {Value1}.IsZero {Value0} + The shift right operation. @@ -185,6 +194,7 @@ {Value1}.IsZero {Value0} + @@ -196,6 +206,7 @@ Min + Vector.Min({Value0}[{Type}], {Value1}[{Type}]) The max operation. @@ -206,6 +217,7 @@ Max + Vector.Max({Value0}[{Type}], {Value1}[{Type}]) @@ -213,23 +225,27 @@ Floats IntrinsicMath.CPUOnly.Atan2 {MathType}.Atan2({Value0}, {Value1}) + The pow operation. Floats IntrinsicMath.CPUOnly.Pow {MathType}.Pow({Value0}, {Value1}) + The binary log operation. Floats IntrinsicMath.CPUOnly.Log {MathType}.Log({Value0}, {Value1}) + The copy sign operation. 
Floats IntrinsicMath.CopySign + diff --git a/Src/ILGPU/Static/TernaryMathOperations.xml b/Src/ILGPU/Static/TernaryMathOperations.xml index 9079425c8..db322f530 100644 --- a/Src/ILGPU/Static/TernaryMathOperations.xml +++ b/Src/ILGPU/Static/TernaryMathOperations.xml @@ -4,5 +4,6 @@ The FMA operation. IntsAndFloats IntrinsicMath.CPUOnly.FMA + {Value0}.MultiplyAdd{T}({Value1}, {Value2}) diff --git a/Src/ILGPU/Static/TypeInformation.ttinclude b/Src/ILGPU/Static/TypeInformation.ttinclude index b904e4d79..2ada0ef64 100644 --- a/Src/ILGPU/Static/TypeInformation.ttinclude +++ b/Src/ILGPU/Static/TypeInformation.ttinclude @@ -27,6 +27,27 @@ public enum TypeInformationKind public class TypeInformation { + private static readonly Dictionary BasicValueTypeMapping = + new Dictionary() + { + { "UInt8", "Int8" }, + { "UInt16", "Int16" }, + { "UInt32", "Int32" }, + { "UInt64", "Int64" }, + + { "Half", "Float16" }, + { "Float", "Float32" }, + { "Double", "Float64" }, + }; + + private static readonly Dictionary ArithmeticBasicValueTypeMapping = + new Dictionary() + { + { "Half", "Float16" }, + { "Float", "Float32" }, + { "Double", "Float64" }, + }; + public TypeInformation( string name, string type, @@ -42,31 +63,35 @@ public class TypeInformation } public string Name { get; } - public string Type { get; } - public TypeInformationKind Kind { get; } public string Prefix { get; } - public string Suffix { get; } public bool IsInt => IsSignedInt || IsUnsignedInt; - public bool IsSignedInt => Kind == TypeInformationKind.SignedInt; - public bool IsUnsignedInt => Kind == TypeInformationKind.UnsignedInt; - public bool IsFloat => Kind == TypeInformationKind.Float; - + public bool IsFloat16 => Name == "Half"; + public bool IsIntOrFloat16 => IsInt || IsFloat16; public bool IsCLSCompliant => !(IsUnsignedInt || Name == "Int8"); - public string SizeOfType => Name == "Half" ? "sizeof(ushort)" : $"sizeof({Type})"; - - public string DefaultValue => Name == "Half" ? 
"Half.Zero" : "0"; + public string SizeOfType => IsFloat16 ? "sizeof(ushort)" : $"sizeof({Type})"; + public string DefaultValue => IsFloat16 ? "Half.Zero" : "0"; public string FormatNumber(string number) => Prefix + "(" + number + Suffix + ")"; + + public string GetBasicValueType() => + BasicValueTypeMapping.TryGetValue(Name, out string mappedType) + ? mappedType + : Name; + + public string GetArithmeticBasicValueType() => + ArithmeticBasicValueTypeMapping.TryGetValue(Name, out string mappedType) + ? mappedType + : Name; }; public static readonly TypeInformation[] SignedIntTypes = @@ -115,6 +140,20 @@ public static readonly TypeInformation[] AtomicFloatTypes = public static readonly TypeInformation[] AtomicNumericTypes = AtomicIntTypes.Concat(AtomicFloatTypes).ToArray(); +public static readonly TypeInformation[] AtomicNumericTypes32 = new TypeInformation[] +{ + AtomicIntTypes[0], + AtomicIntTypes[2], + AtomicFloatTypes[0] +}; + +public static readonly TypeInformation[] AtomicNumericTypes64 = new TypeInformation[] +{ + AtomicIntTypes[1], + AtomicIntTypes[3], + AtomicFloatTypes[1] +}; + // Index types public class IndexDimensionDefinition @@ -279,6 +318,39 @@ public class MathOpRewriter MakeExpr(Target, values); } +public class VelocityMathConfig +{ + [XmlText] + public string Implementation { get; set; } + + [XmlAttribute] + public bool SoftwareEmulation { get; set; } + + [XmlAttribute] + public bool ReturnAsWarp32 { get; set; } + + [XmlIgnore] + public bool RequiresSpecialization => + !string.IsNullOrWhiteSpace(Implementation) && + Implementation.Contains("["); + + public string GetImplementation( + string lowerAsExpression, + string suffix, + string typeName, + params string[] variables) + { + var result = new StringBuilder(Implementation); + result.Replace("[", $".{lowerAsExpression}<"); + result.Replace("]", ">()"); + result.Replace("{T}", suffix); + result.Replace("{Type}", typeName); + for (int i = 0; i < variables.Length; ++i) + 
result.Replace($"{{Value{i}}}", variables[i]); + return result.ToString(); + } +} + public class MathOp { #region Data @@ -342,7 +414,19 @@ public class MathOp { if (!HasCall) throw new InvalidOperationException(); - return Call.Split('.').Last(); + int index = Call.LastIndexOf('.'); + return index < 0 ? Call : Call.Substring(index + 1); + } + } + + [XmlIgnore] + public string MethodTypeName + { + get + { + if (!HasCall) + throw new InvalidOperationException(); + return Call.Substring(0, Call.LastIndexOf('.')); } } @@ -351,10 +435,25 @@ public class MathOp public bool HasRewriters => (Rewriters?.Length ?? 0) > 0; + [XmlElement("Velocity32")] + public VelocityMathConfig Velocity32 { get; set; } + + [XmlElement("Velocity64")] + public VelocityMathConfig Velocity64 { get; set; } + + [XmlElement("Velocity")] + public VelocityMathConfig Velocity { get; set; } + #endregion #region Methods + public VelocityMathConfig GetVelocity32() => + Velocity32 != null ? Velocity32 : Velocity; + + public VelocityMathConfig GetVelocity64() => + Velocity64 != null ? Velocity64 : Velocity; + public IEnumerable GetRewriters(int valueDependency) { if (Rewriters == null) diff --git a/Src/ILGPU/Static/UnaryMathOperations.xml b/Src/ILGPU/Static/UnaryMathOperations.xml index 3f18fb6e0..27ee2d11b 100644 --- a/Src/ILGPU/Static/UnaryMathOperations.xml +++ b/Src/ILGPU/Static/UnaryMathOperations.xml @@ -13,6 +13,7 @@ UnaryArithmeticKind.Not) + {Value0}.Neg{T}() The logical not operation. @@ -36,6 +37,7 @@ true InvertCompareValue({Location}, {Value1}) + {Value0}.Not() The abs operation. @@ -47,27 +49,34 @@ {Value0} + {Value0}.Abs{T}() The popcount operation. Ints IntrinsicMath.BitOperations.PopCount + {Value0}.PopC() + The CLZ operation. Ints IntrinsicMath.BitOperations.LeadingZeroCount + The CTZ operation. Ints IntrinsicMath.BitOperations.TrailingZeroCount + The reciprocal operation. 
Floats IntrinsicMath.CPUOnly.Rcp {Const1} / {Value0} + {Value0}.RcpF() + @@ -75,18 +84,24 @@ Floats IntrinsicMath.CPUOnly.IsNaN {TypeName}.IsNaN({Value0}) + {Value0}.IsNanF() + {Value0}.IsNanF32() The is-infinity operation. Floats IntrinsicMath.CPUOnly.IsInfinity {TypeName}.IsInfinity({Value0}) + + The is-finite operation. Floats IntrinsicMath.CPUOnly.IsFinite !IsNaN({Value0}) && !IsInfinity({Value0}) + + @@ -94,12 +109,14 @@ Floats IntrinsicMath.CPUOnly.Sqrt {MathType}.Sqrt({Value0}) + {Value0}.SqrtF() Computes 1/sqrt(value). Floats IntrinsicMath.CPUOnly.Rsqrt Rcp(Sqrt({Value0})) + {Value0}.RsqrtF() @@ -107,18 +124,21 @@ Floats IntrinsicMath.CPUOnly.Asin {MathType}.Asin({Value0}) + Computes sin(x). Floats IntrinsicMath.CPUOnly.Sin {MathType}.Sin({Value0}) + Computes sinh(x). Floats IntrinsicMath.CPUOnly.Sinh {MathType}.Sinh({Value0}) + @@ -126,18 +146,21 @@ Floats IntrinsicMath.CPUOnly.Acos {MathType}.Acos({Value0}) + Computes cos(x). Floats IntrinsicMath.CPUOnly.Cos {MathType}.Cos({Value0}) + Computes cosh(x). Floats IntrinsicMath.CPUOnly.Cosh {MathType}.Cosh({Value0}) + @@ -145,18 +168,21 @@ Floats IntrinsicMath.CPUOnly.Tan {MathType}.Tan({Value0}) + Computes tanh(x). Floats IntrinsicMath.CPUOnly.Tanh {MathType}.Tanh({Value0}) + Computes atan(x). Floats IntrinsicMath.CPUOnly.Atan {MathType}.Atan({Value0}) + @@ -164,12 +190,14 @@ Floats IntrinsicMath.CPUOnly.Exp {MathType}.Exp({Value0}) + Computes 2^x. Floats IntrinsicMath.CPUOnly.Exp2 {MathType}.Pow({Const2}, {Value0}) + @@ -177,12 +205,14 @@ Floats IntrinsicMath.CPUOnly.Floor {MathType}.Floor({Value0}) + Computes ceil(x). Floats IntrinsicMath.CPUOnly.Ceiling {MathType}.Ceiling({Value0}) + @@ -190,17 +220,20 @@ Floats IntrinsicMath.CPUOnly.Log {MathType}.Log({Value0}) + Computes log(x) to base 2. Floats IntrinsicMath.CPUOnly.Log2 {MathType}.Log({Value0}, {Const2}) + Computes log(x) to base 10. 
Floats IntrinsicMath.CPUOnly.Log10 {MathType}.Log10({Value0}) + diff --git a/Src/ILGPU/Util/TypeExtensions.cs b/Src/ILGPU/Util/TypeExtensions.cs index 59f2fa6aa..988688bd6 100644 --- a/Src/ILGPU/Util/TypeExtensions.cs +++ b/Src/ILGPU/Util/TypeExtensions.cs @@ -248,6 +248,29 @@ public static Type GetManagedType(this BasicValueType type) => _ => null, }; + /// + /// Resolves the managed type for the given basic-value type. + /// + /// The source type. + /// The resolved managed type. + public static Type GetManagedType(this ArithmeticBasicValueType type) => + type switch + { + ArithmeticBasicValueType.UInt1 => typeof(bool), + ArithmeticBasicValueType.Int8 => typeof(byte), + ArithmeticBasicValueType.Int16 => typeof(short), + ArithmeticBasicValueType.Int32 => typeof(int), + ArithmeticBasicValueType.Int64 => typeof(long), + ArithmeticBasicValueType.Float16 => typeof(Half), + ArithmeticBasicValueType.Float32 => typeof(float), + ArithmeticBasicValueType.Float64 => typeof(double), + ArithmeticBasicValueType.UInt8 => typeof(byte), + ArithmeticBasicValueType.UInt16 => typeof(short), + ArithmeticBasicValueType.UInt32 => typeof(int), + ArithmeticBasicValueType.UInt64 => typeof(long), + _ => null, + }; + /// /// Resolves the basic-value type for the given managed type. /// @@ -374,6 +397,80 @@ public static ArithmeticBasicValueType GetArithmeticBasicValueType( _ => ArithmeticBasicValueType.None, }; + /// + /// Forces the given basic type to 32 bits. + /// + /// The source type. + /// The limited arithmetic basic value. 
+ public static ArithmeticBasicValueType ForceTo32Bit( + this ArithmeticBasicValueType type) => + type switch + { + ArithmeticBasicValueType.UInt1 => ArithmeticBasicValueType.UInt32, + ArithmeticBasicValueType.UInt8 => ArithmeticBasicValueType.UInt32, + ArithmeticBasicValueType.UInt16 => ArithmeticBasicValueType.UInt32, + ArithmeticBasicValueType.UInt32 => ArithmeticBasicValueType.UInt32, + ArithmeticBasicValueType.UInt64 => ArithmeticBasicValueType.UInt32, + + ArithmeticBasicValueType.Int8 => ArithmeticBasicValueType.Int32, + ArithmeticBasicValueType.Int16 => ArithmeticBasicValueType.Int32, + ArithmeticBasicValueType.Int32 => ArithmeticBasicValueType.Int32, + ArithmeticBasicValueType.Int64 => ArithmeticBasicValueType.Int32, + + ArithmeticBasicValueType.Float16 => ArithmeticBasicValueType.Float32, + ArithmeticBasicValueType.Float32 => ArithmeticBasicValueType.Float32, + ArithmeticBasicValueType.Float64 => ArithmeticBasicValueType.Float32, + _ => ArithmeticBasicValueType.None, + }; + + /// + /// Limits the given basic type to 32 bits. + /// + /// The source type. + /// The limited arithmetic basic value. + public static ArithmeticBasicValueType LimitTo32Bit( + this ArithmeticBasicValueType type) + { + switch (type) + { + case ArithmeticBasicValueType.Int64: + return ArithmeticBasicValueType.Int32; + case ArithmeticBasicValueType.UInt64: + return ArithmeticBasicValueType.UInt32; + case ArithmeticBasicValueType.Float64: + return ArithmeticBasicValueType.Float32; + default: + return type; + } + } + + /// + /// Limits the given basic type to 64 bits. + /// + /// The source type. + /// The limited arithmetic basic value. 
+ public static ArithmeticBasicValueType LimitTo64Bit( + this ArithmeticBasicValueType type) + { + switch (type) + { + case ArithmeticBasicValueType.Int8: + case ArithmeticBasicValueType.Int16: + case ArithmeticBasicValueType.Int32: + return ArithmeticBasicValueType.Int64; + case ArithmeticBasicValueType.UInt1: + case ArithmeticBasicValueType.UInt8: + case ArithmeticBasicValueType.UInt16: + case ArithmeticBasicValueType.UInt32: + return ArithmeticBasicValueType.UInt64; + case ArithmeticBasicValueType.Float16: + case ArithmeticBasicValueType.Float32: + return ArithmeticBasicValueType.Float64; + default: + return type; + } + } + /// /// Returns true if the given type represents an int. ///