diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 25537afcc..653c474a7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,6 +154,12 @@ jobs: os: ${{ fromJson(needs.setup-os-matrix.outputs.os) }} library: [ILGPU, ILGPU.Algorithms] framework: [net6.0, net7.0] + flavor: [CPU, Velocity] + exclude: + - library: ILGPU.Algorithms + flavor: Velocity + - os: cuda + flavor: Velocity fail-fast: false runs-on: ${{ matrix.os }} steps: @@ -174,7 +180,7 @@ jobs: - name: Set test flavor id: test-flavor shell: bash - run: echo "flavor=$([[ "${{ matrix.os }}" == cuda ]] && echo "Cuda" || echo "CPU")" >> $GITHUB_OUTPUT + run: echo "flavor=$([[ "${{ matrix.os }}" == cuda ]] && echo "Cuda" || echo "${{ matrix.flavor }}")" >> $GITHUB_OUTPUT - name: Build and test run: dotnet test Src/${{ matrix.library }}.Tests.${{ steps.test-flavor.outputs.flavor }} --configuration=Release --framework=${{ matrix.framework }} -p:TreatWarningsAsErrors=true diff --git a/.gitignore b/.gitignore index a82ac601d..0fb0024c3 100644 --- a/.gitignore +++ b/.gitignore @@ -257,6 +257,7 @@ Src/ILGPU/AtomicFunctions.cs Src/ILGPU/Backends/PTX/PTXIntrinsics.Generated.cs Src/ILGPU/Backends/PTX/PTXLibDeviceMethods.cs Src/ILGPU/Backends/PTX/PTXLibDeviceNvvm.cs +Src/ILGPU/Backends/Velocity/Scalar/ScalarOperations.cs Src/ILGPU/Frontend/Intrinsic/RemappedIntrinsics.Generated.cs Src/ILGPU/HalfConversion.cs Src/ILGPU/IR/Construction/ArithmeticOperations.cs @@ -339,6 +340,7 @@ Src/ILGPU.Tests/.test.runsettings Src/ILGPU.Tests.CPU/Configurations.cs Src/ILGPU.Tests.Cuda/Configurations.cs Src/ILGPU.Tests.OpenCL/Configurations.cs +Src/ILGPU.Tests.Velocity/Configurations.cs # Generated test source files (Algorithms) Src/ILGPU.Algorithms.Tests/Generic/ConfigurationBase.cs diff --git a/Src/ILGPU.Tests.Velocity/.editorconfig b/Src/ILGPU.Tests.Velocity/.editorconfig new file mode 100644 index 000000000..09af4abce --- /dev/null +++ b/Src/ILGPU.Tests.Velocity/.editorconfig @@ -0,0 +1,7 @@ 
+[*.cs] + +# CA1707: Identifiers should not contain underscores +dotnet_diagnostic.CA1707.severity = none + +# CA1014: Mark assemblies with CLSCompliant +dotnet_diagnostic.CA1014.severity = none diff --git a/Src/ILGPU.Tests.Velocity/Configurations.tt b/Src/ILGPU.Tests.Velocity/Configurations.tt new file mode 100644 index 000000000..3290ac580 --- /dev/null +++ b/Src/ILGPU.Tests.Velocity/Configurations.tt @@ -0,0 +1,51 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: Configurations.tt/Configurations.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +<#@ template debug="false" hostspecific="true" language="C#" #> +<#@ include file="../ILGPU.Tests/Generic/ConfigurationBase.tt" #> +<#@ assembly name="System.Core" #> +<#@ import namespace="System.IO" #> +using Xunit; +using Xunit.Abstractions; + +<# +var configurationFile = Host.ResolvePath("../ILGPU.Tests/Configurations.txt"); +var configurations = TestConfig.Parse(configurationFile); +#> +namespace ILGPU.Tests.Velocity +{ +<# foreach (var (test, level, collection) in configurations) { #> +<# var name = $"Velocity{test}_{level}"; #> + [Collection("VelocityContextCollection<#= collection #>")] + public sealed partial class <#= name #> : <#= test #> + { + public <#= name #>( + ITestOutputHelper output, + VelocityTestContext<#= collection #> testContext) + : base(output, testContext) + { } + } + +<# } #> +<# foreach (var (config, level) in TestConfig.AllConfigurations) { #> + public class VelocityTestContext<#= config #> : VelocityTestContext + { + public VelocityTestContext<#= config #>() + : base(OptimizationLevel.<#= level #>) + { } + } + + [CollectionDefinition("VelocityContextCollection<#= config #>")] + public 
class VelocityContextCollection<#= config #> : + ICollectionFixture> { } + +<# } #> +} \ No newline at end of file diff --git a/Src/ILGPU.Tests.Velocity/ILGPU.Tests.Velocity.csproj b/Src/ILGPU.Tests.Velocity/ILGPU.Tests.Velocity.csproj new file mode 100644 index 000000000..34a741884 --- /dev/null +++ b/Src/ILGPU.Tests.Velocity/ILGPU.Tests.Velocity.csproj @@ -0,0 +1,59 @@ + + + + $(LibraryUnitTestTargetFrameworks) + false + + + + $(MSBuildProjectDirectory)\..\ILGPU.Tests\.test.runsettings + + + + true + AllEnabledByDefault + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + True + True + Configurations.tt + + + + + + TextTemplatingFileGenerator + Configurations.cs + + + + + + + + + + True + True + Configurations.tt + + + diff --git a/Src/ILGPU.Tests.Velocity/TestContext.cs b/Src/ILGPU.Tests.Velocity/TestContext.cs new file mode 100644 index 000000000..4431d6963 --- /dev/null +++ b/Src/ILGPU.Tests.Velocity/TestContext.cs @@ -0,0 +1,45 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: TestContext.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Runtime.Velocity; +using System; + +namespace ILGPU.Tests.Velocity +{ + /// + /// An abstract test context for Velocity accelerators. + /// + public abstract class VelocityTestContext : TestContext + { + /// + /// Creates a new test context instance. + /// + /// The optimization level to use. + /// The context preparation handler. 
+ protected VelocityTestContext( + OptimizationLevel optimizationLevel, + Action prepareContext) + : base( + optimizationLevel, + builder => prepareContext( + builder.Velocity(VelocityDeviceType.Scalar2)), + context => context.CreateVelocityAccelerator()) + { } + + /// + /// Creates a new test context instance. + /// + /// The optimization level to use. + protected VelocityTestContext(OptimizationLevel optimizationLevel) + : this(optimizationLevel, _ => { }) + { } + } +} diff --git a/Src/ILGPU.sln b/Src/ILGPU.sln index 56c81a8cd..74d3f49db 100644 --- a/Src/ILGPU.sln +++ b/Src/ILGPU.sln @@ -25,6 +25,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ILGPU.Algorithms.Tests.Open EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ILGPU.Algorithms.Tests", "ILGPU.Algorithms.Tests\ILGPU.Algorithms.Tests.csproj", "{18F2225C-82FD-4B01-8AF9-CF746D16EDA1}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ILGPU.Tests.Velocity", "ILGPU.Tests.Velocity\ILGPU.Tests.Velocity.csproj", "{4AFD2AAD-FA52-43EA-B9A8-10E948F9A139}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -71,6 +73,10 @@ Global {18F2225C-82FD-4B01-8AF9-CF746D16EDA1}.Debug|Any CPU.Build.0 = Debug|Any CPU {18F2225C-82FD-4B01-8AF9-CF746D16EDA1}.Release|Any CPU.ActiveCfg = Release|Any CPU {18F2225C-82FD-4B01-8AF9-CF746D16EDA1}.Release|Any CPU.Build.0 = Release|Any CPU + {4AFD2AAD-FA52-43EA-B9A8-10E948F9A139}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4AFD2AAD-FA52-43EA-B9A8-10E948F9A139}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4AFD2AAD-FA52-43EA-B9A8-10E948F9A139}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4AFD2AAD-FA52-43EA-B9A8-10E948F9A139}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -84,6 +90,7 @@ Global {AA73D3B1-873F-4A79-8347-E0781E382FF8} = {7701FE3C-4187-401C-9612-44667203B0E5} {6ED7EDA1-E6DA-4867-8084-C0327EEFB7A9} = 
{7701FE3C-4187-401C-9612-44667203B0E5} {18F2225C-82FD-4B01-8AF9-CF746D16EDA1} = {7701FE3C-4187-401C-9612-44667203B0E5} + {4AFD2AAD-FA52-43EA-B9A8-10E948F9A139} = {7701FE3C-4187-401C-9612-44667203B0E5} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {22270DEE-D42D-479D-A76F-B2E7A5F7C949} diff --git a/Src/ILGPU/Backends/Backend.cs b/Src/ILGPU/Backends/Backend.cs index 496e8b6ff..b9134f321 100644 --- a/Src/ILGPU/Backends/Backend.cs +++ b/Src/ILGPU/Backends/Backend.cs @@ -66,6 +66,11 @@ public enum BackendType /// IL, + /// + /// A Velocity backend. + /// + Velocity, + /// /// A PTX backend. /// @@ -123,7 +128,7 @@ public void OptimizedKernelContext( /// /// Represents the current kernel context in scope of a backend instance. /// - protected readonly ref struct BackendContext + protected internal readonly ref struct BackendContext { #region Nested Types diff --git a/Src/ILGPU/Backends/Velocity/Analyses/VelocityMasks.cs b/Src/ILGPU/Backends/Velocity/Analyses/VelocityMasks.cs new file mode 100644 index 000000000..31f8088bd --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/Analyses/VelocityMasks.cs @@ -0,0 +1,251 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityMasks.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. 
+// --------------------------------------------------------------------------------------- + +// Uncomment this line to or define a preprocessor symbol to enable detailed Velocity +// accelerator debugging: +// #define DEBUG_VELOCITY + +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Analyses.ControlFlowDirection; +using ILGPU.IR.Analyses.TraversalOrders; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using Loop = ILGPU.IR.Analyses.Loops< + ILGPU.IR.Analyses.TraversalOrders.ReversePostOrder, + ILGPU.IR.Analyses.ControlFlowDirection.Forwards>.Node; + +namespace ILGPU.Backends.Velocity.Analyses +{ + /// + /// A program analysis to gather information about vector masks to be used. + /// + /// The IL emitter type. + sealed class VelocityMasks + where TILEmitter : struct, IILEmitter + { + #region Instance + + /// + /// The set of all back edge source blocks. + /// + private BasicBlockSet backEdges; + + /// + /// The set of all loop blocks. + /// + private BasicBlockSet loopBlocks; + + /// + /// The set of all loop headers. + /// + private BasicBlockMap loopHeaders; + + /// + /// The set of all exit blocks. + /// + private BasicBlockMap> exitBlocks; + + /// + /// The set of all loop masks. + /// + private readonly Dictionary loopMasks = new(); + + /// + /// Maps blocks to their input masks. + /// + private readonly BasicBlockMap blockMasks; + + /// + /// Stores all loops. 
+ /// + private readonly Loops loops; + + public VelocityMasks( + BasicBlockCollection blocks, + TILEmitter emitter, + VelocityTargetSpecializer specializer) + { + loopHeaders = blocks.CreateMap(); + exitBlocks = blocks.CreateMap>(); + backEdges = blocks.CreateSet(); + loopBlocks = blocks.CreateSet(); + blockMasks = blocks.CreateMap(); + + // Iterate over all loops and determine all body blocks and all back edges + var cfg = blocks.CreateCFG(); + loops = cfg.CreateLoops(); + + loops.ProcessLoops(loop => + { + // Declare a new loop mask + loopMasks[loop] = emitter.DeclareLocal(specializer.WarpType32); + + // Register all loop headers + foreach (var header in loop.Headers) + loopHeaders.Add(header, loop); + + // Remember all body blocks + foreach (var block in loop.AllMembers) + loopBlocks.Add(block); + + // Remember all exits + foreach (var block in loop.Exits) + { + if (!exitBlocks.TryGetValue(block, out var set)) + { + set = new HashSet(2) + { + loop + }; + } + else + { + set.Add(loop); + } + exitBlocks[block] = set; + } + + // Register all back edges + foreach (var backEdge in loop.BackEdges) + backEdges.Add(backEdge); + }); + + // Remove all headers again from all loops from the instantly reset set + loops.ProcessLoops(loop => + { + foreach (var header in loop.Headers) + loopBlocks.Remove(header); + }); + + // Allocate local masks and initialize all of them + foreach (var block in blocks) + { + // Create a local variable to store the entry mask for this block + var blockMask = emitter.DeclareLocal(specializer.WarpType32); + blockMasks[block] = blockMask; + } + } + + #endregion + + #region Methods + + /// + /// Disables all internal lanes. 
+ /// + public void DisableAllLanes( + Method method, + TILEmitter emitter, + VelocityTargetSpecializer specializer) + { + foreach (var (basicBlock, blockMask) in blockMasks) + { + // Ignore the entry block + if (basicBlock == method.EntryBlock) + continue; + + // Ignore blocks that will be reset automatically + if (NeedsToRefreshMask(basicBlock)) + continue; + + specializer.PushNoLanesMask32(emitter); + emitter.Emit(LocalOperation.Store, blockMask); + } + } + + /// + /// Tries to map the given block to a loop and returns the loop if possible. + /// + /// The block to map to a loop. + /// The resolved loop (if any). + /// True if the given block could be mapped to a loop. + public bool TryGetLoop(BasicBlock block, [NotNullWhen(true)] out Loop? loop) => + loops.TryGetLoops(block, out loop); + + /// + /// Returns the block mask for the given basic block. + /// + /// The block to lookup. + /// The block mask to use. + public ILLocal GetBlockMask(BasicBlock block) => blockMasks[block]; + + /// + /// Returns the loop mask for the given loop. + /// + /// The loop to lookup. + /// The loop mask to use. + public ILLocal GetLoopMask(Loop loop) => loopMasks[loop]; + + /// + /// Returns true if the given block is a header and also returns a set of nested + /// loop headers that are implicitly controlled by this header. + /// + public bool IsHeader( + BasicBlock target, + [NotNullWhen(true)] + out Loop? loop) => + loopHeaders.TryGetValue(target, out loop); + + /// + /// Returns true if the given target block is an exit. + /// + public bool IsExit( + BasicBlock target, + [NotNullWhen(true)] + out Predicate? containsLoop) + { + if (exitBlocks.TryGetValue(target, out var loopsToExit)) + { + containsLoop = loopsToExit.Contains; + return true; + } + + containsLoop = null; + return false; + } + + /// + /// Returns true if the given block is a target potentially hit from a back edge. 
+ /// + public bool IsBackEdgeBlock(BasicBlock block) => backEdges.Contains(block); + + /// + /// Returns true if this block needs to refresh its mask instantly. + /// + public bool NeedsToRefreshMask(BasicBlock block) => + loopBlocks.Contains(block); + +#if DEBUG_VELOCITY + public void DumpAllMasks( + TILEmitter emitter, + VelocityTargetSpecializer specializer) + { + foreach (var (block, mask) in blockMasks) + { + emitter.Emit(LocalOperation.Load, mask); + specializer.DumpWarp32(emitter, $" {block.ToReferenceString()}"); + } + foreach (var (loop, mask) in loopMasks) + { + emitter.Emit(LocalOperation.Load, mask); + specializer.DumpWarp32( + emitter, + $" LOOP_{loop.Headers[0].ToReferenceString()}"); + } + } +#endif + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/Scalar/Scalar.cs b/Src/ILGPU/Backends/Velocity/Scalar/Scalar.cs new file mode 100644 index 000000000..7074544f7 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/Scalar/Scalar.cs @@ -0,0 +1,445 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: Scalar.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR.Values; +using ILGPU.Runtime.Velocity; +using System; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity.Scalar +{ + /// + /// A scalar 2-warp-wide sequential warp implementation. 
+ /// + sealed class Scalar : VelocityTargetSpecializer + { + #region Instance & General Methods + + public Scalar() + : base( + ScalarOperations2.WarpSize, + ScalarOperations2.WarpType32, + ScalarOperations2.WarpType64) + { } + + public override VelocityTypeGenerator CreateTypeGenerator( + VelocityCapabilityContext capabilityContext, + RuntimeSystem runtimeSystem) => + new ScalarTypeGenerator(capabilityContext, runtimeSystem); + + #endregion + + #region General + + public override void LoadLaneIndexVector32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.LoadLaneIndexVector32Method); + + public override void LoadLaneIndexVector64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.LoadLaneIndexVector64Method); + + public override void LoadWarpSizeVector32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.LoadVectorLengthVector32Method); + + public override void LoadWarpSizeVector64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.LoadVectorLengthVector64Method); + + #endregion + + #region Masks + + public override void PushAllLanesMask32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.LoadAllLanesMask32Method); + + public override void PushNoLanesMask32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.LoadNoLanesMask32Method); + + public override void ConvertMask32To64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.GetConvert32To64Operation( + VelocityWarpOperationMode.I)); + + public override void ConvertMask64To32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.GetConvert64To32Operation( + VelocityWarpOperationMode.I)); + + public override void IntersectMask32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.GetBinaryOperation32( + BinaryArithmeticKind.And, + VelocityWarpOperationMode.U)); + + public override void IntersectMask64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.GetBinaryOperation64( + BinaryArithmeticKind.And, + 
VelocityWarpOperationMode.U)); + + public override void UnifyMask32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.GetBinaryOperation32( + BinaryArithmeticKind.Or, + VelocityWarpOperationMode.U)); + + public override void UnifyMask64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.GetBinaryOperation64( + BinaryArithmeticKind.Or, + VelocityWarpOperationMode.U)); + + public override void NegateMask32(TILEmitter emitter) + { + // As an active lane is 1 and a non-active lane is 0... + emitter.Emit(OpCodes.Ldc_I4_1); + emitter.EmitCall(ScalarOperations2.FromScalarU32Method); + BinaryOperation32( + emitter, + BinaryArithmeticKind.Xor, + VelocityWarpOperationMode.U); + } + + public override void NegateMask64(TILEmitter emitter) + { + emitter.Emit(OpCodes.Ldc_I4_1); + emitter.Emit(OpCodes.Conv_U8); + emitter.EmitCall(ScalarOperations2.FromScalarU64Method); + BinaryOperation64( + emitter, + BinaryArithmeticKind.Xor, + VelocityWarpOperationMode.U); + } + + public override void CheckForAnyActiveLaneMask(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.CheckForAnyActiveLaneMethod); + + public override void CheckForNoActiveLaneMask(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.CheckForNoActiveLaneMethod); + + public override void CheckForEqualMasks(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.CheckForEqualMasksMethod); + + public override void GetNumberOfActiveLanes(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.GetNumberOfActiveLanesMethod); + + public override void ConditionalSelect32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Select32Method); + + public override void ConditionalSelect64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Select64Method); + + #endregion + + #region Scalar Values + + public override void LoadWarpSize32(TILEmitter emitter) => + emitter.EmitConstant(WarpSize); + + public override void LoadWarpSize64(TILEmitter emitter) => + 
emitter.EmitConstant((long)WarpSize); + + public override void ConvertBoolScalar(TILEmitter emitter) => + // As the initial bool value was already converted to an integer, we can + // simply reuse the integer value + ConvertScalarTo32(emitter, VelocityWarpOperationMode.I); + + public override void ConvertScalarTo32( + TILEmitter emitter, + VelocityWarpOperationMode mode) + { + switch (mode) + { + case VelocityWarpOperationMode.I: + emitter.EmitCall(ScalarOperations2.FromScalarI32Method); + break; + case VelocityWarpOperationMode.U: + emitter.EmitCall(ScalarOperations2.FromScalarU32Method); + return; + case VelocityWarpOperationMode.F: + emitter.EmitCall(ScalarOperations2.FromScalarF32Method); + break; + default: + throw new NotSupportedException(); + } + } + + public override void ConvertScalarTo64( + TILEmitter emitter, + VelocityWarpOperationMode mode) + { + switch (mode) + { + case VelocityWarpOperationMode.I: + emitter.EmitCall(ScalarOperations2.FromScalarI64Method); + break; + case VelocityWarpOperationMode.U: + emitter.EmitCall(ScalarOperations2.FromScalarU64Method); + return; + case VelocityWarpOperationMode.F: + emitter.EmitCall(ScalarOperations2.FromScalarF64Method); + break; + default: + throw new NotSupportedException(); + } + } + + #endregion + + #region Comparisons + + public override void Compare32( + TILEmitter emitter, + CompareKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetCompareOperation32(kind, mode)); + + public override void Compare64( + TILEmitter emitter, + CompareKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetCompareOperation64(kind, mode)); + + #endregion + + #region Conversions + + public override void ConvertSoftware32( + TILEmitter emitter, + ArithmeticBasicValueType sourceType, + ArithmeticBasicValueType targetType) => + emitter.EmitCall(ScalarOperations2.GetConvertOperation32( + sourceType, + targetType)); + + public override void ConvertSoftware64( + 
TILEmitter emitter, + ArithmeticBasicValueType sourceType, + ArithmeticBasicValueType targetType) => + emitter.EmitCall(ScalarOperations2.GetConvertOperation64( + sourceType, + targetType)); + + public override void Convert32( + TILEmitter emitter, + VelocityWarpOperationMode source, + VelocityWarpOperationMode target) => + emitter.EmitCall(ScalarOperations2.GetConvertOperation32( + source.GetArithmeticBasicValueType(is64Bit: false), + target.GetArithmeticBasicValueType(is64Bit: false))); + + public override void Convert64( + TILEmitter emitter, + VelocityWarpOperationMode source, + VelocityWarpOperationMode target) => + emitter.EmitCall(ScalarOperations2.GetConvertOperation64( + source.GetArithmeticBasicValueType(is64Bit: true), + target.GetArithmeticBasicValueType(is64Bit: true))); + + public override void Convert32To64( + TILEmitter emitter, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetConvert32To64Operation(mode)); + + public override void Convert64To32( + TILEmitter emitter, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetConvert64To32Operation(mode)); + + #endregion + + #region Arithmetics + + public override void UnaryOperation32( + TILEmitter emitter, + UnaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetUnaryOperation32(kind, mode)); + + public override void UnaryOperation64( + TILEmitter emitter, + UnaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetUnaryOperation64(kind, mode)); + + public override void BinaryOperation32( + TILEmitter emitter, + BinaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetBinaryOperation32(kind, mode)); + + public override void BinaryOperation64( + TILEmitter emitter, + BinaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetBinaryOperation64(kind, mode)); + + public 
override void TernaryOperation32( + TILEmitter emitter, + TernaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetTernaryOperation32(kind, mode)); + + public override void TernaryOperation64( + TILEmitter emitter, + TernaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetTernaryOperation64(kind, mode)); + + #endregion + + #region Atomics + + public override void AtomicCompareExchange32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.AtomicCompareExchange32Method); + + public override void AtomicCompareExchange64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.AtomicCompareExchange64Method); + + public override void Atomic32( + TILEmitter emitter, + AtomicKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetAtomicOperation32(kind, mode)); + + public override void Atomic64( + TILEmitter emitter, + AtomicKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(ScalarOperations2.GetAtomicOperation64(kind, mode)); + + #endregion + + #region Threads + + + public override void BarrierPopCount32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.BarrierPopCount32Method); + + public override void BarrierPopCount64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.BarrierPopCount64Method); + + public override void BarrierAnd32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.BarrierAnd32Method); + + public override void BarrierAnd64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.BarrierAnd64Method); + + public override void BarrierOr32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.BarrierOr32Method); + + public override void BarrierOr64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.BarrierOr64Method); + + public override void Broadcast32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Broadcast32Method); + + public override void 
Broadcast64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Broadcast64Method); + + public override void Shuffle32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Shuffle32Method); + + public override void Shuffle64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Shuffle64Method); + + public override void ShuffleUp32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.ShuffleUp32Method); + + public override void ShuffleUp64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.ShuffleUp64Method); + + public override void SubShuffleUp32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.SubShuffleUp32Method); + + public override void SubShuffleUp64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.SubShuffleUp64Method); + + public override void ShuffleDown32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.ShuffleDown32Method); + + public override void ShuffleDown64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.ShuffleDown64Method); + + public override void SubShuffleDown32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.SubShuffleDown32Method); + + public override void SubShuffleDown64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.SubShuffleDown64Method); + + public override void ShuffleXor32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.ShuffleXor32Method); + + public override void ShuffleXor64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.ShuffleXor64Method); + + public override void SubShuffleXor32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.SubShuffleXor32Method); + + public override void SubShuffleXor64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.SubShuffleXor64Method); + + #endregion + + #region IO + + public override void Load8(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Load8Method); + + public override void Load16(TILEmitter emitter) => + 
emitter.EmitCall(ScalarOperations2.Load16Method); + + public override void Load32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Load32Method); + + public override void Load64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Load64Method); + + public override void Store8(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Store8Method); + + public override void Store16(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Store16Method); + + public override void Store32(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Store32Method); + + public override void Store64(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.Store64Method); + + #endregion + + #region Misc + + public override void DebugAssertFailed(TILEmitter emitter) => + emitter.EmitCall(ScalarOperations2.DebugAssertFailedMethod); + + public override void WriteToOutput(TILEmitter emitter) => + throw new NotSupportedException(); + + public override void DumpWarp32( + TILEmitter emitter, + string? label = null) + { + if (string.IsNullOrEmpty(label)) + emitter.EmitConstant(string.Empty); + else + emitter.EmitConstant(label + ": "); + emitter.EmitCall(ScalarOperations2.DumpWarp32Method); + } + + public override void DumpWarp64( + TILEmitter emitter, + string? 
label = null) + { + if (string.IsNullOrEmpty(label)) + emitter.EmitConstant(string.Empty); + else + emitter.EmitConstant(label + ": "); + emitter.EmitCall(ScalarOperations2.DumpWarp64Method); + } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/Scalar/ScalarOperations.tt b/Src/ILGPU/Backends/Velocity/Scalar/ScalarOperations.tt new file mode 100644 index 000000000..ab98fdd53 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/Scalar/ScalarOperations.tt @@ -0,0 +1,1518 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: ScalarOperations.tt/ScalarOperations.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +<#@ template debug="false" hostspecific="true" language="C#" #> +<#@ include file="../VelocityOperations.ttinclude" #> +<#@ assembly name="System.Core" #> +<#@ import namespace="System.Linq" #> +<#@ import namespace="System.Text" #> +<#@ import namespace="System.Collections.Generic" #> +<#@ output extension=".cs" #> +<# +string rootPath = Host.ResolvePath("../../../Static"); +var unaryOps = GetUnaryMathOps(rootPath); +var binaryOps = GetBinaryMathOps(rootPath); +var ternaryOps = GetTernaryMathOps(rootPath); +var compareOperations = new (string, string)[] +{ + ("Equal", "=="), + ("NotEqual", "!="), + ("LessThan", "<"), + ("LessEqual", "<="), + ("GreaterThan", ">"), + ("GreaterEqual", ">=") +}; + +int warpSize = 2; +string GetWarpTypeName(string elementTypeName) => + $"({string.Join(", ", Enumerable.Repeat(elementTypeName, warpSize))})"; +var warpType32 = GetWarpTypeName("int"); +var warpType64 = GetWarpTypeName("long"); +#> +using ILGPU.IR.Values; +using ILGPU.Util; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using 
System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Runtime.CompilerServices; + +// ReSharper disable ArrangeMethodOrOperatorBody +// ReSharper disable RedundantCast +// disable: max_line_length + +namespace ILGPU.Backends.Velocity.Scalar +{ + static class ScalarOperations<#= warpSize #> + { + #region Warp Types + + public const int WarpSize = <#= warpSize #>; + public static readonly Type WarpType32 = typeof(<#= warpType32 #>); + public static readonly Type WarpType64 = typeof(<#= warpType64 #>); + + #endregion + + #region Initialization + + static ScalarOperations<#= warpSize #>() + { + InitUnaryOperations(); + InitBinaryOperations(); + InitTernaryOperations(); + InitializeCompareOperations(); + InitializeConvertOperations(); + InitializeVectorConvertOperations(); + InitializeAtomicOperations(); + } + + internal static MethodInfo GetMethod(string name) => + typeof(ScalarOperations2).GetMethod( + name, + BindingFlags.NonPublic | BindingFlags.Static) + .AsNotNull(); + + #endregion + + #region Creation + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static (TTarget, TTarget) CastWarp((T, T) source) + where T : struct => + Unsafe.As<(T, T), (TTarget, TTarget)>(ref source); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool CheckForAnyActiveLane(<#= warpType32 #> warp) + { + bool result = false; +<# for (int i = 1; i <= warpSize; ++i) { #> + result |= warp.Item<#= i #> != 0; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool CheckForNoActiveLane(<#= warpType32 #> warp) => + !CheckForAnyActiveLane(warp); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool CheckForEqualMasks( + <#= warpType32 #> firstMask, + <#= warpType32 #> secondMask) + { + bool result = true; +<# for (int i = 1; i <= warpSize; ++i) { #> + result &= firstMask.Item<#= i #> != 0 & secondMask.Item<#= i #> != 0; +<# } #> + return result; + } + + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int GetNumberOfActiveLanes(<#= warpType32 #> warp) + { + int result = 0; +<# for (int i = 1; i <= warpSize; ++i) { #> + result += warp.Item<#= i #> != 0 ? 1 : 0; +<# } #> + return result; + } + + public static readonly MethodInfo CheckForAnyActiveLaneMethod = + GetMethod(nameof(CheckForAnyActiveLane)); + public static readonly MethodInfo CheckForNoActiveLaneMethod = + GetMethod(nameof(CheckForNoActiveLane)); + public static readonly MethodInfo CheckForEqualMasksMethod = + GetMethod(nameof(CheckForEqualMasks)); + public static readonly MethodInfo GetNumberOfActiveLanesMethod = + GetMethod(nameof(GetNumberOfActiveLanes)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> LoadLaneIndexVector32() + { + Unsafe.SkipInit(out <#= warpType32 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = <#= i - 1 #>; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> LoadLaneIndexVector64() + { + Unsafe.SkipInit(out <#= warpType64 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = <#= i - 1 #>; +<# } #> + return result; + } + + public static readonly MethodInfo LoadLaneIndexVector32Method = + GetMethod(nameof(LoadLaneIndexVector32)); + public static readonly MethodInfo LoadLaneIndexVector64Method = + GetMethod(nameof(LoadLaneIndexVector64)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> LoadVectorLengthVector32() + { + Unsafe.SkipInit(out <#= warpType32 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = <#= warpSize #>; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> LoadVectorLengthVector64() + { + Unsafe.SkipInit(out <#= warpType64 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + 
result.Item<#= i #> = <#= warpSize #>; +<# } #> + return result; + } + + public static readonly MethodInfo LoadVectorLengthVector32Method = + GetMethod(nameof(LoadVectorLengthVector32)); + public static readonly MethodInfo LoadVectorLengthVector64Method = + GetMethod(nameof(LoadVectorLengthVector64)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> LoadAllLanesMask32() + { + Unsafe.SkipInit(out <#= warpType32 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = 1; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> LoadAllLanesMask64() + { + Unsafe.SkipInit(out <#= warpType64 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = 1L; +<# } #> + return result; + } + + public static readonly MethodInfo LoadAllLanesMask32Method = + GetMethod(nameof(LoadAllLanesMask32)); + public static readonly MethodInfo LoadAllLanesMask64Method = + GetMethod(nameof(LoadAllLanesMask64)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> LoadNoLanesMask32() => default; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> LoadNoLanesMask64() => default; + + public static readonly MethodInfo LoadNoLanesMask32Method = + GetMethod(nameof(LoadNoLanesMask32)); + public static readonly MethodInfo LoadNoLanesMask64Method = + GetMethod(nameof(LoadNoLanesMask64)); + + #endregion + + #region Generic Casts + +<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> Cast<#= prefix #>ToI32( + <#= GetWarpTypeName(typeName) #> input) => +<# if (typeName == "int") { #> + input; +<# } else { #> + CastWarp<<#= typeName #>, int>(input); +<# } #> + +<# if (typeName != "int") { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= 
GetWarpTypeName(typeName) #> CastITo<#= prefix #>32( + <#= warpType32 #> input) => + CastWarp>(input); + + public static readonly MethodInfo Cast<#= prefix #>ToI32Method = + GetMethod(nameof(Cast<#= prefix #>ToI32)); + +<# } #> + public static readonly MethodInfo CastITo<#= prefix #>32Method = + GetMethod(nameof(CastITo<#= prefix #>32)); + +<# } #> + +<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> Cast<#= prefix #>ToI64( + <#= GetWarpTypeName(typeName) #> input) => +<# if (typeName == "long") { #> + input; +<# } else { #> + CastWarp<<#= typeName #>, long>(input); +<# } #> + +<# if (typeName != "long") { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= GetWarpTypeName(typeName) #> CastITo<#= prefix #>64( + <#= warpType64 #> input) => + CastWarp>(input); + + public static readonly MethodInfo Cast<#= prefix #>ToI64Method = + GetMethod(nameof(Cast<#= prefix #>ToI64)); +<# } #> + + public static readonly MethodInfo CastITo<#= prefix #>64Method = + GetMethod(nameof(CastITo<#= prefix #>64)); + +<# } #> + + #endregion + + #region Scalar Operations + +<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> FromScalar<#= prefix #>32(<#= typeName #> scalar) + { + Unsafe.SkipInit(out <#= GetWarpTypeName(typeName) #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = scalar; +<# } #> + return Cast<#= prefix #>ToI32(result); + } + + public static readonly MethodInfo FromScalar<#= prefix #>32Method = + GetMethod(nameof(FromScalar<#= prefix #>32)); + +<# } #> + +<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> FromScalar<#= prefix #>64(<#= typeName #> scalar) + { + Unsafe.SkipInit(out <#= 
GetWarpTypeName(typeName) #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = scalar; +<# } #> + return Cast<#= prefix #>ToI64(result); + } + + public static readonly MethodInfo FromScalar<#= prefix #>64Method = + GetMethod(nameof(FromScalar<#= prefix #>64)); + +<# } #> + + #endregion + + #region Select Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> Select32( + <#= warpType32 #> mask, + <#= warpType32 #> left, + <#= warpType32 #> right) + { + Unsafe.SkipInit(out <#= warpType32 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = mask.Item<#= i #> == 0 ? left.Item<#= i #> : right.Item<#= i #>; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> Select64( + <#= warpType32 #> mask, + <#= warpType64 #> left, + <#= warpType64 #> right) + { + Unsafe.SkipInit(out <#= warpType64 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = mask.Item<#= i #> == 0 ? 
left.Item<#= i #> : right.Item<#= i #>; +<# } #> + return result; + } + + public static readonly MethodInfo Select32Method = GetMethod(nameof(Select32)); + public static readonly MethodInfo Select64Method = GetMethod(nameof(Select64)); + + #endregion + + #region Unary Operations + +<# foreach (var op in unaryOps) { #> +<# foreach (var (_, prefix, typeName, _) in + ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32( + <#= warpType32 #> warp) + { + var value = CastITo<#= prefix #>32(warp); +<# if (op.IsPredicate) { #> + Unsafe.SkipInit(out <#= warpType32 #> result); +<# } else { #> + Unsafe.SkipInit(out <#= GetWarpTypeName(typeName) #> result); +<# } #> +<# for (int i = 1; i <= warpSize; ++i) { #> +<# if (op.Name == "Neg" && typeName == "uint") { #> + var result<#= i #> = ~<#= $"value.Item{i}" #>; +<# } else { #> + var result<#= i #> = <#= op.GetOpOrCall(isBool: false, $"value.Item{i}") #>; +<# } #> +<# if (op.IsPredicate) { #> + result.Item<#= i #> = result<#= i #> ? 1 : 0; +<# } else { #> + result.Item<#= i #> = (<#= typeName #>)result<#= i #>; +<# } #> +<# } #> +<# if (op.IsPredicate) { #> + return result; +<# } else { #> + return Cast<#= prefix #>ToI32(result); +<# } #> + } + +<# } #> +<# foreach (var (_, prefix, typeName, _) in + ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> +<# var returnType = op.Velocity.ReturnAsWarp32 || op.IsPredicate + ? "int" + : "long"; #> +<# var implType = op.Velocity.ReturnAsWarp32 || op.IsPredicate + ? 
"int" + : typeName; #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= GetWarpTypeName(returnType) #> <#= op.Name #><#= prefix #>64( + <#= warpType64 #> warp) + { + var value = CastITo<#= prefix #>64(warp); +<# if (op.IsPredicate) { #> + Unsafe.SkipInit(out <#= warpType32 #> result); +<# } else { #> + Unsafe.SkipInit(out <#= GetWarpTypeName(implType) #> result); +<# } #> +<# for (int i = 1; i <= warpSize; ++i) { #> +<# if (op.Name == "Neg" && typeName == "ulong") { #> + var result<#= i #> = ~<#= $"value.Item{i}" #>; +<# } else { #> + var result<#= i #> = <#= op.GetOpOrCall(isBool: false, $"value.Item{i}") #>; +<# } #> +<# if (op.IsPredicate) { #> + result.Item<#= i #> = result<#= i #> ? 1 : 0; +<# } else { #> + result.Item<#= i #> = (<#= implType #>)result<#= i #>; +<# } #> +<# } #> +<# if (op.IsPredicate || op.Velocity.ReturnAsWarp32) { #> + return result; +<# } else { #> + return Cast<#= prefix #>ToI64(result); +<# } #> + } + +<# } #> + +<# } #> + + private static readonly Dictionary< + (UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> + UnaryOperations32 = new(); + private static readonly Dictionary< + (UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> + UnaryOperations64 = new(); + + private static void InitUnaryOperations() + { +<# foreach (var op in unaryOps) { #> +<# foreach (var (_, prefix, _, _) in + ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> + UnaryOperations32.Add( + (UnaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>), + GetMethod(nameof(<#= op.Name #><#= prefix #>32))); +<# } #> + +<# foreach (var (_, prefix, _, _) in + ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> + UnaryOperations64.Add( + (UnaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>), + GetMethod(nameof(<#= op.Name #><#= prefix #>64))); +<# } #> +<# } #> + } + + public static MethodInfo GetUnaryOperation32( + UnaryArithmeticKind kind, + 
VelocityWarpOperationMode mode) => UnaryOperations32[(kind, mode)]; + public static MethodInfo GetUnaryOperation64( + UnaryArithmeticKind kind, + VelocityWarpOperationMode mode) => UnaryOperations64[(kind, mode)]; + + #endregion + + #region Binary Operations + +<# foreach (var op in binaryOps) { #> +<# foreach (var (_, prefix, typeName, _) in + ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32( + <#= warpType32 #> first, + <#= warpType32 #> second) + { + var left = CastITo<#= prefix #>32(first); + var right = CastITo<#= prefix #>32(second); + Unsafe.SkipInit(out <#= GetWarpTypeName(typeName) #> result); + +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = <#= op.GetOpOrCall( + isBool: false, + $"left.Item{i}", + $"right.Item{i}") #>; +<# } #> + return Cast<#= prefix #>ToI32(result); + } + +<# } #> +<# foreach (var (_, prefix, typeName, _) in + ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> <#= op.Name #><#= prefix #>64( + <#= warpType64 #> first, + <#= warpType64 #> second) + { + var left = CastITo<#= prefix #>64(first); + var right = CastITo<#= prefix #>64(second); + Unsafe.SkipInit(out <#= GetWarpTypeName(typeName) #> result); + +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = <#= op.GetOpOrCall( + isBool: false, + $"left.Item{i}", + $"right.Item{i}") #>; +<# } #> + return Cast<#= prefix #>ToI64(result); + } + +<# } #> +<# } #> + + private static readonly Dictionary< + (BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> + BinaryOperations32 = new(); + private static readonly Dictionary< + (BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> + BinaryOperations64 = new(); + + private static void InitBinaryOperations() + { +<# foreach (var op in binaryOps) { #> +<# 
foreach (var (_, prefix, _, _) in + ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> + BinaryOperations32.Add( + (BinaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>), + GetMethod(nameof(<#= op.Name #><#= prefix #>32))); +<# } #> + +<# foreach (var (_, prefix, _, _) in + ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> + BinaryOperations64.Add( + (BinaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>), + GetMethod(nameof(<#= op.Name #><#= prefix #>64))); +<# } #> +<# } #> + } + + public static MethodInfo GetBinaryOperation32( + BinaryArithmeticKind kind, + VelocityWarpOperationMode mode) => BinaryOperations32[(kind, mode)]; + public static MethodInfo GetBinaryOperation64( + BinaryArithmeticKind kind, + VelocityWarpOperationMode mode) => BinaryOperations64[(kind, mode)]; + + #endregion + + #region Ternary Operations + +<# foreach (var op in ternaryOps) { #> +<# foreach (var (_, prefix, typeName, _) in + ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32( + <#= warpType32 #> first, + <#= warpType32 #> second, + <#= warpType32 #> third) + { + var source = CastITo<#= prefix #>32(first); + var add = CastITo<#= prefix #>32(second); + var mul = CastITo<#= prefix #>32(third); + Unsafe.SkipInit(out <#= GetWarpTypeName(typeName) #> result); + +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = <#= op.GetOpOrCall( + isBool: false, + $"source.Item{i}", + $"add.Item{i}", + $"mul.Item{i}") #>; +<# } #> + return Cast<#= prefix #>ToI32(result); + } + +<# } #> +<# foreach (var (_, prefix, typeName, _) in + ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> <#= op.Name #><#= prefix #>64( + <#= warpType64 #> first, + <#= warpType64 #> second, + <#= 
warpType64 #> third) + { + var source = CastITo<#= prefix #>64(first); + var add = CastITo<#= prefix #>64(second); + var mul = CastITo<#= prefix #>64(third); + Unsafe.SkipInit(out <#= GetWarpTypeName(typeName) #> result); + +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = <#= op.GetOpOrCall( + isBool: false, + $"source.Item{i}", + $"add.Item{i}", + $"mul.Item{i}") #>; +<# } #> + return Cast<#= prefix #>ToI64(result); + } + +<# } #> +<# } #> + + private static readonly Dictionary< + (TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> + TernaryOperations32 = new(); + private static readonly Dictionary< + (TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo> + TernaryOperations64 = new(); + + private static void InitTernaryOperations() + { +<# foreach (var op in ternaryOps) { #> +<# foreach (var (_, prefix, _, _) in + ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #> + TernaryOperations32.Add( + (TernaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>), + GetMethod(nameof(<#= op.Name #><#= prefix #>32))); +<# } #> + +<# foreach (var (_, prefix, _, _) in + ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #> + TernaryOperations64.Add( + (TernaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>), + GetMethod(nameof(<#= op.Name #><#= prefix #>64))); +<# } #> +<# } #> + } + + public static MethodInfo GetTernaryOperation32( + TernaryArithmeticKind kind, + VelocityWarpOperationMode mode) => TernaryOperations32[(kind, mode)]; + public static MethodInfo GetTernaryOperation64( + TernaryArithmeticKind kind, + VelocityWarpOperationMode mode) => TernaryOperations64[(kind, mode)]; + + #endregion + + #region Compare Operations + +<# foreach (var (kind, op) in compareOperations) { #> +<# foreach (var (_, prefix, _, _) in ImplementationTypes32) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> Compare<#= kind #><#= prefix #>32( 
+ <#= warpType32 #> first, + <#= warpType32 #> second) + { + var left = CastITo<#= prefix #>32(first); + var right = CastITo<#= prefix #>32(second); + Unsafe.SkipInit(out <#= warpType32 #> result); + +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = left.Item<#= i #> <#= op #> right.Item<#= i #> ? 1 : 0; +<# } #> + return result; + } + +<# } #> +<# foreach (var (_, prefix, _, _) in ImplementationTypes64) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> Compare<#= kind #><#= prefix #>64( + <#= warpType64 #> first, + <#= warpType64 #> second) + { + var left = CastITo<#= prefix #>64(first); + var right = CastITo<#= prefix #>64(second); + Unsafe.SkipInit(out <#= warpType32 #> result); + +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = left.Item<#= i #> <#= op #> right.Item<#= i #> ? 1 : 0; +<# } #> + return result; + } + +<# } #> +<# } #> + private static readonly Dictionary< + (CompareKind, VelocityWarpOperationMode, bool), + MethodInfo> CompareOperations = new(); + + private static void InitializeCompareOperations() + { +<# foreach (var (kind, _) in compareOperations) { #> +<# foreach (var (_, prefix, _, _) in ImplementationTypes32) { #> + CompareOperations.Add( + (CompareKind.<#= kind #>, VelocityWarpOperationMode.<#= prefix #>, false), + GetMethod(nameof(Compare<#= kind #><#= prefix #>32))); +<# } #> +<# foreach (var (_, prefix, _, _) in ImplementationTypes64) { #> + CompareOperations.Add( + (CompareKind.<#= kind #>, VelocityWarpOperationMode.<#= prefix #>, true), + GetMethod(nameof(Compare<#= kind #><#= prefix #>64))); +<# } #> +<# } #> + } + + public static MethodInfo GetCompareOperation32( + CompareKind kind, + VelocityWarpOperationMode mode) => + CompareOperations[(kind, mode, false)]; + + public static MethodInfo GetCompareOperation64( + CompareKind kind, + VelocityWarpOperationMode mode) => + CompareOperations[(kind, mode, true)]; + + #endregion + + #region Convert Operations 
+ +<# foreach (var sourceType in Warp32ConvTypes) { #> +<# foreach (var targetType in Warp32ConvTypes) { #> +<# var sourceImplType32 = GetImplementationType32(sourceType.Kind); #> +<# var implType32 = GetImplementationType32(targetType.Kind); #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> Convert<#= sourceType.Name #>To<#= targetType.Name #>_32( + <#= warpType32 #> warp) + { +<# if (sourceType.GetBasicValueType() == targetType.GetBasicValueType()) { #> + return warp; +<# } else { #> + var value = CastITo<#= sourceImplType32.Prefix #>32(warp); + Unsafe.SkipInit(out <#= GetWarpTypeName(implType32.TypeName) #> result); + +<# for (int i = 1; i <= warpSize; ++i) { #> + var item<#= i #> = (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item<#= i #>; + result.Item<#= i #> = (<#= implType32.TypeName #>)item<#= i #>; +<# } #> + + return Cast<#= implType32.Prefix #>ToI32(result); +<# } #> + } + +<# } #> +<# } #> + +<# foreach (var sourceType in Warp64ConvTypes) { #> +<# foreach (var targetType in Warp64ConvTypes) { #> +<# var sourceImplType64 = GetImplementationType64(sourceType.Kind); #> +<# var implType64 = GetImplementationType64(targetType.Kind); #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> Convert<#= sourceType.Name #>To<#= targetType.Name #>_64( + <#= warpType64 #> warp) + { +<# if (sourceType.GetBasicValueType() == targetType.GetBasicValueType()) { #> + return warp; +<# } else { #> + var value = CastITo<#= sourceImplType64.Prefix #>64(warp); + Unsafe.SkipInit(out <#= GetWarpTypeName(implType64.TypeName) #> result); + +<# for (int i = 1; i <= warpSize; ++i) { #> + var item<#= i #> = (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item<#= i #>; + result.Item<#= i #> = (<#= implType64.TypeName #>)item<#= i #>; +<# } #> + + return Cast<#= implType64.Prefix #>ToI64(result); +<# } #> + } + +<# } #> +<# } #> + + private static readonly Dictionary< + 
(ArithmeticBasicValueType, ArithmeticBasicValueType, bool), + MethodInfo> ConvertOperations = new(); + + private static void InitializeConvertOperations() + { +<# foreach (var sourceType in Warp32ConvTypes) { #> +<# foreach (var targetType in Warp32ConvTypes) { #> +<# var sourceName = sourceType.GetArithmeticBasicValueType(); #> +<# var targetName = targetType.GetArithmeticBasicValueType(); #> + ConvertOperations.Add( + (ArithmeticBasicValueType.<#= sourceName #>, + ArithmeticBasicValueType.<#= targetName #>, + false), + GetMethod(nameof(Convert<#= sourceType.Name #>To<#= targetType.Name #>_32))); +<# } #> +<# } #> +<# foreach (var sourceType in Warp64ConvTypes) { #> +<# foreach (var targetType in Warp64ConvTypes) { #> +<# var sourceName = sourceType.GetArithmeticBasicValueType(); #> +<# var targetName = targetType.GetArithmeticBasicValueType(); #> + ConvertOperations.Add( + (ArithmeticBasicValueType.<#= sourceName #>, + ArithmeticBasicValueType.<#= targetName #>, + true), + GetMethod(nameof(Convert<#= sourceType.Name #>To<#= targetType.Name #>_64))); +<# } #> +<# } #> + } + + public static MethodInfo GetConvertOperation32( + ArithmeticBasicValueType source, + ArithmeticBasicValueType target) => + ConvertOperations[(source, target, false)]; + + public static MethodInfo GetConvertOperation64( + ArithmeticBasicValueType source, + ArithmeticBasicValueType target) => + ConvertOperations[(source, target, true)]; + + #endregion + + #region Vector Convert Operations + +<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> Convert64To32<#= prefix #>(<#= warpType64 #> warp) + { + Unsafe.SkipInit(out <#= GetWarpTypeName(typeName) #> result); + var value = CastITo<#= prefix #>64(warp); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = (<#= typeName #>)value.Item<#= i #>; +<# } #> + return Cast<#= prefix #>ToI32(result); + } + +<# } #> +<# foreach (var 
(_, prefix, typeName, _) in ImplementationTypes64) { #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> Convert32To64<#= prefix #>(<#= warpType32 #> warp) + { + Unsafe.SkipInit(out <#= GetWarpTypeName(typeName) #> result); + var value = CastITo<#= prefix #>32(warp); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = (<#= typeName #>)value.Item<#= i #>; +<# } #> + return Cast<#= prefix #>ToI64(result); + } + +<# } #> + internal static readonly Dictionary< + (VelocityWarpOperationMode, bool), + MethodInfo> VectorConvertOperations = new(); + + internal static void InitializeVectorConvertOperations() + { +<# foreach (var (_, prefix, _, _) in ImplementationTypes32) { #> + VectorConvertOperations.Add( + (VelocityWarpOperationMode.<#= prefix #>, false), + GetMethod(nameof(Convert64To32<#= prefix #>))); +<# } #> +<# foreach (var (_, prefix, _, _) in ImplementationTypes64) { #> + VectorConvertOperations.Add( + (VelocityWarpOperationMode.<#= prefix #>, true), + GetMethod(nameof(Convert32To64<#= prefix #>))); +<# } #> + } + + public static MethodInfo GetConvert32To64Operation( + VelocityWarpOperationMode mode) => + VectorConvertOperations[(mode, true)]; + + public static MethodInfo GetConvert64To32Operation( + VelocityWarpOperationMode mode) => + VectorConvertOperations[(mode, false)]; + + #endregion + + #region Atomic Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe <#= warpType32 #> AtomicCompareExchange32( + <#= warpType32 #> mask, + <#= warpType64 #> target, + <#= warpType32 #> compare, + <#= warpType32 #> value) + { + var result = value; +<# for (int i = 1; i <= warpSize; ++i) { #> + if (mask.Item<#= i #> != 0) + { + result.Item<#= i #> = Atomic.CompareExchange( + ref Unsafe.AsRef((void*)target.Item<#= i #>), + compare.Item<#= i #>, + value.Item<#= i #>); + } +<# } #> + return result; + } + + public static readonly MethodInfo AtomicCompareExchange32Method = + 
GetMethod(nameof(AtomicCompareExchange32)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe <#= warpType64 #> AtomicCompareExchange64( + <#= warpType32 #> mask, + <#= warpType64 #> target, + <#= warpType64 #> compare, + <#= warpType64 #> value) + { + var result = value; +<# for (int i = 1; i <= warpSize; ++i) { #> + if (mask.Item<#= i #> != 0) + { + result.Item<#= i #> = Atomic.CompareExchange( + ref Unsafe.AsRef((void*)target.Item<#= i #>), + compare.Item<#= i #>, + value.Item<#= i #>); + } +<# } #> + return result; + } + + public static readonly MethodInfo AtomicCompareExchange64Method = + GetMethod(nameof(AtomicCompareExchange64)); + +<# foreach (var (op, isBinary) in AtomicOperations) { #> +<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #> +<# var targetPrefix = isBinary ? "U" : prefix; #> +<# var targetTypeName = isBinary ? "uint" : typeName; #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe <#= warpType32 #> Atomic<#= op #><#= prefix #>32( + <#= warpType32 #> mask, + <#= warpType64 #> target, + <#= warpType32 #> value) + { + var sourceValue = CastITo<#= targetPrefix #>32(value); + var result = sourceValue; +<# for (int i = 1; i <= warpSize; ++i) { #> + if (mask.Item<#= i #> != 0) + { + result.Item<#= i #> = Atomic.<#= op #>( + ref Unsafe.AsRef<<#= targetTypeName #>>((void*)target.Item<#= i #>), + sourceValue.Item<#= i #>); + } +<# } #> + return Cast<#= targetPrefix #>ToI32(result); + } + +<# } #> + +<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #> +<# var targetPrefix = isBinary ? "U" : prefix; #> +<# var targetTypeName = isBinary ? 
"ulong" : typeName; #> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe <#= warpType64 #> Atomic<#= op #><#= prefix #>64( + <#= warpType32 #> mask, + <#= warpType64 #> target, + <#= warpType64 #> value) + { + var sourceValue = CastITo<#= targetPrefix #>64(value); + var result = sourceValue; +<# for (int i = 1; i <= warpSize; ++i) { #> + if (mask.Item<#= i #> != 0) + { + result.Item<#= i #> = Atomic.<#= op #>( + ref Unsafe.AsRef<<#= targetTypeName #>>((void*)target.Item<#= i #>), + sourceValue.Item<#= i #>); + } +<# } #> + return Cast<#= targetPrefix #>ToI64(result); + } + +<# } #> +<# } #> + + internal static readonly Dictionary< + (AtomicKind, VelocityWarpOperationMode, bool), + MethodInfo> AtomicOperations = new(); + + internal static void InitializeAtomicOperations() + { +<# foreach (var (op, _) in AtomicOperations) { #> +<# foreach (var (_, prefix, _, _) in ImplementationTypes32) { #> + AtomicOperations.Add( + (AtomicKind.<#= op #>, VelocityWarpOperationMode.<#= prefix #>, false), + GetMethod(nameof(Atomic<#= op #><#= prefix #>32))); +<# } #> +<# foreach (var (_, prefix, _, _) in ImplementationTypes64) { #> + AtomicOperations.Add( + (AtomicKind.<#= op #>, VelocityWarpOperationMode.<#= prefix #>, true), + GetMethod(nameof(Atomic<#= op #><#= prefix #>64))); +<# } #> +<# } #> + } + + public static MethodInfo GetAtomicOperation32( + AtomicKind kind, + VelocityWarpOperationMode mode) => + AtomicOperations[(kind, mode, false)]; + + public static MethodInfo GetAtomicOperation64( + AtomicKind kind, + VelocityWarpOperationMode mode) => + AtomicOperations[(kind, mode, true)]; + + #endregion + + #region Thread Operations + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> BarrierPopCount32( + <#= warpType32 #> mask, + <#= warpType32 #> warp) + { + int count = 0; +<# for (int i = 1; i <= warpSize; ++i) { #> + count += mask.Item<#= i #> != 0 ? (warp.Item<#= i #> != 0 ? 
1 : 0) : 0; +<# } #> + return FromScalarI32(count); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> BarrierPopCount64( + <#= warpType32 #> mask, + <#= warpType64 #> warp) + { + int count = 0; +<# for (int i = 1; i <= warpSize; ++i) { #> + count += mask.Item<#= i #> != 0 ? (warp.Item<#= i #> != 0 ? 1 : 0) : 0; +<# } #> + return FromScalarI64((long)count); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> BarrierAnd32( + <#= warpType32 #> mask, + <#= warpType32 #> warp) + { + int andMask = 1; +<# for (int i = 1; i <= warpSize; ++i) { #> + andMask &= mask.Item<#= i #> != 0 ? warp.Item<#= i #> : 0; +<# } #> + return FromScalarI32(andMask); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> BarrierAnd64( + <#= warpType32 #> mask, + <#= warpType64 #> warp) + { + long andMask = 1; +<# for (int i = 1; i <= warpSize; ++i) { #> + andMask &= mask.Item<#= i #> != 0 ? warp.Item<#= i #> : 0; +<# } #> + return FromScalarI64(andMask); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> BarrierOr32( + <#= warpType32 #> mask, + <#= warpType32 #> warp) + { + int orMask = 0; +<# for (int i = 1; i <= warpSize; ++i) { #> + orMask |= mask.Item<#= i #> != 0 ? warp.Item<#= i #> : 0; +<# } #> + return FromScalarI32(orMask); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> BarrierOr64( + <#= warpType32 #> mask, + <#= warpType64 #> warp) + { + long orMask = 0; +<# for (int i = 1; i <= warpSize; ++i) { #> + orMask |= mask.Item<#= i #> != 0 ? 
warp.Item<#= i #> : 0; +<# } #> + return FromScalarI64(orMask); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int GetShuffledLane32( + <#= warpType32 #> value, + int sourceLane) + { + switch (sourceLane) + { +<# for (int i = 0; i < warpSize - 1; ++i) { #> + case <#= i #>: + return value.Item<#= i + 1 #>; +<# } #> + default: + return value.Item<#= warpSize #>; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> Broadcast32( + <#= warpType32 #> mask, + <#= warpType32 #> value, + <#= warpType32 #> sourceLane) + { + // Mask is unused at the moment + int sourceLaneIdx = sourceLane.Item1; + int result = GetShuffledLane32(value, sourceLaneIdx); + return Select32(mask, value, FromScalarI32(result)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static long GetShuffledLane64( + <#= warpType64 #> value, + int sourceLane) + { + switch (sourceLane) + { +<# for (int i = 0; i < warpSize - 1; ++i) { #> + case <#= i #>: + return value.Item<#= i + 1 #>; +<# } #> + default: + return value.Item<#= warpSize #>; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> Broadcast64( + <#= warpType32 #> mask, + <#= warpType64 #> value, + <#= warpType64 #> sourceLane) + { + // Mask is unused at the moment + int sourceLaneIdx = (int)sourceLane.Item1; + long result = GetShuffledLane64(value, sourceLaneIdx); + return Select64(mask, value, FromScalarI64(result)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> Shuffle32( + <#= warpType32 #> mask, + <#= warpType32 #> value, + <#= warpType32 #> sourceLanes) + { + // Mask is unused at the moment + Unsafe.SkipInit(out <#= warpType32 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = mask.Item<#= i #> != 0 + ? 
GetShuffledLane32(value, sourceLanes.Item<#= i #>) + : value.Item<#= i #>; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> Shuffle64( + <#= warpType32 #> mask, + <#= warpType64 #> value, + <#= warpType64 #> sourceLanes) + { + // Mask is unused at the moment + Unsafe.SkipInit(out <#= warpType64 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = mask.Item<#= i #> != 0 + ? GetShuffledLane64(value, (int)sourceLanes.Item<#= i #>) + : value.Item<#= i #>; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void ComputeShuffleConfig( + <#= warpType32 #> width, + out <#= warpType32 #> lane, + out <#= warpType32 #> offset) + { + lane = RemI32(LoadLaneIndexVector32(), width); + offset = MulI32(DivI32(lane, width), width); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> ShuffleUp32( + <#= warpType32 #> mask, + <#= warpType32 #> warp, + <#= warpType32 #> delta) + { + var lane = SubI32(LoadLaneIndexVector32(), delta); + return Shuffle32(mask, warp, lane); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> SubShuffleUp32( + <#= warpType32 #> mask, + <#= warpType32 #> warp, + <#= warpType32 #> delta, + <#= warpType32 #> width) + { + ComputeShuffleConfig(width, out var lane, out var offset); + var adjustedLane = SubI32(lane, delta); + return Shuffle32(mask, warp, AddI32(adjustedLane, offset)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> ShuffleUp64( + <#= warpType32 #> mask, + <#= warpType64 #> warp, + <#= warpType32 #> delta, + <#= warpType32 #> width) + { + var lane = SubI32(LoadLaneIndexVector32(), delta); + return Shuffle64(mask, warp, lane); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> SubShuffleUp64( + <#= warpType32 #> mask, + <#= 
warpType64 #> warp, + <#= warpType32 #> delta, + <#= warpType32 #> width) + { + ComputeShuffleConfig(width, out var lane, out var offset); + var adjustedLane = SubI32(lane, delta); + return Shuffle64(mask, warp, AddI32(adjustedLane, offset)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> ShuffleDown32( + <#= warpType32 #> mask, + <#= warpType32 #> warp, + <#= warpType32 #> delta) + { + var lane = AddI32(LoadLaneIndexVector32(), delta); + return Shuffle32(mask, warp, lane); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> SubShuffleDown32( + <#= warpType32 #> mask, + <#= warpType32 #> warp, + <#= warpType32 #> delta, + <#= warpType32 #> width) + { + ComputeShuffleConfig(width, out var lane, out var offset); + var adjustedLane = AddI32(lane, delta); + return Shuffle32(mask, warp, AddI32(adjustedLane, offset)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> ShuffleDown64( + <#= warpType32 #> mask, + <#= warpType64 #> warp, + <#= warpType32 #> delta) + { + var lane = AddI32(LoadLaneIndexVector32(), delta); + return Shuffle64(mask, warp, lane); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> SubShuffleDown64( + <#= warpType32 #> mask, + <#= warpType64 #> warp, + <#= warpType32 #> delta, + <#= warpType32 #> width) + { + ComputeShuffleConfig(width, out var lane, out var offset); + var adjustedLane = AddI32(lane, delta); + return Shuffle64(mask, warp, AddI32(adjustedLane, offset)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> ShuffleXor32( + <#= warpType32 #> mask, + <#= warpType32 #> warp, + <#= warpType32 #> laneMask) + { + var lane = XorU32(LoadLaneIndexVector32(), laneMask); + return Shuffle32(mask, warp, lane); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType32 #> SubShuffleXor32( + <#= 
warpType32 #> mask, + <#= warpType32 #> warp, + <#= warpType32 #> laneMask, + <#= warpType32 #> width) + { + ComputeShuffleConfig(width, out var lane, out var offset); + var adjustedLane = XorU32(lane, laneMask); + return Shuffle32(mask, warp, AddI32(adjustedLane, offset)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> ShuffleXor64( + <#= warpType32 #> mask, + <#= warpType64 #> warp, + <#= warpType32 #> laneMask) + { + var lane = XorU32(LoadLaneIndexVector32(), laneMask); + return Shuffle64(mask, warp, lane); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static <#= warpType64 #> SubShuffleXor64( + <#= warpType32 #> mask, + <#= warpType64 #> warp, + <#= warpType32 #> laneMask, + <#= warpType32 #> width) + { + ComputeShuffleConfig(width, out var lane, out var offset); + var adjustedLane = XorU32(lane, laneMask); + return Shuffle64(mask, warp, AddI32(adjustedLane, offset)); + } + + public static readonly MethodInfo BarrierPopCount32Method = + GetMethod(nameof(BarrierPopCount32)); + public static readonly MethodInfo BarrierPopCount64Method = + GetMethod(nameof(BarrierPopCount64)); + public static readonly MethodInfo BarrierAnd32Method = + GetMethod(nameof(BarrierAnd32)); + public static readonly MethodInfo BarrierAnd64Method = + GetMethod(nameof(BarrierAnd64)); + public static readonly MethodInfo BarrierOr32Method = + GetMethod(nameof(BarrierOr32)); + public static readonly MethodInfo BarrierOr64Method = + GetMethod(nameof(BarrierOr64)); + public static readonly MethodInfo Broadcast32Method = + GetMethod(nameof(Broadcast32)); + public static readonly MethodInfo Broadcast64Method = + GetMethod(nameof(Broadcast64)); + public static readonly MethodInfo Shuffle32Method = + GetMethod(nameof(Shuffle32)); + public static readonly MethodInfo Shuffle64Method = + GetMethod(nameof(Shuffle64)); + public static readonly MethodInfo ShuffleUp32Method = + GetMethod(nameof(ShuffleUp32)); + public static readonly 
MethodInfo SubShuffleUp32Method = + GetMethod(nameof(SubShuffleUp32)); + public static readonly MethodInfo ShuffleUp64Method = + GetMethod(nameof(ShuffleUp64)); + public static readonly MethodInfo SubShuffleUp64Method = + GetMethod(nameof(SubShuffleUp64)); + public static readonly MethodInfo ShuffleDown32Method = + GetMethod(nameof(ShuffleDown32)); + public static readonly MethodInfo SubShuffleDown32Method = + GetMethod(nameof(SubShuffleDown32)); + public static readonly MethodInfo ShuffleDown64Method = + GetMethod(nameof(ShuffleDown64)); + public static readonly MethodInfo SubShuffleDown64Method = + GetMethod(nameof(SubShuffleDown64)); + public static readonly MethodInfo ShuffleXor32Method = + GetMethod(nameof(ShuffleXor32)); + public static readonly MethodInfo SubShuffleXor32Method = + GetMethod(nameof(SubShuffleXor32)); + public static readonly MethodInfo ShuffleXor64Method = + GetMethod(nameof(ShuffleXor64)); + public static readonly MethodInfo SubShuffleXor64Method = + GetMethod(nameof(SubShuffleXor64)); + + #endregion + + #region IO + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe <#= warpType32 #> Load8( + <#= warpType32 #> mask, + <#= warpType64 #> address) + { + Unsafe.SkipInit(out <#= warpType32 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = mask.Item<#= i #> != 0 + ? Unsafe.AsRef((void*)address.Item<#= i #>) + : 0; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe <#= warpType32 #> Load16( + <#= warpType32 #> mask, + <#= warpType64 #> address) + { + Unsafe.SkipInit(out <#= warpType32 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = mask.Item<#= i #> != 0 + ? 
Unsafe.AsRef((void*)address.Item<#= i #>) + : 0; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe <#= warpType32 #> Load32( + <#= warpType32 #> mask, + <#= warpType64 #> address) + { + Unsafe.SkipInit(out <#= warpType32 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = mask.Item<#= i #> != 0 + ? Unsafe.AsRef((void*)address.Item<#= i #>) + : 0; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe <#= warpType64 #> Load64( + <#= warpType32 #> mask, + <#= warpType64 #> address) + { + Unsafe.SkipInit(out <#= warpType64 #> result); +<# for (int i = 1; i <= warpSize; ++i) { #> + result.Item<#= i #> = mask.Item<#= i #> != 0 + ? Unsafe.AsRef((void*)address.Item<#= i #>) + : 0; +<# } #> + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe void Store8( + <#= warpType32 #> mask, + <#= warpType64 #> address, + <#= warpType32 #> value) + { +<# for (int i = 1; i <= warpSize; ++i) { #> + if (mask.Item<#= i #> != 0) + Unsafe.AsRef((void*)address.Item<#= i #>) = (byte)(value.Item<#= i #> & 0xff); +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe void Store16( + <#= warpType32 #> mask, + <#= warpType64 #> address, + <#= warpType32 #> value) + { +<# for (int i = 1; i <= warpSize; ++i) { #> + if (mask.Item<#= i #> != 0) + Unsafe.AsRef((void*)address.Item<#= i #>) = (ushort)(value.Item<#= i #> & 0xffff); +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe void Store32( + <#= warpType32 #> mask, + <#= warpType64 #> address, + <#= warpType32 #> value) + { +<# for (int i = 1; i <= warpSize; ++i) { #> + if (mask.Item<#= i #> != 0) + Unsafe.AsRef((void*)address.Item<#= i #>) = value.Item<#= i #>; +<# } #> + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void Store64( + <#= 
warpType32 #> mask, + <#= warpType64 #> address, + <#= warpType64 #> value) + { +<# for (int i = 1; i <= warpSize; ++i) { #> + if (mask.Item<#= i #> != 0) + Unsafe.AsRef((void*)address.Item<#= i #>) = value.Item<#= i #>; +<# } #> + } + + public static readonly MethodInfo Load8Method = + GetMethod(nameof(Load8)); + public static readonly MethodInfo Load16Method = + GetMethod(nameof(Load16)); + public static readonly MethodInfo Load32Method = + GetMethod(nameof(Load32)); + public static readonly MethodInfo Load64Method = + GetMethod(nameof(Load64)); + + public static readonly MethodInfo Store8Method = + GetMethod(nameof(Store8)); + public static readonly MethodInfo Store16Method = + GetMethod(nameof(Store16)); + public static readonly MethodInfo Store32Method = + GetMethod(nameof(Store32)); + public static readonly MethodInfo Store64Method = + GetMethod(nameof(Store64)); + + #endregion + + #region Misc + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void DebugAssertFailed( + <#= warpType32 #> mask, + <#= warpType32 #> value, + string message, + string fileName, + int line, + string method) + { + // Check if any lane failed the check + var failedAssertionMask = XorU32(FromScalarU32(1), value); + if (BarrierOr32(mask, failedAssertionMask).Item1 != 0) + Trace.Assert(false, message, $"@ {fileName}:{line} in {method}"); + } + + public static readonly MethodInfo DebugAssertFailedMethod = + GetMethod(nameof(DebugAssertFailed)); + + [SuppressMessage( + "Globalization", + "CA1303:Do not pass literals as localized parameters", + Justification = "Basic invariant string")] + internal static void DumpWarp32(<#= warpType32 #> value, string label) + { + Console.Write(label); +<# for (int i = 1; i < warpSize; ++i) { #> + Console.Write(value.Item<#= i #>); + Console.Write(", "); +<# } #> + Console.Write(value.Item<#= warpSize #>); + Console.WriteLine(); + } + + public static readonly MethodInfo DumpWarp32Method = + GetMethod(nameof(DumpWarp32)); + + 
[SuppressMessage( + "Globalization", + "CA1303:Do not pass literals as localized parameters", + Justification = "Basic invariant string")] + internal static void DumpWarp64(<#= warpType64 #> value, string label) + { + Console.Write(label); +<# for (int i = 1; i < warpSize; ++i) { #> + Console.Write(value.Item<#= i #>); + Console.Write(", "); +<# } #> + Console.Write(value.Item<#= warpSize #>); + Console.WriteLine(); + } + + public static readonly MethodInfo DumpWarp64Method = + GetMethod(nameof(DumpWarp64)); + + #endregion + } +} \ No newline at end of file diff --git a/Src/ILGPU/Backends/Velocity/Scalar/ScalarTypeGenerator.cs b/Src/ILGPU/Backends/Velocity/Scalar/ScalarTypeGenerator.cs new file mode 100644 index 000000000..a89a7b1cd --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/Scalar/ScalarTypeGenerator.cs @@ -0,0 +1,71 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: ScalarTypeGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.Runtime.Velocity; +using System; + +namespace ILGPU.Backends.Velocity.Scalar +{ + /// + /// A scalar type generator to be used with the Velocity backend. + /// + sealed class ScalarTypeGenerator : VelocityTypeGenerator + { + #region Static + + /// + /// Maps basic types to vectorized basic types. 
+ /// + private static readonly Type[] VectorizedBasicTypeMapping = new Type[] + { + ScalarOperations2.WarpType32, // None/Unknown + + ScalarOperations2.WarpType32, // Int1 + ScalarOperations2.WarpType32, // Int8 + ScalarOperations2.WarpType32, // Int16 + ScalarOperations2.WarpType32, // Int32 + ScalarOperations2.WarpType64, // Int64 + + ScalarOperations2.WarpType32, // Float16 + ScalarOperations2.WarpType32, // Float32 + ScalarOperations2.WarpType64, // Float64 + }; + + #endregion + + #region Instance + + /// + /// Constructs a new IL scalar code generator. + /// + /// The parent capability context. + /// The parent runtime system. + public ScalarTypeGenerator( + VelocityCapabilityContext capabilityContext, + RuntimeSystem runtimeSystem) + : base(capabilityContext, runtimeSystem, ScalarOperations2.WarpSize) + { } + + #endregion + + #region Type System + + public override Type GetVectorizedBasicType(BasicValueType basicValueType) + { + if (basicValueType == BasicValueType.Float16 && !CapabilityContext.Float16) + throw VelocityCapabilityContext.GetNotSupportedFloat16Exception(); + return VectorizedBasicTypeMapping[(int)basicValueType]; + } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/Transformations/VelocityBlockScheduling.cs b/Src/ILGPU/Backends/Velocity/Transformations/VelocityBlockScheduling.cs new file mode 100644 index 000000000..cf24f70de --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/Transformations/VelocityBlockScheduling.cs @@ -0,0 +1,69 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityBlockScheduling.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. 
+// --------------------------------------------------------------------------------------- + +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Transformations; +using ILGPU.IR.Values; +using ILGPU.Resources; +using System.Linq; + +namespace ILGPU.Backends.Velocity.Transformations +{ + /// + /// Adapts the actual block branch order in a way to ensure that loop exits are + /// visited after the main loop bodies. + /// + sealed class VelocityBlockScheduling : UnorderedTransformation + { + /// + /// Applies a Velocity-specific block order. + /// + protected override bool PerformTransformation(Method.Builder builder) + { + // We change the control-flow structure during the transformation but + // need to get information about previous predecessors and successors + builder.AcceptControlFlowUpdates(accept: true); + + // Compute all loops of this method + var cfg = builder.SourceBlocks.CreateCFG(); + var loops = cfg.CreateLoops(); + + loops.ProcessLoops(loop => + { + // Compute a set of all exit blocks + var exits = loop.Exits.ToHashSet(); + + // Check all blocks leaving the loop + foreach (var breaker in loop.Breakers) + { + // If we hit an if branch and the false target is leaving the loop, + // we will have to negate the condition to ensure that the ordering + // visits the internal block first + if (breaker.Terminator is IfBranch ifBranch && + exits.Contains(ifBranch.FalseTarget)) + { + // Invert the current branch + ifBranch.Invert(builder[breaker]); + } + else if (breaker.Terminator is SwitchBranch switchBranch) + { + // Skip this case for now + throw switchBranch.Location.GetNotSupportedException( + ErrorMessages.NotSupportedILInstruction); + } + } + }); + + return true; + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityArgumentMapper.cs b/Src/ILGPU/Backends/Velocity/VelocityArgumentMapper.cs new file mode 100644 index 000000000..baf13f1b8 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityArgumentMapper.cs @@ -0,0 +1,34 @@ +// 
--------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityArgumentMapper.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.PTX; + +namespace ILGPU.Backends.Velocity +{ + /// + /// Constructs mappings Velocity kernels. + /// + /// The current velocity backend uses the PTX argument mapper. + sealed class VelocityArgumentMapper : PTXArgumentMapper + { + #region Instance + + /// + /// Constructs a new IL argument mapper. + /// + /// The current context. + public VelocityArgumentMapper(Context context) + : base(context) + { } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityBackend.cs b/Src/ILGPU/Backends/Velocity/VelocityBackend.cs new file mode 100644 index 000000000..5fb22a55d --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityBackend.cs @@ -0,0 +1,190 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityBackend.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. 
+// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Backends.IL; +using ILGPU.Backends.IL.Transformations; +using ILGPU.Backends.Velocity.Transformations; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Transformations; +using ILGPU.Runtime; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System.Diagnostics.CodeAnalysis; + +namespace ILGPU.Backends.Velocity +{ + /// + /// Represents an automatic vectorization backend to be used with Velocity. + /// + /// The IL emitter type. + class VelocityBackend : + CodeGeneratorBackend< + VelocityBackend.Handler, + VelocityCodeGenerator.GeneratorArgs, + VelocityCodeGenerator, + object> + where TILEmitter : struct, IILEmitter + { + #region Nested Types + + /// + /// Represents the handler delegate type of custom code-generation handlers. + /// + /// The current backend. + /// The current emitter. + /// The value to generate code for. + public delegate void Handler( + VelocityBackend backend, + in TILEmitter emitter, + Value value); + + #endregion + + #region Instance + + /// + /// Constructs a new Velocity backend. + /// + /// The context to use. + /// The supported capabilities. + /// The current warp size. + /// The argument mapper to use. + /// The specializer to generate instructions. 
+ public VelocityBackend( + Context context, + VelocityCapabilityContext capabilities, + int warpSize, + VelocityArgumentMapper argumentMapper, + VelocityTargetSpecializer specializer) + : base( + context, + capabilities, + BackendType.Velocity, + argumentMapper) + { + WarpSize = warpSize; + Specializer = specializer; + TypeGenerator = specializer.CreateTypeGenerator( + capabilities, + context.RuntimeSystem); + + InitIntrinsicProvider(); + InitializeKernelTransformers(builder => + { + var transformerBuilder = Transformer.CreateBuilder( + TransformerConfiguration.Empty); + transformerBuilder.AddBackendOptimizations( + new ILAcceleratorSpecializer( + AcceleratorType.Velocity, + PointerType, + warpSize, + Context.Properties.EnableAssertions, + Context.Properties.EnableIOOperations), + context.Properties.InliningMode, + context.Properties.OptimizationLevel); + + // Transform all if and switch branches to make them compatible with + // the internal vectorization engine + transformerBuilder.Add(new VelocityBlockScheduling()); + + builder.Add(transformerBuilder.ToTransformer()); + }); + } + + #endregion + + #region Properties + + /// + /// Returns the current warp size to be used. + /// + public int WarpSize { get; } + + /// + /// Returns the current specializer. + /// + internal VelocityTargetSpecializer Specializer { get; } + + /// + /// Returns the current type generator. + /// + internal VelocityTypeGenerator TypeGenerator { get; } + + /// + /// Returns the associated . 
+ /// + public new VelocityArgumentMapper ArgumentMapper => + base.ArgumentMapper.AsNotNullCast(); + + #endregion + + [SuppressMessage( + "Reliability", + "CA2000:Dispose objects before losing scope", + Justification = "Module will be disposed during finalization")] + protected override object CreateKernelBuilder( + EntryPoint entryPoint, + in BackendContext backendContext, + in KernelSpecialization specialization, + out VelocityCodeGenerator.GeneratorArgs data) + { + // Create a new generation module + var module = new VelocityGenerationModule( + Context.RuntimeSystem, + Specializer, + TypeGenerator, + backendContext, + entryPoint); + data = new VelocityCodeGenerator.GeneratorArgs( + Specializer, + module, + entryPoint); + return null!; + } + + protected override VelocityCodeGenerator + CreateFunctionCodeGenerator( + Method method, + Allocas allocas, + VelocityCodeGenerator.GeneratorArgs data) => + new VelocityFunctionGenerator(data, method, allocas); + + protected override VelocityCodeGenerator + CreateKernelCodeGenerator( + in AllocaKindInformation sharedAllocations, + Method method, + Allocas allocas, + VelocityCodeGenerator.GeneratorArgs data) => + new VelocityKernelFunctionGenerator( + data, + method, + allocas); + + protected override CompiledKernel CreateKernel( + EntryPoint entryPoint, + CompiledKernel.KernelInfo? 
kernelInfo, + object builder, + VelocityCodeGenerator.GeneratorArgs data) + { + using var module = data.Module; + return new VelocityCompiledKernel( + Context, + entryPoint, + module.KernelMethod, + module.ParametersType, + module.ParametersTypeConstructor, + module.ParameterFields, + module.SharedAllocationSize); + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.IO.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.IO.cs new file mode 100644 index 000000000..1c34f795a --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.IO.cs @@ -0,0 +1,179 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.IO.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.IR.Values; +using ILGPU.Resources; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public void GenerateCode(Load load) + { + var mask = GetBlockMask(load.BasicBlock); + var source = GetLocal(load.Source); + CreateLoad(mask, source, load.Type); + Store(load); + } + + /// + /// Creates code to load primitive values and pointers from memory while using + /// the given mask to differentiate between active and inactive lanes. 
+ /// + private void CreateNonStructureLoad(TypeNode typeNode) + { + switch (typeNode) + { + case PrimitiveType primitiveType: + Specializer.Load(Emitter, primitiveType.BasicValueType); + break; + case PointerType _: + Specializer.Load(Emitter, BasicValueType.Int64); + break; + default: + throw typeNode.GetNotSupportedException( + ErrorMessages.NotSupportedType); + } + } + + /// + /// Creates a sequence of load instructions to load a vectorized value via + /// specialized IO operations. + /// + private void CreateLoad(ILLocal mask, ILLocal source, TypeNode typeNode) + { + if (typeNode is StructureType structureType) + { + // Allocate a new temporary allocation to fill all fields + var vectorizedType = TypeGenerator.GetVectorizedType(structureType); + var temporary = Emitter.DeclareLocal(vectorizedType); + Emitter.LoadNull(temporary); + + // Fill the temporary structure instance with values + foreach (var (fieldType, fieldAccess) in structureType) + { + // Load the variable address + Emitter.Emit(LocalOperation.LoadAddress, temporary); + + // Load the local mask + Emitter.Emit(LocalOperation.Load, mask); + + // Adjust the actual source address based on offsets in the type + // definition + // Adjust the target offset + long fieldOffset = structureType.GetOffset(fieldAccess); + Emitter.EmitConstant(fieldOffset); + Specializer.ConvertScalarTo64(Emitter, VelocityWarpOperationMode.I); + Emitter.Emit(LocalOperation.Load, source); + Specializer.BinaryOperation64( + Emitter, + BinaryArithmeticKind.Add, + VelocityWarpOperationMode.U); + + // Load the converted field type + CreateNonStructureLoad(fieldType); + + // Store it into out structure field + Emitter.StoreField(vectorizedType, fieldAccess.Index); + } + + // Load local variable onto the stack containing all required information + Emitter.Emit(LocalOperation.Load, temporary); + } + else + { + // Load the local mask + Emitter.Emit(LocalOperation.Load, mask); + + // Load the type directly + 
Emitter.Emit(LocalOperation.Load, source); + + CreateNonStructureLoad(typeNode); + } + } + + + /// + /// Generates code to store primitive values and pointers from memory while using + /// the given mask to differentiate between active and inactive lanes. + /// + private void GenerateNonStructureStore(TypeNode typeNode) + { + var basicValueType = typeNode switch + { + PrimitiveType primitiveType => primitiveType.BasicValueType, + PaddingType paddingType => paddingType.BasicValueType, + PointerType _ => BasicValueType.Int64, + _ => throw typeNode.GetNotSupportedException( + ErrorMessages.NotSupportedType, typeNode) + }; + + Specializer.Store(Emitter, basicValueType); + } + + /// + public void GenerateCode(Store store) + { + var mask = GetBlockMask(store.BasicBlock); + var target = GetLocal(store.Target); + var value = GetLocal(store.Value); + var type = store.Value.Type; + + if (type is StructureType structureType) + { + // Iterate over all fields and store them + var vectorizedType = GetVectorizedType(type); + foreach (var (fieldType, fieldAccess) in structureType) + { + // Load the current mask + Emitter.Emit(LocalOperation.Load, mask); + + // Load target directly + Emitter.Emit(LocalOperation.Load, target); + + // Get the source field offset in bytes + long fieldOffset = structureType.GetOffset(fieldAccess); + if (fieldOffset != 0L) + { + Emitter.EmitConstant(fieldOffset); + Specializer.ConvertScalarTo64(Emitter, + VelocityWarpOperationMode.U); + + // Load target address and adjust offset + Specializer.BinaryOperation64( + Emitter, + BinaryArithmeticKind.Add, + VelocityWarpOperationMode.U); + } + + // Load the current field value + Emitter.Emit(LocalOperation.Load, value); + Emitter.LoadField(vectorizedType, fieldAccess.Index); + + // Store the field into memory + GenerateNonStructureStore(fieldType); + } + } + else + { + Emitter.Emit(LocalOperation.Load, mask); + Emitter.Emit(LocalOperation.Load, target); + Emitter.Emit(LocalOperation.Load, value); + + 
GenerateNonStructureStore(type); + } + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Terminators.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Terminators.cs new file mode 100644 index 000000000..284e3b714 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Terminators.cs @@ -0,0 +1,421 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.Terminators.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.Backends.Velocity.Analyses; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Analyses.ControlFlowDirection; +using ILGPU.IR.Analyses.TraversalOrders; +using ILGPU.IR.Values; +using ILGPU.Util; +using System; +using System.Reflection.Emit; +using Loop = ILGPU.IR.Analyses.Loops< + ILGPU.IR.Analyses.TraversalOrders.ReversePostOrder, + ILGPU.IR.Analyses.ControlFlowDirection.Forwards>.Node; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + #region Branch Builder + + /// + /// Helps building branches by taking branch targets and masks into account. + /// + protected sealed class BranchBuilder + { + private readonly VelocityCodeGenerator codeGenerator; + + private readonly BasicBlock current; + private readonly ILLocal currentMask; + + private readonly bool isBackEdgeBlock; + + private readonly Loops.Node? currentLoop; + private readonly PhiBindings.PhiBindingCollection? 
phiBindings; + private InlineList<(BasicBlock Target, Action Condition)> targets; + + public BranchBuilder( + VelocityCodeGenerator parent, + BasicBlock currentBlock) + { + codeGenerator = parent; + current = currentBlock; + currentMask = parent.GetBlockMask(current); + + isBackEdgeBlock = Masks.IsBackEdgeBlock(currentBlock); + + Masks.TryGetLoop(currentBlock, out currentLoop); + phiBindings = codeGenerator.phiBindings.TryGetBindings( + current, + out var bindings) ? bindings : null; + targets = InlineList<(BasicBlock, Action)> + .Create(currentBlock.Successors.Length); + } + + public VelocityMasks Masks => codeGenerator.masks; + public TILEmitter Emitter => codeGenerator.Emitter; + public VelocityTargetSpecializer Specializer => codeGenerator.Specializer; + + /// + /// Records a branch target. + /// + /// The target block to branch to. + /// The pass mask action. + public void RecordBranchTarget(BasicBlock target, Action passMask) + { + // Check for a jump into a (possibly different) loop + if (Masks.IsHeader(target, out var targetLoop)) + { + // Check for a jump backwards + if (isBackEdgeBlock) + { + // Pass the current mask + passMask(); + + if (target != current) + { + // We are branching to our loop header and need to update our + // loop mask of the header we are branching back to + codeGenerator.UnifyWithMaskOf(target, keepOnStack: false); + } + else // Store the mask directly + { + Emitter.Emit(LocalOperation.Store, currentMask); + } + + targets.Add((target, () => + { + Emitter.Emit(LocalOperation.Load, currentMask); + var loopMask = Masks.GetLoopMask(targetLoop); + Emitter.Emit(LocalOperation.Load, loopMask); + Specializer.IntersectMask32(Emitter); + + // Check for active masks of the target block to test whether + // we actually branch back to the loop header + Specializer.CheckForAnyActiveLaneMask(Emitter); + })); + } + else + { + // Pass the current mask + passMask(); + + if (target != current) + { + // We are branching forwards and need to pass the 
mask while + // unifying all lanes + codeGenerator.UnifyWithMaskOf(target, keepOnStack: true); + } + else // Store the mask directly + { + Emitter.Emit(OpCodes.Dup); + Emitter.Emit(LocalOperation.Store, currentMask); + } + + // Set the actual loop mask + var loopMask = Masks.GetLoopMask(targetLoop); + Emitter.Emit(LocalOperation.Load, loopMask); + Specializer.UnifyMask32(Emitter); + Emitter.Emit(LocalOperation.Store, loopMask); + + // Disable all loop body blocks + TryResetLoopBody(targetLoop); + } + } + else if (Masks.IsExit(target, out var isExitFor)) + { + // Check whether we leaving our loop at the moment + if (currentLoop is not null && isExitFor(currentLoop)) + { + // Based on the ordering ensured by the VelocityBlockScheduling + // transformation, we know that the exit block cannot be reached + // from within the loop implicitly. Therefore, it is sufficient to + // check whether the unified header masks are equal to the current + // mask of the target loop. This means that all lanes have reached + // this point and we can branch to the exit block. 
+ + // Notify the loop mask that some lanes passed to this block left + var loopMask = Masks.GetLoopMask(currentLoop.AsNotNull()); + passMask(); + Emitter.Emit(OpCodes.Dup); + codeGenerator.DisableSpecificLanes(loopMask); + + // Unify with the target mask to cause the lanes to be ready when + // we continue processing the exit block + codeGenerator.UnifyWithMaskOf(target, keepOnStack: false); + + targets.Add((target, () => + { + // Load loop mask to see whether we have any lanes left + Emitter.Emit(LocalOperation.Load, loopMask); + + // Check whether all lane masks have been disabled + Specializer.CheckForNoActiveLaneMask(Emitter); + })); + } + else + { + // We are just branching forwards + passMask(); + codeGenerator.UnifyWithMaskOf(target, keepOnStack: false); + } + } + else // Default case in which we do not change any loop state + { + // Pass the current mask + passMask(); + + // We are branching forwards and need to pass the mask while unifying + // all lanes + codeGenerator.UnifyWithMaskOf(target, keepOnStack: false); + } + + // Bind all phi values on this edge + BindPhis(target, passMask); + } + + /// + /// Tries to reset masks for all loop members. + /// + /// The target loop to use. + private void TryResetLoopBody(Loop targetLoop) + { + foreach (var block in targetLoop.AllMembers) + { + if (!targetLoop.ContainsExclusively(block)) + continue; + codeGenerator.TryResetBlockLanes(block); + } + } + + /// + /// Binds phi values flowing about a particular edge + /// + private void BindPhis(BasicBlock target, Action passMask) + { + // Check whether we have any bindings for this block + if (!phiBindings.HasValue) + return; + + // Filter all phis flowing through this edge + codeGenerator.BindPhis(phiBindings.Value, target, passMask); + } + + /// + /// Emits a branch if required. 
+ /// + public void EmitBranch() + { + // Check for required branches + if (targets.Count < 1) + { + // Disable our lanes as we passed this block + codeGenerator.DisableLanesOf(current); + + // Leave here as we do not require a single branch + return; + } + + // Optimize for the most trivial case in which we have a single branch + if (targets.Count == 1) + { + var (target, condition) = targets[0]; + + // Emit our condition checks + condition(); + + if (target != current) + { + // Disable all lanes at this point + codeGenerator.DisableLanesOf(current); + } + + // Jump to our target block + Emitter.Emit(OpCodes.Brtrue, codeGenerator.blockLookup[target]); + } + else + { + // We have reached the most difficult case, in which we have to find + // the right block to jump to. However, we do not need any sorting + // of the targets as the basic block scheduling transformation took + // care of that -> targets are in the right order (descending) + for (int i = targets.Count - 1; i >= 0; --i) + { + var (target, condition) = targets[i]; + + // Declare temp label to branch to in case we need to branch + var tempLabel = Emitter.DeclareLabel(); + + // Emit our condition + condition(); + + // Skip the following branches in case of a failed check + Emitter.Emit(OpCodes.Brfalse, tempLabel); + + if (target != current) + { + // Disable all lanes at this point + codeGenerator.DisableLanesOf(current); + } + + // Jump to our target block + Emitter.Emit(OpCodes.Br, codeGenerator.blockLookup[target]); + + // Emit the actual temp label to branch to in case to continue + // processing + Emitter.MarkLabel(tempLabel); + } + + // Disable all lanes at this point before (potentially) leaving + codeGenerator.DisableLanesOf(current); + } + + } + } + + #endregion + + #region Methods + + /// + public abstract void GenerateCode(ReturnTerminator returnTerminator); + + /// + public void GenerateCode(UnconditionalBranch branch) + { + // Create a branch if required + var branchBuilder = 
CreateBranchBuilder(branch.BasicBlock); + + // Cache temp local + var tempLocal = Emitter.DeclareLocal(Specializer.WarpType32); + Emitter.Emit(LocalOperation.Load, GetBlockMask(branch.BasicBlock)); + Emitter.Emit(LocalOperation.Store, tempLocal); + + branchBuilder.RecordBranchTarget(branch.Target, () => + { + // Pass the current mask + Emitter.Emit(LocalOperation.Load, tempLocal); + }); + branchBuilder.EmitBranch(); + } + + /// + public void GenerateCode(IfBranch branch) + { + // Get current mask + var currentMask = GetBlockMask(branch.BasicBlock); + + // Create a new branch builder + var branchBuilder = CreateBranchBuilder(branch.BasicBlock); + + // Load condition + Load(branch.Condition); + Emitter.Emit(OpCodes.Dup); + + // Adjust the true mask + var trueMask = Emitter.DeclareLocal(Specializer.WarpType32); + IntersectWithMask(currentMask); + Emitter.Emit(LocalOperation.Store, trueMask); + + // Intersect negated with the current mask + var falseMask = Emitter.DeclareLocal(Specializer.WarpType32); + Specializer.NegateMask32(Emitter); + IntersectWithMask(currentMask); + Emitter.Emit(LocalOperation.Store, falseMask); + + branchBuilder.RecordBranchTarget(branch.TrueTarget, () => + { + Emitter.Emit(LocalOperation.Load, trueMask); + }); + + branchBuilder.RecordBranchTarget(branch.FalseTarget, () => + { + Emitter.Emit(LocalOperation.Load, falseMask); + }); + + // Emit branch (if required) + branchBuilder.EmitBranch(); + } + + /// + public void GenerateCode(SwitchBranch branch) + { + // Get current mask + var currentMask = GetBlockMask(branch.BasicBlock); + + // Create a new branch builder + var branchBuilder = CreateBranchBuilder(branch.BasicBlock); + + // Check lower bounds: case < 0 + Load(branch.Condition); + Emitter.EmitConstant(0); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + Specializer.Compare32( + Emitter, + CompareKind.LessThan, + VelocityWarpOperationMode.I); + + // Check upper bounds: case >= num cases + Load(branch.Condition); + 
Emitter.EmitConstant(branch.NumCasesWithoutDefault); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + Specializer.Compare32( + Emitter, + CompareKind.GreaterEqual, + VelocityWarpOperationMode.I); + + // Store unified branch mask + Specializer.UnifyMask32(Emitter); + IntersectWithMask(currentMask); + + var outOfBoundsMask = Emitter.DeclareLocal(Specializer.WarpType32); + Emitter.Emit(LocalOperation.Store, outOfBoundsMask); + + // Record branch to the default block + branchBuilder.RecordBranchTarget(branch.DefaultBlock, () => + { + Emitter.Emit(LocalOperation.Load, outOfBoundsMask); + }); + + // Adjust masks for each target + for (int i = 0; i < branch.NumCasesWithoutDefault; ++i) + { + // Check whether the conditional selector is equal to the current case + Load(branch.Condition); + Emitter.EmitConstant(i); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + Specializer.Compare32( + Emitter, + CompareKind.Equal, + VelocityWarpOperationMode.I); + + // Store the current mask + var currentCaseMask = Emitter.DeclareLocal(Specializer.WarpType32); + IntersectWithMask(currentMask); + Emitter.Emit(LocalOperation.Store, currentCaseMask); + + // Record branch + branchBuilder.RecordBranchTarget(branch.GetCaseTarget(i), () => + { + Emitter.Emit(LocalOperation.Load, currentCaseMask); + }); + } + + // Emit branch if necessary + branchBuilder.EmitBranch(); + } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs new file mode 100644 index 000000000..58c43e4f4 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs @@ -0,0 +1,285 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.Threads.cs +// +// This file is part of ILGPU and is distributed under the University of 
Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR.Values; +using System; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public void GenerateCode(GenericAtomic atomic) + { + // Load the target and the value + Emitter.Emit(LocalOperation.Load, GetBlockMask(atomic.BasicBlock)); + Load(atomic.Target); + Load(atomic.Value); + + // Get the appropriate atomic operation + var warpMode = atomic.ArithmeticBasicValueType.GetWarpMode(); + if (atomic.IsTreatedAs32Bit()) + Specializer.Atomic32(Emitter, atomic.Kind, warpMode); + else + Specializer.Atomic64(Emitter, atomic.Kind, warpMode); + + // Check whether we actually need the result + if (!atomic.Uses.HasAny) + Emitter.Emit(OpCodes.Pop); + else + Store(atomic); + } + + /// + public void GenerateCode(AtomicCAS atomicCAS) + { + // Load the target, the compare value and the value + Emitter.Emit(LocalOperation.Load, GetBlockMask(atomicCAS.BasicBlock)); + Load(atomicCAS.Target); + Load(atomicCAS.Value); + Load(atomicCAS.CompareValue); + + // Get the appropriate atomic operation + if (atomicCAS.IsTreatedAs32Bit()) + Specializer.AtomicCompareExchange32(Emitter); + else + Specializer.AtomicCompareExchange64(Emitter); + + // Store the result + Store(atomicCAS); + } + + /// + public void GenerateCode(GridIndexValue value) + { + switch (value.Dimension) + { + case DeviceConstantDimension3D.X: + // Load the first context argument and query the grid index + VelocityTargetSpecializer.GetGridIndex(Emitter); + break; + case DeviceConstantDimension3D.Y: + case DeviceConstantDimension3D.Z: + Emitter.LoadIntegerConstant(0); + break; + } + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + Store(value); + } + + /// + public void GenerateCode(GroupIndexValue value) + { + switch (value.Dimension) + { + case 
DeviceConstantDimension3D.X: + Specializer.LoadLaneIndexVector32(Emitter); + break; + case DeviceConstantDimension3D.Y: + case DeviceConstantDimension3D.Z: + Emitter.LoadIntegerConstant(0); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + break; + } + Store(value); + } + + /// + public void GenerateCode(GridDimensionValue value) + { + switch (value.Dimension) + { + case DeviceConstantDimension3D.X: + VelocityTargetSpecializer.GetGridDim(Emitter); + break; + case DeviceConstantDimension3D.Y: + case DeviceConstantDimension3D.Z: + Emitter.LoadIntegerConstant(1); + break; + } + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + Store(value); + } + + /// + public void GenerateCode(GroupDimensionValue value) + { + switch (value.Dimension) + { + case DeviceConstantDimension3D.X: + VelocityTargetSpecializer.GetGroupDim(Emitter); + break; + case DeviceConstantDimension3D.Y: + case DeviceConstantDimension3D.Z: + Emitter.LoadIntegerConstant(1); + break; + } + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + Store(value); + } + + /// + public void GenerateCode(WarpSizeValue value) + { + Specializer.LoadWarpSize32(Emitter); + Store(value); + } + + /// + public void GenerateCode(LaneIdxValue value) + { + Specializer.LoadLaneIndexVector32(Emitter); + Store(value); + } + + /// + public void GenerateCode(PredicateBarrier barrier) + { + // Load predicate + Emitter.Emit(LocalOperation.Load, GetBlockMask(barrier.BasicBlock)); + Load(barrier.Predicate); + + // Load and call predicate operation + bool is32Bit = barrier.IsTreatedAs32Bit(); + switch (barrier.Kind) + { + case PredicateBarrierKind.PopCount: + if (is32Bit) + Specializer.BarrierPopCount32(Emitter); + else + Specializer.BarrierPopCount64(Emitter); + break; + case PredicateBarrierKind.And: + if (is32Bit) + Specializer.BarrierAnd32(Emitter); + else + Specializer.BarrierAnd64(Emitter); + break; + case PredicateBarrierKind.Or: + if (is32Bit) + 
Specializer.BarrierOr32(Emitter); + else + Specializer.BarrierOr64(Emitter); + break; + default: + throw new NotSupportedException(); + } + + Store(barrier); + } + + /// + public void GenerateCode(Barrier barrier) => + Specializer.Barrier(Emitter); + + /// + public void GenerateCode(Broadcast broadcast) + { + // Load the source variable + Emitter.Emit(LocalOperation.Load, GetBlockMask(broadcast.BasicBlock)); + Load(broadcast.Variable); + Load(broadcast.Origin); + + // Get the appropriate broadcast operation + if (broadcast.IsTreatedAs32Bit()) + Specializer.Broadcast32(Emitter); + else + Specializer.Broadcast64(Emitter); + + Store(broadcast); + } + + /// + public void GenerateCode(WarpShuffle shuffle) + { + // Load the source variable and the origin + Emitter.Emit(LocalOperation.Load, GetBlockMask(shuffle.BasicBlock)); + Load(shuffle.Variable); + Load(shuffle.Origin); + + // Get the appropriate broadcast operation + bool is32Bit = shuffle.IsTreatedAs32Bit(); + switch (shuffle.Kind) + { + case ShuffleKind.Generic: + if (is32Bit) + Specializer.Shuffle32(Emitter); + else + Specializer.Shuffle64(Emitter); + break; + case ShuffleKind.Up: + if (is32Bit) + Specializer.ShuffleUp32(Emitter); + else + Specializer.ShuffleUp64(Emitter); + break; + case ShuffleKind.Down: + if (is32Bit) + Specializer.ShuffleDown32(Emitter); + else + Specializer.ShuffleDown64(Emitter); + break; + case ShuffleKind.Xor: + if (is32Bit) + Specializer.ShuffleXor32(Emitter); + else + Specializer.ShuffleXor64(Emitter); + break; + default: + throw new NotSupportedException(); + } + + // Store the shuffle result + Store(shuffle); + } + + /// + public void GenerateCode(SubWarpShuffle shuffle) + { + // Load the source variable, the origin, and the sub-warp width + Emitter.Emit(LocalOperation.Load, GetBlockMask(shuffle.BasicBlock)); + Load(shuffle.Variable); + Load(shuffle.Origin); + Load(shuffle.Width); + + // Get the appropriate broadcast operation + bool is32Bit = shuffle.IsTreatedAs32Bit(); + switch 
(shuffle.Kind) + { + case ShuffleKind.Up: + if (is32Bit) + Specializer.SubShuffleUp32(Emitter); + else + Specializer.SubShuffleUp64(Emitter); + break; + case ShuffleKind.Down: + if (is32Bit) + Specializer.SubShuffleDown32(Emitter); + else + Specializer.SubShuffleDown64(Emitter); + break; + case ShuffleKind.Xor: + if (is32Bit) + Specializer.SubShuffleXor32(Emitter); + else + Specializer.SubShuffleXor64(Emitter); + break; + default: + throw new NotSupportedException(); + } + + // Store the shuffle result + Store(shuffle); + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs new file mode 100644 index 000000000..50c3e0a2a --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs @@ -0,0 +1,536 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.Values.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. 
+// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.IR.Values; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public void GenerateCode(MethodCall methodCall) + { + // Load the execution context + Emitter.Emit(OpCodes.Ldarg_0); + + // Load the current execution mask + Emitter.Emit(LocalOperation.Load, GetBlockMask(methodCall.BasicBlock)); + + // Load all arguments onto the evaluation stack + foreach (Value arg in methodCall) + Load(arg); + + // Call the module method + var method = Module[methodCall.Target]; + Emitter.EmitCall(method); + + if (!methodCall.Target.IsVoid) + Store(methodCall); + } + + /// + public void GenerateCode(Parameter parameter) + { + // Parameters have been bound in the beginning and do not need to be + // processed here + } + + /// + public void GenerateCode(PhiValue phiValue) + { + // Phi values need to be allocated in the beginning and do not need to be + // handled here + } + + /// + public void GenerateCode(UnaryArithmeticValue value) + { + Load(value.Value); + + // Determine the current warp mode and its bitness + var warpMode = value.GetWarpMode(); + + if (value.IsTreatedAs32Bit()) + Specializer.UnaryOperation32(Emitter, value.Kind, warpMode); + else + Specializer.UnaryOperation64(Emitter, value.Kind, warpMode); + + Store(value); + } + + /// + public void GenerateCode(BinaryArithmeticValue value) + { + Load(value.Left); + Load(value.Right); + + // Check for operation types + switch (value.Kind) + { + case BinaryArithmeticKind.Shl: + case BinaryArithmeticKind.Shr: + // We need to convert the rhs operations to int64 + if (!value.IsTreatedAs32Bit()) + Specializer.Convert32To64(Emitter, VelocityWarpOperationMode.I); + break; + } + + // Determine the current warp mode and its bitness + 
var warpMode = value.GetWarpMode(); + if (value.IsTreatedAs32Bit()) + Specializer.BinaryOperation32(Emitter, value.Kind, warpMode); + else + Specializer.BinaryOperation64(Emitter, value.Kind, warpMode); + Store(value); + } + + /// + public void GenerateCode(TernaryArithmeticValue value) + { + Load(value.First); + Load(value.Second); + Load(value.Third); + + // Determine the current warp mode and its bitness + var warpMode = value.GetWarpMode(); + if (value.IsTreatedAs32Bit()) + Specializer.TernaryOperation32(Emitter, value.Kind, warpMode); + else + Specializer.TernaryOperation64(Emitter, value.Kind, warpMode); + Store(value); + } + + /// + public void GenerateCode(CompareValue value) + { + Load(value.Left); + Load(value.Right); + + // Determine the current warp mode and its bitness + var warpMode = value.GetWarpMode(); + if (value.CompareType.GetBasicValueType().IsTreatedAs32Bit()) + Specializer.Compare32(Emitter, value.Kind, warpMode); + else + Specializer.Compare64(Emitter, value.Kind, warpMode); + Store(value); + } + + /// + public void GenerateCode(ConvertValue value) + { + // Check to which value we have to convert the current value + var sourceMode = value.SourceType.GetWarpMode(); + var targetMode = value.TargetType.GetWarpMode(); + + // Load source + Load(value.Value); + + // Check whether have to expand or to narrow the current values on the stack + var sourceBasicValueType = value.SourceType.GetBasicValueType(); + bool sourceIs32Bit = sourceBasicValueType.IsTreatedAs32Bit(); + bool targetIs32Bit = value.IsTreatedAs32Bit(); + + if (sourceIs32Bit) + { + // The source value lives in the 32bit warp world + + // Check whether we have to widen first + if (targetIs32Bit) + { + // Use the local conversion functionality + Specializer.ConvertSoftware32( + Emitter, + value.SourceType, + value.TargetType); + } + else + { + // Use the local conversion mechanism in 32bit mode + ArithmeticBasicValueType targetType32; + if (sourceBasicValueType.IsFloat()) + { + // Ensure 
32bit float compatibility + targetType32 = ArithmeticBasicValueType.Float32; + } + else + { + // Extent types to 32bit only while preserving the sign + targetType32 = value.TargetType.ForceTo32Bit(); + if (targetType32.IsFloat()) + { + targetType32 = value.IsSourceUnsigned + ? ArithmeticBasicValueType.UInt32 + : ArithmeticBasicValueType.Int32; + } + } + + Specializer.ConvertSoftware32( + Emitter, + value.SourceType, + targetType32); + + // Widen first + Specializer.Convert32To64(Emitter, sourceMode); + + // Ensure valid data types in 64bit world + Specializer.Convert64(Emitter, sourceMode, targetMode); + } + } + else + { + // The source value lives in the 64bit warp world + + // Convert the values according to the 64bit type information + Specializer.Convert64(Emitter, sourceMode, targetMode); + + // We have to enter the 32bit world + if (targetIs32Bit) + { + // Narrow to 32bit world + Specializer.Convert64To32(Emitter, targetMode); + + // Convert the remaining parts + Specializer.ConvertSoftware32( + Emitter, + value.TargetType.ForceTo32Bit(), + value.TargetType); + } + } + + Store(value); + } + + /// + public void GenerateCode(FloatAsIntCast value) + { + // Do nothing as this does not change any register contents + var valueLocal = GetLocal(value.Value); + Alias(value, valueLocal); + } + + /// + public void GenerateCode(IntAsFloatCast value) + { + // Do nothing as this does not change any register contents + var valueLocal = GetLocal(value.Value); + Alias(value, valueLocal); + } + + /// + /// Emits a new merge operation working on arbitrary values. + /// + protected ILLocal? 
EmitMerge( + Value value, + Func loadLeft, + Func loadRight, + Action loadCondition, + Func getTempLocal) + { + // Merges values based on predicate masks + void MergeLocal(BasicValueType basicValueType) + { + if (basicValueType.IsTreatedAs32Bit()) + { + // Merge 32bit values + Specializer.ConditionalSelect32(Emitter); + } + else + { + // Merge 64bit values + Specializer.ConditionalSelect64(Emitter); + } + } + + // Merge the actual values from all lanes + if (value.Type is StructureType structureType) + { + var targetType = TypeGenerator.GetVectorizedType(structureType); + var target = getTempLocal(targetType); + + // Iterate over all field elements + foreach (var (fieldType, fieldAccess) in structureType) + { + // Load arguments + Emitter.Emit(LocalOperation.LoadAddress, target); + + loadCondition(); + var leftType = loadLeft(); + Emitter.LoadField(leftType, fieldAccess.Index); + var rightType = loadRight(); + Emitter.LoadField(rightType, fieldAccess.Index); + + // Merge + MergeLocal(fieldType.BasicValueType); + + // Store field + Emitter.StoreField(targetType, fieldAccess.Index); + } + + return target; + } + else + { + // A direct merge is possible + loadCondition(); + loadLeft(); + loadRight(); + MergeLocal(value.BasicValueType); + return null; + } + } + + /// + public void GenerateCode(Predicate predicate) + { + // Load true and false values in reverse order to match API spec + var falseLocal = GetLocal(predicate.FalseValue); + var trueLocal = GetLocal(predicate.TrueValue); + + // Emit the merge + var local = EmitMerge(predicate, + () => + { + Emitter.Emit(LocalOperation.Load, falseLocal); + return falseLocal.VariableType; + }, + () => + { + Emitter.Emit(LocalOperation.Load, trueLocal); + return trueLocal.VariableType; + }, + () => Load(predicate.Condition), + type => Emitter.DeclareLocal(type)); + + // Bind value result + if (local.HasValue) + Alias(predicate, local.Value); + else + Store(predicate); + } + + /// + public void GenerateCode(Alloca alloca) + { + 
// All allocations have already been processed in the beginning. + } + + /// + public void GenerateCode(MemoryBarrier barrier) => + VelocityTargetSpecializer.MemoryBarrier(Emitter); + + /// + public void GenerateCode(PrimitiveValue value) + { + switch (value.BasicValueType) + { + case BasicValueType.Int1: + Emitter.Emit(value.Int1Value ? OpCodes.Ldc_I4_1 : OpCodes.Ldc_I4_0); + Specializer.ConvertBoolScalar(Emitter); + break; + case BasicValueType.Int8: + Emitter.LoadIntegerConstant((int)(uint)value.Int8Value); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.U); + break; + case BasicValueType.Int16: + Emitter.LoadIntegerConstant((int)(uint)value.Int16Value); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.U); + break; + case BasicValueType.Int32: + Emitter.LoadIntegerConstant(value.Int32Value); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.U); + break; + case BasicValueType.Int64: + Emitter.EmitConstant(value.Int64Value); + Specializer.ConvertScalarTo64(Emitter, VelocityWarpOperationMode.U); + break; + case BasicValueType.Float16: + throw VelocityCapabilityContext.GetNotSupportedFloat16Exception(); + case BasicValueType.Float32: + Emitter.EmitConstant(value.Float32Value); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.F); + break; + case BasicValueType.Float64: + Emitter.EmitConstant(value.Float64Value); + Specializer.ConvertScalarTo64(Emitter, VelocityWarpOperationMode.D); + break; + default: + throw new NotSupportedIntrinsicException( + value.BasicValueType.ToString()); + } + Store(value); + } + + /// + public void GenerateCode(StringValue value) + { + Emitter.EmitConstant(value.String); + Store(value); + } + + /// + public void GenerateCode(NullValue value) + { + // Check whether we have already loaded a null value + if (!nullLocals.TryGetValue(value.Type, out var local)) + { + // If not... 
load the value + var tempLocal = Emitter.DeclareLocal( + GetVectorizedType(value.Type)); + Emitter.LoadNull(tempLocal); + nullLocals.Add(value.Type, tempLocal); + } + Alias(value, local); + } + + /// + public void GenerateCode(StructureValue value) + { + // Generate a local variable that contains the type + var managedType = GetVectorizedType(value.Type); + var local = Emitter.DeclareLocal(managedType); + + // Insert all fields + for (int i = 0, e = value.Count; i < e; ++i) + { + Emitter.Emit(LocalOperation.LoadAddress, local); + Load(value[i]); + Emitter.StoreField(managedType, i); + } + + Alias(value, local); + } + + /// + public void GenerateCode(GetField value) + { + // Check the result type of the operation + if (!value.FieldSpan.HasSpan) + { + // Extract the primitive value from the structure + LoadRefAndType(value.ObjectValue, out var objectType); + Emitter.LoadField(objectType, value.FieldSpan.Index); + + // Store field value + Store(value); + } + else + { + // The result is a new structure value + var newObjectType = GetVectorizedType(value.Type); + var local = Emitter.DeclareLocal(newObjectType); + // Extract all fields from the structure + int span = value.FieldSpan.Span; + for (int i = 0; i < span; ++i) + { + Emitter.Emit(LocalOperation.LoadAddress, local); + + LoadRefAndType(value.ObjectValue, out var objectType); + Emitter.LoadField( + objectType, + i + value.FieldSpan.Index); + + Emitter.StoreField(newObjectType, i); + } + + // Bind the current value + Alias(value, local); + } + } + + /// + public void GenerateCode(SetField value) + { + var mask = GetBlockMask(value.BasicBlock); + + // The result operation will be another structure instance + LoadVectorized(value.ObjectValue, out var type); + + // Copy object instance + var local = Emitter.DeclareLocal(type); + Emitter.Emit(LocalOperation.Store, local); + + var structureType = value.ObjectValue.Type.As(value); + for (int i = 0, e = value.FieldSpan.Span; i < e; ++i) + { + // Load the base address + 
int fieldOffset = value.FieldSpan.Index + i; + Emitter.Emit(LocalOperation.LoadAddress, local); + + // Load the mask + Emitter.Emit(LocalOperation.Load, mask); + + // Load the source value + Emitter.Emit(LocalOperation.LoadAddress, local); + Emitter.LoadField(type, fieldOffset); + + // Load the target value to store + if (e > 1) + { + LoadRef(value.Value); + Emitter.LoadField(type, i); + } + else + { + // Load the whole value + Load(value.Value); + } + + + // Merge data + if (structureType[i].BasicValueType.IsTreatedAs32Bit()) + Specializer.ConditionalSelect32(Emitter); + else + Specializer.ConditionalSelect64(Emitter); + + // Store merged value + Emitter.StoreField(type, fieldOffset); + } + + Alias(value, local); + } + + /// + public void GenerateCode(DebugAssertOperation debug) + { + // If the mask is active emit a failed debug assertion + var blockMask = GetBlockMask(debug.BasicBlock); + Emitter.Emit(LocalOperation.Load, blockMask); + + // Load the debug condition + Load(debug.Condition); + + // Load the debug error message + string errorMessage = debug.Message.Resolve() is StringValue stringValue + ? 
debug.Location.FormatErrorMessage(stringValue.String) + : "Assertion failed"; + Emitter.EmitConstant(errorMessage); + + var locationInfo = debug.GetLocationInfo(); + Emitter.EmitConstant(locationInfo.FileName); + Emitter.EmitConstant(locationInfo.Line); + Emitter.EmitConstant(locationInfo.Method); + + // Call our assertion method + Specializer.DebugAssertFailed(Emitter); + } + + /// + public void GenerateCode(WriteToOutput output) => + throw new NotSupportedIntrinsicException(); + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Views.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Views.cs new file mode 100644 index 000000000..d15adbced --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Views.cs @@ -0,0 +1,147 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.Views.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.IR.Values; + +namespace ILGPU.Backends.Velocity +{ + partial class VelocityCodeGenerator + { + /// + public void GenerateCode(IntAsPointerCast cast) + { + // Load the integer information + Load(cast.Value); + + // Check whether we have to convert it to a 64bit value + if (cast.SourceType.BasicValueType.IsTreatedAs32Bit()) + { + // Convert it to a 64bit pointer + Specializer.Convert32To64(Emitter, VelocityWarpOperationMode.U); + } + + // The integer can now be interpreted as pointer + Store(cast); + } + + /// + /// Creates an alias or stores a temporary value to ensure proper phi bindings. 
+ /// + private void AliasOrStore(Value value, Value source) + { + if (source is PhiValue) + { + Load(source); + Store(value); + } + else + { + Alias(value, GetLocal(source)); + } + } + + /// + public void GenerateCode(PointerAsIntCast cast) => + AliasOrStore(cast, cast.Value); + + /// + public void GenerateCode(PointerCast cast) => + AliasOrStore(cast, cast.Value); + + /// + public void GenerateCode(AddressSpaceCast value) => + AliasOrStore(value, value.Value); + + /// + public void GenerateCode(LoadElementAddress value) + { + // Load the raw element offset to multiply + Load(value.Offset); + + // Widen the source address if necessary + if (value.Is32BitAccess) + Specializer.Convert32To64(Emitter, VelocityWarpOperationMode.I); + + // Load the source type information and the element size to multiply + var sourceType = value.Source.Type.As(value); + Emitter.EmitConstant((long)sourceType.ElementType.Size); + Specializer.ConvertScalarTo64(Emitter, VelocityWarpOperationMode.U); + + // Load the source vector to add + Load(value.Source); + + // Perform the actual offset computation + Specializer.TernaryOperation64( + Emitter, + TernaryArithmeticKind.MultiplyAdd, + VelocityWarpOperationMode.U); + Store(value); + } + + /// + public void GenerateCode(LoadFieldAddress value) + { + // Compute the actual field offset based on the vectorized type + long offset = value.StructureType.GetOffset(value.FieldSpan.Access.Index); + + // If this results in an actual byte offset... 
add it + if (offset != 0L) + { + // Load the source addresses + Load(value.Source); + + // Load constant + Emitter.EmitConstant(offset); + Specializer.ConvertScalarTo64(Emitter, VelocityWarpOperationMode.U); + + // Adjust address + Specializer.BinaryOperation64( + Emitter, + BinaryArithmeticKind.Add, + VelocityWarpOperationMode.U); + + // Store the newly computed offset + Store(value); + } + else + { + AliasOrStore(value, value.Source); + } + } + + /// + public void GenerateCode(AlignTo value) => + AliasOrStore(value, value.Source); + + /// + public void GenerateCode(AsAligned value) => + AliasOrStore(value, value.Source); + + /// + public void GenerateCode(DynamicMemoryLengthValue value) + { + if (value.AddressSpace != MemoryAddressSpace.Shared) + throw new InvalidCodeGenerationException(); + + // Load our shared memory length + var elementType = TypeGenerator.GetLinearizedScalarType(value.ElementType); + Specializer.GetDynamicSharedMemoryLength(Emitter, elementType); + + // Store the computed length + Store(value); + } + + /// + public void GenerateCode(LanguageEmitValue value) { } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs new file mode 100644 index 000000000..18adce758 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs @@ -0,0 +1,584 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCodeGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. 
+// --------------------------------------------------------------------------------------- + +// Uncomment this line to or define a preprocessor symbol to enable detailed Velocity +// accelerator debugging: +// #define DEBUG_VELOCITY + +using ILGPU.Backends.EntryPoints; +using ILGPU.Backends.IL; +using ILGPU.Backends.Velocity.Analyses; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Types; +using ILGPU.IR.Values; +using ILGPU.Util; +using System; +using System.Collections.Generic; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + /// + /// Static helper class for Velocity code generation. + /// + static class VelocityCodeGenerator + { + #region Constants + + /// + /// The parameter index of all execution contexts. + /// + public const int ExecutionContextIndex = 0; + + /// + /// The parameter index of all masks. + /// + public const int MaskParameterIndex = 1; + + /// + /// The method parameter offset for all parameters. + /// + public const int MethodParameterOffset = 2; + + #endregion + } + + /// + /// Generates vectorized MSIL instructions out of IR values. + /// + /// The IL emitter type. + /// The code needs to be prepared for this code generator. + abstract partial class VelocityCodeGenerator : + IBackendCodeGenerator + where TILEmitter : struct, IILEmitter + { + #region Nested Types + + /// + /// Creation arguments passed to a constructor. + /// + /// The current target specializer + /// The current generation module. + /// The current entry point. + public readonly record struct GeneratorArgs( + VelocityTargetSpecializer Specializer, + VelocityGenerationModule Module, + EntryPoint EntryPoint); + + #endregion + + #region Instance + + /// + /// Maps blocks to labels. + /// + private readonly Dictionary blockLookup = new(); + + /// + /// The masks analysis holding information about the masks being required. + /// + private readonly VelocityMasks masks; + + /// + /// A dictionary mapping values to IL locals. 
+ /// + private readonly Dictionary locals = new(); + + /// + /// Temporary locals for initialization. + /// + private readonly Dictionary nullLocals = new(); + + private readonly PhiBindings phiBindings; + private readonly Dictionary intermediatePhis; + + /// + /// Constructs a new IL code generator. + /// + /// The generator args to use. + /// The current method to generate code for. + /// All allocations of the current method. + protected VelocityCodeGenerator( + in GeneratorArgs args, + Method method, + Allocas allocas) + { + Module = args.Module; + Specializer = args.Specializer; + + // Creates a new IL emitter + Method = method; + Allocas = allocas; + if (typeof(TILEmitter) == typeof(DebugILEmitter)) + { + Emitter = (TILEmitter)Activator.CreateInstance( + typeof(DebugILEmitter), + Console.Out).AsNotNull(); + } + else + { + Emitter = (TILEmitter)Activator.CreateInstance( + typeof(TILEmitter), + Module.GetILGenerator(method)).AsNotNull(); + } + + // Create a new vector masks analysis instance + masks = new(method.Blocks, Emitter, Specializer); + + // Determine all phi bindings + phiBindings = PhiBindings.Create( + Method.Blocks, + (_, phiValue) => Declare(phiValue)); + intermediatePhis = new Dictionary( + phiBindings.MaxNumIntermediatePhis); + + // Declare a label for each block + foreach (var block in method.Blocks) + blockLookup[block] = Emitter.DeclareLabel(); + } + + #endregion + + #region Properties + + /// + /// Returns the current generation module. + /// + public VelocityGenerationModule Module { get; } + + /// + /// Returns the current type generator being used. + /// + public VelocityTypeGenerator TypeGenerator => Module.TypeGenerator; + + /// + /// Returns the current method. + /// + public Method Method { get; } + + /// + /// Returns all allocations. + /// + public Allocas Allocas { get; } + + /// + /// Returns the current emitter. + /// + public TILEmitter Emitter { get; } + + /// + /// Returns the underlying target specializer. 
+ /// + public VelocityTargetSpecializer Specializer { get; } + + #endregion + + #region IBackendCodeGenerator + + /// + /// Perform no operation. + /// + public void GenerateHeader(object builder) + { + // We do not need to generate any headers + } + + /// + /// Generates an MSIL runtime method. + /// + public abstract void GenerateCode(); + + /// + /// Perform no operation. + /// + public void GenerateConstants(object builder) + { + // We do not need to emit any constants + } + + /// + public void Merge(object builder) + { + // We do not need to perform any action + } + + #endregion + + #region Methods + + /// + /// Creates a branch builder to wire masks and build branches. + /// + /// The current block. + protected BranchBuilder CreateBranchBuilder(BasicBlock currentBlock) => + new(this, currentBlock); + + /// + /// Resets the block mask for the given block to no lanes at all. + /// + protected void TryResetBlockLanes(BasicBlock basicBlock) + { + if (masks.NeedsToRefreshMask(basicBlock)) + DisableLanesOf(basicBlock); + } + + /// + /// Resets the block mask for the given block by disabling the lanes from the + /// specified mask local. + /// + protected void DisableLanesOf(BasicBlock basicBlock) + { + Specializer.PushNoLanesMask32(Emitter); + Emitter.Emit(LocalOperation.Store, GetBlockMask(basicBlock)); + } + + /// + /// Resets the block mask for the given block by disabling the lanes from the + /// specified mask local. + /// + protected void DisableSpecificLanes(ILLocal target) + { + Specializer.NegateMask32(Emitter); + Emitter.Emit(LocalOperation.Load, target); + Specializer.IntersectMask32(Emitter); + Emitter.Emit(LocalOperation.Store, target); + } + + /// + /// Returns the block mask for the given basic block. + /// + /// The block to lookup. + /// The block mask to use. + protected ILLocal GetBlockMask(BasicBlock block) => masks.GetBlockMask(block); + + /// + /// Intersects the current mask with the mask on the top of the stack. 
+ /// + private void IntersectWithMask(ILLocal current) + { + // Intersect with the current mask + Emitter.Emit(LocalOperation.Load, current); + Specializer.IntersectMask32(Emitter); + } + + /// + /// Unifies the target mask with the mask on the top of the stack and stores + /// the result. + /// + private void UnifyWithMaskOf(BasicBlock target, bool keepOnStack = false) + { + var targetMask = GetBlockMask(target); + Emitter.Emit(LocalOperation.Load, targetMask); + Specializer.UnifyMask32(Emitter); + if (keepOnStack) + Emitter.Emit(OpCodes.Dup); + Emitter.Emit(LocalOperation.Store, targetMask); + } + +#if DEBUG_VELOCITY + private void DumpAllMasks(string source) + { + if (!string.IsNullOrWhiteSpace(source)) + Emitter.EmitWriteLine(source); + masks.DumpAllMasks(Emitter, Specializer); + VelocityTargetSpecializer.DebuggerBreak(Emitter); + } +#endif + + /// + /// Generates code for all blocks. + /// + protected void GenerateCodeInternal() + { +#if DEBUG_VELOCITY + Method.DumpToConsole(); +#endif + + // Init all possible phi values + foreach (var phiValue in phiBindings.PhiValues) + Emitter.LoadNull(GetLocal(phiValue)); + + // Init all allocations + BindAllocations(); + + // Disable all lanes + masks.DisableAllLanes(Method, Emitter, Specializer); + + // Emit code for each block + foreach (var block in Method.Blocks) + { + // Mark the current label + Emitter.MarkLabel(blockLookup[block]); + +#if DEBUG_VELOCITY + Console.WriteLine($"Generating code for: {block.ToReferenceString()}"); + Emitter.EmitWriteLine("Entering: " + block.ToReferenceString()); + DumpAllMasks(""); +#endif + + // Generate code for all values + foreach (var value in block) + this.GenerateCodeFor(value); + + // Reset all intermediate phis + intermediatePhis.Clear(); + + // Build terminator + this.GenerateCodeFor(block.Terminator.AsNotNull()); + } + } + + /// + /// Binds all shared and local memory allocations. 
+        ///
+        private void BindAllocations()
+        {
+            // Bind shared allocations
+            foreach (var allocation in Allocas.SharedAllocations)
+            {
+                Specializer.GetSharedMemoryFromPool(
+                    Emitter,
+                    TypeGenerator.GetLinearizedScalarType(allocation.ElementType),
+                    allocation.ArraySize);
+                Store(allocation.Alloca);
+            }
+
+            // Bind dynamic shared memory allocations (we can treat them separately from
+            // static allocations, as this will also be the case for all other
+            // accelerator types in the future)
+            foreach (var allocation in Allocas.DynamicSharedAllocations)
+            {
+                Specializer.GetDynamicSharedMemory(Emitter);
+                Store(allocation.Alloca);
+            }
+
+            // Bind local allocations
+            foreach (var allocation in Allocas.LocalAllocations)
+            {
+                // Get unified pointer which needs further adjustments
+                int lengthInBytesPerThread =
+                    allocation.ElementType.Size * allocation.ArraySize;
+
+                // Compute allocation stride per thread:
+                // offset[laneIdx] = laneIdx * lengthInBytes
+                Emitter.EmitConstant(lengthInBytesPerThread);
+                Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.U);
+                Specializer.LoadLaneIndexVector32(Emitter);
+                Specializer.BinaryOperation32(
+                    Emitter,
+                    BinaryArithmeticKind.Mul,
+                    VelocityWarpOperationMode.U);
+                Specializer.Convert32To64(Emitter, VelocityWarpOperationMode.U);
+
+                // Get a unified base pointer for all threads in all lanes
+                Specializer.GetUnifiedLocalMemoryFromPool(
+                    Emitter,
+                    lengthInBytesPerThread);
+
+                // Adjust local base pointer to refer to the right memory region
+                Specializer.BinaryOperation64(
+                    Emitter,
+                    BinaryArithmeticKind.Add,
+                    VelocityWarpOperationMode.U);
+
+                Store(allocation.Alloca);
+            }
+        }
+
+        /// 
+        /// Binds all phi values of the current block flowing through an edge to the
+        /// target block. Note that the current mask instance is assumed to be pushed
+        /// onto the evaluation stack.
+ /// + private void BindPhis( + PhiBindings.PhiBindingCollection bindings, + BasicBlock target, + Action passMask) + { + foreach (var (phiValue, value) in bindings) + { + // Reject phis not flowing to the target edge + if (phiValue.BasicBlock != target) + continue; + + // Check for an intermediate phi value + if (bindings.IsIntermediate(phiValue)) + { + // Declare a new intermediate local variable + var intermediateLocal = DeclareVectorizedTemporary(phiValue.PhiType); + intermediatePhis.Add(phiValue, intermediateLocal); + + // Move this phi value into a temporary register for reuse + Load(phiValue); + Emitter.Emit(LocalOperation.Store, intermediateLocal); + } + + // Determine the source value from which we need to copy from + var sourceLocal = intermediatePhis + .TryGetValue(value, out var tempLocal) + ? tempLocal + : GetLocal(value); + + // Move contents while merging our information + var phiLocal = GetLocal(phiValue); + var intermediateTempLocal = EmitMerge(phiValue, + () => + { + Emitter.Emit(LocalOperation.Load, phiLocal); + return phiLocal.VariableType; + }, + () => + { + Emitter.Emit(LocalOperation.Load, sourceLocal); + return sourceLocal.VariableType; + }, + passMask, + _ => phiLocal); + // Store the value to the phi local explicitly + if (!intermediateTempLocal.HasValue) + Emitter.Emit(LocalOperation.Store, phiLocal); + +#if DEBUG_VELOCITY + // Dump phi locals + string phiReference = $"Phi {phiValue.ToReferenceString()}: "; + if (phiValue.Type is PrimitiveType) + { + Emitter.Emit(LocalOperation.Load, phiLocal); + if (phiValue.BasicValueType.IsTreatedAs32Bit()) + Specializer.DumpWarp32(Emitter, phiReference); + else + Specializer.DumpWarp64(Emitter, phiReference); + } + else + { + Emitter.EmitWriteLine($"{phiReference}: complex value type"); + } +#endif + } + } + + /// + /// Loads a local variable that has been associated with the given value. + /// + /// The value to load. + /// The loaded variable. 
+ private ILLocal GetLocal(Value value) + { + // Load the local + value.Assert(locals.ContainsKey(value)); + return locals[value]; + } + + /// + /// Loads the given value onto the evaluation stack. + /// + /// The value to load. + public void Load(Value value) + { + var local = GetLocal(value); + Emitter.Emit(LocalOperation.Load, local); + // Note that we assume that all locals have already been converted to + // their vector counterparts + } + + /// + /// Loads the given value onto the evaluation stack. + /// + /// The value to load. + /// The loaded managed type. + public void LoadVectorized(Value value, out Type type) + { + Load(value); + type = GetVectorizedType(value.Type); + } + + /// + /// Loads a reference to the given value onto the evaluation stack. + /// + /// The value to load. + public void LoadRef(Value value) + { + // Load address of local variable + var local = GetLocal(value); + Emitter.Emit(LocalOperation.LoadAddress, local); + } + + /// + /// Loads a reference to the given value onto the evaluation stack. + /// + /// The value to load. + /// The loaded managed type. + public void LoadRefAndType(Value value, out Type type) + { + LoadRef(value); + type = GetVectorizedType(value.Type); + } + + /// + /// Declares a new phi value. + /// + /// The phi value to declare. + public void Declare(PhiValue phiValue) + { + var local = DeclareVectorizedTemporary(phiValue.PhiType); + locals.Add(phiValue, local); + } + + /// + /// Declares a new vectorized temporary variable. + /// + /// The type of the variable to allocate. + /// The allocated variable. + public ILLocal DeclareVectorizedTemporary(TypeNode typeNode) => + Emitter.DeclareLocal(GetVectorizedType(typeNode)); + + /// + /// Stores the given value by popping its value from the evaluation stack. + /// + /// The value to store. 
+ public void Store(Value value) + { + value.Assert(!locals.ContainsKey(value)); + if (!value.Uses.HasAny) + { + if (!value.Type.IsVoidType) + Emitter.Emit(OpCodes.Pop); + return; + } + + var local = Emitter.DeclareLocal(GetVectorizedType(value.Type)); + locals.Add(value, local); + Emitter.Emit(LocalOperation.Store, local); + } + + /// + /// Aliases the given value with the specified local. + /// + /// The value to register an alias for. + /// The local variable alias. + public void Alias(Value value, ILLocal local) + { + value.Assert(!locals.ContainsKey(value)); + locals.Add(value, local); + } + + /// + /// Loads the vectorized managed type that corresponds to the given IR type. + /// + /// The IR type to convert + /// The vectorized managed type. + private Type GetVectorizedType(TypeNode type) => + TypeGenerator.GetVectorizedType(type); + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityCompiledKernel.cs b/Src/ILGPU/Backends/Velocity/VelocityCompiledKernel.cs new file mode 100644 index 000000000..c869abd45 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityCompiledKernel.cs @@ -0,0 +1,103 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityCompiledKernel.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Runtime.Velocity; +using System; +using System.Collections.Immutable; +using System.Reflection; + +namespace ILGPU.Backends.Velocity +{ + /// + /// Represents a compiled kernel in vectorized MSIL form. + /// + public sealed class VelocityCompiledKernel : CompiledKernel + { + #region Instance + + /// + /// Constructs a new IL compiled kernel. + /// + /// The associated context. 
+        /// The entry point.
+        /// The main kernel method.
+        /// The custom parameters type.
+        /// 
+        /// The type constructor of the parameters type.
+        /// 
+        /// 
+        /// Mapping of kernel parameter indices to parameter fields.
+        /// 
+        /// 
+        /// The amount of statically allocated bytes of shared memory.
+        /// 
+        internal VelocityCompiledKernel(
+            Context context,
+            EntryPoint entryPoint,
+            MethodInfo kernelMethod,
+            Type parametersType,
+            ConstructorInfo parametersTypeConstructor,
+            ImmutableArray parameterFields,
+            int allocatedSharedMemorySize)
+            : base(context, entryPoint, null)
+        {
+            KernelMethod = kernelMethod;
+            ParametersType = parametersType;
+            ParameterFields = parameterFields;
+            ParametersTypeConstructor = parametersTypeConstructor;
+            AllocatedSharedMemorySize = allocatedSharedMemorySize;
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// 
+        /// Returns the main kernel method.
+        /// 
+        public MethodInfo KernelMethod { get; }
+
+        /// 
+        /// Returns the custom parameter store type to dispatch the kernel.
+        /// 
+        internal Type ParametersType { get; }
+
+        /// 
+        /// Returns the type constructor to instantiate the custom parameters type.
+        /// 
+        internal ConstructorInfo ParametersTypeConstructor { get; }
+
+        /// 
+        /// Returns a mapping of kernel parameter indices to parameter fields.
+        /// 
+        internal ImmutableArray ParameterFields { get; }
+
+        /// 
+        /// Returns the size of statically allocated shared memory in bytes.
+        /// 
+        public int AllocatedSharedMemorySize { get; }
+
+        #endregion
+
+        #region Methods
+
+        /// 
+        /// Creates a new kernel entry point to be used with this kernel module.
+        /// 
+        /// A kernel entry point delegate.
+ internal VelocityEntryPointHandler CreateKernelEntryPoint() => + KernelMethod.CreateDelegate(); + + #endregion + } +} + diff --git a/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs new file mode 100644 index 000000000..96236bc98 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs @@ -0,0 +1,152 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityFunctionGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Values; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + /// + /// A generator for non primary Velocity functions. + /// + /// The IL emitter type. + sealed class VelocityFunctionGenerator : VelocityCodeGenerator + where TILEmitter : struct, IILEmitter + { + /// + /// The internal return label. + /// + private readonly ILLabel returnLabel; + + /// + /// The internal return-value local (if any). + /// + private readonly ILLocal? returnLocal; + + /// + /// The internal target mask counter. + /// + private readonly ILLocal targetMaskCount; + + /// + /// Creates a new Velocity function generator. + /// + /// The generator args to use. + /// The current method to generate code for. + /// All allocations of the current method. + public VelocityFunctionGenerator( + in GeneratorArgs args, + Method method, + Allocas allocas) + : base(args, method, allocas) + { + returnLabel = Emitter.DeclareLabel(); + returnLocal = method.IsVoid + ? 
null + : Emitter.DeclareLocal( + TypeGenerator.GetVectorizedType(method.ReturnType)); + + // We use this counter to remember the number of active threads that entered + // the kernel successfully + targetMaskCount = Emitter.DeclareLocal(typeof(int)); + } + + /// + /// Generates Velocity code for this function. + /// + public override void GenerateCode() + { + // Bind the mask parameter + Emitter.Emit( + ArgumentOperation.Load, + VelocityCodeGenerator.MaskParameterIndex); + Emitter.Emit(OpCodes.Dup); + Emitter.Emit(LocalOperation.Store, GetBlockMask(Method.EntryBlock)); + + // Determine target mask counter + Specializer.GetNumberOfActiveLanes(Emitter); + Emitter.Emit(LocalOperation.Store, targetMaskCount); + + // Bind all remaining parameters + for (int i = 0; i < Method.NumParameters; ++i) + { + var parameterType = Method.Parameters[i].ParameterType; + var parameterLocal = DeclareVectorizedTemporary(parameterType); + + Emitter.Emit( + ArgumentOperation.Load, + i + VelocityCodeGenerator.MethodParameterOffset); + Emitter.Emit(LocalOperation.Store, parameterLocal); + + Alias(Method.Parameters[i], parameterLocal); + } + + // Emit the remaining code + GenerateCodeInternal(); + + // Emit the actual return part + Emitter.MarkLabel(returnLabel); + if (returnLocal.HasValue) + Emitter.Emit(LocalOperation.Load, returnLocal.Value); + Emitter.Emit(OpCodes.Ret); + } + + /// + public override void GenerateCode(ReturnTerminator returnTerminator) + { + // Note that this automatically returns a vectorized version + // of all return values + void LoadMask() + { + Emitter.Emit( + LocalOperation.Load, + GetBlockMask(returnTerminator.BasicBlock)); + } + + // Jump to the exit block if all lanes are active + LoadMask(); + Specializer.GetNumberOfActiveLanes(Emitter); + Emitter.Emit(LocalOperation.Load, targetMaskCount); + + // In case not all lanes have completed processing, we will have to skip + // the actual return statement here and merge the result + if (returnLocal.HasValue) + { + 
var targetType = returnLocal.Value.VariableType; + var tempLocal = EmitMerge( + returnTerminator.ReturnValue, + () => + { + Emitter.Emit(LocalOperation.Load, returnLocal.Value); + return targetType; + }, + () => + { + Load(returnTerminator.ReturnValue); + return targetType; + }, + LoadMask, + _ => returnLocal.Value); + + if (!tempLocal.HasValue) + Emitter.Emit(LocalOperation.Store, returnLocal.Value); + } + + Emitter.Emit(OpCodes.Beq, returnLabel); + + // Reset the current mask if required + TryResetBlockLanes(returnTerminator.BasicBlock); + } + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs b/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs new file mode 100644 index 000000000..8c0d687ed --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs @@ -0,0 +1,478 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityGenerationModule.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Types; +using ILGPU.Resources; +using ILGPU.Runtime; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Reflection; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + /// + /// A kernel module generator for managed velocity kernel types. + /// + sealed class VelocityGenerationModule : DisposeBase + { + #region Static + + /// + /// Represents a reference to the public dumping method. 
+ /// + private static readonly MethodInfo DumpMethodInfo = + typeof(VelocityParameters).GetMethod( + nameof(VelocityParameters.DumpToConsole), + BindingFlags.Public | BindingFlags.Instance) + .ThrowIfNull(); + + /// + /// Builds a complete parameter-type class wrapper that takes all scalar kernel + /// arguments as constructor arguments and converts them into ready-to-load + /// vectorized versions that are in turn stored as class fields. + /// + private static Type BuildParametersType( + RuntimeSystem runtimeSystem, + VelocityTargetSpecializer specializer, + VelocityTypeGenerator typeGenerator, + in Backend.BackendContext backendContext, + EntryPoint entryPoint, + out ConstructorInfo constructor, + out ImmutableArray parameterFields) + { + // Build a new parameter passing type + using var parametersLock = runtimeSystem.DefineRuntimeClass( + typeof(VelocityParameters), + out var typeBuilder); + + var kernelMethod = backendContext.KernelMethod; + int numParameters = + kernelMethod.Parameters.Count - + entryPoint.KernelIndexParameterOffset; + var nativeParameterTypes = new TypeNode[numParameters]; + var constructorParameterTypes = new Type[nativeParameterTypes.Length]; + var constructorLocalTypes = new Type[nativeParameterTypes.Length]; + var builtFields = new FieldInfo[numParameters]; + for (int i = 0; i < numParameters; ++i) + { + // Determine the scalar parameter type and remember it + int parameterIndex = i + entryPoint.KernelIndexParameterOffset; + var parameterType = kernelMethod.Parameters[parameterIndex].ParameterType; + nativeParameterTypes[i] = parameterType; + constructorLocalTypes[i] = + typeGenerator.GetLinearizedScalarType(parameterType); + constructorParameterTypes[i] = typeof(void*); + + // Convert the parameter type and declare a new field + var vectorizedType = typeGenerator.GetVectorizedType(parameterType); + builtFields[i] = typeBuilder.DefineField( + StructureType.GetFieldName(i), + vectorizedType, + FieldAttributes.Public); + } + + // Build a 
constructor that converts all parameters into their vectorized + // representation + DefineConstructor( + specializer, + typeGenerator, + typeBuilder, + constructorParameterTypes, + constructorLocalTypes, + nativeParameterTypes, + builtFields); + + // Define our dumping method + DefineDumpMethod( + specializer, + typeBuilder, + nativeParameterTypes, + builtFields); + + // Build the parameter type and determine the parameter mapping + var result = typeBuilder.CreateType(); + var parameterMapping = + ImmutableArray.CreateBuilder(numParameters); + for (int i = 0; i < numParameters; ++i) + { + var fieldInfo = ILEmitterExtensions.GetFieldInfo(result, i); + parameterMapping.Add(fieldInfo); + } + parameterFields = parameterMapping.MoveToImmutable(); + constructor = result.GetConstructor(constructorParameterTypes).AsNotNull(); + return result; + } + + /// + /// Defines our argument conversion constructor to map scalar arguments to + /// vectorized Velocity space. + /// + private static void DefineConstructor( + VelocityTargetSpecializer specializer, + VelocityTypeGenerator typeGenerator, + TypeBuilder typeBuilder, + Type[] constructorParameterTypes, + Type[] constructorLocalTypes, + TypeNode[] nativeParameterTypes, + FieldInfo[] builtFields) + { + var constructorBuilder = typeBuilder.DefineConstructor( + MethodAttributes.Public, + CallingConventions.Standard, + constructorParameterTypes); + + // Create a new constructor IL emitter + var emitter = new ILEmitter(constructorBuilder.GetILGenerator()); + + // Load each argument passed to the constructor and convert it into its + // vectorized form via specialized convert operations + for (int i = 0; i < constructorParameterTypes.Length; ++i) + { + // Convert the current argument into a temporary local to load from + var loadLocal = emitter.DeclareLocal(constructorLocalTypes[i]); + emitter.Emit(ArgumentOperation.Load, i + 1); + + // Load object via direct memory operations from pinned memory + emitter.Emit(OpCodes.Ldobj, 
constructorLocalTypes[i]); + emitter.Emit(LocalOperation.Store, loadLocal); + + // Load a vectorized version + emitter.Emit(OpCodes.Ldarg_0); + BuildParameterLoad( + emitter, + loadLocal, + nativeParameterTypes[i], + specializer, + typeGenerator); + + // Store vectorized version + emitter.Emit(OpCodes.Stfld, builtFields[i]); + } + + // Return + emitter.Emit(OpCodes.Ret); + } + + /// + /// Builds a vectorized kernel parameter load for arbitrary types. + /// + private static void BuildParameterLoad( + TILEmitter emitter, + ILLocal source, + TypeNode typeNode, + VelocityTargetSpecializer specializer, + VelocityTypeGenerator typeGenerator) + where TILEmitter : struct, IILEmitter + { + if (typeNode is StructureType structureType) + { + var vectorizedType = typeGenerator.GetVectorizedType(structureType); + var temporary = emitter.DeclareLocal(vectorizedType); + + // Fill the temporary structure instance with values + foreach (var (fieldType, fieldAccess) in structureType) + { + // Load the target variable address + emitter.Emit(LocalOperation.LoadAddress, temporary); + + // Load the input value + emitter.Emit(LocalOperation.Load, source); + emitter.LoadField(source.VariableType, fieldAccess.Index); + + // Load the converted field type + BuildScalarParameterLoad( + emitter, + fieldType, + specializer); + + // Store it into out structure field + emitter.StoreField(vectorizedType, fieldAccess.Index); + } + + emitter.Emit(LocalOperation.Load, temporary); + } + else + { + // Load input argument value + emitter.Emit(LocalOperation.Load, source); + + // Load the scalar parameter + BuildScalarParameterLoad( + emitter, + typeNode, + specializer); + } + } + + /// + /// Defines a dumping method to visualize kernel input parameters. 
+ /// + private static void DefineDumpMethod( + VelocityTargetSpecializer specializer, + TypeBuilder typeBuilder, + TypeNode[] nativeParameterTypes, + FieldInfo[] fieldsToVisualize) + { + var dumpMethod = typeBuilder.DefineMethod( + DumpMethodInfo.Name, + MethodAttributes.Public | MethodAttributes.Virtual, + typeof(void), + Array.Empty()); + + // Emit code to visualize each field + var emitter = new ILEmitter(dumpMethod.GetILGenerator()); + void DumpRawValue(BasicValueType basicValueType, string fieldLabel) + { + if (basicValueType.IsTreatedAs32Bit()) + specializer.DumpWarp32(emitter, fieldLabel); + else + specializer.DumpWarp64(emitter, fieldLabel); + } + + // Dump all fields + for (int i = 0; i < nativeParameterTypes.Length; ++i) + { + string fieldLabel = $"InputArg_{i}"; + + var nativeType = nativeParameterTypes[i]; + var field = fieldsToVisualize[i]; + + void LoadFieldInstance() + { + emitter.Emit(OpCodes.Ldarg_0); + emitter.Emit(OpCodes.Ldfld, field); + } + + if (nativeType is StructureType structureType) + { + // Dump each field separately + emitter.EmitWriteLine(fieldLabel); + + for (int j = 0; j < structureType.NumFields; ++j) + { + LoadFieldInstance(); + emitter.LoadField(field.FieldType, j); + DumpRawValue(structureType[j].BasicValueType, $" Field_{j}"); + } + } + else + { + LoadFieldInstance(); + DumpRawValue(nativeType.BasicValueType, fieldLabel); + } + } + + emitter.Emit(OpCodes.Ret); + emitter.Finish(); + typeBuilder.DefineMethodOverride(dumpMethod, DumpMethodInfo); + } + + /// + /// Builds a vectorized kernel parameter load for scalar types. + /// + private static void BuildScalarParameterLoad( + TILEmitter emitter, + TypeNode typeNode, + VelocityTargetSpecializer specializer) + where TILEmitter : struct, IILEmitter + { + var basicValueType = typeNode switch + { + PointerType _ => BasicValueType.Int64, + _ => typeNode.BasicValueType == BasicValueType.None + ? 
throw typeNode.GetNotSupportedException( + ErrorMessages.NotSupportedType, + typeNode) + : typeNode.BasicValueType + }; + + // Convert value on top of the evaluation stack without sign extension + var mode = VelocityWarpOperationMode.F; + if (basicValueType.IsInt()) + { + // Expand type + emitter.Emit(basicValueType.IsTreatedAs32Bit() + ? OpCodes.Conv_U4 + : OpCodes.Conv_U8); + + mode = VelocityWarpOperationMode.U; + } + else + { + if (basicValueType == BasicValueType.Float16) + throw CapabilityContext.GetNotSupportedFloat16Exception(); + } + + // Load the values onto the evaluation stack + if (basicValueType.IsTreatedAs32Bit()) + specializer.ConvertScalarTo32(emitter, mode); + else + specializer.ConvertScalarTo64(emitter, mode); + } + + #endregion + + #region Instance + + private readonly Dictionary methodMapping; + + public VelocityGenerationModule( + RuntimeSystem runtimeSystem, + VelocityTargetSpecializer specializer, + VelocityTypeGenerator typeGenerator, + in Backend.BackendContext backendContext, + EntryPoint entryPoint) + { + methodMapping = new Dictionary( + backendContext.Count); + TypeGenerator = typeGenerator; + + // Create the parameter passing type + ParametersType = BuildParametersType( + runtimeSystem, + specializer, + typeGenerator, + backendContext, + entryPoint, + out var constructorInfo, + out var parameterFields); + ParametersTypeConstructor = constructorInfo; + ParameterFields = parameterFields; + + // Declare all methods + DeclareMethod(runtimeSystem, backendContext.KernelMethod, specializer); + foreach (var (method, _) in backendContext) + DeclareMethod(runtimeSystem, method, specializer); + + // Get the kernel method + KernelMethod = this[backendContext.KernelMethod]; + + // Setup shared memory information + SharedAllocationSize = backendContext.SharedAllocations.TotalSize; + } + + #endregion + + #region Properties + + /// + /// Returns the current type generator being used. 
+ /// + public VelocityTypeGenerator TypeGenerator { get; } + + /// + /// Returns the kernel method. + /// + public MethodInfo KernelMethod { get; } + + /// + /// Gets the method builder that is associated with the given method. + /// + /// The method to get the managed method for. + public MethodInfo this[Method method] => methodMapping[method].Method; + + /// + /// Returns the class type to store all parameter values to. + /// + public Type ParametersType { get; } + + /// + /// Returns the constructor to build a new parameters type instance. + /// + public ConstructorInfo ParametersTypeConstructor { get; } + + /// + /// Returns all parameter fields to store the actual parameter data into. + /// + public ImmutableArray ParameterFields { get; } + + /// + /// The total amount of bytes residing in shared memory. + /// + public int SharedAllocationSize { get; } + + #endregion + + #region Methods + + /// + /// Declares the given method. + /// + private void DeclareMethod( + RuntimeSystem runtimeSystem, + Method method, + VelocityTargetSpecializer specializer) + { + // Convert the method signature + var returnType = TypeGenerator.GetVectorizedType(method.ReturnType); + Type[] parameterTypes; + + // The first parameter is the current mask (if it is not an entry point) + if (method.HasFlags(MethodFlags.EntryPoint)) + { + // This is our main method + parameterTypes = VelocityEntryPointHandlerHelper.EntryPointParameterTypes; + } + else + { + parameterTypes = new Type[ + method.NumParameters + VelocityCodeGenerator.MethodParameterOffset]; + // Convert all parameter types + parameterTypes[VelocityCodeGenerator.ExecutionContextIndex] = + typeof(VelocityGroupExecutionContext); + parameterTypes[VelocityCodeGenerator.MaskParameterIndex] = + specializer.WarpType32; + for (int i = 0; i < method.NumParameters; ++i) + { + var parameterType = method.Parameters[i].ParameterType; + parameterTypes[i + VelocityCodeGenerator.MethodParameterOffset] = + 
TypeGenerator.GetVectorizedType(parameterType); + } + } + + // Define a new method stub + using var scopedLock = runtimeSystem.DefineRuntimeMethod( + returnType, + parameterTypes, + out var methodBuilder); + methodMapping.Add(method, methodBuilder); + } + + /// + /// Gets the IL generator that is associated with the method. + /// + public ILGenerator GetILGenerator(Method method) => + methodMapping[method].ILGenerator; + + #endregion + + #region IDisposable + + /// + /// Frees the current scoped locked. + /// + protected override void Dispose(bool disposing) + { + foreach (var (_, builder) in methodMapping) + builder.Finish(); + base.Dispose(disposing); + } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityHelpers.cs b/Src/ILGPU/Backends/Velocity/VelocityHelpers.cs new file mode 100644 index 000000000..d5f7899ae --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityHelpers.cs @@ -0,0 +1,157 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityHelpers.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.IR; +using ILGPU.IR.Values; +using System; + +namespace ILGPU.Backends.Velocity +{ + /// + /// Represents a type evaluation mode. + /// + enum VelocityWarpOperationMode + { + /// + /// Signed integers. + /// + I, + + /// + /// Unsigned integers. + /// + U, + + /// + /// Floats. + /// + F, + + /// + /// Doubles, or floats essentially. + /// + D = F, + } + + static class VelocityHelpers + { + /// + /// Returns true if the given value type is actually a 32bit value type. 
+ /// + public static bool Is32Bit(this BasicValueType valueType) => + valueType switch + { + BasicValueType.Int32 => true, + BasicValueType.Float32 => true, + _ => false, + }; + + /// + /// Returns true if the given value type is interpreted as a 32bit value type. + /// + public static bool IsTreatedAs32Bit(this BasicValueType valueType) => + valueType switch + { + BasicValueType.Float64 => false, + BasicValueType.Int64 => false, + _ => true, + }; + + /// + /// Returns true if the given value is interpreted as a 32bit value type. + /// + public static bool IsTreatedAs32Bit(this Value value) => + value.BasicValueType.IsTreatedAs32Bit(); + + /// + /// Returns true if the given value is interpreted as a 32bit value type. + /// + public static bool IsTreatedAs32Bit(this ArithmeticValue value) => + value.ArithmeticBasicValueType switch + { + ArithmeticBasicValueType.Float64 => false, + ArithmeticBasicValueType.Int64 => false, + ArithmeticBasicValueType.UInt64 => false, + _ => true, + }; + + /// + /// Determines the current warp-operation mode for the given arithmetic basic + /// value type. 
+ /// + public static VelocityWarpOperationMode GetWarpMode( + this ArithmeticBasicValueType valueType) => + valueType switch + { + ArithmeticBasicValueType.UInt1 => VelocityWarpOperationMode.U, + ArithmeticBasicValueType.UInt8 => VelocityWarpOperationMode.U, + ArithmeticBasicValueType.UInt16 => VelocityWarpOperationMode.U, + ArithmeticBasicValueType.UInt32 => VelocityWarpOperationMode.U, + ArithmeticBasicValueType.UInt64 => VelocityWarpOperationMode.U, + + ArithmeticBasicValueType.Int8 => VelocityWarpOperationMode.I, + ArithmeticBasicValueType.Int16 => VelocityWarpOperationMode.I, + ArithmeticBasicValueType.Int32 => VelocityWarpOperationMode.I, + ArithmeticBasicValueType.Int64 => VelocityWarpOperationMode.I, + + ArithmeticBasicValueType.Float16 => VelocityWarpOperationMode.F, + ArithmeticBasicValueType.Float32 => VelocityWarpOperationMode.F, + ArithmeticBasicValueType.Float64 => VelocityWarpOperationMode.D, + _ => throw new NotSupportedException() + }; + + /// + /// Determines the current warp-operation mode for the given value. + /// + public static VelocityWarpOperationMode GetWarpMode(this ArithmeticValue value) => + value.ArithmeticBasicValueType.GetWarpMode(); + + /// + /// Determines the current warp-operation mode for the given value. + /// + public static VelocityWarpOperationMode GetWarpMode(this CompareValue value) => + value.CompareType.GetWarpMode(); + + /// + /// Gets the basic value type corresponding to the given warp mode. + /// + public static BasicValueType GetBasicValueType( + this VelocityWarpOperationMode mode, + bool is64Bit) => + mode switch + { + VelocityWarpOperationMode.I => !is64Bit + ? BasicValueType.Int32 : BasicValueType.Int64, + VelocityWarpOperationMode.U => !is64Bit + ? BasicValueType.Int32 : BasicValueType.Int64, + VelocityWarpOperationMode.F => !is64Bit + ? 
BasicValueType.Float32 : BasicValueType.Float64, + _ => throw new ArgumentOutOfRangeException(nameof(mode)) + }; + + /// + /// Gets the arithmetic basic value type corresponding to the given warp mode. + /// + public static ArithmeticBasicValueType GetArithmeticBasicValueType( + this VelocityWarpOperationMode mode, + bool is64Bit) => + mode switch + { + VelocityWarpOperationMode.I => !is64Bit + ? ArithmeticBasicValueType.Int32 : ArithmeticBasicValueType.Int64, + VelocityWarpOperationMode.U => !is64Bit + ? ArithmeticBasicValueType.UInt32 : ArithmeticBasicValueType.UInt64, + VelocityWarpOperationMode.F => !is64Bit + ? ArithmeticBasicValueType.Float32 : ArithmeticBasicValueType.Float64, + _ => throw new ArgumentOutOfRangeException(nameof(mode)) + }; + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs new file mode 100644 index 000000000..d950e2c6e --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs @@ -0,0 +1,167 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityKernelFunctionGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.EntryPoints; +using ILGPU.Backends.IL; +using ILGPU.IR; +using ILGPU.IR.Analyses; +using ILGPU.IR.Values; +using System; +using System.Reflection.Emit; + +namespace ILGPU.Backends.Velocity +{ + /// + /// A generator for primary Velocity kernels. + /// + /// The IL emitter type. 
    sealed class VelocityKernelFunctionGenerator<TILEmitter> :
        VelocityCodeGenerator<TILEmitter>
        where TILEmitter : struct, IILEmitter
    {
        #region Constants

        // Argument index of the boxed parameters object passed to the kernel stub
        // (argument 0 is the VelocityGroupExecutionContext).
        public const int GlobalParametersIndex = 1;

        #endregion

        // Label that marks the common kernel exit point.
        private readonly ILLabel exitMarker;
        // Counts the lanes that entered the kernel; used to decide when all of
        // them have reached a return terminator.
        private readonly ILLocal targetMaskCount;

        /// <summary>
        /// Creates a new Velocity kernel generator.
        /// </summary>
        /// <param name="args">The generator args to use.</param>
        /// <param name="method">The current method to generate code for.</param>
        /// <param name="allocas">All allocations of the current method.</param>
        public VelocityKernelFunctionGenerator(
            in GeneratorArgs args,
            Method method,
            Allocas allocas)
            : base(args, method, allocas)
        {
            EntryPoint = args.EntryPoint;
            ParametersType = args.Module.ParametersType;

            // Generate an exit marker to jump to when the kernel function returns
            exitMarker = Emitter.DeclareLabel();

            // We use this counter to remember the number of active threads that entered
            // the kernel successfully
            targetMaskCount = Emitter.DeclareLocal(typeof(int));
        }

        /// <summary>
        /// Returns the current entry point.
        /// </summary>
        public EntryPoint EntryPoint { get; }

        /// <summary>
        /// Returns the current parameters type (the class holding all kernel
        /// arguments; see VelocityGenerationModule.ParametersType).
        /// </summary>
        public Type ParametersType { get; }

        /// <summary>
        /// Generates Velocity code for this kernel: unpacks the parameters object,
        /// computes the global base index, and builds the initial lane mask before
        /// emitting the kernel body. NOTE: the emission order below mirrors the IL
        /// evaluation stack and must not be rearranged.
        /// </summary>
        public override void GenerateCode()
        {
            // Extract all arguments of the actual parameters object
            var parametersLocal = Emitter.DeclareLocal(ParametersType);
            Emitter.Emit(ArgumentOperation.Load, GlobalParametersIndex);
            Emitter.Emit(OpCodes.Castclass, ParametersType);
            Emitter.Emit(LocalOperation.Store, parametersLocal);

            // Load all parameters by mapping them to local variables
            for (
                int i = EntryPoint.KernelIndexParameterOffset;
                i < Method.NumParameters;
                ++i)
            {
                var parameterType = Method.Parameters[i].ParameterType;
                var parameterLocal = DeclareVectorizedTemporary(parameterType);

                Emitter.Emit(LocalOperation.Load, parametersLocal);
                Emitter.LoadField(
                    ParametersType,
                    i - EntryPoint.KernelIndexParameterOffset);
                Emitter.Emit(LocalOperation.Store, parameterLocal);

                Alias(Method.Parameters[i], parameterLocal);
            }

            // Bind the current implicitly grouped kernel index (if any)
            var offsetVector = Emitter.DeclareLocal(Specializer.WarpType32);
            if (EntryPoint.IsImplicitlyGrouped)
                Alias(Method.Parameters[0], offsetVector);

            // Store the current global index:
            // offsetVector = globalBaseIndex (broadcast) + laneIndexVector
            VelocityTargetSpecializer.ComputeGlobalBaseIndex(Emitter);
            Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
            Specializer.LoadLaneIndexVector32(Emitter);
            Specializer.BinaryOperation32(
                Emitter,
                BinaryArithmeticKind.Add,
                VelocityWarpOperationMode.I);
            Emitter.Emit(LocalOperation.Store, offsetVector);

            // Setup the current main kernel mask based on the current group size:
            // mask = laneIndex < groupDim
            Specializer.LoadLaneIndexVector32(Emitter);
            VelocityTargetSpecializer.GetGroupDim(Emitter);
            Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
            Specializer.Compare32(
                Emitter,
                CompareKind.LessThan,
                VelocityWarpOperationMode.I);

            // Adjust the current main kernel mask based on the user grid size:
            // mask &= offsetVector < userSize
            Emitter.Emit(LocalOperation.Load, offsetVector);
            VelocityTargetSpecializer.GetUserSize(Emitter);
            Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
            Specializer.Compare32(
                Emitter,
                CompareKind.LessThan,
                VelocityWarpOperationMode.I);
            Specializer.IntersectMask32(Emitter);

            // Duplicate the mask: one copy becomes the entry-block mask, the other
            // is consumed by GetNumberOfActiveLanes below
            var entryPointMask = GetBlockMask(Method.EntryBlock);
            Emitter.Emit(OpCodes.Dup);
            Emitter.Emit(LocalOperation.Store, entryPointMask);

            // Determine the target mask count
            Specializer.GetNumberOfActiveLanes(Emitter);
            Emitter.Emit(LocalOperation.Store, targetMaskCount);

            // Emit the actual kernel code
            GenerateCodeInternal();

            // Emit the exit marker
            Emitter.MarkLabel(exitMarker);

            // Return
            Emitter.Emit(OpCodes.Ret);
        }

        /// <summary>
        /// Generates code for a (void) return terminator: jumps to the exit marker
        /// once the number of active lanes in the current block equals the number
        /// of lanes that originally entered the kernel.
        /// </summary>
        public override void GenerateCode(ReturnTerminator returnTerminator)
        {
            // Velocity kernels cannot return values
            returnTerminator.Assert(returnTerminator.IsVoidReturn);

            // Jump to the exit block if all lanes are active
            Emitter.Emit(
                LocalOperation.Load,
                GetBlockMask(returnTerminator.BasicBlock));
            Specializer.GetNumberOfActiveLanes(Emitter);
            Emitter.Emit(LocalOperation.Load, targetMaskCount);
            Emitter.Emit(OpCodes.Beq, exitMarker);
        }
    }
}
// ---------------------------------------------------------------------------------------
// ILGPU
// Copyright (c) 2023 ILGPU Project
// www.ilgpu.net
//
// File: VelocityOperations.ttinclude
//
// This file is part of ILGPU and is distributed under the University of Illinois Open
// Source License. See LICENSE.txt for details.
// ---------------------------------------------------------------------------------------

<#@ include file="../../Static/TypeInformation.ttinclude" #>
<#@ assembly name="System.Core" #>
<#@ import namespace="Microsoft.VisualStudio.TextTemplating" #>
<#@ import namespace="System.Linq" #>
<#@ import namespace="System.Text" #>
<#@ import namespace="System.Collections.Generic" #>
<#+
// Mapping of (math-op category, warp-mode prefix, CLR type, implementation type
// suffix) used to instantiate the generated 32bit warp operations.
public static readonly
    (MathOpFlags Flags, string Prefix, string TypeName, string ImplTypeName)[]
    ImplementationTypes32 = new (MathOpFlags, string, string, string)[]
{
    (MathOpFlags.Ints, "I", "int", "Int32"),
    (MathOpFlags.BoolsAndInts, "U", "uint", "UInt32"),
    (MathOpFlags.Floats, "F", "float", "Float")
};
// 64bit counterpart of ImplementationTypes32.
public static readonly
    (MathOpFlags Flags, string Prefix, string TypeName, string ImplTypeName)[]
    ImplementationTypes64 = new (MathOpFlags, string, string, string)[]
{
    (MathOpFlags.Ints, "I", "long", "Int64"),
    (MathOpFlags.BoolsAndInts, "U", "ulong", "UInt64"),
    (MathOpFlags.Floats, "F", "double", "Double")
};

// Selects the 32bit implementation tuple for a type-information kind:
// signed ints -> I, unsigned ints -> U, everything else -> F.
public static (MathOpFlags Flags, string Prefix, string TypeName, string ImplTypeName)
    GetImplementationType32(TypeInformationKind kind) =>
    kind == TypeInformationKind.SignedInt
    ? ImplementationTypes32[0]
    : kind == TypeInformationKind.UnsignedInt
    ? ImplementationTypes32[1]
    : ImplementationTypes32[2];

// Selects the 64bit implementation tuple for a type-information kind.
public static (MathOpFlags Flags, string Prefix, string TypeName, string ImplTypeName)
    GetImplementationType64(TypeInformationKind kind) =>
    kind == TypeInformationKind.SignedInt
    ? ImplementationTypes64[0]
    : kind == TypeInformationKind.UnsignedInt
    ? ImplementationTypes64[1]
    : ImplementationTypes64[2];

// Element-size multipliers for conversion operations (4 bytes and 8 bytes).
public static readonly int[] ConvTypeMultipliers = new int[] { 4, 8 };
// All source/target types for 32bit warp conversions; indexes into the type
// tables declared in TypeInformation.ttinclude — presumably ordered by size;
// TODO confirm against that include.
public static readonly TypeInformation[] Warp32ConvTypes = new TypeInformation[]
{
    SignedIntTypes[0],
    SignedIntTypes[1],
    SignedIntTypes[2],
    UnsignedIntTypes[0],
    UnsignedIntTypes[1],
    UnsignedIntTypes[2],
    FloatTypes[0],
    FloatTypes[1],
};
// All source/target types for 64bit warp conversions.
public static readonly TypeInformation[] Warp64ConvTypes = new TypeInformation[]
{
    SignedIntTypes[3],
    UnsignedIntTypes[3],
    FloatTypes[2],
};
// Types participating in 32bit load/store (IO) operations.
public static readonly TypeInformation[] Warp32IOTypes = new TypeInformation[]
{
    UnsignedIntTypes[0],
    UnsignedIntTypes[1],
    UnsignedIntTypes[2],
    FloatTypes[0],
    FloatTypes[1],
};
// Types participating in 64bit load/store (IO) operations.
public static readonly TypeInformation[] Warp64IOTypes = new TypeInformation[]
{
    UnsignedIntTypes[3],
    FloatTypes[2],
};
#>
// ---------------------------------------------------------------------------------------
// ILGPU
// Copyright (c) 2023 ILGPU Project
// www.ilgpu.net
//
// File: VelocityTargetSpecializer.cs
//
// This file is part of ILGPU and is distributed under the University of Illinois Open
// Source License. See LICENSE.txt for details.
// ---------------------------------------------------------------------------------------

using ILGPU.Backends.IL;
using ILGPU.IR.Values;
using ILGPU.Runtime.Velocity;
using ILGPU.Util;
using System;
using System.Diagnostics;
using System.Reflection;
using System.Reflection.Emit;
using System.Threading;

namespace ILGPU.Backends.Velocity
{
    /// <summary>
    /// An abstract target specializer used to generate vectorized instructions.
+ /// + abstract class VelocityTargetSpecializer + { + #region Static + + internal static MethodInfo GetMethod(string name) => + typeof(T) + .GetMethod(name, BindingFlags.Static | BindingFlags.NonPublic) + .AsNotNull(); + + private static readonly MethodInfo MemoryBarrierMethod = + GetMethod(nameof(MemoryBarrier)); + private static readonly MethodInfo GetGridIndexMethod = + GetMethod(nameof(GetGridIndexImpl)); + private static readonly MethodInfo GetGridDimMethod = + GetMethod(nameof(GetGridDimImpl)); + private static readonly MethodInfo GetGroupDimMethod = + GetMethod(nameof(GetGroupDimImpl)); + private static readonly MethodInfo GetUserSizeMethod = + GetMethod(nameof(GetUserSizeImpl)); + + private static readonly MethodInfo GetDynamicSharedMemoryMethod = + GetMethod( + nameof(GetDynamicSharedMemoryImpl)); + private static readonly MethodInfo GetDynamicSharedMemoryLengthInBytesMethod = + GetMethod( + nameof(GetDynamicSharedMemoryLengthInBytesImpl)); + private static readonly MethodInfo GetSharedMemoryFromPoolMethod = + GetMethod(nameof(GetSharedMemoryFromPoolImpl)); + private static readonly MethodInfo GetLocalMemoryFromPoolMethod = + GetMethod(nameof(GetLocalMemoryFromPoolImpl)); + private static readonly MethodInfo ComputeGlobalBaseIndexMethod = + GetMethod(nameof(ComputeGlobalBaseIndexImpl)); + private static readonly MethodInfo DebuggerBreakMethod = + GetMethod(nameof(DebuggerBreakImpl)); + + /// + /// Wrapper around an Interlocked memory barrier. + /// + internal static void MemoryBarrier() => Interlocked.MemoryBarrier(); + + /// + /// Wrapper around a group extension context. + /// + internal static int GetGridIndexImpl(VelocityGroupExecutionContext context) => + context.GridIdx; + + /// + /// Wrapper around a group extension context. + /// + internal static int GetGridDimImpl(VelocityGroupExecutionContext context) => + context.GridDim; + + /// + /// Wrapper around a group extension context. 
+ /// + internal static int GetGroupDimImpl(VelocityGroupExecutionContext context) => + context.GroupDim; + + /// + /// Wrapper around a group extension context. + /// + internal static int GetUserSizeImpl(VelocityGroupExecutionContext context) => + context.UserSize; + + /// + /// Wrapper around a group extension context. + /// + internal static int ComputeGlobalBaseIndexImpl( + VelocityGroupExecutionContext context) => context.GroupOffset; + + /// + /// Wrapper around a group extension context. + /// + internal static int GetDynamicSharedMemoryLengthInBytesImpl( + VelocityGroupExecutionContext context) + where T : unmanaged + { + int elementSize = Interop.SizeOf(); + return context.DynamicSharedMemory.IntLength / elementSize; + } + + /// + /// Wrapper around a group extension context. + /// + internal static long GetDynamicSharedMemoryImpl( + VelocityGroupExecutionContext context) => + context.DynamicSharedMemory.LoadEffectiveAddressAsPtr().ToInt64(); + + /// + /// Wrapper around a group extension context. + /// + internal static long GetSharedMemoryFromPoolImpl( + VelocityGroupExecutionContext context, + int length) + where T : unmanaged => + context.GetSharedMemoryFromPool(length) + .LoadEffectiveAddressAsPtr() + .ToInt64(); + + /// + /// Wrapper around a group extension context. + /// + internal static long GetLocalMemoryFromPoolImpl( + VelocityGroupExecutionContext context, + int lengthInBytes) => + context.GetLocalMemoryFromPool(lengthInBytes) + .LoadEffectiveAddressAsPtr() + .ToInt64(); + + /// + /// Wrapper around a debugger command. + /// + internal static void DebuggerBreakImpl() => Debugger.Break(); + + #endregion + + #region Instance + + protected VelocityTargetSpecializer( + int warpSize, + Type warpType32, + Type warpType64) + { + WarpSize = warpSize; + WarpType32 = warpType32; + WarpType64 = warpType64; + } + + #endregion + + #region Properties + + /// + /// Returns the warp size associated with this target specializer. 
+ /// + public int WarpSize { get; } + + /// + /// Returns the type representing a current warp value instance operating on 32 + /// bit values. + /// + public Type WarpType32 { get; } + + /// + /// Returns the type representing a current warp value instance operating on 64 + /// bit values. + /// + public Type WarpType64 { get; } + + #endregion + + #region Methods + + /// + /// Creates a new type generator using the runtime system provided. + /// + /// + /// The parent capabilities system to use. + /// + /// The parent runtime system to use. + public abstract VelocityTypeGenerator CreateTypeGenerator( + VelocityCapabilityContext capabilityContext, + RuntimeSystem runtimeSystem); + + #endregion + + #region General + + public abstract void LoadLaneIndexVector32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void LoadLaneIndexVector64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void LoadWarpSizeVector32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void LoadWarpSizeVector64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + #endregion + + #region Masks + + public abstract void PushAllLanesMask32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void PushNoLanesMask32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ConvertMask32To64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ConvertMask64To32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void IntersectMask32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void IntersectMask64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void UnifyMask32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void UnifyMask64(TILEmitter emitter) + where TILEmitter : struct, 
IILEmitter; + + public abstract void NegateMask32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void NegateMask64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ConditionalSelect32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ConditionalSelect64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void CheckForAnyActiveLaneMask(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void CheckForNoActiveLaneMask(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void CheckForEqualMasks(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void GetNumberOfActiveLanes(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + #endregion + + #region Scalar Values + + public abstract void LoadWarpSize32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void LoadWarpSize64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ConvertBoolScalar(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ConvertScalarTo32( + TILEmitter emitter, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void ConvertScalarTo64( + TILEmitter emitter, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + #endregion + + #region Comparisons + + public abstract void Compare32( + TILEmitter emitter, + CompareKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void Compare64( + TILEmitter emitter, + CompareKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + #endregion + + #region Conversions + + public abstract void ConvertSoftware32( + TILEmitter emitter, + ArithmeticBasicValueType sourceType, + 
ArithmeticBasicValueType targetType) + where TILEmitter : struct, IILEmitter; + + public abstract void ConvertSoftware64( + TILEmitter emitter, + ArithmeticBasicValueType sourceType, + ArithmeticBasicValueType targetType) + where TILEmitter : struct, IILEmitter; + + public abstract void Convert32( + TILEmitter emitter, + VelocityWarpOperationMode source, + VelocityWarpOperationMode target) + where TILEmitter : struct, IILEmitter; + + public abstract void Convert64( + TILEmitter emitter, + VelocityWarpOperationMode source, + VelocityWarpOperationMode target) + where TILEmitter : struct, IILEmitter; + + public abstract void Convert32To64( + TILEmitter emitter, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void Convert64To32( + TILEmitter emitter, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + #endregion + + #region Arithmetics + + public abstract void UnaryOperation32( + TILEmitter emitter, + UnaryArithmeticKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void UnaryOperation64( + TILEmitter emitter, + UnaryArithmeticKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void BinaryOperation32( + TILEmitter emitter, + BinaryArithmeticKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void BinaryOperation64( + TILEmitter emitter, + BinaryArithmeticKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void TernaryOperation32( + TILEmitter emitter, + TernaryArithmeticKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void TernaryOperation64( + TILEmitter emitter, + TernaryArithmeticKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + #endregion + + #region Atomics + + public abstract void 
AtomicCompareExchange32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void AtomicCompareExchange64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Atomic32( + TILEmitter emitter, + AtomicKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + public abstract void Atomic64( + TILEmitter emitter, + AtomicKind kind, + VelocityWarpOperationMode mode) + where TILEmitter : struct, IILEmitter; + + #endregion + + #region Threads + + public virtual void Barrier(TILEmitter emitter) + where TILEmitter : struct, IILEmitter => + emitter.EmitCall(MemoryBarrierMethod); + + public abstract void BarrierPopCount32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void BarrierPopCount64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void BarrierAnd32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void BarrierAnd64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void BarrierOr32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void BarrierOr64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Broadcast32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Broadcast64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Shuffle32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Shuffle64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ShuffleUp32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ShuffleUp64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void SubShuffleUp32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void 
SubShuffleUp64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ShuffleDown32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ShuffleDown64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void SubShuffleDown32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void SubShuffleDown64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ShuffleXor32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void ShuffleXor64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void SubShuffleXor32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void SubShuffleXor64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + #endregion + + #region IO + + public abstract void Load8(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Load16(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Load32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Load64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public void Load( + TILEmitter emitter, + BasicValueType basicValueType) + where TILEmitter : struct, IILEmitter + { + switch (basicValueType) + { + case BasicValueType.Int1: + case BasicValueType.Int8: + Load8(emitter); + break; + case BasicValueType.Int16: + case BasicValueType.Float16: + Load16(emitter); + break; + case BasicValueType.Int32: + case BasicValueType.Float32: + Load32(emitter); + break; + case BasicValueType.Int64: + case BasicValueType.Float64: + Load64(emitter); + break; + } + } + + public abstract void Store8(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Store16(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract 
void Store32(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void Store64(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public void Store( + TILEmitter emitter, + BasicValueType basicValueType) + where TILEmitter : struct, IILEmitter + { + switch (basicValueType) + { + case BasicValueType.Int1: + case BasicValueType.Int8: + Store8(emitter); + break; + case BasicValueType.Int16: + case BasicValueType.Float16: + Store16(emitter); + break; + case BasicValueType.Int32: + case BasicValueType.Float32: + Store32(emitter); + break; + case BasicValueType.Int64: + case BasicValueType.Float64: + Store64(emitter); + break; + } + } + + #endregion + + #region Misc + + public static void MemoryBarrier(TILEmitter emitter) + where TILEmitter : struct, IILEmitter => + emitter.EmitCall(MemoryBarrierMethod); + + public static void GetGridIndex(TILEmitter emitter) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(OpCodes.Ldarg_0); + emitter.EmitCall(GetGridIndexMethod); + } + + public static void GetGridDim(TILEmitter emitter) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(OpCodes.Ldarg_0); + emitter.EmitCall(GetGridDimMethod); + } + + public static void GetUserSize(TILEmitter emitter) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(OpCodes.Ldarg_0); + emitter.EmitCall(GetUserSizeMethod); + } + + public static void GetGroupDim(TILEmitter emitter) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(OpCodes.Ldarg_0); + emitter.EmitCall(GetGroupDimMethod); + } + + public void GetDynamicSharedMemory(TILEmitter emitter) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(OpCodes.Ldarg_0); + emitter.EmitCall(GetDynamicSharedMemoryMethod); + + // Convert the scalar version into a warp-wide value + ConvertScalarTo64(emitter, VelocityWarpOperationMode.U); + } + + public void GetDynamicSharedMemoryLength( + TILEmitter emitter, + Type type) + where TILEmitter : struct, IILEmitter + { + // Get the 
base pointer from the shared pool + var method = GetDynamicSharedMemoryLengthInBytesMethod + .MakeGenericMethod(type); + emitter.Emit(OpCodes.Ldarg_0); + emitter.EmitCall(method); + + // Convert the scalar version into a warp-wide value + ConvertScalarTo32(emitter, VelocityWarpOperationMode.U); + } + + public void GetSharedMemoryFromPool( + TILEmitter emitter, + Type type, + int length) + where TILEmitter : struct, IILEmitter + { + // Get the base pointer from the shared pool + var method = GetSharedMemoryFromPoolMethod.MakeGenericMethod(type); + emitter.Emit(OpCodes.Ldarg_0); + emitter.LoadIntegerConstant(length); + emitter.EmitCall(method); + + // Convert the scalar version into a warp-wide value + ConvertScalarTo64(emitter, VelocityWarpOperationMode.U); + } + + public void GetUnifiedLocalMemoryFromPool( + TILEmitter emitter, + int lengthInBytes) + where TILEmitter : struct, IILEmitter + { + // Get the base pointer from the local pool + emitter.Emit(OpCodes.Ldarg_0); + emitter.LoadIntegerConstant(lengthInBytes); + emitter.EmitCall(GetLocalMemoryFromPoolMethod); + + // Convert the scalar version into a warp-wide value + ConvertScalarTo64(emitter, VelocityWarpOperationMode.U); + } + + public static void ComputeGlobalBaseIndex(TILEmitter emitter) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(OpCodes.Ldarg_0); + emitter.EmitCall(ComputeGlobalBaseIndexMethod); + } + + public static void DebuggerBreak(TILEmitter emitter) + where TILEmitter : struct, IILEmitter => + emitter.EmitCall(DebuggerBreakMethod); + + public abstract void DebugAssertFailed(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void WriteToOutput(TILEmitter emitter) + where TILEmitter : struct, IILEmitter; + + public abstract void DumpWarp32( + TILEmitter emitter, + string? label = null) + where TILEmitter : struct, IILEmitter; + + public abstract void DumpWarp64( + TILEmitter emitter, + string? 
label = null) + where TILEmitter : struct, IILEmitter; + + public void DupDumpAndBreak32( + TILEmitter emitter, + string? label = null) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(OpCodes.Dup); + DumpWarp32(emitter, label); + DebuggerBreak(emitter); + } + + public void DupDumpAndBreak64( + TILEmitter emitter, + string? label = null) + where TILEmitter : struct, IILEmitter + { + emitter.Emit(OpCodes.Dup); + DumpWarp64(emitter, label); + DebuggerBreak(emitter); + } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/Velocity/VelocityTypeGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityTypeGenerator.cs new file mode 100644 index 000000000..1f1a35c2b --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/VelocityTypeGenerator.cs @@ -0,0 +1,354 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityTypeGenerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.IR.Types; +using ILGPU.Runtime.Velocity; +using ILGPU.Util; +using System; +using System.Collections.Generic; +using System.Reflection; +using System.Reflection.Emit; +using System.Threading; + +namespace ILGPU.Backends.IL +{ + /// + /// A type generator for managed IL types. + /// + abstract class VelocityTypeGenerator : DisposeBase + { + #region Static + + + /// + /// Returns the default structure type implementation reflecting the basic + /// type hierarchy. 
+ /// + private static Type LoadStructureType( + StructureType structureType, + VelocityTypeGenerator parent, + in TTypeProvider typeProvider) + where TTypeProvider : IExtendedTypeProvider + { + using var scopedLock = parent.RuntimeSystem.DefineRuntimeStruct( + explicitLayout: typeProvider.UsesExplicitOffsets, + out var typeBuilder); + int index = 0; + foreach (var (type, fieldAccess) in structureType) + { + var field = typeBuilder.DefineField( + StructureType.GetFieldName(index++), + type.LoadManagedType(typeProvider), + FieldAttributes.Public); + + int offset = structureType.GetOffset(fieldAccess.Index); + typeProvider.SetOffset(field, offset); + } + + return typeBuilder.CreateType(); + } + + #endregion + + #region Nested Types + + private interface IExtendedTypeProvider : IManagedTypeProvider + { + /// + /// Returns true if this provider requires explicit offsets. + /// + bool UsesExplicitOffsets { get; } + + /// + /// Sets an explicit field offset. + /// + void SetOffset(FieldBuilder fieldBuilder, int offset); + } + + /// + /// Provides linearized scalar versions of given scalar managed types. + /// + private readonly struct LinearScalarTypeProvider : IExtendedTypeProvider + { + private readonly VelocityTypeGenerator parent; + private readonly TypeNode.ScalarManagedTypeProvider scalarProvider; + + /// + /// Creates a new instance of the scalar type provider. + /// + /// The parent IL type generator. + public LinearScalarTypeProvider(VelocityTypeGenerator typeGenerator) + { + parent = typeGenerator; + scalarProvider = new TypeNode.ScalarManagedTypeProvider(); + } + + /// + /// Returns the default managed type for the given primitive one. 
+ /// + public Type GetPrimitiveType(PrimitiveType primitiveType) + { + if (primitiveType.BasicValueType == BasicValueType.Float16 && + !parent.CapabilityContext.Float16) + { + throw VelocityCapabilityContext.GetNotSupportedFloat16Exception(); + } + return scalarProvider.GetPrimitiveType(primitiveType); + } + + /// + /// Returns the default managed array type for the given array type. + /// + public Type GetArrayType(ArrayType arrayType) => + scalarProvider.GetArrayType(arrayType); + + /// + /// Returns a specialized pointer implementation. + /// + public Type GetPointerType(PointerType pointerType) => + scalarProvider.GetPointerType(pointerType); + + /// + /// Returns a specialized pointer-view implementation. + /// + public Type GetViewType(ViewType viewType) => + scalarProvider.GetViewType(viewType); + + /// + /// Returns the default structure type implementation reflecting the basic + /// type hierarchy. + /// + public Type GetStructureType(StructureType structureType) => + LoadStructureType(structureType, parent, this); + + /// + /// Returns true. + /// + public bool UsesExplicitOffsets => true; + + /// + /// Sets the current field offset. + /// + public void SetOffset(FieldBuilder fieldBuilder, int offset) => + fieldBuilder.SetOffset(offset); + } + + /// + /// Provides vectorized versions of given scalar managed types. + /// + private readonly struct VectorizedTypeProvider : IExtendedTypeProvider + { + private readonly VelocityTypeGenerator parent; + + /// + /// Creates a new instance of the vectorized type provider. + /// + /// The parent IL type generator. + public VectorizedTypeProvider(VelocityTypeGenerator typeGenerator) + { + parent = typeGenerator; + } + + /// + /// Returns the default managed type for the given primitive one. + /// + public Type GetPrimitiveType(PrimitiveType primitiveType) => + parent.GetVectorizedBasicType(primitiveType.BasicValueType); + + /// + /// Returns the default managed array type for the given array type. 
+ /// + public Type GetArrayType(ArrayType arrayType) => arrayType.LoadManagedType(); + + /// + /// Returns a specialized pointer implementation. + /// + public Type GetPointerType(PointerType pointerType) => + parent.GetVectorizedBasicType(BasicValueType.Int64); + + /// + /// Returns a specialized pointer-view implementation. + /// + public Type GetViewType(ViewType viewType) => + PointerViews.ViewImplementation.GetImplementationType( + viewType.ElementType.LoadManagedType()); + + /// + /// Returns the default structure type implementation reflecting the basic + /// type hierarchy. + /// + public Type GetStructureType(StructureType structureType) => + LoadStructureType(structureType, parent, this); + + /// + /// Returns false. + /// + public bool UsesExplicitOffsets => false; + + /// + /// Does not do anything. + /// + public void SetOffset(FieldBuilder fieldBuilder, int offset) { } + } + + #endregion + + #region Static + + /// + /// Gets or creates a new managed type using the given type provider instance. 
+ /// + private static Type GetOrCreateType( + ReaderWriterLockSlim readerWriterLock, + Dictionary typeMapping, + TypeNode typeNode, + TTypeProvider typeProvider, + Func typeSelector, + Func typeBinder) + where TTypeProvider : IManagedTypeProvider + { + // Synchronize all accesses below using a read/write scope + using var readWriteScope = readerWriterLock.EnterUpgradeableReadScope(); + + if (typeMapping.TryGetValue(typeNode, out var mappedType)) + { + var selected = typeSelector(mappedType.Linear, mappedType.Vectorized); + if (selected is not null) + return selected; + } + + // Get a new type instance + using var writeScope = readWriteScope.EnterWriteScope(); + var newMappedType = typeNode.LoadManagedType(typeProvider); + mappedType = typeBinder( + mappedType.Linear, + mappedType.Vectorized, + newMappedType); + typeMapping[typeNode] = mappedType; + + return typeSelector(mappedType.Linear, mappedType.Vectorized).AsNotNull(); + } + + #endregion + + #region Instance + + private readonly ReaderWriterLockSlim readerWriterLock = + new ReaderWriterLockSlim(LockRecursionPolicy.SupportsRecursion); + private readonly Dictionary + typeMapping = new Dictionary(); + + /// + /// Constructs a new IL type generator. + /// + /// The parent context. + /// The parent runtime system. + /// The current warp size. + public VelocityTypeGenerator( + VelocityCapabilityContext capabilityContext, + RuntimeSystem runtimeSystem, + int warpSize) + { + CapabilityContext = capabilityContext; + RuntimeSystem = runtimeSystem; + WarpSize = warpSize; + } + + #endregion + + #region Properties + + /// + /// Returns the parent capability context. + /// + public VelocityCapabilityContext CapabilityContext { get; } + + /// + /// Returns the parent runtime system. + /// + public RuntimeSystem RuntimeSystem { get; } + + /// + /// Returns the current warp size. 
+ /// + public int WarpSize { get; } + + #endregion + + #region Methods + + /// + /// Gets a vectorized type corresponding to the given basic value type. + /// + public abstract Type GetVectorizedBasicType(BasicValueType basicValueType); + + /// + /// Gets or creates a linearized managed type for the given IR type. + /// + /// The type to build a vectorized type for. + /// + /// The linearized scalar managed type that corresponds to the given IR type. + /// + public Type GetLinearizedScalarType(TypeNode typeNode) + { + // Check for primitive types without locking + if (typeNode is PrimitiveType || typeNode is PaddingType) + return typeNode.LoadManagedType(); + + // Get or create a new type + return GetOrCreateType( + readerWriterLock, + typeMapping, + typeNode, + new LinearScalarTypeProvider(this), + (linear, _) => linear, + (_, vectorized, newLinear) => (newLinear, vectorized)); + } + + /// + /// Gets or creates a vectorized managed type for the given IR type. + /// + /// The type to build a vectorized type for. + /// + /// The vectorized managed type that corresponds to the given IR type. 
+ /// + public Type GetVectorizedType(TypeNode typeNode) + { + // Check for primitive types without locking + if (typeNode is PrimitiveType || typeNode is PaddingType) + return GetVectorizedBasicType(typeNode.BasicValueType); + + // Get or create a new type + return GetOrCreateType( + readerWriterLock, + typeMapping, + typeNode, + new VectorizedTypeProvider(this), + (_, vectorized) => vectorized, + (linear, _, newVectorized) => (linear, newVectorized)); + } + + #endregion + + #region IDisposable + + /// + protected override void Dispose(bool disposing) + { + if (disposing) + readerWriterLock.Dispose(); + base.Dispose(disposing); + } + + #endregion + } +} + diff --git a/Src/ILGPU/ILGPU.csproj b/Src/ILGPU/ILGPU.csproj index eb69aa636..3848366e9 100644 --- a/Src/ILGPU/ILGPU.csproj +++ b/Src/ILGPU/ILGPU.csproj @@ -60,6 +60,11 @@ True PTXIntrinsics.Generated.tt + + True + True + ScalarOperations.tt + True True @@ -190,6 +195,11 @@ True PTXLibDeviceNvvm.tt + + True + True + ScalarOperations.tt + True True diff --git a/Src/ILGPU/IR/Analyses/Loops.cs b/Src/ILGPU/IR/Analyses/Loops.cs index 0d12616ad..6aca12db0 100644 --- a/Src/ILGPU/IR/Analyses/Loops.cs +++ b/Src/ILGPU/IR/Analyses/Loops.cs @@ -302,7 +302,7 @@ public Phis ComputePhis() /// /// The blocks to test. /// - /// True, if the given block contain at least one back edge block. + /// True if the given block contain at least one back edge block. /// public bool ContainsBackEdgeBlock(ReadOnlySpan blocks) { diff --git a/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs b/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs index 9191b135c..7c6931ffb 100644 --- a/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs +++ b/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs @@ -1,7 +1,6 @@ -//------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ // // This code was generated by a tool. 
-// Runtime Version:4.0.30319.42000 // // Changes to this file may cause incorrect behavior and will be lost if // the code is regenerated. @@ -14,12 +13,10 @@ namespace ILGPU.Resources { /// /// A strongly-typed resource class, for looking up localized strings, etc. + /// This class was generated by MSBuild using the GenerateResource task. + /// To add or remove a member, edit your .resx file then rerun MSBuild. /// - // This class was auto-generated by the StronglyTypedResourceBuilder - // class via a tool like ResGen or Visual Studio. - // To add or remove a member, edit your .ResX file then rerun ResGen - // with the /str option, or rebuild your VS project. - [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "17.0.0.0")] + [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.Build.Tasks.StronglyTypedResourceBuilder", "15.1.0.0")] [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] internal class RuntimeErrorMessages { @@ -428,5 +425,23 @@ internal static string UnknownParentAccelerator { return ResourceManager.GetString("UnknownParentAccelerator", resourceCulture); } } + + /// + /// Looks up a localized string similar to The Velocity accelerator supports little-endian machines only. + /// + internal static string VelocityLittleEndian { + get { + return ResourceManager.GetString("VelocityLittleEndian", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Velocity accelerator requires 64-bit application ({0} not supported). Ensure Prefer32Bit is set to 'false'. 
+ /// + internal static string VelocityPlatform64 { + get { + return ResourceManager.GetString("VelocityPlatform64", resourceCulture); + } + } } } diff --git a/Src/ILGPU/Resources/RuntimeErrorMessages.resx b/Src/ILGPU/Resources/RuntimeErrorMessages.resx index 8e7d41429..d2b8d8819 100644 --- a/Src/ILGPU/Resources/RuntimeErrorMessages.resx +++ b/Src/ILGPU/Resources/RuntimeErrorMessages.resx @@ -1,17 +1,17 @@  - @@ -240,4 +240,10 @@ Unknown parent accelerator + + Velocity accelerator requires 64-bit application ({0} not supported). Ensure Prefer32Bit is set to 'false' + + + The Velocity accelerator supports little-endian machines only + \ No newline at end of file diff --git a/Src/ILGPU/Runtime/Accelerator.cs b/Src/ILGPU/Runtime/Accelerator.cs index a968d8189..0334c743f 100644 --- a/Src/ILGPU/Runtime/Accelerator.cs +++ b/Src/ILGPU/Runtime/Accelerator.cs @@ -30,6 +30,11 @@ public enum AcceleratorType : int /// CPU, + /// + /// Represents a SIMD CPU performance accelerator. + /// + Velocity, + /// /// Represents a Cuda accelerator. 
/// diff --git a/Src/ILGPU/Runtime/CPU/CPUAccelerator.cs b/Src/ILGPU/Runtime/CPU/CPUAccelerator.cs index b4882c3af..3e2360f23 100644 --- a/Src/ILGPU/Runtime/CPU/CPUAccelerator.cs +++ b/Src/ILGPU/Runtime/CPU/CPUAccelerator.cs @@ -13,6 +13,7 @@ using ILGPU.Backends.IL; using ILGPU.Resources; using ILGPU.Util; +using ILGPU.Runtime.Velocity; using System; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; @@ -251,12 +252,13 @@ protected override void OnUnbind() { } /// protected override bool CanAccessPeerInternal(Accelerator otherAccelerator) => - otherAccelerator as CPUAccelerator != null; + otherAccelerator is CPUAccelerator || + otherAccelerator is VelocityAccelerator; /// protected override void EnablePeerAccessInternal(Accelerator otherAccelerator) { - if (otherAccelerator as CPUAccelerator == null) + if (!CanAccessPeerInternal(otherAccelerator)) { throw new InvalidOperationException( RuntimeErrorMessages.CannotEnablePeerAccessToOtherAccelerator); @@ -267,7 +269,7 @@ protected override void EnablePeerAccessInternal(Accelerator otherAccelerator) protected override void DisablePeerAccessInternal( Accelerator otherAccelerator) => Debug.Assert( - otherAccelerator is CPUAccelerator, + CanAccessPeerInternal(otherAccelerator), "Invalid EnablePeerAccess method"); #endregion diff --git a/Src/ILGPU/Runtime/CPU/CPUMemoryBuffer.cs b/Src/ILGPU/Runtime/CPU/CPUMemoryBuffer.cs index 133f858aa..ef32f407b 100644 --- a/Src/ILGPU/Runtime/CPU/CPUMemoryBuffer.cs +++ b/Src/ILGPU/Runtime/CPU/CPUMemoryBuffer.cs @@ -92,8 +92,12 @@ public static void CPUCopyFrom( in ArrayView targetView) where T : unmanaged { - if (targetView.GetAcceleratorType() != AcceleratorType.CPU) + switch (targetView.GetAcceleratorType()) { + case AcceleratorType.CPU: + case AcceleratorType.Velocity: + break; + default: throw new NotSupportedException( RuntimeErrorMessages.NotSupportedTargetAccelerator); } @@ -107,6 +111,7 @@ public static void CPUCopyFrom( switch (sourceView.GetAcceleratorType()) { 
case AcceleratorType.CPU: + case AcceleratorType.Velocity: // Copy from CPU to CPU CPUCopyToCPU( ref sourceView.LoadEffectiveAddress(), @@ -157,10 +162,14 @@ public static void CPUCopyTo( in ArrayView targetView) where T : unmanaged { - if (sourceView.GetAcceleratorType() != AcceleratorType.CPU) + switch (sourceView.GetAcceleratorType()) { - throw new NotSupportedException( - RuntimeErrorMessages.NotSupportedTargetAccelerator); + case AcceleratorType.CPU: + case AcceleratorType.Velocity: + break; + default: + throw new NotSupportedException( + RuntimeErrorMessages.NotSupportedTargetAccelerator); } if (targetView.Length > sourceView.Length) throw new ArgumentOutOfRangeException(nameof(sourceView)); @@ -172,6 +181,7 @@ public static void CPUCopyTo( switch (targetView.GetAcceleratorType()) { case AcceleratorType.CPU: + case AcceleratorType.Velocity: // Copy from CPU to CPU CPUCopyToCPU( ref sourceView.LoadEffectiveAddress(), @@ -223,11 +233,14 @@ public static void CPUCopy( in ArrayView targetView) where T : unmanaged { - if (sourceView.GetAcceleratorType() == AcceleratorType.CPU) + if (sourceView.GetAcceleratorType() == AcceleratorType.CPU || + sourceView.GetAcceleratorType() == AcceleratorType.Velocity) { CPUCopyTo(stream, sourceView, targetView); } - else if (targetView.GetAcceleratorType() == AcceleratorType.CPU) + else if ( + targetView.GetAcceleratorType() == AcceleratorType.CPU || + sourceView.GetAcceleratorType() == AcceleratorType.Velocity) { CPUCopyFrom(stream, sourceView, targetView); } diff --git a/Src/ILGPU/Runtime/IAcceleratorExtensionProvider.cs b/Src/ILGPU/Runtime/IAcceleratorExtensionProvider.cs index a54a352e7..772e57a3c 100644 --- a/Src/ILGPU/Runtime/IAcceleratorExtensionProvider.cs +++ b/Src/ILGPU/Runtime/IAcceleratorExtensionProvider.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2017-2021 ILGPU Project +// Copyright (c) 2017-2023 ILGPU Project // 
www.ilgpu.net // // File: IAcceleratorExtensionProvider.cs @@ -12,6 +12,7 @@ using ILGPU.Runtime.CPU; using ILGPU.Runtime.Cuda; using ILGPU.Runtime.OpenCL; +using ILGPU.Runtime.Velocity; namespace ILGPU.Runtime { @@ -28,6 +29,13 @@ public interface IAcceleratorExtensionProvider /// The created extension. TExtension CreateCPUExtension(CPUAccelerator accelerator); + /// + /// Creates an extension for a Velocity accelerator. + /// + /// The target accelerator. + /// The created extension. + TExtension CreateVelocityExtension(VelocityAccelerator accelerator); + /// /// Creates an extension for a Cuda accelerator. /// diff --git a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs new file mode 100644 index 000000000..450b35a73 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs @@ -0,0 +1,562 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityAccelerator.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends; +using ILGPU.Backends.IL; +using ILGPU.Backends.Velocity; +using ILGPU.Resources; +using ILGPU.Runtime.CPU; +using ILGPU.Util; +using System; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Reflection.Emit; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; + +#pragma warning disable CA1508 + +namespace ILGPU.Runtime.Velocity +{ + /// + /// A SIMD-enabled CPU-based accelerator. + /// + public sealed class VelocityAccelerator : Accelerator + { + #region Static + + /// + /// The internal run method to launch kernels. 
+ /// + private static readonly MethodInfo RunMethodInfo = + typeof(VelocityAccelerator).GetMethod( + nameof(Run), + BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance) + .AsNotNull(); + + #endregion + + #region Nested Types + + private sealed class ParallelExecutionEngine : + ParallelProcessingCache< + VelocityGroupExecutionContext, + ParallelExecutionEngine>, + IParallelProcessingBody + { + public ParallelExecutionEngine(VelocityAccelerator accelerator) + { + Accelerator = accelerator; + } + + /// + /// Returns the parent Velocity accelerator. + /// + public VelocityAccelerator Accelerator { get; } + + /// + /// Returns the current instance. + /// + protected override ParallelExecutionEngine CreateBody() => this; + + /// + /// Does not perform any operation. + /// + public void Initialize() { } + + /// + /// Gets or sets the current group dimension. + /// + public int GroupDim { get; set; } + + /// + /// Gets or sets the current grid dimension. + /// + public int GridDim { get; set; } + + /// + /// Returns the parent user size. + /// + public long UserSize { get; set; } + + /// + /// Returns the dynamic shared memory length in bytes. + /// + public int DynamicSharedMemoryLengthInBytes { get; set; } + + /// + /// Gets or sets the current entry point handler. + /// + public VelocityEntryPointHandler? EntryPointHandler { get; set; } + + /// + /// Gets or sets the current kernel parameters. + /// + public VelocityParameters? Parameters { get; set; } + + /// + /// Creates a new group execution context. + /// + protected override VelocityGroupExecutionContext CreateIntermediate() => + new(Accelerator); + + /// + /// Setups the group context and invokes the kernel callback for the current + /// thread group. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Process( + int index, + ParallelLoopState? 
loopState, + VelocityGroupExecutionContext intermediateState) + { + intermediateState.SetupThreadGrid( + index, + GroupDim, + GridDim, + (int)UserSize, + DynamicSharedMemoryLengthInBytes); + + // Invoke the actual kernel + EntryPointHandler.AsNotNull().Invoke( + intermediateState, + Parameters.AsNotNull()); + } + + /// + /// Does not perform any cleanup operations. + /// + public void Finalize( + ReadOnlySpan intermediateStates) + { } + } + + #endregion + + #region Instance + + [SuppressMessage( + "Microsoft.Usage", + "CA2213: Disposable fields should be disposed", + Justification = "This is disposed in DisposeAccelerator_SyncRoot")] + private readonly SemaphoreSlim taskConcurrencyLimit = new(1); + + [SuppressMessage( + "Microsoft.Usage", + "CA2213: Disposable fields should be disposed", + Justification = "This is disposed in DisposeAccelerator_SyncRoot")] + private readonly ParallelExecutionEngine executionEngine; + + private readonly ParallelOptions parallelOptions; + + /// + /// Constructs a new Velocity accelerator. + /// + /// The ILGPU context. + /// The Velocity device. 
+ internal VelocityAccelerator(Context context, VelocityDevice device) + : base(context, device) + { + if (!device.IsLittleEndian) + { + throw new NotSupportedException( + RuntimeErrorMessages.VelocityLittleEndian); + } + + NativePtr = new IntPtr(2); + DefaultStream = CreateStreamInternal(); + + parallelOptions = new ParallelOptions() + { +#if DEBUG + MaxDegreeOfParallelism = 1, +#else + MaxDegreeOfParallelism = device.NumMultiprocessors, +#endif + }; + executionEngine = new ParallelExecutionEngine(this); + + // Uncomment this line and comment the corresponding creation line to enable + // code generation debugging on the command line + // Init(new VelocityBackend( + + // Init the underlying Velocity backend + Init(new VelocityBackend( + context, + device.Capabilities.AsNotNullCast(), + WarpSize, + new VelocityArgumentMapper(context), + device.TargetSpecializer)); + } + + #endregion + + #region Properties + + /// + /// Returns the Velocity backend of this accelerator. + /// + internal new VelocityBackend Backend => + base.Backend.AsNotNullCast>(); + + #endregion + + #region Launch Methods + + /// + /// Main internal run method to launch loaded kernels. + /// + /// The user-defined kernel config. + /// + /// The actual runtime kernel config to be used for launching. + /// + /// The kernel entry point handler. + /// + /// The current velocity kernel parameters. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void Run( + KernelConfig userKernelConfig, + RuntimeKernelConfig runtimeKernelConfig, + VelocityEntryPointHandler entryPointHandler, + VelocityParameters velocityParameters) + { + // Avoid concurrent executions of kernels.. 
we have to wait for the current + // kernel to finish first + taskConcurrencyLimit.Wait(); + + // Uncomment this line to see velocity input parameters being dumped to the + // standard output stream for debugging purposes + // velocityParameters.DumpToConsole(); + + try + { + // Determine actual thread-grid sizes + int gridSize = runtimeKernelConfig.GridDim.Size; + int groupSize = runtimeKernelConfig.GroupDim.Size; + + // Setup engine properties + executionEngine.GroupDim = groupSize; + executionEngine.UserSize = userKernelConfig.Size; + executionEngine.GridDim = gridSize; + executionEngine.DynamicSharedMemoryLengthInBytes = + runtimeKernelConfig.SharedMemoryConfig.DynamicArraySize; + executionEngine.EntryPointHandler = entryPointHandler; + executionEngine.Parameters = velocityParameters; + + // Launch all threads + executionEngine.ParallelFor(0, gridSize, parallelOptions); + } + finally + { + // Free the current task sema + taskConcurrencyLimit.Release(); + } + } + + /// + /// Generates a dynamic kernel-launcher method that will be just-in-time compiled + /// during the first invocation. Using the generated launcher lowers the overhead + /// for kernel launching dramatically, since unnecessary operations (like boxing) + /// can be avoided. + /// + /// The kernel to generate a launcher for. + /// + /// The custom group size for the launching operation. + /// + /// The generated launcher method. 
+ private MethodInfo GenerateKernelLauncherMethod( + VelocityCompiledKernel kernel, + int customGroupSize) + { + var entryPoint = kernel.EntryPoint; + AdjustAndVerifyKernelGroupSize(ref customGroupSize, entryPoint); + + // Add support for by ref parameters + if (entryPoint.HasByRefParameters) + { + throw new NotSupportedException( + ErrorMessages.NotSupportedByRefKernelParameters); + } + + // Declare a new launcher method + using var scopedLock = entryPoint.CreateLauncherMethod( + Context.RuntimeSystem, + out var launcher); + var emitter = new ILEmitter(launcher.ILGenerator); + + // Map all arguments to an argument structure containing mapped views + var argumentMapper = Backend.ArgumentMapper; + var (structLocal, _) = argumentMapper.Map(emitter, entryPoint); + + var velocityKernel = emitter.DeclareLocal(typeof(VelocityKernel)); + KernelLauncherBuilder.EmitLoadKernelArgument( + Kernel.KernelInstanceParamIdx, emitter); + emitter.Emit(LocalOperation.Store, velocityKernel); + + // Create an instance of the custom parameters type + var parametersInstance = emitter.DeclarePinnedLocal(kernel.ParametersType); + emitter.Emit(OpCodes.Ldnull); + emitter.Emit(LocalOperation.Store, parametersInstance); + { + // Assign parameters + var parameters = entryPoint.Parameters; + for (int i = 0, e = parameters.Count; i < e; ++i) + { + // Load native address onto stack + emitter.Emit(LocalOperation.LoadAddress, structLocal); + emitter.LoadFieldAddress(structLocal.VariableType, i); + emitter.Emit(OpCodes.Conv_I); + } + + // Create new task object + emitter.EmitNewObject(kernel.ParametersTypeConstructor); + + // Store task + emitter.Emit(LocalOperation.Store, parametersInstance); + } + + // Load the kernel delegate + emitter.Emit(LocalOperation.Load, velocityKernel); + emitter.EmitCall(VelocityKernel.GetVelocityAccelerator); + + // Load custom user dimension + KernelLauncherBuilder.EmitLoadKernelConfig( + entryPoint, + emitter, + Kernel.KernelParamDimensionIdx, + MaxGridSize, + 
MaxGroupSize); + + // Load dimensions + KernelLauncherBuilder.EmitLoadRuntimeKernelConfig( + entryPoint, + emitter, + Kernel.KernelParamDimensionIdx, + MaxGridSize, + MaxGroupSize, + customGroupSize); + + // Load the kernel delegate + emitter.Emit(LocalOperation.Load, velocityKernel); + emitter.EmitCall(VelocityKernel.GetKernelExecutionDelegate); + + // Load the parameters object + emitter.Emit(LocalOperation.Load, parametersInstance); + + // Launch kernel execution + emitter.EmitCall(RunMethodInfo); + + // End of launch method + emitter.Emit(OpCodes.Ret); + emitter.Finish(); + + return launcher.Finish(); + } + + #endregion + + /// + public override TExtension CreateExtension< + TExtension, + TExtensionProvider>(TExtensionProvider provider) => + provider.CreateVelocityExtension(this); + + /// + protected override MemoryBuffer AllocateRawInternal( + long length, + int elementSize) => + new VelocityMemoryBuffer(this, length, elementSize); + + /// + /// Loads the given kernel. + /// + /// The kernel to load. + /// The custom group size. + /// The loaded kernel + private Kernel LoadKernel(CompiledKernel kernel, int customGroupSize) + { + if (kernel is null) + throw new ArgumentNullException(nameof(kernel)); + if (!(kernel is VelocityCompiledKernel compiledKernel)) + { + throw new NotSupportedException( + RuntimeErrorMessages.NotSupportedKernel); + } + + var launcherMethod = GenerateKernelLauncherMethod( + compiledKernel, + customGroupSize); + return new VelocityKernel( + this, + compiledKernel, + launcherMethod); + } + + /// + /// Loads a default kernel. + /// + protected override Kernel LoadKernelInternal(CompiledKernel kernel) => + LoadKernel(kernel, 0); + + /// + /// Loads an implicitly grouped kernel. 
+ /// + protected override Kernel LoadImplicitlyGroupedKernelInternal( + CompiledKernel kernel, + int customGroupSize, + out KernelInfo kernelInfo) + { + if (customGroupSize < 0) + throw new ArgumentOutOfRangeException(nameof(customGroupSize)); + kernelInfo = KernelInfo.CreateFrom( + kernel.Info, + customGroupSize, + null); + return LoadKernel(kernel, customGroupSize); + } + + /// + /// Loads an auto grouped kernel. + /// + protected override Kernel LoadAutoGroupedKernelInternal( + CompiledKernel kernel, + out KernelInfo kernelInfo) + { + var result = LoadKernel(kernel, WarpSize); + kernelInfo = new KernelInfo(WarpSize, MaxNumThreads / WarpSize); + return result; + } + + /// + protected override AcceleratorStream CreateStreamInternal() => + new VelocityStream(this); + + /// + protected override void SynchronizeInternal() { } + + /// + protected override void OnBind() { } + + /// + protected override void OnUnbind() { } + + #region Peer Access + + /// + protected override bool CanAccessPeerInternal(Accelerator otherAccelerator) => + otherAccelerator is CPUAccelerator || + otherAccelerator is VelocityAccelerator; + + /// + protected override void EnablePeerAccessInternal(Accelerator otherAccelerator) + { + if (!CanAccessPeerInternal(otherAccelerator)) + { + throw new InvalidOperationException( + RuntimeErrorMessages.CannotEnablePeerAccessToOtherAccelerator); + } + } + + /// + protected override void DisablePeerAccessInternal( + Accelerator otherAccelerator) => + Debug.Assert( + CanAccessPeerInternal(otherAccelerator), + "Invalid EnablePeerAccess method"); + + #endregion + + #region Occupancy + + /// + protected override int EstimateMaxActiveGroupsPerMultiprocessorInternal( + Kernel kernel, + int groupSize, + int dynamicSharedMemorySizeInBytes) => + kernel is VelocityKernel + ? groupSize > MaxGroupSize.Size ? 
0 : NumMultiprocessors + : throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel); + + /// + protected override int EstimateGroupSizeInternal( + Kernel kernel, + Func computeSharedMemorySize, + int maxGroupSize, + out int minGridSize) + { + if (!(kernel is VelocityKernel)) + throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel); + + // Estimation + minGridSize = MaxNumThreads / WarpSize; + return Math.Min(maxGroupSize, MaxGroupSize.Size); + } + + /// + protected override int EstimateGroupSizeInternal( + Kernel kernel, + int dynamicSharedMemorySizeInBytes, + int maxGroupSize, + out int minGridSize) + { + if (!(kernel is VelocityKernel)) + throw new NotSupportedException(RuntimeErrorMessages.NotSupportedKernel); + + // Estimation + minGridSize = MaxNumThreads / WarpSize; + return WarpSize; + } + + #endregion + + #region Page Lock Scope + + /// + protected override PageLockScope CreatePageLockFromPinnedInternal( + IntPtr pinned, + long numElements) + { + Trace.WriteLine(RuntimeErrorMessages.NotSupportedPageLock); + return new NullPageLockScope(this, pinned, numElements); + } + + #endregion + + #region IDisposable + + /// + /// Dispose all managed resources allocated by this CPU accelerator instance. 
+ /// + protected override void DisposeAccelerator_SyncRoot(bool disposing) + { + if (!disposing) + return; + + // Dispose task engine + taskConcurrencyLimit.Wait(); + executionEngine.Dispose(); + + // Dispose barriers + taskConcurrencyLimit.Dispose(); + } + + #endregion + + } +} + +#pragma warning restore CA1508 diff --git a/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs new file mode 100644 index 000000000..0eee424eb --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityContextExtensions.cs @@ -0,0 +1,97 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityContextExtensions.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends; +using ILGPU.Resources; +using System; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Velocity specific context extensions. + /// + public static class VelocityContextExtensions + { + #region Builder + + /// + /// Enables all Velocity devices supporting all types of vectorization executable + /// on the current hardware. + /// + /// The builder instance. + /// The updated builder instance. + public static Context.Builder AllVelocity(this Context.Builder builder) + { + foreach (var deviceType in Enum.GetValues()) + builder.Velocity(deviceType); + return builder; + } + + /// + /// Enables the Velocity device with the maximum vector length supported by the + /// current hardware. + /// + /// The builder instance. + /// The updated builder instance. + public static Context.Builder Velocity(this Context.Builder builder) => + builder.Velocity(VelocityDeviceType.Scalar2); + + /// + /// Enables a specific Velocity device. 
+ /// + /// The builder instance. + /// The type of the Velocity device. + /// The updated builder instance. + public static Context.Builder Velocity( + this Context.Builder builder, + VelocityDeviceType deviceType) + { + if (!Backend.RuntimePlatform.Is64Bit()) + { + throw new NotSupportedException(string.Format( + RuntimeErrorMessages.VelocityPlatform64, + Backend.RuntimePlatform)); + } + + builder.DeviceRegistry.Register(new VelocityDevice(deviceType)); + return builder; + } + + #endregion + + #region Context + + /// + /// Gets a registered Velocity device. + /// + /// The ILGPU context. + /// The Velocity device index. + /// The registered Velocity device. + public static VelocityDevice GetVelocityDevice( + this Context context, + int index = 0) => + context.GetDevice(index); + + /// + /// Creates a new Velocity accelerator. + /// + /// The ILGPU context. + /// The Velocity device index. + public static VelocityAccelerator CreateVelocityAccelerator( + this Context context, + int index = 0) => + context.GetVelocityDevice(index).CreateVelocityAccelerator(context); + + #endregion + } + +} + diff --git a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs new file mode 100644 index 000000000..e2793482e --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs @@ -0,0 +1,149 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityDevice.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.Velocity; +using ILGPU.Backends.Velocity.Scalar; +using ILGPU.Util; +using System; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// The device type of a Velocity device. 
+ /// + public enum VelocityDeviceType + { + /// + /// Scalar operations to simulate two lanes per warp. + /// + Scalar2, + } + + /// + /// Represents a software-emulated velocity device for high-performance execution of + /// tasks on the CPU using vectorization. + /// + [DeviceType(AcceleratorType.Velocity)] + public sealed class VelocityDevice : Device + { + #region Static + + private static readonly Type[] VelocitySpecializers = new Type[] + { + typeof(Scalar) + }; + + #endregion + + #region Instance + + /// + /// Creates a new velocity device using the given device type. + /// + /// The Velocity device type to use. + public VelocityDevice(VelocityDeviceType deviceType) + { + switch (deviceType) + { + case VelocityDeviceType.Scalar2: + // Scalar is always supported + break; + default: + throw new ArgumentOutOfRangeException(nameof(deviceType)); + } + + Name = $"{nameof(VelocityAccelerator)}_{deviceType}"; + DeviceType = deviceType; + TargetSpecializer = Activator.CreateInstance( + VelocitySpecializers[(int)deviceType]) + .AsNotNullCast(); + WarpSize = TargetSpecializer.WarpSize; + MaxNumThreadsPerGroup = MaxNumThreadsPerMultiprocessor = WarpSize; + NumMultiprocessors = Environment.ProcessorCount; + MaxGroupSize = new Index3D( + MaxNumThreadsPerGroup, + 1, + 1); + + MemorySize = long.MaxValue; + MaxGridSize = new Index3D(int.MaxValue, 1, 1); + MaxConstantMemory = int.MaxValue; + NumThreads = MaxNumThreads; + + // Get the endian type from the global BitConverter class + IsLittleEndian = BitConverter.IsLittleEndian; + + // Allocate a sufficient amount of local memory per thread equal to + // the maximum number of shared memory per group in bytes (2 MB) + MaxSharedMemoryPerGroup = 2 << 20; + + // Setup default Velocity capabilities + Capabilities = new VelocityCapabilityContext(); + } + + #endregion + + #region Properties + + /// + /// Returns the current device type. 
+ /// + public VelocityDeviceType DeviceType { get; } + + /// + /// Returns the internally used target specializer. + /// + internal VelocityTargetSpecializer TargetSpecializer { get; } + + /// + /// Returns the number of threads. + /// + public int NumThreads { get; } + + /// + /// Returns true if this device operates in little endian mode. + /// + public bool IsLittleEndian { get; } + + #endregion + + #region Methods + + /// + public override Accelerator CreateAccelerator(Context context) => + CreateVelocityAccelerator(context); + + /// + /// Creates a new performance CPU accelerator using and the default thread + /// priority. + /// + /// The ILGPU context. + /// The created CPU accelerator. + public VelocityAccelerator CreateVelocityAccelerator(Context context) => + new VelocityAccelerator(context, this); + + #endregion + + #region Object + + /// + public override bool Equals(object? obj) => + obj is VelocityDevice device && + device.DeviceType == DeviceType && + base.Equals(obj); + + /// + public override int GetHashCode() => + HashCode.Combine(base.GetHashCode(), DeviceType); + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityEntryPointHandler.cs b/Src/ILGPU/Runtime/Velocity/VelocityEntryPointHandler.cs new file mode 100644 index 000000000..32a4ed1f1 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityEntryPointHandler.cs @@ -0,0 +1,41 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityEntryPointHandler.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using System; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a single velocity kernel processing delegate. + /// + /// The main group context. 
+ /// The current parameters. + delegate void VelocityEntryPointHandler( + VelocityGroupExecutionContext groupContext, + VelocityParameters parameters); + + /// + /// Static helper class to support dealing with + /// instances. + /// + static class VelocityEntryPointHandlerHelper + { + /// + /// Represents all entry point parameters expected by a Velocity kernel entry + /// point function. + /// + public static readonly Type[] EntryPointParameterTypes = new Type[] + { + typeof(VelocityGroupExecutionContext), + typeof(VelocityParameters), + }; + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityGroupExecutionContext.cs b/Src/ILGPU/Runtime/Velocity/VelocityGroupExecutionContext.cs new file mode 100644 index 000000000..87421c00d --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityGroupExecutionContext.cs @@ -0,0 +1,135 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityGroupExecutionContext.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Util; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents an execution context for a single thread group. + /// + sealed class VelocityGroupExecutionContext : DisposeBase + { + private readonly VelocityMemoryBufferPool sharedMemoryPool; + private readonly VelocityMemoryBufferPool localMemoryPool; + private readonly int warpSize; + + /// + /// Constructs a new execution context. + /// + /// The parent velocity accelerator. 
+ public VelocityGroupExecutionContext(VelocityAccelerator accelerator) + { + sharedMemoryPool = new VelocityMemoryBufferPool( + accelerator, + accelerator.MaxSharedMemoryPerGroup); + localMemoryPool = new VelocityMemoryBufferPool( + accelerator, + accelerator.MaxSharedMemoryPerGroup); + warpSize = accelerator.WarpSize; + } + + /// + /// Returns the current grid index. + /// + public int GridIdx { get; private set; } + + /// + /// Returns the current group dimension. + /// + public int GroupDim { get; private set; } + + /// + /// Returns the current grid dimension. + /// + public int GridDim { get; private set; } + + /// + /// Returns the user-specific total size. + /// + public int UserSize { get; private set; } + + /// + /// Returns a view to dynamic shared memory (if any). + /// + public ArrayView DynamicSharedMemory { get; private set; } + + /// + /// Returns the linear group . + /// + public int GroupOffset => GridIdx * GroupDim; + + /// + /// Resets this execution context. + /// + private void Reset() + { + sharedMemoryPool.Reset(); + localMemoryPool.Reset(); + } + + /// + /// Sets up the current thread grid information for the current thread group. + /// + public void SetupThreadGrid( + int gridIdx, + int groupDim, + int gridDim, + int userSize, + int dynamicSharedMemoryLength) + { + GridIdx = gridIdx; + GridDim = gridDim; + GroupDim = groupDim; + UserSize = userSize; + + // Reset everything + Reset(); + + // Allocate dynamic shared memory + if (dynamicSharedMemoryLength > 0) + { + DynamicSharedMemory = + GetSharedMemoryFromPool(dynamicSharedMemoryLength); + } + } + + /// + /// Gets a chunk of shared memory of a certain type. + /// + /// The number of elements. + /// The element type to allocate. + /// A view pointing to the right chunk of shared memory. + public ArrayView GetSharedMemoryFromPool(int length) + where T : unmanaged => + sharedMemoryPool.Allocate(length); + + /// + /// Gets a chunk of local memory of a certain type. 
+ /// + /// + /// The number of bytes to allocate per thread. + /// + /// A view pointing to the right chunk of local memory. + public ArrayView GetLocalMemoryFromPool(int lengthInBytesPerThread) => + localMemoryPool.Allocate(lengthInBytesPerThread * warpSize); + + protected override void Dispose(bool disposing) + { + if (disposing) + { + sharedMemoryPool.Dispose(); + localMemoryPool.Dispose(); + } + base.Dispose(disposing); + } + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs new file mode 100644 index 000000000..0f58cc3b0 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityKernel.cs @@ -0,0 +1,93 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityKernel.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.Velocity; +using ILGPU.Util; +using System.Reflection; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a single Velocity kernel. + /// + public sealed class VelocityKernel : Kernel + { + #region Static + + /// + /// Represents the property getter. + /// + internal static readonly MethodInfo GetVelocityAccelerator = + typeof(VelocityKernel).GetProperty( + nameof(VelocityAccelerator), + BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance) + .AsNotNull() + .GetGetMethod(true) + .AsNotNull(); + + /// + /// Represents the property getter. 
+ /// + internal static readonly MethodInfo GetKernelExecutionDelegate = + typeof(VelocityKernel).GetProperty( + nameof(KernelEntryPoint), + BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance) + .AsNotNull() + .GetGetMethod(true) + .AsNotNull(); + + #endregion + + #region Instance + + /// + /// Loads a compiled kernel into the given Cuda context as kernel program. + /// + /// The associated accelerator. + /// The source kernel. + /// The launcher method for the given kernel. + internal VelocityKernel( + VelocityAccelerator accelerator, + VelocityCompiledKernel kernel, + MethodInfo launcher) + : base(accelerator, kernel, launcher) + { + KernelEntryPoint = kernel.CreateKernelEntryPoint(); + } + + #endregion + + #region Properties + + /// + /// Returns the associated Velocity runtime. + /// + public VelocityAccelerator VelocityAccelerator => + Accelerator.AsNotNullCast(); + + /// + /// The main kernel entry point function to be called from each velocity + /// multiprocessor during execution. + /// + internal VelocityEntryPointHandler KernelEntryPoint { get; } + + #endregion + + #region IDisposable + + /// + /// Does not perform any operation. + /// + protected override void DisposeAcceleratorObject(bool disposing) { } + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs new file mode 100644 index 000000000..e82ae0aaa --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityMemoryBuffer.cs @@ -0,0 +1,152 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityMemoryBuffer.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. 
// ---------------------------------------------------------------------------------------

using ILGPU.Runtime.CPU;
using ILGPU.Util;
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;

namespace ILGPU.Runtime.Velocity
{
    /// <summary>
    /// A memory buffer that lives in CPU space.
    /// </summary>
    public class VelocityMemoryBuffer : MemoryBuffer
    {
        #region Instance

        /// <summary>
        /// Initializes this array view source on the CPU.
        /// </summary>
        /// <param name="accelerator">The parent accelerator (if any).</param>
        /// <param name="length">The length of this source.</param>
        /// <param name="elementSize">The element size.</param>
        internal VelocityMemoryBuffer(
            Accelerator accelerator,
            long length,
            int elementSize)
            : base(accelerator, length, elementSize)
        {
            // Ensure that all element accesses will be properly aligned
            long nativeLength = length * elementSize;
            int alignmentOffset = Interop.ComputeAlignmentOffset(
                nativeLength,
                elementSize * accelerator.WarpSize);
            // Pad the length to ensure a valid buffer size
            long paddedLength = nativeLength + alignmentOffset;

            // Allocate resources and assign pointers
            NativeBufferPtr = Marshal.AllocHGlobal(new IntPtr(paddedLength));
            NativePtr = NativeBufferPtr + alignmentOffset;
        }

        #endregion

        #region Properties

        /// <summary>
        /// Returns the natively allocated underlying buffer pointer which may not be
        /// aligned in all cases.
        /// </summary>
        public IntPtr NativeBufferPtr { get; private set; }

        #endregion

        #region Methods

        /// <inheritdoc/>
        protected internal override void MemSet(
            AcceleratorStream stream,
            byte value,
            in ArrayView<byte> targetView) =>
            CPUMemoryBuffer.CPUMemSet(
                targetView.LoadEffectiveAddressAsPtr(),
                value,
                0L,
                targetView.LengthInBytes);

        /// <inheritdoc/>
        protected internal override void CopyFrom(
            AcceleratorStream stream,
            in ArrayView<byte> sourceView,
            in ArrayView<byte> targetView) =>
            CPUMemoryBuffer.CPUCopyFrom(stream, sourceView, targetView);

        /// <inheritdoc/>
        protected internal override void CopyTo(
            AcceleratorStream stream,
            in ArrayView<byte> sourceView,
            in ArrayView<byte> targetView) =>
            CPUMemoryBuffer.CPUCopyTo(stream, sourceView, targetView);

        #endregion

        #region IDisposable

        /// <summary>
        /// Disposes the underlying memory buffer.
        /// </summary>
        protected override void DisposeAcceleratorObject(bool disposing)
        {
            Marshal.FreeHGlobal(NativeBufferPtr);
            NativeBufferPtr = IntPtr.Zero;
            NativePtr = IntPtr.Zero;
        }

        #endregion

    }

    sealed class VelocityMemoryBufferPool : VelocityMemoryBuffer
    {
        #region Instance

        // Bump-allocator offset into the underlying buffer.
        private int sharedMemoryOffset;

        public VelocityMemoryBufferPool(
            VelocityAccelerator accelerator,
            int size)
            : base(accelerator, size, 1)
        { }

        #endregion

        #region Methods

        /// <summary>
        /// Resets the internal shared memory offset.
        /// </summary>
        public void Reset() => sharedMemoryOffset = 0;

        /// <summary>
        /// Gets a chunk of memory of a certain type.
        /// </summary>
        /// <param name="length">The number of elements.</param>
        /// <typeparam name="T">The element type to allocate.</typeparam>
        /// <returns>A view pointing to the right chunk of shared memory.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public ArrayView<T> Allocate<T>(int length)
            where T : unmanaged
        {
            int totalElementSize = length * Interop.SizeOf<T>();
            // NOTE(review): alignment is computed against the total allocation
            // size rather than the element size — confirm this is intended.
            int alignment = Interop.ComputeAlignmentOffset(
                sharedMemoryOffset,
                totalElementSize);
            int newOffset = sharedMemoryOffset + alignment;
            sharedMemoryOffset += alignment + totalElementSize;
            return new ArrayView<T>(this, newOffset, length);
        }

        #endregion
    }
}
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs
new file mode 100644
index 000000000..16d58dad3
--- /dev/null
+++ b/Src/ILGPU/Runtime/Velocity/VelocityParameters.cs
@@ -0,0 +1,30 @@
// ---------------------------------------------------------------------------------------
// ILGPU
// Copyright (c) 2022-2023 ILGPU Project
// www.ilgpu.net
//
// File: VelocityParameters.cs
//
// This file is part of ILGPU and is distributed under the University of Illinois Open
// Source License. See LICENSE.txt for details.
// ---------------------------------------------------------------------------------------

namespace ILGPU.Runtime.Velocity
{
    /// <summary>
    /// The base class for all velocity parameters.
    /// </summary>
    abstract class VelocityParameters
    {
        /// <summary>
        /// Does nothing at the moment.
        /// </summary>
        protected VelocityParameters() { }

        /// <summary>
        /// Dumps all parameters to the default console for debugging purposes.
+ /// + public abstract void DumpToConsole(); + } +} + diff --git a/Src/ILGPU/Runtime/Velocity/VelocityProfilingMarker.cs b/Src/ILGPU/Runtime/Velocity/VelocityProfilingMarker.cs new file mode 100644 index 000000000..19511e399 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityProfilingMarker.cs @@ -0,0 +1,76 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityProfilingMarker.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Resources; +using System; +using System.Diagnostics; + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a point-in-time marker used in Velocity profiling. + /// + internal sealed class VelocityProfilingMarker : ProfilingMarker + { + #region Instance + + internal VelocityProfilingMarker(Accelerator accelerator) + : base(accelerator) + { +#if NET7_0_OR_GREATER + Timestamp = Stopwatch.GetTimestamp(); +#else + Timestamp = DateTime.UtcNow.ToBinary(); +#endif + } + + #endregion + + #region Properties + + /// + /// The timestamp this profiling marker was created. + /// + public long Timestamp { get; private set; } + + #endregion + + #region Methods + + /// + public override void Synchronize() { } + + /// + public override TimeSpan MeasureFrom(ProfilingMarker marker) + { + using var binding = Accelerator.BindScoped(); + + return (marker is VelocityProfilingMarker startMarker) +#if NET7_0_OR_GREATER + ? Stopwatch.GetElapsedTime(startMarker.Timestamp, Timestamp) +#else + ? 
DateTime.FromBinary(Timestamp) - + DateTime.FromBinary(startMarker.Timestamp) +#endif + : throw new ArgumentException( + string.Format( + RuntimeErrorMessages.InvalidProfilingMarker, + GetType().Name, + marker.GetType().Name), + nameof(marker)); + } + + /// + protected override void DisposeAcceleratorObject(bool disposing) { } + + #endregion + } +} diff --git a/Src/ILGPU/Runtime/Velocity/VelocityStream.cs b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs new file mode 100644 index 000000000..4ea432842 --- /dev/null +++ b/Src/ILGPU/Runtime/Velocity/VelocityStream.cs @@ -0,0 +1,58 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2022-2023 ILGPU Project +// www.ilgpu.net +// +// File: VelocityStream.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents a velocity stream. + /// + sealed class VelocityStream : AcceleratorStream + { + #region Instance + + /// + /// Constructs a new Velocity stream. + /// + /// The associated accelerator. + internal VelocityStream(Accelerator accelerator) + : base(accelerator) + { } + + #endregion + + #region Methods + + /// + /// Does not perform any operation. + /// + public override void Synchronize() { } + + /// + protected override ProfilingMarker AddProfilingMarkerInternal() + { + using var binding = Accelerator.BindScoped(); + return new VelocityProfilingMarker(Accelerator); + } + + #endregion + + #region IDisposable + + /// + /// Does not perform any operation. 
+ /// + protected override void DisposeAcceleratorObject(bool disposing) { } + + #endregion + } +} + + diff --git a/Src/ILGPU/Static/Capabilities.xml b/Src/ILGPU/Static/Capabilities.xml index f0747b2b5..fd9e46052 100644 --- a/Src/ILGPU/Static/Capabilities.xml +++ b/Src/ILGPU/Static/Capabilities.xml @@ -7,6 +7,7 @@ cl_khr_fp16 + diff --git a/Src/ILGPU/Static/CapabilitiesImporter.ttinclude b/Src/ILGPU/Static/CapabilitiesImporter.ttinclude index e049fd7eb..04e98f4f1 100644 --- a/Src/ILGPU/Static/CapabilitiesImporter.ttinclude +++ b/Src/ILGPU/Static/CapabilitiesImporter.ttinclude @@ -47,13 +47,23 @@ public class Capability [XmlElement] public CLEntry OpenCL { get; set; } - public bool IsCudaOnly => Cuda != null && OpenCL == null; - public bool IsOpenCLOnly => Cuda == null && OpenCL != null; + [XmlElement] + public VelocityEntry Velocity { get; set; } + + public bool IsCudaOnly => Cuda != null && OpenCL == null && Velocity == null; + public bool IsOpenCLOnly => Cuda == null && OpenCL != null && Velocity == null; + public bool IsVelocityOnly => Velocity != null && Cuda == null && OpenCL == null; [XmlIgnore] public string ParameterName => char.ToLower(Name[0]).ToString() + Name.Substring(1); } +public class VelocityEntry +{ + [XmlAttribute("supported")] + public bool Supported { get; set; } +} + public class CudaEntry { [XmlAttribute("minPTX")] diff --git a/Src/ILGPU/Static/CapabilityContext.tt b/Src/ILGPU/Static/CapabilityContext.tt index 2fcbf7e8d..9a734d392 100644 --- a/Src/ILGPU/Static/CapabilityContext.tt +++ b/Src/ILGPU/Static/CapabilityContext.tt @@ -25,6 +25,7 @@ var commonCapabilities = capabilities.Where(x => !x.IsCudaOnly && !x.IsOpenCLOnly).ToArray(); var cudaCapabilities = capabilities.Where(x => x.IsCudaOnly).ToArray(); var clCapabilities = capabilities.Where(x => x.IsOpenCLOnly).ToArray(); +var velocityCapabilities = capabilities.Where(x => x.IsVelocityOnly).ToArray(); #> using System; using System.Collections.Immutable; @@ -85,6 +86,51 @@ namespace 
ILGPU.Runtime.CPU } } +namespace ILGPU.Runtime.Velocity +{ + /// + /// Represents capabilities available to the Velocity accelerator. + /// + public sealed class VelocityCapabilityContext : CapabilityContext + { + #region Instance + + internal VelocityCapabilityContext() + { +<# foreach (var c in velocityCapabilities.Concat(commonCapabilities)) { #> + <#= c.Name #> = <#= c.Velocity.Supported ? "true" : "false" #>; +<# } #> + } + + #endregion + + #region Properties + +<# foreach (var c in velocityCapabilities) { #> + /// + /// <#= c.Summary #> + /// + public bool <#= c.Name #> { get; internal set; } + +<# } #> + #endregion + + #region Methods + +<# foreach (var c in velocityCapabilities) { #> + /// + /// Creates exception for '<#= c.Name #>'. + /// + public static Exception GetNotSupported<#= c.Name #>Exception() => + new CapabilityNotSupportedException( + string.Format(ErrorMessages.CapabilityNotSupported, + "<#= c.FeatureName #>")); + +<# } #> + #endregion + } +} + namespace ILGPU.Runtime.Cuda { /// diff --git a/Src/ILGPU/Static/TypeInformation.ttinclude b/Src/ILGPU/Static/TypeInformation.ttinclude index ebac0d15f..be71f5eec 100644 --- a/Src/ILGPU/Static/TypeInformation.ttinclude +++ b/Src/ILGPU/Static/TypeInformation.ttinclude @@ -315,6 +315,39 @@ public class MathOpRewriter MakeExpr(Target, values); } +public class VelocityMathConfig +{ + [XmlText] + public string Implementation { get; set; } + + [XmlAttribute] + public bool SoftwareEmulation { get; set; } + + [XmlAttribute] + public bool ReturnAsWarp32 { get; set; } + + [XmlIgnore] + public bool RequiresSpecialization => + !string.IsNullOrWhiteSpace(Implementation) && + Implementation.Contains("["); + + public string GetImplementation( + string lowerAsExpression, + string suffix, + string typeName, + params string[] variables) + { + var result = new StringBuilder(Implementation); + result.Replace("[", $".{lowerAsExpression}<"); + result.Replace("]", ">()"); + result.Replace("{T}", suffix); + 
result.Replace("{Type}", typeName); + for (int i = 0; i < variables.Length; ++i) + result.Replace($"{{Value{i}}}", variables[i]); + return result.ToString(); + } +} + public class MathOp { #region Data @@ -378,7 +411,19 @@ public class MathOp { if (!HasCall) throw new InvalidOperationException(); - return Call.Split('.').Last(); + int index = Call.LastIndexOf('.'); + return index < 0 ? Call : Call.Substring(index + 1); + } + } + + [XmlIgnore] + public string MethodTypeName + { + get + { + if (!HasCall) + throw new InvalidOperationException(); + return Call.Substring(0, Call.LastIndexOf('.')); } } @@ -387,6 +432,9 @@ public class MathOp public bool HasRewriters => (Rewriters?.Length ?? 0) > 0; + [XmlElement("Velocity")] + public VelocityMathConfig Velocity { get; set; } + #endregion #region Methods diff --git a/Src/ILGPU/Static/UnaryMathOperations.xml b/Src/ILGPU/Static/UnaryMathOperations.xml index 3f18fb6e0..67bd7e912 100644 --- a/Src/ILGPU/Static/UnaryMathOperations.xml +++ b/Src/ILGPU/Static/UnaryMathOperations.xml @@ -13,6 +13,7 @@ UnaryArithmeticKind.Not) + The logical not operation. @@ -36,6 +37,7 @@ true InvertCompareValue({Location}, {Value1}) + The abs operation. @@ -47,27 +49,32 @@ {Value0} + The popcount operation. Ints IntrinsicMath.BitOperations.PopCount + The CLZ operation. Ints IntrinsicMath.BitOperations.LeadingZeroCount + The CTZ operation. Ints IntrinsicMath.BitOperations.TrailingZeroCount + The reciprocal operation. Floats IntrinsicMath.CPUOnly.Rcp {Const1} / {Value0} + @@ -75,18 +82,21 @@ Floats IntrinsicMath.CPUOnly.IsNaN {TypeName}.IsNaN({Value0}) + The is-infinity operation. Floats IntrinsicMath.CPUOnly.IsInfinity {TypeName}.IsInfinity({Value0}) + The is-finite operation. Floats IntrinsicMath.CPUOnly.IsFinite !IsNaN({Value0}) && !IsInfinity({Value0}) + @@ -94,12 +104,14 @@ Floats IntrinsicMath.CPUOnly.Sqrt {MathType}.Sqrt({Value0}) + Computes 1/sqrt(value). 
Floats IntrinsicMath.CPUOnly.Rsqrt Rcp(Sqrt({Value0})) + @@ -107,18 +119,21 @@ Floats IntrinsicMath.CPUOnly.Asin {MathType}.Asin({Value0}) + Computes sin(x). Floats IntrinsicMath.CPUOnly.Sin {MathType}.Sin({Value0}) + Computes sinh(x). Floats IntrinsicMath.CPUOnly.Sinh {MathType}.Sinh({Value0}) + @@ -126,18 +141,21 @@ Floats IntrinsicMath.CPUOnly.Acos {MathType}.Acos({Value0}) + Computes cos(x). Floats IntrinsicMath.CPUOnly.Cos {MathType}.Cos({Value0}) + Computes cosh(x). Floats IntrinsicMath.CPUOnly.Cosh {MathType}.Cosh({Value0}) + @@ -145,18 +163,21 @@ Floats IntrinsicMath.CPUOnly.Tan {MathType}.Tan({Value0}) + Computes tanh(x). Floats IntrinsicMath.CPUOnly.Tanh {MathType}.Tanh({Value0}) + Computes atan(x). Floats IntrinsicMath.CPUOnly.Atan {MathType}.Atan({Value0}) + @@ -164,12 +185,14 @@ Floats IntrinsicMath.CPUOnly.Exp {MathType}.Exp({Value0}) + Computes 2^x. Floats IntrinsicMath.CPUOnly.Exp2 {MathType}.Pow({Const2}, {Value0}) + @@ -177,12 +200,14 @@ Floats IntrinsicMath.CPUOnly.Floor {MathType}.Floor({Value0}) + Computes ceil(x). Floats IntrinsicMath.CPUOnly.Ceiling {MathType}.Ceiling({Value0}) + @@ -190,17 +215,20 @@ Floats IntrinsicMath.CPUOnly.Log {MathType}.Log({Value0}) + Computes log(x) to base 2. Floats IntrinsicMath.CPUOnly.Log2 {MathType}.Log({Value0}, {Const2}) + Computes log(x) to base 10. Floats IntrinsicMath.CPUOnly.Log10 {MathType}.Log10({Value0}) +