diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5206671c..e44c74d2c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,20 +154,26 @@ jobs: os: ${{ fromJson(needs.setup-os-matrix.outputs.os) }} library: [ILGPU, ILGPU.Algorithms, ILGPU.Analyzers] framework: [net6.0, net7.0, net8.0] - flavor: [CPU, Velocity, Velocity128] + flavor: [CPU, Velocity, Velocity128, Velocity256] exclude: - library: ILGPU.Algorithms flavor: Velocity - library: ILGPU.Algorithms flavor: Velocity128 + - library: ILGPU.Algorithms + flavor: Velocity256 - library: ILGPU.Analyzers flavor: Velocity - library: ILGPU.Analyzers flavor: Velocity128 + - library: ILGPU.Analyzers + flavor: Velocity256 - os: cuda flavor: Velocity - os: cuda flavor: Velocity128 + - os: cuda + flavor: Velocity256 - os: cuda library: ILGPU.Analyzers fail-fast: false diff --git a/.gitignore b/.gitignore index ed56d7bae..9b823d2dd 100644 --- a/.gitignore +++ b/.gitignore @@ -259,6 +259,7 @@ Src/ILGPU/Backends/PTX/PTXLibDeviceMethods.cs Src/ILGPU/Backends/PTX/PTXLibDeviceNvvm.cs Src/ILGPU/Backends/Velocity/Scalar/ScalarOperations.cs Src/ILGPU/Backends/Velocity/Vec128/Vec128Operations.cs +Src/ILGPU/Backends/Velocity/Vec256/Vec256Operations.cs Src/ILGPU/Backends/Velocity/VelocityIntrinsics.Generated.cs Src/ILGPU/Frontend/Intrinsic/RemappedIntrinsics.Generated.cs Src/ILGPU/HalfConversion.cs @@ -345,6 +346,7 @@ Src/ILGPU.Tests.Cuda/Configurations.cs Src/ILGPU.Tests.OpenCL/Configurations.cs Src/ILGPU.Tests.Velocity/Configurations.cs Src/ILGPU.Tests.Velocity128/Configurations.cs +Src/ILGPU.Tests.Velocity256/Configurations.cs # Generated test source files (Algorithms) Src/ILGPU.Algorithms.Tests/Generic/ConfigurationBase.cs diff --git a/Src/ILGPU.Tests.Velocity256/.editorconfig b/Src/ILGPU.Tests.Velocity256/.editorconfig new file mode 100644 index 000000000..09af4abce --- /dev/null +++ b/Src/ILGPU.Tests.Velocity256/.editorconfig @@ -0,0 +1,7 @@ +[*.cs] + +# CA1707: Identifiers should not 
contain underscores +dotnet_diagnostic.CA1707.severity = none + +# CA1014: Mark assemblies with CLSCompliant +dotnet_diagnostic.CA1014.severity = none diff --git a/Src/ILGPU.Tests.Velocity256/Configurations.tt b/Src/ILGPU.Tests.Velocity256/Configurations.tt new file mode 100644 index 000000000..2d8e81f01 --- /dev/null +++ b/Src/ILGPU.Tests.Velocity256/Configurations.tt @@ -0,0 +1,60 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: Configurations.tt/Configurations.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +<#@ template debug="false" hostspecific="true" language="C#" #> +<#@ include file="../ILGPU.Tests/Generic/ConfigurationBase.tt" #> +<#@ assembly name="System.Core" #> +<#@ import namespace="System.IO" #> +using Xunit; +using Xunit.Abstractions; + +<# +var configurationFile = Host.ResolvePath("../ILGPU.Tests/Configurations.txt"); +var configurations = TestConfig.Parse(configurationFile); +#> + +#if NET7_0_OR_GREATER + +namespace ILGPU.Tests.Velocity256 +{ +<# foreach (var (test, level, collection) in configurations) { #> +<# var name = $"Velocity256{test}_{level}"; #> + [Collection("Velocity256ContextCollection<#= collection #>")] + public sealed partial class <#= name #> : <#= test #> + { + public <#= name #>( + ITestOutputHelper output, + Velocity256TestContext<#= collection #> testContext) + : base(output, testContext) + { } + } + +<# } #> +<# foreach (var (config, level) in TestConfig.AllConfigurations) { #> + public class Velocity256TestContext<#= config #> : Velocity256TestContext + { + public Velocity256TestContext<#= config #>() + : base( + OptimizationLevel.<#= level #>, + enableAssertions: true, + forceDebugConfig: true, + _ => { }) + { } + 
} + + [CollectionDefinition("Velocity256ContextCollection<#= config #>")] + public class Velocity256ContextCollection<#= config #> : + ICollectionFixture> { } + +<# } #> +} + +#endif \ No newline at end of file diff --git a/Src/ILGPU.Tests.Velocity256/ILGPU.Tests.Velocity256.csproj b/Src/ILGPU.Tests.Velocity256/ILGPU.Tests.Velocity256.csproj new file mode 100644 index 000000000..d7a492f70 --- /dev/null +++ b/Src/ILGPU.Tests.Velocity256/ILGPU.Tests.Velocity256.csproj @@ -0,0 +1,60 @@ + + + + $(LibraryUnitTestTargetFrameworks) + false + + + + $(MSBuildProjectDirectory)\..\ILGPU.Tests\.test.runsettings + + + + true + AllEnabledByDefault + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + True + True + Configurations.tt + + + + + + TextTemplatingFileGenerator + Configurations.cs + + + + + + + + + + True + True + Configurations.tt + + + + diff --git a/Src/ILGPU.Tests.Velocity256/TestContext.cs b/Src/ILGPU.Tests.Velocity256/TestContext.cs new file mode 100644 index 000000000..256baf226 --- /dev/null +++ b/Src/ILGPU.Tests.Velocity256/TestContext.cs @@ -0,0 +1,51 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: TestContext.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Runtime.Velocity; +using System; + +#if NET7_0_OR_GREATER + +namespace ILGPU.Tests.Velocity256 +{ + /// + /// An abstract test context for Velocity accelerators. + /// + public abstract class Velocity256TestContext : TestContext + { + /// + /// Creates a new test context instance. + /// + /// The optimization level to use. + /// + /// Enables use of assertions. 
+ /// + /// + /// Forces use of debug configuration in O1 and O2 builds. + /// + /// The context preparation handler. + protected Velocity256TestContext( + OptimizationLevel optimizationLevel, + bool enableAssertions, + bool forceDebugConfig, + Action prepareContext) + : base( + optimizationLevel, + enableAssertions, + forceDebugConfig, + builder => prepareContext( + builder.Velocity(VelocityDeviceType.Vector256)), + context => context.CreateVelocityAccelerator()) + { } + } +} + +#endif diff --git a/Src/ILGPU.sln b/Src/ILGPU.sln index 89fc63ff4..7cd27280a 100644 --- a/Src/ILGPU.sln +++ b/Src/ILGPU.sln @@ -36,6 +36,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ILGPU.Analyzers", "ILGPU.An EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ILGPU.Tests.Velocity128", "ILGPU.Tests.Velocity128\ILGPU.Tests.Velocity128.csproj", "{422BA1AE-858D-4AA4-815B-CF42A429D305}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ILGPU.Tests.Velocity256", "ILGPU.Tests.Velocity256\ILGPU.Tests.Velocity256.csproj", "{F24B884D-A64B-4511-85B6-FEEDA92CBBA1}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -98,6 +100,10 @@ Global {422BA1AE-858D-4AA4-815B-CF42A429D305}.Debug|Any CPU.Build.0 = Debug|Any CPU {422BA1AE-858D-4AA4-815B-CF42A429D305}.Release|Any CPU.ActiveCfg = Release|Any CPU {422BA1AE-858D-4AA4-815B-CF42A429D305}.Release|Any CPU.Build.0 = Release|Any CPU + {F24B884D-A64B-4511-85B6-FEEDA92CBBA1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F24B884D-A64B-4511-85B6-FEEDA92CBBA1}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F24B884D-A64B-4511-85B6-FEEDA92CBBA1}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F24B884D-A64B-4511-85B6-FEEDA92CBBA1}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -114,6 +120,7 @@ Global {4AFD2AAD-FA52-43EA-B9A8-10E948F9A139} = {7701FE3C-4187-401C-9612-44667203B0E5} 
{B0101B27-F153-4041-8DEE-741B651453D5} = {7701FE3C-4187-401C-9612-44667203B0E5} {422BA1AE-858D-4AA4-815B-CF42A429D305} = {7701FE3C-4187-401C-9612-44667203B0E5} + {F24B884D-A64B-4511-85B6-FEEDA92CBBA1} = {7701FE3C-4187-401C-9612-44667203B0E5} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {22270DEE-D42D-479D-A76F-B2E7A5F7C949} diff --git a/Src/ILGPU/Backends/Velocity/Vec256/Vec256.cs b/Src/ILGPU/Backends/Velocity/Vec256/Vec256.cs new file mode 100644 index 000000000..fd5205282 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/Vec256/Vec256.cs @@ -0,0 +1,413 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: Vec256.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Backends.IL; +using ILGPU.IR.Values; +using ILGPU.Runtime.Velocity; +using System; +using System.Reflection.Emit; + +#if NET7_0_OR_GREATER + +namespace ILGPU.Backends.Velocity.Vec256 +{ + sealed class Vec256 : VelocityTargetSpecializer + { + #region Instance & General Methods + + public Vec256() + : base( + Vec256Operations.WarpSize, + Vec256Operations.WarpType32, + Vec256Operations.WarpType64) + { } + + public override VelocityTypeGenerator CreateTypeGenerator( + VelocityCapabilityContext capabilityContext, + RuntimeSystem runtimeSystem) => + new Vec256TypeGenerator(capabilityContext, runtimeSystem); + + #endregion + + #region General + + public override void LoadLaneIndexVector32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.LoadLaneIndexVector32Method); + + public override void LoadLaneIndexVector64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.LoadLaneIndexVector64Method); + + public override void 
LoadWarpSizeVector32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.LoadVectorLengthVector32Method); + + public override void LoadWarpSizeVector64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.LoadVectorLengthVector64Method); + + #endregion + + #region Masks + + public override void PushAllLanesMask32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.LoadAllLanesMask32Method); + + public override void PushNoLanesMask32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.LoadNoLanesMask32Method); + + public override void ConvertMask32To64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.GetConvert32To64Operation( + VelocityWarpOperationMode.I)); + + public override void ConvertMask64To32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.GetConvert64To32Operation( + VelocityWarpOperationMode.I)); + + public override void IntersectMask32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.GetBinaryOperation32( + BinaryArithmeticKind.And, + VelocityWarpOperationMode.U)); + + public override void IntersectMask64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.GetBinaryOperation64( + BinaryArithmeticKind.And, + VelocityWarpOperationMode.U)); + + public override void UnifyMask32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.GetBinaryOperation32( + BinaryArithmeticKind.Or, + VelocityWarpOperationMode.U)); + + public override void UnifyMask64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.GetBinaryOperation64( + BinaryArithmeticKind.Or, + VelocityWarpOperationMode.U)); + + public override void NegateMask32(TILEmitter emitter) + { + PushAllLanesMask32(emitter); + BinaryOperation32( + emitter, + BinaryArithmeticKind.Xor, + VelocityWarpOperationMode.U); + } + + public override void NegateMask64(TILEmitter emitter) + { + PushAllLanesMask32(emitter); + ConvertMask32To64(emitter); + BinaryOperation64( + emitter, + BinaryArithmeticKind.Xor, + VelocityWarpOperationMode.U); + } + + 
public override void CheckForAnyActiveLaneMask(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.CheckForAnyActiveLaneMethod); + + public override void CheckForNoActiveLaneMask(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.CheckForNoActiveLaneMethod); + + public override void CheckForEqualMasks(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.CheckForEqualMasksMethod); + + public override void GetNumberOfActiveLanes(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.GetNumberOfActiveLanesMethod); + + public override void ConditionalSelect32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Select32Method); + + public override void ConditionalSelect64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Select64Method); + + #endregion + + #region Scalar Values + + public override void LoadWarpSize32(TILEmitter emitter) => + emitter.EmitConstant(WarpSize); + + public override void LoadWarpSize64(TILEmitter emitter) => + emitter.EmitConstant((long)WarpSize); + + public override void ConvertBoolScalar(TILEmitter emitter, bool value) + { + emitter.Emit(value ? 
OpCodes.Ldc_I4_M1 : OpCodes.Ldc_I4_0); + ConvertScalarTo32(emitter, VelocityWarpOperationMode.I); + } + + public override void ConvertScalarTo32( + TILEmitter emitter, + VelocityWarpOperationMode mode) + { + switch (mode) + { + case VelocityWarpOperationMode.I: + emitter.EmitCall(Vec256Operations.FromScalarI32Method); + break; + case VelocityWarpOperationMode.U: + emitter.EmitCall(Vec256Operations.FromScalarU32Method); + return; + case VelocityWarpOperationMode.F: + emitter.EmitCall(Vec256Operations.FromScalarF32Method); + break; + default: + throw new NotSupportedException(); + } + } + + public override void ConvertScalarTo64( + TILEmitter emitter, + VelocityWarpOperationMode mode) + { + switch (mode) + { + case VelocityWarpOperationMode.I: + emitter.EmitCall(Vec256Operations.FromScalarI64Method); + break; + case VelocityWarpOperationMode.U: + emitter.EmitCall(Vec256Operations.FromScalarU64Method); + return; + case VelocityWarpOperationMode.F: + emitter.EmitCall(Vec256Operations.FromScalarF64Method); + break; + default: + throw new NotSupportedException(); + } + } + + #endregion + + #region Comparisons + + public override void Compare32( + TILEmitter emitter, + CompareKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetCompareOperation32(kind, mode)); + + public override void Compare64( + TILEmitter emitter, + CompareKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetCompareOperation64(kind, mode)); + + #endregion + + #region Conversions + + public override void ConvertSoftware32( + TILEmitter emitter, + ArithmeticBasicValueType sourceType, + ArithmeticBasicValueType targetType) => + emitter.EmitCall(Vec256Operations.GetConvertOperation32( + sourceType, + targetType)); + + public override void ConvertSoftware64( + TILEmitter emitter, + ArithmeticBasicValueType sourceType, + ArithmeticBasicValueType targetType) => + emitter.EmitCall(Vec256Operations.GetConvertOperation64( + sourceType, + 
targetType)); + + public override void Convert32( + TILEmitter emitter, + VelocityWarpOperationMode source, + VelocityWarpOperationMode target) => + emitter.EmitCall(Vec256Operations.GetConvertOperation32( + source.GetArithmeticBasicValueType(is64Bit: false), + target.GetArithmeticBasicValueType(is64Bit: false))); + + public override void Convert64( + TILEmitter emitter, + VelocityWarpOperationMode source, + VelocityWarpOperationMode target) => + emitter.EmitCall(Vec256Operations.GetConvertOperation64( + source.GetArithmeticBasicValueType(is64Bit: true), + target.GetArithmeticBasicValueType(is64Bit: true))); + + public override void Convert32To64( + TILEmitter emitter, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetConvert32To64Operation(mode)); + + public override void Convert64To32( + TILEmitter emitter, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetConvert64To32Operation(mode)); + + #endregion + + #region Arithmetics + + public override void UnaryOperation32( + TILEmitter emitter, + UnaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetUnaryOperation32(kind, mode)); + + public override void UnaryOperation64( + TILEmitter emitter, + UnaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetUnaryOperation64(kind, mode)); + + public override void BinaryOperation32( + TILEmitter emitter, + BinaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetBinaryOperation32(kind, mode)); + + public override void BinaryOperation64( + TILEmitter emitter, + BinaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetBinaryOperation64(kind, mode)); + + public override void TernaryOperation32( + TILEmitter emitter, + TernaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetTernaryOperation32(kind, 
mode)); + + public override void TernaryOperation64( + TILEmitter emitter, + TernaryArithmeticKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetTernaryOperation64(kind, mode)); + + #endregion + + #region Atomics + + public override void AtomicCompareExchange32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.AtomicCompareExchange32Method); + + public override void AtomicCompareExchange64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.AtomicCompareExchange64Method); + + public override void Atomic32( + TILEmitter emitter, + AtomicKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetAtomicOperation32(kind, mode)); + + public override void Atomic64( + TILEmitter emitter, + AtomicKind kind, + VelocityWarpOperationMode mode) => + emitter.EmitCall(Vec256Operations.GetAtomicOperation64(kind, mode)); + + #endregion + + #region Threads + + public override void BarrierPopCount32(TILEmitter emitter) + { + emitter.Emit(OpCodes.Pop); + emitter.EmitCall(Vec256Operations.BarrierPopCount32Method); + } + + public override void BarrierAnd32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.BarrierAnd32Method); + + public override void BarrierOr32(TILEmitter emitter) + { + emitter.Emit(OpCodes.Pop); + emitter.EmitCall(Vec256Operations.BarrierOr32Method); + } + + public override void Shuffle32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Shuffle32Method); + + public override void ShuffleUp32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.ShuffleUp32Method); + + public override void SubShuffleUp32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.SubShuffleUp32Method); + + public override void ShuffleDown32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.ShuffleDown32Method); + + public override void SubShuffleDown32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.SubShuffleDown32Method); + + public override void 
ShuffleXor32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.ShuffleXor32Method); + + public override void SubShuffleXor32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.SubShuffleXor32Method); + + #endregion + + #region IO + + public override void Load8(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Load8Method); + + public override void Load16(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Load16Method); + + public override void Load32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Load32Method); + + public override void Load64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Load64Method); + + public override void Store8(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Store8Method); + + public override void Store16(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Store16Method); + + public override void Store32(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Store32Method); + + public override void Store64(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.Store64Method); + + #endregion + + #region Misc + + public override void DebugAssertFailed(TILEmitter emitter) => + emitter.EmitCall(Vec256Operations.DebugAssertFailedMethod); + + public override void WriteToOutput(TILEmitter emitter) => + throw new NotSupportedException(); + + public override void DumpWarp32( + TILEmitter emitter, + string? label = null) + { + if (string.IsNullOrEmpty(label)) + emitter.EmitConstant(string.Empty); + else + emitter.EmitConstant(label + ": "); + emitter.EmitCall(Vec256Operations.DumpWarp32Method); + } + + public override void DumpWarp64( + TILEmitter emitter, + string? 
label = null)
+        {
+            if (string.IsNullOrEmpty(label))
+                emitter.EmitConstant(string.Empty);
+            else
+                emitter.EmitConstant(label + ": ");
+            emitter.EmitCall(Vec256Operations.DumpWarp64Method);
+        }
+
+        #endregion
+    }
+}
+
+#endif
diff --git a/Src/ILGPU/Backends/Velocity/Vec256/Vec256Extensions.cs b/Src/ILGPU/Backends/Velocity/Vec256/Vec256Extensions.cs
new file mode 100644
index 000000000..60154e6ce
--- /dev/null
+++ b/Src/ILGPU/Backends/Velocity/Vec256/Vec256Extensions.cs
@@ -0,0 +1,224 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2024 ILGPU Project
+// www.ilgpu.net
+//
+// File: Vec256Extensions.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+#if NET7_0_OR_GREATER
+
+namespace ILGPU.Backends.Velocity.Vec256
+{
+    partial class Vec256Operations
+    {
+        #region Rcp
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 RcpImpl(Vector256 value) =>
+            Avx.IsSupported
+                ? Avx.Reciprocal(value)
+                : Vector256.Create(1.0f) / value; // software fallback: 1 / x
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 RcpImpl(Vector256 value) =>
+            Vector256.Create(1.0) / value;
+
+        #endregion
+
+        #region Rsqrt
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 RsqrtImpl(Vector256 value) =>
+            Avx.IsSupported
+                ? Avx.ReciprocalSqrt(value)
+                : Vector256.Create(1.0f) / Vector256.Sqrt(value); // fallback: 1 / sqrt(x)
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 RsqrtImpl(Vector256 value) =>
+            Vector256.Create(1.0) / Vector256.Sqrt(value); // 1 / sqrt(x), not 1 / x
+
+        #endregion
+
+        #region FMA
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 FMAImpl(
+            Vector256 a,
+            Vector256 b,
+            Vector256 c) => a * b + c;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 FMAImpl(
+            Vector256 a,
+            Vector256 b,
+            Vector256 c) => a * b + c;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 FMAImpl(
+            Vector256 a,
+            Vector256 b,
+            Vector256 c) => a * b + c;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 FMAImpl(
+            Vector256 a,
+            Vector256 b,
+            Vector256 c) => a * b + c;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 FMAImpl(
+            Vector256 a,
+            Vector256 b,
+            Vector256 c) =>
+            Fma.IsSupported
+                ? Fma.MultiplyAdd(a, b, c)
+                : a * b + c;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 FMAImpl(
+            Vector256 a,
+            Vector256 b,
+            Vector256 c) =>
+            Fma.IsSupported
+                ? 
Fma.MultiplyAdd(a, b, c)
+                : a * b + c;
+
+        #endregion
+
+        #region Thread Operations
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int BarrierPopCount32Scalar(
+            Vector256 mask,
+            Vector256 warp) =>
+            -Vector256.Sum(AndI32(mask, warp)); // an all-bits-set lane sums as -1
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 BarrierPopCount32(
+            Vector256 mask,
+            Vector256 warp) =>
+            Vector256.Create(BarrierPopCount32Scalar(mask, warp));
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int BarrierPopCount64Scalar(
+            Vector256 mask,
+            (Vector256, Vector256) warp)
+        {
+            var parts = AndI64(Convert32To64I(mask), warp);
+            return -(int)( // negate: every counted lane contributes -1
+                Vector256.Sum(parts.Item1) + // first half of the warp
+                Vector256.Sum(parts.Item2)); // second half is added, not subtracted
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static (Vector256, Vector256) BarrierPopCount64(
+            Vector256 mask,
+            (Vector256, Vector256) warp) =>
+            FromScalarI64(BarrierPopCount64Scalar(mask, warp));
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        internal static Vector256 BarrierAnd32(
+            Vector256 mask,
+            Vector256 warp,
+            int groupSize) =>
+            BarrierPopCount32Scalar(mask, warp) == groupSize
+                ? Vector256.AllBitsSet
+                : Vector256.Zero;
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        internal static (Vector256, Vector256) BarrierAnd64(
+            Vector256 mask,
+            (Vector256, Vector256) warp,
+            int groupSize) =>
+            BarrierPopCount64Scalar(mask, warp) == groupSize
+                ? (Vector256.AllBitsSet, Vector256.AllBitsSet)
+                : (Vector256.Zero, Vector256.Zero);
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256 BarrierOr32(
+            Vector256 mask,
+            Vector256 warp) =>
+            BarrierPopCount32Scalar(mask, warp) != 0
+                ? Vector256.AllBitsSet
+                : Vector256.Zero;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static (Vector256, Vector256) BarrierOr64(
+            Vector256 mask,
+            (Vector256, Vector256) warp) =>
+            BarrierPopCount64Scalar(mask, warp) != 0
+                ? 
(Vector256.AllBitsSet, Vector256.AllBitsSet) + : (Vector256.Zero, Vector256.Zero); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 Broadcast32( + Vector256 _, + Vector256 value, + Vector256 sourceLane) + { + // Extract base source lane + int sourceLaneIndex = sourceLane.GetElement(0); + + // Broadcast without referring to the current mask + return Broadcast32Internal(value, sourceLaneIndex); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 Broadcast32Internal( + Vector256 value, + int sourceLaneIndex) => + Vector256.Create(value[sourceLaneIndex]); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static (Vector256, Vector256) Broadcast64( + Vector256 _, + (Vector256, Vector256) value, + (Vector256, Vector256) sourceLane) => + throw new NotImplementedException(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 Shuffle32( + Vector256 _, + Vector256 value, + Vector256 sourceLanes) + { + var lanes = MinI32( + MaxI32(sourceLanes, Vector256.Zero), + WarpSizeM1Vector); + + int value0 = value.GetElement(lanes.GetElement(0)); + int value1 = value.GetElement(lanes.GetElement(1)); + int value2 = value.GetElement(lanes.GetElement(2)); + int value3 = value.GetElement(lanes.GetElement(3)); + int value4 = value.GetElement(lanes.GetElement(4)); + int value5 = value.GetElement(lanes.GetElement(5)); + int value6 = value.GetElement(lanes.GetElement(6)); + int value7 = value.GetElement(lanes.GetElement(7)); + + return Vector256.Create( + value0, value1, value2, value3, + value4, value5, value6, value7); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static (Vector256, Vector256) Shuffle64( + Vector256 _, + (Vector256, Vector256) value, + Vector256 sourceLanes) => + throw new NotImplementedException(); + + #endregion + } +} + +#endif diff --git a/Src/ILGPU/Backends/Velocity/Vec256/Vec256Operations.tt 
b/Src/ILGPU/Backends/Velocity/Vec256/Vec256Operations.tt new file mode 100644 index 000000000..61147f3e2 --- /dev/null +++ b/Src/ILGPU/Backends/Velocity/Vec256/Vec256Operations.tt @@ -0,0 +1,1470 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: Vec256Operations.tt/Vec256Operations.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +<#@ template debug="false" hostspecific="true" language="C#" #> +<#@ include file="../VelocityOperations.ttinclude" #> +<#@ assembly name="System.Core" #> +<#@ import namespace="System.Linq" #> +<#@ import namespace="System.Text" #> +<#@ import namespace="System.Collections.Generic" #> +<#@ output extension=".cs" #> +<# +string rootPath = Host.ResolvePath("../../../Static"); +var unaryOps = GetUnaryMathOps(rootPath); +var binaryOps = GetBinaryMathOps(rootPath); +var ternaryOps = GetTernaryMathOps(rootPath); +var compareOperations = new (string, string)[] +{ + ("Equal", "Vector256.Equals({0}, {1}).AsInt32()"), + ("NotEqual", "NotI32(Vector256.Equals({0}, {1}).AsInt32())"), + ("LessThan", "Vector256.LessThan({0}, {1}).AsInt32()"), + ("LessEqual", "Vector256.LessThanOrEqual({0}, {1}).AsInt32()"), + ("GreaterThan", "Vector256.GreaterThan({0}, {1}).AsInt32()"), + ("GreaterEqual", "Vector256.GreaterThanOrEqual({0}, {1}).AsInt32()") +}; +var acceleratedConvTypes32 = new (TypeInformation Left, TypeInformation Right, string Op)[] +{ + (SignedIntTypes[2], FloatTypes[1], "Vector256.ConvertToSingle"), + (UnsignedIntTypes[2], FloatTypes[1], "Vector256.ConvertToSingle"), + (FloatTypes[1], SignedIntTypes[2], "Vector256.ConvertToInt32"), + (FloatTypes[1], UnsignedIntTypes[2], "Vector256.ConvertToUInt32"), +}; +var acceleratedConvTypes64 = new 
(TypeInformation Left, TypeInformation Right, string Op)[] +{ + (SignedIntTypes[3], FloatTypes[2], "Vector256.ConvertToDouble"), + (UnsignedIntTypes[3], FloatTypes[2], "Vector256.ConvertToDouble"), + (FloatTypes[2], SignedIntTypes[3], "Vector256.ConvertToInt64"), + (FloatTypes[2], UnsignedIntTypes[3], "Vector256.ConvertToUInt64"), +}; + +int warpSize = 8; +string GetWarpTypeName32(string elementTypeName) => $"Vector256<{elementTypeName}>"; +string GetWarpTypeName64(string elementTypeName) => + $"({GetWarpTypeName32(elementTypeName)}, {GetWarpTypeName32(elementTypeName)})"; +string GetItemRef64(string name, int counter) => counter < 4 + ? $"{name}.Item1.GetElement({counter})" + : $"{name}.Item2.GetElement({counter - 4})"; +string GetCastIToX32(string prefix, string variable) => + prefix != "I" ? $"CastITo{prefix}32({variable})" : variable; +string GetCastIToX64(string prefix, string variable) => + prefix != "I" ? $"CastITo{prefix}64({variable})" : variable; +string GetCastXToI32(string prefix, string variable) => + prefix != "I" ? $"Cast{prefix}ToI32({variable})" : variable; +string GetCastXToI64(string prefix, string variable) => + prefix != "I" ? 
$"Cast{prefix}ToI64({variable})" : variable; + +var warpType32 = GetWarpTypeName32("int"); +var warpType64 = GetWarpTypeName64("long"); +string inliningAttribute = "AggressiveInlining"; +#> +using ILGPU.IR.Values; +using ILGPU.Util; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Numerics; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; + +// ReSharper disable ArrangeMethodOrOperatorBody +// ReSharper disable RedundantCast +// disable: max_line_length + +#if NET7_0_OR_GREATER + +namespace ILGPU.Backends.Velocity.Vec256 +{ + // Operation implementations + + static partial class Vec256Operations + { + #region Warp Types + + public static int WarpSize => Vector256.Count; + public static readonly Type WarpType32 = typeof(<#= warpType32 #>); + public static readonly Type WarpType64 = typeof(<#= warpType64 #>); + + #endregion + + #region Initialization + + static Vec256Operations() + { + InitUnaryOperations(); + InitBinaryOperations(); + InitTernaryOperations(); + InitializeCompareOperations(); + InitializeConvertOperations(); + InitializeVectorConvertOperations(); + InitializeAtomicOperations(); + } + + private static readonly Vector256 WarpSizeM1Vector = + Vector256.Create(WarpSize - 1); + + internal static MethodInfo GetMethod(string name) => + typeof(Vec256Operations).GetMethod( + name, + BindingFlags.NonPublic | BindingFlags.Static) + .AsNotNull(); + + #endregion + + #region Creation + + [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)] + internal static Vector256 CastWarp32(Vector256 source) + where T : struct + where TTarget : struct => + source.As(); + + [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)] + internal static (Vector256, Vector256) CastWarp64( + (Vector256, Vector256) source) + where T : struct + where TTarget : struct => + (source.Item1.As(), source.Item2.As()); + + 
<# /* Lane-mask helpers, constant warp vectors and generated per-type operation tables. */ #>
        /// <summary>
        /// Widens a 32-bit lane mask into the pair of 64-bit vectors used by 64-bit warps.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> MaskTo64(Vector256<int> mask) =>
            Vector256.Widen(mask);

        /// <summary>
        /// Returns true if at least one lane of the given mask has all bits set.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static bool CheckForAnyActiveLane(<#= warpType32 #> warp) =>
            Vector256.EqualsAny(<#= warpType32 #>.AllBitsSet, warp);

        /// <summary>
        /// Returns true if no lane of the given mask has all bits set.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static bool CheckForNoActiveLane(<#= warpType32 #> warp) =>
            !CheckForAnyActiveLane(warp);

        /// <summary>
        /// Returns true if both masks are equal in every lane.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static bool CheckForEqualMasks(
            <#= warpType32 #> firstMask,
            <#= warpType32 #> secondMask) =>
            Vector256.EqualsAll(firstMask, secondMask);

        /// <summary>
        /// Returns the negated sum of all lanes. For a mask whose active lanes hold -1
        /// (all bits set) and whose inactive lanes hold 0 this is the active lane count.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static int GetNumberOfActiveLanes(<#= warpType32 #> warp) =>
            -Vector256.Sum(warp);

        public static readonly MethodInfo CheckForAnyActiveLaneMethod =
            GetMethod(nameof(CheckForAnyActiveLane));
        public static readonly MethodInfo CheckForNoActiveLaneMethod =
            GetMethod(nameof(CheckForNoActiveLane));
        public static readonly MethodInfo CheckForEqualMasksMethod =
            GetMethod(nameof(CheckForEqualMasks));
        public static readonly MethodInfo GetNumberOfActiveLanesMethod =
            GetMethod(nameof(GetNumberOfActiveLanes));

        // Lane indices 0..7; the 64-bit variant stores lanes 0..3 in Item1 and
        // lanes 4..7 in Item2.
        private static readonly <#= warpType32 #> LaneIndexVector32 =
            Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7);

        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> LoadLaneIndexVector32() => LaneIndexVector32;

        private static readonly <#= warpType64 #> LaneIndexVector64 =
            (Vector256.Create(0L, 1L, 2L, 3L), Vector256.Create(4L, 5L, 6L, 7L));

        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> LoadLaneIndexVector64() => LaneIndexVector64;

        public static readonly MethodInfo LoadLaneIndexVector32Method =
            GetMethod(nameof(LoadLaneIndexVector32));
        public static readonly MethodInfo LoadLaneIndexVector64Method =
            GetMethod(nameof(LoadLaneIndexVector64));

        // Vector length broadcast to all lanes.
        private static readonly <#= warpType32 #> LaneLengthVector32 =
            Vector256.Create(Vector<int>.Count);

        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> LoadVectorLengthVector32() => LaneLengthVector32;

        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> LoadVectorLengthVector64()
        {
            long count = Vector<int>.Count;
            return (Vector256.Create(count), Vector256.Create(count));
        }

        public static readonly MethodInfo LoadVectorLengthVector32Method =
            GetMethod(nameof(LoadVectorLengthVector32));
        public static readonly MethodInfo LoadVectorLengthVector64Method =
            GetMethod(nameof(LoadVectorLengthVector64));

        /// <summary>
        /// Returns a 32-bit mask with all bits set in every lane.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> LoadAllLanesMask32() =>
            <#= warpType32 #>.AllBitsSet;

        /// <summary>
        /// Returns a 64-bit mask pair with all bits set in every lane.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> LoadAllLanesMask64() =>
            (Vector256<long>.AllBitsSet, Vector256<long>.AllBitsSet);

        public static readonly MethodInfo LoadAllLanesMask32Method =
            GetMethod(nameof(LoadAllLanesMask32));
        public static readonly MethodInfo LoadAllLanesMask64Method =
            GetMethod(nameof(LoadAllLanesMask64));

        /// <summary>
        /// Returns a 32-bit mask with all lanes cleared.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> LoadNoLanesMask32() => default;

        /// <summary>
        /// Returns a 64-bit mask pair with all lanes cleared.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> LoadNoLanesMask64() => default;

        public static readonly MethodInfo LoadNoLanesMask32Method =
            GetMethod(nameof(LoadNoLanesMask32));
        public static readonly MethodInfo LoadNoLanesMask64Method =
            GetMethod(nameof(LoadNoLanesMask64));

        #endregion

        #region Generic Casts

        // Bit-preserving reinterpretation between the int/long warp representation and
        // the typed vectors of each implementation type.

<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> Cast<#= prefix #>ToI32(
            <#= GetWarpTypeName32(typeName) #> input) =>
            CastWarp32<<#= typeName #>, int>(input);

<#      if (typeName != "int") { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= GetWarpTypeName32(typeName) #> CastITo<#= prefix #>32(
            <#= warpType32 #> input) =>
            CastWarp32<int, <#= typeName #>>(input);

        public static readonly MethodInfo Cast<#= prefix #>ToI32Method =
            GetMethod(nameof(Cast<#= prefix #>ToI32));

<#      } #>
        public static readonly MethodInfo CastITo<#= prefix #>32Method =
            GetMethod(nameof(CastITo<#= prefix #>32));

<# } #>

<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> Cast<#= prefix #>ToI64(
            <#= GetWarpTypeName64(typeName) #> input) =>
            CastWarp64<<#= typeName #>, long>(input);

<#      if (typeName != "long") { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= GetWarpTypeName64(typeName) #> CastITo<#= prefix #>64(
            <#= warpType64 #> input) =>
            CastWarp64<long, <#= typeName #>>(input);

        public static readonly MethodInfo Cast<#= prefix #>ToI64Method =
            GetMethod(nameof(Cast<#= prefix #>ToI64));
<#      } #>

        public static readonly MethodInfo CastITo<#= prefix #>64Method =
            GetMethod(nameof(CastITo<#= prefix #>64));

<# } #>

        #endregion

        #region Scalar Operations

        // Broadcasts a scalar to every lane and returns it in the int/long warp
        // representation.

<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> FromScalar<#= prefix #>32(<#= typeName #> scalar)
        {
            var result = Vector256.Create(scalar);
            return <#= GetCastXToI32(prefix, "result") #>;
        }

        public static readonly MethodInfo FromScalar<#= prefix #>32Method =
            GetMethod(nameof(FromScalar<#= prefix #>32));

<# } #>

<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> FromScalar<#= prefix #>64(<#= typeName #> scalar)
        {
            var result = Vector256.Create(scalar);
            return <#= GetCastXToI64(prefix, "(result, result)") #>;
        }

        public static readonly MethodInfo FromScalar<#= prefix #>64Method =
            GetMethod(nameof(FromScalar<#= prefix #>64));

<# } #>

        #endregion

        #region Select Operations

        /// <summary>
        /// Selects bits from <paramref name="right"/> where the mask is set and from
        /// <paramref name="left"/> where it is cleared.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> Select32(
            <#= warpType32 #> mask,
            <#= warpType32 #> left,
            <#= warpType32 #> right) =>
            Vector256.ConditionalSelect(mask, right, left);

        /// <summary>
        /// 64-bit variant of the select operation; the 32-bit mask is widened first
        /// and both halves are selected independently.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> Select64(
            <#= warpType32 #> mask,
            <#= warpType64 #> left,
            <#= warpType64 #> right)
        {
            var mask64 = MaskTo64(mask);
            return (
                Vector256.ConditionalSelect(mask64.Item1, right.Item1, left.Item1),
                Vector256.ConditionalSelect(mask64.Item2, right.Item2, left.Item2));
        }

        public static readonly MethodInfo Select32Method = GetMethod(nameof(Select32));
        public static readonly MethodInfo Select64Method = GetMethod(nameof(Select64));

        #endregion

        #region Unary Operations

        // One method per unary operation and supported implementation type. Software
        // emulated operations are expanded lane by lane; predicates produce -1 / 0
        // per lane and are always returned as a 32-bit warp.

<# foreach (var op in unaryOps) { #>
<#      foreach (var (_, prefix, typeName, _) in
            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
<#          var velocity = op.Velocity.Velocity256; #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32(
            <#= warpType32 #> warp)
        {
            var value = <#= GetCastIToX32(prefix, "warp") #>;
<#          if (velocity.SoftwareEmulation) { #>
            var result = Vector256.Create(
                <#= velocity.GetImplementation(op, 0, "value") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 1, "value") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 2, "value") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 3, "value") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 4, "value") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 5, "value") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 6, "value") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 7, "value") #>
                <#= op.IsPredicate ? " ? -1 : 0" : "" #>);
<#          } else { #>
            var result = <#= velocity.GetImplementation(op, variables: "value") #>;
<#          } #>
<#          if (!op.IsPredicate && !op.Velocity.ReturnAsWarp32) { #>
            return <#= GetCastXToI32(prefix, "result") #>;
<#          } else { #>
            return result;
<#          } #>
        }

<#      } #>
<#      foreach (var (_, prefix, typeName, _) in
            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
<#          bool use32BitResult = op.Velocity.ReturnAsWarp32 | op.IsPredicate; #>
<#          var returnType = use32BitResult ? warpType32 : warpType64; #>
<#          var velocity = op.Velocity.Velocity256; #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= returnType #> <#= op.Name #><#= prefix #>64(
            <#= warpType64 #> warp)
        {
            var value = <#= GetCastIToX64(prefix, "warp") #>;
<#          if (velocity.SoftwareEmulation && use32BitResult) { #>
            // Eight 32-bit results: elements 0..3 of Item1 followed by 0..3 of Item2.
            var result = Vector256.Create(
                <#= velocity.GetImplementation(op, 0, "value.Item1") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 1, "value.Item1") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 2, "value.Item1") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 3, "value.Item1") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 0, "value.Item2") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 1, "value.Item2") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 2, "value.Item2") #>
                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                <#= velocity.GetImplementation(op, 3, "value.Item2") #>
                <#= op.IsPredicate ? " ? -1 : 0" : "" #>);
<#          } else if (velocity.SoftwareEmulation) { #>
            var result = (
                Vector256.Create(
                    <#= velocity.GetImplementation(op, 0, "value.Item1") #>
                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                    <#= velocity.GetImplementation(op, 1, "value.Item1") #>
                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                    <#= velocity.GetImplementation(op, 2, "value.Item1") #>
                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                    <#= velocity.GetImplementation(op, 3, "value.Item1") #>
                    <#= op.IsPredicate ? " ? -1 : 0" : "" #>),
                Vector256.Create(
                    <#= velocity.GetImplementation(op, 0, "value.Item2") #>
                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                    <#= velocity.GetImplementation(op, 1, "value.Item2") #>
                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                    <#= velocity.GetImplementation(op, 2, "value.Item2") #>
                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
                    <#= velocity.GetImplementation(op, 3, "value.Item2") #>
                    <#= op.IsPredicate ? " ? -1 : 0" : "" #>));
<#          } else if (use32BitResult) { #>
            var result = <#= velocity.GetImplementation(op, variables: "value") #>;
<#          } else { #>
            var result = (
                <#= velocity.GetImplementation(op, variables: "value.Item1") #>,
                <#= velocity.GetImplementation(op, variables: "value.Item2") #>);
<#          } #>
<#          if (!op.IsPredicate && !op.Velocity.ReturnAsWarp32) { #>
            return <#= GetCastXToI64(prefix, "result") #>;
<#          } else { #>
            return result;
<#          } #>
        }

<#      } #>

<# } #>

        // Lookup tables from (operation kind, warp mode) to the generated method.
        private static readonly Dictionary<
            (UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
            UnaryOperations32 = new();
        private static readonly Dictionary<
            (UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
            UnaryOperations64 = new();

        private static void InitUnaryOperations()
        {
<# foreach (var op in unaryOps) { #>
<#      foreach (var (_, prefix, _, _) in
            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
            UnaryOperations32.Add(
                (UnaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
                GetMethod(nameof(<#= op.Name #><#= prefix #>32)));
<#      } #>

<#      foreach (var (_, prefix, _, _) in
            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
            UnaryOperations64.Add(
                (UnaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
                GetMethod(nameof(<#= op.Name #><#= prefix #>64)));
<#      } #>
<# } #>
        }

        public static MethodInfo GetUnaryOperation32(
            UnaryArithmeticKind kind,
            VelocityWarpOperationMode mode) => UnaryOperations32[(kind, mode)];
        public static MethodInfo GetUnaryOperation64(
            UnaryArithmeticKind kind,
            VelocityWarpOperationMode mode) => UnaryOperations64[(kind, mode)];

        #endregion

        #region Binary Operations

        // One method per binary operation and supported implementation type. Operations
        // without a Velocity256 configuration use the plain operator/call form; software
        // emulated operations are expanded lane by lane.

<# foreach (var op in binaryOps) { #>
<#      foreach (var (_, prefix, typeName, _) in
            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
<#          var velocity = op.Velocity?.Velocity256; #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32(
            <#= warpType32 #> first,
            <#= warpType32 #> second)
        {
            var left = <#= GetCastIToX32(prefix, "first") #>;
            var right = <#= GetCastIToX32(prefix, "second") #>;

<#          if (velocity == null) { #>
            var result = <#= op.GetOpOrCall(isBool: false, "left", "right") #>;
<#          } else if (velocity.SoftwareEmulation) { #>
            var result = Vector256.Create(
                <#= velocity.GetImplementation(op, 0, "left", "right") #>,
                <#= velocity.GetImplementation(op, 1, "left", "right") #>,
                <#= velocity.GetImplementation(op, 2, "left", "right") #>,
                <#= velocity.GetImplementation(op, 3, "left", "right") #>,
                <#= velocity.GetImplementation(op, 4, "left", "right") #>,
                <#= velocity.GetImplementation(op, 5, "left", "right") #>,
                <#= velocity.GetImplementation(op, 6, "left", "right") #>,
                <#= velocity.GetImplementation(op, 7, "left", "right") #>);
<#          } else { #>
            var result = <#= velocity.GetImplementation(op, null, "left", "right") #>;
<#          } #>
            return <#= GetCastXToI32(prefix, "result") #>;
        }

<#      } #>
<#      foreach (var (_, prefix, typeName, _) in
            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
<#          var velocity = op.Velocity?.Velocity256; #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> <#= op.Name #><#= prefix #>64(
            <#= warpType64 #> first,
            <#= warpType64 #> second)
        {
            var left = <#= GetCastIToX64(prefix, "first") #>;
            var right = <#= GetCastIToX64(prefix, "second") #>;

<#          if (velocity == null) { #>
            var result = (
                <#= op.GetOpOrCall(isBool: false, "left.Item1", "right.Item1") #>,
                <#= op.GetOpOrCall(isBool: false, "left.Item2", "right.Item2") #>);
<#          } else if (velocity.SoftwareEmulation) { #>
            // Each half holds four 64-bit elements; both halves must cover the element
            // indices 0..3 so that every lane is computed from its own operands.
            var result = (
                Vector256.Create(
                    <#= velocity.GetImplementation(op, 0, "left.Item1", "right.Item1") #>,
                    <#= velocity.GetImplementation(op, 1, "left.Item1", "right.Item1") #>,
                    <#= velocity.GetImplementation(op, 2, "left.Item1", "right.Item1") #>,
                    <#= velocity.GetImplementation(op, 3, "left.Item1", "right.Item1") #>),
                Vector256.Create(
                    <#= velocity.GetImplementation(op, 0, "left.Item2", "right.Item2") #>,
                    <#= velocity.GetImplementation(op, 1, "left.Item2", "right.Item2") #>,
                    <#= velocity.GetImplementation(op, 2, "left.Item2", "right.Item2") #>,
                    <#= velocity.GetImplementation(op, 3, "left.Item2", "right.Item2") #>));
<#          } else { #>
            var result = (
                <#= velocity.GetImplementation(op, null, "left.Item1", "right.Item1") #>,
                <#= velocity.GetImplementation(op, null, "left.Item2", "right.Item2") #>);
<#          } #>

            return <#= GetCastXToI64(prefix, "result") #>;
        }

<#      } #>
<# } #>

        // Lookup tables from (operation kind, warp mode) to the generated method.
        private static readonly Dictionary<
            (BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
            BinaryOperations32 = new();
        private static readonly Dictionary<
            (BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
            BinaryOperations64 = new();

        private static void InitBinaryOperations()
        {
<# foreach (var op in binaryOps) { #>
<#      foreach (var (_, prefix, _, _) in
            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
            BinaryOperations32.Add(
                (BinaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
                GetMethod(nameof(<#= op.Name #><#= prefix #>32)));
<#      } #>

<#      foreach (var (_, prefix, _, _) in
            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
            BinaryOperations64.Add(
                (BinaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
                GetMethod(nameof(<#= op.Name #><#= prefix #>64)));
<#      } #>
<# } #>
        }

        public static MethodInfo GetBinaryOperation32(
            BinaryArithmeticKind kind,
            VelocityWarpOperationMode mode) => BinaryOperations32[(kind, mode)];
        public static MethodInfo GetBinaryOperation64(
            BinaryArithmeticKind kind,
            VelocityWarpOperationMode mode) => BinaryOperations64[(kind, mode)];

        #endregion

        #region Ternary Operations

        // One method per ternary operation and supported implementation type; the
        // 64-bit variants process both halves independently.

<# foreach (var op in ternaryOps) { #>
<#      foreach (var (_, prefix, typeName, _) in
            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32(
            <#= warpType32 #> first,
            <#= warpType32 #> second,
            <#= warpType32 #> third)
        {
            var a = <#= GetCastIToX32(prefix, "first") #>;
            var b = <#= GetCastIToX32(prefix, "second") #>;
            var c = <#= GetCastIToX32(prefix, "third") #>;

            var result = <#= op.Velocity.Velocity256.GetImplementation(
                op,
                null,
                "a", "b", "c") #>;

            return <#= GetCastXToI32(prefix, "result") #>;
        }

<#      } #>
<#      foreach (var (_, prefix, typeName, _) in
            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> <#= op.Name #><#= prefix #>64(
            <#= warpType64 #> first,
            <#= warpType64 #> second,
            <#= warpType64 #> third)
        {
            var a = <#= GetCastIToX64(prefix, "first") #>;
            var b = <#= GetCastIToX64(prefix, "second") #>;
            var c = <#= GetCastIToX64(prefix, "third") #>;

            var result1 = <#= op.Velocity.Velocity256.GetImplementation(
                op,
                null,
                "a.Item1", "b.Item1", "c.Item1") #>;
            var result2 = <#= op.Velocity.Velocity256.GetImplementation(
                op,
                null,
                "a.Item2", "b.Item2", "c.Item2") #>;

            return <#= GetCastXToI64(prefix, "(result1, result2)") #>;
        }

<#      } #>
<# } #>

        // Lookup tables from (operation kind, warp mode) to the generated method.
        private static readonly Dictionary<
            (TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
            TernaryOperations32 = new();
        private static readonly Dictionary<
            (TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
            TernaryOperations64 = new();

        private static void InitTernaryOperations()
        {
<# foreach (var op in ternaryOps) { #>
<#      foreach (var (_, prefix, _, _) in
            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
            TernaryOperations32.Add(
                (TernaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
                GetMethod(nameof(<#= op.Name #><#= prefix #>32)));
<#      } #>

<#      foreach (var (_, prefix, _, _) in
            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
            TernaryOperations64.Add(
                (TernaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
                GetMethod(nameof(<#= op.Name #><#= prefix #>64)));
<#      } #>
<# } #>
        }

        public static MethodInfo GetTernaryOperation32(
            TernaryArithmeticKind kind,
            VelocityWarpOperationMode mode) => TernaryOperations32[(kind, mode)];
        public static MethodInfo GetTernaryOperation64(
            TernaryArithmeticKind kind,
            VelocityWarpOperationMode mode) => TernaryOperations64[(kind, mode)];

        #endregion

        #region Compare Operations

        // Comparisons always return a 32-bit warp; the 64-bit variants compare both
        // halves and narrow the two results into a single 32-bit vector.

<# foreach (var (kind, op) in compareOperations) { #>
<#      foreach (var (_, prefix, _, _) in ImplementationTypes32) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> Compare<#= kind #><#= prefix #>32(
            <#= warpType32 #> first,
            <#= warpType32 #> second)
        {
            var left = <#= GetCastIToX32(prefix, "first") #>;
            var right = <#= GetCastIToX32(prefix, "second") #>;

            return <#= string.Format(op, "left", "right") #>;
        }

<#      } #>
<#      foreach (var (_, prefix, _, _) in ImplementationTypes64) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> Compare<#= kind #><#= prefix #>64(
            <#= warpType64 #> first,
            <#= warpType64 #> second)
        {
            var left = <#= GetCastIToX64(prefix, "first") #>;
            var right = <#= GetCastIToX64(prefix, "second") #>;

            var result1 = <#= string.Format(op, "left.Item1", "right.Item1") #>;
            var result2 = <#= string.Format(op, "left.Item2", "right.Item2") #>;

            return Vector256.Narrow(result1.AsInt64(), result2.AsInt64());
        }

<#      } #>
<# } #>
        // Lookup table keyed by (compare kind, warp mode, is64Bit).
        private static readonly Dictionary<
            (CompareKind, VelocityWarpOperationMode, bool),
            MethodInfo> CompareOperations = new();

        private static void InitializeCompareOperations()
        {
<# foreach (var (kind, _) in compareOperations) { #>
<#      foreach (var (_, prefix, _, _) in ImplementationTypes32) { #>
            CompareOperations.Add(
                (CompareKind.<#= kind #>, VelocityWarpOperationMode.<#= prefix #>, false),
                GetMethod(nameof(Compare<#= kind #><#= prefix #>32)));
<#      } #>
<#      foreach (var (_, prefix, _, _) in ImplementationTypes64) { #>
            CompareOperations.Add(
                (CompareKind.<#= kind #>, VelocityWarpOperationMode.<#= prefix #>, true),
                GetMethod(nameof(Compare<#= kind #><#= prefix #>64)));
<#      } #>
<# } #>
        }

        public static MethodInfo GetCompareOperation32(
            CompareKind kind,
            VelocityWarpOperationMode mode) =>
            CompareOperations[(kind, mode, false)];

        public static MethodInfo GetCompareOperation64(
            CompareKind kind,
            VelocityWarpOperationMode mode) =>
            CompareOperations[(kind, mode, true)];

        #endregion

        #region Convert Operations

        // Value conversions between all pairs of conversion types. Identical basic
        // value types are passed through, accelerated pairs use the configured vector
        // operation and all remaining pairs are converted element by element.

<# foreach (var sourceType in Warp32ConvTypes) { #>
<#      foreach (var targetType in Warp32ConvTypes) { #>
<#          var sourceImplType32 = GetImplementationType32(sourceType.Kind); #>
<#          var acceleratedOp = acceleratedConvTypes32.FirstOrDefault(
                t => t.Left == sourceType && t.Right == targetType); #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> Convert<#= sourceType.Name #>To<#= targetType.Name #>_32(
            <#= warpType32 #> warp)
        {
<#          if (sourceType.GetBasicValueType() == targetType.GetBasicValueType()) { #>
            return warp;
<#          } else if (acceleratedOp.Op != null) { #>
            var value = <#= GetCastIToX32(sourceImplType32.Prefix, "warp") #>;
            return <#= acceleratedOp.Op #>(value).AsInt32();
<#          } else { #>
            var value = <#= GetCastIToX32(sourceImplType32.Prefix, "warp") #>;
            return Vector256.Create(
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(0),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(1),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(2),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(3),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(4),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(5),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(6),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(7))
                .AsInt32();
<#          } #>
        }

<#      } #>
<# } #>

<# foreach (var sourceType in Warp64ConvTypes) { #>
<#      foreach (var targetType in Warp64ConvTypes) { #>
<#          var sourceImplType64 = GetImplementationType64(sourceType.Kind); #>
<#          var acceleratedOp = acceleratedConvTypes64.FirstOrDefault(
                t => t.Left == sourceType && t.Right == targetType); #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> Convert<#= sourceType.Name #>To<#= targetType.Name #>_64(
            <#= warpType64 #> warp)
        {
<#          if (sourceType.GetBasicValueType() == targetType.GetBasicValueType()) { #>
            return warp;
<#          } else if (acceleratedOp.Op != null) { #>
            var value = <#= GetCastIToX64(sourceImplType64.Prefix, "warp") #>;
            return (
                <#= acceleratedOp.Op #>(value.Item1).AsInt64(),
                <#= acceleratedOp.Op #>(value.Item2).AsInt64());
<#          } else { #>
            var value = <#= GetCastIToX64(sourceImplType64.Prefix, "warp") #>;
            var result1 = Vector256.Create(
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item1.GetElement(0),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item1.GetElement(1),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item1.GetElement(2),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item1.GetElement(3));
            var result2 = Vector256.Create(
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item2.GetElement(0),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item2.GetElement(1),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item2.GetElement(2),
                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item2.GetElement(3));
            return (result1.AsInt64(), result2.AsInt64());
<#          } #>
        }

<#      } #>
<# } #>

        // Lookup table keyed by (source type, target type, is64Bit).
        private static readonly Dictionary<
            (ArithmeticBasicValueType, ArithmeticBasicValueType, bool),
            MethodInfo> ConvertOperations = new();

        private static void InitializeConvertOperations()
        {
<# foreach (var sourceType in Warp32ConvTypes) { #>
<#      foreach (var targetType in Warp32ConvTypes) { #>
<#          var sourceName = sourceType.GetArithmeticBasicValueType(); #>
<#          var targetName = targetType.GetArithmeticBasicValueType(); #>
            ConvertOperations.Add(
                (ArithmeticBasicValueType.<#= sourceName #>,
                 ArithmeticBasicValueType.<#= targetName #>,
                 false),
                GetMethod(nameof(Convert<#= sourceType.Name #>To<#= targetType.Name #>_32)));
<#      } #>
<# } #>
<# foreach (var sourceType in Warp64ConvTypes) { #>
<#      foreach (var targetType in Warp64ConvTypes) { #>
<#          var sourceName = sourceType.GetArithmeticBasicValueType(); #>
<#          var targetName = targetType.GetArithmeticBasicValueType(); #>
            ConvertOperations.Add(
                (ArithmeticBasicValueType.<#= sourceName #>,
                 ArithmeticBasicValueType.<#= targetName #>,
                 true),
                GetMethod(nameof(Convert<#= sourceType.Name #>To<#= targetType.Name #>_64)));
<#      } #>
<# } #>
        }

        public static MethodInfo GetConvertOperation32(
            ArithmeticBasicValueType source,
            ArithmeticBasicValueType target) =>
            ConvertOperations[(source, target, false)];

        public static MethodInfo GetConvertOperation64(
            ArithmeticBasicValueType source,
            ArithmeticBasicValueType target) =>
            ConvertOperations[(source, target, true)];

        #endregion

        #region Vector Convert Operations

        // Width conversions between the 64-bit pair representation and the 32-bit
        // representation via Vector256.Narrow / Vector256.Widen.

<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType32 #> Convert64To32<#= prefix #>(<#= warpType64 #> warp)
        {
            var value = <#= GetCastIToX64(prefix, "warp") #>;
            var result = Vector256.Narrow(value.Item1, value.Item2);
            return <#= GetCastXToI32(prefix, "result") #>;
        }

<# } #>
<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static <#= warpType64 #> Convert32To64<#= prefix #>(<#= warpType32 #> warp)
        {
            var value = <#= GetCastIToX32(prefix, "warp") #>;
            var result = Vector256.Widen(value);
            return <#= GetCastXToI64(prefix, "result") #>;
        }

<# } #>
        // Lookup table keyed by (warp mode, targetIs64Bit).
        internal static readonly Dictionary<
            (VelocityWarpOperationMode, bool),
            MethodInfo> VectorConvertOperations = new();

        internal static void InitializeVectorConvertOperations()
        {
<# foreach (var (_, prefix, _, _) in ImplementationTypes32) { #>
            VectorConvertOperations.Add(
                (VelocityWarpOperationMode.<#= prefix #>, false),
                GetMethod(nameof(Convert64To32<#= prefix #>)));
<# } #>
<# foreach (var (_, prefix, _, _) in ImplementationTypes64) { #>
            VectorConvertOperations.Add(
                (VelocityWarpOperationMode.<#= prefix #>, true),
                GetMethod(nameof(Convert32To64<#= prefix #>)));
<# } #>
        }

        public static MethodInfo GetConvert32To64Operation(
            VelocityWarpOperationMode mode) =>
            VectorConvertOperations[(mode, true)];

        public static MethodInfo GetConvert64To32Operation(
            VelocityWarpOperationMode mode) =>
            VectorConvertOperations[(mode, false)];

        #endregion

        #region Atomic Operations

        /// <summary>
        /// Performs a 32-bit compare-exchange per lane. Each element of
        /// <paramref name="target"/> is interpreted as an address; lanes whose mask
        /// element is zero are skipped and yield the default value 0.
        /// </summary>
        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
        internal static unsafe <#= warpType32 #> AtomicCompareExchange32(
            <#= warpType32 #> mask,
            <#= warpType64 #> target,
            <#= warpType32 #> compare,
            <#= warpType32 #> value)
        {
<# for (int i = 0; i < warpSize; ++i) { #>
            int mask<#= i #> = mask.GetElement(<#= i #>);
<# } #>
<# for (int i = 0; i < warpSize; ++i) { #>
            int result<#= i #> = default;
            if (mask<#= i #> != 0)
            {
                result<#= i #> = Atomic.CompareExchange(
                    ref Unsafe.AsRef<int>((void*)<#= GetItemRef64("target", i) #>),
                    compare.GetElement(<#= i #>),
                    value.GetElement(<#= i #>));
            }
<# } #>
            return Vector256.Create(
                result0, result1, result2, result3,
                result4, result5, result6, result7);
        }

public static readonly MethodInfo AtomicCompareExchange32Method =
+            GetMethod(nameof(AtomicCompareExchange32));
+
+        /// <summary>
+        /// Performs a 64-bit Atomic.CompareExchange per lane whose mask element is
+        /// non-zero, treating the matching 64-bit element of <paramref name="target"/>
+        /// as the address. Lanes with a zero mask element yield 0.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType64 #> AtomicCompareExchange64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> target,
+            <#= warpType64 #> compare,
+            <#= warpType64 #> value)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            long result<#= i #> = default;
+            if (mask<#= i #> != 0)
+            {
+                // The type argument is required: T cannot be inferred from void*.
+                result<#= i #> = Atomic.CompareExchange(
+                    ref Unsafe.AsRef<long>((void*)<#= GetItemRef64("target", i) #>),
+                    <#= GetItemRef64("compare", i) #>,
+                    <#= GetItemRef64("value", i) #>);
+            }
+<# } #>
+            return (
+                Vector256.Create(result0, result1, result2, result3),
+                Vector256.Create(result4, result5, result6, result7));
+        }
+
+        public static readonly MethodInfo AtomicCompareExchange64Method =
+            GetMethod(nameof(AtomicCompareExchange64));
+
+<# foreach (var (op, isBinary) in AtomicOperations) { #>
+<#      foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #>
+<#          var targetPrefix = isBinary ? "U" : prefix; #>
+<#          var targetTypeName = isBinary ? "uint" : typeName; #>
+        /// <summary>
+        /// Applies Atomic.<#= op #> per lane whose mask element is non-zero, using
+        /// the matching 64-bit element of <paramref name="target"/> as the address.
+        /// Lanes with a zero mask element yield the default value.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> Atomic<#= op #><#= prefix #>32(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> target,
+            <#= warpType32 #> value)
+        {
+            var sourceValue = <#= GetCastIToX32(targetPrefix, "value") #>;
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            <#= targetTypeName #> result<#= i #> = default;
+            if (mask<#= i #> != 0)
+            {
+                result<#= i #> = Atomic.<#= op #>(
+                    ref Unsafe.AsRef<<#= targetTypeName #>>((void*)<#= GetItemRef64("target", i) #>),
+                    sourceValue.GetElement(<#= i #>));
+            }
+<# } #>
+            return <#= GetCastXToI32(targetPrefix,
+                "Vector256.Create(" +
+                "result0, result1, result2, result3," +
+                "result4, result5, result6, result7)") #>;
+        }
+
+<#      } #>
+
+<#      foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #>
+<#          var targetPrefix = isBinary ? "U" : prefix; #>
+<#          var targetTypeName = isBinary ? "ulong" : typeName; #>
+        /// <summary>
+        /// Applies Atomic.<#= op #> per lane whose mask element is non-zero, using
+        /// the matching 64-bit element of <paramref name="target"/> as the address.
+        /// Lanes with a zero mask element yield the default value.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType64 #> Atomic<#= op #><#= prefix #>64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> target,
+            <#= warpType64 #> value)
+        {
+            var sourceValue = <#= GetCastIToX64(targetPrefix, "value") #>;
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            <#= targetTypeName #> result<#= i #> = default;
+            if (mask<#= i #> != 0)
+            {
+                result<#= i #> = Atomic.<#= op #>(
+                    ref Unsafe.AsRef<<#= targetTypeName #>>((void*)<#= GetItemRef64("target", i) #>),
+                    <#= GetItemRef64("sourceValue", i) #>);
+            }
+<# } #>
+            return <#= GetCastXToI64(targetPrefix,
+                "(Vector256.Create(result0, result1, result2, result3)," +
+                "Vector256.Create(result4, result5, result6, result7))") #>;
+        }
+
+<#      } #>
+<# } #>
+
+        /// <summary>
+        /// Maps (atomic kind, operation mode, flag) triples to the generated atomic
+        /// methods. The flag is false for the *32 methods and true for the *64 ones.
+        /// </summary>
+        internal static readonly Dictionary<
+            (AtomicKind, VelocityWarpOperationMode, bool),
+            MethodInfo> AtomicOperations = new();
+
+        /// <summary>
+        /// Registers every generated Atomic*32 and Atomic*64 method in
+        /// <see cref="AtomicOperations"/>.
+        /// </summary>
+        internal static void InitializeAtomicOperations()
+        {
+<# foreach (var (op, _) in AtomicOperations) { #>
+<#      foreach (var (_, prefix, _, _) in ImplementationTypes32) { #>
+            AtomicOperations.Add(
+                (AtomicKind.<#= op #>, VelocityWarpOperationMode.<#= prefix #>, false),
+                GetMethod(nameof(Atomic<#= op #><#= prefix #>32)));
+<#      } #>
+<#      foreach (var (_, prefix, _, _) in ImplementationTypes64) { #>
+            AtomicOperations.Add(
+                (AtomicKind.<#= op #>, VelocityWarpOperationMode.<#= prefix #>, true),
+                GetMethod(nameof(Atomic<#= op #><#= prefix #>64)));
+<#      } #>
+<# } #>
+        }
+
+        /// <summary>
+        /// Returns the registered 32-bit atomic method for the given kind and mode.
+        /// </summary>
+        public static MethodInfo GetAtomicOperation32(
+            AtomicKind kind,
+            VelocityWarpOperationMode mode) =>
+            AtomicOperations[(kind, mode, false)];
+
+        /// <summary>
+        /// Returns the registered 64-bit atomic method for the given kind and mode.
+        /// </summary>
+        public static MethodInfo GetAtomicOperation64(
+            AtomicKind kind,
+            VelocityWarpOperationMode mode) =>
+            AtomicOperations[(kind, mode, true)];
+
+        #endregion
+
+        #region Thread Operations
+
+        /// <summary>
+        /// Splits the vector returned by LoadLaneIndexVector32() into a position
+        /// inside a segment of <paramref name="width"/> lanes and the first index of
+        /// that segment.
+        /// </summary>
+        /// <param name="width">The segment width per lane.</param>
+        /// <param name="lane">Receives laneIndex % width.</param>
+        /// <param name="offset">Receives (laneIndex / width) * width.</param>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static void ComputeShuffleConfig(
+            <#= warpType32 #> width,
+            out <#= warpType32 #> lane,
+            out <#= warpType32 #> offset)
+        {
+            var laneIndex = LoadLaneIndexVector32();
+            lane = RemI32(laneIndex, width);
+            // The offset must be derived from the absolute lane index. Dividing the
+            // already reduced 'lane' by 'width' would always give zero.
+            offset = MulI32(DivI32(laneIndex, width), width);
+        }
+
+        /// <summary>
+        /// Shuffles with source lanes LoadLaneIndexVector32() - delta.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> ShuffleUp32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> delta)
+        {
+            var lane = SubI32(LoadLaneIndexVector32(), delta);
+            return Shuffle32(mask, warp, lane);
+        }
+
+        /// <summary>
+        /// Shuffles with source lanes (lane - delta) + offset, where lane and offset
+        /// come from <see cref="ComputeShuffleConfig"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> SubShuffleUp32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var lane, out var offset);
+            var adjustedLane = SubI32(lane, delta);
+            return Shuffle32(mask, warp, AddI32(adjustedLane, offset));
+        }
+
+        /// <summary>
+        /// 64-bit variant of <see cref="ShuffleUp32"/>.
+        /// </summary>
+        // NOTE(review): 'width' is unused and the sibling ShuffleUp32/ShuffleDown64
+        // methods do not take it. It is kept to preserve the signature; confirm
+        // against the code that emits calls to ShuffleUp64Method.
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> ShuffleUp64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            var lane = SubI32(LoadLaneIndexVector32(), delta);
+            return Shuffle64(mask, warp, lane);
+        }
+
+        /// <summary>
+        /// 64-bit variant of <see cref="SubShuffleUp32"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> SubShuffleUp64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var lane, out var offset);
+            var adjustedLane = SubI32(lane, delta);
+            return Shuffle64(mask, warp, AddI32(adjustedLane, offset));
+        }
+
+        /// <summary>
+        /// Shuffles with source lanes LoadLaneIndexVector32() + delta.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> ShuffleDown32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> delta)
+        {
+            var lane = AddI32(LoadLaneIndexVector32(), delta);
+            return Shuffle32(mask, warp, lane);
+        }
+
+        /// <summary>
+        /// Shuffles with source lanes (lane + delta) + offset, where lane and offset
+        /// come from <see cref="ComputeShuffleConfig"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> SubShuffleDown32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var lane, out var offset);
+            var adjustedLane = AddI32(lane, delta);
+            return Shuffle32(mask, warp, AddI32(adjustedLane, offset));
+        }
+
+        /// <summary>
+        /// 64-bit variant of <see cref="ShuffleDown32"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> ShuffleDown64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> delta)
+        {
+            var lane = AddI32(LoadLaneIndexVector32(), delta);
+            return Shuffle64(mask, warp, lane);
+        }
+
+        /// <summary>
+        /// 64-bit variant of <see cref="SubShuffleDown32"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> SubShuffleDown64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var lane, out var offset);
+            var adjustedLane = AddI32(lane, delta);
+            return Shuffle64(mask, warp, AddI32(adjustedLane, offset));
+        }
+
+        /// <summary>
+        /// Shuffles with source lanes LoadLaneIndexVector32() ^ laneMask.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> ShuffleXor32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> laneMask)
+        {
+            var lane = XorU32(LoadLaneIndexVector32(), laneMask);
+            return Shuffle32(mask, warp, lane);
+        }
+
+        /// <summary>
+        /// Shuffles with source lanes (lane ^ laneMask) + offset, where lane and
+        /// offset come from <see cref="ComputeShuffleConfig"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> SubShuffleXor32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> laneMask,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var lane, out var offset);
+            var adjustedLane = XorU32(lane, laneMask);
+            return Shuffle32(mask, warp, AddI32(adjustedLane, offset));
+        }
+
+        /// <summary>
+        /// 64-bit variant of <see cref="ShuffleXor32"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> ShuffleXor64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> laneMask)
+        {
+            var lane = XorU32(LoadLaneIndexVector32(), laneMask);
+            return Shuffle64(mask, warp, lane);
+        }
+
+
[MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> SubShuffleXor64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> laneMask,
+            <#= warpType32 #> width)
+        {
+            // Source lane = (lane ^ laneMask) + offset of the enclosing segment.
+            ComputeShuffleConfig(width, out var lane, out var offset);
+            var adjustedLane = XorU32(lane, laneMask);
+            return Shuffle64(mask, warp, AddI32(adjustedLane, offset));
+        }
+
+        // Reflection handles for the barrier, broadcast and shuffle operations.
+        public static readonly MethodInfo BarrierPopCount32Method =
+            GetMethod(nameof(BarrierPopCount32));
+        public static readonly MethodInfo BarrierPopCount64Method =
+            GetMethod(nameof(BarrierPopCount64));
+        public static readonly MethodInfo BarrierAnd32Method =
+            GetMethod(nameof(BarrierAnd32));
+        public static readonly MethodInfo BarrierAnd64Method =
+            GetMethod(nameof(BarrierAnd64));
+        public static readonly MethodInfo BarrierOr32Method =
+            GetMethod(nameof(BarrierOr32));
+        public static readonly MethodInfo BarrierOr64Method =
+            GetMethod(nameof(BarrierOr64));
+        public static readonly MethodInfo Broadcast32Method =
+            GetMethod(nameof(Broadcast32));
+        public static readonly MethodInfo Broadcast64Method =
+            GetMethod(nameof(Broadcast64));
+        public static readonly MethodInfo Shuffle32Method =
+            GetMethod(nameof(Shuffle32));
+        public static readonly MethodInfo Shuffle64Method =
+            GetMethod(nameof(Shuffle64));
+        public static readonly MethodInfo ShuffleUp32Method =
+            GetMethod(nameof(ShuffleUp32));
+        public static readonly MethodInfo SubShuffleUp32Method =
+            GetMethod(nameof(SubShuffleUp32));
+        public static readonly MethodInfo ShuffleUp64Method =
+            GetMethod(nameof(ShuffleUp64));
+        public static readonly MethodInfo SubShuffleUp64Method =
+            GetMethod(nameof(SubShuffleUp64));
+        public static readonly MethodInfo ShuffleDown32Method =
+            GetMethod(nameof(ShuffleDown32));
+        public static readonly MethodInfo SubShuffleDown32Method =
+            GetMethod(nameof(SubShuffleDown32));
+        public static readonly MethodInfo ShuffleDown64Method =
+            GetMethod(nameof(ShuffleDown64));
+        public static readonly MethodInfo SubShuffleDown64Method =
+            GetMethod(nameof(SubShuffleDown64));
+        public static readonly MethodInfo ShuffleXor32Method =
+            GetMethod(nameof(ShuffleXor32));
+        public static readonly MethodInfo SubShuffleXor32Method =
+            GetMethod(nameof(SubShuffleXor32));
+        public static readonly MethodInfo ShuffleXor64Method =
+            GetMethod(nameof(ShuffleXor64));
+        public static readonly MethodInfo SubShuffleXor64Method =
+            GetMethod(nameof(SubShuffleXor64));
+
+        #endregion
+
+        #region IO
+
+        /// <summary>
+        /// Loads one byte per lane whose mask element is non-zero and zero-extends
+        /// it to 32 bits. Lanes with a zero mask element yield 0.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+            MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> Load8(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            var result<#= i #> = mask<#= i #> != 0
+                ? (uint)*(byte*)<#= GetItemRef64("address", i) #>
+                : 0;
+<# } #>
+            // All eight lanes must be passed. With only four uint arguments the
+            // call binds to a 4 x 64-bit overload and lanes 4..7 are lost.
+            return Vector256.Create(
+                result0, result1, result2, result3,
+                result4, result5, result6, result7).AsInt32();
+        }
+
+        /// <summary>
+        /// Loads one 16-bit value per lane whose mask element is non-zero and
+        /// zero-extends it to 32 bits. Lanes with a zero mask element yield 0.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+            MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> Load16(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            var result<#= i #> = mask<#= i #> != 0
+                ? (uint)*(ushort*)<#= GetItemRef64("address", i) #>
+                : 0;
+<# } #>
+            // All eight lanes must be passed (see Load8).
+            return Vector256.Create(
+                result0, result1, result2, result3,
+                result4, result5, result6, result7).AsInt32();
+        }
+
+        /// <summary>
+        /// Loads one 32-bit value per lane whose mask element is non-zero. Lanes
+        /// with a zero mask element yield 0.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+            MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> Load32(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int result<#= i #> = mask<#= i #> != 0
+                ? *(int*)<#= GetItemRef64("address", i) #>
+                : 0;
+<# } #>
+            return Vector256.Create(
+                result0, result1, result2, result3,
+                result4, result5, result6, result7);
+        }
+
+        /// <summary>
+        /// Loads one 64-bit value per lane whose mask element is non-zero. Lanes
+        /// with a zero mask element yield 0.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+            MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType64 #> Load64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            long result<#= i #> = mask<#= i #> != 0
+                ? *(long*)<#= GetItemRef64("address", i) #>
+                : 0;
+<# } #>
+            // The upper half starts with lane 4 (not lane 3 again).
+            return (
+                Vector256.Create(result0, result1, result2, result3),
+                Vector256.Create(result4, result5, result6, result7));
+        }
+
+        /// <summary>
+        /// Stores the low 8 bits of each value element to the matching address for
+        /// every lane whose mask element is non-zero.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+            MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe void Store8(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address,
+            <#= warpType32 #> value)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            byte* addr<#= i #> = (byte*)<#= GetItemRef64("address", i) #>;
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            var value<#= i #> = (byte)(value.GetElement(<#= i #>) & 0xff);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            if (mask<#= i #> != 0)
+                *addr<#= i #> = value<#= i #>;
+<# } #>
+        }
+
+        /// <summary>
+        /// Stores the low 16 bits of each value element to the matching address for
+        /// every lane whose mask element is non-zero.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+            MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe void Store16(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address,
+            <#= warpType32 #> value)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            short* addr<#= i #> = (short*)<#= GetItemRef64("address", i) #>;
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            var value<#= i #> = (short)(value.GetElement(<#= i #>) & 0xffff);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            if (mask<#= i #> != 0)
+                *addr<#= i #> = value<#= i #>;
+<# } #>
+        }
+
+        /// <summary>
+        /// Stores each 32-bit value element to the matching address for every lane
+        /// whose mask element is non-zero.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+            MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe void Store32(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address,
+            <#= warpType32 #> value)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int* addr<#= i #> = (int*)<#= GetItemRef64("address", i) #>;
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            var value<#= i #> = value.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            if (mask<#= i #> != 0)
+                *addr<#= i #> = value<#= i #>;
+<# } #>
+        }
+
+        /// <summary>
+        /// Stores each 64-bit value element to the matching address for every lane
+        /// whose mask element is non-zero.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+            MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe void Store64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address,
+            <#= warpType64 #> value)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            long* addr<#= i #> = (long*)<#= GetItemRef64("address", i) #>;
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            var value<#= i #> = <#= GetItemRef64("value", i) #>;
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            if (mask<#= i #> != 0)
+                *addr<#= i #> = value<#= i #>;
+<# } #>
+        }
+
+        public static readonly MethodInfo Load8Method =
+            GetMethod(nameof(Load8));
+        public static readonly MethodInfo Load16Method =
+            GetMethod(nameof(Load16));
+        public static readonly MethodInfo Load32Method =
+            GetMethod(nameof(Load32));
+        public static readonly MethodInfo Load64Method =
+            GetMethod(nameof(Load64));
+
+        public static readonly MethodInfo Store8Method =
+            GetMethod(nameof(Store8));
+        public static readonly MethodInfo Store16Method =
+            GetMethod(nameof(Store16));
+        public static readonly MethodInfo Store32Method =
+            GetMethod(nameof(Store32));
+        public static readonly MethodInfo Store64Method =
+            GetMethod(nameof(Store64));
+
+        #endregion
+
+        #region Misc
+
+        /// <summary>
+        /// Raises Trace.Assert with the given message and location when
+        /// BarrierPopCount32Scalar reports a non-zero count for the inverted
+        /// <paramref name="value"/> under <paramref name="mask"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static void DebugAssertFailed(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> value,
+            string message,
+            string fileName,
+            int line,
+            string method)
+        {
+            // Check if any lane failed the check
+            var failedAssertionMask = XorU32(LoadAllLanesMask32(), value);
+            if (BarrierPopCount32Scalar(mask, failedAssertionMask) != 0)
+                Trace.Assert(false, message, $"@ {fileName}:{line} in {method}");
+        }
+
+        public static readonly MethodInfo DebugAssertFailedMethod =
+            GetMethod(nameof(DebugAssertFailed));
+
+        /// <summary>
+        /// Writes the label followed by the 32-bit warp contents to the console.
+        /// </summary>
+        [SuppressMessage(
+            "Globalization",
+            "CA1303:Do not pass literals as localized parameters",
+            Justification = "Basic invariant string")]
+        internal static void DumpWarp32(<#= warpType32 #> value, string label)
+        {
+            Console.Write(label);
+            Console.WriteLine(value.ToString());
+        }
+
+        public static readonly MethodInfo DumpWarp32Method =
+            GetMethod(nameof(DumpWarp32));
+
+        /// <summary>
+        /// Writes the label followed by both halves of the 64-bit warp to the
+        /// console, separated by a comma.
+        /// </summary>
+        [SuppressMessage(
+            "Globalization",
+            "CA1303:Do not pass literals as localized parameters",
+            Justification = "Basic invariant string")]
+        internal static void DumpWarp64(<#= warpType64 #> value, string label)
+        {
+            Console.Write(label);
+            Console.Write(value.Item1.ToString());
+            Console.Write(", ");
+            Console.WriteLine(value.Item2.ToString());
+        }
+
+        public static readonly MethodInfo DumpWarp64Method =
+            GetMethod(nameof(DumpWarp64));
+
+        #endregion
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/Src/ILGPU/Backends/Velocity/Vec256/Vec256TypeGenerator.cs b/Src/ILGPU/Backends/Velocity/Vec256/Vec256TypeGenerator.cs
new file mode 100644
index 000000000..fe2285ee7
--- /dev/null
+++ b/Src/ILGPU/Backends/Velocity/Vec256/Vec256TypeGenerator.cs
@@ -0,0 +1,76 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2024 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: Vec256TypeGenerator.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends.IL;
+using ILGPU.Runtime.Velocity;
+using System;
+using System.Numerics;
+
+#if NET7_0_OR_GREATER
+
+namespace ILGPU.Backends.Velocity.Vec256
+{
+    /// <summary>
+    /// A vector type generator of 256bit vectors to be used with the Velocity backend.
+    /// </summary>
+    sealed class Vec256TypeGenerator : VelocityTypeGenerator
+    {
+        #region Static
+
+        /// <summary>
+        /// Maps basic types to vectorized basic types.
+        /// </summary>
+        private static readonly Type[] VectorizedBasicTypeMapping = new Type[]
+        {
+            Vec256Operations.WarpType32, // None/Unknown
+
+            Vec256Operations.WarpType32, // Int1
+            Vec256Operations.WarpType32, // Int8
+            Vec256Operations.WarpType32, // Int16
+            Vec256Operations.WarpType32, // Int32
+            Vec256Operations.WarpType64, // Int64
+
+            Vec256Operations.WarpType32, // Float16
+            Vec256Operations.WarpType32, // Float32
+            Vec256Operations.WarpType64, // Float64
+        };
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Constructs a new IL vector type generator.
+        /// </summary>
+        /// <param name="capabilityContext">The parent capability context.</param>
+        /// <param name="runtimeSystem">The parent runtime system.</param>
+        public Vec256TypeGenerator(
+            VelocityCapabilityContext capabilityContext,
+            RuntimeSystem runtimeSystem)
+            // The warp size is the fixed number of 32-bit lanes in a 256-bit vector.
+            // It must not depend on the vector width the host hardware supports.
+            : base(
+                capabilityContext,
+                runtimeSystem,
+                System.Runtime.Intrinsics.Vector256<int>.Count)
+        { }
+
+        #endregion
+
+        #region Type System
+
+        /// <summary>
+        /// Returns the vectorized type registered for the given basic value type.
+        /// Throws the exception created by
+        /// VelocityCapabilityContext.GetNotSupportedFloat16Exception() when Float16
+        /// is requested but CapabilityContext.Float16 is false.
+        /// </summary>
+        public override Type GetVectorizedBasicType(BasicValueType basicValueType)
+        {
+            if (basicValueType == BasicValueType.Float16 && !CapabilityContext.Float16)
+                throw VelocityCapabilityContext.GetNotSupportedFloat16Exception();
+            return VectorizedBasicTypeMapping[(int)basicValueType];
+        }
+
+        #endregion
+    }
+}
+
+#endif
diff --git a/Src/ILGPU/ILGPU.csproj b/Src/ILGPU/ILGPU.csproj
index 4228847f2..6b26c8c56 100644
--- a/Src/ILGPU/ILGPU.csproj
+++ b/Src/ILGPU/ILGPU.csproj
@@ -185,6 +185,14 @@
 TextTemplatingFileGenerator
 CudaInstructionSet.Generated.cs
+
+ TextTemplatingFilePreprocessor
+ Vec128Operations.cs
+
+
+ TextTemplatingFilePreprocessor
+ Vec256Operations.cs
+
+
@@ -353,6 +361,12 @@
 True
 PrimitiveDataBlocks.tt
+
+ Vec128Operations.tt
+
+
+ Vec256Operations.tt
+
+
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
index d7b61813c..2d1c54938 100644
--- a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
+++ b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
@@ -13,6 +13,7 @@
 using ILGPU.Backends.Velocity.Scalar;
 #if NET7_0_OR_GREATER
 using ILGPU.Backends.Velocity.Vec128;
+using ILGPU.Backends.Velocity.Vec256;
 #endif
 using ILGPU.Util;
 using System;
@@ -36,6 +37,11 @@ public enum VelocityDeviceType
         ///
         Vector128,
+        ///
+        /// 256bit vector operations to simulate eight lanes per warp using hardware
+        /// acceleration via AVX.
+ /// + Vector256, #endif } @@ -53,6 +59,7 @@ public sealed class VelocityDevice : Device typeof(Scalar), #if NET7_0_OR_GREATER typeof(Vec128), + typeof(Vec256), #endif }; @@ -73,6 +80,7 @@ public VelocityDevice(VelocityDeviceType deviceType) break; #if NET7_0_OR_GREATER case VelocityDeviceType.Vector128: + case VelocityDeviceType.Vector256: // Vector always runs using software in the worst case break; #endif diff --git a/Src/ILGPU/Static/BinaryMathOperations.xml b/Src/ILGPU/Static/BinaryMathOperations.xml index 3a682f5cf..d954ec079 100644 --- a/Src/ILGPU/Static/BinaryMathOperations.xml +++ b/Src/ILGPU/Static/BinaryMathOperations.xml @@ -115,6 +115,9 @@ {Value0} - {Value0} / {Value1} * {Value1} + + {Value0} - {Value0} / {Value1} * {Value1} + @@ -128,6 +131,9 @@ Vector128.BitwiseAnd({Value0}, {Value1}) + + Vector256.BitwiseAnd({Value0}, {Value1}) + {Value0} == {Value1} @@ -153,6 +159,9 @@ Vector128.BitwiseOr({Value0}, {Value1}) + + Vector256.BitwiseOr({Value0}, {Value1}) + {Value0} == {Value1} @@ -178,6 +187,9 @@ Vector128.Xor({Value0}, {Value1}) + + Vector256.Xor({Value0}, {Value1}) + @@ -188,6 +200,9 @@ {Value0}[Field] << (int){Value1}[Field] + + {Value0}[Field] << (int){Value1}[Field] + {Value0}.IsZero @@ -206,6 +221,9 @@ {Value0}[Field] >> (int){Value1}[Field] + + {Value0}[Field] >> (int){Value1}[Field] + {Value0}.IsZero @@ -226,6 +244,9 @@ Vector128.Min({Value0}, {Value1}) + + Vector256.Min({Value0}, {Value1}) + @@ -241,6 +262,9 @@ Vector128.Max({Value0}, {Value1}) + + Vector256.Max({Value0}, {Value1}) + @@ -255,6 +279,7 @@ {MathType}.Atan2({Value0}, {Value1}) + @@ -264,6 +289,7 @@ {MathType}.Pow({Value0}, {Value1}) + @@ -273,6 +299,7 @@ {MathType}.Log({Value0}, {Value1}) + @@ -282,6 +309,7 @@ IntrinsicMath.CopySign + diff --git a/Src/ILGPU/Static/TernaryMathOperations.xml b/Src/ILGPU/Static/TernaryMathOperations.xml index 304e6a662..c3bb2faf8 100644 --- a/Src/ILGPU/Static/TernaryMathOperations.xml +++ b/Src/ILGPU/Static/TernaryMathOperations.xml @@ -9,6 +9,9 
@@ Vector128.Add(Vector128.Multiply({Value0}, {Value1}), {Value2}) + + FMAImpl({Value0}, {Value1}, {Value2}) + diff --git a/Src/ILGPU/Static/TypeInformation.ttinclude b/Src/ILGPU/Static/TypeInformation.ttinclude index 47f270eb7..523f80625 100644 --- a/Src/ILGPU/Static/TypeInformation.ttinclude +++ b/Src/ILGPU/Static/TypeInformation.ttinclude @@ -322,9 +322,12 @@ public class VelocityMathConfig [XmlElement("Velocity128")] public Velocity128Config Velocity128 { get; set; } + + [XmlElement("Velocity256")] + public Velocity256Config Velocity256 { get; set; } } -public class Velocity128Config +public abstract class VelocityOperationConfig { [XmlAttribute] public bool SoftwareEmulation { get; set; } @@ -364,6 +367,9 @@ public class Velocity128Config } } +public class Velocity128Config : VelocityOperationConfig { } +public class Velocity256Config : VelocityOperationConfig { } + public class MathOp { #region Data diff --git a/Src/ILGPU/Static/UnaryMathOperations.xml b/Src/ILGPU/Static/UnaryMathOperations.xml index 85fa966d5..a6632dfa2 100644 --- a/Src/ILGPU/Static/UnaryMathOperations.xml +++ b/Src/ILGPU/Static/UnaryMathOperations.xml @@ -17,6 +17,9 @@ -{Value0} + + -{Value0} + @@ -45,6 +48,9 @@ ~{Value0} + + ~{Value0} + @@ -61,6 +67,9 @@ Vector128.Abs({Value0}) + + Vector256.Abs({Value0}) + @@ -69,6 +78,7 @@ IntrinsicMath.BitOperations.PopCount + @@ -77,6 +87,7 @@ IntrinsicMath.BitOperations.LeadingZeroCount + @@ -85,6 +96,7 @@ IntrinsicMath.BitOperations.TrailingZeroCount + @@ -96,6 +108,9 @@ RcpImpl({Value0}) + + RcpImpl({Value0}) + @@ -106,6 +121,7 @@ {TypeName}.IsNaN({Value0}) + @@ -115,6 +131,7 @@ {TypeName}.IsInfinity({Value0}) + @@ -124,6 +141,7 @@ !IsNaN({Value0}) && !IsInfinity({Value0}) + @@ -136,6 +154,9 @@ Vector128.Sqrt({Value0}) + + Vector256.Sqrt({Value0}) + @@ -147,6 +168,9 @@ RcpImpl(Vector128.Sqrt({Value0})) + + RsqrtImpl({Value0}) + @@ -157,6 +181,7 @@ {MathType}.Asin({Value0}) + @@ -166,6 +191,7 @@ {MathType}.Sin({Value0}) + @@ -175,6 +201,7 @@ 
{MathType}.Sinh({Value0}) + @@ -185,6 +212,7 @@ {MathType}.Acos({Value0}) + @@ -194,6 +222,7 @@ {MathType}.Cos({Value0}) + @@ -203,6 +232,7 @@ {MathType}.Cosh({Value0}) + @@ -213,6 +243,7 @@ {MathType}.Tan({Value0}) + @@ -222,6 +253,7 @@ {MathType}.Tanh({Value0}) + @@ -231,6 +263,7 @@ {MathType}.Atan({Value0}) + @@ -241,6 +274,7 @@ {MathType}.Exp({Value0}) + @@ -250,6 +284,7 @@ {MathType}.Pow({Const2}, {Value0}) + @@ -262,6 +297,9 @@ Vector128.Floor({Value0}) + + Vector256.Floor({Value0}) + @@ -273,6 +311,9 @@ Vector128.Ceiling({Value0}) + + Vector256.Ceiling({Value0}) + @@ -283,6 +324,7 @@ {MathType}.Log({Value0}) + @@ -292,6 +334,7 @@ {MathType}.Log({Value0}, {Const2}) + @@ -301,6 +344,7 @@ {MathType}.Log10({Value0}) +