diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a5206671c..e44c74d2c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -154,20 +154,26 @@ jobs:
         os: ${{ fromJson(needs.setup-os-matrix.outputs.os) }}
         library: [ILGPU, ILGPU.Algorithms, ILGPU.Analyzers]
         framework: [net6.0, net7.0, net8.0]
-        flavor: [CPU, Velocity, Velocity128]
+        flavor: [CPU, Velocity, Velocity128, Velocity256]
         exclude:
           - library: ILGPU.Algorithms
             flavor: Velocity
           - library: ILGPU.Algorithms
             flavor: Velocity128
+          - library: ILGPU.Algorithms
+            flavor: Velocity256
           - library: ILGPU.Analyzers
             flavor: Velocity
           - library: ILGPU.Analyzers
             flavor: Velocity128
+          - library: ILGPU.Analyzers
+            flavor: Velocity256
           - os: cuda
             flavor: Velocity
           - os: cuda
             flavor: Velocity128
+          - os: cuda
+            flavor: Velocity256
           - os: cuda
             library: ILGPU.Analyzers
       fail-fast: false
diff --git a/.gitignore b/.gitignore
index ed56d7bae..9b823d2dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -259,6 +259,7 @@ Src/ILGPU/Backends/PTX/PTXLibDeviceMethods.cs
 Src/ILGPU/Backends/PTX/PTXLibDeviceNvvm.cs
 Src/ILGPU/Backends/Velocity/Scalar/ScalarOperations.cs
 Src/ILGPU/Backends/Velocity/Vec128/Vec128Operations.cs
+Src/ILGPU/Backends/Velocity/Vec256/Vec256Operations.cs
 Src/ILGPU/Backends/Velocity/VelocityIntrinsics.Generated.cs
 Src/ILGPU/Frontend/Intrinsic/RemappedIntrinsics.Generated.cs
 Src/ILGPU/HalfConversion.cs
@@ -345,6 +346,7 @@ Src/ILGPU.Tests.Cuda/Configurations.cs
 Src/ILGPU.Tests.OpenCL/Configurations.cs
 Src/ILGPU.Tests.Velocity/Configurations.cs
 Src/ILGPU.Tests.Velocity128/Configurations.cs
+Src/ILGPU.Tests.Velocity256/Configurations.cs
 
 # Generated test source files (Algorithms)
 Src/ILGPU.Algorithms.Tests/Generic/ConfigurationBase.cs
diff --git a/Src/ILGPU.Tests.Velocity256/.editorconfig b/Src/ILGPU.Tests.Velocity256/.editorconfig
new file mode 100644
index 000000000..09af4abce
--- /dev/null
+++ b/Src/ILGPU.Tests.Velocity256/.editorconfig
@@ -0,0 +1,7 @@
+[*.cs]
+
+# CA1707: Identifiers should not contain underscores
+dotnet_diagnostic.CA1707.severity = none
+
+# CA1014: Mark assemblies with CLSCompliant
+dotnet_diagnostic.CA1014.severity = none
diff --git a/Src/ILGPU.Tests.Velocity256/Configurations.tt b/Src/ILGPU.Tests.Velocity256/Configurations.tt
new file mode 100644
index 000000000..2d8e81f01
--- /dev/null
+++ b/Src/ILGPU.Tests.Velocity256/Configurations.tt
@@ -0,0 +1,60 @@
+﻿// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2024 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: Configurations.tt/Configurations.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+<#@ template debug="false" hostspecific="true" language="C#" #>
+<#@ include file="../ILGPU.Tests/Generic/ConfigurationBase.tt" #>
+<#@ assembly name="System.Core" #>
+<#@ import namespace="System.IO" #>
+using Xunit;
+using Xunit.Abstractions;
+
+<#
+var configurationFile = Host.ResolvePath("../ILGPU.Tests/Configurations.txt");
+var configurations = TestConfig.Parse(configurationFile);
+#>
+
+#if NET7_0_OR_GREATER
+
+namespace ILGPU.Tests.Velocity256
+{
+<# foreach (var (test, level, collection) in configurations) { #>
+<#      var name = $"Velocity256{test}_{level}"; #>
+    [Collection("Velocity256ContextCollection<#= collection #>")]
+    public sealed partial class <#= name #> : <#= test #>
+    {
+        public <#= name #>(
+            ITestOutputHelper output,
+            Velocity256TestContext<#= collection #> testContext)
+            : base(output, testContext)
+        { }
+    }
+
+<# } #>
+<# foreach (var (config, level) in TestConfig.AllConfigurations) { #>
+    public class Velocity256TestContext<#= config #> : Velocity256TestContext
+    {
+        public Velocity256TestContext<#= config #>()
+            : base(
+                OptimizationLevel.<#= level #>,
+                enableAssertions: true,
+                forceDebugConfig: true,
+                _ => { })
+        { }
+    }
+
+    [CollectionDefinition("Velocity256ContextCollection<#= config #>")]
+    public class Velocity256ContextCollection<#= config #> :
+        ICollectionFixture<Velocity256TestContext<#= config #>> { }
+
+<# } #>
+}
+
+#endif
\ No newline at end of file
diff --git a/Src/ILGPU.Tests.Velocity256/ILGPU.Tests.Velocity256.csproj b/Src/ILGPU.Tests.Velocity256/ILGPU.Tests.Velocity256.csproj
new file mode 100644
index 000000000..d7a492f70
--- /dev/null
+++ b/Src/ILGPU.Tests.Velocity256/ILGPU.Tests.Velocity256.csproj
@@ -0,0 +1,60 @@
+﻿<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFrameworks>$(LibraryUnitTestTargetFrameworks)</TargetFrameworks>
+    <IsPackable>false</IsPackable>
+  </PropertyGroup>
+
+  <PropertyGroup>
+    <RunSettingsFilePath>$(MSBuildProjectDirectory)\..\ILGPU.Tests\.test.runsettings</RunSettingsFilePath>
+  </PropertyGroup>
+
+  <PropertyGroup>
+    <EnableNETAnalyzers>true</EnableNETAnalyzers>
+    <AnalysisMode>AllEnabledByDefault</AnalysisMode>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.9.0" />
+    <PackageReference Include="xunit" Version="2.6.6" />
+    <PackageReference Include="xunit.runner.visualstudio" Version="2.5.6">
+      <PrivateAssets>all</PrivateAssets>
+      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
+    </PackageReference>
+    <DotNetCliToolReference Include="dotnet-xunit" Version="2.3.1" />
+    <PackageReference Include="T4.Build" Version="0.2.4" PrivateAssets="All" />
+    <PackageReference Include="Xunit.SkippableFact" Version="1.4.13" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\..\Src\ILGPU\ILGPU.csproj" />
+    <ProjectReference Include="..\ILGPU.Tests\ILGPU.Tests.csproj" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <None Include="Configurations.cs">
+      <DesignTime>True</DesignTime>
+      <AutoGen>True</AutoGen>
+      <DependentUpon>Configurations.tt</DependentUpon>
+    </None>
+  </ItemGroup>
+
+  <ItemGroup>
+    <None Update="Configurations.tt">
+      <Generator>TextTemplatingFileGenerator</Generator>
+      <LastGenOutput>Configurations.cs</LastGenOutput>
+    </None>
+  </ItemGroup>
+
+  <ItemGroup>
+    <Service Include="{508349b6-6b84-4df5-91f0-309beebad82d}" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <Compile Update="Configurations.cs">
+      <DesignTime>True</DesignTime>
+      <AutoGen>True</AutoGen>
+      <DependentUpon>Configurations.tt</DependentUpon>
+    </Compile>
+  </ItemGroup>
+
+</Project>
diff --git a/Src/ILGPU.Tests.Velocity256/TestContext.cs b/Src/ILGPU.Tests.Velocity256/TestContext.cs
new file mode 100644
index 000000000..256baf226
--- /dev/null
+++ b/Src/ILGPU.Tests.Velocity256/TestContext.cs
@@ -0,0 +1,51 @@
+﻿// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2024 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: TestContext.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Runtime.Velocity;
+using System;
+
+#if NET7_0_OR_GREATER
+
+namespace ILGPU.Tests.Velocity256
+{
+    /// <summary>
+    /// Abstract base for test contexts that run on a Velocity accelerator created
+    /// from a context configured with <see cref="VelocityDeviceType.Vector256"/>.
+    /// </summary>
+    public abstract class Velocity256TestContext : TestContext
+    {
+        /// <summary>
+        /// Initializes a new Velocity256 test context.
+        /// </summary>
+        /// <param name="optimizationLevel">
+        /// The optimization level passed on to the base test context.
+        /// </param>
+        /// <param name="enableAssertions">
+        /// True to enable the use of assertions.
+        /// </param>
+        /// <param name="forceDebugConfig">
+        /// True to force the debug configuration in O1 and O2 builds.
+        /// </param>
+        /// <param name="prepareContext">
+        /// Callback invoked with the context builder after the Vector256 Velocity
+        /// device has been registered on it.
+        /// </param>
+        protected Velocity256TestContext(
+            OptimizationLevel optimizationLevel,
+            bool enableAssertions,
+            bool forceDebugConfig,
+            Action<Context.Builder> prepareContext)
+            : base(
+                  optimizationLevel,
+                  enableAssertions,
+                  forceDebugConfig,
+                  contextBuilder =>
+                  {
+                      // Register the 256-bit Velocity device first, then hand the
+                      // resulting builder to the caller-supplied hook.
+                      var velocityBuilder =
+                          contextBuilder.Velocity(VelocityDeviceType.Vector256);
+                      prepareContext(velocityBuilder);
+                  },
+                  ilgpuContext => ilgpuContext.CreateVelocityAccelerator())
+        { }
+    }
+}
+
+#endif
diff --git a/Src/ILGPU.sln b/Src/ILGPU.sln
index 89fc63ff4..7cd27280a 100644
--- a/Src/ILGPU.sln
+++ b/Src/ILGPU.sln
@@ -36,6 +36,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ILGPU.Analyzers", "ILGPU.An
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ILGPU.Tests.Velocity128", "ILGPU.Tests.Velocity128\ILGPU.Tests.Velocity128.csproj", "{422BA1AE-858D-4AA4-815B-CF42A429D305}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ILGPU.Tests.Velocity256", "ILGPU.Tests.Velocity256\ILGPU.Tests.Velocity256.csproj", "{F24B884D-A64B-4511-85B6-FEEDA92CBBA1}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -98,6 +100,10 @@ Global
 		{422BA1AE-858D-4AA4-815B-CF42A429D305}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{422BA1AE-858D-4AA4-815B-CF42A429D305}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{422BA1AE-858D-4AA4-815B-CF42A429D305}.Release|Any CPU.Build.0 = Release|Any CPU
+		{F24B884D-A64B-4511-85B6-FEEDA92CBBA1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{F24B884D-A64B-4511-85B6-FEEDA92CBBA1}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{F24B884D-A64B-4511-85B6-FEEDA92CBBA1}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{F24B884D-A64B-4511-85B6-FEEDA92CBBA1}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
@@ -114,6 +120,7 @@ Global
 		{4AFD2AAD-FA52-43EA-B9A8-10E948F9A139} = {7701FE3C-4187-401C-9612-44667203B0E5}
 		{B0101B27-F153-4041-8DEE-741B651453D5} = {7701FE3C-4187-401C-9612-44667203B0E5}
 		{422BA1AE-858D-4AA4-815B-CF42A429D305} = {7701FE3C-4187-401C-9612-44667203B0E5}
+		{F24B884D-A64B-4511-85B6-FEEDA92CBBA1} = {7701FE3C-4187-401C-9612-44667203B0E5}
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {22270DEE-D42D-479D-A76F-B2E7A5F7C949}
diff --git a/Src/ILGPU/Backends/Velocity/Vec256/Vec256.cs b/Src/ILGPU/Backends/Velocity/Vec256/Vec256.cs
new file mode 100644
index 000000000..fd5205282
--- /dev/null
+++ b/Src/ILGPU/Backends/Velocity/Vec256/Vec256.cs
@@ -0,0 +1,413 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2024 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: Vec256.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends.IL;
+using ILGPU.IR.Values;
+using ILGPU.Runtime.Velocity;
+using System;
+using System.Reflection.Emit;
+
+#if NET7_0_OR_GREATER
+
+namespace ILGPU.Backends.Velocity.Vec256
+{
+    /// <summary>
+    /// Velocity target specializer whose emitted calls target the members exposed
+    /// by <see cref="Vec256Operations"/>.
+    /// </summary>
+    /// <remarks>
+    /// Each override only appends IL to the given emitter; operands are expected to
+    /// have been pushed by the caller in the order required by the called method.
+    /// </remarks>
+    sealed class Vec256 : VelocityTargetSpecializer
+    {
+        #region Instance & General Methods
+
+        /// <summary>
+        /// Constructs the specializer from the warp size and the 32-bit and 64-bit
+        /// warp types published by <see cref="Vec256Operations"/>.
+        /// </summary>
+        public Vec256()
+            : base(
+                Vec256Operations.WarpSize,
+                Vec256Operations.WarpType32,
+                Vec256Operations.WarpType64)
+        { }
+
+        /// <summary>
+        /// Creates a new <see cref="Vec256TypeGenerator"/> for the given capability
+        /// context and runtime system.
+        /// </summary>
+        public override VelocityTypeGenerator CreateTypeGenerator(
+            VelocityCapabilityContext capabilityContext,
+            RuntimeSystem runtimeSystem) =>
+            new Vec256TypeGenerator(capabilityContext, runtimeSystem);
+
+        #endregion
+
+        #region General
+
+        public override void LoadLaneIndexVector32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.LoadLaneIndexVector32Method);
+
+        public override void LoadLaneIndexVector64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.LoadLaneIndexVector64Method);
+
+        public override void LoadWarpSizeVector32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.LoadVectorLengthVector32Method);
+
+        public override void LoadWarpSizeVector64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.LoadVectorLengthVector64Method);
+
+        #endregion
+
+        #region Masks
+
+        public override void PushAllLanesMask32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.LoadAllLanesMask32Method);
+
+        public override void PushNoLanesMask32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.LoadNoLanesMask32Method);
+
+        // Mask width conversions reuse the 32<->64 conversions in I mode.
+        public override void ConvertMask32To64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.GetConvert32To64Operation(
+                VelocityWarpOperationMode.I));
+
+        public override void ConvertMask64To32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.GetConvert64To32Operation(
+                VelocityWarpOperationMode.I));
+
+        // Mask intersection and union are emitted as And/Or binary operations in
+        // U mode.
+        public override void IntersectMask32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.GetBinaryOperation32(
+                BinaryArithmeticKind.And,
+                VelocityWarpOperationMode.U));
+
+        public override void IntersectMask64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.GetBinaryOperation64(
+                BinaryArithmeticKind.And,
+                VelocityWarpOperationMode.U));
+
+        public override void UnifyMask32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.GetBinaryOperation32(
+                BinaryArithmeticKind.Or,
+                VelocityWarpOperationMode.U));
+
+        public override void UnifyMask64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.GetBinaryOperation64(
+                BinaryArithmeticKind.Or,
+                VelocityWarpOperationMode.U));
+
+        /// <summary>
+        /// Emits a 32-bit mask negation by pushing the all-lanes mask and emitting a
+        /// Xor operation in U mode.
+        /// </summary>
+        public override void NegateMask32<TILEmitter>(TILEmitter emitter)
+        {
+            PushAllLanesMask32(emitter);
+            BinaryOperation32(
+                emitter,
+                BinaryArithmeticKind.Xor,
+                VelocityWarpOperationMode.U);
+        }
+
+        /// <summary>
+        /// Emits a 64-bit mask negation by pushing the 32-bit all-lanes mask,
+        /// converting it to 64 bits and emitting a Xor operation in U mode.
+        /// </summary>
+        public override void NegateMask64<TILEmitter>(TILEmitter emitter)
+        {
+            PushAllLanesMask32(emitter);
+            ConvertMask32To64(emitter);
+            BinaryOperation64(
+                emitter,
+                BinaryArithmeticKind.Xor,
+                VelocityWarpOperationMode.U);
+        }
+
+        public override void CheckForAnyActiveLaneMask<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.CheckForAnyActiveLaneMethod);
+
+        public override void CheckForNoActiveLaneMask<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.CheckForNoActiveLaneMethod);
+
+        public override void CheckForEqualMasks<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.CheckForEqualMasksMethod);
+
+        public override void GetNumberOfActiveLanes<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.GetNumberOfActiveLanesMethod);
+
+        public override void ConditionalSelect32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Select32Method);
+
+        public override void ConditionalSelect64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Select64Method);
+
+        #endregion
+
+        #region Scalar Values
+
+        public override void LoadWarpSize32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitConstant(WarpSize);
+
+        public override void LoadWarpSize64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitConstant((long)WarpSize);
+
+        /// <summary>
+        /// Emits the given boolean as the integer constant -1 (true) or 0 (false)
+        /// and converts it via <see cref="ConvertScalarTo32"/> in I mode.
+        /// </summary>
+        public override void ConvertBoolScalar<TILEmitter>(TILEmitter emitter, bool value)
+        {
+            emitter.Emit(value ? OpCodes.Ldc_I4_M1 : OpCodes.Ldc_I4_0);
+            ConvertScalarTo32(emitter, VelocityWarpOperationMode.I);
+        }
+
+        /// <summary>
+        /// Emits a call to the 32-bit from-scalar method matching the given mode.
+        /// </summary>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the mode is not I, U or F.
+        /// </exception>
+        public override void ConvertScalarTo32<TILEmitter>(
+            TILEmitter emitter,
+            VelocityWarpOperationMode mode)
+        {
+            switch (mode)
+            {
+                case VelocityWarpOperationMode.I:
+                    emitter.EmitCall(Vec256Operations.FromScalarI32Method);
+                    break;
+                case VelocityWarpOperationMode.U:
+                    emitter.EmitCall(Vec256Operations.FromScalarU32Method);
+                    break;
+                case VelocityWarpOperationMode.F:
+                    emitter.EmitCall(Vec256Operations.FromScalarF32Method);
+                    break;
+                default:
+                    throw new NotSupportedException();
+            }
+        }
+
+        /// <summary>
+        /// Emits a call to the 64-bit from-scalar method matching the given mode.
+        /// </summary>
+        /// <exception cref="NotSupportedException">
+        /// Thrown if the mode is not I, U or F.
+        /// </exception>
+        public override void ConvertScalarTo64<TILEmitter>(
+            TILEmitter emitter,
+            VelocityWarpOperationMode mode)
+        {
+            switch (mode)
+            {
+                case VelocityWarpOperationMode.I:
+                    emitter.EmitCall(Vec256Operations.FromScalarI64Method);
+                    break;
+                case VelocityWarpOperationMode.U:
+                    emitter.EmitCall(Vec256Operations.FromScalarU64Method);
+                    break;
+                case VelocityWarpOperationMode.F:
+                    emitter.EmitCall(Vec256Operations.FromScalarF64Method);
+                    break;
+                default:
+                    throw new NotSupportedException();
+            }
+        }
+
+        #endregion
+
+        #region Comparisons
+
+        public override void Compare32<TILEmitter>(
+            TILEmitter emitter,
+            CompareKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetCompareOperation32(kind, mode));
+
+        public override void Compare64<TILEmitter>(
+            TILEmitter emitter,
+            CompareKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetCompareOperation64(kind, mode));
+
+        #endregion
+
+        #region Conversions
+
+        public override void ConvertSoftware32<TILEmitter>(
+            TILEmitter emitter,
+            ArithmeticBasicValueType sourceType,
+            ArithmeticBasicValueType targetType) =>
+            emitter.EmitCall(Vec256Operations.GetConvertOperation32(
+                sourceType,
+                targetType));
+
+        public override void ConvertSoftware64<TILEmitter>(
+            TILEmitter emitter,
+            ArithmeticBasicValueType sourceType,
+            ArithmeticBasicValueType targetType) =>
+            emitter.EmitCall(Vec256Operations.GetConvertOperation64(
+                sourceType,
+                targetType));
+
+        // The mode-based conversions map both modes to arithmetic basic value types
+        // and then use the same lookup as the software conversions above.
+        public override void Convert32<TILEmitter>(
+            TILEmitter emitter,
+            VelocityWarpOperationMode source,
+            VelocityWarpOperationMode target) =>
+            emitter.EmitCall(Vec256Operations.GetConvertOperation32(
+                source.GetArithmeticBasicValueType(is64Bit: false),
+                target.GetArithmeticBasicValueType(is64Bit: false)));
+
+        public override void Convert64<TILEmitter>(
+            TILEmitter emitter,
+            VelocityWarpOperationMode source,
+            VelocityWarpOperationMode target) =>
+            emitter.EmitCall(Vec256Operations.GetConvertOperation64(
+                source.GetArithmeticBasicValueType(is64Bit: true),
+                target.GetArithmeticBasicValueType(is64Bit: true)));
+
+        public override void Convert32To64<TILEmitter>(
+            TILEmitter emitter,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetConvert32To64Operation(mode));
+
+        public override void Convert64To32<TILEmitter>(
+            TILEmitter emitter,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetConvert64To32Operation(mode));
+
+        #endregion
+
+        #region Arithmetics
+
+        public override void UnaryOperation32<TILEmitter>(
+            TILEmitter emitter,
+            UnaryArithmeticKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetUnaryOperation32(kind, mode));
+
+        public override void UnaryOperation64<TILEmitter>(
+            TILEmitter emitter,
+            UnaryArithmeticKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetUnaryOperation64(kind, mode));
+
+        public override void BinaryOperation32<TILEmitter>(
+            TILEmitter emitter,
+            BinaryArithmeticKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetBinaryOperation32(kind, mode));
+
+        public override void BinaryOperation64<TILEmitter>(
+            TILEmitter emitter,
+            BinaryArithmeticKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetBinaryOperation64(kind, mode));
+
+        public override void TernaryOperation32<TILEmitter>(
+            TILEmitter emitter,
+            TernaryArithmeticKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetTernaryOperation32(kind, mode));
+
+        public override void TernaryOperation64<TILEmitter>(
+            TILEmitter emitter,
+            TernaryArithmeticKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetTernaryOperation64(kind, mode));
+
+        #endregion
+
+        #region Atomics
+
+        public override void AtomicCompareExchange32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.AtomicCompareExchange32Method);
+
+        public override void AtomicCompareExchange64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.AtomicCompareExchange64Method);
+
+        public override void Atomic32<TILEmitter>(
+            TILEmitter emitter,
+            AtomicKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetAtomicOperation32(kind, mode));
+
+        public override void Atomic64<TILEmitter>(
+            TILEmitter emitter,
+            AtomicKind kind,
+            VelocityWarpOperationMode mode) =>
+            emitter.EmitCall(Vec256Operations.GetAtomicOperation64(kind, mode));
+
+        #endregion
+
+        #region Threads
+
+        // NOTE(review): BarrierPopCount32 and BarrierOr32 discard the top stack value
+        // before the call whereas BarrierAnd32 does not. Confirm that this asymmetry
+        // matches the parameter lists of the Vec256Operations barrier methods.
+        public override void BarrierPopCount32<TILEmitter>(TILEmitter emitter)
+        {
+            emitter.Emit(OpCodes.Pop);
+            emitter.EmitCall(Vec256Operations.BarrierPopCount32Method);
+        }
+
+        public override void BarrierAnd32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.BarrierAnd32Method);
+
+        public override void BarrierOr32<TILEmitter>(TILEmitter emitter)
+        {
+            emitter.Emit(OpCodes.Pop);
+            emitter.EmitCall(Vec256Operations.BarrierOr32Method);
+        }
+
+        public override void Shuffle32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Shuffle32Method);
+
+        public override void ShuffleUp32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.ShuffleUp32Method);
+
+        public override void SubShuffleUp32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.SubShuffleUp32Method);
+
+        public override void ShuffleDown32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.ShuffleDown32Method);
+
+        public override void SubShuffleDown32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.SubShuffleDown32Method);
+
+        public override void ShuffleXor32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.ShuffleXor32Method);
+
+        public override void SubShuffleXor32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.SubShuffleXor32Method);
+
+        #endregion
+
+        #region IO
+
+        public override void Load8<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Load8Method);
+
+        public override void Load16<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Load16Method);
+
+        public override void Load32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Load32Method);
+
+        public override void Load64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Load64Method);
+
+        public override void Store8<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Store8Method);
+
+        public override void Store16<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Store16Method);
+
+        public override void Store32<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Store32Method);
+
+        public override void Store64<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.Store64Method);
+
+        #endregion
+
+        #region Misc
+
+        public override void DebugAssertFailed<TILEmitter>(TILEmitter emitter) =>
+            emitter.EmitCall(Vec256Operations.DebugAssertFailedMethod);
+
+        /// <summary>
+        /// Not supported by this specializer.
+        /// </summary>
+        /// <exception cref="NotSupportedException">Always thrown.</exception>
+        public override void WriteToOutput<TILEmitter>(TILEmitter emitter) =>
+            throw new NotSupportedException();
+
+        /// <summary>
+        /// Emits a label string constant followed by a call to the 32-bit dump
+        /// method.
+        /// </summary>
+        /// <param name="emitter">The emitter to append to.</param>
+        /// <param name="label">
+        /// Optional label; when non-empty it is emitted with a trailing ": ",
+        /// otherwise the empty string is emitted.
+        /// </param>
+        public override void DumpWarp32<TILEmitter>(
+            TILEmitter emitter,
+            string? label = null)
+        {
+            emitter.EmitConstant(
+                string.IsNullOrEmpty(label) ? string.Empty : label + ": ");
+            emitter.EmitCall(Vec256Operations.DumpWarp32Method);
+        }
+
+        /// <summary>
+        /// Emits a label string constant followed by a call to the 64-bit dump
+        /// method.
+        /// </summary>
+        /// <param name="emitter">The emitter to append to.</param>
+        /// <param name="label">
+        /// Optional label; when non-empty it is emitted with a trailing ": ",
+        /// otherwise the empty string is emitted.
+        /// </param>
+        public override void DumpWarp64<TILEmitter>(
+            TILEmitter emitter,
+            string? label = null)
+        {
+            emitter.EmitConstant(
+                string.IsNullOrEmpty(label) ? string.Empty : label + ": ");
+            emitter.EmitCall(Vec256Operations.DumpWarp64Method);
+        }
+
+        #endregion
+    }
+}
+
+#endif
diff --git a/Src/ILGPU/Backends/Velocity/Vec256/Vec256Extensions.cs b/Src/ILGPU/Backends/Velocity/Vec256/Vec256Extensions.cs
new file mode 100644
index 000000000..60154e6ce
--- /dev/null
+++ b/Src/ILGPU/Backends/Velocity/Vec256/Vec256Extensions.cs
@@ -0,0 +1,224 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2024 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: Vec256Extensions.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+#if NET7_0_OR_GREATER
+
+namespace ILGPU.Backends.Velocity.Vec256
+{
+    partial class Vec256Operations
+    {
+        #region Rcp
+
+        /// <summary>
+        /// Computes the per-lane reciprocal of a single-precision vector, using
+        /// <see cref="Avx.Reciprocal(Vector256{float})"/> when AVX is supported and a
+        /// plain division otherwise.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<float> RcpImpl(Vector256<float> value)
+        {
+            if (Avx.IsSupported)
+                return Avx.Reciprocal(value);
+            return Vector256.Create(1.0f) / value;
+        }
+
+        /// <summary>
+        /// Computes the per-lane reciprocal of a double-precision vector by division.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<double> RcpImpl(Vector256<double> value)
+        {
+            var one = Vector256.Create(1.0);
+            return one / value;
+        }
+
+        #endregion
+
+        #region Rsqrt
+
+        /// <summary>
+        /// Computes the per-lane reciprocal square root (1 / sqrt(x)) of a
+        /// single-precision vector.
+        /// </summary>
+        /// <remarks>
+        /// Uses <see cref="Avx.ReciprocalSqrt(Vector256{float})"/> when AVX is
+        /// supported. The fallback divides one by the square root of the input so
+        /// that both paths compute the same function (the fallback previously
+        /// returned the plain reciprocal).
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<float> RsqrtImpl(Vector256<float> value) =>
+            Avx.IsSupported
+                ? Avx.ReciprocalSqrt(value)
+                : Vector256.Create(1.0f) / Vector256.Sqrt(value);
+
+        /// <summary>
+        /// Computes the per-lane reciprocal square root (1 / sqrt(x)) of a
+        /// double-precision vector.
+        /// </summary>
+        /// <remarks>
+        /// No dedicated intrinsic is used for doubles here, so the result is one
+        /// divided by the vectorized square root.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<double> RsqrtImpl(Vector256<double> value) =>
+            Vector256.Create(1.0) / Vector256.Sqrt(value);
+
+        #endregion
+
+        #region FMA
+
+        /// <summary>
+        /// Computes a * b + c per lane for 32-bit signed integers.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<int> FMAImpl(
+            Vector256<int> a,
+            Vector256<int> b,
+            Vector256<int> c)
+        {
+            var product = a * b;
+            return product + c;
+        }
+
+        /// <summary>
+        /// Computes a * b + c per lane for 32-bit unsigned integers.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<uint> FMAImpl(
+            Vector256<uint> a,
+            Vector256<uint> b,
+            Vector256<uint> c)
+        {
+            var product = a * b;
+            return product + c;
+        }
+
+        /// <summary>
+        /// Computes a * b + c per lane for 64-bit signed integers.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<long> FMAImpl(
+            Vector256<long> a,
+            Vector256<long> b,
+            Vector256<long> c)
+        {
+            var product = a * b;
+            return product + c;
+        }
+
+        /// <summary>
+        /// Computes a * b + c per lane for 64-bit unsigned integers.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<ulong> FMAImpl(
+            Vector256<ulong> a,
+            Vector256<ulong> b,
+            Vector256<ulong> c)
+        {
+            var product = a * b;
+            return product + c;
+        }
+
+        /// <summary>
+        /// Computes a * b + c per lane for single-precision values, using
+        /// <see cref="Fma.MultiplyAdd(Vector256{float}, Vector256{float}, Vector256{float})"/>
+        /// when FMA is supported and a separate multiply and add otherwise.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<float> FMAImpl(
+            Vector256<float> a,
+            Vector256<float> b,
+            Vector256<float> c)
+        {
+            if (Fma.IsSupported)
+                return Fma.MultiplyAdd(a, b, c);
+            return a * b + c;
+        }
+
+        /// <summary>
+        /// Computes a * b + c per lane for double-precision values, using
+        /// <see cref="Fma.MultiplyAdd(Vector256{double}, Vector256{double}, Vector256{double})"/>
+        /// when FMA is supported and a separate multiply and add otherwise.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<double> FMAImpl(
+            Vector256<double> a,
+            Vector256<double> b,
+            Vector256<double> c)
+        {
+            if (Fma.IsSupported)
+                return Fma.MultiplyAdd(a, b, c);
+            return a * b + c;
+        }
+
+        #endregion
+
+        #region Thread Operations
+
+        /// <summary>
+        /// Counts the lanes that are set in both <paramref name="mask"/> and
+        /// <paramref name="warp"/>.
+        /// </summary>
+        /// <remarks>
+        /// The negated lane sum equals the lane count only if every set lane holds -1
+        /// (all bits set) and every other lane holds 0; this encoding is assumed here.
+        /// The sum is taken over the full 256-bit vector with
+        /// <see cref="Vector256.Sum{T}(Vector256{T})"/>. Reinterpreting the value as a
+        /// <see cref="Vector{T}"/> first would only cover all eight lanes when
+        /// Vector&lt;int&gt; happens to be 256 bits wide on the current hardware.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int BarrierPopCount32Scalar(
+            Vector256<int> mask,
+            Vector256<int> warp) =>
+            -Vector256.Sum(AndI32(mask, warp));
+
+        /// <summary>
+        /// Broadcasts the scalar result of <c>BarrierPopCount32Scalar</c> to every
+        /// lane of a 32-bit warp.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<int> BarrierPopCount32(
+            Vector256<int> mask,
+            Vector256<int> warp)
+        {
+            int activeLanes = BarrierPopCount32Scalar(mask, warp);
+            return Vector256.Create(activeLanes);
+        }
+
+        /// <summary>
+        /// Counts the lanes that are set in both <paramref name="mask"/> and the
+        /// 64-bit <paramref name="warp"/>.
+        /// </summary>
+        /// <remarks>
+        /// The 32-bit mask is widened to 64 bits and combined with both halves of the
+        /// warp. Set lanes are assumed to hold -1 and all others 0, so the negated sum
+        /// of all lanes is the lane count. Both halves contribute to that count, so
+        /// their sums are added (they were previously subtracted, which miscounted
+        /// whenever the upper half had set lanes). The sums use
+        /// <see cref="Vector256.Sum{T}(Vector256{T})"/> so that all lanes are covered
+        /// regardless of the width of <see cref="Vector{T}"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int BarrierPopCount64Scalar(
+            Vector256<int> mask,
+            (Vector256<long>, Vector256<long>) warp)
+        {
+            var parts = AndI64(Convert32To64I(mask), warp);
+            return -(int)(
+                Vector256.Sum(parts.Item1) +
+                Vector256.Sum(parts.Item2));
+        }
+
+        /// <summary>
+        /// Converts the scalar result of <c>BarrierPopCount64Scalar</c> into a 64-bit
+        /// warp via <c>FromScalarI64</c>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static (Vector256<long>, Vector256<long>) BarrierPopCount64(
+            Vector256<int> mask,
+            (Vector256<long>, Vector256<long>) warp)
+        {
+            int activeLanes = BarrierPopCount64Scalar(mask, warp);
+            return FromScalarI64(activeLanes);
+        }
+
+        /// <summary>
+        /// Returns an all-bits-set warp if the barrier pop count equals
+        /// <paramref name="groupSize"/>, and a zero warp otherwise.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        internal static Vector256<int> BarrierAnd32(
+            Vector256<int> mask,
+            Vector256<int> warp,
+            int groupSize)
+        {
+            if (BarrierPopCount32Scalar(mask, warp) == groupSize)
+                return Vector256<int>.AllBitsSet;
+            return Vector256<int>.Zero;
+        }
+
+        /// <summary>
+        /// Returns a 64-bit warp with all bits set if the barrier pop count equals
+        /// <paramref name="groupSize"/>, and a zero warp otherwise.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        internal static (Vector256<long>, Vector256<long>) BarrierAnd64(
+            Vector256<int> mask,
+            (Vector256<long>, Vector256<long>) warp,
+            int groupSize)
+        {
+            bool allLanes = BarrierPopCount64Scalar(mask, warp) == groupSize;
+            // Both halves of the 64-bit warp always carry the same value
+            var half = allLanes ? Vector256<long>.AllBitsSet : Vector256<long>.Zero;
+            return (half, half);
+        }
+
+        /// <summary>
+        /// Returns an all-bits-set warp if the barrier pop count is non-zero, and a
+        /// zero warp otherwise.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<int> BarrierOr32(
+            Vector256<int> mask,
+            Vector256<int> warp)
+        {
+            if (BarrierPopCount32Scalar(mask, warp) != 0)
+                return Vector256<int>.AllBitsSet;
+            return Vector256<int>.Zero;
+        }
+
+        /// <summary>
+        /// Returns a 64-bit warp with all bits set if the barrier pop count is
+        /// non-zero, and a zero warp otherwise.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static (Vector256<long>, Vector256<long>) BarrierOr64(
+            Vector256<int> mask,
+            (Vector256<long>, Vector256<long>) warp)
+        {
+            bool anyLane = BarrierPopCount64Scalar(mask, warp) != 0;
+            // Both halves of the 64-bit warp always carry the same value
+            var half = anyLane ? Vector256<long>.AllBitsSet : Vector256<long>.Zero;
+            return (half, half);
+        }
+
+        /// <summary>
+        /// Broadcasts one lane of <paramref name="value"/> to all lanes.
+        /// </summary>
+        /// <param name="_">The current mask (ignored by the broadcast).</param>
+        /// <param name="value">The warp to read the source lane from.</param>
+        /// <param name="sourceLane">
+        /// A warp whose element 0 holds the index of the lane to broadcast.
+        /// </param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<int> Broadcast32(
+            Vector256<int> _,
+            Vector256<int> value,
+            Vector256<int> sourceLane) =>
+            Broadcast32Internal(value, sourceLane.GetElement(0));
+
+        /// <summary>
+        /// Creates a warp in which every lane holds the element of
+        /// <paramref name="value"/> at <paramref name="sourceLaneIndex"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<int> Broadcast32Internal(
+            Vector256<int> value,
+            int sourceLaneIndex)
+        {
+            int laneValue = value[sourceLaneIndex];
+            return Vector256.Create(laneValue);
+        }
+
+        /// <summary>
+        /// 64-bit broadcast; not implemented and always throws.
+        /// </summary>
+        /// <exception cref="NotImplementedException">Always thrown.</exception>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static (Vector256<long>, Vector256<long>) Broadcast64(
+            Vector256<int> _,
+            (Vector256<long>, Vector256<long>) value,
+            (Vector256<long>, Vector256<long>) sourceLane)
+        {
+            throw new NotImplementedException();
+        }
+
+        /// <summary>
+        /// Builds a warp in which lane i holds the element of
+        /// <paramref name="value"/> selected by lane i of
+        /// <paramref name="sourceLanes"/>.
+        /// </summary>
+        /// <param name="_">The current mask (ignored by the shuffle).</param>
+        /// <param name="value">The warp to read elements from.</param>
+        /// <param name="sourceLanes">
+        /// The per-lane source indices; they are clamped to the range
+        /// [0, WarpSize - 1] before use.
+        /// </param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<int> Shuffle32(
+            Vector256<int> _,
+            Vector256<int> value,
+            Vector256<int> sourceLanes)
+        {
+            // Clamp all indices first so that every element access stays in range
+            var clamped = MinI32(
+                MaxI32(sourceLanes, Vector256<int>.Zero),
+                WarpSizeM1Vector);
+
+            return Vector256.Create(
+                value.GetElement(clamped.GetElement(0)),
+                value.GetElement(clamped.GetElement(1)),
+                value.GetElement(clamped.GetElement(2)),
+                value.GetElement(clamped.GetElement(3)),
+                value.GetElement(clamped.GetElement(4)),
+                value.GetElement(clamped.GetElement(5)),
+                value.GetElement(clamped.GetElement(6)),
+                value.GetElement(clamped.GetElement(7)));
+        }
+
+        /// <summary>
+        /// 64-bit shuffle; not implemented and always throws.
+        /// </summary>
+        /// <exception cref="NotImplementedException">Always thrown.</exception>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static (Vector256<long>, Vector256<long>) Shuffle64(
+            Vector256<int> _,
+            (Vector256<long>, Vector256<long>) value,
+            Vector256<int> sourceLanes)
+        {
+            throw new NotImplementedException();
+        }
+
+        #endregion
+    }
+}
+
+#endif
diff --git a/Src/ILGPU/Backends/Velocity/Vec256/Vec256Operations.tt b/Src/ILGPU/Backends/Velocity/Vec256/Vec256Operations.tt
new file mode 100644
index 000000000..61147f3e2
--- /dev/null
+++ b/Src/ILGPU/Backends/Velocity/Vec256/Vec256Operations.tt
@@ -0,0 +1,1470 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2024 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: Vec256Operations.tt/Vec256Operations.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+<#@ template debug="false" hostspecific="true" language="C#" #>
+<#@ include file="../VelocityOperations.ttinclude" #>
+<#@ assembly name="System.Core" #>
+<#@ import namespace="System.Linq" #>
+<#@ import namespace="System.Text" #>
+<#@ import namespace="System.Collections.Generic" #>
+<#@ output extension=".cs" #>
+<#
+// Template-level state shared by all generator loops of this file.
+
+// Directory passed to the math-operation loaders, resolved relative to this template.
+string rootPath = Host.ResolvePath("../../../Static");
+// Operation descriptions that drive the generated unary/binary/ternary methods.
+var unaryOps = GetUnaryMathOps(rootPath);
+var binaryOps = GetBinaryMathOps(rootPath);
+var ternaryOps = GetTernaryMathOps(rootPath);
+// Compare operations as (name, expression format) pairs; {0} and {1} stand for the
+// two operands and each result is reinterpreted via AsInt32().
+var compareOperations = new (string, string)[]
+{
+    ("Equal", "Vector256.Equals({0}, {1}).AsInt32()"),
+    ("NotEqual", "NotI32(Vector256.Equals({0}, {1}).AsInt32())"),
+    ("LessThan", "Vector256.LessThan({0}, {1}).AsInt32()"),
+    ("LessEqual", "Vector256.LessThanOrEqual({0}, {1}).AsInt32()"),
+    ("GreaterThan", "Vector256.GreaterThan({0}, {1}).AsInt32()"),
+    ("GreaterEqual", "Vector256.GreaterThanOrEqual({0}, {1}).AsInt32()")
+};
+// 32-bit conversions that map to a single Vector256 method.
+// NOTE(review): Left/Right presumably are the source and target types - confirm
+// against the consumer of this table.
+var acceleratedConvTypes32 = new (TypeInformation Left, TypeInformation Right, string Op)[]
+{
+    (SignedIntTypes[2],   FloatTypes[1],       "Vector256.ConvertToSingle"),
+    (UnsignedIntTypes[2], FloatTypes[1],       "Vector256.ConvertToSingle"),
+    (FloatTypes[1],       SignedIntTypes[2],   "Vector256.ConvertToInt32"),
+    (FloatTypes[1],       UnsignedIntTypes[2], "Vector256.ConvertToUInt32"),
+};
+// 64-bit counterpart of the table above.
+var acceleratedConvTypes64 = new (TypeInformation Left, TypeInformation Right, string Op)[]
+{
+    (SignedIntTypes[3],   FloatTypes[2],       "Vector256.ConvertToDouble"),
+    (UnsignedIntTypes[3], FloatTypes[2],       "Vector256.ConvertToDouble"),
+    (FloatTypes[2],       SignedIntTypes[3],   "Vector256.ConvertToInt64"),
+    (FloatTypes[2],       UnsignedIntTypes[3], "Vector256.ConvertToUInt64"),
+};
+
+// Number of lanes per warp (a Vector256 holds eight 32-bit elements).
+int warpSize = 8;
+// A 32-bit warp is a single Vector256 of the given element type.
+string GetWarpTypeName32(string elementTypeName) => $"Vector256<{elementTypeName}>";
+// A 64-bit warp is a tuple of two Vector256 values.
+string GetWarpTypeName64(string elementTypeName) =>
+    $"({GetWarpTypeName32(elementTypeName)}, {GetWarpTypeName32(elementTypeName)})";
+// Expression reading lane 'counter' of a 64-bit warp: lanes 0-3 live in Item1 and
+// lanes 4-7 in Item2.
+string GetItemRef64(string name, int counter) => counter < 4
+    ? $"{name}.Item1.GetElement({counter})"
+    : $"{name}.Item2.GetElement({counter - 4})";
+// Cast helpers: wrap 'variable' in a call to the matching generated cast method, or
+// return it unchanged for the "I" prefix, which needs no cast.
+string GetCastIToX32(string prefix, string variable) =>
+    prefix != "I" ? $"CastITo{prefix}32({variable})" : variable;
+string GetCastIToX64(string prefix, string variable) =>
+    prefix != "I" ? $"CastITo{prefix}64({variable})" : variable;
+string GetCastXToI32(string prefix, string variable) =>
+    prefix != "I" ? $"Cast{prefix}ToI32({variable})" : variable;
+string GetCastXToI64(string prefix, string variable) =>
+    prefix != "I" ? $"Cast{prefix}ToI64({variable})" : variable;
+
+// Raw warp types used in generated signatures.
+var warpType32 = GetWarpTypeName32("int");
+var warpType64 = GetWarpTypeName64("long");
+// MethodImplOptions member applied to the generated methods.
+string inliningAttribute = "AggressiveInlining";
+#>
+using ILGPU.IR.Values;
+using ILGPU.Util;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+using System.Numerics;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+
+// ReSharper disable ArrangeMethodOrOperatorBody
+// ReSharper disable RedundantCast
+// disable: max_line_length
+
+#if NET7_0_OR_GREATER
+
+namespace ILGPU.Backends.Velocity.Vec256
+{
+    // Operation implementations
+
+    static partial class Vec256Operations
+    {
+        #region Warp Types
+
+        /// <summary>
+        /// The number of lanes per warp, i.e. the number of 32-bit elements in a
+        /// 256-bit vector.
+        /// </summary>
+        public static int WarpSize => Vector256<int>.Count;
+        /// <summary>
+        /// The CLR type used to represent a warp of 32-bit values.
+        /// </summary>
+        public static readonly Type WarpType32 = typeof(<#= warpType32 #>);
+        /// <summary>
+        /// The CLR type used to represent a warp of 64-bit values (a tuple of two
+        /// 256-bit vectors).
+        /// </summary>
+        public static readonly Type WarpType64 = typeof(<#= warpType64 #>);
+
+        #endregion
+
+        #region Initialization
+
+        /// <summary>
+        /// Runs all operation-table initializers once, when the type is first used.
+        /// </summary>
+        static Vec256Operations()
+        {
+            InitUnaryOperations();
+            InitBinaryOperations();
+            InitTernaryOperations();
+            InitializeCompareOperations();
+            InitializeConvertOperations();
+            InitializeVectorConvertOperations();
+            InitializeAtomicOperations();
+        }
+
+        /// <summary>
+        /// A warp in which every lane holds the highest valid lane index
+        /// (<see cref="WarpSize"/> - 1).
+        /// </summary>
+        private static readonly Vector256<int> WarpSizeM1Vector =
+            Vector256.Create(WarpSize - 1);
+
+        /// <summary>
+        /// Looks up a non-public static method of this class by name.
+        /// </summary>
+        /// <param name="name">The name of the method to resolve.</param>
+        /// <returns>The resolved method, passed through AsNotNull.</returns>
+        internal static MethodInfo GetMethod(string name)
+        {
+            const BindingFlags Flags = BindingFlags.NonPublic | BindingFlags.Static;
+            var method = typeof(Vec256Operations).GetMethod(name, Flags);
+            return method.AsNotNull();
+        }
+
+        #endregion
+
+        #region Creation
+
+        /// <summary>
+        /// Reinterprets a 32-bit warp as a vector of another element type without
+        /// changing its bits.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static Vector256<TTarget> CastWarp32<T, TTarget>(Vector256<T> source)
+            where T : struct
+            where TTarget : struct
+        {
+            return Vector256.As<T, TTarget>(source);
+        }
+
+        /// <summary>
+        /// Reinterprets both halves of a 64-bit warp as vectors of another element
+        /// type without changing their bits.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static (Vector256<TTarget>, Vector256<TTarget>) CastWarp64<T, TTarget>(
+            (Vector256<T>, Vector256<T>) source)
+            where T : struct
+            where TTarget : struct
+        {
+            var (lower, upper) = source;
+            return (Vector256.As<T, TTarget>(lower), Vector256.As<T, TTarget>(upper));
+        }
+
+        /// <summary>
+        /// Widens a 32-bit lane mask to a 64-bit warp (two vectors of four lanes).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> MaskTo64(Vector256<int> mask)
+        {
+            var (lower, upper) = Vector256.Widen(mask);
+            return (lower, upper);
+        }
+
+        /// <summary>
+        /// Returns true if at least one lane of <paramref name="warp"/> has all bits
+        /// set.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static bool CheckForAnyActiveLane(<#= warpType32 #> warp)
+        {
+            var allLanes = <#= warpType32 #>.AllBitsSet;
+            return Vector256.EqualsAny(allLanes, warp);
+        }
+
+        /// <summary>
+        /// Returns true if no lane of <paramref name="warp"/> has all bits set.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static bool CheckForNoActiveLane(<#= warpType32 #> warp)
+        {
+            bool anyActive = CheckForAnyActiveLane(warp);
+            return !anyActive;
+        }
+
+        /// <summary>
+        /// Returns true if both masks are equal in every lane.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static bool CheckForEqualMasks(
+            <#= warpType32 #> firstMask,
+            <#= warpType32 #> secondMask)
+        {
+            return Vector256.EqualsAll(firstMask, secondMask);
+        }
+
+        /// <summary>
+        /// Returns the negated sum of all lanes of <paramref name="warp"/>. This is
+        /// the number of active lanes when active lanes hold -1 and all others 0
+        /// (assumed mask encoding).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static int GetNumberOfActiveLanes(<#= warpType32 #> warp)
+        {
+            int laneSum = Vector256.Sum(warp);
+            return -laneSum;
+        }
+
+        // Reflection handles to CheckForAnyActiveLane, CheckForNoActiveLane,
+        // CheckForEqualMasks and GetNumberOfActiveLanes, resolved by name.
+        public static readonly MethodInfo CheckForAnyActiveLaneMethod =
+            GetMethod(nameof(CheckForAnyActiveLane));
+        public static readonly MethodInfo CheckForNoActiveLaneMethod =
+            GetMethod(nameof(CheckForNoActiveLane));
+        public static readonly MethodInfo CheckForEqualMasksMethod =
+            GetMethod(nameof(CheckForEqualMasks));
+        public static readonly MethodInfo GetNumberOfActiveLanesMethod =
+            GetMethod(nameof(GetNumberOfActiveLanes));
+
+        /// <summary>
+        /// A 32-bit warp in which lane i holds the value i.
+        /// </summary>
+        private static readonly <#= warpType32 #> LaneIndexVector32 =
+            Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7);
+
+        /// <summary>
+        /// Returns the cached 32-bit lane index warp.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> LoadLaneIndexVector32()
+        {
+            return LaneIndexVector32;
+        }
+
+        /// <summary>
+        /// A 64-bit warp in which lane i holds the value i; lanes 0-3 are stored in
+        /// the first vector and lanes 4-7 in the second.
+        /// </summary>
+        private static readonly <#= warpType64 #> LaneIndexVector64 =
+            (Vector256.Create(0L, 1L, 2L, 3L), Vector256.Create(4L, 5L, 6L, 7L));
+
+        /// <summary>
+        /// Returns the cached 64-bit lane index warp.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> LoadLaneIndexVector64()
+        {
+            return LaneIndexVector64;
+        }
+
+        // Reflection handles to the two lane index loaders
+        public static readonly MethodInfo LoadLaneIndexVector32Method =
+            GetMethod(nameof(LoadLaneIndexVector32));
+        public static readonly MethodInfo LoadLaneIndexVector64Method =
+            GetMethod(nameof(LoadLaneIndexVector64));
+
+        /// <summary>
+        /// A 32-bit warp in which every lane holds the number of lanes per warp.
+        /// </summary>
+        /// <remarks>
+        /// The value is taken from <see cref="WarpSize"/> (the lane count of a 256-bit
+        /// vector). <c>Vector&lt;int&gt;.Count</c> depends on the hardware vector
+        /// width and only matches the warp size when it happens to be 256 bits.
+        /// </remarks>
+        private static readonly <#= warpType32 #> LaneLengthVector32 =
+            Vector256.Create(WarpSize);
+
+        /// <summary>
+        /// Returns the cached 32-bit warp holding the number of lanes per warp.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> LoadVectorLengthVector32() => LaneLengthVector32;
+
+        /// <summary>
+        /// Returns a 64-bit warp in which every lane holds the number of lanes per
+        /// warp.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> LoadVectorLengthVector64()
+        {
+            // Use the 256-bit lane count, consistent with the 32-bit variant
+            long count = WarpSize;
+            return (Vector256.Create(count), Vector256.Create(count));
+        }
+
+        // Reflection handles to LoadVectorLengthVector32 and LoadVectorLengthVector64
+        public static readonly MethodInfo LoadVectorLengthVector32Method =
+            GetMethod(nameof(LoadVectorLengthVector32));
+        public static readonly MethodInfo LoadVectorLengthVector64Method =
+            GetMethod(nameof(LoadVectorLengthVector64));
+
+        /// <summary>
+        /// Returns a 32-bit mask with all bits set in every lane.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> LoadAllLanesMask32()
+        {
+            return <#= warpType32 #>.AllBitsSet;
+        }
+
+        /// <summary>
+        /// Returns a 64-bit mask with all bits set in every lane of both halves.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> LoadAllLanesMask64()
+        {
+            var half = Vector256<long>.AllBitsSet;
+            return (half, half);
+        }
+
+        // Reflection handles to the two all-lanes mask loaders
+        public static readonly MethodInfo LoadAllLanesMask32Method =
+            GetMethod(nameof(LoadAllLanesMask32));
+        public static readonly MethodInfo LoadAllLanesMask64Method =
+            GetMethod(nameof(LoadAllLanesMask64));
+
+        /// <summary>
+        /// Returns a 32-bit mask with no bits set (the default value of the warp
+        /// type).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> LoadNoLanesMask32()
+        {
+            return default;
+        }
+
+        /// <summary>
+        /// Returns a 64-bit mask with no bits set (the default value of the warp
+        /// type).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> LoadNoLanesMask64()
+        {
+            return default;
+        }
+
+        // Reflection handles to the two no-lanes mask loaders
+        public static readonly MethodInfo LoadNoLanesMask32Method =
+            GetMethod(nameof(LoadNoLanesMask32));
+        public static readonly MethodInfo LoadNoLanesMask64Method =
+            GetMethod(nameof(LoadNoLanesMask64));
+
+        #endregion
+
+        #region Generic Casts
+
+<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #>
+        // Reinterprets a typed 32-bit warp as the raw int-based warp type.
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> Cast<#= prefix #>ToI32(
+            <#= GetWarpTypeName32(typeName) #> input) =>
+            CastWarp32<<#= typeName #>, int>(input);
+
+<#      if (typeName != "int") { #>
+        // Reverse cast; skipped for int because only one cast method is generated
+        // for that type (NOTE(review): presumably because both names would
+        // coincide for the int prefix - confirm against ImplementationTypes32).
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= GetWarpTypeName32(typeName) #> CastITo<#= prefix #>32(
+            <#= warpType32 #> input) =>
+            CastWarp32<int, <#= typeName #>>(input);
+
+        public static readonly MethodInfo Cast<#= prefix #>ToI32Method =
+            GetMethod(nameof(Cast<#= prefix #>ToI32));
+
+<#      } #>
+        // Reflection handle that is emitted for every type, including int.
+        public static readonly MethodInfo CastITo<#= prefix #>32Method =
+            GetMethod(nameof(CastITo<#= prefix #>32));
+
+<# } #>
+
+<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #>
+        // Reinterprets a typed 64-bit warp as the raw long-based warp type.
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> Cast<#= prefix #>ToI64(
+            <#= GetWarpTypeName64(typeName) #> input) =>
+            CastWarp64<<#= typeName #>, long>(input);
+
+<#      if (typeName != "long") { #>
+        // Reverse cast; skipped for long because only one cast method is generated
+        // for that type (NOTE(review): presumably because both names would
+        // coincide for the long prefix - confirm against ImplementationTypes64).
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= GetWarpTypeName64(typeName) #> CastITo<#= prefix #>64(
+            <#= warpType64 #> input) =>
+            CastWarp64<long, <#= typeName #>>(input);
+
+        public static readonly MethodInfo Cast<#= prefix #>ToI64Method =
+            GetMethod(nameof(Cast<#= prefix #>ToI64));
+<#      } #>
+
+        // Reflection handle that is emitted for every type, including long.
+        public static readonly MethodInfo CastITo<#= prefix #>64Method =
+            GetMethod(nameof(CastITo<#= prefix #>64));
+
+<# } #>
+
+        #endregion
+
+        #region Scalar Operations
+
+<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #>
+        /// <summary>
+        /// Broadcasts a scalar to all lanes and returns the result as a raw 32-bit
+        /// warp.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> FromScalar<#= prefix #>32(<#= typeName #> scalar) =>
+            <#= GetCastXToI32(prefix, "Vector256.Create(scalar)") #>;
+
+        public static readonly MethodInfo FromScalar<#= prefix #>32Method =
+            GetMethod(nameof(FromScalar<#= prefix #>32));
+
+<# } #>
+
+<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #>
+        /// <summary>
+        /// Broadcasts a scalar to all lanes of both halves and returns the result as
+        /// a raw 64-bit warp.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> FromScalar<#= prefix #>64(<#= typeName #> scalar)
+        {
+            // The same broadcast vector serves as lower and upper half
+            var half = Vector256.Create(scalar);
+            return <#= GetCastXToI64(prefix, "(half, half)") #>;
+        }
+
+        public static readonly MethodInfo FromScalar<#= prefix #>64Method =
+            GetMethod(nameof(FromScalar<#= prefix #>64));
+
+<# } #>
+
+        #endregion
+
+        #region Select Operations
+
+        /// <summary>
+        /// Selects per bit between two 32-bit warps: where <paramref name="mask"/> is
+        /// set the result takes <paramref name="right"/>, elsewhere
+        /// <paramref name="left"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> Select32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> left,
+            <#= warpType32 #> right)
+        {
+            return Vector256.ConditionalSelect(mask, right, left);
+        }
+
+        /// <summary>
+        /// Selects per bit between two 64-bit warps: where the widened
+        /// <paramref name="mask"/> is set the result takes <paramref name="right"/>,
+        /// elsewhere <paramref name="left"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> Select64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> left,
+            <#= warpType64 #> right)
+        {
+            // Widen the 32-bit mask so that each half gets its own 64-bit mask
+            var (maskLower, maskUpper) = MaskTo64(mask);
+            var lower = Vector256.ConditionalSelect(maskLower, right.Item1, left.Item1);
+            var upper = Vector256.ConditionalSelect(maskUpper, right.Item2, left.Item2);
+            return (lower, upper);
+        }
+
+        // Reflection handles to Select32 and Select64
+        public static readonly MethodInfo Select32Method = GetMethod(nameof(Select32));
+        public static readonly MethodInfo Select64Method = GetMethod(nameof(Select64));
+
+        #endregion
+
+        #region Unary Operations
+
+<# foreach (var op in unaryOps) { #>
+<#      foreach (var (_, prefix, typeName, _) in
+            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
+<#          var velocity = op.Velocity.Velocity256; #>
+        // Generated unary operation on a raw 32-bit warp (one method per operation
+        // and per implementation type whose flags match the operation).
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32(
+            <#= warpType32 #> warp)
+        {
+            // Reinterpret the raw warp in the element type of this overload
+            var value = <#= GetCastIToX32(prefix, "warp") #>;
+<#              if (velocity.SoftwareEmulation) { #>
+            // Software emulation: one expression per lane (8 lanes); predicates are
+            // mapped to -1 (true) or 0 (false)
+            var result = Vector256.Create(
+                <#= velocity.GetImplementation(op, 0, "value") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 1, "value") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 2, "value") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : ","#>
+                <#= velocity.GetImplementation(op, 3, "value") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : ","#>
+                <#= velocity.GetImplementation(op, 4, "value") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : ","#>
+                <#= velocity.GetImplementation(op, 5, "value") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : ","#>
+                <#= velocity.GetImplementation(op, 6, "value") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : ","#>
+                <#= velocity.GetImplementation(op, 7, "value") #>
+                <#= op.IsPredicate ? " ? -1 : 0" : ""#>);
+<#              } else { #>
+            // Single expression applied to the whole vector
+            var result = <#= velocity.GetImplementation(op, variables: "value") #>;
+<#              } #>
+<#              if (!op.IsPredicate && !op.Velocity.ReturnAsWarp32) { #>
+            // Reinterpret the typed result as a raw 32-bit warp
+            return <#= GetCastXToI32(prefix, "result") #>;
+<#              } else { #>
+            // Predicates and warp32 results are returned without a cast
+            return result;
+<#              } #>
+        }
+
+<#      } #>
+<#      foreach (var (_, prefix, typeName, _) in
+            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
+<#          bool use32BitResult = op.Velocity.ReturnAsWarp32 | op.IsPredicate; #>
+<#          var returnType = use32BitResult ? warpType32 : warpType64; #>
+<#          var velocity = op.Velocity.Velocity256; #>
+        // Generated unary operation on a raw 64-bit warp. Predicates and operations
+        // flagged ReturnAsWarp32 return a 32-bit warp; all others a 64-bit warp.
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= returnType #> <#= op.Name #><#= prefix #>64(
+            <#= warpType64 #> warp)
+        {
+            // Reinterpret the raw warp in the element type of this overload
+            var value = <#= GetCastIToX64(prefix, "warp") #>;
+<#              if (velocity.SoftwareEmulation && use32BitResult) { #>
+            // Software emulation with a 32-bit result: four lanes from each half are
+            // packed into one vector of eight 32-bit lanes
+            var result = Vector256.Create(
+                <#= velocity.GetImplementation(op, 0, "value.Item1") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 1, "value.Item1") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 2, "value.Item1") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 3, "value.Item1") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 0, "value.Item2") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 1, "value.Item2") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 2, "value.Item2") #>
+                <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                <#= velocity.GetImplementation(op, 3, "value.Item2") #>
+                <#= op.IsPredicate ? " ? -1 : 0" : "" #>);
+<#              } else if (velocity.SoftwareEmulation) { #>
+            // Software emulation with a 64-bit result: each half is rebuilt from its
+            // own four lanes
+            var result = (
+                Vector256.Create(
+                    <#= velocity.GetImplementation(op, 0, "value.Item1") #>
+                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                    <#= velocity.GetImplementation(op, 1, "value.Item1") #>
+                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                    <#= velocity.GetImplementation(op, 2, "value.Item1") #>
+                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                    <#= velocity.GetImplementation(op, 3, "value.Item1") #>
+                    <#= op.IsPredicate ? " ? -1 : 0" : "" #>),
+                Vector256.Create(
+                    <#= velocity.GetImplementation(op, 0, "value.Item2") #>
+                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                    <#= velocity.GetImplementation(op, 1, "value.Item2") #>
+                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                    <#= velocity.GetImplementation(op, 2, "value.Item2") #>
+                    <#= op.IsPredicate ? " ? -1 : 0," : "," #>
+                    <#= velocity.GetImplementation(op, 3, "value.Item2") #>
+                    <#= op.IsPredicate ? " ? -1 : 0" : "" #>));
+<#              } else if (use32BitResult) { #>
+            // Single expression that consumes the whole 64-bit warp
+            var result = <#= velocity.GetImplementation(op, variables: "value") #>;
+<#              } else { #>
+            // Single expression applied to each half separately
+            var result = (
+                    <#= velocity.GetImplementation(op, variables: "value.Item1") #>,
+                    <#= velocity.GetImplementation(op, variables: "value.Item2") #>);
+<#              } #>
+<#              if (!op.IsPredicate && !op.Velocity.ReturnAsWarp32) { #>
+            // Reinterpret the typed result as a raw 64-bit warp
+            return <#= GetCastXToI64(prefix, "result") #>;
+<#              } else { #>
+            // Predicates and warp32 results are returned without a cast
+            return result;
+<#              } #>
+        }
+
+<#      } #>
+
+<# } #>
+
+        // Lookup tables from (operation kind, warp operation mode) to the generated
+        // unary method, filled by InitUnaryOperations.
+        private static readonly Dictionary<
+            (UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
+            UnaryOperations32 = new();
+        private static readonly Dictionary<
+            (UnaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
+            UnaryOperations64 = new();
+
+        /// <summary>
+        /// Registers every generated unary method in the 32-bit and 64-bit lookup
+        /// tables, keyed by operation kind and warp operation mode.
+        /// </summary>
+        private static void InitUnaryOperations()
+        {
+<# foreach (var op in unaryOps) { #>
+<#      foreach (var (_, prefix, _, _) in
+            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
+            UnaryOperations32.Add(
+                (UnaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
+                GetMethod(nameof(<#= op.Name #><#= prefix #>32)));
+<#      } #>
+
+<#      foreach (var (_, prefix, _, _) in
+            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
+            UnaryOperations64.Add(
+                (UnaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
+                GetMethod(nameof(<#= op.Name #><#= prefix #>64)));
+<#      } #>
+<# } #>
+        }
+
+        /// <summary>
+        /// Returns the registered 32-bit unary method for the given kind and mode.
+        /// </summary>
+        public static MethodInfo GetUnaryOperation32(
+            UnaryArithmeticKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            return UnaryOperations32[(kind, mode)];
+        }
+
+        /// <summary>
+        /// Returns the registered 64-bit unary method for the given kind and mode.
+        /// </summary>
+        public static MethodInfo GetUnaryOperation64(
+            UnaryArithmeticKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            return UnaryOperations64[(kind, mode)];
+        }
+
+        #endregion
+
+        #region Binary Operations
+
+<# foreach (var op in binaryOps) { #>
+<#      foreach (var (_, prefix, typeName, _) in
+            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
+<#          var velocity = op.Velocity?.Velocity256; #>
+        // Generated binary operation on raw 32-bit warps (one method per operation
+        // and per implementation type whose flags match the operation).
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32(
+            <#= warpType32 #> first,
+            <#= warpType32 #> second)
+        {
+            // Reinterpret both raw warps in the element type of this overload
+            var left = <#= GetCastIToX32(prefix, "first") #>;
+            var right = <#= GetCastIToX32(prefix, "second") #>;
+
+<#          if (velocity == null) { #>
+            // No Velocity-specific configuration: use the generic operator or call
+            var result = <#= op.GetOpOrCall(isBool: false, "left", "right") #>;
+<#          } else if (velocity.SoftwareEmulation) { #>
+            // Software emulation: one expression per lane (8 lanes)
+            var result = Vector256.Create(
+                <#= velocity.GetImplementation(op, 0, "left", "right") #>,
+                <#= velocity.GetImplementation(op, 1, "left", "right") #>,
+                <#= velocity.GetImplementation(op, 2, "left", "right") #>,
+                <#= velocity.GetImplementation(op, 3, "left", "right") #>,
+                <#= velocity.GetImplementation(op, 4, "left", "right") #>,
+                <#= velocity.GetImplementation(op, 5, "left", "right") #>,
+                <#= velocity.GetImplementation(op, 6, "left", "right") #>,
+                <#= velocity.GetImplementation(op, 7, "left", "right") #>);
+<#          } else { #>
+            // Single expression applied to the whole vectors
+            var result = <#= velocity.GetImplementation(op, null, "left", "right") #>;
+<#          } #>
+            // Reinterpret the typed result as a raw 32-bit warp
+            return <#= GetCastXToI32(prefix, "result") #>;
+        }
+
+<#      } #>
+<#      foreach (var (_, prefix, typeName, _) in
+            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
+<#          var velocity = op.Velocity?.Velocity256; #>
+        // Generated binary operation on raw 64-bit warps (one method per operation
+        // and per implementation type whose flags match the operation).
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> <#= op.Name #><#= prefix #>64(
+            <#= warpType64 #> first,
+            <#= warpType64 #> second)
+        {
+            // Reinterpret both raw warps in the element type of this overload
+            var left = <#= GetCastIToX64(prefix, "first") #>;
+            var right = <#= GetCastIToX64(prefix, "second") #>;
+
+<#          if (velocity == null) { #>
+            // No Velocity-specific configuration: use the generic operator or call
+            // on each half
+            var result = (
+                <#= op.GetOpOrCall(isBool: false, "left.Item1", "right.Item1") #>,
+                <#= op.GetOpOrCall(isBool: false, "left.Item2", "right.Item2") #>);
+<#          } else if (velocity.SoftwareEmulation) { #>
+            // Software emulation: each half is rebuilt from its own four lanes, so
+            // the lane indices must run 0, 1, 2, 3 for both halves
+            var result = (
+                Vector256.Create(
+                    <#= velocity.GetImplementation(op, 0, "left.Item1", "right.Item1") #>,
+                    <#= velocity.GetImplementation(op, 1, "left.Item1", "right.Item1") #>,
+                    <#= velocity.GetImplementation(op, 2, "left.Item1", "right.Item1") #>,
+                    <#= velocity.GetImplementation(op, 3, "left.Item1", "right.Item1") #>),
+                Vector256.Create(
+                    <#= velocity.GetImplementation(op, 0, "left.Item2", "right.Item2") #>,
+                    <#= velocity.GetImplementation(op, 1, "left.Item2", "right.Item2") #>,
+                    <#= velocity.GetImplementation(op, 2, "left.Item2", "right.Item2") #>,
+                    <#= velocity.GetImplementation(op, 3, "left.Item2", "right.Item2") #>));
+<#          } else { #>
+            // Single expression applied to each half separately
+            var result = (
+                <#= velocity.GetImplementation(op, null, "left.Item1", "right.Item1") #>,
+                <#= velocity.GetImplementation(op, null, "left.Item2", "right.Item2") #>);
+<#          } #>
+
+            // Reinterpret the typed result as a raw 64-bit warp
+            return <#= GetCastXToI64(prefix, "result") #>;
+        }
+
+<#      } #>
+<# } #>
+
+        // Lookup tables mapping (operation kind, warp operation mode) to the
+        // generated 32-bit and 64-bit binary operation methods.
+        private static readonly Dictionary<
+            (BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
+            BinaryOperations32 = new();
+        private static readonly Dictionary<
+            (BinaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
+            BinaryOperations64 = new();
+
+        /// <summary>
+        /// Registers every generated binary operation method in the 32-bit and
+        /// 64-bit lookup tables. Only the element types whose flags overlap with
+        /// the operation flags are registered, mirroring the generated methods.
+        /// </summary>
+        private static void InitBinaryOperations()
+        {
+<# foreach (var op in binaryOps) { #>
+<#      foreach (var (_, prefix, _, _) in
+            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
+            BinaryOperations32.Add(
+                (BinaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
+                GetMethod(nameof(<#= op.Name #><#= prefix #>32)));
+<#      } #>
+
+<#      foreach (var (_, prefix, _, _) in
+            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
+            BinaryOperations64.Add(
+                (BinaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
+                GetMethod(nameof(<#= op.Name #><#= prefix #>64)));
+<#      } #>
+<# } #>
+        }
+
+        /// <summary>
+        /// Looks up the 32-bit binary operation method registered for the given
+        /// kind and warp operation mode.
+        /// </summary>
+        /// <param name="kind">The binary arithmetic kind.</param>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetBinaryOperation32(
+            BinaryArithmeticKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            var key = (kind, mode);
+            return BinaryOperations32[key];
+        }
+        /// <summary>
+        /// Looks up the 64-bit binary operation method registered for the given
+        /// kind and warp operation mode.
+        /// </summary>
+        /// <param name="kind">The binary arithmetic kind.</param>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetBinaryOperation64(
+            BinaryArithmeticKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            var key = (kind, mode);
+            return BinaryOperations64[key];
+        }
+
+        #endregion
+
+        #region Ternary Operations
+
+<# foreach (var op in ternaryOps) { #>
+<#      foreach (var (_, prefix, typeName, _) in
+            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
+        /// <summary>
+        /// Generated ternary operation on a 32-bit warp: the three operands are
+        /// cast to the element type selected by the template prefix, combined by
+        /// the Velocity256 implementation, and the result is cast back.
+        /// </summary>
+        /// <param name="first">The first operand warp.</param>
+        /// <param name="second">The second operand warp.</param>
+        /// <param name="third">The third operand warp.</param>
+        /// <returns>The combined warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> <#= op.Name #><#= prefix #>32(
+            <#= warpType32 #> first,
+            <#= warpType32 #> second,
+            <#= warpType32 #> third)
+        {
+            var a = <#= GetCastIToX32(prefix, "first") #>;
+            var b = <#= GetCastIToX32(prefix, "second") #>;
+            var c = <#= GetCastIToX32(prefix, "third") #>;
+
+            // NOTE(review): the template dereferences op.Velocity.Velocity256
+            // without a null check, so every ternary operation presumably has to
+            // provide a Velocity256 configuration - confirm.
+            var result = <#= op.Velocity.Velocity256.GetImplementation(
+                op,
+                null,
+                "a", "b", "c") #>;
+
+            return <#= GetCastXToI32(prefix, "result") #>;
+        }
+
+<#      } #>
+<#      foreach (var (_, prefix, typeName, _) in
+            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
+        /// <summary>
+        /// Generated ternary operation on a 64-bit warp (a pair of vectors): the
+        /// three operands are cast to the element type selected by the template
+        /// prefix, each half is combined by the Velocity256 implementation, and
+        /// the result pair is cast back.
+        /// </summary>
+        /// <param name="first">The first operand warp.</param>
+        /// <param name="second">The second operand warp.</param>
+        /// <param name="third">The third operand warp.</param>
+        /// <returns>The combined warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> <#= op.Name #><#= prefix #>64(
+            <#= warpType64 #> first,
+            <#= warpType64 #> second,
+            <#= warpType64 #> third)
+        {
+            var a = <#= GetCastIToX64(prefix, "first") #>;
+            var b = <#= GetCastIToX64(prefix, "second") #>;
+            var c = <#= GetCastIToX64(prefix, "third") #>;
+
+            // First half of the warp.
+            var result1 = <#= op.Velocity.Velocity256.GetImplementation(
+                op,
+                null,
+                "a.Item1", "b.Item1", "c.Item1") #>;
+            // Second half of the warp.
+            var result2 = <#= op.Velocity.Velocity256.GetImplementation(
+                op,
+                null,
+                "a.Item2", "b.Item2", "c.Item2") #>;
+
+            return <#= GetCastXToI64(prefix, "(result1, result2)") #>;
+        }
+
+<#      } #>
+<# } #>
+
+        // Lookup tables mapping (operation kind, warp operation mode) to the
+        // generated 32-bit and 64-bit ternary operation methods.
+        private static readonly Dictionary<
+            (TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
+            TernaryOperations32 = new();
+        private static readonly Dictionary<
+            (TernaryArithmeticKind, VelocityWarpOperationMode), MethodInfo>
+            TernaryOperations64 = new();
+
+        /// <summary>
+        /// Registers every generated ternary operation method in the 32-bit and
+        /// 64-bit lookup tables. Only the element types whose flags overlap with
+        /// the operation flags are registered, mirroring the generated methods.
+        /// </summary>
+        private static void InitTernaryOperations()
+        {
+<# foreach (var op in ternaryOps) { #>
+<#      foreach (var (_, prefix, _, _) in
+            ImplementationTypes32.Where(t => (t.Flags & op.Flags) != 0)) { #>
+            TernaryOperations32.Add(
+                (TernaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
+                GetMethod(nameof(<#= op.Name #><#= prefix #>32)));
+<#      } #>
+
+<#      foreach (var (_, prefix, _, _) in
+            ImplementationTypes64.Where(t => (t.Flags & op.Flags) != 0)) { #>
+            TernaryOperations64.Add(
+                (TernaryArithmeticKind.<#= op.Name #>, VelocityWarpOperationMode.<#= prefix #>),
+                GetMethod(nameof(<#= op.Name #><#= prefix #>64)));
+<#      } #>
+<# } #>
+        }
+
+        /// <summary>
+        /// Looks up the 32-bit ternary operation method registered for the given
+        /// kind and warp operation mode.
+        /// </summary>
+        /// <param name="kind">The ternary arithmetic kind.</param>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetTernaryOperation32(
+            TernaryArithmeticKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            var key = (kind, mode);
+            return TernaryOperations32[key];
+        }
+        /// <summary>
+        /// Looks up the 64-bit ternary operation method registered for the given
+        /// kind and warp operation mode.
+        /// </summary>
+        /// <param name="kind">The ternary arithmetic kind.</param>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetTernaryOperation64(
+            TernaryArithmeticKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            var key = (kind, mode);
+            return TernaryOperations64[key];
+        }
+
+        #endregion
+
+        #region Compare Operations
+
+<# foreach (var (kind, op) in compareOperations) { #>
+<#      foreach (var (_, prefix, _, _) in ImplementationTypes32) { #>
+        /// <summary>
+        /// Generated comparison of two 32-bit warps, interpreted with the element
+        /// type selected by the template prefix.
+        /// </summary>
+        /// <param name="first">The left operand warp.</param>
+        /// <param name="second">The right operand warp.</param>
+        /// <returns>The comparison result warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> Compare<#= kind #><#= prefix #>32(
+            <#= warpType32 #> first,
+            <#= warpType32 #> second)
+        {
+            var lhs = <#= GetCastIToX32(prefix, "first") #>;
+            var rhs = <#= GetCastIToX32(prefix, "second") #>;
+
+            return <#= string.Format(op, "lhs", "rhs") #>;
+        }
+
+<#      } #>
+<#      foreach (var (_, prefix, _, _) in ImplementationTypes64) { #>
+        /// <summary>
+        /// Generated comparison of two 64-bit warps, interpreted with the element
+        /// type selected by the template prefix. Both halves are compared
+        /// separately and the two results are narrowed into one 32-bit warp.
+        /// </summary>
+        /// <param name="first">The left operand warp.</param>
+        /// <param name="second">The right operand warp.</param>
+        /// <returns>The narrowed comparison result warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> Compare<#= kind #><#= prefix #>64(
+            <#= warpType64 #> first,
+            <#= warpType64 #> second)
+        {
+            var lhs = <#= GetCastIToX64(prefix, "first") #>;
+            var rhs = <#= GetCastIToX64(prefix, "second") #>;
+
+            var lowerHalf = <#= string.Format(op, "lhs.Item1", "rhs.Item1") #>;
+            var upperHalf = <#= string.Format(op, "lhs.Item2", "rhs.Item2") #>;
+
+            var lowerBits = lowerHalf.AsInt64();
+            var upperBits = upperHalf.AsInt64();
+            return Vector256.Narrow(lowerBits, upperBits);
+        }
+
+<#      } #>
+<# } #>
+        // Lookup table mapping (compare kind, warp operation mode, is64Bit) to
+        // the generated compare methods.
+        private static readonly Dictionary<
+            (CompareKind, VelocityWarpOperationMode, bool),
+            MethodInfo> CompareOperations = new();
+
+        /// <summary>
+        /// Registers every generated compare method. The boolean key component is
+        /// false for the 32-bit variants and true for the 64-bit variants.
+        /// </summary>
+        private static void InitializeCompareOperations()
+        {
+<# foreach (var (kind, _) in compareOperations) { #>
+<#      foreach (var (_, prefix, _, _) in ImplementationTypes32) { #>
+            CompareOperations.Add(
+                (CompareKind.<#= kind #>, VelocityWarpOperationMode.<#= prefix #>, false),
+                GetMethod(nameof(Compare<#= kind #><#= prefix #>32)));
+<#      } #>
+<#      foreach (var (_, prefix, _, _) in ImplementationTypes64) { #>
+            CompareOperations.Add(
+                (CompareKind.<#= kind #>, VelocityWarpOperationMode.<#= prefix #>, true),
+                GetMethod(nameof(Compare<#= kind #><#= prefix #>64)));
+<#      } #>
+<# } #>
+        }
+
+        /// <summary>
+        /// Looks up the 32-bit compare method registered for the given kind and
+        /// warp operation mode.
+        /// </summary>
+        /// <param name="kind">The compare kind.</param>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetCompareOperation32(
+            CompareKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            var key = (kind, mode, false);
+            return CompareOperations[key];
+        }
+
+        /// <summary>
+        /// Looks up the 64-bit compare method registered for the given kind and
+        /// warp operation mode.
+        /// </summary>
+        /// <param name="kind">The compare kind.</param>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetCompareOperation64(
+            CompareKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            var key = (kind, mode, true);
+            return CompareOperations[key];
+        }
+
+        #endregion
+
+        #region Convert Operations
+
+<# foreach (var sourceType in Warp32ConvTypes) { #>
+<# foreach (var targetType in Warp32ConvTypes) { #>
+<#      var sourceImplType32 = GetImplementationType32(sourceType.Kind); #>
+<#      var acceleratedOp = acceleratedConvTypes32.FirstOrDefault(
+                t => t.Left == sourceType && t.Right == targetType); #>
+        /// <summary>
+        /// Generated conversion of a 32-bit warp from the source element type to
+        /// the target element type (one method per source/target pair).
+        /// </summary>
+        /// <param name="warp">The warp to convert.</param>
+        /// <returns>The converted warp in its raw 32-bit representation.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> Convert<#= sourceType.Name #>To<#= targetType.Name #>_32(
+            <#= warpType32 #> warp)
+        {
+            // The body is selected at template time: the warp is returned
+            // unchanged when both types share the same basic value type, an
+            // accelerated vector conversion is used when one is registered for
+            // the pair, and otherwise each of the eight lanes is converted with
+            // scalar casts.
+<#      if (sourceType.GetBasicValueType() == targetType.GetBasicValueType()) { #>
+            return warp;
+<#      } else if (acceleratedOp.Op != null) { #>
+            var value = <#= GetCastIToX32(sourceImplType32.Prefix, "warp") #>;
+            return <#= acceleratedOp.Op #>(value).AsInt32();
+<#      } else { #>
+            var value = <#= GetCastIToX32(sourceImplType32.Prefix, "warp") #>;
+            return Vector256.Create(
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(0),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(1),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(2),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(3),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(4),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(5),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(6),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.GetElement(7))
+                .AsInt32();
+<#      } #>
+        }
+
+<# } #>
+<# } #>
+
+<# foreach (var sourceType in Warp64ConvTypes) { #>
+<# foreach (var targetType in Warp64ConvTypes) { #>
+<#      var sourceImplType64 = GetImplementationType64(sourceType.Kind); #>
+<#      var acceleratedOp = acceleratedConvTypes64.FirstOrDefault(
+                t => t.Left == sourceType && t.Right == targetType); #>
+        /// <summary>
+        /// Generated conversion of a 64-bit warp (a pair of vectors) from the
+        /// source element type to the target element type (one method per
+        /// source/target pair).
+        /// </summary>
+        /// <param name="warp">The warp to convert.</param>
+        /// <returns>The converted warp in its raw 64-bit representation.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> Convert<#= sourceType.Name #>To<#= targetType.Name #>_64(
+            <#= warpType64 #> warp)
+        {
+            // The body is selected at template time: the warp is returned
+            // unchanged when both types share the same basic value type, an
+            // accelerated vector conversion is applied to each half when one is
+            // registered for the pair, and otherwise the four lanes of each half
+            // are converted with scalar casts.
+<#      if (sourceType.GetBasicValueType() == targetType.GetBasicValueType()) { #>
+            return warp;
+<#      } else if (acceleratedOp.Op != null) { #>
+            var value = <#= GetCastIToX64(sourceImplType64.Prefix, "warp") #>;
+            return (
+                <#= acceleratedOp.Op #>(value.Item1).AsInt64(),
+                <#= acceleratedOp.Op #>(value.Item2).AsInt64());
+<#      } else { #>
+            var value = <#= GetCastIToX64(sourceImplType64.Prefix, "warp") #>;
+            var result1 = Vector256.Create(
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item1.GetElement(0),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item1.GetElement(1),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item1.GetElement(2),
+                (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item1.GetElement(3));
+            var result2 = Vector256.Create(
+                    (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item2.GetElement(0),
+                    (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item2.GetElement(1),
+                    (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item2.GetElement(2),
+                    (<#= targetType.Type #>)(<#= sourceType.Type #>)value.Item2.GetElement(3));
+            return (result1.AsInt64(), result2.AsInt64());
+<#      } #>
+        }
+
+<# } #>
+<# } #>
+
+        // Lookup table mapping (source type, target type, is64Bit) to the
+        // generated convert methods.
+        private static readonly Dictionary<
+            (ArithmeticBasicValueType, ArithmeticBasicValueType, bool),
+            MethodInfo> ConvertOperations = new();
+
+        /// <summary>
+        /// Registers every generated convert method. The boolean key component is
+        /// false for the 32-bit variants and true for the 64-bit variants.
+        /// </summary>
+        private static void InitializeConvertOperations()
+        {
+<# foreach (var sourceType in Warp32ConvTypes) { #>
+<# foreach (var targetType in Warp32ConvTypes) { #>
+<#      var sourceName = sourceType.GetArithmeticBasicValueType(); #>
+<#      var targetName = targetType.GetArithmeticBasicValueType(); #>
+            ConvertOperations.Add(
+                (ArithmeticBasicValueType.<#= sourceName #>,
+                ArithmeticBasicValueType.<#= targetName #>,
+                false),
+                GetMethod(nameof(Convert<#= sourceType.Name #>To<#= targetType.Name #>_32)));
+<# } #>
+<# } #>
+<# foreach (var sourceType in Warp64ConvTypes) { #>
+<# foreach (var targetType in Warp64ConvTypes) { #>
+<#      var sourceName = sourceType.GetArithmeticBasicValueType(); #>
+<#      var targetName = targetType.GetArithmeticBasicValueType(); #>
+            ConvertOperations.Add(
+                (ArithmeticBasicValueType.<#= sourceName #>,
+                ArithmeticBasicValueType.<#= targetName #>,
+                true),
+                GetMethod(nameof(Convert<#= sourceType.Name #>To<#= targetType.Name #>_64)));
+<# } #>
+<# } #>
+        }
+
+        /// <summary>
+        /// Looks up the 32-bit convert method registered for the given source and
+        /// target types.
+        /// </summary>
+        /// <param name="source">The source value type.</param>
+        /// <param name="target">The target value type.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetConvertOperation32(
+            ArithmeticBasicValueType source,
+            ArithmeticBasicValueType target)
+        {
+            var key = (source, target, false);
+            return ConvertOperations[key];
+        }
+
+        /// <summary>
+        /// Looks up the 64-bit convert method registered for the given source and
+        /// target types.
+        /// </summary>
+        /// <param name="source">The source value type.</param>
+        /// <param name="target">The target value type.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetConvertOperation64(
+            ArithmeticBasicValueType source,
+            ArithmeticBasicValueType target)
+        {
+            var key = (source, target, true);
+            return ConvertOperations[key];
+        }
+
+        #endregion
+
+        #region Vector Convert Operations
+
+<# foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #>
+        /// <summary>
+        /// Generated narrowing of a 64-bit warp (a pair of vectors) into a single
+        /// 32-bit warp for the element type selected by the template prefix.
+        /// </summary>
+        /// <param name="warp">The 64-bit warp to narrow.</param>
+        /// <returns>The narrowed 32-bit warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> Convert64To32<#= prefix #>(<#= warpType64 #> warp)
+        {
+            var wide = <#= GetCastIToX64(prefix, "warp") #>;
+            var narrowed = Vector256.Narrow(wide.Item1, wide.Item2);
+            return <#= GetCastXToI32(prefix, "narrowed") #>;
+        }
+
+<# } #>
+<# foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #>
+        /// <summary>
+        /// Generated widening of a 32-bit warp into a 64-bit warp (a pair of
+        /// vectors) for the element type selected by the template prefix.
+        /// </summary>
+        /// <param name="warp">The 32-bit warp to widen.</param>
+        /// <returns>The widened 64-bit warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> Convert32To64<#= prefix #>(<#= warpType32 #> warp)
+        {
+            var narrow = <#= GetCastIToX32(prefix, "warp") #>;
+            var widened = Vector256.Widen(narrow);
+            return <#= GetCastXToI64(prefix, "widened") #>;
+        }
+
+<# } #>
+        // Lookup table mapping (warp operation mode, widen) to the generated
+        // vector convert methods.
+        internal static readonly Dictionary<
+            (VelocityWarpOperationMode, bool),
+            MethodInfo> VectorConvertOperations = new();
+
+        /// <summary>
+        /// Registers the generated vector convert methods. The boolean key
+        /// component is false for the 64-to-32 narrowing methods and true for the
+        /// 32-to-64 widening methods.
+        /// </summary>
+        internal static void InitializeVectorConvertOperations()
+        {
+<# foreach (var (_, prefix, _, _) in ImplementationTypes32) { #>
+            VectorConvertOperations.Add(
+                (VelocityWarpOperationMode.<#= prefix #>, false),
+                GetMethod(nameof(Convert64To32<#= prefix #>)));
+<# } #>
+<# foreach (var (_, prefix, _, _) in ImplementationTypes64) { #>
+            VectorConvertOperations.Add(
+                (VelocityWarpOperationMode.<#= prefix #>, true),
+                GetMethod(nameof(Convert32To64<#= prefix #>)));
+<# } #>
+        }
+
+        /// <summary>
+        /// Looks up the 32-to-64 widening method registered for the given warp
+        /// operation mode.
+        /// </summary>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetConvert32To64Operation(
+            VelocityWarpOperationMode mode)
+        {
+            var key = (mode, true);
+            return VectorConvertOperations[key];
+        }
+
+        /// <summary>
+        /// Looks up the 64-to-32 narrowing method registered for the given warp
+        /// operation mode.
+        /// </summary>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetConvert64To32Operation(
+            VelocityWarpOperationMode mode)
+        {
+            var key = (mode, false);
+            return VectorConvertOperations[key];
+        }
+
+        #endregion
+
+        #region Atomic Operations
+
+        /// <summary>
+        /// Performs a 32-bit atomic compare-exchange per lane. Each lane whose
+        /// mask element is non-zero exchanges the value at the address stored in
+        /// its target lane; lanes with a zero mask element are skipped and yield
+        /// the default value.
+        /// </summary>
+        /// <param name="mask">The lane mask (non-zero means active).</param>
+        /// <param name="target">The per-lane 64-bit target addresses.</param>
+        /// <param name="compare">The per-lane comparison values.</param>
+        /// <param name="value">The per-lane replacement values.</param>
+        /// <returns>The per-lane results of the compare-exchange calls.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> AtomicCompareExchange32(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> target,
+            <#= warpType32 #> compare,
+            <#= warpType32 #> value)
+        {
+            // Extract all lane masks up front.
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int result<#= i #> = default;
+            if (mask<#= i #> != 0)
+            {
+                result<#= i #> = Atomic.CompareExchange(
+                    ref Unsafe.AsRef<int>((void*)<#= GetItemRef64("target", i) #>),
+                    compare.GetElement(<#= i #>),
+                    value.GetElement(<#= i #>));
+            }
+<# } #>
+            // The explicit result list assumes a warp size of eight lanes.
+            return Vector256.Create(
+                result0, result1, result2, result3,
+                result4, result5, result6, result7);
+        }
+
+        // Reflection handle of AtomicCompareExchange32.
+        public static readonly MethodInfo AtomicCompareExchange32Method =
+            GetMethod(nameof(AtomicCompareExchange32));
+
+        /// <summary>
+        /// Performs a 64-bit atomic compare-exchange per lane. Each lane whose
+        /// mask element is non-zero exchanges the value at the address stored in
+        /// its target lane; lanes with a zero mask element are skipped and yield
+        /// the default value.
+        /// </summary>
+        /// <param name="mask">The lane mask (non-zero means active).</param>
+        /// <param name="target">The per-lane 64-bit target addresses.</param>
+        /// <param name="compare">The per-lane comparison values.</param>
+        /// <param name="value">The per-lane replacement values.</param>
+        /// <returns>The per-lane results of the compare-exchange calls.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType64 #> AtomicCompareExchange64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> target,
+            <#= warpType64 #> compare,
+            <#= warpType64 #> value)
+        {
+            // Extract all lane masks up front.
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            long result<#= i #> = default;
+            if (mask<#= i #> != 0)
+            {
+                result<#= i #> = Atomic.CompareExchange(
+                    ref Unsafe.AsRef<long>((void*)<#= GetItemRef64("target", i) #>),
+                    <#= GetItemRef64("compare", i) #>,
+                    <#= GetItemRef64("value", i) #>);
+            }
+<# } #>
+            // The explicit result lists assume a warp size of eight lanes, split
+            // into two halves of four 64-bit lanes each.
+            return (
+                Vector256.Create(result0, result1, result2, result3),
+                Vector256.Create(result4, result5, result6, result7));
+        }
+
+        // Reflection handle of AtomicCompareExchange64.
+        public static readonly MethodInfo AtomicCompareExchange64Method =
+            GetMethod(nameof(AtomicCompareExchange64));
+
+<# foreach (var (op, isBinary) in AtomicOperations) { #>
+<#      foreach (var (_, prefix, typeName, _) in ImplementationTypes32) { #>
+<#          var targetPrefix = isBinary ? "U" : prefix; #>
+<#          var targetTypeName = isBinary ? "uint" : typeName; #>
+        /// <summary>
+        /// Generated 32-bit atomic operation applied per lane. Each lane whose
+        /// mask element is non-zero applies the operation to the value at the
+        /// address stored in its target lane; lanes with a zero mask element are
+        /// skipped and yield the default value. Operations flagged as binary by
+        /// the template operate on the unsigned representation.
+        /// </summary>
+        /// <param name="mask">The lane mask (non-zero means active).</param>
+        /// <param name="target">The per-lane 64-bit target addresses.</param>
+        /// <param name="value">The per-lane operands.</param>
+        /// <returns>The per-lane results of the atomic calls.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> Atomic<#= op #><#= prefix #>32(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> target,
+            <#= warpType32 #> value)
+        {
+            var sourceValue = <#= GetCastIToX32(targetPrefix, "value") #>;
+            // Extract all lane masks up front.
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<#          for (int i = 0; i < warpSize; ++i) { #>
+            <#= targetTypeName #> result<#= i #> = default;
+            if (mask<#= i #> != 0)
+            {
+                result<#= i #> = Atomic.<#= op #>(
+                    ref Unsafe.AsRef<<#= targetTypeName #>>((void*)<#= GetItemRef64("target", i) #>),
+                    sourceValue.GetElement(<#= i #>));
+            }
+<#          } #>
+            // The explicit result list assumes a warp size of eight lanes.
+            return <#= GetCastXToI32(targetPrefix,
+                "Vector256.Create(" +
+                    "result0, result1, result2, result3," +
+                    "result4, result5, result6, result7)") #>;
+        }
+
+<#      } #>
+
+<#      foreach (var (_, prefix, typeName, _) in ImplementationTypes64) { #>
+<#          var targetPrefix = isBinary ? "U" : prefix; #>
+<#          var targetTypeName = isBinary ? "ulong" : typeName; #>
+        /// <summary>
+        /// Generated 64-bit atomic operation applied per lane. Each lane whose
+        /// mask element is non-zero applies the operation to the value at the
+        /// address stored in its target lane; lanes with a zero mask element are
+        /// skipped and yield the default value. Operations flagged as binary by
+        /// the template operate on the unsigned representation.
+        /// </summary>
+        /// <param name="mask">The lane mask (non-zero means active).</param>
+        /// <param name="target">The per-lane 64-bit target addresses.</param>
+        /// <param name="value">The per-lane operands.</param>
+        /// <returns>The per-lane results of the atomic calls.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType64 #> Atomic<#= op #><#= prefix #>64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> target,
+            <#= warpType64 #> value)
+        {
+            var sourceValue = <#= GetCastIToX64(targetPrefix, "value") #>;
+            // Extract all lane masks up front.
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<#          for (int i = 0; i < warpSize; ++i) { #>
+            <#= targetTypeName #> result<#= i #> = default;
+            if (mask<#= i #> != 0)
+            {
+                result<#= i #> = Atomic.<#= op #>(
+                    ref Unsafe.AsRef<<#= targetTypeName #>>((void*)<#= GetItemRef64("target", i) #>),
+                    <#= GetItemRef64("sourceValue", i) #>);
+            }
+<#          } #>
+            // The explicit result lists assume a warp size of eight lanes, split
+            // into two halves of four 64-bit lanes each.
+            return <#= GetCastXToI64(targetPrefix,
+                "(Vector256.Create(result0, result1, result2, result3)," +
+                    "Vector256.Create(result4, result5, result6, result7))") #>;
+        }
+
+<#      } #>
+<# } #>
+
+        // Lookup table mapping (atomic kind, warp operation mode, is64Bit) to the
+        // generated atomic methods.
+        internal static readonly Dictionary<
+            (AtomicKind, VelocityWarpOperationMode, bool),
+            MethodInfo> AtomicOperations = new();
+
+        /// <summary>
+        /// Registers every generated atomic method. The boolean key component is
+        /// false for the 32-bit variants and true for the 64-bit variants.
+        /// </summary>
+        internal static void InitializeAtomicOperations()
+        {
+<# foreach (var (op, _) in AtomicOperations) { #>
+<#      foreach (var (_, prefix, _, _) in ImplementationTypes32) { #>
+            AtomicOperations.Add(
+                (AtomicKind.<#= op #>, VelocityWarpOperationMode.<#= prefix #>, false),
+                GetMethod(nameof(Atomic<#= op #><#= prefix #>32)));
+<#      } #>
+<#      foreach (var (_, prefix, _, _) in ImplementationTypes64) { #>
+            AtomicOperations.Add(
+                (AtomicKind.<#= op #>, VelocityWarpOperationMode.<#= prefix #>, true),
+                GetMethod(nameof(Atomic<#= op #><#= prefix #>64)));
+<#      } #>
+<# } #>
+        }
+
+        /// <summary>
+        /// Looks up the 32-bit atomic method registered for the given kind and
+        /// warp operation mode.
+        /// </summary>
+        /// <param name="kind">The atomic kind.</param>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetAtomicOperation32(
+            AtomicKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            var key = (kind, mode, false);
+            return AtomicOperations[key];
+        }
+
+        /// <summary>
+        /// Looks up the 64-bit atomic method registered for the given kind and
+        /// warp operation mode.
+        /// </summary>
+        /// <param name="kind">The atomic kind.</param>
+        /// <param name="mode">The warp operation mode.</param>
+        /// <returns>The registered method.</returns>
+        public static MethodInfo GetAtomicOperation64(
+            AtomicKind kind,
+            VelocityWarpOperationMode mode)
+        {
+            var key = (kind, mode, true);
+            return AtomicOperations[key];
+        }
+
+        #endregion
+
+        #region Thread Operations
+
+        /// <summary>
+        /// Splits the lane index of every lane into its position inside a
+        /// sub-warp of the given width and the start offset of that sub-warp.
+        /// </summary>
+        /// <param name="width">The per-lane sub-warp width.</param>
+        /// <param name="lane">
+        /// Receives the lane index relative to the sub-warp (index % width).
+        /// </param>
+        /// <param name="offset">
+        /// Receives the first lane index of the sub-warp ((index / width) * width).
+        /// </param>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static void ComputeShuffleConfig(
+            <#= warpType32 #> width,
+            out <#= warpType32 #> lane,
+            out <#= warpType32 #> offset)
+        {
+            var laneIndex = LoadLaneIndexVector32();
+            lane = RemI32(laneIndex, width);
+            // The offset must be derived from the full lane index: dividing the
+            // already reduced lane (index % width) by the width always yields 0,
+            // which would map every sub-warp onto the first one.
+            offset = MulI32(DivI32(laneIndex, width), width);
+        }
+
+        /// <summary>
+        /// Shuffles a 32-bit warp so that every lane reads from the lane whose
+        /// index is its own index minus the given delta.
+        /// </summary>
+        /// <param name="mask">The lane mask.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="delta">The per-lane distance to subtract.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> ShuffleUp32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> delta) =>
+            Shuffle32(mask, warp, SubI32(LoadLaneIndexVector32(), delta));
+
+        /// <summary>
+        /// Sub-warp variant of the 32-bit shuffle-up: the source lane is the
+        /// sub-warp relative lane minus the delta, shifted by the sub-warp offset.
+        /// </summary>
+        /// <param name="mask">The lane mask.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="delta">The per-lane distance to subtract.</param>
+        /// <param name="width">The per-lane sub-warp width.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> SubShuffleUp32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var subLane, out var subOffset);
+            var sourceLane = AddI32(SubI32(subLane, delta), subOffset);
+            return Shuffle32(mask, warp, sourceLane);
+        }
+
+        /// <summary>
+        /// Shuffles a 64-bit warp so that every lane reads from the lane whose
+        /// index is its own index minus the given delta.
+        /// </summary>
+        /// <param name="mask">The lane mask.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="delta">The per-lane distance to subtract.</param>
+        /// <param name="width">Not used by this method.</param>
+        /// <returns>The shuffled warp.</returns>
+        // NOTE(review): ShuffleUp32, ShuffleDown64 and ShuffleXor64 take no width
+        // parameter; the unused one here looks like a copy of the SubShuffleUp64
+        // signature. It is kept to leave the signature unchanged - confirm against
+        // the code that invokes ShuffleUp64Method before removing it.
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> ShuffleUp64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            var lane = SubI32(LoadLaneIndexVector32(), delta);
+            return Shuffle64(mask, warp, lane);
+        }
+
+        /// <summary>
+        /// Sub-warp variant of the 64-bit shuffle-up: the source lane is the
+        /// sub-warp relative lane minus the delta, shifted by the sub-warp offset.
+        /// </summary>
+        /// <param name="mask">The lane mask.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="delta">The per-lane distance to subtract.</param>
+        /// <param name="width">The per-lane sub-warp width.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> SubShuffleUp64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var subLane, out var subOffset);
+            var sourceLane = AddI32(SubI32(subLane, delta), subOffset);
+            return Shuffle64(mask, warp, sourceLane);
+        }
+
+        /// <summary>
+        /// Shuffles a 32-bit warp so that every lane reads from the lane whose
+        /// index is its own index plus the given delta.
+        /// </summary>
+        /// <param name="mask">The lane mask.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="delta">The per-lane distance to add.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> ShuffleDown32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> delta) =>
+            Shuffle32(mask, warp, AddI32(LoadLaneIndexVector32(), delta));
+
+        /// <summary>
+        /// Sub-warp variant of the 32-bit shuffle-down: the source lane is the
+        /// sub-warp relative lane plus the delta, shifted by the sub-warp offset.
+        /// </summary>
+        /// <param name="mask">The lane mask.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="delta">The per-lane distance to add.</param>
+        /// <param name="width">The per-lane sub-warp width.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> SubShuffleDown32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var subLane, out var subOffset);
+            var sourceLane = AddI32(AddI32(subLane, delta), subOffset);
+            return Shuffle32(mask, warp, sourceLane);
+        }
+
+        /// <summary>
+        /// Shuffles a 64-bit warp so that every lane reads from the lane whose
+        /// index is its own index plus the given delta.
+        /// </summary>
+        /// <param name="mask">The lane mask.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="delta">The per-lane distance to add.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> ShuffleDown64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> delta) =>
+            Shuffle64(mask, warp, AddI32(LoadLaneIndexVector32(), delta));
+
+        /// <summary>
+        /// Sub-warp variant of the 64-bit shuffle-down: the source lane is the
+        /// sub-warp relative lane plus the delta, shifted by the sub-warp offset.
+        /// </summary>
+        /// <param name="mask">The lane mask.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="delta">The per-lane distance to add.</param>
+        /// <param name="width">The per-lane sub-warp width.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> SubShuffleDown64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> delta,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var subLane, out var subOffset);
+            var sourceLane = AddI32(AddI32(subLane, delta), subOffset);
+            return Shuffle64(mask, warp, sourceLane);
+        }
+
+        /// <summary>
+        /// Shuffles a 32-bit warp so that every lane reads from the lane whose
+        /// index is its own index XOR-ed with the given lane mask.
+        /// </summary>
+        /// <param name="mask">The lane mask selecting active lanes.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="laneMask">The per-lane XOR mask.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> ShuffleXor32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> laneMask) =>
+            Shuffle32(mask, warp, XorU32(LoadLaneIndexVector32(), laneMask));
+
+        /// <summary>
+        /// Sub-warp variant of the 32-bit XOR shuffle: the source lane is the
+        /// sub-warp relative lane XOR-ed with the lane mask, shifted by the
+        /// sub-warp offset.
+        /// </summary>
+        /// <param name="mask">The lane mask selecting active lanes.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="laneMask">The per-lane XOR mask.</param>
+        /// <param name="width">The per-lane sub-warp width.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType32 #> SubShuffleXor32(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> warp,
+            <#= warpType32 #> laneMask,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var subLane, out var subOffset);
+            var sourceLane = AddI32(XorU32(subLane, laneMask), subOffset);
+            return Shuffle32(mask, warp, sourceLane);
+        }
+
+        /// <summary>
+        /// Shuffles a 64-bit warp so that every lane reads from the lane whose
+        /// index is its own index XOR-ed with the given lane mask.
+        /// </summary>
+        /// <param name="mask">The lane mask selecting active lanes.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="laneMask">The per-lane XOR mask.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> ShuffleXor64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> laneMask) =>
+            Shuffle64(mask, warp, XorU32(LoadLaneIndexVector32(), laneMask));
+
+        /// <summary>
+        /// Sub-warp variant of the 64-bit XOR shuffle: the source lane is the
+        /// sub-warp relative lane XOR-ed with the lane mask, shifted by the
+        /// sub-warp offset.
+        /// </summary>
+        /// <param name="mask">The lane mask selecting active lanes.</param>
+        /// <param name="warp">The warp to shuffle.</param>
+        /// <param name="laneMask">The per-lane XOR mask.</param>
+        /// <param name="width">The per-lane sub-warp width.</param>
+        /// <returns>The shuffled warp.</returns>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static <#= warpType64 #> SubShuffleXor64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> warp,
+            <#= warpType32 #> laneMask,
+            <#= warpType32 #> width)
+        {
+            ComputeShuffleConfig(width, out var subLane, out var subOffset);
+            var sourceLane = AddI32(XorU32(subLane, laneMask), subOffset);
+            return Shuffle64(mask, warp, sourceLane);
+        }
+
+        // Reflection handles of the thread-level operations (barriers, broadcasts
+        // and shuffles), each resolved by name via GetMethod.
+        public static readonly MethodInfo BarrierPopCount32Method =
+            GetMethod(nameof(BarrierPopCount32));
+        public static readonly MethodInfo BarrierPopCount64Method =
+            GetMethod(nameof(BarrierPopCount64));
+        public static readonly MethodInfo BarrierAnd32Method =
+            GetMethod(nameof(BarrierAnd32));
+        public static readonly MethodInfo BarrierAnd64Method =
+            GetMethod(nameof(BarrierAnd64));
+        public static readonly MethodInfo BarrierOr32Method =
+            GetMethod(nameof(BarrierOr32));
+        public static readonly MethodInfo BarrierOr64Method =
+            GetMethod(nameof(BarrierOr64));
+        public static readonly MethodInfo Broadcast32Method =
+            GetMethod(nameof(Broadcast32));
+        public static readonly MethodInfo Broadcast64Method =
+            GetMethod(nameof(Broadcast64));
+        public static readonly MethodInfo Shuffle32Method =
+            GetMethod(nameof(Shuffle32));
+        public static readonly MethodInfo Shuffle64Method =
+            GetMethod(nameof(Shuffle64));
+        public static readonly MethodInfo ShuffleUp32Method =
+            GetMethod(nameof(ShuffleUp32));
+        public static readonly MethodInfo SubShuffleUp32Method =
+            GetMethod(nameof(SubShuffleUp32));
+        public static readonly MethodInfo ShuffleUp64Method =
+            GetMethod(nameof(ShuffleUp64));
+        public static readonly MethodInfo SubShuffleUp64Method =
+            GetMethod(nameof(SubShuffleUp64));
+        public static readonly MethodInfo ShuffleDown32Method =
+            GetMethod(nameof(ShuffleDown32));
+        public static readonly MethodInfo SubShuffleDown32Method =
+            GetMethod(nameof(SubShuffleDown32));
+        public static readonly MethodInfo ShuffleDown64Method =
+            GetMethod(nameof(ShuffleDown64));
+        public static readonly MethodInfo SubShuffleDown64Method =
+            GetMethod(nameof(SubShuffleDown64));
+        public static readonly MethodInfo ShuffleXor32Method =
+            GetMethod(nameof(ShuffleXor32));
+        public static readonly MethodInfo SubShuffleXor32Method =
+            GetMethod(nameof(SubShuffleXor32));
+        public static readonly MethodInfo ShuffleXor64Method =
+            GetMethod(nameof(ShuffleXor64));
+        public static readonly MethodInfo SubShuffleXor64Method =
+            GetMethod(nameof(SubShuffleXor64));
+
+        #endregion
+
+        #region IO
+
+        /// <summary>
+        /// Loads one byte per lane from the given addresses and zero-extends it to
+        /// 32 bits. Lanes whose mask element is zero are not dereferenced and
+        /// yield 0.
+        /// </summary>
+        /// <param name="mask">The lane mask (non-zero means active).</param>
+        /// <param name="address">The per-lane 64-bit source addresses.</param>
+        /// <returns>The loaded values, one per lane.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+                    MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> Load8(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address)
+        {
+            // Extract all lane masks up front.
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            var result<#= i #> = mask<#= i #> != 0
+                ? (uint)*(byte*)<#= GetItemRef64("address", i) #>
+                : 0;
+<# } #>
+            // All eight lane results have to be packed; passing only four values
+            // would drop lanes 4-7 and select a 64-bit element overload.
+            return Vector256.Create(
+                result0, result1, result2, result3,
+                result4, result5, result6, result7).AsInt32();
+        }
+
+        /// <summary>
+        /// Loads one 16bit value per active lane and widens it to a 32bit lane value.
+        /// </summary>
+        /// <param name="mask">
+        /// The lane mask; a lane is read only if its mask element is non-zero.
+        /// </param>
+        /// <param name="address">
+        /// The source addresses; the entry of each lane is selected by the
+        /// GetItemRef64 template helper.
+        /// </param>
+        /// <returns>
+        /// The zero-extended 16bit values of all active lanes; inactive lanes yield 0.
+        /// </returns>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+                    MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> Load16(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            uint result<#= i #> = mask<#= i #> != 0
+                ? (uint)*(ushort*)<#= GetItemRef64("address", i) #>
+                : 0;
+<# } #>
+            // All eight lanes have to be passed as uint values: the four-argument
+            // overloads of Vector256.Create build 64bit vectors and would drop the
+            // lanes 4 to 7.
+            return Vector256.Create(
+                result0, result1, result2, result3,
+                result4, result5, result6, result7).AsInt32();
+        }
+
+        /// <summary>
+        /// Loads one 32bit value per active lane.
+        /// </summary>
+        /// <param name="mask">
+        /// The lane mask; a lane is read only if its mask element is non-zero.
+        /// </param>
+        /// <param name="address">
+        /// The source addresses; the entry of each lane is selected by the
+        /// GetItemRef64 template helper.
+        /// </param>
+        /// <returns>
+        /// The loaded values of all active lanes; inactive lanes yield 0.
+        /// </returns>
+        /// <remarks>
+        /// Declared internal like the other load and store operations of this region.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+                    MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType32 #> Load32(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int result<#= i #> = mask<#= i #> != 0
+                ? *(int*)<#= GetItemRef64("address", i) #>
+                : 0;
+<# } #>
+            return Vector256.Create(
+                result0, result1, result2, result3,
+                result4, result5, result6, result7);
+        }
+
+        /// <summary>
+        /// Loads one 64bit value per active lane.
+        /// </summary>
+        /// <param name="mask">
+        /// The lane mask; a lane is read only if its mask element is non-zero.
+        /// </param>
+        /// <param name="address">
+        /// The source addresses; the entry of each lane is selected by the
+        /// GetItemRef64 template helper.
+        /// </param>
+        /// <returns>
+        /// A pair of vectors: the first one holds the lanes 0 to 3, the second one the
+        /// lanes 4 to 7. Inactive lanes yield 0.
+        /// </returns>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+                    MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe <#= warpType64 #> Load64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address)
+        {
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int mask<#= i #> = mask.GetElement(<#= i #>);
+<# } #>
+<# for (int i = 0; i < warpSize; ++i) { #>
+            long result<#= i #> = mask<#= i #> != 0
+                ? *(long*)<#= GetItemRef64("address", i) #>
+                : 0;
+<# } #>
+            // The second half has to start with lane 4; reusing lane 3 here would
+            // duplicate it and lose the value loaded for lane 4.
+            return (
+                Vector256.Create(result0, result1, result2, result3),
+                Vector256.Create(result4, result5, result6, result7));
+        }
+
+        /// <summary>
+        /// Writes the lowest byte of each active lane value to the address of its lane.
+        /// </summary>
+        /// <param name="mask">
+        /// The lane mask; a lane is written only if its mask element is non-zero.
+        /// </param>
+        /// <param name="address">
+        /// The target addresses; the entry of each lane is selected by the
+        /// GetItemRef64 template helper.
+        /// </param>
+        /// <param name="value">
+        /// The lane values; only the low 8 bits of each one are stored.
+        /// </param>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+                    MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe void Store8(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address,
+            <#= warpType32 #> value)
+        {
+            // Gather mask, target and payload of every lane before writing anything
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int active<#= i #> = mask.GetElement(<#= i #>);
+            byte* target<#= i #> = (byte*)<#= GetItemRef64("address", i) #>;
+            byte payload<#= i #> = (byte)(value.GetElement(<#= i #>) & 0xff);
+<# } #>
+
+            // Write the lanes in ascending order and skip the inactive ones
+<# for (int i = 0; i < warpSize; ++i) { #>
+            if (active<#= i #> != 0)
+            {
+                *target<#= i #> = payload<#= i #>;
+            }
+<# } #>
+        }
+
+        /// <summary>
+        /// Writes the lowest 16 bits of each active lane value to the address of its
+        /// lane.
+        /// </summary>
+        /// <param name="mask">
+        /// The lane mask; a lane is written only if its mask element is non-zero.
+        /// </param>
+        /// <param name="address">
+        /// The target addresses; the entry of each lane is selected by the
+        /// GetItemRef64 template helper.
+        /// </param>
+        /// <param name="value">
+        /// The lane values; only the low 16 bits of each one are stored.
+        /// </param>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+                    MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe void Store16(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address,
+            <#= warpType32 #> value)
+        {
+            // Gather mask, target and payload of every lane before writing anything
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int active<#= i #> = mask.GetElement(<#= i #>);
+            short* target<#= i #> = (short*)<#= GetItemRef64("address", i) #>;
+            short payload<#= i #> = (short)(value.GetElement(<#= i #>) & 0xffff);
+<# } #>
+
+            // Write the lanes in ascending order and skip the inactive ones
+<# for (int i = 0; i < warpSize; ++i) { #>
+            if (active<#= i #> != 0)
+            {
+                *target<#= i #> = payload<#= i #>;
+            }
+<# } #>
+        }
+
+        /// <summary>
+        /// Writes the 32bit value of each active lane to the address of its lane.
+        /// </summary>
+        /// <param name="mask">
+        /// The lane mask; a lane is written only if its mask element is non-zero.
+        /// </param>
+        /// <param name="address">
+        /// The target addresses; the entry of each lane is selected by the
+        /// GetItemRef64 template helper.
+        /// </param>
+        /// <param name="value">The lane values to store.</param>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+                    MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe void Store32(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address,
+            <#= warpType32 #> value)
+        {
+            // Gather mask, target and payload of every lane before writing anything
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int active<#= i #> = mask.GetElement(<#= i #>);
+            int* target<#= i #> = (int*)<#= GetItemRef64("address", i) #>;
+            int payload<#= i #> = value.GetElement(<#= i #>);
+<# } #>
+
+            // Write the lanes in ascending order and skip the inactive ones
+<# for (int i = 0; i < warpSize; ++i) { #>
+            if (active<#= i #> != 0)
+            {
+                *target<#= i #> = payload<#= i #>;
+            }
+<# } #>
+        }
+
+        /// <summary>
+        /// Writes the 64bit value of each active lane to the address of its lane.
+        /// </summary>
+        /// <param name="mask">
+        /// The lane mask; a lane is written only if its mask element is non-zero.
+        /// </param>
+        /// <param name="address">
+        /// The target addresses; the entry of each lane is selected by the
+        /// GetItemRef64 template helper.
+        /// </param>
+        /// <param name="value">
+        /// The lane values to store, selected per lane by the same template helper.
+        /// </param>
+        [MethodImpl(MethodImplOptions.AggressiveOptimization |
+                    MethodImplOptions.<#= inliningAttribute #>)]
+        internal static unsafe void Store64(
+            <#= warpType32 #> mask,
+            <#= warpType64 #> address,
+            <#= warpType64 #> value)
+        {
+            // Gather mask, target and payload of every lane before writing anything
+<# for (int i = 0; i < warpSize; ++i) { #>
+            int active<#= i #> = mask.GetElement(<#= i #>);
+            long* target<#= i #> = (long*)<#= GetItemRef64("address", i) #>;
+            var payload<#= i #> = <#= GetItemRef64("value", i) #>;
+<# } #>
+
+            // Write the lanes in ascending order and skip the inactive ones
+<# for (int i = 0; i < warpSize; ++i) { #>
+            if (active<#= i #> != 0)
+            {
+                *target<#= i #> = payload<#= i #>;
+            }
+<# } #>
+        }
+
+        // Reflection handles for the masked load operations, looked up by method name.
+        public static readonly MethodInfo Load8Method =
+            GetMethod(nameof(Load8));
+        public static readonly MethodInfo Load16Method =
+            GetMethod(nameof(Load16));
+        public static readonly MethodInfo Load32Method =
+            GetMethod(nameof(Load32));
+        public static readonly MethodInfo Load64Method =
+            GetMethod(nameof(Load64));
+
+        // Reflection handles for the masked store operations, looked up by method name.
+        public static readonly MethodInfo Store8Method =
+            GetMethod(nameof(Store8));
+        public static readonly MethodInfo Store16Method =
+            GetMethod(nameof(Store16));
+        public static readonly MethodInfo Store32Method =
+            GetMethod(nameof(Store32));
+        public static readonly MethodInfo Store64Method =
+            GetMethod(nameof(Store64));
+
+        #endregion
+
+        #region Misc
+
+        /// <summary>
+        /// Raises a trace assertion failure if the masked population count of the
+        /// inverted assertion value is non-zero.
+        /// </summary>
+        /// <param name="mask">The lane mask forwarded to the population count.</param>
+        /// <param name="value">The per-lane assertion values.</param>
+        /// <param name="message">The assertion message.</param>
+        /// <param name="fileName">The file name reported with the failure.</param>
+        /// <param name="line">The line number reported with the failure.</param>
+        /// <param name="method">The method name reported with the failure.</param>
+        [MethodImpl(MethodImplOptions.<#= inliningAttribute #>)]
+        internal static void DebugAssertFailed(
+            <#= warpType32 #> mask,
+            <#= warpType32 #> value,
+            string message,
+            string fileName,
+            int line,
+            string method)
+        {
+            // XOR with the all-lanes mask; presumably this leaves bits set only in
+            // lanes whose assertion did not hold (to be confirmed against XorU32).
+            var invertedValue = XorU32(LoadAllLanesMask32(), value);
+            var popCount = BarrierPopCount32Scalar(mask, invertedValue);
+            if (popCount == 0)
+                return;
+
+            Trace.Assert(false, message, $"@ {fileName}:{line} in {method}");
+        }
+
+        // Reflection handle for DebugAssertFailed, looked up by method name.
+        public static readonly MethodInfo DebugAssertFailedMethod =
+            GetMethod(nameof(DebugAssertFailed));
+
+        /// <summary>
+        /// Writes the label followed by the textual form of a 32bit warp value to the
+        /// console and terminates the line.
+        /// </summary>
+        /// <param name="value">The warp value to print.</param>
+        /// <param name="label">The text printed in front of the value.</param>
+        [SuppressMessage(
+            "Globalization",
+            "CA1303:Do not pass literals as localized parameters",
+            Justification = "Basic invariant string")]
+        internal static void DumpWarp32(<#= warpType32 #> value, string label)
+        {
+            Console.Write(label);
+
+            string lanes = value.ToString();
+            Console.WriteLine(lanes);
+        }
+
+        // Reflection handle for DumpWarp32, looked up by method name.
+        public static readonly MethodInfo DumpWarp32Method =
+            GetMethod(nameof(DumpWarp32));
+
+        /// <summary>
+        /// Writes the label followed by both parts of a 64bit warp value, separated by
+        /// a comma, to the console and terminates the line.
+        /// </summary>
+        /// <param name="value">The warp value to print.</param>
+        /// <param name="label">The text printed in front of the value.</param>
+        [SuppressMessage(
+            "Globalization",
+            "CA1303:Do not pass literals as localized parameters",
+            Justification = "Basic invariant string")]
+        internal static void DumpWarp64(<#= warpType64 #> value, string label)
+        {
+            Console.Write(label);
+
+            string firstPart = value.Item1.ToString();
+            Console.Write(firstPart);
+            Console.Write(", ");
+
+            string secondPart = value.Item2.ToString();
+            Console.WriteLine(secondPart);
+        }
+
+        // Reflection handle for DumpWarp64, looked up by method name.
+        public static readonly MethodInfo DumpWarp64Method =
+            GetMethod(nameof(DumpWarp64));
+
+        #endregion
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/Src/ILGPU/Backends/Velocity/Vec256/Vec256TypeGenerator.cs b/Src/ILGPU/Backends/Velocity/Vec256/Vec256TypeGenerator.cs
new file mode 100644
index 000000000..fe2285ee7
--- /dev/null
+++ b/Src/ILGPU/Backends/Velocity/Vec256/Vec256TypeGenerator.cs
@@ -0,0 +1,76 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2024 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: Vec256TypeGenerator.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends.IL;
+using ILGPU.Runtime.Velocity;
+using System;
+using System.Numerics;
+
+#if NET7_0_OR_GREATER
+
+namespace ILGPU.Backends.Velocity.Vec256
+{
+    /// <summary>
+    /// A vector type generator of 256bit vectors to be used with the Velocity backend.
+    /// </summary>
+    sealed class Vec256TypeGenerator : VelocityTypeGenerator
+    {
+        #region Static
+
+        /// <summary>
+        /// Maps basic types to vectorized basic types.
+        /// </summary>
+        /// <remarks>
+        /// The array is indexed with the integer value of a
+        /// <see cref="BasicValueType"/>; the trailing comments name the entry each
+        /// slot is meant for.
+        /// </remarks>
+        private static readonly Type[] VectorizedBasicTypeMapping = new Type[]
+        {
+            Vec256Operations.WarpType32, // None/Unknown
+
+            Vec256Operations.WarpType32, // Int1
+            Vec256Operations.WarpType32, // Int8
+            Vec256Operations.WarpType32, // Int16
+            Vec256Operations.WarpType32, // Int32
+            Vec256Operations.WarpType64, // Int64
+
+            Vec256Operations.WarpType32, // Float16
+            Vec256Operations.WarpType32, // Float32
+            Vec256Operations.WarpType64, // Float64
+        };
+
+        #endregion
+
+        #region Instance
+
+        /// <summary>
+        /// Constructs a new IL vector type generator.
+        /// </summary>
+        /// <param name="capabilityContext">The parent capability context.</param>
+        /// <param name="runtimeSystem">The parent runtime system.</param>
+        /// <remarks>
+        /// The lane count handed to the base class is the fixed number of 32bit
+        /// elements of a 256bit vector. <c>System.Numerics.Vector&lt;int&gt;.Count</c>
+        /// must not be used for this, since it depends on the host CPU and is 4 on
+        /// hardware that only offers 128bit vectors.
+        /// NOTE(review): assumes the third base argument is the warp size.
+        /// </remarks>
+        public Vec256TypeGenerator(
+            VelocityCapabilityContext capabilityContext,
+            RuntimeSystem runtimeSystem)
+            : base(
+                capabilityContext,
+                runtimeSystem,
+                System.Runtime.Intrinsics.Vector256<int>.Count)
+        { }
+
+        #endregion
+
+        #region Type System
+
+        /// <summary>
+        /// Returns the vector type that represents the given basic value type.
+        /// </summary>
+        /// <param name="basicValueType">The basic value type to map.</param>
+        /// <returns>The mapped entry of the vectorized basic type table.</returns>
+        /// <remarks>
+        /// Throws the Float16 not-supported exception of the capability context if
+        /// Float16 is requested but not available.
+        /// </remarks>
+        public override Type GetVectorizedBasicType(BasicValueType basicValueType)
+        {
+            if (basicValueType == BasicValueType.Float16 && !CapabilityContext.Float16)
+                throw VelocityCapabilityContext.GetNotSupportedFloat16Exception();
+            return VectorizedBasicTypeMapping[(int)basicValueType];
+        }
+
+        #endregion
+    }
+}
+
+#endif
diff --git a/Src/ILGPU/ILGPU.csproj b/Src/ILGPU/ILGPU.csproj
index 4228847f2..6b26c8c56 100644
--- a/Src/ILGPU/ILGPU.csproj
+++ b/Src/ILGPU/ILGPU.csproj
@@ -185,6 +185,14 @@
       <Generator>TextTemplatingFileGenerator</Generator>
       <LastGenOutput>CudaInstructionSet.Generated.cs</LastGenOutput>
     </None>
+    <!-- Design-time templates: the custom tool writes the transformed C# file. -->
+    <None Update="Backends\Velocity\Vec128\Vec128Operations.tt">
+      <Generator>TextTemplatingFileGenerator</Generator>
+      <LastGenOutput>Vec128Operations.cs</LastGenOutput>
+    </None>
+    <None Update="Backends\Velocity\Vec256\Vec256Operations.tt">
+      <Generator>TextTemplatingFileGenerator</Generator>
+      <LastGenOutput>Vec256Operations.cs</LastGenOutput>
+    </None>
   </ItemGroup>
 
   <ItemGroup>
@@ -353,6 +361,12 @@
       <AutoGen>True</AutoGen>
       <DependentUpon>PrimitiveDataBlocks.tt</DependentUpon>
     </Compile>
+    <!-- Template outputs, flagged as generated like the other .tt results. -->
+    <Compile Update="Backends\Velocity\Vec128\Vec128Operations.cs">
+      <AutoGen>True</AutoGen>
+      <DependentUpon>Vec128Operations.tt</DependentUpon>
+    </Compile>
+    <Compile Update="Backends\Velocity\Vec256\Vec256Operations.cs">
+      <AutoGen>True</AutoGen>
+      <DependentUpon>Vec256Operations.tt</DependentUpon>
+    </Compile>
   </ItemGroup>
   <ItemGroup>
     <None Update="Backends\PTX\PTXLibDeviceNvvm.tt">
diff --git a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
index d7b61813c..2d1c54938 100644
--- a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
+++ b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs
@@ -13,6 +13,7 @@
 using ILGPU.Backends.Velocity.Scalar;
 #if NET7_0_OR_GREATER
 using ILGPU.Backends.Velocity.Vec128;
+using ILGPU.Backends.Velocity.Vec256;
 #endif
 using ILGPU.Util;
 using System;
@@ -36,6 +37,11 @@ public enum VelocityDeviceType
         /// </summary>
         Vector128,
 
+        /// <summary>
+        /// 256bit vector operations to simulate eight lanes per warp using hardware
+        /// acceleration via AVX.
+        /// </summary>
+        Vector256,
 #endif
     }
 
@@ -53,6 +59,7 @@ public sealed class VelocityDevice : Device
             typeof(Scalar),
 #if NET7_0_OR_GREATER
             typeof(Vec128),
+            typeof(Vec256),
 #endif
         };
 
@@ -73,6 +80,7 @@ public VelocityDevice(VelocityDeviceType deviceType)
                     break;
 #if NET7_0_OR_GREATER
                 case VelocityDeviceType.Vector128:
+                case VelocityDeviceType.Vector256:
                     // Vector always runs using software in the worst case
                     break;
 #endif
diff --git a/Src/ILGPU/Static/BinaryMathOperations.xml b/Src/ILGPU/Static/BinaryMathOperations.xml
index 3a682f5cf..d954ec079 100644
--- a/Src/ILGPU/Static/BinaryMathOperations.xml
+++ b/Src/ILGPU/Static/BinaryMathOperations.xml
@@ -115,6 +115,9 @@
             <Velocity128>
                 <Implementation>{Value0} - {Value0} / {Value1} * {Value1}</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>{Value0} - {Value0} / {Value1} * {Value1}</Implementation>
+            </Velocity256>
         </Velocity>
         <!-- If we add some rules here we have to enable specific test cases
              in BinaryIntOperations.tt -->
@@ -128,6 +131,9 @@
             <Velocity128>
                 <Implementation>Vector128.BitwiseAnd({Value0}, {Value1})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.BitwiseAnd({Value0}, {Value1})</Implementation>
+            </Velocity256>
         </Velocity>
         <Rewriter>
             <Source>{Value0} == {Value1}</Source>
@@ -153,6 +159,9 @@
             <Velocity128>
                 <Implementation>Vector128.BitwiseOr({Value0}, {Value1})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.BitwiseOr({Value0}, {Value1})</Implementation>
+            </Velocity256>
         </Velocity>
         <Rewriter>
             <Source>{Value0} == {Value1}</Source>
@@ -178,6 +187,9 @@
             <Velocity128>
                 <Implementation>Vector128.Xor({Value0}, {Value1})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.Xor({Value0}, {Value1})</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
     <Operation Name="Shl">
@@ -188,6 +200,9 @@
             <Velocity128 SoftwareEmulation="true">
                 <Implementation>{Value0}[Field] &lt;&lt; (int){Value1}[Field]</Implementation>
             </Velocity128>
+            <Velocity256 SoftwareEmulation="true">
+                <Implementation>{Value0}[Field] &lt;&lt; (int){Value1}[Field]</Implementation>
+            </Velocity256>
         </Velocity>
         <Rewriter>
             <Source>{Value0}.IsZero</Source>
@@ -206,6 +221,9 @@
             <Velocity128 SoftwareEmulation="true">
                 <Implementation>{Value0}[Field] &gt;&gt; (int){Value1}[Field]</Implementation>
             </Velocity128>
+            <Velocity256 SoftwareEmulation="true">
+                <Implementation>{Value0}[Field] &gt;&gt; (int){Value1}[Field]</Implementation>
+            </Velocity256>
         </Velocity>
         <Rewriter>
             <Source>{Value0}.IsZero</Source>
@@ -226,6 +244,9 @@
             <Velocity128>
                 <Implementation>Vector128.Min({Value0}, {Value1})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.Min({Value0}, {Value1})</Implementation>
+            </Velocity256>
         </Velocity>
         <!-- CombineNestedLeftConstant not required (commutative) -->
         <Rewriter Mode="CombineNestedRightConstant">
@@ -241,6 +262,9 @@
             <Velocity128>
                 <Implementation>Vector128.Max({Value0}, {Value1})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.Max({Value0}, {Value1})</Implementation>
+            </Velocity256>
         </Velocity>
         <!-- CombineNestedLeftConstant not required (commutative) -->
         <Rewriter Mode="CombineNestedRightConstant">
@@ -255,6 +279,7 @@
         <Implementation>{MathType}.Atan2({Value0}, {Value1})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="PowF">
@@ -264,6 +289,7 @@
         <Implementation>{MathType}.Pow({Value0}, {Value1})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="BinaryLogF">
@@ -273,6 +299,7 @@
         <Implementation>{MathType}.Log({Value0}, {Value1})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
 
@@ -282,6 +309,7 @@
         <Call>IntrinsicMath.CopySign</Call>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
 </Operations>
diff --git a/Src/ILGPU/Static/TernaryMathOperations.xml b/Src/ILGPU/Static/TernaryMathOperations.xml
index 304e6a662..c3bb2faf8 100644
--- a/Src/ILGPU/Static/TernaryMathOperations.xml
+++ b/Src/ILGPU/Static/TernaryMathOperations.xml
@@ -9,6 +9,9 @@
             <Velocity128>
                 <Implementation>Vector128.Add(Vector128.Multiply({Value0}, {Value1}), {Value2})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>FMAImpl({Value0}, {Value1}, {Value2})</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
 </Operations>
diff --git a/Src/ILGPU/Static/TypeInformation.ttinclude b/Src/ILGPU/Static/TypeInformation.ttinclude
index 47f270eb7..523f80625 100644
--- a/Src/ILGPU/Static/TypeInformation.ttinclude
+++ b/Src/ILGPU/Static/TypeInformation.ttinclude
@@ -322,9 +322,12 @@ public class VelocityMathConfig
 
     [XmlElement("Velocity128")]
     public Velocity128Config Velocity128 { get; set; }
+
+    // Settings taken from the <Velocity256> child element.
+    [XmlElement("Velocity256")]
+    public Velocity256Config Velocity256 { get; set; }
 }
 
-public class Velocity128Config
+public abstract class VelocityOperationConfig
 {
     [XmlAttribute]
     public bool SoftwareEmulation { get; set; }
@@ -364,6 +367,9 @@ public class Velocity128Config
     }
 }
 
+// Element types of the Velocity128 and Velocity256 settings; neither adds members to
+// VelocityOperationConfig.
+public class Velocity128Config : VelocityOperationConfig { }
+public class Velocity256Config : VelocityOperationConfig { }
+
 public class MathOp
 {
     #region Data
diff --git a/Src/ILGPU/Static/UnaryMathOperations.xml b/Src/ILGPU/Static/UnaryMathOperations.xml
index 85fa966d5..a6632dfa2 100644
--- a/Src/ILGPU/Static/UnaryMathOperations.xml
+++ b/Src/ILGPU/Static/UnaryMathOperations.xml
@@ -17,6 +17,9 @@
             <Velocity128>
                 <Implementation>-{Value0}</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>-{Value0}</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
     <Operation Name="Not">
@@ -45,6 +48,9 @@
             <Velocity128>
                 <Implementation>~{Value0}</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>~{Value0}</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
     <Operation Name="Abs">
@@ -61,6 +67,9 @@
             <Velocity128>
                 <Implementation>Vector128.Abs({Value0})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.Abs({Value0})</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
     <Operation Name="PopC">
@@ -69,6 +78,7 @@
         <Call>IntrinsicMath.BitOperations.PopCount</Call>
         <Velocity ReturnAsWarp32="true">
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="CLZ">
@@ -77,6 +87,7 @@
         <Call>IntrinsicMath.BitOperations.LeadingZeroCount</Call>
         <Velocity ReturnAsWarp32="true">
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="CTZ">
@@ -85,6 +96,7 @@
         <Call>IntrinsicMath.BitOperations.TrailingZeroCount</Call>
         <Velocity ReturnAsWarp32="true">
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="RcpF">
@@ -96,6 +108,9 @@
             <Velocity128>
                 <Implementation>RcpImpl({Value0})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>RcpImpl({Value0})</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
 
@@ -106,6 +121,7 @@
         <Implementation>{TypeName}.IsNaN({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="IsInfF" IsPredicate="true">
@@ -115,6 +131,7 @@
         <Implementation>{TypeName}.IsInfinity({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="IsFinF" IsPredicate="true">
@@ -124,6 +141,7 @@
         <Implementation>!IsNaN({Value0}) &amp;&amp; !IsInfinity({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
 
@@ -136,6 +154,9 @@
             <Velocity128>
                 <Implementation>Vector128.Sqrt({Value0})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.Sqrt({Value0})</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
     <Operation Name="RsqrtF">
@@ -147,6 +168,9 @@
             <Velocity128>
                 <Implementation>RcpImpl(Vector128.Sqrt({Value0}))</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>RsqrtImpl({Value0})</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
 
@@ -157,6 +181,7 @@
         <Implementation>{MathType}.Asin({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="SinF">
@@ -166,6 +191,7 @@
         <Implementation>{MathType}.Sin({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="SinhF">
@@ -175,6 +201,7 @@
         <Implementation>{MathType}.Sinh({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
 
@@ -185,6 +212,7 @@
         <Implementation>{MathType}.Acos({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="CosF">
@@ -194,6 +222,7 @@
         <Implementation>{MathType}.Cos({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="CoshF">
@@ -203,6 +232,7 @@
         <Implementation>{MathType}.Cosh({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
 
@@ -213,6 +243,7 @@
         <Implementation>{MathType}.Tan({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="TanhF">
@@ -222,6 +253,7 @@
         <Implementation>{MathType}.Tanh({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="AtanF">
@@ -231,6 +263,7 @@
         <Implementation>{MathType}.Atan({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
 
@@ -241,6 +274,7 @@
         <Implementation>{MathType}.Exp({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="Exp2F">
@@ -250,6 +284,7 @@
         <Implementation>{MathType}.Pow({Const2}, {Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
 
@@ -262,6 +297,9 @@
             <Velocity128>
                 <Implementation>Vector128.Floor({Value0})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.Floor({Value0})</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
     <Operation Name="CeilingF">
@@ -273,6 +311,9 @@
             <Velocity128>
                 <Implementation>Vector128.Ceiling({Value0})</Implementation>
             </Velocity128>
+            <Velocity256>
+                <Implementation>Vector256.Ceiling({Value0})</Implementation>
+            </Velocity256>
         </Velocity>
     </Operation>
 
@@ -283,6 +324,7 @@
         <Implementation>{MathType}.Log({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="Log2F">
@@ -292,6 +334,7 @@
         <Implementation>{MathType}.Log({Value0}, {Const2})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
     <Operation Name="Log10F">
@@ -301,6 +344,7 @@
         <Implementation>{MathType}.Log10({Value0})</Implementation>
         <Velocity>
             <Velocity128 SoftwareEmulation="true" />
+            <Velocity256 SoftwareEmulation="true" />
         </Velocity>
     </Operation>
 </Operations>