diff --git a/.gitignore b/.gitignore index 0c6f225dc..d71268e87 100644 --- a/.gitignore +++ b/.gitignore @@ -257,6 +257,7 @@ Src/ILGPU/AtomicFunctions.cs Src/ILGPU/Backends/PTX/PTXIntrinsics.Generated.cs Src/ILGPU/Backends/PTX/PTXLibDeviceMethods.cs Src/ILGPU/Backends/PTX/PTXLibDeviceNvvm.cs +Src/ILGPU/Backends/PTX/PTXLibDevicePtx.cs Src/ILGPU/Backends/Velocity/Scalar/ScalarOperations.cs Src/ILGPU/Backends/Velocity/VelocityIntrinsics.Generated.cs Src/ILGPU/Frontend/Intrinsic/RemappedIntrinsics.Generated.cs @@ -285,6 +286,7 @@ Src/ILGPU/Static/DllImports.cs Src/ILGPU/StrideTypes.cs Src/ILGPU/Util/DataBlocks.cs Src/ILGPU/Util/PrimitiveDataBlocks.cs +Tools/CudaGenerateLibDeviceTool/CudaDriverVersionUtils.cs # Ignore specific template outputs (Algorithms) Src/ILGPU.Algorithms/AlgorithmContextMappings.cs diff --git a/Samples/LibDeviceKernel/Program.cs b/Samples/LibDeviceKernel/Program.cs index 7f0d5d9d2..215f37fa9 100644 --- a/Samples/LibDeviceKernel/Program.cs +++ b/Samples/LibDeviceKernel/Program.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU Samples -// Copyright (c) 2021 ILGPU Project +// Copyright (c) 2021-2024 ILGPU Project // www.ilgpu.net // // File: Program.cs @@ -28,8 +28,12 @@ public static void KernelWithLibDevice(Index1D index, ArrayView data) static void Main() { - // Create default context and enable LibDevice library - using var context = Context.Create(builder => builder.Cuda().LibDevice()); + // Create default context. + // + // ILGPU includes built-in support for LibDevice and should be compatible + // with most CUDA devices. If you have an older device, or wish to use + // a specific version of LibDevice, call LibDeviceOverride(). + using var context = Context.Create(builder => builder.Cuda()); // For each available device... 
foreach (var device in context) diff --git a/Src/ILGPU.Algorithms.Tests/Generic/AlgorithmsTestBase.cs b/Src/ILGPU.Algorithms.Tests/Generic/AlgorithmsTestBase.cs index c09342e85..5ba15cc55 100644 --- a/Src/ILGPU.Algorithms.Tests/Generic/AlgorithmsTestBase.cs +++ b/Src/ILGPU.Algorithms.Tests/Generic/AlgorithmsTestBase.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU Algorithms -// Copyright (c) 2020-2023 ILGPU Project +// Copyright (c) 2020-2024 ILGPU Project // www.ilgpu.net // // File: AlgorithmsTestBase.cs @@ -27,7 +27,7 @@ protected AlgorithmsTestBase(ITestOutputHelper output, TestContext testContext) /// /// Compares two numbers for equality, within a defined tolerance. /// - private class HalfPrecisionComparer + internal class HalfPrecisionComparer : EqualityComparer { public readonly float Margin; @@ -59,7 +59,7 @@ public override int GetHashCode(Half obj) => /// /// Compares two numbers for equality, within a defined tolerance. /// - private class FloatPrecisionComparer + internal class FloatPrecisionComparer : EqualityComparer { public readonly float Margin; @@ -91,7 +91,7 @@ public override int GetHashCode(float obj) => /// /// Compares two numbers for equality, within a defined tolerance. /// - private class DoublePrecisionComparer + internal class DoublePrecisionComparer : EqualityComparer { public readonly double Margin; @@ -123,7 +123,7 @@ public override int GetHashCode(double obj) => /// /// Compares two numbers for equality, within a defined tolerance. /// - private class HalfRelativeErrorComparer + internal class HalfRelativeErrorComparer : EqualityComparer { public readonly float RelativeError; @@ -163,7 +163,7 @@ public override int GetHashCode(Half obj) => /// /// Compares two numbers for equality, within a defined tolerance. 
/// - private class FloatRelativeErrorComparer + internal class FloatRelativeErrorComparer : EqualityComparer { public readonly float RelativeError; @@ -203,7 +203,7 @@ public override int GetHashCode(float obj) => /// /// Compares two numbers for equality, within a defined tolerance. /// - private class DoubleRelativeErrorComparer + internal class DoubleRelativeErrorComparer : EqualityComparer { public readonly double RelativeError; @@ -245,20 +245,33 @@ public override int GetHashCode(double obj) => /// /// The target buffer. /// The expected values. - /// The acceptable error margin. - public void VerifyWithinPrecision( - ArrayView buffer, - Half[] expected, - uint decimalPlaces) + /// The comparer to use. + public void VerifyUsingComparer( + ArrayView buffer, + T[] expected, + IEqualityComparer comparer) + where T : unmanaged { var data = buffer.GetAsArray(Accelerator.DefaultStream); Assert.Equal(data.Length, expected.Length); - - var comparer = new HalfPrecisionComparer(decimalPlaces); - for (int i = 0, e = data.Length; i < e; ++i) - Assert.Equal(expected[i], data[i], comparer); + Assert.Equal(expected, data, comparer); } + /// + /// Verifies the contents of the given memory buffer. + /// + /// The target buffer. + /// The expected values. + /// The acceptable error margin. + public void VerifyWithinPrecision( + ArrayView buffer, + Half[] expected, + uint decimalPlaces) => + VerifyUsingComparer( + buffer, + expected, + new HalfPrecisionComparer(decimalPlaces)); + /// /// Verifies the contents of the given memory buffer. 
/// @@ -268,15 +281,11 @@ public void VerifyWithinPrecision( public void VerifyWithinPrecision( ArrayView buffer, float[] expected, - uint decimalPlaces) - { - var data = buffer.GetAsArray(Accelerator.DefaultStream); - Assert.Equal(data.Length, expected.Length); - - var comparer = new FloatPrecisionComparer(decimalPlaces); - for (int i = 0, e = data.Length; i < e; ++i) - Assert.Equal(expected[i], data[i], comparer); - } + uint decimalPlaces) => + VerifyUsingComparer( + buffer, + expected, + new FloatPrecisionComparer(decimalPlaces)); /// /// Verifies the contents of the given memory buffer. @@ -287,15 +296,11 @@ public void VerifyWithinPrecision( public void VerifyWithinPrecision( ArrayView buffer, double[] expected, - uint decimalPlaces) - { - var data = buffer.GetAsArray(Accelerator.DefaultStream); - Assert.Equal(data.Length, expected.Length); - - var comparer = new DoublePrecisionComparer(decimalPlaces); - for (int i = 0, e = data.Length; i < e; ++i) - Assert.Equal(expected[i], data[i], comparer); - } + uint decimalPlaces) => + VerifyUsingComparer( + buffer, + expected, + new DoublePrecisionComparer(decimalPlaces)); /// /// Verifies the contents of the given memory buffer. @@ -306,15 +311,11 @@ public void VerifyWithinPrecision( public void VerifyWithinRelativeError( ArrayView buffer, Half[] expected, - double relativeError) - { - var data = buffer.GetAsArray(Accelerator.DefaultStream); - Assert.Equal(data.Length, expected.Length); - - var comparer = new HalfRelativeErrorComparer((float)relativeError); - for (int i = 0, e = data.Length; i < e; ++i) - Assert.Equal(expected[i], data[i], comparer); - } + double relativeError) => + VerifyUsingComparer( + buffer, + expected, + new HalfRelativeErrorComparer((float)relativeError)); /// /// Verifies the contents of the given memory buffer. 
@@ -325,15 +326,11 @@ public void VerifyWithinRelativeError( public void VerifyWithinRelativeError( ArrayView buffer, float[] expected, - double relativeError) - { - var data = buffer.GetAsArray(Accelerator.DefaultStream); - Assert.Equal(data.Length, expected.Length); - - var comparer = new FloatRelativeErrorComparer((float)relativeError); - for (int i = 0, e = data.Length; i < e; ++i) - Assert.Equal(expected[i], data[i], comparer); - } + double relativeError) => + VerifyUsingComparer( + buffer, + expected, + new FloatRelativeErrorComparer((float)relativeError)); /// /// Verifies the contents of the given memory buffer. @@ -344,14 +341,10 @@ public void VerifyWithinRelativeError( public void VerifyWithinRelativeError( ArrayView buffer, double[] expected, - double relativeError) - { - var data = buffer.GetAsArray(Accelerator.DefaultStream); - Assert.Equal(data.Length, expected.Length); - - var comparer = new DoubleRelativeErrorComparer(relativeError); - for (int i = 0, e = data.Length; i < e; ++i) - Assert.Equal(expected[i], data[i], comparer); - } + double relativeError) => + VerifyUsingComparer( + buffer, + expected, + new DoubleRelativeErrorComparer(relativeError)); } } diff --git a/Src/ILGPU.Algorithms.Tests/XMathTests.Pow.tt b/Src/ILGPU.Algorithms.Tests/XMathTests.Pow.tt index 1b514b8d9..7785ed3a4 100644 --- a/Src/ILGPU.Algorithms.Tests/XMathTests.Pow.tt +++ b/Src/ILGPU.Algorithms.Tests/XMathTests.Pow.tt @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU Algorithms -// Copyright (c) 2020-2023 ILGPU Project +// Copyright (c) 2020-2024 ILGPU Project // www.ilgpu.net // // File: XMathTests.Pow.tt/XMathTests.Pow.cs @@ -48,6 +48,32 @@ namespace ILGPU.Algorithms.Tests // and ensures a minimum error on each accelerator type. 
partial class XMathTests { + #region Nested Types + + /// + /// WORKAROUND: The output of LibDevice __nv_pow(double, double) and + /// .NET Math.Pow(double, double) on Cuda Test Runner are different. + /// + private class CudaPowDoubleRelativeErrorComparer : DoubleRelativeErrorComparer + { + public CudaPowDoubleRelativeErrorComparer(double relativeError) + : base(relativeError) + { } + + public override bool Equals(double x, double y) + { + if ((double.IsPositiveInfinity(x) && double.IsNegativeInfinity(y)) || + (double.IsNegativeInfinity(x) && double.IsPositiveInfinity(y))) + { + return true; + } + + return base.Equals(x, y); + } + } + + #endregion + <# foreach (var function in powFunctions) { #> internal static void <#= function.KernelName #>( Index1D index, @@ -120,10 +146,24 @@ namespace ILGPU.Algorithms.Tests v => Math<#= function.MathSuffix #>.<#= function.Name #>(v.X, v.Y)) .ToArray(); if (Accelerator.AcceleratorType == AcceleratorType.Cuda) +<# + if (function.DataType == "double") { +#> + VerifyUsingComparer( + output.View, + expected, + new CudaPowDoubleRelativeErrorComparer( + (<#= function.DataType #>)<#= function.RelativeError.Cuda #>)); +<# + } else { +#> VerifyWithinRelativeError( output.View, expected, <#= function.RelativeError.Cuda #>); +<# + } +#> else if (Accelerator.AcceleratorType == AcceleratorType.OpenCL) VerifyWithinRelativeError( output.View, diff --git a/Src/ILGPU.Algorithms.Tests/XMathTests.Sqrt.tt b/Src/ILGPU.Algorithms.Tests/XMathTests.Sqrt.tt index 4b0eb3a90..9d2ed98a5 100644 --- a/Src/ILGPU.Algorithms.Tests/XMathTests.Sqrt.tt +++ b/Src/ILGPU.Algorithms.Tests/XMathTests.Sqrt.tt @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU Algorithms -// Copyright (c) 2020-2023 ILGPU Project +// Copyright (c) 2020-2024 ILGPU Project // www.ilgpu.net // // File: XMathTests.Sqrt.tt/XMathTests.Sqrt.cs @@ -32,7 +32,7 @@ using Xunit; var rsqrtFunctions = new [] { - new 
XMathFunction("Rsqrt" , "float" , new Precision(15, 15, 7)), + new XMathFunction("Rsqrt" , "float" , new Precision(15, 6, 7)), new XMathFunction("Rsqrt" , "double", new Precision(15, 15, 15)), }; #> diff --git a/Src/ILGPU/Backends/PTX/PTXAssembly.cs b/Src/ILGPU/Backends/PTX/PTXAssembly.cs new file mode 100644 index 000000000..237addefe --- /dev/null +++ b/Src/ILGPU/Backends/PTX/PTXAssembly.cs @@ -0,0 +1,104 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: PTXAssembly.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using System; +using System.Collections.Immutable; +using System.Text; + +namespace ILGPU.Backends.PTX +{ + /// + /// Collection of PTX modules that are used to build a Cuda kernel. + /// + public sealed class PTXAssembly + { + #region Nested Types + + /// + /// A builder for a collection of PTX modules. + /// + public class Builder + { + #region Instance + + /// + /// List of PTX modules. + /// + private readonly ImmutableArray.Builder modules; + + /// + /// Constructs a new builder. + /// + internal Builder() + { + KernelBuilder = new StringBuilder(); + modules = ImmutableArray.CreateBuilder(1); + + // Add placeholder for kernel module. + modules.Add(string.Empty); + } + + #endregion + + #region Properties + + /// + /// Contains the definition of the kernel module. + /// + public StringBuilder KernelBuilder { get; } + + #endregion + + #region Methods + + /// + /// Adds the PTX modules to the collection. + /// + public void AddModule(ReadOnlySpan ptxModules) => +#if NET7_0_OR_GREATER + modules.AddRange(ptxModules); +#else + modules.AddRange(ptxModules.ToArray()); +#endif + + /// + /// Constructs the completed collection of PTX modules. 
+ /// + public PTXAssembly Seal() + { + // Replace placeholder string, so that the kernel is always at index 0. + modules[0] = KernelBuilder.ToString(); + return new PTXAssembly(modules.ToImmutable()); + } + + #endregion + } + + #endregion + + #region Instance + + /// + /// Collection of PTX modules. + /// + public ImmutableArray Modules { get; } + + /// + /// Constructs the list of PTX modules. + /// + internal PTXAssembly(ImmutableArray modules) + { + Modules = modules; + } + + #endregion + } +} diff --git a/Src/ILGPU/Backends/PTX/PTXBackend.cs b/Src/ILGPU/Backends/PTX/PTXBackend.cs index 186521b80..fe0a98701 100644 --- a/Src/ILGPU/Backends/PTX/PTXBackend.cs +++ b/Src/ILGPU/Backends/PTX/PTXBackend.cs @@ -14,13 +14,15 @@ using ILGPU.IR; using ILGPU.IR.Analyses; using ILGPU.IR.Transformations; +using ILGPU.Resources; using ILGPU.Runtime; using ILGPU.Runtime.Cuda; using ILGPU.Util; using System; +using System.Collections; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; -using System.Text; +using System.Linq; namespace ILGPU.Backends.PTX { @@ -49,7 +51,7 @@ public sealed class PTXBackend : PTXIntrinsic.Handler, PTXCodeGenerator.GeneratorArgs, PTXCodeGenerator, - StringBuilder> + PTXAssembly.Builder> { #region Constants @@ -74,6 +76,75 @@ public sealed class PTXBackend : #endregion + #region Nested Types + + /// + /// An enumerator for LibDevice backend methods. + /// + public struct LibDeviceEnumerator : IEnumerator + { + #region Instance + + private References.Enumerator enumerator; + + /// + /// Constructs a new enumerator. + /// + /// The current backend context. + internal LibDeviceEnumerator(in BackendContext context) + { + enumerator = context.Methods.GetEnumerator(); + } + + #endregion + + #region Properties + + /// + /// Returns the current value. 
+ /// + public string Current + { + get + { + var method = enumerator.Current; + return method.Source.Name; + } + } + + /// + object IEnumerator.Current => Current; + + #endregion + + #region Methods + + /// + void IDisposable.Dispose() { } + + /// + public bool MoveNext() + { + while (enumerator.MoveNext()) + { + var current = enumerator.Current; + if (!current.HasSource) + continue; + if (current.Source.DeclaringType != typeof(PTXLibDeviceMethods)) + continue; + return true; + } + return false; + } + + /// + void IEnumerator.Reset() => throw new InvalidOperationException(); + + #endregion + } + + #endregion + #region Instance /// @@ -120,6 +191,10 @@ public PTXBackend( Architecture = architecture; InstructionSet = instructionSet; NvvmAPI = nvvmAPI; + IsLibDeviceAvailable = + NvvmAPI != null || + Architecture >= PTXLibDevicePtx.MinArchtecture && + InstructionSet >= PTXLibDevicePtx.MinInstructionSet; InitIntrinsicProvider(); InitializeKernelTransformers(builder => @@ -188,6 +263,11 @@ protected override void Dispose(bool disposing) /// public NvvmAPI? NvvmAPI { get; private set; } + /// + /// Returns true if LibDevice is available. + /// + internal bool IsLibDeviceAvailable { get; } + #endregion #region Methods @@ -197,7 +277,7 @@ protected override void Dispose(bool disposing) /// instance. 
/// [SuppressMessage("Globalization", "CA1308:Normalize strings to uppercase")] - protected override StringBuilder CreateKernelBuilder( + protected override PTXAssembly.Builder CreateKernelBuilder( EntryPoint entryPoint, in BackendContext backendContext, in KernelSpecialization specialization, @@ -217,7 +297,8 @@ protected override StringBuilder CreateKernelBuilder( : new PTXDebugLineInfoGenerator(); } - var builder = new StringBuilder(); + var assemblyBuilder = new PTXAssembly.Builder(); + var builder = assemblyBuilder.KernelBuilder; builder.AppendLine("//"); builder.Append("// Generated by ILGPU v"); @@ -237,7 +318,7 @@ protected override StringBuilder CreateKernelBuilder( builder.AppendLine((PointerSize * 8).ToString()); builder.AppendLine(); - GenerateLibDeviceCode(backendContext, builder); + GenerateLibDeviceCode(backendContext, assemblyBuilder); // Check whether we are running in the O1 or O2 pipeline bool o1Enabled = Context.Properties.OptimizationLevel >= OptimizationLevel.O1; @@ -263,7 +344,7 @@ protected override StringBuilder CreateKernelBuilder( alignments, uniforms); - return builder; + return assemblyBuilder; } /// @@ -292,113 +373,84 @@ protected override PTXCodeGenerator CreateKernelCodeGenerator( protected override CompiledKernel CreateKernel( EntryPoint entryPoint, CompiledKernel.KernelInfo? kernelInfo, - StringBuilder builder, + PTXAssembly.Builder builder, PTXCodeGenerator.GeneratorArgs data) { - data.DebugInfoGenerator.GenerateDebugSections(builder); + data.DebugInfoGenerator.GenerateDebugSections(builder.KernelBuilder); - var ptxAssembly = builder.ToString(); return new PTXCompiledKernel( Context, entryPoint, kernelInfo, - ptxAssembly); + builder.Seal()); } - private unsafe void GenerateLibDeviceCode( + /// + /// Generate the PTX code for LibDevice functions. + /// + /// The backend context. + /// The kernel builder. 
+ private void GenerateLibDeviceCode( in BackendContext backendContext, - StringBuilder builder) + PTXAssembly.Builder builder) { - if (NvvmAPI == null || backendContext.Count == 0) - return; - - // Determine the NVVM IR Version to use. - var result = NvvmAPI.GetIRVersion(out int majorIR, out _, out _, out _); - if (result != NvvmResult.NVVM_SUCCESS) - return; - - // Convert the methods in the context into NVVM. - var methods = backendContext.GetEnumerator().AsEnumerable(); - var nvvmModule = PTXLibDeviceNvvm.GenerateNvvm(majorIR, methods); - - if (string.IsNullOrEmpty(nvvmModule)) + if (backendContext.Count == 0) return; - // Create a new NVVM program. - result = NvvmAPI.CreateProgram(out var program); - - try + using var enumerator = new LibDeviceEnumerator(backendContext); + if (IsLibDeviceAvailable) { - // Add custom NVVM module. - if (result == NvvmResult.NVVM_SUCCESS) + if (NvvmAPI != null) { - var nvvmModuleBytes = Encoding.ASCII.GetBytes(nvvmModule); - fixed (byte* nvvmPtr = nvvmModuleBytes) - { - result = NvvmAPI.AddModuleToProgram( - program, - new IntPtr(nvvmPtr), - new IntPtr(nvvmModuleBytes.Length), - null); - } + PTXLibDevice.GenerateLibDeviceCode( + NvvmAPI, + Architecture, + enumerator.AsEnumerable(), + out var ptx); + + var compiledString = + ptx.AsNotNull() + .Replace(".version", "//.version", StringComparison.Ordinal) + .Replace(".target", "//.target", StringComparison.Ordinal) + .Replace( + ".address_size", + "//.address_size", + StringComparison.Ordinal); + builder.KernelBuilder.Append(compiledString); } - - // Add the LibDevice bit code. - if (result == NvvmResult.NVVM_SUCCESS) + else { - fixed (byte* ptr = NvvmAPI.LibDeviceBytes) - { - result = NvvmAPI.LazyAddModuleToProgram( - program, - new IntPtr(ptr), - new IntPtr(NvvmAPI.LibDeviceBytes.Length), - null); - } + // Use the pre-generated LibDevice PTX code. 
+ var ptxModules = InlineList.Create(backendContext.Count); + PTXLibDevicePtx.GetPtx( + enumerator.AsEnumerable(), + ref ptxModules, + out var ptxDeclarations); + + builder.AddModule(ptxModules.AsReadOnlySpan()); + builder.KernelBuilder.AppendLine(ptxDeclarations); } - - // Compile the NVVM into PTX for the backend architecture. - if (result == NvvmResult.NVVM_SUCCESS) + } + else if (enumerator.AsEnumerable().FirstOrDefault() != null) + { + // Handle any issues if a LibDevice function is used. + if (Architecture >= PTXLibDevicePtx.MinArchtecture) { - var major = Architecture.Major; - var minor = Architecture.Minor; - var archOption = $"-arch=compute_{major}{minor}"; - var archOptionAscii = Encoding.ASCII.GetBytes(archOption); - fixed (byte* archOptionPtr = archOptionAscii) - { - var numOptions = 1; - var optionValues = stackalloc byte[sizeof(void*) * numOptions]; - var values = (void**)optionValues; - values[0] = archOptionPtr; - - result = NvvmAPI.CompileProgram( - program, - numOptions, - new IntPtr(values)); - } + // The architecture is supported, but is using an older instruction + // set. Can be solved by a driver update. + throw new NotSupportedException(string.Format( + RuntimeErrorMessages.NotSupportedLibDevicePreGeneratedNewer, + PTXLibDevicePtx.MinDriverVersion.Major, + PTXLibDevicePtx.MinDriverVersion.Minor)); } - - // Extract the PTX result and comment out the initial declarations. - if (result == NvvmResult.NVVM_SUCCESS) + else { - result = NvvmAPI.GetCompiledResult(program, out var compiledPTX); - if (result == NvvmResult.NVVM_SUCCESS) - { - var compiledString = - compiledPTX.AsNotNull() - .Replace(".version", "//.version", StringComparison.Ordinal) - .Replace(".target", "//.target", StringComparison.Ordinal) - .Replace( - ".address_size", - "//.address_size", - StringComparison.Ordinal); - builder.Append(compiledString); - } + // The architecture is too old for the pre-generated LibDevice PTX. + // Inform the user to manually initialize LibDevice. 
+ throw new NotSupportedException( + RuntimeErrorMessages.NotSupportedLibDeviceNotInitialized); } } - finally - { - NvvmAPI.DestroyProgram(ref program); - } } #endregion @@ -438,13 +490,13 @@ public static PTXBackendMode GetPTXBackendMode( : PTXBackendMode.Default); /// - /// Convenience method to get an IEnumerable of Method. + /// Convenience method to get an IEnumerable from an IEnumerator. /// - public static IEnumerable AsEnumerable( - this IEnumerator<(Method, Allocas)> enumerator) + public static IEnumerable AsEnumerable( + this IEnumerator enumerator) { while (enumerator.MoveNext()) - yield return enumerator.Current.Item1; + yield return enumerator.Current; } } } diff --git a/Src/ILGPU/Backends/PTX/PTXCodeGenerator.cs b/Src/ILGPU/Backends/PTX/PTXCodeGenerator.cs index 74f8b8ea7..3a1425e39 100644 --- a/Src/ILGPU/Backends/PTX/PTXCodeGenerator.cs +++ b/Src/ILGPU/Backends/PTX/PTXCodeGenerator.cs @@ -31,7 +31,7 @@ namespace ILGPU.Backends.PTX /// The code needs to be prepared for this code generator. public abstract partial class PTXCodeGenerator : PTXRegisterAllocator, - IBackendCodeGenerator + IBackendCodeGenerator { #region Constants @@ -404,7 +404,7 @@ public IntrinsicImplementationProvider /// /// Generates a function declaration in PTX code. /// - public abstract void GenerateHeader(StringBuilder builder); + public abstract void GenerateHeader(PTXAssembly.Builder builder); /// /// Generates PTX code. @@ -415,11 +415,12 @@ public IntrinsicImplementationProvider /// Generates PTX constant declarations. /// /// The target builder. 
- public void GenerateConstants(StringBuilder builder) => - builder.Append(GenerateConstantDeclarations()); + public void GenerateConstants(PTXAssembly.Builder builder) => + builder.KernelBuilder.Append(GenerateConstantDeclarations()); /// - public void Merge(StringBuilder builder) => builder.Append(Builder); + public void Merge(PTXAssembly.Builder builder) => + builder.KernelBuilder.Append(Builder); #endregion diff --git a/Src/ILGPU/Backends/PTX/PTXCompiledKernel.cs b/Src/ILGPU/Backends/PTX/PTXCompiledKernel.cs index 606f49026..33cabca12 100644 --- a/Src/ILGPU/Backends/PTX/PTXCompiledKernel.cs +++ b/Src/ILGPU/Backends/PTX/PTXCompiledKernel.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2018-2023 ILGPU Project +// Copyright (c) 2018-2024 ILGPU Project // www.ilgpu.net // // File: PTXCompiledKernel.cs @@ -31,7 +31,7 @@ internal PTXCompiledKernel( Context context, EntryPoint entryPoint, KernelInfo? info, - string ptxAssembly) + PTXAssembly ptxAssembly) : base(context, entryPoint, info) { PTXAssembly = ptxAssembly; @@ -44,7 +44,7 @@ internal PTXCompiledKernel( /// /// Returns the PTX assembly code. /// - public string PTXAssembly { get; } + public PTXAssembly PTXAssembly { get; } #endregion } diff --git a/Src/ILGPU/Backends/PTX/PTXFunctionGenerator.cs b/Src/ILGPU/Backends/PTX/PTXFunctionGenerator.cs index 40b5d04c5..4d7c84827 100644 --- a/Src/ILGPU/Backends/PTX/PTXFunctionGenerator.cs +++ b/Src/ILGPU/Backends/PTX/PTXFunctionGenerator.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2018-2021 ILGPU Project +// Copyright (c) 2018-2024 ILGPU Project // www.ilgpu.net // // File: PTXFunctionGenerator.cs @@ -78,13 +78,16 @@ private List GenerateHeaderDeclaration( /// /// Generates a function declaration in PTX code. 
/// - public override void GenerateHeader(StringBuilder builder) + public override void GenerateHeader(PTXAssembly.Builder builder) { - if (PTXLibDeviceMethods.IsLibDeviceMethod(Method)) + if (Method.HasSource && + Method.Source.DeclaringType == typeof(PTXLibDeviceMethods)) + { return; + } - GenerateHeaderDeclaration(builder); - builder.AppendLine(";"); + GenerateHeaderDeclaration(builder.KernelBuilder); + builder.KernelBuilder.AppendLine(";"); } /// diff --git a/Src/ILGPU/Backends/PTX/PTXIntrinsic.cs b/Src/ILGPU/Backends/PTX/PTXIntrinsic.cs index b1c03f2d0..13c61af80 100644 --- a/Src/ILGPU/Backends/PTX/PTXIntrinsic.cs +++ b/Src/ILGPU/Backends/PTX/PTXIntrinsic.cs @@ -267,7 +267,7 @@ backend is PTXBackend ptxBackend && (!MaxArchitecture.HasValue || ptxBackend.Architecture < MaxArchitecture.Value) && (!LibDeviceRequired.HasValue || - ptxBackend.NvvmAPI != null == LibDeviceRequired.Value); + ptxBackend.IsLibDeviceAvailable == LibDeviceRequired.Value); #endregion } diff --git a/Src/ILGPU/Backends/PTX/PTXKernelFunctionGenerator.cs b/Src/ILGPU/Backends/PTX/PTXKernelFunctionGenerator.cs index 413f694ba..8231293f3 100644 --- a/Src/ILGPU/Backends/PTX/PTXKernelFunctionGenerator.cs +++ b/Src/ILGPU/Backends/PTX/PTXKernelFunctionGenerator.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2018-2023 ILGPU Project +// Copyright (c) 2018-2024 ILGPU Project // www.ilgpu.net // // File: PTXKernelFunctionGenerator.cs @@ -115,7 +115,7 @@ public PTXKernelFunctionGenerator( /// /// Generates a function declaration in PTX code. 
/// - public override void GenerateHeader(StringBuilder builder) + public override void GenerateHeader(PTXAssembly.Builder builder) { // Generate global dynamic shared memory allocation information if (!EntryPoint.SharedMemory.HasDynamicMemory) @@ -134,11 +134,11 @@ public override void GenerateHeader(StringBuilder builder) PTXBackend.DefaultGlobalMemoryAlignment); // Use the proper alignment that is compatible with all types - builder.Append(".extern .shared .align "); - builder.Append(sharedAlignmentInBytes); - builder.Append(" .b8 "); - builder.Append(DynamicSharedMemoryAllocationName); - builder.AppendLine("[];"); + builder.KernelBuilder.Append(".extern .shared .align "); + builder.KernelBuilder.Append(sharedAlignmentInBytes); + builder.KernelBuilder.Append(" .b8 "); + builder.KernelBuilder.Append(DynamicSharedMemoryAllocationName); + builder.KernelBuilder.AppendLine("[];"); } /// diff --git a/Src/ILGPU/Backends/PTX/PTXLibDevice.cs b/Src/ILGPU/Backends/PTX/PTXLibDevice.cs new file mode 100644 index 000000000..cf900c1eb --- /dev/null +++ b/Src/ILGPU/Backends/PTX/PTXLibDevice.cs @@ -0,0 +1,170 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: PTXLibDevice.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using ILGPU.Runtime.Cuda; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.InteropServices; +using System.Text; + +namespace ILGPU.Backends.PTX +{ + internal static class PTXLibDevice + { + /// + /// Detects the location of the Cuda SDK and NVVM/LibDevice files. + /// + /// Filled with the environment variable used. + /// Filled with the detected NVVM folder. 
+ /// Filled with the detected NVVM file. + /// Filled with the detected LibDevice folder. + /// Filled with the detected LibDevice file. + public static void FindLibDevicePaths( + out string? cudaEnvName, + out string? nvvmBinDir, + out string? libNvvmPath, + out string? libDeviceDir, + out string? libDevicePath) + { + // Find the CUDA installation path. + cudaEnvName = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) + ? "CUDA_PATH" + : "CUDA_HOME"; + var cudaPath = Environment.GetEnvironmentVariable(cudaEnvName); + if (string.IsNullOrEmpty(cudaPath)) + { + nvvmBinDir = null; + libNvvmPath = null; + libDeviceDir = null; + libDevicePath = null; + return; + } + + var nvvmRoot = Path.Combine(cudaPath, "nvvm"); + + // Find the NVVM DLL. + var nvvmBinName = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) + ? "bin" + : "lib64"; + nvvmBinDir = Path.Combine(nvvmRoot, nvvmBinName); + var nvvmSearchPattern = + RuntimeInformation.IsOSPlatform(OSPlatform.Windows) + ? "nvvm64*.dll" + : "libnvvm*.so"; + var nvvmFiles = Directory.EnumerateFiles(nvvmBinDir, nvvmSearchPattern); + libNvvmPath = nvvmFiles.FirstOrDefault(); + + // Find the LibDevice Bitcode. + libDeviceDir = Path.Combine(nvvmRoot, "libdevice"); + var libDeviceFiles = Directory.EnumerateFiles( + libDeviceDir, + "libdevice.*.bc"); + libDevicePath = libDeviceFiles.FirstOrDefault(); + } + + /// + /// Generates the LibDevice PTX code using NVVM. + /// + /// The NVVM API instance. + /// The target Cuda architure to generate for. + /// The LibDevice method names to generate. + /// Filled in with the generated PTX code. + public static unsafe void GenerateLibDeviceCode( + NvvmAPI nvvmAPI, + in CudaArchitecture architecture, + IEnumerable methods, + out string? ptx) + { + ptx = null; + + // Determine the NVVM IR Version to use. + var result = nvvmAPI.GetIRVersion(out int majorIR, out _, out _, out _); + if (result != NvvmResult.NVVM_SUCCESS) + return; + + // Convert the methods in the context into NVVM. 
+ var nvvmModule = PTXLibDeviceNvvm.GenerateNvvm(majorIR, methods); + + if (string.IsNullOrEmpty(nvvmModule)) + return; + + // Create a new NVVM program. + result = nvvmAPI.CreateProgram(out var program); + + try + { + // Add custom NVVM module. + if (result == NvvmResult.NVVM_SUCCESS) + { + var nvvmModuleBytes = Encoding.ASCII.GetBytes(nvvmModule); + fixed (byte* nvvmPtr = nvvmModuleBytes) + { + result = nvvmAPI.AddModuleToProgram( + program, + new IntPtr(nvvmPtr), + new IntPtr(nvvmModuleBytes.Length), + null); + } + } + + // Add the LibDevice bit code. + if (result == NvvmResult.NVVM_SUCCESS) + { + fixed (byte* ptr = nvvmAPI.LibDeviceBytes) + { + result = nvvmAPI.LazyAddModuleToProgram( + program, + new IntPtr(ptr), + new IntPtr(nvvmAPI.LibDeviceBytes.Length), + null); + } + } + + // Compile the NVVM into PTX for the backend architecture. + if (result == NvvmResult.NVVM_SUCCESS) + { + var major = architecture.Major; + var minor = architecture.Minor; + var archOption = $"-arch=compute_{major}{minor}"; + var archOptionAscii = Encoding.ASCII.GetBytes(archOption); + fixed (byte* archOptionPtr = archOptionAscii) + { + var numOptions = 1; + var optionValues = stackalloc byte[sizeof(void*) * numOptions]; + var values = (void**)optionValues; + values[0] = archOptionPtr; + + result = nvvmAPI.CompileProgram( + program, + numOptions, + new IntPtr(values)); + } + } + + // Extract the PTX result and comment out the initial declarations. 
+ if (result == NvvmResult.NVVM_SUCCESS) + { + result = nvvmAPI.GetCompiledResult(program, out var compiledPTX); + if (result == NvvmResult.NVVM_SUCCESS) + { + ptx = compiledPTX; + } + } + } + finally + { + nvvmAPI.DestroyProgram(ref program); + } + } + } +} diff --git a/Src/ILGPU/Backends/PTX/PTXLibDeviceMethods.tt b/Src/ILGPU/Backends/PTX/PTXLibDeviceMethods.tt index b2af30d26..c678a53b5 100644 --- a/Src/ILGPU/Backends/PTX/PTXLibDeviceMethods.tt +++ b/Src/ILGPU/Backends/PTX/PTXLibDeviceMethods.tt @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2021-2022 ILGPU Project +// Copyright (c) 2021-2024 ILGPU Project // www.ilgpu.net // // File: PTXLibDeviceMethods.tt/PTXLibDeviceMethods.cs @@ -24,7 +24,6 @@ var functions = LibDeviceFunctions.Load(rootPath, "../../Static/CudaLibDevice.xm #> using ILGPU.Frontend; -using ILGPU.IR; using System.Runtime.CompilerServices; #pragma warning disable IDE1006 // Naming Styles @@ -37,10 +36,6 @@ namespace ILGPU.Backends.PTX /// internal static class PTXLibDeviceMethods { - internal static bool IsLibDeviceMethod(Method method) => - method.HasSource && - method.Source.DeclaringType == typeof(PTXLibDeviceMethods); - <# foreach (var func in functions) { diff --git a/Src/ILGPU/Backends/PTX/PTXLibDeviceNvvm.tt b/Src/ILGPU/Backends/PTX/PTXLibDeviceNvvm.tt index 11c3e9137..4c8e3701e 100644 --- a/Src/ILGPU/Backends/PTX/PTXLibDeviceNvvm.tt +++ b/Src/ILGPU/Backends/PTX/PTXLibDeviceNvvm.tt @@ -23,7 +23,6 @@ string rootPath = Host.ResolvePath("."); var functions = LibDeviceFunctions.Load(rootPath, "../../Static/CudaLibDevice.xml"); #> -using ILGPU.IR; using System.Collections.Generic; using System.Text; @@ -70,15 +69,14 @@ namespace ILGPU.Backends.PTX /// The NVVM IR major version. /// The methods to check. /// The NVVM module, or an empty string. 
- public static string GenerateNvvm(int majorIR, IEnumerable methods) + public static string GenerateNvvm(int majorIR, IEnumerable methods) { var builder = new StringBuilder(); bool addPrefix = true; - foreach (var method in methods) + foreach (var methodName in methods) { - if (method.HasSource && - fragments.TryGetValue(method.Source.Name, out var methodNvvm)) + if (fragments.TryGetValue(methodName, out var methodNvvm)) { if (addPrefix) { diff --git a/Src/ILGPU/Backends/PTX/PTXLibDevicePtx.tt b/Src/ILGPU/Backends/PTX/PTXLibDevicePtx.tt new file mode 100644 index 000000000..a1ae88707 --- /dev/null +++ b/Src/ILGPU/Backends/PTX/PTXLibDevicePtx.tt @@ -0,0 +1,192 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: PTXLibDevicePtx.tt/PTXLibDevicePtx.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +<#@ template debug="false" hostspecific="true" language="C#" #> +<#@ include file="../../Static/TextTransformHelpers.ttinclude" #> +<#@ include file="../../Static/TypeInformation.ttinclude" #> +<#@ include file="../../Static/CudaLibDevice.ttinclude" #> +<#@ assembly name="System.Core" #> +<#@ import namespace="System.Linq" #> +<#@ import namespace="System.Text" #> +<#@ import namespace="System.Collections.Generic" #> +<#@ output extension=".cs" #> +<# +string rootPath = Host.ResolvePath("."); +var lib = LibDevicePtxFunctions.Load(rootPath, "../../Static/CudaLibDevicePtx.xml"); +#> + +using ILGPU.Runtime.Cuda; +using ILGPU.Util; +using System.Collections.Generic; +using System.Text; + +// disable: max_line_length + +namespace ILGPU.Backends.PTX +{ + /// + /// Contains the pre-generated PTX for Cuda LibDevice functions. 
+ /// + public static class PTXLibDevicePtx + { + /// + /// Minimum architecture required to use the pre-generated PTX. + /// + public static CudaArchitecture MinArchtecture { get; } = + new CudaArchitecture(<#= lib.MinArchitectureMajor #>, <#= lib.MinArchitectureMinor #>); + + /// + /// Minimum instruction set required to use the pre-generated PTX. + /// + public static CudaInstructionSet MinInstructionSet { get; } = + new CudaInstructionSet(<#= lib.MinInstructionSetMajor #>, <#= lib.MinInstructionSetMinor #>); + + /// + /// Minimum instruction set required to use the pre-generated PTX. + /// + public static CudaDriverVersion MinDriverVersion { get; } = + CudaDriverVersion.FromValue(<#= lib.MinDriverVersion #>); + + #region Generated PTX + +<# + foreach (var func in lib.Functions) + { + PushIndent(2); + WriteLibDeviceFunctionPtx(func); + PopIndent(); + } + +#> + #endregion + + private static readonly Dictionary fragments = + new Dictionary() + { +<# + PushIndent(4); + foreach (var func in lib.Functions) + WriteLine($"{{ \"{func.Name}\", {func.Name} }},"); + PopIndent(); +#> + }; + + private static readonly Dictionary headers = + new Dictionary() + { +<# + PushIndent(4); + foreach (var func in lib.Functions) + WriteLine($"{{ \"{func.Name}\", {func.Name}{DeclarationSuffix} }},"); + PopIndent(); +#> + }; + + /// + /// Returns the pre-generated PTX code for the Cuda LibDevice functions. + /// + /// The LibDevice method names. + /// Filled in with the PTX modules. + /// Filled in with the PTX declarations. + /// The PTX modules. 
+ internal static void GetPtx( + IEnumerable methods, + ref InlineList ptxModules, + out string ptxDeclarations) + { + var declarationBuilder = new StringBuilder(); + + foreach (var methodName in methods) + { + if (fragments.TryGetValue(methodName, out var methodPtx)) + { + if (headers.TryGetValue(methodName, out var header)) + declarationBuilder.AppendLine(header); + ptxModules.Add(methodPtx); + } + } + + ptxDeclarations = declarationBuilder.ToString(); + } + } +} +<#+ + +public const string DeclarationSuffix = "_decl"; + +void WriteLibDeviceFunctionPtx(LibDevicePtxFunction func) +{ + // PTX declaration + WriteLine($"private const string {func.Name}{DeclarationSuffix} ="); + + PopIndent(); + PopIndent(); + WriteLine("\"\"\""); + + WriteLine(func.PtxDeclaration); + WriteLine("\"\"\";"); + WriteLine(); + + PushIndent(2); + + // PTX module + WriteLine($"private const string {func.Name} ="); + + PopIndent(); + PopIndent(); + WriteLine("\"\"\""); + + WriteLine(func.PtxModule); + WriteLine("\"\"\";"); + WriteLine(); + + PushIndent(); +} + +[XmlRoot("LibDevicePtx")] +public class LibDevicePtxFunctions +{ + internal static LibDevicePtxFunctions Load(string rootPath, string fileName) => + XmlHelper.Load(rootPath, fileName); + + [XmlAttribute] + public int MinArchitectureMajor { get; set; } + + [XmlAttribute] + public int MinArchitectureMinor { get; set; } + + [XmlAttribute] + public int MinInstructionSetMajor { get; set; } + + [XmlAttribute] + public int MinInstructionSetMinor { get; set; } + + [XmlAttribute] + public int MinDriverVersion { get; set; } + + [XmlElement("Function")] + public LibDevicePtxFunction[] Functions { get; set; } +} + +[XmlRoot("Function")] +public class LibDevicePtxFunction +{ + [XmlAttribute] + public string Name { get; set; } + + [XmlAttribute] + public string PtxModule { get; set; } + + [XmlAttribute] + public string PtxDeclaration { get; set; } +} + +#> \ No newline at end of file diff --git a/Src/ILGPU/Context.Builder.cs 
b/Src/ILGPU/Context.Builder.cs index a74dcb7a4..cb328e091 100644 --- a/Src/ILGPU/Context.Builder.cs +++ b/Src/ILGPU/Context.Builder.cs @@ -323,65 +323,41 @@ public Builder Profiling() /// Automatically detects the CUDA SDK location. /// /// The current builder instance. + [Obsolete("LibDevice is now embedded into ILGPU. Use LibDeviceOverride() if" + + " you want to load a specific version of LibDevice at runtime.")] public Builder LibDevice() => - LibDevice(throwIfNotFound: true); + this; /// - /// Turns on LibDevice support. - /// Automatically detects the CUDA SDK location. + /// Overrides the version of LibDevice embedded into ILGPU, and loads from the + /// CUDA SDK at runtime. Automatically detects the CUDA SDK location. /// - /// Determines error handling. /// The current builder instance. - internal Builder LibDevice(bool throwIfNotFound) + public Builder LibDeviceOverride() { - // Find the CUDA installation path. - var cudaEnvName = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) - ? "CUDA_PATH" - : "CUDA_HOME"; - var cudaPath = Environment.GetEnvironmentVariable(cudaEnvName); - if (string.IsNullOrEmpty(cudaPath)) + PTXLibDevice.FindLibDevicePaths( + out var cudaEnvName, + out var nvvmBinDir, + out var libNvvmPath, + out var libDeviceDir, + out var libDevicePath); + if (string.IsNullOrEmpty(cudaEnvName)) { - return throwIfNotFound - ? throw new NotSupportedException(string.Format( + throw new NotSupportedException(string.Format( RuntimeErrorMessages.NotSupportedLibDeviceEnvironmentVariable, - cudaEnvName)) - : this; + cudaEnvName)); } - var nvvmRoot = Path.Combine(cudaPath, "nvvm"); - - // Find the NVVM DLL. - var nvvmBinName = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) - ? "bin" - : "lib64"; - var nvvmBinDir = Path.Combine(nvvmRoot, nvvmBinName); - var nvvmSearchPattern = - RuntimeInformation.IsOSPlatform(OSPlatform.Windows) - ? 
"nvvm64*.dll" - : "libnvvm*.so"; - var nvvmFiles = Directory.EnumerateFiles(nvvmBinDir, nvvmSearchPattern); - var libNvvmPath = nvvmFiles.FirstOrDefault(); if (libNvvmPath is null) { - return throwIfNotFound - ? throw new NotSupportedException(string.Format( + throw new NotSupportedException(string.Format( RuntimeErrorMessages.NotSupportedLibDeviceNotFoundNvvmDll, - nvvmBinDir)) - : this; + nvvmBinDir)); } - - // Find the LibDevice Bitcode. - var libDeviceDir = Path.Combine(nvvmRoot, "libdevice"); - var libDeviceFiles = Directory.EnumerateFiles( - libDeviceDir, - "libdevice.*.bc"); - var libDevicePath = libDeviceFiles.FirstOrDefault(); if (libDevicePath is null) { - return throwIfNotFound - ? throw new NotSupportedException(string.Format( + throw new NotSupportedException(string.Format( RuntimeErrorMessages.NotSupportedLibDeviceNotFoundBitCode, - libDeviceDir)) - : this; + libDeviceDir)); } LibNvvmPath = libNvvmPath; @@ -396,7 +372,19 @@ internal Builder LibDevice(bool throwIfNotFound) /// Path to LibNvvm DLL. /// Path to LibDevice bitcode. /// The current builder instance. - public Builder LibDevice(string libNvvmPath, string libDevicePath) + [Obsolete("LibDevice is now embedded into ILGPU. Use LibDeviceOverride() if" + + " you want to load a specific version of LibDevice at runtime.")] + public Builder LibDevice(string libNvvmPath, string libDevicePath) => + this; + + /// + /// Overrides the version of LibDevice embedded into ILGPU, and loads from the + /// CUDA SDK at runtime. Explicitly specifies the LibDevice location. + /// + /// Path to LibNvvm DLL. + /// Path to LibDevice bitcode. + /// The current builder instance. 
+ public Builder LibDeviceOverride(string libNvvmPath, string libDevicePath) { LibNvvmPath = libNvvmPath; LibDevicePath = libDevicePath; diff --git a/Src/ILGPU/ILGPU.csproj b/Src/ILGPU/ILGPU.csproj index 4228847f2..3bf9267d8 100644 --- a/Src/ILGPU/ILGPU.csproj +++ b/Src/ILGPU/ILGPU.csproj @@ -173,6 +173,10 @@ True PrimitiveDataBlocks.tt + + TextTemplatingFileGenerator + PTXLibDevicePtx.cs + TextTemplatingFileGenerator CudaArchitecture.Generated.cs @@ -198,6 +202,11 @@ True PTXLibDeviceNvvm.tt + + True + True + PTXLibDevicePtx.tt + True True diff --git a/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs b/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs index 7c6931ffb..4f5129ace 100644 --- a/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs +++ b/Src/ILGPU/Resources/RuntimeErrorMessages.Designer.cs @@ -327,6 +327,24 @@ internal static string NotSupportedLibDeviceNotFoundNvvmDll { } } + /// + /// Looks up a localized string similar to Cannot find LibDevice implementation. Ensure that LibDevice is enabled from the ContextBuilder.. + /// + internal static string NotSupportedLibDeviceNotInitialized { + get { + return ResourceManager.GetString("NotSupportedLibDeviceNotInitialized", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Cannot find LibDevice implementation. Upgrade to Cuda driver >= v{0}.{1} or override LibDevice from the ContextBuilder.. + /// + internal static string NotSupportedLibDevicePreGeneratedNewer { + get { + return ResourceManager.GetString("NotSupportedLibDevicePreGeneratedNewer", resourceCulture); + } + } + /// /// Looks up a localized string similar to Type '{0}' is not blittable. 
/// diff --git a/Src/ILGPU/Resources/RuntimeErrorMessages.resx b/Src/ILGPU/Resources/RuntimeErrorMessages.resx index d2b8d8819..698b04d9e 100644 --- a/Src/ILGPU/Resources/RuntimeErrorMessages.resx +++ b/Src/ILGPU/Resources/RuntimeErrorMessages.resx @@ -241,9 +241,15 @@ Unknown parent accelerator - Velocity accelerator requires 64-bit application ({0} not supported). Ensure Prefer32Bit is set to 'false' + Velocity accelerator requires 64-bit application ({0} not supported). Ensure Prefer32Bit is set to 'false' - The Velocity accelerator supports little-endian machines only + The Velocity accelerator supports little-endian machines only + + + Cannot find LibDevice implementation. Ensure that LibDevice is enabled from the ContextBuilder. + + + Cannot find LibDevice implementation. Upgrade to Cuda driver >= v{0}.{1} or override LibDevice from the ContextBuilder. \ No newline at end of file diff --git a/Src/ILGPU/Runtime/Cuda/CudaAPI.cs b/Src/ILGPU/Runtime/Cuda/CudaAPI.cs index 98e6053d7..d25cb9a18 100644 --- a/Src/ILGPU/Runtime/Cuda/CudaAPI.cs +++ b/Src/ILGPU/Runtime/Cuda/CudaAPI.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2020-2023 ILGPU Project +// Copyright (c) 2020-2024 ILGPU Project // www.ilgpu.net // // File: CudaAPI.cs @@ -10,6 +10,7 @@ // --------------------------------------------------------------------------------------- using ILGPU.Resources; +using ILGPU.Util; using System; using System.Diagnostics; using System.Runtime.CompilerServices; @@ -573,6 +574,19 @@ public CudaError LoadModule( public unsafe CudaError LoadModule( out IntPtr kernelModule, string moduleData, + out string? errorLog) => + LoadModule(out kernelModule, new string[] { moduleData }, out errorLog); + + /// + /// Loads the given kernel module into driver memory. + /// + /// The loaded module. + /// The module data to load. + /// The error log. + /// The error status. 
+ public unsafe CudaError LoadModule( + out IntPtr kernelModule, + ReadOnlySpan modules, out string? errorLog) { const int BufferSize = 1024; @@ -591,16 +605,62 @@ public unsafe CudaError LoadModule( values[0] = errorBuffer; values[1] = (void*)BufferSize; - var result = LoadModule( - out kernelModule, - moduleData, + var result = CurrentAPI.cuLinkCreate_v2( NumOptions, new IntPtr(options), - new IntPtr(optionValues)); - + new IntPtr(optionValues), + out IntPtr linkState); errorLog = result != CudaError.CUDA_SUCCESS ? Encoding.ASCII.GetString(errorBuffer, BufferSize) : null; + CudaException.ThrowIfFailed(result); + + using var stringCache = new StringCache(); + try + { + foreach (var module in modules) + { + var moduleStrEntry = stringCache.AddString(module); + + result = CurrentAPI.cuLinkAddData_v2( + linkState, + new IntPtr(1), // CU_JIT_INPUT_PTX + moduleStrEntry.NativePtr, + new IntPtr(moduleStrEntry.Length), + string.Empty, + 0, + IntPtr.Zero, + IntPtr.Zero); + errorLog = result != CudaError.CUDA_SUCCESS + ? Encoding.ASCII.GetString(errorBuffer, BufferSize) + : null; + CudaException.ThrowIfFailed(result); + } + + result = CurrentAPI.cuLinkComplete( + linkState, + out IntPtr cubin, + out IntPtr cubinSize); + errorLog = result != CudaError.CUDA_SUCCESS + ? Encoding.ASCII.GetString(errorBuffer, BufferSize) + : null; + CudaException.ThrowIfFailed(result); + + result = CurrentAPI.cuModuleLoadDataEx( + out kernelModule, + cubin, + NumOptions, + new IntPtr(options), + new IntPtr(optionValues)); + errorLog = result != CudaError.CUDA_SUCCESS + ? 
Encoding.ASCII.GetString(errorBuffer, BufferSize) + : null; + } + finally + { + CurrentAPI.cuLinkDestroy(linkState); + } + return result; } diff --git a/Src/ILGPU/Runtime/Cuda/CudaAPI.xml b/Src/ILGPU/Runtime/Cuda/CudaAPI.xml index ea0913437..21da259ba 100644 --- a/Src/ILGPU/Runtime/Cuda/CudaAPI.xml +++ b/Src/ILGPU/Runtime/Cuda/CudaAPI.xml @@ -150,6 +150,10 @@ + + + + @@ -158,6 +162,13 @@ + + + + + + + @@ -180,6 +191,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Src/ILGPU/Runtime/Cuda/CudaContextExtensions.cs b/Src/ILGPU/Runtime/Cuda/CudaContextExtensions.cs index 728e4b4b2..253f69d5f 100644 --- a/Src/ILGPU/Runtime/Cuda/CudaContextExtensions.cs +++ b/Src/ILGPU/Runtime/Cuda/CudaContextExtensions.cs @@ -90,10 +90,6 @@ public static Context.Builder CudaInternal( Backend.RuntimePlatform)); } - // Silently enable automatic LibDevice detection, if not already configured. - if (builder.LibDevicePath is null && builder.LibNvvmPath is null) - builder.LibDevice(throwIfNotFound: false); - if (IsRunningOnWSL()) { NativeLibrary.SetDllImportResolver( diff --git a/Src/ILGPU/Runtime/Cuda/CudaKernel.cs b/Src/ILGPU/Runtime/Cuda/CudaKernel.cs index 3f5c6c07c..6415f4575 100644 --- a/Src/ILGPU/Runtime/Cuda/CudaKernel.cs +++ b/Src/ILGPU/Runtime/Cuda/CudaKernel.cs @@ -1,6 +1,6 @@ // --------------------------------------------------------------------------------------- // ILGPU -// Copyright (c) 2017-2023 ILGPU Project +// Copyright (c) 2017-2024 ILGPU Project // www.ilgpu.net // // File: CudaKernel.cs @@ -50,7 +50,7 @@ internal CudaKernel( { var kernelLoaded = CurrentAPI.LoadModule( out modulePtr, - kernel.PTXAssembly, + kernel.PTXAssembly.Modules.AsSpan(), out string? 
errorLog); if (kernelLoaded != CudaError.CUDA_SUCCESS) { diff --git a/Src/ILGPU/Static/CudaLibDevicePtx.xml b/Src/ILGPU/Static/CudaLibDevicePtx.xml new file mode 100644 index 000000000..7e4fd1ff0 --- /dev/null +++ b/Src/ILGPU/Static/CudaLibDevicePtx.xml @@ -0,0 +1,322 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/Src/ILGPU/Util/StringCache.cs b/Src/ILGPU/Util/StringCache.cs new file mode 100644 index 000000000..bdb7dc795 --- /dev/null +++ b/Src/ILGPU/Util/StringCache.cs @@ -0,0 +1,84 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: StringCache.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using System; +using System.Runtime.InteropServices; +using System.Text; + +namespace ILGPU.Util +{ + /// + /// Maintains a cache of strings that have been marshalled to native strings + /// and need to be kept in memory. 
+ /// + internal class StringCache : DisposeBase + { + #region Nested Types + + public readonly struct StringCacheEntry + { + public IntPtr NativePtr { get; } + public int Length { get; } + + public StringCacheEntry(IntPtr ptr, int length) + { + NativePtr = ptr; + Length = length; + } + } + + #endregion + + #region Instance + + private readonly InlineList _cache = + InlineList.Create(1); + + #endregion + + #region Methods + + /// + /// Adds a string to the cache, and returns the native pointer and length. + /// + /// The string to convert. + /// The native pointer and length. + public unsafe StringCacheEntry AddString(string value) + { + // Create null-terminated native string + var len = Encoding.ASCII.GetMaxByteCount(value.Length); + var ptr = Marshal.AllocHGlobal(len + 1); + var ptrSpan = new Span((void*)ptr, len); + + len = Encoding.ASCII.GetBytes(value, ptrSpan); + ptrSpan[len] = 0; + + // Add to cache, so that memory is valid until cache is disposed. + var entry = new StringCacheEntry(ptr, len); + _cache.Add(entry); + return entry; + } + + #endregion + + #region IDisposable + + /// + protected override void Dispose(bool disposing) + { + foreach (var entry in _cache) + Marshal.FreeHGlobal(entry.NativePtr); + base.Dispose(disposing); + } + + #endregion + } +} diff --git a/Tools/CudaGenerateLibDeviceTool/CudaDriverVersionUtils.tt b/Tools/CudaGenerateLibDeviceTool/CudaDriverVersionUtils.tt new file mode 100644 index 000000000..bd634a388 --- /dev/null +++ b/Tools/CudaGenerateLibDeviceTool/CudaDriverVersionUtils.tt @@ -0,0 +1,55 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: CudaDriverVersionUtils.tt/CudaDriverVersionUtils.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. 
+// --------------------------------------------------------------------------------------- + +<#@ template debug="false" hostspecific="true" language="C#" #> +<#@ include file="../../Src/ILGPU/Static/TypeInformation.ttinclude" #> +<#@ include file="../../Src/ILGPU/Static/CudaVersions.ttinclude" #> +<#@ assembly name="System.Core" #> +<#@ import namespace="System.Linq" #> +<#@ import namespace="System.Text" #> +<#@ import namespace="System.Collections.Generic" #> +<#@ output extension=".cs" #> +<# +string rootPath = Host.ResolvePath("../../Src/ILGPU/Static"); +var versions = CudaVersions.Load(rootPath, "CudaVersions.xml"); + +var instructionSets = + versions + .GroupBy(x => x.DriverVersion) + .OrderBy(x => x.Key) + .Select(g => (g.Key, g.Min(x => x.InstructionSetVersion))) + .ToArray(); + +#> + +namespace ILGPU.Runtime.Cuda +{ + internal class CudaDriverVersionUtils + { + /// + /// Maps Cuda driver version to the corresponding PTX ISA. + /// + internal static readonly Dictionary< + int, + CudaInstructionSet> DriverVersionLookup = + new Dictionary + { +<# foreach (var instructionSet in instructionSets) { #> +<# var drv = instructionSet.Item1; #> +<# var isa = instructionSet.Item2; #> + { + <#= drv.Major * 1000 + drv.Minor * 10 #>, + CudaInstructionSet.ISA_<#= isa.Major #><#= isa.Minor #> + }, +<# } #> + }; + } +} \ No newline at end of file diff --git a/Tools/CudaGenerateLibDeviceTool/CudaGenerateLibDeviceTool.csproj b/Tools/CudaGenerateLibDeviceTool/CudaGenerateLibDeviceTool.csproj new file mode 100644 index 000000000..6457a45fd --- /dev/null +++ b/Tools/CudaGenerateLibDeviceTool/CudaGenerateLibDeviceTool.csproj @@ -0,0 +1,80 @@ + + + + Exe + net8.0 + enable + enable + true + + + + + + True + True + PTXLibDeviceNvvm.tt + + + + + True + True + CudaArchitecture.Generated.tt + + + + True + True + CudaInstructionSet.Generated.tt + + + + + + + + + + TextTemplatingFileGenerator + PTXLibDeviceNvvm.cs + + + TextTemplatingFileGenerator + CudaArchitecture.Generated.cs + + + 
TextTemplatingFileGenerator + CudaInstructionSet.Generated.cs + + + True + True + CudaDriverVersionUtils.tt + + + + + + + + + + + + + + True + True + CudaDriverVersionUtils.tt + + + + + + TextTemplatingFileGenerator + CudaDriverVersionUtils.cs + + + + diff --git a/Tools/CudaGenerateLibDeviceTool/MinimalCudaAPI.cs b/Tools/CudaGenerateLibDeviceTool/MinimalCudaAPI.cs new file mode 100644 index 000000000..68f803519 --- /dev/null +++ b/Tools/CudaGenerateLibDeviceTool/MinimalCudaAPI.cs @@ -0,0 +1,50 @@ +// --------------------------------------------------------------------------------------- +// ILGPU +// Copyright (c) 2024 ILGPU Project +// www.ilgpu.net +// +// File: MinimalCudaAPI.cs +// +// This file is part of ILGPU and is distributed under the University of Illinois Open +// Source License. See LICENSE.txt for details. +// --------------------------------------------------------------------------------------- + +using System.Runtime.InteropServices; + +namespace CudaGenerateLibDeviceTool +{ + /// + /// Minimal Cuda API binding to allow detecting the current Cuda driver version. + /// + internal static class MinimalCudaAPI + { + delegate int CudaInit(int flags); + delegate int CudaDriverGetVersion(out int driverVersion); + + public static int GetCudaDriverVersion() + { + var cudaLibName = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) + ? 
"nvcuda"
+                : "cuda";
+            var cudaAPI = NativeLibrary.Load(cudaLibName);
+            try
+            {
+                var cuInit =
+                    Marshal.GetDelegateForFunctionPointer(
+                        NativeLibrary.GetExport(cudaAPI, "cuInit"));
+                var cuDriverGetVersion =
+                    Marshal.GetDelegateForFunctionPointer(
+                        NativeLibrary.GetExport(cudaAPI, "cuDriverGetVersion"));
+
+                if (cuInit(0) == 0 && cuDriverGetVersion(out int driverVersion) == 0)
+                    return driverVersion;
+            }
+            finally
+            {
+                NativeLibrary.Free(cudaAPI);
+            }
+
+            throw new NotImplementedException();
+        }
+    }
+}
diff --git a/Tools/CudaGenerateLibDeviceTool/Program.cs b/Tools/CudaGenerateLibDeviceTool/Program.cs
new file mode 100644
index 000000000..cb9ba27c7
--- /dev/null
+++ b/Tools/CudaGenerateLibDeviceTool/Program.cs
@@ -0,0 +1,191 @@
+// ---------------------------------------------------------------------------------------
+//                                        ILGPU
+//                           Copyright (c) 2024 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: Program.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Backends.PTX;
+using ILGPU.Runtime.Cuda;
+using System.Reflection;
+using System.Text;
+using System.Xml;
+
+namespace CudaGenerateLibDeviceTool
+{
+    /// <summary>
+    /// Generates the PTX code of the LibDevice functions and stores it as XML.
+    /// </summary>
+    internal class Program
+    {
+        /// <summary>
+        /// Compiles every function listed in CudaLibDevice.xml to PTX and writes the
+        /// PTX module and its forward declaration to CudaLibDevicePtx.xml.
+        /// </summary>
+        /// <param name="args">The command-line arguments (unused).</param>
+        /// <exception cref="InvalidOperationException">
+        /// A library path or the PTX generated for a function is null.
+        /// </exception>
+        static void Main(string[] args)
+        {
+            // Configure NVVM and LibDevice.
+            PTXLibDevice.FindLibDevicePaths(
+                out _,
+                out _,
+                out var libNvvmPath,
+                out _,
+                out var libDevicePath);
+            if (libNvvmPath is null || libDevicePath is null)
+            {
+                throw new InvalidOperationException(
+                    "Could not locate the NVVM and LibDevice libraries.");
+            }
+
+            using var nvvmAPI = NvvmAPI.Create(libNvvmPath, libDevicePath);
+
+            // Generate the PTX for each of the LibDevice methods.
+            var filePath = Path.Combine(GetDefaultFolder(), "CudaLibDevicePtx.xml");
+            var minArchitecture = CudaArchitecture.SM_60;
+            var driverVersion = MinimalCudaAPI.GetCudaDriverVersion();
+            var instructionSet =
+                CudaDriverVersionUtils.DriverVersionLookup[driverVersion];
+
+            using var doc = XmlWriter.Create(
+                filePath,
+                new XmlWriterSettings
+                {
+                    Indent = true,
+                    Encoding = Encoding.UTF8
+                });
+            doc.WriteStartElement("LibDevicePtx");
+            doc.WriteAttributeString("MinArchitectureMajor", $"{minArchitecture.Major}");
+            doc.WriteAttributeString("MinArchitectureMinor", $"{minArchitecture.Minor}");
+            doc.WriteAttributeString("MinInstructionSetMajor", $"{instructionSet.Major}");
+            doc.WriteAttributeString("MinInstructionSetMinor", $"{instructionSet.Minor}");
+            doc.WriteAttributeString("MinDriverVersion", $"{driverVersion}");
+
+            foreach (var method in LoadMethodNames())
+            {
+                PTXLibDevice.GenerateLibDeviceCode(
+                    nvvmAPI,
+                    minArchitecture,
+                    new[] { method },
+                    out var ptx);
+                if (ptx is null)
+                {
+                    throw new InvalidOperationException(
+                        $"No PTX was generated for LibDevice method '{method}'.");
+                }
+
+                ptx = ptx.ReplaceLineEndings().Trim();
+                var decl = ParseDeclaration(ptx);
+
+                doc.WriteStartElement("Function");
+                doc.WriteAttributeString("Name", method);
+                doc.WriteAttributeString("PtxModule", ptx);
+                doc.WriteAttributeString("PtxDeclaration", decl);
+                doc.WriteEndElement();
+            }
+
+            // Close the root element explicitly rather than relying on disposal.
+            doc.WriteEndElement();
+        }
+
+        /// <summary>
+        /// Derives the forward declaration of the function defined in a PTX module.
+        /// </summary>
+        /// <param name="ptx">The PTX module text.</param>
+        /// <returns>
+        /// The text between .visible and the first following { brace, prefixed with
+        /// .extern and terminated by a semicolon.
+        /// </returns>
+        /// <exception cref="FormatException">
+        /// The module contains no .visible keyword followed by a { brace.
+        /// </exception>
+        private static string ParseDeclaration(string ptx)
+        {
+            // The PTX function starts with .visible, and ends when the function body
+            // opens with a { brace.
+            //
+            // This is a forward declaration, so use .extern.
+            const string VisibleKeyword = ".visible";
+            var keywordIdx = ptx.IndexOf(VisibleKeyword, StringComparison.Ordinal);
+            if (keywordIdx < 0)
+                throw new FormatException("PTX module has no .visible function.");
+
+            var startIdx = keywordIdx + VisibleKeyword.Length;
+            var endIdx = ptx.IndexOf('{', startIdx);
+            if (endIdx < 0)
+                throw new FormatException("PTX function has no body.");
+
+            return string.Concat(".extern", ptx.AsSpan(startIdx, endIdx - startIdx), ";");
+        }
+
+        /// <summary>
+        /// Reads the LibDevice function names from CudaLibDevice.xml.
+        /// </summary>
+        /// <returns>The Name attribute of every Function element.</returns>
+        private static IEnumerable<string> LoadMethodNames()
+        {
+            var doc = new XmlDocument();
+            var filePath = Path.Combine(GetDefaultFolder(), "CudaLibDevice.xml");
+            doc.Load(filePath);
+
+            var functionNodes = doc.SelectNodes("//Function");
+            if (functionNodes is null)
+                yield break;
+
+            foreach (var element in functionNodes.OfType<XmlElement>())
+                yield return element.GetAttribute("Name");
+        }
+
+        /// <summary>
+        /// Gets the Src/ILGPU/Static folder of the repository containing this tool.
+        /// </summary>
+        /// <returns>The full path of the folder.</returns>
+        /// <exception cref="InvalidOperationException">
+        /// No enclosing repository root was found.
+        /// </exception>
+        private static string GetDefaultFolder()
+        {
+            var repository = GetRepositoryFromFile()
+                ?? throw new InvalidOperationException(
+                    "Could not find the repository root (no .git entry found).");
+            return Path.Combine(repository.FullName, "Src", "ILGPU", "Static");
+        }
+
+        /// <summary>
+        /// Walks up from the entry assembly's folder to the repository root.
+        /// </summary>
+        /// <returns>
+        /// The closest ancestor folder containing a .git entry; null if there is none
+        /// or if the walk reaches a folder that is itself named .git.
+        /// </returns>
+        private static DirectoryInfo? GetRepositoryFromFile()
+        {
+            const string DotGit = ".git";
+            var file = new FileInfo(Assembly.GetEntryAssembly()!.Location);
+            var next = file.Directory;
+
+            while (next != null)
+            {
+                if (next.Name.Equals(DotGit, StringComparison.OrdinalIgnoreCase))
+                    return default;
+
+                // .git is a folder in a regular clone, but a file in worktrees and
+                // submodules.
+                var gitPath = Path.Combine(next.FullName, DotGit);
+                if (Directory.Exists(gitPath) || File.Exists(gitPath))
+                    return next;
+
+                next = next.Parent;
+            }
+
+            return default;
+        }
+    }
+}
diff --git a/Tools/Tools.sln b/Tools/Tools.sln
index 0adc38216..170d11462 100644
--- a/Tools/Tools.sln
+++ b/Tools/Tools.sln
@@ -17,6 +17,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "GenerateCompatibilitySuppre
 		GenerateCompatibilitySuppressionFiles\GenerateCompatibilitySuppressionFiles.ps1 = GenerateCompatibilitySuppressionFiles\GenerateCompatibilitySuppressionFiles.ps1
 	EndProjectSection
 EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CudaGenerateLibDeviceTool", "CudaGenerateLibDeviceTool\CudaGenerateLibDeviceTool.csproj", "{87DB64B0-3A32-429A-8652-3348591602F9}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -31,6 +33,10 @@ Global
{2F92ECC4-CE02-4B1A-8B59-DC866185DA69}.Debug|Any CPU.Build.0 = Debug|Any CPU {2F92ECC4-CE02-4B1A-8B59-DC866185DA69}.Release|Any CPU.ActiveCfg = Release|Any CPU {2F92ECC4-CE02-4B1A-8B59-DC866185DA69}.Release|Any CPU.Build.0 = Release|Any CPU + {87DB64B0-3A32-429A-8652-3348591602F9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {87DB64B0-3A32-429A-8652-3348591602F9}.Debug|Any CPU.Build.0 = Debug|Any CPU + {87DB64B0-3A32-429A-8652-3348591602F9}.Release|Any CPU.ActiveCfg = Release|Any CPU + {87DB64B0-3A32-429A-8652-3348591602F9}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE