Skip to content

Commit

Permalink
Merge pull request #4882 from sfantao/amd-dev
Browse files Browse the repository at this point in the history
Add support for AMD GPUs using HIP/ROCm
  • Loading branch information
danpovey authored Nov 10, 2023
2 parents 49faa67 + 7efdeae commit e8ed610
Show file tree
Hide file tree
Showing 66 changed files with 1,106 additions and 169 deletions.
6 changes: 6 additions & 0 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ SUBDIRS += $(CUDADECODER)
endif
endif

ifeq ($(ROCM), true)
ifeq ($(WITH_CUDADECODER), true)
SUBDIRS += $(CUDADECODER)
endif
endif

SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS))
SUBDIRS_BIN = $(filter %bin, $(SUBDIRS))

Expand Down
9 changes: 8 additions & 1 deletion src/chain/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ TESTFILES = chain-supervision-test language-model-test
OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \
language-model.o chain-denominator.o chain-training.o \
chain-generic-numerator.o
ifeq ($(CUDA), true)
ifeq ($(IS_GPU_BUILD), true)
OBJFILES += chain-kernels.o
endif

Expand All @@ -28,7 +28,14 @@ ifeq ($(CUDA), true)
endif

# Implicit rule for kernel compilation,
ifeq ($(CUDA), true)
%.o : %.cu
$(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../
endif
ifeq ($(ROCM), true)
%.o : %.cu
$(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../
endif


include ../makefiles/default_rules.mk
4 changes: 4 additions & 0 deletions src/chain/chain-kernels-ansi.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
#define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_
#include "chain/chain-datastruct.h"

#ifdef __IS_HIP_COMPILE__
#include <hip/hip_runtime_api.h>
#endif

#if HAVE_CUDA == 1
extern "C" {

Expand Down
5 changes: 5 additions & 0 deletions src/chain/chain-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@
#include <cfloat>
#include "chain/chain-kernels-ansi.h"

#ifdef __IS_HIP_COMPILE__
#define __CUDA_ARCH__ 800
#include <hip/hip_runtime.h>
#endif

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200
#error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \
configure with --use-cuda=no (this will disable the use of GPU).
Expand Down
93 changes: 89 additions & 4 deletions src/configure
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ Configuration options:
--cudatk-dir=DIR CUDA toolkit directory
--cuda-arch=FLAGS Override the default CUDA_ARCH flags. See:
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples.
--use-rocm Build with ROCm
--rocm-dir=DIR ROCM directory
--rocm-targets=TGTS Comma separated list of GPU targets to target through ROCm
--debug-level=N Use assertion level 0 (disabled), 1, or 2 [default=1]
--double-precision Build with BaseFloat set to double if yes [default=no],
mostly useful for testing purposes.
Expand Down Expand Up @@ -248,6 +251,71 @@ function check_for_slow_expf {
fi
}

# ROCM is used only in selected directories including src/cudamatrix, src/nnet*
# and src/chain*. It is used to accelerate the neural network training.
# The rest of Kaldi runs on CPUs.

function configure_rocm {
# Check for ROCM in the system
if [ ! -d "$ROCMDIR" ]; then
for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do
if [ -f $base/bin/hipcc ] && [ -f $base/bin/hipconfig ]; then
ROCMDIR=$base
break
fi
done
fi

if [ -d "$ROCMDIR" ]; then
if [ ! -f $ROCMDIR/bin/hipcc ]; then
failure "Cannnot find hipcc and hipconfig in ROCm directory $ROCMDIR"
fi
fi
echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)"
echo >> kaldi.mk
echo "# ROCm configuration" >> kaldi.mk
echo >> kaldi.mk
echo IS_GPU_BUILD = true >> kaldi.mk
echo ROCM = true >> kaldi.mk
echo "ROCMDIR = $ROCMDIR" >> kaldi.mk
echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk

echo "CUDA_ARCH = " >> kaldi.mk
echo "ROCM_ARCH_FLAGS = " >> kaldi.mk
for i in ${ROCM_TARGETS//,/ } ; do
echo "Targetting ROCm arch $i"
echo "ROCM_ARCH_FLAGS += --offload-arch=$i" >> kaldi.mk
done

echo "HOST_ARCH = `uname -m`" >> kaldi.mk
echo >> kaldi.mk

ROCM_MAJOR_VERSION=$(hipconfig -v | cut -d. -f1)
echo "ROCM_MAJOR_VERSION = $ROCM_MAJOR_VERSION" >> kaldi.mk
ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2)
echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk

# Only ROCm 5.2+ is supported.
if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -lt 2 ] || [ $ROCM_MAJOR_VERSION -lt 5 ] ; then
echo "\
WARNING: ROCm $ROCM_MAJOR_VERSION.$ROCM_MINOR_VERSION found but ROCm 5.2 or above is required."
exit 1;
fi

# 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so,
# use direct calls to uname -m here
if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then
cat makefiles/hip_64bit.mk >> kaldi.mk
else
echo "\
WARNING: ROCm will not be used!
ROCm is only supported with 64-bit Linux builds."
exit 1;
fi
}



# CUDA is used only in selected directories including src/cudamatrix, src/nnet*
# and src/chain*. It is used to accelerate the neural network training.
# The rest of Kaldi runs on CPUs.
Expand Down Expand Up @@ -379,6 +447,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\
echo "# CUDA configuration" >> kaldi.mk
echo >> kaldi.mk

echo IS_GPU_BUILD = true >> kaldi.mk
echo CUDA = true >> kaldi.mk
echo CUDATKDIR = $CUDATKDIR >> kaldi.mk
echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk
Expand Down Expand Up @@ -610,7 +679,8 @@ ENV_LDLIBS=$LDLIBS
debug_level=1
double_precision=false
dynamic_kaldi=false
use_cuda=true
use_cuda=false
use_rocm=false
with_cudadecoder=true
static_fst=false
static_math=false
Expand Down Expand Up @@ -659,8 +729,11 @@ do
--atlas-root=*)
GetSwitchExistingPathOrDie ATLASROOT "$1"
shift ;;
--use-cuda)
use_cuda=true;
--use-rocm)
use_rocm=true;
shift ;;
--use-rocm=no)
use_rocm=false;
shift ;;
--use-cuda=yes)
use_cuda=true;
Expand Down Expand Up @@ -737,6 +810,13 @@ do
--mathlib=*)
GetSwitchValueOrDie MATHLIB "$1"
shift ;;
--rocm-dir=*)
# ROCM is used in src/cudamatrix and src/nnet{,bin} only.
GetSwitchExistingPathOrDie ROCMDIR "$1"
shift ;;
--rocm-targets=*)
GetSwitchValueOrDie ROCM_TARGETS "$1"
shift ;;
--cudatk-dir=*)
# CUDA is used in src/cudamatrix and src/nnet{,bin} only.
GetSwitchExistingPathOrDie CUDATKDIR "$1"
Expand Down Expand Up @@ -976,7 +1056,11 @@ if $use_cuda; then
fi
echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk
else
echo "WITH_CUDADECODER = false" >> kaldi.mk
if $use_rocm; then
echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk
else
echo "WITH_CUDADECODER = false" >> kaldi.mk
fi
fi
echo >> kaldi.mk

Expand Down Expand Up @@ -1312,6 +1396,7 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)."
failure "Unsupported linear algebra library '$MATHLIB'"
fi
$use_cuda && configure_cuda
$use_rocm && configure_rocm
linux_configure_speex
else
failure "Could not detect the platform or we have not yet worked out the
Expand Down
10 changes: 9 additions & 1 deletion src/cudadecoder/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ all: ;
EXTRA_CXXFLAGS = -Wno-sign-compare
include ../kaldi.mk

ifeq ($(CUDA), true)
ifeq ($(IS_GPU_BUILD), true)
ifeq ($(WITH_CUDADECODER), true)

# Make sure we have CUDA_ARCH from kaldi.mk,
ifeq ($(CUDA), true)
ifndef CUDA_ARCH
$(error CUDA_ARCH is undefined, run 'src/configure')
endif
endif

TESTFILES =

Expand All @@ -34,8 +36,14 @@ LDLIBS += $(CUDA_LDLIBS)


# Implicit rule for kernel compilation
ifeq ($(CUDA), true)
%.o : %.cu
$(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC)
endif
ifeq ($(ROCM), true)
%.o : %.cu
$(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC)
endif

else
all:
Expand Down
5 changes: 5 additions & 0 deletions src/cudadecoder/batched-static-nnet3-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@

#include "cudadecoder/batched-static-nnet3-kernels.h"

#ifdef __IS_HIP_COMPILE__
#include "hip/hip_runtime.h"
#include "hipify.h"
#endif

#include <stdio.h>
namespace kaldi {
namespace cuda_decoder {
Expand Down
6 changes: 6 additions & 0 deletions src/cudadecoder/batched-static-nnet3-kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@

#if HAVE_CUDA == 1

#ifdef __IS_HIP_COMPILE__
#include <hip/hip_runtime_api.h>

#include "hipify.h"
#else
#include <cuda_runtime_api.h>
#endif
#include "base/kaldi-types.h"

#ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@

#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"

#ifdef __IS_HIP_COMPILE__
#include <roctracer/roctx.h>

#include "hipify.h"
#else
#include <nvToolsExt.h>
#endif

#include <mutex>
#include <numeric>
Expand Down
6 changes: 6 additions & 0 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,13 @@

#include <memory>

#ifdef __IS_HIP_COMPILE__
#include <roctracer/roctx.h>

#include "hipify.h"
#else
#include <nvToolsExt.h>
#endif

#include "base/kaldi-utils.h"
#include "cudadecoder/cuda-fst.h"
Expand Down
6 changes: 6 additions & 0 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@

#include <atomic>

#ifdef __IS_HIP_COMPILE__
#include <roctracer/roctx.h>

#include "hipify.h"
#else
#include <nvToolsExt.h>
#endif

namespace kaldi {
namespace cuda_decoder {
Expand Down
4 changes: 2 additions & 2 deletions src/cudadecoder/cuda-decoder-kernels-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ __device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) {
value.i2 = val;
if (old.i2.x <= val.x) return;
do {
assumed = old;
assumed.ull = old.ull;
old.ull = atomicCAS(ptr64, assumed.ull, value.ull);
} while (old.ull != assumed.ull && old.i2.x > value.i2.x);
}
Expand All @@ -148,7 +148,7 @@ __device__ void atomicSubI2(int2 *ptr, int2 sub) {
UInt64UnionInt2 old, assumed, value;
old.ull = *ptr64;
do {
assumed = old;
assumed.ull = old.ull;
value.i2.x = assumed.i2.x - sub.x;
value.i2.y = assumed.i2.y - sub.y;
old.ull = atomicCAS(ptr64, assumed.ull, value.ull);
Expand Down
7 changes: 7 additions & 0 deletions src/cudadecoder/cuda-decoder-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef __IS_HIP_COMPILE__
#include <hipcub/hipcub.hpp>

#include "float.h"
#include "hipify.h"
#else
#include <cub/cub.cuh>
#endif
#include "cuda-decoder-kernels.h"
#include "cuda-decoder-kernels-utils.h"

Expand Down
Loading

0 comments on commit e8ed610

Please sign in to comment.