diff --git a/marlin/marlin_cuda_kernel.cu b/marlin/marlin_cuda_kernel.cu index 61b0658..ae4cef5 100644 --- a/marlin/marlin_cuda_kernel.cu +++ b/marlin/marlin_cuda_kernel.cu @@ -63,7 +63,7 @@ __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool ); } -// Asynchronous global->shared copy with a chache hint indicating that the values may be evicted immediately; used for +// Asynchronous global->shared copy with a cache hint indicating that the values may be evicted immediately; used for // quantized weights B, which are only accessed precisely once and should thus not pollute the L2 cache which we need // for inputs A and outputs C. __device__ inline void cp_async4_stream(void* smem_ptr, const void* glob_ptr) {