From 2f3a7d3b75b92bbd47ba4cbee9f741e602b9f954 Mon Sep 17 00:00:00 2001
From: Douceur
Date: Mon, 30 Oct 2023 22:14:24 +0100
Subject: [PATCH] Remove commented-out CudaErrCheck debug code

---
 cuda/CudaMatch.cu         | 57 ---------------------------------------
 cuda/CudaPlotPhase2.cu    | 30 +--------------------
 cuda/GpuDownloadStream.cu |  5 +---
 cuda/GpuQueue.cu          |  1 -
 cuda/GpuStreams.cu        |  1 -
 cuda/chacha8.cu           |  1 -
 6 files changed, 2 insertions(+), 93 deletions(-)

diff --git a/cuda/CudaMatch.cu b/cuda/CudaMatch.cu
index e827547f..359df68d 100644
--- a/cuda/CudaMatch.cu
+++ b/cuda/CudaMatch.cu
@@ -564,17 +564,6 @@ cudaError CudaHarvestMatchK32(
     HarvestMatchK32Kernel<<>>(
         devOutPairs, devMatchCount, devYEntries, entryCount, matchOffset );
 
-// #if _DEBUG
-//     uint32 matchCount = 0;
-//     CudaErrCheck( cudaMemcpyAsync( &matchCount, devMatchCount, sizeof( uint32 ) , cudaMemcpyDeviceToHost, stream ) );
-//     CudaErrCheck( cudaStreamSynchronize( stream ) );
-//     CudaErrCheck( cudaStreamSynchronize( stream ) );
-
-//     Pair* matches = new Pair[matchCount];
-//     CudaErrCheck( cudaMemcpyAsync( matches, devOutPairs, sizeof( Pair ) * matchCount , cudaMemcpyDeviceToHost, stream ) );
-//     CudaErrCheck( cudaStreamSynchronize( stream ) );
-//     CudaErrCheck( cudaStreamSynchronize( stream ) );
-// #endif
 
     return cudaSuccess;
 }
@@ -621,49 +610,3 @@ void CudaMatchBucketizedK32(
     MatchCudaK32Bucket<<>>( bucketMask, entryCount, cx.devGroupCount, devY, cx.devGroupBoundaries, cx.devMatchCount, cx.devMatches );
 }
 
-//-----------------------------------------------------------
-// cudaError CudaHarvestMatchK32WithGroupScan(
-//     Pair*         devOutPairs,
-//     uint32*       devMatchCount,
-//     const uint32  maxMatches,
-//     uint32*       devGroupIndices,
-//     uint32*       devGroupIndicesTemp,
-//     const uint32  maxGroups,
-//     void*         sortBuffer,
-//     const size_t  sortBufferSize,
-//     const uint64* devYEntries,
-//     const uint32  entryCount,
-//     const uint32  matchOffset,
-//     cudaStream_t  stream )
-// {
-//     // Scan for BC groups
-//     {
-//         const uint32 kblocks  = 0;
-//         const uint32 kthreads = 0;
-
-
-//         // constexpr uint32 kscanblocks = CuCDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, BBCU_SCAN_GROUP_THREADS );
-//         // Initialize the entries to the max value so that they are not included in the sort
-//         CudaInitGroups<<>>( devGroupIndicesTemp, entryCount );
-//         // CudaInitGroupsBucket<<>>( tmpGroupCounts );
-
-//         // Add first group and last ghost group
-//         CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
-//     }
-
-//     CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
-//     CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
-//     ScanGroupsCudaK32Bucket<<>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );
-
-//     byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
-//     size_t sortTmpSize = ( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_MAX_GROUP_COUNT ) * sizeof( uint32 );
-
-// #if _DEBUG
-//     size_t sortSize = 0;
-//     cub::DeviceRadixSort::SortKeys( nullptr, sortSize, nullptr, nullptr, BBCU_MAX_GROUP_COUNT, 0, 32 );
-//     ASSERT( sortSize <= sortTmpSize );
-// #endif
-
-//     cub::DeviceRadixSort::SortKeys( sortTmpAlloc, sortTmpSize, tmpGroupCounts, cx.devGroupBoundaries, BBCU_MAX_GROUP_COUNT, 0, 32, stream );
-
-// }
diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu
index 8d2d5094..87c1b16c 100644
--- a/cuda/CudaPlotPhase2.cu
+++ b/cuda/CudaPlotPhase2.cu
@@ -245,22 +245,7 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
 
 #if _DEBUG
     p2.outMarks.WaitForCompletion();
 
-    // CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
-    // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[0]->GetStream() ) );
-    // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[1]->GetStream() ) );
-    // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[2]->GetStream() ) );
-    // byte*   hByteField = bbcvirtalloc( GetMarkingTableByteSize() );
-    // uint64* hBitField  = bbcvirtalloc( GetMarkingTableBitFieldSize() );
-    // uint64* rBitField  = bbcvirtalloc( GetMarkingTableBitFieldSize() );
-    // CudaErrCheck( cudaMemcpyAsync( hByteField, devLMarks, GetMarkingTableByteSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
-    // CudaErrCheck( cudaMemcpyAsync( hBitField, bitfield, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
-
-    // if( rTable < TableId::Table7 )
-    //     CudaErrCheck( cudaMemcpyAsync( rBitField, p2.devRMarks, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
-
-    // CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
-    // // (void)p2.outMarks.GetDeviceBuffer();
     uint64* hBitField = cx.hostMarkingTables[(int)lTable];
 
     std::atomic bitfieldPrunedEntryCount = 0;
@@ -276,23 +261,10 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
 
         uint64 localPrunedEntryCount = 0;
         uint64 rPrunedEntryCount     = 0;
-        // BitField rMarks( rBitField, rEntryCount );
-        // const byte* bytefield = hByteField;
 
         uint64 count, offset, end;
 
-        // // Count r entries again to make sure it's still valid
-        // if( rt < TableId::Table7 )
-        // {
-        //     GetThreadOffsets( self, rEntryCount, count, offset, end );
-        //     for( uint64 i = offset; i < end; i++ )
-        //     {
-        //         if( rMarks.Get( i ) )
-        //             rPrunedEntryCount ++;
-        //     }
-
-        //     rTablePrunedEntryCount += rPrunedEntryCount;
-        // }
+        GetThreadOffsets( self, lEntryCount, count, offset, end );
 
 
         // for( uint64 i = offset; i < end; i++ )
diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu
index 3d06973c..4d35a06b 100644
--- a/cuda/GpuDownloadStream.cu
+++ b/cuda/GpuDownloadStream.cu
@@ -64,17 +64,14 @@ void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, co
 
     // const void* devBuffer = self->deviceBuffer[index];
 
     // // Signal from the work stream when it has finished doing kernel work with the device buffer
-    // CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) );
 
     // // Ensure the work stream has completed writing data to the device buffer
     // cudaStream_t stream = self->queue->_stream;
 
-    // CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) );
 
     // // Copy
-    // CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) );
-
+
     // // Signal that the device buffer is free to be re-used
     // CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
 
diff --git a/cuda/GpuQueue.cu b/cuda/GpuQueue.cu
index 399a0fbf..f673f640 100644
--- a/cuda/GpuQueue.cu
+++ b/cuda/GpuQueue.cu
@@ -171,7 +171,6 @@ struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const GpuStreamDescriptor& desc, b
         CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i]         , cudaEventDisableTiming ) );
         CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) );
         CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i]    , cudaEventDisableTiming ) );
-        // CudaErrCheck( cudaEventCreateWithFlags( &buf->preloadEvents[i]  , cudaEventDisableTiming ) );
         CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i]    , cudaEventDisableTiming ) );
         CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent , cudaEventDisableTiming ) );
 
diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu
index 63700c9c..2c129408 100644
--- a/cuda/GpuStreams.cu
+++ b/cuda/GpuStreams.cu
@@ -99,7 +99,6 @@ void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, con
 
     // cpy.copy.size = copySize;
 
     // // Launch copy command
-    // CudaErrCheck( cudaLaunchHostFunc( self->queue->GetStream(), []( void* userData ){
     //     const CopyInfo& c    = *reinterpret_cast( userData );
     //     IGpuBuffer*     self = c.self;
diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu
index 7fb7c5d0..3e17c15a 100644
--- a/cuda/chacha8.cu
+++ b/cuda/chacha8.cu
@@ -293,7 +293,6 @@ static void DbgValidateBucket( CudaK32PlotContext& cx, const uint32 bucket )
     if( _dbgPool == nullptr )
        _dbgPool = new ThreadPool( SysHost::GetLogicalCPUCount() );
 
-    //CudaErrCheck( cudaStreamSynchronize( cx.downloadStream ) );
 
     Log::Line( "Validating bucket %u", bucket );
     AnonMTJob::Run( *_dbgPool, [&cx, bucket]( AnonMTJob* self ) {