|
| 1 | +#include <cuda.h> |
| 2 | +#include "cuda_runtime.h" |
| 3 | +#include "device_launch_parameters.h" |
| 4 | +#include "sm_30_intrinsics.h" |
| 5 | + |
| 6 | +#include <stdio.h> |
| 7 | +#include <memory.h> |
| 8 | +#include <stdint.h> |
| 9 | + |
// from cpu-miner.c: indexed by thr_id, passed to the runtime as a device number
extern int device_map[8];

// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

// device properties; this structure is requested in the init function
static cudaDeviceProp props[8];

// per-thread flag / prefix-sum buffers for the two branches
static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_tempBranch2Nonces[8];

// intermediate storage: the nonce of every thread
static uint32_t *d_tempBranchAllNonces[8];

// number of valid entries per branch (device copy and host copy)
static size_t *d_numValid[8];
static size_t *h_numValid[8];

// 2x partial sums
static uint32_t *d_partSum1[8];
static uint32_t *d_partSum2[8];

// untouched copies of the per-thread flags, one array per branch
static uint32_t *d_validTemp1[8];
static uint32_t *d_validTemp2[8];
| 29 | + |
| 30 | + |
| 31 | + |
// Setup functions

// Reports a failed CUDA runtime call made during initialisation.
// Returns true when `err` is cudaSuccess, false (after logging to stderr) otherwise.
static bool jackpot_compactTest_initCheck(cudaError_t err, const char *what, int thr_id)
{
    if (err == cudaSuccess)
        return true;

    fprintf(stderr, "jackpot_compactTest init (thr_id %d): %s failed: %s\n",
            thr_id, what, cudaGetErrorString(err));
    return false;
}

// Queries the device properties for `thr_id` and allocates every buffer the
// compaction needs for up to `threads` elements.
//
//   thr_id   index into the per-thread buffer tables (0..7)
//   threads  number of elements (nonces) processed per call
//
// Failures are logged to stderr; initialisation then continues with the
// remaining allocations, exactly as before, so the caller-visible interface
// is unchanged.
__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
{
    jackpot_compactTest_initCheck(
        cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]),
        "cudaGetDeviceProperties", thr_id);

    const size_t perThreadBytes = sizeof(uint32_t) * threads;

    // we also need storage on the device
    jackpot_compactTest_initCheck(cudaMalloc(&d_tempBranchAllNonces[thr_id], perThreadBytes),
                                  "cudaMalloc(d_tempBranchAllNonces)", thr_id);
    jackpot_compactTest_initCheck(cudaMalloc(&d_tempBranch1Nonces[thr_id], perThreadBytes),
                                  "cudaMalloc(d_tempBranch1Nonces)", thr_id);
    jackpot_compactTest_initCheck(cudaMalloc(&d_tempBranch2Nonces[thr_id], perThreadBytes),
                                  "cudaMalloc(d_tempBranch2Nonces)", thr_id);
    jackpot_compactTest_initCheck(cudaMalloc(&d_numValid[thr_id], 2*sizeof(size_t)),
                                  "cudaMalloc(d_numValid)", thr_id);
    // page-locked host memory for the two result counters
    jackpot_compactTest_initCheck(cudaMallocHost(&h_numValid[thr_id], 2*sizeof(size_t)),
                                  "cudaMallocHost(h_numValid)", thr_id);

    // one partial sum per block of 256 threads; rounded up so the request is
    // never zero bytes (identical to threads/256 for multiples of 256)
    const uint32_t s1 = (uint32_t)((threads + 255) / 256);
    const size_t partialBytes = sizeof(uint32_t) * s1;

    jackpot_compactTest_initCheck(cudaMalloc(&d_partSum1[thr_id], partialBytes),
                                  "cudaMalloc(d_partSum1)", thr_id);
    jackpot_compactTest_initCheck(cudaMalloc(&d_partSum2[thr_id], partialBytes),
                                  "cudaMalloc(d_partSum2)", thr_id);

    jackpot_compactTest_initCheck(cudaMalloc(&d_validTemp1[thr_id], perThreadBytes),
                                  "cudaMalloc(d_validTemp1)", thr_id);
    jackpot_compactTest_initCheck(cudaMalloc(&d_validTemp2[thr_id], perThreadBytes),
                                  "cudaMalloc(d_validTemp2)", thr_id);
}
| 53 | + |
// The test function (builds the test map).
//
// One thread per input hash on a 1D grid; threads with an index >= `threads`
// do nothing.
//
//   threads       number of hashes to classify
//   startNounce   nonce that belongs to thread 0
//   inpHashes     input hashes, 16 32-bit words per thread
//   d_noncesFull  out: nonce of every thread (startNounce + thread index)
//   d_nonces1     out: 1 if bit 0 of the first hash word is set, else 0
//   d_validT1     out: same value as d_nonces1
//   d_nonces2     out: 1 if bit 0 of the first hash word is clear, else 0
//   d_validT2     out: same value as d_nonces2
__global__ void jackpot_compactTest_gpu_TEST_64(int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesFull,
                                                uint32_t *d_nonces1, uint32_t *d_nonces2,
                                                uint32_t *d_validT1, uint32_t *d_validT2)
{
    const int gid = (blockDim.x * blockIdx.x + threadIdx.x);
    if (gid >= threads)
        return;

    // lowest bit of the first hash word selects the branch
    const uint32_t *hash = inpHashes + 16 * gid;
    const uint32_t lowBit = hash[0] & 0x01;
    const uint32_t inBranch1 = lowBit;        // 1 when the bit is set
    const uint32_t inBranch2 = lowBit ^ 1u;   // 1 when the bit is clear

    d_nonces1[gid] = inBranch1;
    d_validT1[gid] = inBranch1;
    d_nonces2[gid] = inBranch2;
    d_validT2[gid] = inBranch2;

    // determine the current counter
    d_noncesFull[gid] = startNounce + gid;
}
| 77 | + |
// Shuffle-up that compiles on both old and new toolkits: CUDA 9+ uses the
// *_sync intrinsic with an explicit member mask, older toolkits fall back to
// the legacy mask-less intrinsic (where `mask` is ignored).
static __device__ __forceinline__ uint32_t jackpot_compactTest_shflUp(unsigned int mask, uint32_t v,
                                                                      unsigned int delta, int width)
{
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
    return __shfl_up_sync(mask, v, delta, width);
#else
    (void)mask;
    return (uint32_t)__shfl_up((int)v, delta, width);
#endif
}

// The summing function (from the NVIDIA SDK): in-place inclusive prefix sum of
// `data`, independently per thread block.
//
// Launch layout: 1D grid, one thread per element; every thread reads and
// writes data[global index] without a bounds check, so gridDim.x * blockDim.x
// must not exceed the array length.
// Shared memory: dynamic, at least (blockDim.x / width) 32-bit words.
//
//   data          in/out: values to scan
//   width         shuffle segment size; must be a power of two <= 32 and
//                 blockDim.x / width must not exceed width
//   partial_sums  optional out: total of each block, indexed by blockIdx.x
//
// Requires warp shuffle support (SM30+).
__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL)
{
    extern __shared__ uint32_t sums[];
    int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
    int lane_id = id % width;
    // determine a segment ("warp") id within the block
    int warp_id = threadIdx.x / width;
    // number of segments in this block == number of valid entries in sums[]
    int num_segments = blockDim.x / width;

    // Record "value" as a variable - we accumulate it along the way
    uint32_t value = data[id];

    // Accumulate in log steps up the chain: add the value of the thread that
    // is i lanes below. Lanes whose partner would be outside the segment are
    // left unchanged, which yields the inclusive scan of the segment.
    // (i == width would never satisfy lane_id >= i, so the loop stops before it.)
#pragma unroll
    for (int i=1; i<width; i*=2)
    {
        uint32_t n = jackpot_compactTest_shflUp(0xffffffffu, value, i, width);

        if (lane_id >= i) value += n;
    }

    // value now holds the scan value for the individual thread;
    // the last lane of each segment publishes the segment total
    if (threadIdx.x % width == width-1)
    {
        sums[warp_id] = value;
    }

    __syncthreads();

    // Member mask for the threads that take part in the second-level scan.
    // Evaluated by all threads so the ballot itself is convergent.
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
    const unsigned int seg0_mask = __ballot_sync(0xffffffffu, warp_id == 0);
#else
    const unsigned int seg0_mask = 0xffffffffu;
#endif

    //
    // scan sum the segment totals
    // the same shfl scan operation, but performed on the totals
    //
    if (warp_id == 0)
    {
        // only the first num_segments entries of sums[] exist; lanes beyond
        // that contribute 0 instead of touching shared memory out of bounds
        const bool has_sum = (lane_id < num_segments);
        uint32_t warp_sum = has_sum ? sums[lane_id] : 0;

        for (int i=1; i<width; i*=2)
        {
            uint32_t n = jackpot_compactTest_shflUp(seg0_mask, warp_sum, i, width);

            if (lane_id >= i) warp_sum += n;
        }

        if (has_sum) sums[lane_id] = warp_sum;
    }

    __syncthreads();

    // perform a uniform add across segments in the block:
    // read the preceding segment's running total and add it to this thread's value
    uint32_t blockSum = 0;

    if (warp_id > 0)
    {
        blockSum = sums[warp_id-1];
    }

    value += blockSum;

    // Now write out our result
    data[id] = value;

    // last thread has the block total, write it out
    if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
    {
        partial_sums[blockIdx.x] = value;
    }
}
| 161 | + |
// Uniform add: adds partial_sums[blockIdx.x] to every element handled by the block.
//
// Launch layout: 1D grid, one thread per element of `data`.
//
//   data          in/out: array to update
//   partial_sums  per-block offsets, indexed by blockIdx.x
//   len           number of elements; threads with an index >= len do nothing
__global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
{
    __shared__ uint32_t buf;
    int id = ((blockIdx.x * blockDim.x) + threadIdx.x);

    // No early return: every thread of the block must reach __syncthreads().
    // Index len itself is already past the end, hence '<' and not '<='.
    const bool in_range = (id < len);

    // Thread 0 has the smallest index of the block; if it is out of range the
    // whole block is, and nobody reads buf.
    if (threadIdx.x == 0 && in_range)
    {
        buf = partial_sums[blockIdx.x];
    }

    __syncthreads();

    if (in_range)
    {
        data[id] += buf;
    }
}
| 178 | + |
// The scatter: copies the data element of every flagged thread to its
// compacted position.
//
// Launch layout: 1D grid, one thread per element; there is no bounds check,
// so the launch must not cover more threads than the arrays have elements.
//
//   data   values to compact
//   valid  non-zero where the element is to be kept
//   sum    1-based destination slot of each element (0 means "no slot")
//   outp   out: compacted values
__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *data, uint32_t *valid, uint32_t *sum, uint32_t *outp)
{
    const int gid = ((blockIdx.x * blockDim.x) + threadIdx.x);

    if (!valid[gid])
        return;

    const int slot = sum[gid];
    if (slot <= 0)
        return;

    // slots are 1-based, the output array is 0-based
    outp[slot - 1] = data[gid];
}
| 190 | + |
// Three-level in-place inclusive prefix sum over one branch's per-thread flag
// array, followed by the two uniform adds that turn the per-block scans into
// a scan over the whole array. Stores the total number of set flags in *count.
static void jackpot_compactTest_scanBranch(int thr_id, uint32_t *d_flags, int threads,
                                           int blockSize, int thr1, int thr2, size_t *count)
{
    // The SCAN kernel keeps one running total per 32-lane segment in shared
    // memory and its first segment touches up to 32 entries, so provide 32 words.
    const size_t scanShared = 32 * sizeof(uint32_t);
    const int topWidth = (thr2 > 32) ? 32 : thr2;

    jackpot_compactTest_gpu_SCAN<<<thr1, blockSize, scanShared>>>(d_flags, 32, d_partSum1[thr_id]);
    jackpot_compactTest_gpu_SCAN<<<thr2, blockSize, scanShared>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
    jackpot_compactTest_gpu_SCAN<<<1, thr2, scanShared>>>(d_partSum2[thr_id], topWidth);
    cudaStreamSynchronize(NULL);

    // The device value is 32 bits wide but the destination is a size_t:
    // fetch into a 32-bit temporary and widen, so no byte of *count is left
    // uninitialised.
    uint32_t total = 0;
    cudaError_t err = cudaMemcpy(&total, d_partSum2[thr_id] + (thr2 - 1),
                                 sizeof(uint32_t), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "jackpot_compactTest (thr_id %d): reading the scan total failed: %s\n",
                thr_id, cudaGetErrorString(err));
        total = 0;
    }
    *count = total;

    // Block 0 needs no offset, so each add starts one block in and uses one
    // block fewer; a grid of zero blocks is not a valid launch and is skipped.
    if (thr2 > 1)
        jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
    if (thr1 > 1)
        jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_flags+blockSize, d_partSum1[thr_id], threads);
}

// Compacts the nonces of both branches.
//
// Expects d_tempBranch1/2Nonces and d_validTemp1/2 to hold the per-thread
// flags and d_tempBranchAllNonces the per-thread nonces for this thr_id.
//
//   thr_id     index into the per-thread buffer tables
//   threads    number of elements; must be a positive multiple of 256*256
//              (best 256*1024 or 256*2048). With n = threads / 65536, n must
//              be a power of two when n <= 32, otherwise a multiple of 32
//              and at most 1024.
//   nrm        out: nrm[0] / nrm[1] receive the number of nonces in branch 1 / 2
//   d_nonces1  out (device): compacted nonces of branch 1
//   d_nonces2  out (device): compacted nonces of branch 2
//
// On an unsupported `threads` value an error is logged, both counts are set
// to 0 and nothing is launched.
__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, size_t *nrm,
                                                     uint32_t *d_nonces1, uint32_t *d_nonces2)
{
    // compute the launch dimensions
    const int blockSize = 256;
    const int thr1 = threads / blockSize;
    const int thr2 = threads / (blockSize*blockSize);

    // The scan hierarchy has exactly three levels of 256-wide blocks and the
    // top level runs as a single block of thr2 threads using warp shuffles.
    bool supported = (threads > 0) && (threads % (blockSize*blockSize) == 0) && (thr2 <= 1024);
    if (supported)
        supported = (thr2 <= 32) ? ((thr2 & (thr2 - 1)) == 0) : (thr2 % 32 == 0);

    if (!supported)
    {
        fprintf(stderr, "jackpot_compactTest (thr_id %d): unsupported thread count %d\n", thr_id, threads);
        nrm[0] = 0;
        nrm[1] = 0;
        return;
    }

    // 1
    jackpot_compactTest_scanBranch(thr_id, d_tempBranch1Nonces[thr_id], threads, blockSize, thr1, thr2, &nrm[0]);

    // 2
    jackpot_compactTest_scanBranch(thr_id, d_tempBranch2Nonces[thr_id], threads, blockSize, thr1, thr2, &nrm[1]);

    // One peculiarity: in d_tempBranch1Nonces the elements 1...nrm1 are the interesting ones
    // Step 3: scatter
    jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranchAllNonces[thr_id], d_validTemp1[thr_id], d_tempBranch1Nonces[thr_id], d_nonces1);
    jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranchAllNonces[thr_id], d_validTemp2[thr_id], d_tempBranch2Nonces[thr_id], d_nonces2);
    cudaStreamSynchronize(NULL);
}
| 224 | + |
// Classifies `threads` input hashes into two branches and compacts the nonces
// of each branch.
//
//   thr_id       index into the per-thread buffer tables
//   threads      number of hashes to process
//   startNounce  nonce that belongs to the first hash
//   inpHashes    device pointer to the input hashes
//   d_nonces1    out (device): compacted nonces of branch 1
//   nrm1         out: number of nonces in branch 1
//   d_nonces2    out (device): compacted nonces of branch 2
//   nrm2         out: number of nonces in branch 2
//   order        not used by this function
__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
                                              uint32_t *d_nonces1, size_t *nrm1,
                                              uint32_t *d_nonces2, size_t *nrm2,
                                              int order)
{
    // Fixed at 256 threads per block. (An earlier variant picked 768 for
    // compute 3.x / 5.x devices and 512 for all others.)
    const int threadsPerBlock = 256;

    // work out how many thread blocks we need
    const dim3 block(threadsPerBlock);
    const dim3 grid((threads + threadsPerBlock - 1) / threadsPerBlock);

    const size_t sharedBytes = 0;

    // Step 1: test the condition and store the flags in d_tempBranch1/2Nonces
    jackpot_compactTest_gpu_TEST_64<<<grid, block, sharedBytes>>>(
        threads, startNounce, inpHashes,
        d_tempBranchAllNonces[thr_id],
        d_tempBranch1Nonces[thr_id], d_tempBranch2Nonces[thr_id],
        d_validTemp1[thr_id], d_validTemp2[thr_id]);

    // Steps 2 and 3: prefix sums and scatter; the counts land in h_numValid
    size_t *counts = h_numValid[thr_id];
    jackpot_compactTest_cpu_dualCompaction(thr_id, threads, counts, d_nonces1, d_nonces2);

    // The original synchronize costs some CPU load, but may be the better choice here
    cudaStreamSynchronize(NULL);

    *nrm1 = counts[0];
    *nrm2 = counts[1];
}
0 commit comments