
Commit 433d653

bump to revision 0.7
1 parent 0659d69 commit 433d653

15 files changed, +2078 −88 lines changed

JHA/cuda_jha_compactionTest.cu

Lines changed: 255 additions & 0 deletions
@@ -0,0 +1,255 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_30_intrinsics.h"

#include <stdio.h>
#include <memory.h>
#include <stdint.h>

// from cpu-miner.c
extern int device_map[8];

// this structure is queried in the init function
static cudaDeviceProp props[8];

static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_tempBranch2Nonces[8];
static size_t *d_numValid[8];
static size_t *h_numValid[8];

static uint32_t *d_partSum1[8], *d_partSum2[8]; // 2x partial sums
static uint32_t *d_validTemp1[8], *d_validTemp2[8];

// scratch buffer
static uint32_t *d_tempBranchAllNonces[8];

// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);


// setup functions
__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
{
    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);

    // we also need memory on the device
    cudaMalloc(&d_tempBranchAllNonces[thr_id], sizeof(uint32_t) * threads);
    cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads);
    cudaMalloc(&d_tempBranch2Nonces[thr_id], sizeof(uint32_t) * threads);
    cudaMalloc(&d_numValid[thr_id], 2*sizeof(size_t));
    cudaMallocHost(&h_numValid[thr_id], 2*sizeof(size_t));

    uint32_t s1;
    s1 = threads / 256;

    cudaMalloc(&d_partSum1[thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (threads/block)
    cudaMalloc(&d_partSum2[thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (threads/block)

    cudaMalloc(&d_validTemp1[thr_id], sizeof(uint32_t) * threads); // BLOCKSIZE (threads/block)
    cudaMalloc(&d_validTemp2[thr_id], sizeof(uint32_t) * threads); // BLOCKSIZE (threads/block)
}

// the test function (used to build the test map)
__global__ void jackpot_compactTest_gpu_TEST_64(int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesFull,
                                                uint32_t *d_nonces1, uint32_t *d_nonces2,
                                                uint32_t *d_validT1, uint32_t *d_validT2)
{
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        // determine the current counter
        uint32_t nounce = startNounce + thread;
        uint32_t *inpHash = &inpHashes[16 * thread];

        uint32_t tmp = inpHash[0] & 0x01;
        uint32_t val1 = (tmp == 1);
        uint32_t val2 = (tmp == 0);

        d_nonces1[thread] = val1;
        d_validT1[thread] = val1;
        d_nonces2[thread] = val2;
        d_validT2[thread] = val2;
        d_noncesFull[thread] = nounce;
    }
}

// the prefix-sum function (from the NVIDIA SDK)
__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL)
{
    extern __shared__ uint32_t sums[];
    int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
    //int lane_id = id % warpSize;
    int lane_id = id % width;
    // determine a warp_id within a block
    //int warp_id = threadIdx.x / warpSize;
    int warp_id = threadIdx.x / width;

    // Below is the basic structure of using a shfl instruction
    // for a scan.
    // Record "value" as a variable - we accumulate it along the way
    uint32_t value = data[id];

    // Now accumulate in log steps up the chain
    // compute sums, with another thread's value who is
    // distance delta away (i). Note
    // those threads where the thread 'i' away would have
    // been out of bounds of the warp are unaffected. This
    // creates the scan sum.
#pragma unroll
    for (int i=1; i<=width; i*=2)
    {
        uint32_t n = __shfl_up((int)value, i, width);

        if (lane_id >= i) value += n;
    }

    // value now holds the scan value for the individual thread
    // next sum the largest values for each warp

    // write the sum of the warp to smem
    //if (threadIdx.x % warpSize == warpSize-1)
    if (threadIdx.x % width == width-1)
    {
        sums[warp_id] = value;
    }

    __syncthreads();

    //
    // scan sum the warp sums
    // the same shfl scan operation, but performed on warp sums
    //
    if (warp_id == 0)
    {
        uint32_t warp_sum = sums[lane_id];

        for (int i=1; i<=width; i*=2)
        {
            uint32_t n = __shfl_up((int)warp_sum, i, width);

            if (lane_id >= i) warp_sum += n;
        }

        sums[lane_id] = warp_sum;
    }

    __syncthreads();

    // perform a uniform add across warps in the block
    // read neighbouring warp's sum and add it to thread's value
    uint32_t blockSum = 0;

    if (warp_id > 0)
    {
        blockSum = sums[warp_id-1];
    }

    value += blockSum;

    // Now write out our result
    data[id] = value;

    // last thread has sum, write out the block's sum
    if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
    {
        partial_sums[blockIdx.x] = value;
    }
}

// Uniform add: add partial sums array
__global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
{
    __shared__ uint32_t buf;
    int id = ((blockIdx.x * blockDim.x) + threadIdx.x);

    if (id > len) return;

    if (threadIdx.x == 0)
    {
        buf = partial_sums[blockIdx.x];
    }

    __syncthreads();
    data[id] += buf;
}

// the scatter
__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *data, uint32_t *valid, uint32_t *sum, uint32_t *outp)
{
    int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
    if( valid[id] )
    {
        int idx = sum[id];
        if(idx > 0)
            outp[idx-1] = data[id];
    }
}

////// NOTE: this function currently only works with threads > 65536 (ideally 256 * 1024 or 256*2048)
__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, size_t *nrm,
                                                     uint32_t *d_nonces1, uint32_t *d_nonces2)
{
    // work out threadsPerBlock
    int blockSize = 256;
    int thr1 = threads / blockSize;
    int thr2 = threads / (blockSize*blockSize);

    // 1
    jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 8*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id]);
    jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 8*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
    jackpot_compactTest_gpu_SCAN<<<1, thr2, 8*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
    cudaStreamSynchronize(NULL);
    cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
    jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
    jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);

    // 2
    jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 8*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id]);
    jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 8*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
    jackpot_compactTest_gpu_SCAN<<<1, thr2, 8*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
    cudaStreamSynchronize(NULL);
    cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
    jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
    jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);

    // One peculiarity here: in d_tempBranch1Nonces the elements 1...nrm1 are the interesting ones
    // Step 3: scatter
    jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranchAllNonces[thr_id], d_validTemp1[thr_id], d_tempBranch1Nonces[thr_id], d_nonces1);
    jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranchAllNonces[thr_id], d_validTemp2[thr_id], d_tempBranch2Nonces[thr_id], d_nonces2);
    cudaStreamSynchronize(NULL);
}

__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
                                              uint32_t *d_nonces1, size_t *nrm1,
                                              uint32_t *d_nonces2, size_t *nrm2,
                                              int order)
{
    // compute 3.x and 5.x devices are best driven with 768 threads,
    // all others with 512 threads.
    //int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
    int threadsperblock = 256;

    // work out how many thread blocks we need
    dim3 grid((threads + threadsperblock-1)/threadsperblock);
    dim3 block(threadsperblock);

    size_t shared_size = 0;

//  fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

    // Step 1: check the condition and store the result in d_tempBranch1/2Nonces
    jackpot_compactTest_gpu_TEST_64<<<grid, block, shared_size>>>(threads, startNounce, inpHashes, d_tempBranchAllNonces[thr_id],
        d_tempBranch1Nonces[thr_id], d_tempBranch2Nonces[thr_id],
        d_validTemp1[thr_id], d_validTemp2[thr_id]);

    // strategic sleep command to reduce CPU load
    jackpot_compactTest_cpu_dualCompaction(thr_id, threads,
        h_numValid[thr_id], d_nonces1, d_nonces2);

    cudaStreamSynchronize(NULL); // the original costs some CPU load, but may be the better option at this point
    *nrm1 = h_numValid[thr_id][0];
    *nrm2 = h_numValid[thr_id][1];
}
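
For reference only (not part of this commit), a minimal host-side sketch of how the new entry points above might be driven. The function names and signatures come from the listing; compact_one_round, d_inputHashes, d_branch1Nonces and d_branch2Nonces are hypothetical caller names, thr_id 0 is assumed, and the thread count follows the note that dualCompaction needs threads > 65536 (e.g. 256*2048):

// hypothetical caller sketch; d_inputHashes holds 16 uint32_t words per thread,
// d_branch1Nonces / d_branch2Nonces can each hold up to `threads` nonces
void compact_one_round(uint32_t *d_inputHashes,
                       uint32_t *d_branch1Nonces, uint32_t *d_branch2Nonces)
{
    const int thr_id  = 0;
    const int threads = 256 * 2048;   // must be > 65536, see note in the listing
    uint32_t startNounce = 0;
    size_t nrm1 = 0, nrm2 = 0;

    jackpot_compactTest_cpu_init(thr_id, threads);

    // splits the nonce range into two branches based on inpHash[0] & 0x01
    jackpot_compactTest_cpu_hash_64(thr_id, threads, startNounce, d_inputHashes,
                                    d_branch1Nonces, &nrm1,
                                    d_branch2Nonces, &nrm2, 0);

    // nrm1 + nrm2 == threads: every nonce lands in exactly one branch
}

The compaction itself is a standard scan-and-scatter: the per-thread 0/1 validity flags are turned into an inclusive prefix sum (SCAN plus the uniform ADD across the block hierarchy), and each valid nonce is then written to output slot sum[id]-1 by SCATTER.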

JHA/cuda_jha_keccak512.cu

Lines changed: 7 additions & 5 deletions
@@ -1,3 +1,5 @@
+
+
 #include <cuda.h>
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
@@ -132,7 +134,7 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
 	}
 }
 
-__global__ void jackpot_keccak512_gpu_hash_88(int threads, uint32_t startNounce, uint64_t *g_hash)
+__global__ void jackpot_keccak512_gpu_hash(int threads, uint32_t startNounce, uint64_t *g_hash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -518,9 +520,9 @@ void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount )
 	}
 }
 
-__host__ void jackpot_keccak512_cpu_setBlock_88(void *pdata)
+// inlen can be 72...143
+__host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
 {
-	unsigned long long inlen = 88;
 	const unsigned char *in = (const unsigned char*)pdata;
 
 	tKeccakLane state[5 * 5];
@@ -554,7 +556,7 @@ __host__ void jackpot_keccak512_cpu_setBlock_88(void *pdata)
 		0, cudaMemcpyHostToDevice);
 }
 
-__host__ void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order)
+__host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order)
 {
 	const int threadsperblock = 256;
 
@@ -567,6 +569,6 @@ __host__ void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t st
 
 	// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 
-	jackpot_keccak512_gpu_hash_88<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
+	jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
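
For reference (again not part of the commit): the rename drops the hard-coded 88-byte input length, so the setBlock call now takes inlen explicitly (72...143 per the new comment). The previous behaviour would presumably be reproduced by a call site like the following hypothetical snippet, where pdata, d_hash, thr_id, threads, startNounce and order are caller-owned:

// hypothetical call site reproducing the old fixed-length path
jackpot_keccak512_cpu_setBlock(pdata, 88);   // was jackpot_keccak512_cpu_setBlock_88(pdata)
jackpot_keccak512_cpu_hash(thr_id, threads, startNounce, d_hash, order);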
