#include <stdio.h>
#include <memory.h>

+ // IMPORTANT: leave this enabled!
#define USE_SHARED 1
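+ // (the table-lookup macros further below always read part of the tables from the
+ //  shared-memory mixtabs copy, which is only declared and filled when USE_SHARED is set)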

// from cpu-miner.c
@@ -15,35 +16,41 @@ extern int device_map[8];
// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

+ // from driver.c
+ extern "C" void set_device(int device);
+
// later replace the following definitions with a proper header
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;

+ // this structure is queried in the init function
+ static cudaDeviceProp props;
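+ // (props.major is read later in groestlcoin_cpu_hash() to choose the block size)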
+
// global memory for all HeftyHashes of all threads
__constant__ uint32_t pTarget[8]; // Single GPU
extern uint32_t *d_resultNonce[8];

__constant__ uint32_t groestlcoin_gpu_msg[32];

+ #define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))

#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))

- #define B32_0(x) ((x) & 0xFF)
- #define B32_1(x) (((x) >> 8) & 0xFF)
- #define B32_2(x) (((x) >> 16) & 0xFF)
- #define B32_3(x) ((x) >> 24)
-
- #define SPH_C32(x) ((uint32_t)(x ## U))
- #define C32e(x) ((SPH_C32(x) >> 24) \
-     | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
-     | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
-     | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+ #define B32_0(x) __byte_perm(x, 0, 0x4440)
+ // ((x) & 0xFF)
+ #define B32_1(x) __byte_perm(x, 0, 0x4441)
+ // (((x) >> 8) & 0xFF)
+ #define B32_2(x) __byte_perm(x, 0, 0x4442)
+ // (((x) >> 16) & 0xFF)
+ #define B32_3(x) __byte_perm(x, 0, 0x4443)
+ // ((x) >> 24)
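+ // __byte_perm(x, 0, 0x444N) extracts byte N of x and zero-fills the rest,
+ // replacing the shift-and-mask forms kept in the comments above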

+ #if 0
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
@@ -63,6 +70,18 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
#define T3up(x) tex1Dfetch(t3up1, x)
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
+ #endif
+
+ // a healthy mix between shared and textured access provides the highest speed!
+ #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
+ #define T0dn(x) tex1Dfetch(t0dn1, x)
+ #define T1up(x) tex1Dfetch(t1up1, x)
+ #define T1dn(x) (*((uint32_t*)mixtabs + ( 768+(x))))
+ #define T2up(x) tex1Dfetch(t2up1, x)
+ #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
+ #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
+ #define T3dn(x) tex1Dfetch(t3dn1, x)
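+ // T0up, T1dn, T2dn and T3up are read from the shared-memory mixtabs copy;
+ // the other four tables go through the texture cache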
+
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
@@ -81,21 +100,6 @@ extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];

- #if __CUDA_ARCH__ < 350
- // Kepler (Compute 3.0)
- #define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
- #else
- // Kepler (Compute 3.5)
- #define S(x, n) __funnelshift_r( x, x, n );
- #endif
- #define R(x, n) ((x) >> (n))
- #define Ch(x, y, z) ((x & (y ^ z)) ^ z)
- #define Maj(x, y, z) ((x & (y | z)) | (y & z))
- #define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
- #define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
- #define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
- #define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
-
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
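+ // SWAB32 reverses the byte order of a 32-bit word (endianness swap)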


@@ -152,32 +156,25 @@ __device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
        for (int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
    }

- // RBTT
+ // RBTT
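+ // each input byte is now extracted once via B32_x and reused for both the
+ // even (t[k]) and odd (t[k+1]) output columns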
#pragma unroll 16
- for (int k=0;k<32;k+=2)
- {
-     t[k + 0] = T0up( B32_0(a[ k       & 0x1f]) ) ^
-                T1up( B32_1(a[(k +  2) & 0x1f]) ) ^
-                T2up( B32_2(a[(k +  4) & 0x1f]) ) ^
-                T3up( B32_3(a[(k +  6) & 0x1f]) ) ^
-                T0dn( B32_0(a[(k +  9) & 0x1f]) ) ^
-                T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
-                T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
-                T3dn( B32_3(a[(k + 23) & 0x1f]) );
-
-     t[k + 1] = T0dn( B32_0(a[ k       & 0x1f]) ) ^
-                T1dn( B32_1(a[(k +  2) & 0x1f]) ) ^
-                T2dn( B32_2(a[(k +  4) & 0x1f]) ) ^
-                T3dn( B32_3(a[(k +  6) & 0x1f]) ) ^
-                T0up( B32_0(a[(k +  9) & 0x1f]) ) ^
-                T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
-                T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
-                T3up( B32_3(a[(k + 23) & 0x1f]) );
- }
+ for (int k=0;k<32;k+=2)
+ {
+     uint32_t t0_0  = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
+     uint32_t t2_1  = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
+     uint32_t t4_2  = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
+     uint32_t t6_3  = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
+
+     t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 )  ^ T2up( t4_2 )  ^ T3up( t6_3 ) ^
+                T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+
+     t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 )  ^ T2dn( t4_2 )  ^ T3dn( t6_3 ) ^
+                T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
+ }
#pragma unroll 32
- for (int k=0;k<32;k++)
-     a[k] = t[k];
- }
+ for (int k=0;k<32;k++)
+     a[k] = t[k];
+ }
}

__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
@@ -233,32 +230,25 @@ __device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
        for (int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13); } break;
    }

- // RBTT
+ // RBTT
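+ // same byte-reuse scheme as in groestlcoin_perm_P, only with the Q-permutation offsets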
#pragma unroll 16
- for (int k=0;k<32;k+=2)
- {
-     t[k + 0] = T0up( B32_0(a[(k +  2) & 0x1f]) ) ^
-                T1up( B32_1(a[(k +  6) & 0x1f]) ) ^
-                T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
-                T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
-                T0dn( B32_0(a[(k +  1) & 0x1f]) ) ^
-                T1dn( B32_1(a[(k +  5) & 0x1f]) ) ^
-                T2dn( B32_2(a[(k +  9) & 0x1f]) ) ^
-                T3dn( B32_3(a[(k + 13) & 0x1f]) );
-
-     t[k + 1] = T0dn( B32_0(a[(k +  2) & 0x1f]) ) ^
-                T1dn( B32_1(a[(k +  6) & 0x1f]) ) ^
-                T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
-                T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
-                T0up( B32_0(a[(k +  1) & 0x1f]) ) ^
-                T1up( B32_1(a[(k +  5) & 0x1f]) ) ^
-                T2up( B32_2(a[(k +  9) & 0x1f]) ) ^
-                T3up( B32_3(a[(k + 13) & 0x1f]) );
- }
+ for (int k=0;k<32;k+=2)
+ {
+     uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
+     uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
+     uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
+     uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
+
+     t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
+                T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 )  ^ T3dn( t13_3 );
+
+     t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
+                T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 )  ^ T3up( t13_3 );
+ }
#pragma unroll 32
- for (int k=0;k<32;k++)
-     a[k] = t[k];
- }
+ for (int k=0;k<32;k++)
+     a[k] = t[k];
+ }
}
#if USE_SHARED
__global__ void /* __launch_bounds__(256) */
@@ -271,14 +261,17 @@ __global__ void
#if USE_SHARED
    extern __shared__ char mixtabs[];

-     *((uint32_t*)mixtabs + (     threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
-     *((uint32_t*)mixtabs + ( 256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
-     *((uint32_t*)mixtabs + ( 512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
-     *((uint32_t*)mixtabs + ( 768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
-     *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
-     *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
-     *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
-     *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
+     if (threadIdx.x < 256)
+     {
+         *((uint32_t*)mixtabs + (     threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
+         *((uint32_t*)mixtabs + ( 256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
+         *((uint32_t*)mixtabs + ( 512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
+         *((uint32_t*)mixtabs + ( 768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
+         *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
+         *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
+         *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
+         *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
+     }
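+     // only the first 256 threads of the block fill the 8 x 256-entry (8 KB) table;
+     // all threads wait at the barrier below before using it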

    __syncthreads();
#endif
@@ -407,8 +400,11 @@ __global__ void
// setup functions
__host__ void groestlcoin_cpu_init(int thr_id, int threads)
{
-     cudaSetDevice(device_map[thr_id]);
-     cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
+     cudaSetDevice(device_map[thr_id]);
+
+     cudaGetDeviceProperties(&props, device_map[thr_id]);
+
+     cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
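+     // (assumption: with only 8 KB of shared memory needed for mixtabs,
+     //  preferring L1 over shared memory is the better trade-off)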
    // initialize the textures with the macro above
    texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
    texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
@@ -452,11 +448,9 @@ __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)

__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{
- #if USE_SHARED
-     const int threadsperblock = 256; // aligned with the mixtab size. DO NOT CHANGE
- #else
-     const int threadsperblock = 512; // set this however you like ;-)
- #endif
+     // Compute 3.x and 5.x devices are best driven with 768 threads,
+     // all others with 512 threads.
+     int threadsperblock = (props.major >= 3) ? 768 : 512;
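+     // (768 threads per block together with the 8 KB mixtabs table still fits
+     //  the shared-memory limits of Compute 3.x devices)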

    // compute how many thread blocks we need
    dim3 grid((threads + threadsperblock-1)/threadsperblock);