4
4
#include <assert.h>
5
5
6
6
#define CACHE_SIZE (10*1024*1024) // To flush cache between benchmarks
7
- #define NUM_MESH_CELLS (30*30) //(30*30)
7
+ #define NXC 20
8
+ #define NYC 15
9
+ #define NZC 3
10
+ #define NUM_MESH_CELLS (NXC*NYZ*NZC) //(20*15*3)
8
11
#define NUM_PCLS_PER_MESH_CELL 1024
9
12
#define NUM_THREADS_PER_MESH_CELL 1
10
13
#define ALIGNMENT 64
20
23
typedef struct mesh_cell
21
24
{
22
25
int num_pcls ;
26
+ __attribute__ ((align (64 ))) int cell_id_xyz [3 ];
23
27
/*
24
28
* Lengths need to be multiple of BLOCKSIZE,
25
29
* i.e., add padding if necessary
@@ -37,6 +41,8 @@ typedef struct mesh_cell
37
41
__attribute__ ((align (64 ))) double field_component_5 [6 ];
38
42
__attribute__ ((align (64 ))) double field_component_6 [6 ];
39
43
__attribute__ ((align (64 ))) double field_component_7 [6 ];
44
+ int * exit_cells ;
45
+ double * exit_pcls ;
40
46
} Mesh_cell ;
41
47
42
48
inline double time_sec ();
@@ -45,7 +51,7 @@ void move_bucket_old();
45
51
void move_bucket_new ();
46
52
void move_bucket_new_blocked ();
47
53
void move_bucket_new_blocked_sort ();
48
- inline unsigned int calc_cell_idx (double x , double y , double z );
54
+ inline int calc_cell_idx (double x , double y , double z );
49
55
50
56
/*
51
57
* Global variables
@@ -57,7 +63,8 @@ Mesh_cell *mesh_cells;
57
63
// Array to flush caches
58
64
char volatile * cache ;
59
65
60
- // Member variables (random values)
66
+ // Member variables (partly random values)
67
+ int nxc = NXC , nyc = NYC , nzc = NZC ;
61
68
double xstart = 0 , ystart = 0 , zstart = 0 ;
62
69
double inv_dx = .25 , inv_dy = .25 , inv_dz = .25 ;
63
70
double dto2 = .4 ;
@@ -74,9 +81,10 @@ double *field_components[8];
74
81
75
82
int main (void )
76
83
{
77
- double * q , * x , * y , * z , * u , * v , * w , * _xavg , * _yavg , * _zavg ;
84
+ double * q , * x , * y , * z , * u , * v , * w , * _xavg , * _yavg , * _zavg , * exit_pcls ;
78
85
long long * ParticleID ;
79
- int num_blocks ;
86
+ int * exit_cells ;
87
+ int size , num_blocks ;
80
88
int pidx , i , j ;
81
89
void * ptr ;
82
90
@@ -96,89 +104,106 @@ int main(void)
96
104
/*
97
105
* Fill mesh cells
98
106
*/
99
- for (i = 0 ; i < NUM_MESH_CELLS ; i ++ )
100
- {
101
- int size ;
107
+ for (cid_x = 0 ; cid_x < nxc ; cid_x ++ ) {
108
+ for (cid_y = 0 ; cid_y < nyc ; cid_y ++ ) {
109
+ for (cid_z = 0 ; cid_z < nzc ; cid_z ++ ) {
110
+
111
+ i = cell_xyz_to_idx (cidx_x , cidx_y , cidx_z );
112
+
113
+ num_pcls = NUM_PCLS_PER_MESH_CELL ;
114
+ /*
115
+ * Make sure that size of memory allocated
116
+ * is multiple of BLOCKSIZE particles
117
+ */
118
+ num_blocks = (num_pcls - 1 ) / BLOCKSIZE + 1 ;
119
+ size = num_blocks * BLOCKSIZE * sizeof (double );
120
+
121
+ /* Particles */
122
+ ParticleID = ptr = _mm_malloc (num_blocks * BLOCKSIZE * sizeof (long long ), ALIGNMENT );
123
+ assert (ptr );
124
+ q = ptr = _mm_malloc (size , ALIGNMENT );
125
+ assert (ptr );
126
+ x = ptr = _mm_malloc (size , ALIGNMENT );
127
+ assert (ptr );
128
+ y = ptr = _mm_malloc (size , ALIGNMENT );
129
+ assert (ptr );
130
+ z = ptr = _mm_malloc (size , ALIGNMENT );
131
+ assert (ptr );
132
+ u = ptr = _mm_malloc (size , ALIGNMENT );
133
+ assert (ptr );
134
+ v = ptr = _mm_malloc (size , ALIGNMENT );
135
+ assert (ptr );
136
+ w = ptr = _mm_malloc (size , ALIGNMENT );
137
+ assert (ptr );
138
+ _xavg = ptr = _mm_malloc (size , ALIGNMENT );
139
+ assert (ptr );
140
+ _yavg = ptr = _mm_malloc (size , ALIGNMENT );
141
+ assert (ptr );
142
+ _zavg = ptr = _mm_malloc (size , ALIGNMENT );
143
+ assert (ptr );
144
+ exit_cells = ptr = _mm_malloc (num_blocks * BLOCKSIZE * sizeof (int ), ALIGNMENT );
145
+ assert (ptr );
146
+ exit_pcls = ptr = _mm_malloc (size , ALIGNMENT );
147
+ assert (ptr );
148
+
149
+ /* Init cell's x, y, z coordinates (local to subdomain) */
150
+ mesh_cells [i ].cell_id_xyz [0 ] = cid_x ;
151
+ mesh_cells [i ].cell_id_xyz [1 ] = cid_y ;
152
+ mesh_cells [i ].cell_id_xyz [2 ] = cid_z ;
153
+
154
+ /* Init particles */
155
+ for (pidx = 0 ; pidx < num_pcls ; pidx ++ )
156
+ {
157
+ ParticleID [pidx ] = (long long ) random ();
158
+ q [pidx ] = (double ) random ();
159
+ x [pidx ] = (double ) random ();
160
+ y [pidx ] = (double ) random ();
161
+ z [pidx ] = (double ) random ();
162
+ u [pidx ] = (double ) random ();
163
+ v [pidx ] = (double ) random ();
164
+ w [pidx ] = (double ) random ();
165
+ _xavg [pidx ] = (double ) random ();
166
+ _yavg [pidx ] = (double ) random ();
167
+ _zavg [pidx ] = (double ) random ();
168
+ }
169
+ /* Init fields */
170
+ for (j = 0 ; j < 6 ; j ++ ) {
171
+ mesh_cells [i ].field_component_0 [j ] = (double ) random ();
172
+ mesh_cells [i ].field_component_1 [j ] = (double ) random ();
173
+ mesh_cells [i ].field_component_2 [j ] = (double ) random ();
174
+ mesh_cells [i ].field_component_3 [j ] = (double ) random ();
175
+ mesh_cells [i ].field_component_4 [j ] = (double ) random ();
176
+ mesh_cells [i ].field_component_5 [j ] = (double ) random ();
177
+ mesh_cells [i ].field_component_6 [j ] = (double ) random ();
178
+ mesh_cells [i ].field_component_7 [j ] = (double ) random ();
179
+ }
102
180
103
- num_pcls = NUM_PCLS_PER_MESH_CELL ;
104
- /*
105
- * Make sure that size of memory allocated
106
- * is multiple of BLOCKSIZE particles
107
- */
108
- num_blocks = (num_pcls - 1 ) / BLOCKSIZE + 1 ;
109
- size = num_blocks * BLOCKSIZE * sizeof (double );
110
-
111
- /* Particles */
112
- ParticleID = ptr = _mm_malloc (num_blocks * BLOCKSIZE * sizeof (long long ), ALIGNMENT );
113
- assert (ptr );
114
- q = ptr = _mm_malloc (size , ALIGNMENT );
115
- assert (ptr );
116
- x = ptr = _mm_malloc (size , ALIGNMENT );
117
- assert (ptr );
118
- y = ptr = _mm_malloc (size , ALIGNMENT );
119
- assert (ptr );
120
- z = ptr = _mm_malloc (size , ALIGNMENT );
121
- assert (ptr );
122
- u = ptr = _mm_malloc (size , ALIGNMENT );
123
- assert (ptr );
124
- v = ptr = _mm_malloc (size , ALIGNMENT );
125
- assert (ptr );
126
- w = ptr = _mm_malloc (size , ALIGNMENT );
127
- assert (ptr );
128
- _xavg = ptr = _mm_malloc (size , ALIGNMENT );
129
- assert (ptr );
130
- _yavg = ptr = _mm_malloc (size , ALIGNMENT );
131
- assert (ptr );
132
- _zavg = ptr = _mm_malloc (size , ALIGNMENT );
133
- assert (ptr );
134
-
135
- /* Init particles */
136
- for (pidx = 0 ; pidx < num_pcls ; pidx ++ )
137
- {
138
- ParticleID [pidx ] = (long long ) random ();
139
- q [pidx ] = (double ) random ();
140
- x [pidx ] = (double ) random ();
141
- y [pidx ] = (double ) random ();
142
- z [pidx ] = (double ) random ();
143
- u [pidx ] = (double ) random ();
144
- v [pidx ] = (double ) random ();
145
- w [pidx ] = (double ) random ();
146
- _xavg [pidx ] = (double ) random ();
147
- _yavg [pidx ] = (double ) random ();
148
- _zavg [pidx ] = (double ) random ();
149
- }
150
- /* Init fields */
151
- for (j = 0 ; j < 6 ; j ++ ) {
152
- mesh_cells [i ].field_component_0 [j ] = (double ) random ();
153
- mesh_cells [i ].field_component_1 [j ] = (double ) random ();
154
- mesh_cells [i ].field_component_2 [j ] = (double ) random ();
155
- mesh_cells [i ].field_component_3 [j ] = (double ) random ();
156
- mesh_cells [i ].field_component_4 [j ] = (double ) random ();
157
- mesh_cells [i ].field_component_5 [j ] = (double ) random ();
158
- mesh_cells [i ].field_component_6 [j ] = (double ) random ();
159
- mesh_cells [i ].field_component_7 [j ] = (double ) random ();
181
+ mesh_cells [i ].num_pcls = num_pcls ;
182
+ mesh_cells [i ].ParticleID = ParticleID ;
183
+ mesh_cells [i ].q = q ;
184
+ mesh_cells [i ].x = x ;
185
+ mesh_cells [i ].y = y ;
186
+ mesh_cells [i ].z = z ;
187
+ mesh_cells [i ].u = u ;
188
+ mesh_cells [i ].v = v ;
189
+ mesh_cells [i ].w = w ;
190
+ mesh_cells [i ]._xavg = _xavg ;
191
+ mesh_cells [i ]._yavg = _yavg ;
192
+ mesh_cells [i ]._zavg = _zavg ;
193
+ mesh_cells [i ].exit_cells = exit_cells ;
194
+ mesh_cells [i ].exit_pcls = exit_pcls ;
195
+
196
+ ALIGNED (mesh_cells [i ].ParticleID );
197
+ ALIGNED (mesh_cells [i ].q );
198
+ ALIGNED (mesh_cells [i ].x ); ALIGNED (mesh_cells [i ].y ); ALIGNED (mesh_cells [i ].z );
199
+ ALIGNED (mesh_cells [i ].u ); ALIGNED (mesh_cells [i ].v ); ALIGNED (mesh_cells [i ].w );
200
+ ALIGNED (mesh_cells [i ]._xavg );
201
+ ALIGNED (mesh_cells [i ]._yavg );
202
+ ALIGNED (mesh_cells [i ]._zavg );
203
+ ALIGNED (mesh_cells [i ].exit_cells );
204
+ ALIGNED (mesh_cells [i ].exit_pcls );
205
+ }
160
206
}
161
-
162
- mesh_cells [i ].num_pcls = num_pcls ;
163
- mesh_cells [i ].ParticleID = ParticleID ;
164
- mesh_cells [i ].q = q ;
165
- mesh_cells [i ].x = x ;
166
- mesh_cells [i ].y = y ;
167
- mesh_cells [i ].z = z ;
168
- mesh_cells [i ].u = u ;
169
- mesh_cells [i ].v = v ;
170
- mesh_cells [i ].w = w ;
171
- mesh_cells [i ]._xavg = _xavg ;
172
- mesh_cells [i ]._yavg = _yavg ;
173
- mesh_cells [i ]._zavg = _zavg ;
174
-
175
- ALIGNED (mesh_cells [i ].ParticleID );
176
- ALIGNED (mesh_cells [i ].q );
177
- ALIGNED (mesh_cells [i ].x ); ALIGNED (mesh_cells [i ].y ); ALIGNED (mesh_cells [i ].z );
178
- ALIGNED (mesh_cells [i ].u ); ALIGNED (mesh_cells [i ].v ); ALIGNED (mesh_cells [i ].w );
179
- ALIGNED (mesh_cells [i ]._xavg );
180
- ALIGNED (mesh_cells [i ]._yavg );
181
- ALIGNED (mesh_cells [i ]._zavg );
182
207
}
183
208
184
209
@@ -237,6 +262,9 @@ void move_bucket_new_blocked_sort()
237
262
double * restrict x , * restrict y , * restrict z ;
238
263
double * restrict u , * restrict v , * restrict w ;
239
264
double * restrict _xavg , * restrict _yavg , * restrict _zavg ;
265
+ int * restrict exit_cells ;
266
+ double * restrict exit_pcls ;
267
+ int * restrict cell_id_xyz ;
240
268
// For weights calculation
241
269
double abs_pos [3 ];
242
270
double rel_pos [3 ];
@@ -263,11 +291,15 @@ void move_bucket_new_blocked_sort()
263
291
__attribute__ ((align (64 ))) double Exl [BLOCKSIZE ];
264
292
__attribute__ ((align (64 ))) double Eyl [BLOCKSIZE ];
265
293
__attribute__ ((align (64 ))) double Ezl [BLOCKSIZE ];
266
- // For what is left
294
+ // For pushing
267
295
double qdto2mc ;
268
296
double omsq , denom ;
269
297
double t [3 ], Om [3 ], avg [3 ];
270
298
double udotOm ;
299
+ // For sorting
300
+ int new_pcidx [BLOCKSIZE ];
301
+ int exit_cells_idx , exit_pcls_idx ;
302
+ int gap_idx ;
271
303
// Timing
272
304
double time [6 ];
273
305
@@ -286,11 +318,14 @@ void move_bucket_new_blocked_sort()
286
318
BxyzExyz_l [5 ] = Ezl ;
287
319
288
320
/* 1 thread works on one cell */
289
- #pragma omp for schedule(static,1) nowait
321
+ #pragma omp for schedule(static,1)
290
322
for (i = 0 ; i < NUM_MESH_CELLS ; i ++ )
291
323
{
292
324
//printf("i: %d\n", i); fflush(stdout);
293
- cidx = i ; // Cell index
325
+ cidx = i ; // Cell index
326
+ exit_cells_idx = 1 ; // 1st idx where to start filling exit_cells[]
327
+ exit_pcls_idx = 0 ; // 1st idx where to start filling exit_pcls[]
328
+ gap_idx = 0 ; // 1st idx where gap in pcl arrays has to be filled
294
329
295
330
/* Set shortcuts to mesh cell's data */
296
331
ParticleID = mesh_cells [cidx ].ParticleID ; ALIGNED (ParticleID );
@@ -304,6 +339,9 @@ void move_bucket_new_blocked_sort()
304
339
_xavg = mesh_cells [cidx ]._xavg ; ALIGNED (_xavg );
305
340
_yavg = mesh_cells [cidx ]._yavg ; ALIGNED (_yavg );
306
341
_zavg = mesh_cells [cidx ]._zavg ; ALIGNED (_zavg );
342
+ exit_cells = mesh_cells [cidx ].exit_cells ; ALIGNED (exit_cells );
343
+ exit_pcls = mesh_cells [cidx ].exit_pcls ; ALIGNED (exit_pcls );
344
+ cell_id_xyz = mesh_cells [cidx ].cell_id_xyz ; ALIGNED (cell_id_xyz );
307
345
308
346
/* Field components for this cell */
309
347
field_component_0 = mesh_cells [cidx ].field_component_0 ; ALIGNED (field_component_0 );
@@ -359,9 +397,9 @@ void move_bucket_new_blocked_sort()
359
397
//const int iy = cy + 1;
360
398
//const int iz = cz + 1;
361
399
// fraction of the distance from the right of the cell
362
- w1 [0 ] = cx - cm1_pos [0 ];
363
- w1 [1 ] = cy - cm1_pos [1 ];
364
- w1 [2 ] = cz - cm1_pos [2 ];
400
+ w1 [0 ] = cell_id_xyz [ 0 ] - cm1_pos [0 ];
401
+ w1 [1 ] = cell_id_xyz [ 1 ] - cm1_pos [1 ];
402
+ w1 [2 ] = cell_id_xyz [ 2 ] - cm1_pos [2 ];
365
403
// fraction of distance from the left
366
404
w0 [0 ] = 1 - w1 [0 ];
367
405
w0 [1 ] = 1 - w1 [1 ];
@@ -411,7 +449,7 @@ void move_bucket_new_blocked_sort()
411
449
//time[3] = time_sec();
412
450
#if 1
413
451
414
- /* Do what is left */
452
+ /* Push particles */
415
453
//#pragma vector nontemporal (_xavg,_yavg,_zavg)
416
454
for (j = 0 , pidx = cpidx ; j < BLOCKSIZE ; j ++ , pidx ++ ) {
417
455
Om [0 ] = qdto2mc * Bxl [j ];
@@ -515,13 +553,14 @@ void move_bucket_new_blocked_sort()
515
553
exit_cells_idx ++ ; // Point to next empty slot for cell idx
516
554
}
517
555
518
-
519
- - Init gap idx to 0 (location of 1 st pcl ), increment with next pcl if required
520
- (- Pcls must be sorted before we enter pusher )
521
- - calc cell idx (integer value ) XXX
522
- - gather pcl data into leave array XXX
523
- - close gaps in array of pcls that stay XXX
524
- - Store exit counter in exit_cells [] XXX
556
+ /*
557
+ - Init gap idx to 0 (location of 1st pcl), increment with next pcl if required XXX
558
+ (- Pcls must be sorted before we enter pusher)
559
+ - calc cell idx (integer value) XXX
560
+ - gather pcl data into leave array XXX
561
+ - close gaps in array of pcls that stay XXX
562
+ - Store exit counter in exit_cells[] XXX
563
+ */
525
564
}
526
565
527
566
#endif
@@ -538,7 +577,10 @@ void move_bucket_new_blocked_sort()
538
577
#endif
539
578
}
540
579
/* Every cell collects its incoming particles */
541
- - Update num_pcls in mesh_cell
580
+ /*
581
+ * TODO
582
+ - Update num_pcls in mesh_cell
583
+ */
542
584
}
543
585
time [1 ] = time_sec ();
544
586
printf ("move_bucket_new_blocked_sort: total: %f\n" , time [1 ] - time [0 ]);
@@ -1240,7 +1282,7 @@ void flush_cache()
1240
1282
cache [i ] = (cache [i ] + i ) % 256 ;
1241
1283
}
1242
1284
1243
- inline unsigned int calc_cell_idx (double xpos , double ypos , double zpos )
1285
+ inline int calc_cell_idx (double xpos , double ypos , double zpos )
1244
1286
{
1245
1287
int cx , cy , cz ;
1246
1288
@@ -1263,7 +1305,7 @@ inline unsigned int calc_cell_idx(double xpos, double ypos, double zpos)
1263
1305
cz = (cz >= nzc ) ? nzc - 1 : cz ;
1264
1306
1265
1307
// cx * nyc * nzc + cy * nzc + cz
1266
- return (unsigned int ) (( cx * nyc + cy ) * nzc + cz ) ;
1308
+ return (cx * nyc + cy ) * nzc + cz ;
1267
1309
}
1268
1310
1269
1311
0 commit comments