Skip to content

Commit

Permalink
Fixed BLAS test failures of small matrix SYRK for single and double p…
Browse files Browse the repository at this point in the history
…recision.

Details:
- SYRK for small matrices was implemented by reusing the small GEMM routine. As
  a result, output was written to the full C matrix; because C is symmetric,
  the lower and upper triangles of C contained the same results. The BLAS SYRK
  API spec requires that only the lower or the upper triangle of C be written
  with results. This caused BLAS test failures, even though the BLIS testsuite
  was passing for the small SYRK operation.
- To fix BLAS test failures of small matrix SYRK, separate kernel routines are
  implemented for small SYRK for both single and double precision. The newly
  added small SYRK routines are in file kernels/zen/3/bli_syrk_small.c.
  Now the intermediate results of matrix C are written to a scratch buffer.
  Final results are then written from the scratch buffer to matrix C using a
  SIMD copy into either the lower or the upper triangle of matrix C.
- Source and header files frame/3/syrk/bli_syrk_front.c and
  frame/3/syrk/bli_syrk_front.h are changed to invoke new small SYRK routines.

Change-Id: I9cfb1116c93d150aefac673fca033952ecac97cb
  • Loading branch information
BiplabRaut committed Dec 19, 2018
1 parent 6d26737 commit 1f4eeee
Show file tree
Hide file tree
Showing 4 changed files with 4,235 additions and 36 deletions.
33 changes: 17 additions & 16 deletions config/zen/bli_cntx_init_zen.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -138,27 +139,27 @@ void bli_cntx_init_zen( cntx_t* cntx )
// Zen optimized level 3 cache block sizes
/************************************************************************
Below block sizes of DGEMM, works better in a multi instance mode,
for clock frequency of 2.6Ghz and DDR4 clock frequency of 2400Mhz
for clock frequency of 2.2GHz and DDR4 clock frequency of 2400MHz
************************************************************************/
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );

/***********************************************************************************
Below block sizes of DGEMM, gives better performance in a multi instance mode,
for clock frequency of 2.2Ghz and DDR4 clock frequency of 2400Mhz
**************************************************************************************/
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 390, 144, 72 );
//bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
//bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );

/******************************************************************************
BLIS on single instance mode, gives better performance with
below mentioned default block size values
********************************************************************************/
// bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
//bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
//bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
/***********************************************************************************
Below block sizes of DGEMM, gives better performance in a multi instance mode,
for clock frequency of 2.6GHz and DDR4 clock frequency of 2400MHz
**************************************************************************************/
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 390, 144, 72 );
//bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
//bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );

/******************************************************************************
BLIS on single instance mode, gives better performance with
below mentioned default block size values
********************************************************************************/
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
//bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
//bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );

#else

Expand Down
41 changes: 21 additions & 20 deletions frame/3/syrk/bli_syrk_front.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -71,26 +72,26 @@ void bli_syrk_front

// For syrk, the right-hand "B" operand is simply A^T.
#ifdef BLIS_ENABLE_SMALL_MATRIX
bli_obj_alias_to( a, &at_local );
if (bli_obj_has_trans(a) != 0)
{//At*A operation
bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &at_local );
//call small gemm to perform syrk.
//gemm small matrix threshold check is done inside bli_gemm_small() which is good enough for syrk small matrix also.
status = bli_gemm_small( alpha, &a_local, &at_local, beta, &c_local, cntx, cntl );
}
else if ((a->dim[0] <= BLIS_SMALL_MATRIX_A_THRES_M_SYRK && a->dim[1] < BLIS_SMALL_MATRIX_A_THRES_N_SYRK) ||
(a->dim[0] < BLIS_SMALL_MATRIX_A_THRES_M_SYRK && a->dim[1] <= BLIS_SMALL_MATRIX_A_THRES_N_SYRK))
{//A*At operation
bli_obj_set_conjtrans( BLIS_TRANSPOSE, &at_local );
//call small gemm to perform syrk.
//Explicit matrix dimension threshold check in this else if section before calling bli_gemm_small() for syrk small matrix also.
status = bli_gemm_small( alpha, &a_local, &at_local, beta, &c_local, cntx, cntl );
}
if ( status == BLIS_SUCCESS )
{
return;
}
bli_obj_alias_to( a, &at_local );
if (bli_obj_has_trans(a) != 0)
{//At*A operation
bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &at_local );
//call small syrk.
//syrk small matrix threshold check is done inside bli_syrk_small().
status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local, cntx, cntl );
}
else if ((a->dim[0] <= BLIS_SMALL_MATRIX_A_THRES_M_SYRK && a->dim[1] < BLIS_SMALL_MATRIX_A_THRES_N_SYRK) ||
(a->dim[0] < BLIS_SMALL_MATRIX_A_THRES_M_SYRK && a->dim[1] <= BLIS_SMALL_MATRIX_A_THRES_N_SYRK))
{//A*At operation
bli_obj_set_conjtrans( BLIS_TRANSPOSE, &at_local );
//call small syrk.
//Explicit matrix dimension threshold check in this else if section before calling bli_syrk_small().
status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local, cntx, cntl );
}
if ( status == BLIS_SUCCESS )
{
return;
}
#endif

bli_obj_alias_to( a, &at_local );
Expand Down
11 changes: 11 additions & 0 deletions frame/3/syrk/bli_syrk_front.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -42,3 +43,13 @@ void bli_syrk_front
rntm_t* rntm,
cntl_t* cntl
);
/*
 * Small-matrix SYRK entry point.
 *
 * bli_syrk_front() calls this routine when BLIS_ENABLE_SMALL_MATRIX is
 * defined, passing an alias of A (with its conjugate-transpose bits
 * adjusted) as the "b" operand, since for syrk the right-hand operand is
 * simply A^T. The caller returns immediately when the result is
 * BLIS_SUCCESS and otherwise continues with the regular syrk path.
 *
 * alpha, beta - scalar operands, forwarded unchanged from the caller.
 * a           - the A operand.
 * b           - alias of A representing the transposed operand.
 * c           - the symmetric output matrix C.
 * cntx, cntl  - context and control tree, forwarded from the caller.
 *
 * Returns an err_t status; BLIS_SUCCESS means the small-matrix path
 * handled the operation.
 * NOTE(review): the caller's comments indicate the size-threshold check
 * for the transposed-A case is done inside this routine - confirm against
 * the implementation in kernels/zen/3/bli_syrk_small.c.
 */
err_t bli_syrk_small
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl
);
Loading

0 comments on commit 1f4eeee

Please sign in to comment.