Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
CXX_FLAGS=-O3 -std=c++11 -DNDEBUG
CXX_FLAGS=-O3 -std=c++11
INCLUDE_PATH=-I./include/

ifeq ($(CXX),icpc)
CXX_FLAGS += -qopenmp -xhost
else
ifeq ($(CXX),g++)
CXX_FLAGS += -fopenmp -march=native
CXX_FLAGS += -fopenmp -march=native -Wno-vla
else
ifeq ($(CXX),clang++)
CXX_FLAGS += -fopenmp -march=native
CXX_FLAGS += -fopenmp -march=native -fsanitize=address -Wno-vla-extension
endif
endif
endif
Expand Down
5 changes: 4 additions & 1 deletion benchmark/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,11 @@ LIBS=-lhptt
all: ${OBJ}
${CXX} ${OBJ} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} -o benchmark.exe

LIB_PATH+= ${OPENMP_LIB_PATH}
LIBS+= -lomp

%.o: %.cpp
${CXX} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@
${CXX} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@

clean:
rm -rf *.o benchmark.exe
2 changes: 1 addition & 1 deletion benchmark/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ int main(int argc, char *argv[])
restore(B, B_ref, total_size);
trashCache(trash1, trash2, largerThanL3);
auto begin_time = omp_get_wtime();
transpose_ref( size, perm, dim, A, alpha, B_ref, beta, false);
transpose_ref( size, perm, dim, A, alpha, nullptr, nullptr, 1, B_ref, beta, nullptr, nullptr, 1, false);
double elapsed_time = omp_get_wtime() - begin_time;
minTime = (elapsed_time < minTime) ? elapsed_time : minTime;
}
Expand Down
Empty file modified benchmark/benchmark.sh
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion benchmark/maxFromFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@
else:
f2.write(f2Content[i])
except:
print "ERROR:", i
print("ERROR:", i)
f2.close()
112 changes: 82 additions & 30 deletions benchmark/reference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,68 +16,120 @@

template<typename floatType>
void transpose_ref( uint32_t *size, uint32_t *perm, int dim,
const floatType* __restrict__ A, floatType alpha,
floatType* __restrict__ B, floatType beta, const bool conjA)
const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, int innerStrideA,
floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA)
{
// compute stride for all dimensions w.r.t. A
std::vector<int> tempOuterSizeA, tempOuterSizeB, tempOffsetA, tempOffsetB, tempPointerB;

// Stride One is location of 0 in perm. Perm[0] may not be B stride one unless perm[0] == 0
// perm provided yields positions in A data from a B order index
// tempPointerB calculated below is the inverse of perm (tempPointerB[i] = j where perm[j] == i);
// it relates positions in B data to an A order index
tempPointerB.resize(dim);
for (int i = 0; i < dim; ++i)
for (int j = 0; j < dim; ++j)
if (i == perm[j])
tempPointerB[i] = j;

// Use default values if any of the pointers are nullptr
if (outerSizeA == nullptr) {
tempOuterSizeA.resize(dim);
for (int i = 0; i < dim; i++) tempOuterSizeA[i] = size[i];
outerSizeA = tempOuterSizeA.data();
}

if (outerSizeB == nullptr) {
tempOuterSizeB.resize(dim);
for (int i = 0; i < dim; i++) tempOuterSizeB[i] = size[perm[i]];
outerSizeB = tempOuterSizeB.data();
}

if (offsetA == nullptr) {
tempOffsetA.resize(dim); // Default to zeros
for (int i = 0; i < dim; i++) tempOffsetA[i] = 0;
offsetA = tempOffsetA.data();
}

if (offsetB == nullptr) {
tempOffsetB.resize(dim); // Default to zeros
for (int i = 0; i < dim; i++) tempOffsetB[i] = 0;
offsetB = tempOffsetB.data();
}

// compute stride for all dimensions w.r.t. A (like lda)
uint32_t strideA[dim];
strideA[0] = 1;
strideA[0] = innerStrideA;
for(int i=1; i < dim; ++i)
strideA[i] = strideA[i-1] * outerSizeA[i-1];

// compute stride for all dimensions w.r.t. B (like ldb)
uint32_t strideB[dim];
strideB[0] = innerStrideB;
for(int i=1; i < dim; ++i)
strideA[i] = strideA[i-1] * size[i-1];
strideB[i] = strideB[i-1] * outerSizeB[i-1];

// combine all non-stride-one dimensions of B into a single dimension for
// maximum parallelism
uint32_t sizeOuter = 1;
for(int i=0; i < dim; ++i)
if( i != perm[0] )
sizeOuter *= size[i];
sizeOuter *= size[i];

uint32_t sizeInner = size[perm[0]];
uint32_t strideAinner = strideA[perm[0]];

// This implementation traverses the output tensor in a linear fashion

#pragma omp parallel for
for(uint32_t j=0; j < sizeOuter; ++j)
{
uint32_t offsetA = 0;
uint32_t offsetB = 0;
uint32_t j_tmp = j;
uint32_t indexOffsetA = 0;
uint32_t indexOffsetB = 0;
uint32_t j_tmp_A = j;
uint32_t j_tmp_B = j;
for(int i=1; i < dim; ++i)
{
int current_index = j_tmp % size[perm[i]];
j_tmp /= size[perm[i]];
offsetA += current_index * strideA[perm[i]];
int current_index_A = j_tmp_A % size[perm[i]];
j_tmp_A /= size[perm[i]];
j_tmp_B /= size[perm[i]];
indexOffsetA += (current_index_A + offsetA[perm[i]]) * strideA[perm[i]];
indexOffsetB += (j_tmp_B + 1) * offsetB[i] * strideB[i];
indexOffsetB += j_tmp_B * (outerSizeB[i] - offsetB[i] - size[perm[i]]) * strideB[i];
}

const floatType* __restrict__ A_ = A + offsetA;
floatType* __restrict__ B_ = B + j*sizeInner;

uint32_t strideAinner = strideA[perm[0]];
const floatType* __restrict__ A_ = A + indexOffsetA;
floatType* __restrict__ B_ = B + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB);

if( beta == (floatType) 0 )
for(int i=0; i < sizeInner; ++i)
for(int i=0; i < sizeInner; ++i) {
#ifdef DEBUG
//printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], (i * innerStrideB) + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB), B_[i * innerStrideB]);
#endif
if( conjA )
B_[i] = alpha * std::conj(A_[i * strideAinner]);
B_[i * innerStrideB] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real();
else
B_[i] = alpha * A_[i * strideAinner];
B_[i * innerStrideB] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner];}
else
for(int i=0; i < sizeInner; ++i)
for(int i=0; i < sizeInner; ++i) {
#ifdef DEBUG
//printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], (i * innerStrideB) + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB), B_[i * innerStrideB]);
#endif
if( conjA )
B_[i] = alpha * std::conj(A_[i * strideAinner]) + beta * B_[i];
B_[i * innerStrideB] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real() + beta * B_[i * innerStrideB];
else
B_[i] = alpha * A_[i * strideAinner] + beta * B_[i];
B_[i * innerStrideB] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner] + beta * B_[i * innerStrideB];}
}
}

template void transpose_ref<float>( uint32_t *size, uint32_t *perm, int dim,
const float* __restrict__ A, float alpha,
float* __restrict__ B, float beta, const bool conjA);
const float* __restrict__ A, float alpha, int *outerSizeA, int *offsetA, int innerStrideA,
float* __restrict__ B, float beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
template void transpose_ref<FloatComplex>( uint32_t *size, uint32_t *perm, int dim,
const FloatComplex* __restrict__ A, FloatComplex alpha,
FloatComplex* __restrict__ B, FloatComplex beta, const bool conjA);
const FloatComplex* __restrict__ A, FloatComplex alpha, int *outerSizeA, int *offsetA, int innerStrideA,
FloatComplex* __restrict__ B, FloatComplex beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
template void transpose_ref<double>( uint32_t *size, uint32_t *perm, int dim,
const double* __restrict__ A, double alpha,
double* __restrict__ B, double beta, const bool conjA);
const double* __restrict__ A, double alpha, int *outerSizeA, int *offsetA, int innerStrideA,
double* __restrict__ B, double beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
template void transpose_ref<DoubleComplex>( uint32_t *size, uint32_t *perm, int dim,
const DoubleComplex* __restrict__ A, DoubleComplex alpha,
DoubleComplex* __restrict__ B, DoubleComplex beta, const bool conjA);
const DoubleComplex* __restrict__ A, DoubleComplex alpha, int *outerSizeA, int *offsetA, int innerStrideA,
DoubleComplex* __restrict__ B, DoubleComplex beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);

4 changes: 2 additions & 2 deletions benchmark/reference.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

template<typename floatType>
void transpose_ref( uint32_t *size, uint32_t *perm, int dim,
const floatType* __restrict__ A, floatType alpha,
floatType* __restrict__ B, floatType beta, const bool conjA);
const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, int innerStrideA,
floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
7 changes: 5 additions & 2 deletions include/compute_node.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ namespace hptt {
/**
* \brief A ComputeNode encodes a loop.
*/
class ComputeNode
class alignas(16) ComputeNode
{
public:
ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), next(nullptr) {}
ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), indexA(false), indexB(false), offDiffAB(std::numeric_limits<int>::min()), next(nullptr) {}

~ComputeNode() {
if ( next != nullptr )
Expand All @@ -20,6 +20,9 @@ class ComputeNode
size_t inc; //!< increment at the current loop
size_t lda; //!< stride of A w.r.t. the loop index
size_t ldb; //!< stride of B w.r.t. the loop index
bool indexA; //!< true if index of A is innermost (0)
bool indexB; //!< true if index of B is innermost (0)
int offDiffAB; //!< difference in offset A and B (i.e., A - B) at the current loop
ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called)
};

Expand Down
Loading