Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Apple AMX optimization #5649

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,10 @@ if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "
endif()
endif()

# Declare the NCNN_APPLE_AMX build option (default ON), but only when MACOS is
# true and CMAKE_OSX_ARCHITECTURES matches "arm".
# NOTE(review): MACOS does not appear to be a variable CMake sets on its own
# (CMake provides APPLE); presumably a toolchain file defines it - confirm,
# otherwise this option is never declared.
# NOTE(review): CMAKE_OSX_ARCHITECTURES may be empty on a native build - confirm
# the option is still reachable in that case.
if(MACOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
option(NCNN_APPLE_AMX "optimize apple silicon platforms with apple amx" ON)
endif()

if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)")
Expand Down
15 changes: 15 additions & 0 deletions src/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2378,6 +2378,21 @@ int cpu_support_arm_svef32mm()
#endif
}

// Tell whether the detected cpu family is one of the Apple families this
// function accepts for AMX. Returns 1 on a match, 0 otherwise; builds that are
// not aarch64 + Apple always get 0.
int cpu_support_arm_amx()
{
    // make sure the global cpu info has been populated before it is read
    try_initialize_global_cpu_info();
#if !(__aarch64__ && __APPLE__)
    return 0;
#else
    if (g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM)
        return 1;
    if (g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD)
        return 1;
    if (g_hw_cpufamily == CPUFAMILY_ARM_IBIZA)
        return 1;
    if (g_hw_cpufamily == CPUFAMILY_ARM_LOBOS)
        return 1;
    if (g_hw_cpufamily == CPUFAMILY_ARM_PALMA)
        return 1;

    return 0;
#endif
}

int cpu_support_x86_avx()
{
try_initialize_global_cpu_info();
Expand Down
2 changes: 2 additions & 0 deletions src/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ NCNN_EXPORT int cpu_support_arm_svebf16();
NCNN_EXPORT int cpu_support_arm_svei8mm();
// svef32mm = aarch64 svef32mm
NCNN_EXPORT int cpu_support_arm_svef32mm();
// amx = aarch64 apple amx
// returns non-zero only on aarch64 apple builds whose detected cpu family is on
// the list checked by the implementation, 0 otherwise
NCNN_EXPORT int cpu_support_arm_amx();

// avx = x86 avx
NCNN_EXPORT int cpu_support_x86_avx();
Expand Down
156 changes: 156 additions & 0 deletions src/layer/arm/amx_usability.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef AMX_USABILITY_H
#define AMX_USABILITY_H

// From https://github.com/corsix/amx/blob/main/aarch64.h
//
// AMX_NOP_OP_IMM5(op, imm5)
// Emits three nop instructions followed by one raw 32-bit word whose value is
// 0x201000 + (op << 5) + imm5. Both arguments use the "i" constraint, so they
// must be compile-time constants. The "memory" clobber keeps the compiler from
// moving memory accesses across the statement.
#define AMX_NOP_OP_IMM5(op, imm5) \
__asm("nop\nnop\nnop\n.word (0x201000 + (%0 << 5) + %1)" \
: \
: "i"(op), "i"(imm5) \
: "memory")

// AMX_OP_GPR(op, gpr)
// Emits one raw 32-bit word 0x201000 + (op << 5) + <register number>, where the
// value of gpr is first placed in a general purpose register ("r" constraint).
// op must be a compile-time constant ("i" constraint).
// The register number is recovered inside the assembler expression: "0%1" is
// the operand text with a leading 0, which presumably reads as a hex literal
// when the operand is printed as xN (x12 -> 0x12), and subtracting
// ((0%1 >> 4) * 6) turns that hex reading back into the decimal register
// number - confirm against the upstream file linked above.
// The "memory" clobber keeps the compiler from moving memory accesses across
// the statement.
#define AMX_OP_GPR(op, gpr) \
__asm(".word (0x201000 + (%0 << 5) + 0%1 - ((0%1 >> 4) * 6))" \
: \
: "i"(op), "r"((uint64_t)(gpr)) \
: "memory")

// One wrapper per opcode: each forwards a 64-bit operand to AMX_OP_GPR with a
// fixed opcode number (0..16 and 18..22). Opcode 17 has no wrapper here; it is
// issued through AMX_NOP_OP_IMM5 by amx_set() / amx_clr() below.
// NOTE(review): the mnemonics presumably follow the naming used by corsix/amx -
// confirm operand layouts there before using a wrapper that has no helper
// function in this file.
#define AMX_LDX(gpr) AMX_OP_GPR(0, gpr)
#define AMX_LDY(gpr) AMX_OP_GPR(1, gpr)
#define AMX_STX(gpr) AMX_OP_GPR(2, gpr)
#define AMX_STY(gpr) AMX_OP_GPR(3, gpr)
#define AMX_LDZ(gpr) AMX_OP_GPR(4, gpr)
#define AMX_STZ(gpr) AMX_OP_GPR(5, gpr)
#define AMX_LDZI(gpr) AMX_OP_GPR(6, gpr)
#define AMX_STZI(gpr) AMX_OP_GPR(7, gpr)
#define AMX_EXTRX(gpr) AMX_OP_GPR(8, gpr)
#define AMX_EXTRY(gpr) AMX_OP_GPR(9, gpr)
#define AMX_FMA64(gpr) AMX_OP_GPR(10, gpr)
#define AMX_FMS64(gpr) AMX_OP_GPR(11, gpr)
#define AMX_FMA32(gpr) AMX_OP_GPR(12, gpr)
#define AMX_FMS32(gpr) AMX_OP_GPR(13, gpr)
#define AMX_MAC16(gpr) AMX_OP_GPR(14, gpr)
#define AMX_FMA16(gpr) AMX_OP_GPR(15, gpr)
#define AMX_FMS16(gpr) AMX_OP_GPR(16, gpr)
#define AMX_VECINT(gpr) AMX_OP_GPR(18, gpr)
#define AMX_VECFP(gpr) AMX_OP_GPR(19, gpr)
#define AMX_MATINT(gpr) AMX_OP_GPR(20, gpr)
#define AMX_MATFP(gpr) AMX_OP_GPR(21, gpr)
#define AMX_GENLUT(gpr) AMX_OP_GPR(22, gpr)
// Builds a 64-bit operand from a pointer, a row index and flag bits: the address
// of *ptr plus ((row + flags * 64) << 56). Since 64 << 56 == 1 << 62, row lands
// at bit 56 upward and flags at bit 62 upward. The addition assumes the top
// byte of the address is zero. Not used by the helper functions visible below.
#define PTR_ROW_FLAGS(ptr, row, flags) (((uint64_t) & *(ptr)) + (((uint64_t)((row) + (flags)*64)) << 56))
// Issue opcode 17 with immediate 0 through AMX_NOP_OP_IMM5.
// NOTE(review): presumably this is the "set" operation that enables AMX state
// for the calling thread, as described by corsix/amx - confirm.
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_set()
{
    AMX_NOP_OP_IMM5(17, 0);
}

// Issue opcode 17 with immediate 1 through AMX_NOP_OP_IMM5.
// NOTE(review): presumably this is the "clr" operation that releases AMX state
// for the calling thread, as described by corsix/amx - confirm.
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_clr()
{
    AMX_NOP_OP_IMM5(17, 1);
}

// Issue AMX_LDX for row x_row using the memory at ptr.
//
// pair   when true, bit 62 of the operand is set
//        (presumably "load a pair of rows" - confirm against corsix/amx)
// x_row  row index, placed at bit 56; the call is silently ignored when >= 8
// ptr    source address, occupies the low bits of the operand; the encoding
//        assumes the top byte of the address is zero
//
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_ldx(bool pair, unsigned int x_row, const void* ptr)
{
    if (x_row >= 8)
        return;

    uint64_t operand = (uint64_t)ptr + ((uint64_t)x_row << 56);
    if (pair)
        operand |= 1ULL << 62;

    AMX_LDX(operand);
}

// Issue AMX_LDY for row y_row using the memory at ptr.
//
// pair   when true, bit 62 of the operand is set
//        (presumably "load a pair of rows" - confirm against corsix/amx)
// y_row  row index, placed at bit 56; the call is silently ignored when >= 8
// ptr    source address, occupies the low bits of the operand; the encoding
//        assumes the top byte of the address is zero
//
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_ldy(bool pair, unsigned int y_row, const void* ptr)
{
    if (y_row >= 8)
        return;

    uint64_t operand = (uint64_t)ptr + ((uint64_t)y_row << 56);
    if (pair)
        operand |= 1ULL << 62;

    AMX_LDY(operand);
}

// Issue AMX_LDZ for row z_row using the memory at ptr.
//
// pair   when true, bit 62 of the operand is set
//        (presumably "load a pair of rows" - confirm against corsix/amx)
// z_row  row index, placed at bit 56; the call is silently ignored when >= 64
// ptr    source address, occupies the low bits of the operand; the encoding
//        assumes the top byte of the address is zero
//
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_ldz(bool pair, unsigned int z_row, const void* ptr)
{
    if (z_row >= 64)
        return;

    uint64_t operand = (uint64_t)ptr + ((uint64_t)z_row << 56);
    if (pair)
        operand |= 1ULL << 62;

    AMX_LDZ(operand);
}

// Issue AMX_STZ for row z_row using the memory at ptr.
//
// pair   when true, bit 62 of the operand is set
//        (presumably "store a pair of rows" - confirm against corsix/amx)
// z_row  row index, placed at bit 56; the call is silently ignored when >= 64
// ptr    destination address, occupies the low bits of the operand; the
//        encoding assumes the top byte of the address is zero
//
// NOTE(review): ptr is const-qualified although a store presumably writes
// through it; the signature is kept as-is so existing callers keep compiling.
//
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_stz(bool pair, unsigned int z_row, const void* ptr)
{
    if (z_row >= 64)
        return;

    uint64_t operand = (uint64_t)ptr + ((uint64_t)z_row << 56);
    if (pair)
        operand |= 1ULL << 62;

    AMX_STZ(operand);
}

// Assemble the 64-bit operand for AMX_FMA16 and issue the instruction.
//
// Operand layout built here (each argument is truncated to its field width):
//   bit  63      vector    set when vector is true
//   bits 46..47  x_mode    (2 bits)
//   bits 41..45  x_mask    (5 bits)
//   bits 37..38  y_mode    (2 bits)
//   bits 32..36  y_mask    (5 bits)
//   bits 20..25  z_row     (6 bits)
//   bits 10..18  x_offset  (9 bits)
//   bits  0..8   y_offset  (9 bits)
// NOTE(review): field meanings (vector vs. matrix mode, enable mode/value,
// byte offsets) presumably follow the corsix/amx fma notes - confirm there.
//
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_fma16_masked(bool vector, unsigned int x_offset, unsigned int y_offset, int z_row, uint8_t x_mode, uint8_t x_mask, uint8_t y_mode, uint8_t y_mask)
{
    uint64_t operand = 0;
    if (vector)
        operand |= 1ULL << 63;

    operand |= (uint64_t)y_offset & 0x1FF;
    operand |= ((uint64_t)x_offset & 0x1FF) << 10;
    operand |= ((uint64_t)z_row & 0x3F) << 20;
    operand |= ((uint64_t)y_mask & 0x1F) << 32;
    operand |= ((uint64_t)y_mode & 0x3) << 37;
    operand |= ((uint64_t)x_mask & 0x1F) << 41;
    operand |= ((uint64_t)x_mode & 0x3) << 46;

    AMX_FMA16(operand);
}

// Convenience form of amx_fma16_masked with all four mode/mask arguments set
// to 0; vector, x_offset, y_offset and z_row are forwarded unchanged.
//
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_fma16(bool vector, unsigned int x_offset, unsigned int y_offset, int z_row)
{
    amx_fma16_masked(vector, x_offset, y_offset, z_row, 0, 0, 0, 0);
}

// Assemble the 64-bit operand for AMX_FMA32 and issue the instruction.
//
// Operand layout built here (each argument is truncated to its field width):
//   bit  63      vector    set when vector is true
//   bits 46..47  x_mode    (2 bits)
//   bits 41..45  x_mask    (5 bits)
//   bits 37..38  y_mode    (2 bits)
//   bits 32..36  y_mask    (5 bits)
//   bits 20..25  z_row     (6 bits)
//   bits 10..18  x_offset  (9 bits)
//   bits  0..8   y_offset  (9 bits)
// NOTE(review): field meanings (vector vs. matrix mode, enable mode/value,
// byte offsets) presumably follow the corsix/amx fma notes - confirm there.
//
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_fma32_masked(bool vector, unsigned int x_offset, unsigned int y_offset, int z_row, uint8_t x_mode, uint8_t x_mask, uint8_t y_mode, uint8_t y_mask)
{
    uint64_t operand = 0;
    if (vector)
        operand |= 1ULL << 63;

    operand |= (uint64_t)y_offset & 0x1FF;
    operand |= ((uint64_t)x_offset & 0x1FF) << 10;
    operand |= ((uint64_t)z_row & 0x3F) << 20;
    operand |= ((uint64_t)y_mask & 0x1F) << 32;
    operand |= ((uint64_t)y_mode & 0x3) << 37;
    operand |= ((uint64_t)x_mask & 0x1F) << 41;
    operand |= ((uint64_t)x_mode & 0x3) << 46;

    AMX_FMA32(operand);
}

// Convenience form of amx_fma32_masked with all four mode/mask arguments set
// to 0; vector, x_offset, y_offset and z_row are forwarded unchanged.
//
// static inline: the definition lives in a header, so it must not have external
// linkage, otherwise every including translation unit emits the same symbol.
static inline void amx_fma32(bool vector, unsigned int x_offset, unsigned int y_offset, int z_row)
{
    amx_fma32_masked(vector, x_offset, y_offset, z_row, 0, 0, 0, 0);
}

#endif // AMX_USABILITY_H
Loading