Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
fc521c2
Host buffer support in sim (#188)
quetric Mar 25, 2024
d9a0f64
Enabled building multiple CCLO configs
quetric Mar 29, 2024
bedd5cf
Fix builds
quetric Apr 2, 2024
4292ee7
Add script to run all builds
quetric Apr 10, 2024
90b8848
Fix extdma and kernel connections issues.
bo3z Apr 10, 2024
4d5c492
Merge pull request #191 from bo3z/further-kernel-build-fixes
quetric Apr 12, 2024
4548ec0
Merge pull request #190 from Xilinx/184-enable-multiple-kernel-builds…
quetric Apr 12, 2024
b9dccef
Unzip CCLO IP for Coyote build
quetric Apr 16, 2024
d9efb24
Copy and unzip plugins for Coyote builds
quetric Apr 16, 2024
e7aee05
No arbiter unless using vadd kernel
quetric Apr 17, 2024
f50b9b9
Rename vitis config file
quetric Apr 17, 2024
a0ba7ea
Merge branch 'dev' of github.com:Xilinx/ACCL into dev
quetric Apr 17, 2024
544bd9a
Fix incorrect TCP/IP network kernel instantiation
bo3z Apr 22, 2024
90e44d7
146 migrate to other zmq library for c++ (#195)
quetric Apr 28, 2024
3ad0ec0
Fixed control interface assignment
quetric Apr 29, 2024
932b520
Merge remote-tracking branch 'upstream/dev' into pl-pl-demo-fixes
quetric Apr 29, 2024
b656646
WIP: TCP Session Handler Fixes
bo3z Apr 30, 2024
acb3fa3
Merge branch 'pl-pl-demo-fixes' of https://github.com/bo3z/ACCL into …
bo3z Apr 30, 2024
5d0646c
hls code compatibility with Vitis 2023+
quetric Apr 30, 2024
7dffe7b
Fix TCP Session Handler
bo3z May 3, 2024
abf3f42
Working example of PL-PL
bo3z May 3, 2024
1f9d2f7
Documentation for vadd example
bo3z May 3, 2024
01f49d2
Merge pull request #194 from bo3z/pl-pl-demo-fixes
quetric May 13, 2024
83e1836
196 reduceallreduce issues on cyt rdma (#199)
quetric May 30, 2024
1c1c95a
Update action.yml
quetric May 30, 2024
218ba4e
No seqn increment on rendezvous transfer (#201)
quetric Jun 18, 2024
5cd25d0
Fix flags passed into copies for single-node collectives
quetric Jul 2, 2024
4b8aefd
Add coyote support in network utils and regression test
quetric Jul 2, 2024
7053483
Sane default for max eager count
quetric Jul 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/setup-accl-build-env/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ runs:
run: |
sudo apt-get update
DEBIAN_FRONTEND=noninteractive sudo apt-get install -y --no-install-recommends wget
wget --no-check-certificate https://www.xilinx.com/bin/public/openDownload?filename=xrt_202220.2.14.354_20.04-amd64-xrt.deb -O xrt.deb
wget --no-check-certificate -U 'Mozilla' https://www.xilinx.com/bin/public/openDownload?filename=xrt_202220.2.14.354_20.04-amd64-xrt.deb -O xrt.deb
shell: bash

- name: Save XRT
Expand Down
4 changes: 2 additions & 2 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ git submodule update --init --recursive
The project has been tested with Xilinx Vitis 2022.1 on Ubuntu 20.04.
```sh
sudo apt update
sudo apt install python3 cmake libzmqpp-dev libjsoncpp-dev libtclap-dev libopenmpi-dev xvfb
sudo apt install python3 cmake libzmq3-dev libjsoncpp-dev libtclap-dev libopenmpi-dev xvfb
```
Install the Xilinx Run-Time libraries (XRT)
```
Expand Down Expand Up @@ -64,7 +64,7 @@ First start up either the emulator or simulator:
```sh
cd "kernels/cclo"
source <VIVADO_INSTALL>/settings64.sh
make STACK_TYPE=TCP EN_FANIN=1 simdll
make STACK_TYPE=TCP MODE=simdll
cd "../../test/model/simulator"
source <VITIS_INSTALL>/settings64.sh
/bin/cmake .
Expand Down
2 changes: 1 addition & 1 deletion driver/hls/accl_hls.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include "ap_axi_sdata.h"

#ifdef ACCL_SYNTHESIS
#include "ap_utils.h"
#include "etc/autopilot_ssdm_op.h"
#else
#define ap_wait()
#endif
Expand Down
14 changes: 10 additions & 4 deletions driver/utils/accl_network_utils/include/accl_network_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ std::vector<ACCL::rank_t> generate_ranks(bool local, int local_rank,
// Initialize accl and required network kernels
// If segsize == 0, the bufsize will be used as segment size instead
std::unique_ptr<ACCL::ACCL>
initialize_accl(const std::vector<ACCL::rank_t> &ranks, int local_rank,
initialize_accl(std::vector<ACCL::rank_t> &ranks, int local_rank,
bool simulator, acclDesign design,
xrt::device device = xrt::device(),
std::filesystem::path xclbin = "", int nbufs = 16,
ACCL::addr_t bufsize = 1024, ACCL::addr_t segsize = 0,
std::filesystem::path xclbin = "", unsigned int nbufs = 16,
unsigned int bufsize = 1024, unsigned int egrsize = 0,
bool rsfec = false);

// Configure the VNX kernel, this function is called by initialize_accl
Expand All @@ -66,7 +66,13 @@ void configure_vnx(vnx::CMAC &cmac, vnx::Networklayer &network_layer,
// Configure the TCP kernel, this function is called by initialize_accl
void configure_tcp(ACCL::BaseBuffer &tx_buf_network, ACCL::BaseBuffer &rx_buf_network,
xrt::kernel &network_krnl, xrt::kernel &session_krnl,
const std::vector<ACCL::rank_t> &ranks, int local_rank);
std::vector<ACCL::rank_t> &ranks, int local_rank);

// Configure RDMA engine on Coyote
void configure_cyt_rdma(std::vector<ACCL::rank_t> &ranks, int local_rank, ACCL::CoyoteDevice* device);

// Configure TCP engine on Coyote
void configure_cyt_tcp(std::vector<ACCL::rank_t> &ranks, int local_rank, ACCL::CoyoteDevice* device);

// Get IPs from config file, this function is called by generate_ranks
std::vector<std::string> get_ips(std::filesystem::path config_file);
Expand Down
215 changes: 188 additions & 27 deletions driver/utils/accl_network_utils/src/accl_network_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#
*******************************************************************************/
#include <fstream>
#include <algorithm>
#include <json/json.h>

#ifdef ACCL_NETWORK_UTILS_MPI
Expand Down Expand Up @@ -191,9 +192,9 @@ void configure_vnx(vnx::CMAC &cmac, vnx::Networklayer &network_layer,
}
}

void configure_tcp(FPGABuffer<int8_t> &tx_buf_network, FPGABuffer<int8_t> &rx_buf_network,
void configure_tcp(XRTBuffer<int8_t> &tx_buf_network, XRTBuffer<int8_t> &rx_buf_network,
xrt::kernel &network_krnl, xrt::kernel &session_krnl,
const std::vector<rank_t> &ranks, int local_rank) {
std::vector<rank_t> &ranks, int local_rank) {
tx_buf_network.sync_to_device();
rx_buf_network.sync_to_device();

Expand All @@ -211,27 +212,185 @@ void configure_tcp(FPGABuffer<int8_t> &tx_buf_network, FPGABuffer<int8_t> &rx_bu
<< std::dec << std::endl;
log_debug(ss.str());

//set up sessions for ranks
for(size_t i = 0; i < ranks.size(); ++i){
bool success;
if (i == static_cast<size_t>(local_rank)) {
continue;
MPI_Barrier(MPI_COMM_WORLD);

// Set up ports for each [other] rank on each rank
for (int i = 0; i < ranks.size(); i++) {
uint8_t tmp_ret_code = 0;
uint16_t tmp_session_id = static_cast<uint16_t>(ranks[i].session_id);
xrt::run run = session_krnl(
static_cast<uint32_t>(ip_encode(ranks[i].ip)),
static_cast<uint16_t>(ranks[i].port),
&tmp_session_id,
&tmp_ret_code,
tcpSessionHandlerOperation::OPEN_PORT
);
run.wait();
uint8_t ret_code = session_krnl.read_register(0x30);
if(!ret_code){
throw std::runtime_error(
"Failed to open port: " + std::to_string(ranks[i].port) +
" from local rank: " + std::to_string(local_rank)
);
} else {
std::cout << "Successfully opened port: " << std::to_string(ranks[i].port) <<
" from local rank: " << std::to_string(local_rank) << std::endl;
}
session_krnl(ranks[i].ip, ranks[i].port, false,
&(ranks[i].session_id), &success);
if(!success){
throw std::runtime_error("Failed to establish session for IP:"+
ranks[i].ip+
" port: "+
std::to_string(ranks[i].port));
}

MPI_Barrier(MPI_COMM_WORLD);

// Open TCP connections
for (int i = 0; i < ranks.size(); i++) {
if (i == local_rank) continue;
uint8_t tmp_ret_code = 0;
uint16_t tmp_session_id = static_cast<uint16_t>(ranks[i].session_id);
xrt::run run = session_krnl(
static_cast<uint32_t>(ip_encode(ranks[i].ip)),
static_cast<uint16_t>(ranks[i].port),
&tmp_session_id,
&tmp_ret_code,
tcpSessionHandlerOperation::OPEN_CONNECTION
);
run.wait();
uint8_t ret_code = session_krnl.read_register(0x30);
uint8_t updated_sesion = session_krnl.read_register(0x28);
if(!ret_code){
throw std::runtime_error(
"Failed to establish session for IP: " + ranks[i].ip +
" port: "+ std::to_string(ranks[i].port) +
" from local rank: " + std::to_string(local_rank)
);
} else {
std::cout << "Successfully opened session: " << updated_sesion <<
"with IP address: " << std::to_string(ranks[i].port) <<
" from local rank: " << std::to_string(local_rank) << std::endl;
}
std::ostringstream ss;
ss << "Established session ID: " << ranks[i].session_id << std::endl;
log_debug(ss.str());
}
}

// Pairwise exchange of RDMA queue-pair (QP) information between two ranks
// over MPI, then commit the established connection to the FPGA.
//
// master_rank / slave_rank: the two ranks taking part in this exchange.
// local_rank:   the rank executing this call; uninvolved ranks only
//               participate in the MPI barriers.
// ibvQpConn_vec: per-remote-rank QP connection objects, indexed by rank.
// ranks:        global rank table; the peer entry's session_id is updated
//               to the local QP number once the connection is written.
//
// The three MPI_Barrier calls sequence the two half-exchanges and the final
// hardware commit; the send/recv ordering is deliberate and must not change.
// NOTE(review): assumes MPI_Init has already been called by the caller — confirm.
void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int local_rank, std::vector<fpga::ibvQpConn*> &ibvQpConn_vec, std::vector<ACCL::rank_t> &ranks){

    if (local_rank == master_rank)
    {
        std::cout<<"Local rank "<<local_rank<<" sending local QP to remote rank "<<slave_rank<<std::endl;
        // Send the local queue pair information to the slave rank
        MPI_Send(&(ibvQpConn_vec[slave_rank]->getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, slave_rank, 0, MPI_COMM_WORLD);
    }
    else if (local_rank == slave_rank)
    {
        std::cout<<"Local rank "<<local_rank<<" receiving remote QP from remote rank "<<master_rank<<std::endl;
        // Receive the queue pair information from the master rank
        fpga::ibvQ received_q;
        MPI_Recv(&received_q, sizeof(fpga::ibvQ), MPI_CHAR, master_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        // Copy the received data to the remote queue pair
        ibvQpConn_vec[master_rank]->getQpairStruct()->remote = received_q;
    }

    // Synchronize after the first exchange to avoid race conditions
    MPI_Barrier(MPI_COMM_WORLD);

    if (local_rank == slave_rank)
    {
        std::cout<<"Local rank "<<local_rank<<" sending local QP to remote rank "<<master_rank<<std::endl;
        // Send the local queue pair information to the master rank
        MPI_Send(&(ibvQpConn_vec[master_rank]->getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, master_rank, 0, MPI_COMM_WORLD);
    }
    else if (local_rank == master_rank)
    {
        std::cout<<"Local rank "<<local_rank<<" receiving remote QP from remote rank "<<slave_rank<<std::endl;
        // Receive the queue pair information from the slave rank
        fpga::ibvQ received_q;
        MPI_Recv(&received_q, sizeof(fpga::ibvQ), MPI_CHAR, slave_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        // Copy the received data to the remote queue pair
        ibvQpConn_vec[slave_rank]->getQpairStruct()->remote = received_q;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    // write established connection to hardware and perform arp lookup
    // Connection word packs local QPN in the low 16 bits, remote QPN in the high 16.
    if (local_rank == master_rank)
    {
        int connection = (ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[slave_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16);
        ibvQpConn_vec[slave_rank]->getQpairStruct()->print();
        ibvQpConn_vec[slave_rank]->setConnection(connection);
        ibvQpConn_vec[slave_rank]->writeContext(ranks[slave_rank].port);
        ibvQpConn_vec[slave_rank]->doArpLookup();
        // Session ID doubles as the local QP number for rendezvous addressing
        ranks[slave_rank].session_id = ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn;
    } else if (local_rank == slave_rank)
    {
        int connection = (ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[master_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16);
        ibvQpConn_vec[master_rank]->getQpairStruct()->print();
        ibvQpConn_vec[master_rank]->setConnection(connection);
        ibvQpConn_vec[master_rank]->writeContext(ranks[master_rank].port);
        ibvQpConn_vec[master_rank]->doArpLookup();
        ranks[master_rank].session_id = ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn;
    }

    MPI_Barrier(MPI_COMM_WORLD);
}

// Set up RDMA queue-pair connections between all pairs of ranks on a
// Coyote device.
//
// ranks:      global rank table; exchange_qp updates each peer entry's
//             session_id with the local QP number.
// local_rank: the rank executing this call.
// device:     Coyote device providing one qProc per rank (coyote_qProc_vec).
//
// Fixes vs. previous revision: loop indices use std::size_t to avoid
// signed/unsigned comparison against ranks.size().
void configure_cyt_rdma(std::vector<ACCL::rank_t> &ranks, int local_rank, ACCL::CoyoteDevice* device){

    std::cout<<"Initializing QP connections..."<<std::endl;
    // Create one queue-pair connection object per rank (including self,
    // so indices stay aligned with the ranks vector).
    std::vector<fpga::ibvQpConn*> ibvQpConn_vec;
    // create single page dummy memory space for each qp
    uint32_t n_pages = 1;
    // NOTE(review): these connection objects are never freed and leak when
    // this function returns — confirm whether they must outlive the
    // established connections before adding cleanup.
    for(std::size_t i = 0; i < ranks.size(); i++)
    {
        fpga::ibvQpConn* qpConn = new fpga::ibvQpConn(device->coyote_qProc_vec[i], ranks[local_rank].ip, n_pages);
        ibvQpConn_vec.push_back(qpConn);
    }

    std::cout<<"Exchanging QP..."<<std::endl;
    // All-pairs exchange: every pair (i, j) with i < j swaps QP info via MPI.
    for(std::size_t i = 0; i < ranks.size(); i++)
    {
        for(std::size_t j = i + 1; j < ranks.size(); j++)
        {
            exchange_qp(i, j, local_rank, ibvQpConn_vec, ranks);
        }
    }
}

// Configure the Coyote TCP stack: ARP-resolve every peer, open the listen
// ports, then open a connection to every other rank.
//
// ranks:      global rank table; each peer entry's session_id is updated
//             with the session returned by tcpOpenCon.
// local_rank: the rank executing this call.
// device:     Coyote device wrapper exposing the TCP control interface.
//
// Fixes vs. previous revision: port-open and connection-open failures are
// reported instead of silently discarded; loop indices use std::size_t;
// the sleep no longer depends on a chrono_literals using-directive.
void configure_cyt_tcp(std::vector<ACCL::rank_t> &ranks, int local_rank, ACCL::CoyoteDevice* device){
    std::cout<<"Configuring Coyote TCP..."<<std::endl;
    // ARP lookup for every remote rank so later connects can resolve MACs.
    for(std::size_t i = 0; i < ranks.size(); i++){
        if(static_cast<std::size_t>(local_rank) != i){
            device->get_device()->doArpLookup(ip_encode(ranks[i].ip));
        }
    }

    // Open listen ports; warn (but continue, preserving best-effort
    // behavior) instead of discarding the status.
    for (std::size_t i = 0; i < ranks.size(); i++)
    {
        uint32_t dstPort = ranks[i].port;
        bool open_port_status = device->get_device()->tcpOpenPort(dstPort);
        if (!open_port_status) {
            std::cerr << "Warning: failed to open TCP port " << dstPort
                      << " on local rank " << local_rank << std::endl;
        }
    }

    // Give all ranks time to finish opening their ports before connecting.
    std::this_thread::sleep_for(std::chrono::milliseconds(10));

    // Open a connection to every other rank and record its session ID.
    for (std::size_t i = 0; i < ranks.size(); i++)
    {
        if (static_cast<std::size_t>(local_rank) == i) {
            continue;
        }
        uint32_t dstPort = ranks[i].port;
        uint32_t dstIp = ip_encode(ranks[i].ip);
        uint32_t session = 0;
        bool success = device->get_device()->tcpOpenCon(dstIp, dstPort, &session);
        if (!success) {
            std::cerr << "Warning: failed to open TCP connection to rank " << i
                      << " (" << ranks[i].ip << ":" << dstPort << ")" << std::endl;
        }
        ranks[i].session_id = session;
    }
}


std::vector<std::string> get_ips(fs::path config_file) {
std::vector<std::string> ips{};
Json::Value config;
Expand Down Expand Up @@ -290,15 +449,17 @@ std::vector<rank_t> generate_ranks(bool local, int local_rank, int world_size,
}

std::unique_ptr<ACCL::ACCL>
initialize_accl(const std::vector<rank_t> &ranks, int local_rank,
initialize_accl(std::vector<rank_t> &ranks, int local_rank,
bool simulator, acclDesign design, xrt::device device,
fs::path xclbin, int nbufs, addr_t bufsize, addr_t segsize,
bool rsfec) {
fs::path xclbin, unsigned int nbufs, unsigned int bufsize,
unsigned int egrsize, bool rsfec) {
std::size_t world_size = ranks.size();
std::unique_ptr<ACCL::ACCL> accl;

if (segsize == 0) {
segsize = bufsize;
if (egrsize == 0) {
egrsize = bufsize;
} else if(egrsize > bufsize){
bufsize = egrsize;
}

if (simulator) {
Expand Down Expand Up @@ -342,13 +503,13 @@ initialize_accl(const std::vector<rank_t> &ranks, int local_rank,
// Tx and Rx buffers will not be cleaned up properly and leak memory.
// They need to live at least as long as ACCL so for now this is the best
// we can do without requiring the users to allocate the buffers manually.
auto tx_buf_network = new FPGABuffer<int8_t>(
auto tx_buf_network = new XRTBuffer<int8_t>(
64 * 1024 * 1024, dataType::int8, device, networkmem);
auto rx_buf_network = new FPGABuffer<int8_t>(
auto rx_buf_network = new XRTBuffer<int8_t>(
64 * 1024 * 1024, dataType::int8, device, networkmem);
auto network_krnl =
xrt::kernel(device, xclbin_uuid, "network_krnl:{network_krnl_0}",
xrt::kernel::cu_access_mode::exclusive);
xrt::kernel(device, xclbin_uuid, "network_krnl:{poe_0}",
xrt::kernel::cu_access_mode::exclusive);
auto session_krnl =
xrt::kernel(device, xclbin_uuid, "tcp_session_handler:{session_handler_0}",
xrt::kernel::cu_access_mode::exclusive);
Expand All @@ -358,7 +519,7 @@ initialize_accl(const std::vector<rank_t> &ranks, int local_rank,

accl = std::make_unique<ACCL::ACCL>(device, cclo_ip, hostctrl_ip, devicemem, rxbufmem);
}
accl.get()->initialize(ranks, local_rank, nbufs, bufsize, segsize);
accl.get()->initialize(ranks, local_rank, nbufs, bufsize, egrsize, std::min(nbufs*bufsize, (unsigned int)4*1024*1024));
return accl;
}
} // namespace accl_network_utils
6 changes: 3 additions & 3 deletions driver/xrt/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ set(ACCL_HEADERS
${ACCL_HEADER_PATH}/constants.hpp
${ACCL_HEADER_PATH}/simdevice.hpp
${ACCL_HEADER_PATH}/simbuffer.hpp
${ACCL_HEADER_PATH}/fpgadevice.hpp
${ACCL_HEADER_PATH}/xrtdevice.hpp
${ACCL_HEADER_PATH}/acclrequest.hpp
)

Expand All @@ -58,7 +58,7 @@ set(ACCL_SOURCES
${ACCL_SOURCE_PATH}/constants.cpp
${ACCL_SOURCE_PATH}/simdevice.cpp
${ACCL_SOURCE_PATH}/simbuffer.cpp
${ACCL_SOURCE_PATH}/fpgadevice.cpp
${ACCL_SOURCE_PATH}/xrtdevice.cpp
${ZMQ_INTF_DIR}/zmq_client.cpp
${ZMQ_INTF_DIR}/zmq_common.cpp
)
Expand Down Expand Up @@ -120,7 +120,7 @@ target_link_libraries(accl PUBLIC xilinxopencl xrt_coreutil xrt_core)
target_include_directories(accl PUBLIC $ENV{XILINX_XRT}/include)

# ZMQ
target_link_libraries(accl PUBLIC zmqpp zmq pthread)
target_link_libraries(accl PUBLIC zmq pthread)

# Json
find_package(jsoncpp REQUIRED)
Expand Down
8 changes: 4 additions & 4 deletions driver/xrt/docs/cpp_reference/buffer.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ ACCL::Buffer
Hardware buffers
**********************************

ACCL::FPGABuffer
ACCL::XRTBuffer
==================================
.. doxygenclass:: ACCL::FPGABuffer
.. doxygenclass:: ACCL::XRTBuffer
:project: ACCL
:members:

ACCL::FPGABufferP2P
ACCL::XRTBufferP2P
==================================
.. doxygenclass:: ACCL::FPGABufferP2P
.. doxygenclass:: ACCL::XRTBufferP2P
:project: ACCL
:members:

Expand Down
Loading