diff --git a/.github/actions/setup-accl-build-env/action.yml b/.github/actions/setup-accl-build-env/action.yml index 8d9cf21c..9116d73d 100644 --- a/.github/actions/setup-accl-build-env/action.yml +++ b/.github/actions/setup-accl-build-env/action.yml @@ -15,7 +15,7 @@ runs: run: | sudo apt-get update DEBIAN_FRONTEND=noninteractive sudo apt-get install -y --no-install-recommends wget - wget --no-check-certificate https://www.xilinx.com/bin/public/openDownload?filename=xrt_202220.2.14.354_20.04-amd64-xrt.deb -O xrt.deb + wget --no-check-certificate -U 'Mozilla' https://www.xilinx.com/bin/public/openDownload?filename=xrt_202220.2.14.354_20.04-amd64-xrt.deb -O xrt.deb shell: bash - name: Save XRT diff --git a/INSTALL.md b/INSTALL.md index 05ca4378..50da9065 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -10,7 +10,7 @@ git submodule update --init --recursive The project has been tested with Xilinx Vitis 2022.1 on Ubuntu 20.04. ```sh sudo apt update -sudo apt install python3 cmake libzmqpp-dev libjsoncpp-dev libtclap-dev libopenmpi-dev xvfb +sudo apt install python3 cmake libzmq3-dev libjsoncpp-dev libtclap-dev libopenmpi-dev xvfb ``` Install the Xilinx Run-Time libraries (XRT) ``` @@ -64,7 +64,7 @@ First start up either the emulator or simulator: ```sh cd "kernels/cclo" source /settings64.sh - make STACK_TYPE=TCP EN_FANIN=1 simdll + make STACK_TYPE=TCP MODE=simdll cd "../../test/model/simulator" source /settings64.sh /bin/cmake . 
diff --git a/driver/hls/accl_hls.h b/driver/hls/accl_hls.h index ee5243e2..beb3e2ef 100644 --- a/driver/hls/accl_hls.h +++ b/driver/hls/accl_hls.h @@ -21,7 +21,7 @@ #include "ap_axi_sdata.h" #ifdef ACCL_SYNTHESIS -#include "ap_utils.h" +#include "etc/autopilot_ssdm_op.h" #else #define ap_wait() #endif diff --git a/driver/utils/accl_network_utils/include/accl_network_utils.hpp b/driver/utils/accl_network_utils/include/accl_network_utils.hpp index 9e15bc8e..eb5655bd 100644 --- a/driver/utils/accl_network_utils/include/accl_network_utils.hpp +++ b/driver/utils/accl_network_utils/include/accl_network_utils.hpp @@ -51,11 +51,11 @@ std::vector generate_ranks(bool local, int local_rank, // Initialize accl and required network kernels // If segsize == 0, the bufsize will be used as segment size instead std::unique_ptr -initialize_accl(const std::vector &ranks, int local_rank, +initialize_accl(std::vector &ranks, int local_rank, bool simulator, acclDesign design, xrt::device device = xrt::device(), - std::filesystem::path xclbin = "", int nbufs = 16, - ACCL::addr_t bufsize = 1024, ACCL::addr_t segsize = 0, + std::filesystem::path xclbin = "", unsigned int nbufs = 16, + unsigned int bufsize = 1024, unsigned int egrsize = 0, bool rsfec = false); // Configure the VNX kernel, this function is called by initialize_accl @@ -66,7 +66,13 @@ void configure_vnx(vnx::CMAC &cmac, vnx::Networklayer &network_layer, // Configure the TCP kernel, this function is called by initialize_accl void configure_tcp(ACCL::BaseBuffer &tx_buf_network, ACCL::BaseBuffer &rx_buf_network, xrt::kernel &network_krnl, xrt::kernel &session_krnl, - const std::vector &ranks, int local_rank); + std::vector &ranks, int local_rank); + +// Configure RDMA engine on Coyote +void configure_cyt_rdma(std::vector &ranks, int local_rank, ACCL::CoyoteDevice* device); + +// Configure TCP engine on Coyote +void configure_cyt_tcp(std::vector &ranks, int local_rank, ACCL::CoyoteDevice* device); // Get IPs from config file, 
this function is called by generate_ranks std::vector get_ips(std::filesystem::path config_file); diff --git a/driver/utils/accl_network_utils/src/accl_network_utils.cpp b/driver/utils/accl_network_utils/src/accl_network_utils.cpp index b01aa721..ad14c448 100644 --- a/driver/utils/accl_network_utils/src/accl_network_utils.cpp +++ b/driver/utils/accl_network_utils/src/accl_network_utils.cpp @@ -15,6 +15,7 @@ # *******************************************************************************/ #include +#include #include #ifdef ACCL_NETWORK_UTILS_MPI @@ -191,9 +192,9 @@ void configure_vnx(vnx::CMAC &cmac, vnx::Networklayer &network_layer, } } -void configure_tcp(FPGABuffer &tx_buf_network, FPGABuffer &rx_buf_network, +void configure_tcp(XRTBuffer &tx_buf_network, XRTBuffer &rx_buf_network, xrt::kernel &network_krnl, xrt::kernel &session_krnl, - const std::vector &ranks, int local_rank) { + std::vector &ranks, int local_rank) { tx_buf_network.sync_to_device(); rx_buf_network.sync_to_device(); @@ -211,27 +212,185 @@ void configure_tcp(FPGABuffer &tx_buf_network, FPGABuffer &rx_bu << std::dec << std::endl; log_debug(ss.str()); - //set up sessions for ranks - for(size_t i = 0; i < ranks.size(); ++i){ - bool success; - if (i == static_cast(local_rank)) { - continue; + MPI_Barrier(MPI_COMM_WORLD); + + // Set up ports for each [other] rank on each rank + for (int i = 0; i < ranks.size(); i++) { + uint8_t tmp_ret_code = 0; + uint16_t tmp_session_id = static_cast(ranks[i].session_id); + xrt::run run = session_krnl( + static_cast(ip_encode(ranks[i].ip)), + static_cast(ranks[i].port), + &tmp_session_id, + &tmp_ret_code, + tcpSessionHandlerOperation::OPEN_PORT + ); + run.wait(); + uint8_t ret_code = session_krnl.read_register(0x30); + if(!ret_code){ + throw std::runtime_error( + "Failed to open port: " + std::to_string(ranks[i].port) + + " from local rank: " + std::to_string(local_rank) + ); + } else { + std::cout << "Successfully opened port: " << std::to_string(ranks[i].port) << 
+ " from local rank: " << std::to_string(local_rank) << std::endl; } - session_krnl(ranks[i].ip, ranks[i].port, false, - &(ranks[i].session_id), &success); - if(!success){ - throw std::runtime_error("Failed to establish session for IP:"+ - ranks[i].ip+ - " port: "+ - std::to_string(ranks[i].port)); + } + + MPI_Barrier(MPI_COMM_WORLD); + + // Open TCP connections + for (int i = 0; i < ranks.size(); i++) { + if (i == local_rank) continue; + uint8_t tmp_ret_code = 0; + uint16_t tmp_session_id = static_cast(ranks[i].session_id); + xrt::run run = session_krnl( + static_cast(ip_encode(ranks[i].ip)), + static_cast(ranks[i].port), + &tmp_session_id, + &tmp_ret_code, + tcpSessionHandlerOperation::OPEN_CONNECTION + ); + run.wait(); + uint8_t ret_code = session_krnl.read_register(0x30); + uint8_t updated_sesion = session_krnl.read_register(0x28); + if(!ret_code){ + throw std::runtime_error( + "Failed to establish session for IP: " + ranks[i].ip + + " port: "+ std::to_string(ranks[i].port) + + " from local rank: " + std::to_string(local_rank) + ); + } else { + std::cout << "Successfully opened session: " << updated_sesion << + "with IP address: " << std::to_string(ranks[i].port) << + " from local rank: " << std::to_string(local_rank) << std::endl; } - std::ostringstream ss; - ss << "Established session ID: " << ranks[i].session_id << std::endl; - log_debug(ss.str()); } +} + +void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int local_rank, std::vector &ibvQpConn_vec, std::vector &ranks){ + + if (local_rank == master_rank) + { + std::cout<<"Local rank "<getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, slave_rank, 0, MPI_COMM_WORLD); + } + else if (local_rank == slave_rank) + { + std::cout<<"Local rank "<getQpairStruct()->remote = received_q; + } + + // Synchronize after the first exchange to avoid race conditions + MPI_Barrier(MPI_COMM_WORLD); + + if (local_rank == slave_rank) + { + std::cout<<"Local rank "<getQpairStruct()->local), 
sizeof(fpga::ibvQ), MPI_CHAR, master_rank, 0, MPI_COMM_WORLD); + } + else if (local_rank == master_rank) + { + std::cout<<"Local rank "<getQpairStruct()->remote = received_q; + } + + MPI_Barrier(MPI_COMM_WORLD); + + // write established connection to hardware and perform arp lookup + if (local_rank == master_rank) + { + int connection = (ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[slave_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16); + ibvQpConn_vec[slave_rank]->getQpairStruct()->print(); + ibvQpConn_vec[slave_rank]->setConnection(connection); + ibvQpConn_vec[slave_rank]->writeContext(ranks[slave_rank].port); + ibvQpConn_vec[slave_rank]->doArpLookup(); + ranks[slave_rank].session_id = ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn; + } else if (local_rank == slave_rank) + { + int connection = (ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[master_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16); + ibvQpConn_vec[master_rank]->getQpairStruct()->print(); + ibvQpConn_vec[master_rank]->setConnection(connection); + ibvQpConn_vec[master_rank]->writeContext(ranks[master_rank].port); + ibvQpConn_vec[master_rank]->doArpLookup(); + ranks[master_rank].session_id = ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn; + } + + MPI_Barrier(MPI_COMM_WORLD); +} +void configure_cyt_rdma(std::vector &ranks, int local_rank, ACCL::CoyoteDevice* device){ + + std::cout<<"Initializing QP connections..."< ibvQpConn_vec; + // create single page dummy memory space for each qp + uint32_t n_pages = 1; + for(int i=0; icoyote_qProc_vec[i], ranks[local_rank].ip, n_pages); + ibvQpConn_vec.push_back(qpConn); + // qpConn->getQpairStruct()->print(); + } + + std::cout<<"Exchanging QP..."< &ranks, int local_rank, ACCL::CoyoteDevice* device){ + std::cout<<"Configuring Coyote TCP..."<get_device()->doArpLookup(ip_encode(ranks[i].ip)); + } + } + + //open port + for (int i=0; iget_device()->tcpOpenPort(dstPort); 
+ } + + std::this_thread::sleep_for(10ms); + + //open con + for (int i=0; iget_device()->tcpOpenCon(dstIp, dstPort, &session); + ranks[i].session_id = session; + } + } + +} + + std::vector get_ips(fs::path config_file) { std::vector ips{}; Json::Value config; @@ -290,15 +449,17 @@ std::vector generate_ranks(bool local, int local_rank, int world_size, } std::unique_ptr -initialize_accl(const std::vector &ranks, int local_rank, +initialize_accl(std::vector &ranks, int local_rank, bool simulator, acclDesign design, xrt::device device, - fs::path xclbin, int nbufs, addr_t bufsize, addr_t segsize, - bool rsfec) { + fs::path xclbin, unsigned int nbufs, unsigned int bufsize, + unsigned int egrsize, bool rsfec) { std::size_t world_size = ranks.size(); std::unique_ptr accl; - if (segsize == 0) { - segsize = bufsize; + if (egrsize == 0) { + egrsize = bufsize; + } else if(egrsize > bufsize){ + bufsize = egrsize; } if (simulator) { @@ -342,13 +503,13 @@ initialize_accl(const std::vector &ranks, int local_rank, // Tx and Rx buffers will not be cleaned up properly and leak memory. // They need to live at least as long as ACCL so for now this is the best // we can do without requiring the users to allocate the buffers manually. 
- auto tx_buf_network = new FPGABuffer( + auto tx_buf_network = new XRTBuffer( 64 * 1024 * 1024, dataType::int8, device, networkmem); - auto rx_buf_network = new FPGABuffer( + auto rx_buf_network = new XRTBuffer( 64 * 1024 * 1024, dataType::int8, device, networkmem); auto network_krnl = - xrt::kernel(device, xclbin_uuid, "network_krnl:{network_krnl_0}", - xrt::kernel::cu_access_mode::exclusive); + xrt::kernel(device, xclbin_uuid, "network_krnl:{poe_0}", + xrt::kernel::cu_access_mode::exclusive); auto session_krnl = xrt::kernel(device, xclbin_uuid, "tcp_session_handler:{session_handler_0}", xrt::kernel::cu_access_mode::exclusive); @@ -358,7 +519,7 @@ initialize_accl(const std::vector &ranks, int local_rank, accl = std::make_unique(device, cclo_ip, hostctrl_ip, devicemem, rxbufmem); } - accl.get()->initialize(ranks, local_rank, nbufs, bufsize, segsize); + accl.get()->initialize(ranks, local_rank, nbufs, bufsize, egrsize, std::min(nbufs*bufsize, (unsigned int)4*1024*1024)); return accl; } } // namespace accl_network_utils diff --git a/driver/xrt/CMakeLists.txt b/driver/xrt/CMakeLists.txt index 561f54e9..f6ed4236 100644 --- a/driver/xrt/CMakeLists.txt +++ b/driver/xrt/CMakeLists.txt @@ -47,7 +47,7 @@ set(ACCL_HEADERS ${ACCL_HEADER_PATH}/constants.hpp ${ACCL_HEADER_PATH}/simdevice.hpp ${ACCL_HEADER_PATH}/simbuffer.hpp - ${ACCL_HEADER_PATH}/fpgadevice.hpp + ${ACCL_HEADER_PATH}/xrtdevice.hpp ${ACCL_HEADER_PATH}/acclrequest.hpp ) @@ -58,7 +58,7 @@ set(ACCL_SOURCES ${ACCL_SOURCE_PATH}/constants.cpp ${ACCL_SOURCE_PATH}/simdevice.cpp ${ACCL_SOURCE_PATH}/simbuffer.cpp - ${ACCL_SOURCE_PATH}/fpgadevice.cpp + ${ACCL_SOURCE_PATH}/xrtdevice.cpp ${ZMQ_INTF_DIR}/zmq_client.cpp ${ZMQ_INTF_DIR}/zmq_common.cpp ) @@ -120,7 +120,7 @@ target_link_libraries(accl PUBLIC xilinxopencl xrt_coreutil xrt_core) target_include_directories(accl PUBLIC $ENV{XILINX_XRT}/include) # ZMQ -target_link_libraries(accl PUBLIC zmqpp zmq pthread) +target_link_libraries(accl PUBLIC zmq pthread) # Json 
find_package(jsoncpp REQUIRED) diff --git a/driver/xrt/docs/cpp_reference/buffer.rst b/driver/xrt/docs/cpp_reference/buffer.rst index b2174a5b..7de4cdfd 100644 --- a/driver/xrt/docs/cpp_reference/buffer.rst +++ b/driver/xrt/docs/cpp_reference/buffer.rst @@ -28,15 +28,15 @@ ACCL::Buffer Hardware buffers ********************************** -ACCL::FPGABuffer +ACCL::XRTBuffer ================================== -.. doxygenclass:: ACCL::FPGABuffer +.. doxygenclass:: ACCL::XRTBuffer :project: ACCL :members: -ACCL::FPGABufferP2P +ACCL::XRTBufferP2P ================================== -.. doxygenclass:: ACCL::FPGABufferP2P +.. doxygenclass:: ACCL::XRTBufferP2P :project: ACCL :members: diff --git a/driver/xrt/docs/cpp_reference/cclo.rst b/driver/xrt/docs/cpp_reference/cclo.rst index 45d08295..b0a03478 100644 --- a/driver/xrt/docs/cpp_reference/cclo.rst +++ b/driver/xrt/docs/cpp_reference/cclo.rst @@ -22,10 +22,18 @@ ACCL::ArithConfig :members: ********************************** -ACCL::FPGADevice +ACCL::XRTDevice ********************************** -.. doxygenclass:: ACCL::FPGADevice +.. doxygenclass:: ACCL::XRTDevice + :project: ACCL + :members: + +********************************** +ACCL::CoyoteDevice +********************************** + +.. 
doxygenclass:: ACCL::CoyoteDevice :project: ACCL :members: diff --git a/driver/xrt/include/accl.hpp b/driver/xrt/include/accl.hpp index 10f38edb..a98667fd 100644 --- a/driver/xrt/include/accl.hpp +++ b/driver/xrt/include/accl.hpp @@ -24,9 +24,8 @@ #include "accl/cclo.hpp" #include "accl/communicator.hpp" #include "accl/constants.hpp" -#include "accl/fpgabuffer.hpp" -#include "accl/fpgabufferp2p.hpp" -#include "accl/fpgadevice.hpp" +#include "accl/xrtbuffer.hpp" +#include "accl/xrtdevice.hpp" #include "accl/simbuffer.hpp" #include "accl/simdevice.hpp" #include "accl/coyotebuffer.hpp" @@ -138,19 +137,6 @@ class ACCL { ACCLRequest *set_timeout(unsigned int value, bool run_async = false, std::vector waitfor = {}); - /** - * Set the threshold for eager/rendezvous decision. - * - * @param value Threshold in bytes - * @param run_async Run the ACCL call asynchronously. - * @param waitfor ACCL call will wait for these operations before it will - * start. Currently not implemented. - * @return CCLO* CCLO object that can be waited on and passed to waitfor; - * nullptr if run_async is false. - */ - ACCLRequest *set_rendezvous_threshold(unsigned int value, bool run_async = false, - std::vector waitfor = {}); - /** * Performs the nop operation on the FPGA. * @@ -760,7 +746,7 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM, int local_rank); /** - * Construct a new buffer object without an existing host buffer. + * Construct a new device buffer object without an existing host buffer. * * Note that when running in simulated mode, this constructor will not create * an underlying simulated BO buffer. If you need this functionality, use @@ -776,6 +762,31 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM, return create_buffer(length, type, _devicemem); } + /** + * Construct a new host buffer object. + * + * Note that when running in simulated mode, this constructor will not create + * an underlying simulated BO buffer. 
If you need this functionality, use + * create_buffer(xrt::bo &, size_t, dataType). + * + * @tparam dtype Datatype of the buffer. + * @param length Amount of elements to allocate for. + * @param type ACCL datatype of the buffer. + * @return std::unique_ptr> The allocated buffer. + */ + template + std::unique_ptr> create_buffer_host(size_t length, dataType type) { + if (sim_mode) { + return std::unique_ptr>(new SimBuffer( + length, type, static_cast(cclo)->get_context(), true)); + } else if (cclo->get_device_type() == CCLO::xrt_device) { + return std::unique_ptr>(new XRTBuffer( + length, type, *(static_cast(cclo)->get_device()), xrt::bo::flags::host_only, (xrt::memory_group)0)); + } else { + return std::unique_ptr>(new CoyoteBuffer(length, type, cclo)); + } + } + /** * Construct a new buffer object without an existing host buffer on the * specified memory bank. @@ -797,10 +808,10 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM, std::unique_ptr> create_buffer(size_t length, dataType type, unsigned mem_grp) { if (sim_mode) { return std::unique_ptr>(new SimBuffer( - length, type, static_cast(cclo)->get_context())); + length, type, static_cast(cclo)->get_context(), false, mem_grp)); } else if (cclo->get_device_type() == CCLO::xrt_device) { - return std::unique_ptr>(new FPGABuffer( - length, type, *(static_cast(cclo)->get_device()), (xrt::memory_group)mem_grp)); + return std::unique_ptr>(new XRTBuffer( + length, type, *(static_cast(cclo)->get_device()), (xrt::memory_group)mem_grp)); } else { return std::unique_ptr>(new CoyoteBuffer( length, type, cclo)); @@ -863,10 +874,10 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM, if (sim_mode) { return std::unique_ptr>( new SimBuffer(host_buffer, length, type, - static_cast(cclo)->get_context())); + static_cast(cclo)->get_context(), false, mem_grp)); } else if(cclo->get_device_type() == CCLO::xrt_device ){ - return std::unique_ptr>(new FPGABuffer( - host_buffer, length, type, 
*(static_cast(cclo)->get_device()), (xrt::memory_group)mem_grp)); + return std::unique_ptr>(new XRTBuffer( + host_buffer, length, type, *(static_cast(cclo)->get_device()), (xrt::memory_group)mem_grp)); } return std::unique_ptr>(nullptr); } @@ -898,7 +909,7 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM, static_cast(cclo)->get_context())); } else { return std::unique_ptr>( - new FPGABuffer(bo, length, type)); + new XRTBuffer(bo, length, type)); } } @@ -947,42 +958,14 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM, return std::unique_ptr>(new SimBuffer( length, type, static_cast(cclo)->get_context())); } else if(cclo->get_device_type() == CCLO::xrt_device ){ - return std::unique_ptr>(new FPGABufferP2P( - length, type, *(static_cast(cclo)->get_device()), (xrt::memory_group)mem_grp)); + return std::unique_ptr>(new XRTBuffer( + length, type, *(static_cast(cclo)->get_device()), xrt::bo::flags::p2p, (xrt::memory_group)mem_grp)); } else { //for Coyote there's no concept of a p2p buffer throw std::runtime_error("p2p buffers not supported in Coyote"); } } - /** - * Construct a new p2p buffer object from an existing P2P BO buffer. - * - * If you do not pass a non-P2P BO buffer, data will not be copied correctly - * from and to the FPGA. - * - * Will create a normal buffer when running in simulated mode. See the notes - * of create_buffer(xrt::bo &, size_t, dataType) about using BO buffers in - * simulated mode. - * - * @tparam dtype Datatype of the buffer. - * @param length Amount of elements to allocate for. - * @param type ACCL datatype of the buffer. - * @return std::unique_ptr> The allocated P2P buffer. 
- */ - template - std::unique_ptr> create_buffer_p2p(xrt::bo &bo, size_t length, - dataType type) { - if (sim_mode) { - return std::unique_ptr>( - new SimBuffer(bo, *(static_cast(cclo)->get_device()), length, type, - static_cast(cclo)->get_context())); - } else { - return std::unique_ptr>( - new FPGABufferP2P(bo, length, type)); - } - } - /** * Construct a new coyote buffer object without an existing host buffer * diff --git a/driver/xrt/include/accl/constants.hpp b/driver/xrt/include/accl/constants.hpp index 5bf6490a..a8db21b2 100644 --- a/driver/xrt/include/accl/constants.hpp +++ b/driver/xrt/include/accl/constants.hpp @@ -383,6 +383,12 @@ enum class errorCode { DMA_TAG_MISMATCH_ERROR = 1 << 26 }; +enum class tcpSessionHandlerOperation { + OPEN_PORT = 1, + OPEN_CONNECTION = 2, + CLOSE_CONNECTION = 3 +}; + /** Amount of bits used for error codes. */ const size_t error_code_bits = 26; diff --git a/driver/xrt/include/accl/fpgabufferp2p.hpp b/driver/xrt/include/accl/fpgabufferp2p.hpp deleted file mode 100644 index 40821c9b..00000000 --- a/driver/xrt/include/accl/fpgabufferp2p.hpp +++ /dev/null @@ -1,143 +0,0 @@ -/******************************************************************************* -# Copyright (C) 2022 Xilinx, Inc -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# -*******************************************************************************/ - -#pragma once -#include "buffer.hpp" -#include "common.hpp" -#include -#include -#include - -// Use posix_memalign if C++17 is not available -#if (__cplusplus >= 201703L) -#include -#else -#include -#endif - -#define ALIGNMENT 4096 - -/** @file fpgabufferp2p.hpp */ - -namespace ACCL { -/** - * A P2P buffer that is allocated on the FPGA and can be accessed from the host - * without explicit copying. - * - * @tparam dtype Datatype of the buffer. - */ -template class FPGABufferP2P : public Buffer { -public: - /** - * Construct a new FPGABufferP2P object from an existing P2P BO buffer. - * - * @param bo An existing P2P BO buffer. - * @param length Amount of elements in the P2P buffer. - * @param type ACCL datatype of the P2P buffer. - */ - FPGABufferP2P(xrt::bo &bo, addr_t length, dataType type) - : Buffer(bo.map(), length, type, bo.address()), _bo(bo) { - set_buffer(); - } - - /** - * Construct a new FPGABufferP2P object without any existing buffer. - * - * @param length Amount of elements to allocate for. - * @param type ACCL datatype of the P2P buffer. - * @param device Device to allocate the P2P buffer on. - * @param mem_grp Memory bank on device to allocate the P2P buffer on. - */ - FPGABufferP2P(addr_t length, dataType type, xrt::device &device, - xrt::memory_group mem_grp) - : Buffer(nullptr, length, type, 0x0), - _bo(device, length * sizeof(dtype), xrt::bo::flags::p2p, mem_grp) { - set_buffer(); - } - - /** - * Copy construct of a P2P buffer for internal use only. - * - */ - FPGABufferP2P(xrt::bo bo_, addr_t length, dataType type) - : Buffer(nullptr, length, type, 0x0), _bo(bo_) { - set_buffer(); - } - - /** - * Destroy the FPGABufferP2P object - * - */ - virtual ~FPGABufferP2P() {} - - /** - * Return the underlying P2P BO buffer. - * - * @return xrt::bo* The underlying P2P BO buffer. 
- */ - xrt::bo *bo() { return &_bo; } - - /** - * Check if the buffer is simulated, always false. - * - */ - bool is_simulated() const override { return false; } - - /** - * Check if the buffer is host-only, always false - * - */ - bool is_host_only() const override { return false; } - - /** - * Sync the data from the device back to the host, which is not required with - * a P2P buffer, so this function does nothing. - * - */ - void sync_from_device() override { - // Not applicable for p2p buffer - } - - /** - * Sync the data from the host to the device, which is not required with a P2P - * buffer, so this function does nothing. - * - */ - void sync_to_device() override { - // Not applicable for p2p buffer - } - - void free_buffer() override { return; } - - std::unique_ptr slice(size_t start, size_t end) override { - size_t start_bytes = start * sizeof(dtype); - size_t end_bytes = end * sizeof(dtype); - - return std::unique_ptr( - new FPGABufferP2P(xrt::bo(_bo, end_bytes - start_bytes, start_bytes), - end - start, this->_type)); - } - -private: - xrt::bo _bo; - - // Set the buffer after initialization since bo needs to be initialized first, - // but base constructor is called beforehand. - void set_buffer() { this->update_buffer(_bo.map(), _bo.address()); } -}; -} // namespace ACCL diff --git a/driver/xrt/include/accl/simbuffer.hpp b/driver/xrt/include/accl/simbuffer.hpp index adb06f95..582cf874 100644 --- a/driver/xrt/include/accl/simbuffer.hpp +++ b/driver/xrt/include/accl/simbuffer.hpp @@ -26,11 +26,21 @@ /** @file simbuffer.hpp */ -#define DEFAULT_SIMBUFFER_MEMGRP 0 +#define ACCL_SIM_DEFAULT_BANK 0 + +#ifndef ACCL_SIM_NUM_BANKS +#define ACCL_SIM_NUM_BANKS 1 +#endif + +#ifndef ACCL_SIM_MEM_SIZE_KB +#define ACCL_SIM_MEM_SIZE_KB (256*1024) +#endif namespace ACCL { -/** Stores the next free address on the simulated device. */ -extern addr_t next_free_address; +/** Stores the next free address on the simulated device. 
+ Multiple card memory banks and one host memory bank */ +extern addr_t next_free_card_address[ACCL_SIM_NUM_BANKS]; +extern addr_t next_free_host_address; /** * A buffer that is allocated on a external CCLO emulator or simulator with an @@ -53,6 +63,8 @@ template class SimBuffer : public Buffer { xrt::bo _bo; // Only set if constructed using bo. xrt::bo internal_copy_bo; // Used to sync bo over zmq xrt::device _device{}; // Used to create copy buffers + bool host; // Flag identifying host-side buffer + unsigned int memgrp; //bank of buffer has device-side image bool bo_valid{}; /** @@ -61,12 +73,31 @@ template class SimBuffer : public Buffer { * @param size Size of the buffer to allocate. * @return addr_t Next free address on the CCLO. */ - addr_t get_next_free_address(size_t size) { - addr_t address = next_free_address; + addr_t get_next_free_card_address(size_t size, unsigned int memgrp = ACCL_SIM_DEFAULT_BANK) { + if(memgrp > ACCL_SIM_NUM_BANKS){ + throw std::invalid_argument("Requested address in invalid memory bank"); + } + addr_t address = next_free_card_address[memgrp]; // allocate on 4K boundaries // not sure how realistic this is, but it does help // work around some addressing limitations in RTLsim - next_free_address += ((addr_t)std::ceil(size / 4096.0)) * 4096; + next_free_card_address[memgrp] += ((addr_t)std::ceil(size / 4096.0)) * 4096; + + return address + memgrp*ACCL_SIM_MEM_SIZE_KB*1024; + } + + /** + * Get the next free host address available. + * + * @param size Size of the buffer to allocate. + * @return addr_t Next free host address. 
+ */ + addr_t get_next_free_host_address(size_t size) { + addr_t address = next_free_host_address; + // allocate on 4K boundaries + // not sure how realistic this is, but it does help + // work around some addressing limitations in RTLsim + next_free_host_address += ((addr_t)std::ceil(size / 4096.0)) * 4096; return address; } @@ -88,7 +119,7 @@ template class SimBuffer : public Buffer { */ void allocate_buffer() { zmq_client_memalloc(this->zmq_ctx, (uint64_t)this->_address, - (unsigned int)this->_size); + (unsigned int)this->_size, this->host); } public: @@ -99,11 +130,18 @@ template class SimBuffer : public Buffer { * @param length Amount of elements in the existing host buffer. * @param type ACCL datatype of buffer. * @param context The zmq server of the CCLO to use. + * @param is_host The type of buffer to create. + * @param memgrp The bank in which to allocate the buffer. */ SimBuffer(dtype *buffer, size_t length, dataType type, - zmq_intf_context *const context) + zmq_intf_context *const context, + bool is_host = false, + unsigned int memgrp = ACCL_SIM_DEFAULT_BANK) : SimBuffer(buffer, length, type, context, - this->get_next_free_address(length * sizeof(dtype))) {} + is_host ? + this->get_next_free_host_address(length * sizeof(dtype)) : + this->get_next_free_card_address(length * sizeof(dtype), memgrp), + is_host, memgrp) {} /** * Construct a new simulated buffer from a simulated BO buffer. @@ -119,8 +157,12 @@ template class SimBuffer : public Buffer { SimBuffer(xrt::bo &bo, xrt::device &device, size_t length, dataType type, zmq_intf_context *const context) : SimBuffer(bo.map(), length, type, context, - this->get_next_free_address(length * sizeof(dtype)), bo, - device, true) {} + bo.get_flags() == xrt::bo::flags::host_only ? 
+ this->get_next_free_host_address(length * sizeof(dtype)) : + this->get_next_free_card_address(length * sizeof(dtype), ACCL_SIM_DEFAULT_BANK), + bo, device, + bo.get_flags() == xrt::bo::flags::host_only, ACCL_SIM_DEFAULT_BANK, + true) {} /** * Construct a new simulated buffer without an existing host pointer. @@ -128,25 +170,33 @@ template class SimBuffer : public Buffer { * @param length Amount of elements to allocate for. * @param type ACCL datatype of buffer. * @param context The zmq server of the CCLO to use. + * @param is_host The type of buffer to create. + * @param memgrp The bank in which to allocate the buffer. */ - SimBuffer(size_t length, dataType type, zmq_intf_context *const context) - : SimBuffer(create_internal_buffer(length), length, type, context) {} + SimBuffer(size_t length, dataType type, zmq_intf_context *const context, + bool is_host = false, + unsigned int memgrp = ACCL_SIM_DEFAULT_BANK) + : SimBuffer(create_internal_buffer(length), length, type, context, is_host, memgrp) {} /** * Construct a new simulated buffer from an existing host pointer at a * specific physical address. You should generally let ACCL itself decide * which physical address to use. * - * @param buffer Host buffer to use. - * @param length Amount of elements in host pointer. - * @param type ACCL datatype of buffer. - * @param context The zmq server of the CCLO to use. + * @param buffer Host buffer to use. + * @param length Amount of elements in host pointer. + * @param type ACCL datatype of buffer. + * @param context The zmq server of the CCLO to use. * @param address The physical address of the device buffer. + * @param is_host The type of buffer to create. + * @param memgrp The bank in which to allocate the buffer. 
*/ SimBuffer(dtype *buffer, size_t length, dataType type, - zmq_intf_context *const context, const addr_t address) + zmq_intf_context *const context, const addr_t address, + bool is_host = false, + unsigned int memgrp = ACCL_SIM_DEFAULT_BANK) : Buffer(buffer, length, type, address), zmq_ctx(context), - _bo(xrt::bo()) { + _bo(xrt::bo()), host(is_host), memgrp(memgrp) { allocate_buffer(); } @@ -157,14 +207,14 @@ template class SimBuffer : public Buffer { SimBuffer(dtype *buffer, size_t length, dataType type, zmq_intf_context *const context, const addr_t address, xrt::bo &bo, xrt::device &device, bool bo_valid_, + bool is_host, unsigned int memgrp, bool is_slice = false) : Buffer(buffer, length, type, address), zmq_ctx(context), - _bo(bo), _device(device), bo_valid(bo_valid_) { + _bo(bo), _device(device), bo_valid(bo_valid_), host(is_host), memgrp(memgrp) { if (bo_valid) { - internal_copy_bo = xrt::bo(_device, this->_size, - (xrt::memory_group)DEFAULT_SIMBUFFER_MEMGRP); + internal_copy_bo = xrt::bo(_device, this->_size, bo.get_flags(), this->memgrp); } - + allocate_buffer(); } @@ -204,7 +254,7 @@ template class SimBuffer : public Buffer { * Check if the buffer is host-only, always false in sim. * */ - bool is_host_only() const override { return false; } + bool is_host_only() const override { return host; } /** * Sync the user BO buffer to the simulated buffer. 
@@ -218,7 +268,7 @@ template class SimBuffer : public Buffer { internal_copy_bo.sync(xclBOSyncDirection::XCL_BO_SYNC_BO_FROM_DEVICE); zmq_client_memwrite(this->zmq_ctx, (uint64_t)this->_address, (unsigned int)this->_size, - internal_copy_bo.map()); + internal_copy_bo.map(), this->host); } } @@ -250,7 +300,7 @@ template class SimBuffer : public Buffer { sync_bo_to_device(); zmq_client_memread(this->zmq_ctx, (uint64_t)this->_address, (unsigned int)this->_size, - static_cast(this->_byte_array)); + static_cast(this->_byte_array), this->host); } /** @@ -262,7 +312,7 @@ template class SimBuffer : public Buffer { void sync_to_device() override { zmq_client_memwrite(this->zmq_ctx, (uint64_t)this->_address, (unsigned int)this->_size, - static_cast(this->_byte_array)); + static_cast(this->_byte_array), this->host); if (bo_valid) { _bo.sync(XCL_BO_SYNC_BO_TO_DEVICE); } @@ -283,7 +333,7 @@ template class SimBuffer : public Buffer { return std::unique_ptr(new SimBuffer( &this->_buffer[start], end - start, this->_type, this->zmq_ctx, - this->_address + start, bo_slice, _device, bo_valid, true)); + this->_address + start, bo_slice, _device, bo_valid, this->host, this->memgrp, true)); } }; } // namespace ACCL diff --git a/driver/xrt/include/accl/fpgabuffer.hpp b/driver/xrt/include/accl/xrtbuffer.hpp similarity index 72% rename from driver/xrt/include/accl/fpgabuffer.hpp rename to driver/xrt/include/accl/xrtbuffer.hpp index 1b6dc637..55f3b9bf 100644 --- a/driver/xrt/include/accl/fpgabuffer.hpp +++ b/driver/xrt/include/accl/xrtbuffer.hpp @@ -24,7 +24,7 @@ #include #include -/** @file fpgabuffer.hpp */ +/** @file xrtbuffer.hpp */ namespace ACCL { /** @@ -37,10 +37,10 @@ namespace ACCL { * * @tparam dtype Datatype of the buffer. */ -template class FPGABuffer : public Buffer { +template class XRTBuffer : public Buffer { public: /** - * Construct a new FPGABuffer object from an existing host pointer. + * Construct a new XRTBuffer object from an existing host pointer. 
* * If a non-aligned host pointer is provided, ACCL will keep it's own aligned * host buffer, and copy between the unaligned and aligned host buffers when @@ -51,9 +51,27 @@ template class FPGABuffer : public Buffer { * @param length Amount of elements in the host buffer. * @param type ACCL datatype of buffer. * @param device Device to allocate the buffer on. + * @param flags BO flags * @param mem_grp Memory bank on the device to allocate the buffer on. */ - FPGABuffer(dtype *buffer, addr_t length, dataType type, xrt::device &device, + XRTBuffer(dtype *buffer, addr_t length, dataType type, xrt::device &device, + xrt::bo::flags flags, xrt::memory_group mem_grp) + : Buffer(nullptr, length, type, 0x0), + _bo(device, get_aligned_buffer(buffer, length * sizeof(dtype)), + length * sizeof(dtype), flags, mem_grp) { + set_buffer(); + } + + /** + * Construct a new device-side XRTBuffer object from an existing host pointer. + * + * @param buffer The host pointer containing the data. + * @param length Amount of elements in the host buffer. + * @param type ACCL datatype of buffer. + * @param device Device to allocate the buffer on. + * @param mem_grp Memory bank on the device to allocate the buffer on. + */ + XRTBuffer(dtype *buffer, addr_t length, dataType type, xrt::device &device, xrt::memory_group mem_grp) : Buffer(nullptr, length, type, 0x0), _bo(device, get_aligned_buffer(buffer, length * sizeof(dtype)), @@ -62,7 +80,7 @@ template class FPGABuffer : public Buffer { } /** - * Construct a new FPGABuffer object from an existing BO buffer. + * Construct a new XRTBuffer object from an existing BO buffer. * * No new buffer is allocated when using this constructor, instead ACCL will * use the existing BO buffer. @@ -71,22 +89,40 @@ template class FPGABuffer : public Buffer { * @param length Amount of elements to allocate for. * @param type ACCL datatype of buffer. 
*/ - FPGABuffer(xrt::bo &bo, addr_t length, dataType type) + XRTBuffer(xrt::bo &bo, addr_t length, dataType type) : Buffer(bo.map(), length, type, bo.address()), _bo(bo) { set_buffer(); } /** - * Construct a new FPGABuffer object without an existing host pointer. + * Construct a new XRTBuffer object without an existing host pointer. + * + * This constructor will allocate on host and/or FPGA depending on flags. * - * This constructor will allocate a buffer on both the host and the FPGA. + * @param length Amount of elements to allocate the buffers for. + * @param type ACCL datatype of the buffer. + * @param device Device to allocate the FPGA buffer on. + * @param flags BO flags + * @param mem_grp Memory bank of the device to allocate the FPGA buffer on. + */ + XRTBuffer(addr_t length, dataType type, xrt::device &device, + xrt::bo::flags flags, xrt::memory_group mem_grp) + : Buffer(nullptr, length, type, 0x0), is_aligned(true), + _bo(device, length * sizeof(dtype), flags, mem_grp) { + set_buffer(); + // Initialize memory to zero + std::memset(this->_buffer, 0, this->_size); + } + + /** + * Construct a new XRTBuffer object on the FPGA without an existing host pointer. * * @param length Amount of elements to allocate the buffers for. * @param type ACCL datatype of the buffer. * @param device Device to allocate the FPGA buffer on. * @param mem_grp Memory bank of the device to allocate the FPGA buffer on. */ - FPGABuffer(addr_t length, dataType type, xrt::device &device, + XRTBuffer(addr_t length, dataType type, xrt::device &device, xrt::memory_group mem_grp) : Buffer(nullptr, length, type, 0x0), is_aligned(true), _bo(device, length * sizeof(dtype), mem_grp) { @@ -99,7 +135,7 @@ template class FPGABuffer : public Buffer { * Copy construct of an FPGA buffer for internal use only. 
* */ - FPGABuffer(xrt::bo bo_, addr_t length, dataType type, bool is_aligned_, + XRTBuffer(xrt::bo bo_, addr_t length, dataType type, bool is_aligned_, dtype *unaligned_buffer_) : Buffer(nullptr, length, type, 0x0), is_aligned(is_aligned_), _bo(bo_), aligned_buffer(_bo.map()), @@ -108,10 +144,10 @@ template class FPGABuffer : public Buffer { } /** - * Destroy the FPGABuffer object + * Destroy the XRTBuffer object * */ - virtual ~FPGABuffer() { + virtual ~XRTBuffer() { // Only free the aligned buffer if it exists and we own it (might not be // the case if this is a slice). if (!is_aligned && own_unaligned) { @@ -144,10 +180,12 @@ template class FPGABuffer : public Buffer { /** * Sync the data from the device back to the host. Will copy the data from * the aligned buffer to the unaligned buffer if an unaligned buffer was used - * during construction of the FPGABuffer. + * during construction of the XRTBuffer. * */ void sync_from_device() override { + auto flags = _bo.get_flags(); + if(flags == xrt::bo::flags::p2p) return; _bo.sync(xclBOSyncDirection::XCL_BO_SYNC_BO_FROM_DEVICE); if (!is_aligned) { std::memcpy(unaligned_buffer, aligned_buffer, this->size()); @@ -157,10 +195,12 @@ template class FPGABuffer : public Buffer { /** * Sync the data from the host to the device. Will copy the data from the * unaligned buffer to the aligned buffer first if an unaligned buffer was - * used during construction of the FPGABuffer. + * used during construction of the XRTBuffer. 
* */ void sync_to_device() override { + auto flags = _bo.get_flags(); + if(flags == xrt::bo::flags::p2p) return; if (!is_aligned) { std::memcpy(aligned_buffer, unaligned_buffer, this->size()); } @@ -178,7 +218,7 @@ template class FPGABuffer : public Buffer { offset_unaligned_buffer = &unaligned_buffer[start]; } - return std::unique_ptr(new FPGABuffer( + return std::unique_ptr(new XRTBuffer( xrt::bo(_bo, end_bytes - start_bytes, start_bytes), end - start, this->_type, this->is_aligned, offset_unaligned_buffer)); } diff --git a/driver/xrt/include/accl/fpgadevice.hpp b/driver/xrt/include/accl/xrtdevice.hpp similarity index 93% rename from driver/xrt/include/accl/fpgadevice.hpp rename to driver/xrt/include/accl/xrtdevice.hpp index 87bc3021..5b188b43 100644 --- a/driver/xrt/include/accl/fpgadevice.hpp +++ b/driver/xrt/include/accl/xrtdevice.hpp @@ -25,7 +25,7 @@ #include #include -/** @file fpgadevice.hpp */ +/** @file xrtdevice.hpp */ namespace ACCL { /** @@ -64,22 +64,22 @@ class FPGARequest : public BaseRequest { * Implementation of CCLO that uses a CCLO kernel on a FPGA. * */ -class FPGADevice : public CCLO { +class XRTDevice : public CCLO { public: /** - * Construct a new FPGADevice object + * Construct a new XRTDevice object * * @param cclo_ip The CCLO kernel to use. * @param hostctrl_ip The hostctrl kernel to use. 
* @param device Xrt device; */ - FPGADevice(xrt::ip &cclo_ip, xrt::kernel &hostctrl_ip, xrt::device &device); + XRTDevice(xrt::ip &cclo_ip, xrt::kernel &hostctrl_ip, xrt::device &device); /** - * Destroy the FPGADevice object + * Destroy the XRTDevice object * */ - virtual ~FPGADevice() {} + virtual ~XRTDevice() {} ACCLRequest *call(const Options &options) override; diff --git a/driver/xrt/src/accl.cpp b/driver/xrt/src/accl.cpp index 34052a26..9e98efb7 100644 --- a/driver/xrt/src/accl.cpp +++ b/driver/xrt/src/accl.cpp @@ -32,7 +32,7 @@ ACCL::ACCL(xrt::device &device, xrt::ip &cclo_ip, xrt::kernel &hostctrl_ip, const arithConfigMap &arith_config) : arith_config(arith_config), sim_mode(false), _devicemem(devicemem), rxbufmem(rxbufmem) { - cclo = new FPGADevice(cclo_ip, hostctrl_ip, device); + cclo = new XRTDevice(cclo_ip, hostctrl_ip, device); } // Simulation constructor @@ -104,22 +104,6 @@ ACCLRequest *ACCL::set_timeout(unsigned int value, bool run_async, return handle; } -ACCLRequest *ACCL::set_rendezvous_threshold(unsigned int value, bool run_async, - std::vector waitfor) { - CCLO::Options options{}; - options.scenario = operation::config; - options.count = value; - options.cfg_function = cfgFunc::set_max_eager_msg_size; - ACCLRequest *handle = call_async(options); - - if (!run_async) { - wait(handle); - check_return_value("set_max_eager_msg_size", handle); - } - - return handle; -} - ACCLRequest *ACCL::nop(bool run_async, std::vector waitfor) { CCLO::Options options{}; options.scenario = operation::nop; @@ -1156,7 +1140,7 @@ void ACCL::setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, buf = new SimBuffer(new int8_t[eager_rx_buffer_size](), eager_rx_buffer_size, dataType::int8, static_cast(cclo)->get_context()); } else if(cclo->get_device_type() == CCLO::xrt_device ){ - buf = new FPGABuffer(eager_rx_buffer_size, dataType::int8, *(static_cast(cclo)->get_device()), devicemem[i % devicemem.size()]); + buf = new XRTBuffer(eager_rx_buffer_size, 
dataType::int8, *(static_cast(cclo)->get_device()), devicemem[i % devicemem.size()]); } else if(cclo->get_device_type() == CCLO::coyote_device){ buf = new CoyoteBuffer(eager_rx_buffer_size, dataType::int8, static_cast(cclo)); } @@ -1195,8 +1179,8 @@ void ACCL::setup_rendezvous_spare_buffers(addr_t rndzv_spare_buf_size, const std buf = new SimBuffer(new int8_t[max_rndzv_msg_size](), max_rndzv_msg_size, dataType::int8, static_cast(cclo)->get_context()); } else if(cclo->get_device_type() == CCLO::xrt_device ){ - buf = new FPGABuffer(max_rndzv_msg_size, dataType::int8, - *(static_cast(cclo)->get_device()), devicemem[i % devicemem.size()]); + buf = new XRTBuffer(max_rndzv_msg_size, dataType::int8, + *(static_cast(cclo)->get_device()), devicemem[i % devicemem.size()]); } else if(cclo->get_device_type() == CCLO::coyote_device){ buf = new CoyoteBuffer(max_rndzv_msg_size, dataType::int8, static_cast(cclo)); } diff --git a/driver/xrt/src/simbuffer.cpp b/driver/xrt/src/simbuffer.cpp index 0381e09c..543460f7 100644 --- a/driver/xrt/src/simbuffer.cpp +++ b/driver/xrt/src/simbuffer.cpp @@ -20,5 +20,6 @@ namespace ACCL { /* Can't define variable in header */ - addr_t next_free_address = 0x0; + addr_t next_free_host_address = 0x0; + addr_t next_free_card_address[ACCL_SIM_NUM_BANKS] = {0x0}; } diff --git a/driver/xrt/src/simdevice.cpp b/driver/xrt/src/simdevice.cpp index d69f12ea..b0519f0d 100644 --- a/driver/xrt/src/simdevice.cpp +++ b/driver/xrt/src/simdevice.cpp @@ -50,12 +50,14 @@ void SimRequest::start() { function = static_cast(options.reduce_function); } + uint32_t flags = static_cast(options.host_flags) << 8 | static_cast(options.stream_flags); + zmq_client_startcall( reinterpret_cast(cclo_ptr)->get_context(), static_cast(options.scenario), options.tag, options.count, options.comm, options.root_src_dst, function, options.arithcfg_addr, static_cast(options.compression_flags), - static_cast(options.stream_flags), + static_cast(flags), options.addr_0->address(), 
options.addr_1->address(), options.addr_2->address()); diff --git a/driver/xrt/src/fpgadevice.cpp b/driver/xrt/src/xrtdevice.cpp similarity index 91% rename from driver/xrt/src/fpgadevice.cpp rename to driver/xrt/src/xrtdevice.cpp index a920e94f..8d93c464 100644 --- a/driver/xrt/src/fpgadevice.cpp +++ b/driver/xrt/src/xrtdevice.cpp @@ -16,14 +16,14 @@ # *******************************************************************************/ -#include "accl/fpgadevice.hpp" +#include "accl/xrtdevice.hpp" #include "accl/common.hpp" #include #include static void finish_fpga_request(ACCL::FPGARequest *req) { req->wait_kernel(); - ACCL::FPGADevice *cclo = reinterpret_cast(req->cclo()); + ACCL::XRTDevice *cclo = reinterpret_cast(req->cclo()); // get ret code before notifying waiting theads req->set_retcode(cclo->read(ACCL::CCLO_ADDR::RETCODE_OFFSET)); req->set_duration(cclo->read(ACCL::CCLO_ADDR::PERFCNT_OFFSET)); @@ -43,13 +43,14 @@ void FPGARequest::start() { } else { function = static_cast(options.reduce_function); } + uint32_t flags = static_cast(options.host_flags) << 8 | static_cast(options.stream_flags); switch(options.scenario) { case ACCL::operation::copy: run.set_arg(ACCL::XRT_ARG_ID::SCENARIO_ID, static_cast(options.scenario)); run.set_arg(ACCL::XRT_ARG_ID::COUNT_ID, static_cast(options.count)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); break; @@ -59,7 +60,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::FUNCTION_ID, static_cast(function)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, 
static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_1_ID, static_cast(options.addr_1->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); @@ -72,7 +73,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::TAG_ID, static_cast(options.tag)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); break; case ACCL::operation::recv: @@ -83,7 +84,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::TAG_ID, static_cast(options.tag)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); break; case ACCL::operation::bcast: @@ -93,7 +94,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::ROOT_SRC_DST_ID, static_cast(options.root_src_dst)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - 
run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); break; case ACCL::operation::scatter: @@ -103,7 +104,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::ROOT_SRC_DST_ID, static_cast(options.root_src_dst)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); break; @@ -114,7 +115,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::ROOT_SRC_DST_ID, static_cast(options.root_src_dst)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); break; @@ -126,7 +127,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::FUNCTION_ID, static_cast(function)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); 
run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); break; @@ -136,7 +137,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::COMM_ID, static_cast(options.comm)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); break; @@ -147,7 +148,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::FUNCTION_ID, static_cast(function)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); break; @@ -158,7 +159,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::FUNCTION_ID, static_cast(function)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, 
static_cast(options.addr_2->address())); break; @@ -174,7 +175,7 @@ void FPGARequest::start() { run.set_arg(ACCL::XRT_ARG_ID::COMM_ID, static_cast(options.comm)); run.set_arg(ACCL::XRT_ARG_ID::ARITHCFG_ADDR_ID, static_cast(options.arithcfg_addr)); run.set_arg(ACCL::XRT_ARG_ID::COMPRESSION_FLAGS_ID, static_cast(options.compression_flags)); - run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(options.stream_flags)); + run.set_arg(ACCL::XRT_ARG_ID::STREAM_FLAGS_ID, static_cast(flags)); run.set_arg(ACCL::XRT_ARG_ID::ADDR_0_ID, static_cast(options.addr_0->address())); run.set_arg(ACCL::XRT_ARG_ID::ADDR_2_ID, static_cast(options.addr_2->address())); break; @@ -190,11 +191,11 @@ void FPGARequest::start() { run.start(); } -ACCLRequest *FPGADevice::start(const Options &options) { +ACCLRequest *XRTDevice::start(const Options &options) { ACCLRequest *request = new ACCLRequest; if (options.waitfor.size() != 0) { - throw std::runtime_error("FPGADevice does not support chaining"); + throw std::runtime_error("XRTDevice does not support chaining"); } FPGARequest *fpga_handle = @@ -210,16 +211,16 @@ ACCLRequest *FPGADevice::start(const Options &options) { return request; } -FPGADevice::FPGADevice(xrt::ip &cclo_ip, xrt::kernel &hostctrl_ip, xrt::device &device) +XRTDevice::XRTDevice(xrt::ip &cclo_ip, xrt::kernel &hostctrl_ip, xrt::device &device) : cclo(cclo_ip), hostctrl(hostctrl_ip), device(device) {} -void FPGADevice::wait(ACCLRequest *request) { +void XRTDevice::wait(ACCLRequest *request) { auto fpga_handle = request_map.find(*request); if (fpga_handle != request_map.end()) fpga_handle->second->wait(); } -timeoutStatus FPGADevice::wait(ACCLRequest *request, +timeoutStatus XRTDevice::wait(ACCLRequest *request, std::chrono::milliseconds timeout) { auto fpga_handle = request_map.find(*request); @@ -229,7 +230,7 @@ timeoutStatus FPGADevice::wait(ACCLRequest *request, return timeoutStatus::timeout; } -bool FPGADevice::test(ACCLRequest *request) { +bool 
XRTDevice::test(ACCLRequest *request) { auto fpga_handle = request_map.find(*request); if (fpga_handle == request_map.end()) @@ -238,7 +239,7 @@ bool FPGADevice::test(ACCLRequest *request) { return fpga_handle->second->get_status() == operationStatus::COMPLETED; } -uint64_t FPGADevice::get_duration(ACCLRequest *request) { +uint64_t XRTDevice::get_duration(ACCLRequest *request) { auto handle = request_map.find(*request); if (handle == request_map.end()) @@ -247,7 +248,7 @@ uint64_t FPGADevice::get_duration(ACCLRequest *request) { return handle->second->get_duration() * 4; } -void FPGADevice::free_request(ACCLRequest *request) { +void XRTDevice::free_request(ACCLRequest *request) { auto fpga_handle = request_map.find(*request); if (fpga_handle != request_map.end()) { @@ -256,7 +257,7 @@ void FPGADevice::free_request(ACCLRequest *request) { } } -ACCLRequest *FPGADevice::call(const Options &options) { +ACCLRequest *XRTDevice::call(const Options &options) { ACCLRequest *req = start(options); wait(req); @@ -264,11 +265,11 @@ ACCLRequest *FPGADevice::call(const Options &options) { return req; } -CCLO::deviceType FPGADevice::get_device_type() { +CCLO::deviceType XRTDevice::get_device_type() { return CCLO::xrt_device; } -val_t FPGADevice::get_retcode(ACCLRequest *request) { +val_t XRTDevice::get_retcode(ACCLRequest *request) { auto fpga_handle = request_map.find(*request); if (fpga_handle != request_map.end()) @@ -277,13 +278,13 @@ val_t FPGADevice::get_retcode(ACCLRequest *request) { return fpga_handle->second->get_retcode(); } -val_t FPGADevice::read(addr_t offset) { return cclo.read_register(offset); } +val_t XRTDevice::read(addr_t offset) { return cclo.read_register(offset); } -void FPGADevice::write(addr_t offset, val_t val) { +void XRTDevice::write(addr_t offset, val_t val) { return cclo.write_register(offset, val); } -void FPGADevice::launch_request() { +void XRTDevice::launch_request() { // This guarantees permission to only one thread trying to start an operation 
if (queue.run()) { FPGARequest *req = queue.front(); @@ -293,7 +294,7 @@ void FPGADevice::launch_request() { } } -void FPGADevice::complete_request(FPGARequest *request) { +void XRTDevice::complete_request(FPGARequest *request) { if (request->get_status() == operationStatus::COMPLETED) { queue.pop(); launch_request(); diff --git a/kernels/cclo/Makefile b/kernels/cclo/Makefile index 4014d794..dc76095c 100644 --- a/kernels/cclo/Makefile +++ b/kernels/cclo/Makefile @@ -18,6 +18,7 @@ PLATFORM ?= xilinx_u280_xdma_201920_3 HW_DEBUG ?= none STACK_TYPE ?= UDP +MODE ?= xo EN_DMA ?= 1 EN_ARITH ?= 1 EN_COMPRESS ?= 1 @@ -25,24 +26,32 @@ EN_EXT_KRNL ?= 1 MB_DEBUG_LEVEL ?= 0 SIM_MEM_SIZE_LOG ?= 28 SIM_MEM_LATENCY ?= 50 -CCLO_ELF=vitis_ws/ccl_offload_control/Debug/ccl_offload_control.elf -CCLO_SIMDLL=ccl_offload_ex/ccl_offload_ex.sim/sim_1/behav/xsim/xsim.dir/ccl_offload_behav/xsimk.so -CCLO_XSA=ccl_offload_ex/ccl_offload.xsa -CCLO_DCP=ccl_offload_ex/ccl_offload_ex.runs/synth_1/packaged.dcp -CCLO_XO=ccl_offload.xo FW_SOURCES = $(shell find fw -name '*.c') $(shell find fw -name '*.h') $(shell find fw -name '*.tcl') +ifeq ($(MODE), simdll) + EN_DMA=1 + EN_ARITH=1 + EN_COMPRESS=1 + EN_EXT_KRNL=1 + MB_DEBUG_LEVEL=0 +endif + ifeq (u250,$(findstring u250, $(PLATFORM))) FPGAPART=xcu250-figd2104-2L-e + BOARD=u250 else ifeq (u280,$(findstring u280, $(PLATFORM))) FPGAPART=xcu280-fsvh2892-2L-e + BOARD=u280 else ifeq (u55c,$(findstring u55c, $(PLATFORM))) FPGAPART=xcu55c-fsvh2892-2L-e + BOARD=u55c else ifeq (u200,$(findstring u200, $(PLATFORM))) FPGAPART=xcu200-fsgd2104-2-e + BOARD=u200 else ifeq (u50,$(findstring u50, $(PLATFORM))) FPGAPART=xcu50-fsvh2104-2-e + BOARD=u50 else $(error Unsupported PLATFORM) endif @@ -51,14 +60,34 @@ COMMIT_HASH := 0x$(shell git rev-parse HEAD | cut -c 1-6) GEN_KERNEL_TCL := tcl/generate_kernel.tcl REBUILD_BD_TCL := tcl/rebuild_bd.tcl tcl/control_bd.tcl tcl/rx_bd.tcl tcl/tx_bd.tcl -all: $(CCLO_XO) +ifeq ($(MODE), simdll) + FPGAPART=xcu280-fsvh2892-2L-e + 
BOARD=u280 + BUILD_FOLDER = $(STACK_TYPE)_sim +else + BUILD_FOLDER = $(STACK_TYPE)_$(EN_DMA)$(EN_ARITH)$(EN_COMPRESS)$(EN_EXT_KRNL)$(MB_DEBUG_LEVEL)_$(FPGAPART) +endif + +CCLO_ELF=$(BUILD_FOLDER)/vitis_ws/ccl_offload_control/Debug/ccl_offload_control.elf +CCLO_SIMDLL=$(BUILD_FOLDER)/ccl_offload_ex/ccl_offload_ex.sim/sim_1/behav/xsim/xsim.dir/ccl_offload_behav/xsimk.so +CCLO_XSA=$(BUILD_FOLDER)/ccl_offload.xsa +CCLO_DCP=$(BUILD_FOLDER)/ccl_offload_ex/ccl_offload_ex.runs/synth_1/packaged.dcp +CCLO_XO=$(BUILD_FOLDER)/ccl_offload.xo + +OUTPUT_PRODUCT=$(CCLO_XO) +ifeq ($(MODE), simdll) + OUTPUT_PRODUCT=$(CCLO_SIMDLL) +endif + +all: $(OUTPUT_PRODUCT) .PHONY: xsa xsa: $(CCLO_XSA) $(CCLO_XSA): $(GEN_KERNEL_TCL) $(REBUILD_BD_TCL) $(MAKE) -C hls/ DEVICE=$(FPGAPART) - vivado -mode batch -source $< -tclargs $(FPGAPART) $(HW_DEBUG) $(CCLO_XSA) $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) $(MB_DEBUG_LEVEL) $(COMMIT_HASH) + mkdir -p $(BUILD_FOLDER) + cd $(BUILD_FOLDER) && vivado -mode batch -source ../$< -tclargs $(FPGAPART) $(HW_DEBUG) ./ccl_offload.xsa $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) $(MB_DEBUG_LEVEL) $(COMMIT_HASH) OPTIMIZATION=3 #valid values 0,1,2,3,s,g @@ -74,36 +103,34 @@ else ifeq ($(OPTIMIZATION),s) OPTIMIZATION_STRING = "Optimize for size (-Os)" endif -.PHONY: elf simdll xo clean dcp +.PHONY: elf xo clean dcp elf: $(CCLO_ELF) $(CCLO_ELF): tcl/generate_sw.tcl tcl/associate_elf.tcl $(FW_SOURCES) $(CCLO_XSA) - rm -rf vitis_ws && mkdir vitis_ws - xsct $< ccl_offload $(CCLO_XSA) ./fw $(OPTIMIZATION_STRING) - vivado -mode batch -source tcl/associate_elf.tcl -tclargs $(CCLO_ELF) - -simdll: $(CCLO_SIMDLL) + cd $(BUILD_FOLDER) && xsct ../tcl/generate_sw.tcl ccl_offload ./ccl_offload.xsa ../fw $(OPTIMIZATION_STRING) + cd $(BUILD_FOLDER) && vivado -mode batch -source ../tcl/associate_elf.tcl -tclargs vitis_ws/ccl_offload_control/Debug/ccl_offload_control.elf $(CCLO_SIMDLL): tcl/generate_sim.tcl elf - $(MAKE) -C ../plugins 
PLATFORM=$(PLATFORM) - vivado -mode batch -source $< -tclargs $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) $(SIM_MEM_SIZE_LOG) $(SIM_MEM_LATENCY) - cd ccl_offload_ex/ccl_offload_ex.sim/sim_1/behav/xsim/ && ./compile.sh && ./elaborate.sh + $(MAKE) -C ../plugins DEVICE=$(FPGAPART) + cd $(BUILD_FOLDER) && vivado -mode batch -source ../tcl/generate_sim.tcl -tclargs $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) $(SIM_MEM_SIZE_LOG) $(SIM_MEM_LATENCY) + cd $(BUILD_FOLDER)/ccl_offload_ex/ccl_offload_ex.sim/sim_1/behav/xsim/ && ./compile.sh && ./elaborate.sh dcp: $(CCLO_DCP) $(CCLO_DCP): tcl/generate_dcp.tcl $(CCLO_ELF) - vivado -mode batch -source $< -tclargs $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) $(MB_DEBUG_LEVEL) + cd $(BUILD_FOLDER) && vivado -mode batch -source ../tcl/generate_dcp.tcl -tclargs $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) $(MB_DEBUG_LEVEL) xo: $(CCLO_XO) -ccl_offload.xml: +$(BUILD_FOLDER)/ccl_offload.xml: python3 gen_xml.py $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) + mkdir -p $(BUILD_FOLDER) + mv ccl_offload.xml $@ -$(CCLO_XO): tcl/generate_xo.tcl ccl_offload.xml $(CCLO_DCP) - rm -rf ccl_offload_ex/ccl_offload - vivado -mode batch -source $< -tclargs $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) $(MB_DEBUG_LEVEL) +$(CCLO_XO): tcl/generate_xo.tcl $(BUILD_FOLDER)/ccl_offload.xml $(CCLO_DCP) + rm -rf $(BUILD_FOLDER)/ccl_offload_ex/ccl_offload + cd $(BUILD_FOLDER) && vivado -mode batch -source ../tcl/generate_xo.tcl -tclargs $(STACK_TYPE) $(EN_DMA) $(EN_ARITH) $(EN_COMPRESS) $(EN_EXT_KRNL) $(MB_DEBUG_LEVEL) clean: - $(MAKE) -C hls/ clean - rm -rf ccl_offload_ex ccl_offload.xml ccl_offload.xo vitis_ws .Xil *.jou *.log + git clean -xfd diff --git a/kernels/cclo/fw/sw_apps/ccl_offload_control/src/ccl_offload_control.c b/kernels/cclo/fw/sw_apps/ccl_offload_control/src/ccl_offload_control.c index c733ec43..eff26184 100755 --- 
a/kernels/cclo/fw/sw_apps/ccl_offload_control/src/ccl_offload_control.c +++ b/kernels/cclo/fw/sw_apps/ccl_offload_control/src/ccl_offload_control.c @@ -367,8 +367,6 @@ int rendezvous_get_any_completion(unsigned int *target_rank, uint64_t *target_ad putd(CMD_RNDZV_PENDING, addrh); putd(CMD_RNDZV_PENDING, host); putd(CMD_RNDZV_PENDING, count); - putd(CMD_RNDZV_PENDING, host); - putd(CMD_RNDZV_PENDING, count); } else { num_rndzv_pending--; *target_rank = rank; @@ -1588,6 +1586,8 @@ int reduce( unsigned int count, while(rendezvous_get_addr(root_rank, &buf_addr, &dst_host, count, TAG_ANY) == NOT_READY_ERROR); if(dst_host){ host |= RES_HOST; + } else { + host &= ~RES_HOST; } //do a RDMA write to the remote address return move( @@ -1770,13 +1770,13 @@ int reduce_scatter( //copy the OP0_HOST flag over RES_HOST //because we're broadcasting from the allreduce result buffer unsigned int r_host = (host & OP0_HOST) | ((host & OP0_HOST) << 2); - unsigned int r_buftype = (buftype & 0xFFFFFF00) | (r_host & 0xFF); + unsigned int r_buftype = (buftype & 0xFFFF00FF) | ((r_host & 0xFF) << 8); //reduce step - we reduce back into src_buf_addr while(reduce(count*world.size, func, 0, src_buf_addr, src_buf_addr, comm_offset, arcfg_offset, compression, r_buftype) == NOT_READY_ERROR); //copy the RES_HOST flag over OP0_HOST //because we're broadcasting from the allreduce result buffer host = (host & RES_HOST) | ((host & RES_HOST) >> 2); - buftype = (buftype & 0xFFFFFF00) | (host & 0xFF); + buftype = (buftype & 0xFFFF00FF) | ((host & 0xFF) << 8); //broadcast step while(scatter(count, 0, src_buf_addr, dst_buf_addr, comm_offset, arcfg_offset, compression, buftype) == NOT_READY_ERROR); } else { @@ -1874,7 +1874,7 @@ int allreduce( if(world.size == 1){ //corner-case copy for when running a single-node reduction - return copy(count, src_buf_addr, dst_buf_addr, arcfg_offset, compression, stream); + return copy(count, src_buf_addr, dst_buf_addr, arcfg_offset, compression, buftype); } else 
if((bytes_count > max_eager_size) && (compression == NO_COMPRESSION) && (stream == NO_STREAM)){ //allreduce via reduction+broadcast //reduce step @@ -1882,7 +1882,7 @@ int allreduce( //copy the RES_HOST flag over OP0_HOST //because we're broadcasting from the allreduce result buffer host = (host & RES_HOST) | ((host & RES_HOST) >> 2); - buftype = (buftype & 0xFFFFFF00) | (host & 0xFF); + buftype = (buftype & 0xFFFF00FF) | ((host & 0xFF) << 8); //broadcast step while(broadcast(count, 0, dst_buf_addr, comm_offset, arcfg_offset, compression, buftype) == NOT_READY_ERROR); } else { @@ -2136,7 +2136,7 @@ int all_to_all( if(world.size == 1){ //corner-case single-node alltoall - return copy(count, src_buf_addr, dst_buf_addr, arcfg_offset, compression, stream); + return copy(count, src_buf_addr, dst_buf_addr, arcfg_offset, compression, buftype); } else if(/*(bytes_count > max_eager_size) && */(compression == NO_COMPRESSION) && (stream == NO_STREAM)){ //alltoall via simultaneous broadcast //since in alltoall each endpoint must receive P-1 messages (where P is the world size) diff --git a/kernels/cclo/hdl/ccl_offload.v b/kernels/cclo/hdl/ccl_offload.v index 7efb5103..8fbea965 100644 --- a/kernels/cclo/hdl/ccl_offload.v +++ b/kernels/cclo/hdl/ccl_offload.v @@ -340,70 +340,6 @@ module ccl_offload `endif `ifdef DMA_ENABLE - .m_axi_0_araddr(m_axi_0_araddr), - .m_axi_0_arburst(m_axi_0_arburst), - .m_axi_0_arcache(m_axi_0_arcache), - .m_axi_0_arlen(m_axi_0_arlen), - .m_axi_0_arprot(m_axi_0_arprot), - .m_axi_0_arready(m_axi_0_arready), - .m_axi_0_arsize(m_axi_0_arsize), - .m_axi_0_aruser(m_axi_0_aruser), - .m_axi_0_arvalid(m_axi_0_arvalid), - .m_axi_0_awaddr(m_axi_0_awaddr), - .m_axi_0_awburst(m_axi_0_awburst), - .m_axi_0_awcache(m_axi_0_awcache), - .m_axi_0_awlen(m_axi_0_awlen), - .m_axi_0_awprot(m_axi_0_awprot), - .m_axi_0_awready(m_axi_0_awready), - .m_axi_0_awsize(m_axi_0_awsize), - .m_axi_0_awuser(m_axi_0_awuser), - .m_axi_0_awvalid(m_axi_0_awvalid), - 
.m_axi_0_bready(m_axi_0_bready), - .m_axi_0_bresp(m_axi_0_bresp), - .m_axi_0_bvalid(m_axi_0_bvalid), - .m_axi_0_rdata(m_axi_0_rdata), - .m_axi_0_rlast(m_axi_0_rlast), - .m_axi_0_rready(m_axi_0_rready), - .m_axi_0_rresp(m_axi_0_rresp), - .m_axi_0_rvalid(m_axi_0_rvalid), - .m_axi_0_wdata(m_axi_0_wdata), - .m_axi_0_wlast(m_axi_0_wlast), - .m_axi_0_wready(m_axi_0_wready), - .m_axi_0_wstrb(m_axi_0_wstrb), - .m_axi_0_wvalid(m_axi_0_wvalid), - - .m_axi_1_araddr(m_axi_1_araddr), - .m_axi_1_arburst(m_axi_1_arburst), - .m_axi_1_arcache(m_axi_1_arcache), - .m_axi_1_arlen(m_axi_1_arlen), - .m_axi_1_arprot(m_axi_1_arprot), - .m_axi_1_arready(m_axi_1_arready), - .m_axi_1_arsize(m_axi_1_arsize), - .m_axi_1_aruser(m_axi_1_aruser), - .m_axi_1_arvalid(m_axi_1_arvalid), - .m_axi_1_awaddr(m_axi_1_awaddr), - .m_axi_1_awburst(m_axi_1_awburst), - .m_axi_1_awcache(m_axi_1_awcache), - .m_axi_1_awlen(m_axi_1_awlen), - .m_axi_1_awprot(m_axi_1_awprot), - .m_axi_1_awready(m_axi_1_awready), - .m_axi_1_awsize(m_axi_1_awsize), - .m_axi_1_awuser(m_axi_1_awuser), - .m_axi_1_awvalid(m_axi_1_awvalid), - .m_axi_1_bready(m_axi_1_bready), - .m_axi_1_bresp(m_axi_1_bresp), - .m_axi_1_bvalid(m_axi_1_bvalid), - .m_axi_1_rdata(m_axi_1_rdata), - .m_axi_1_rlast(m_axi_1_rlast), - .m_axi_1_rready(m_axi_1_rready), - .m_axi_1_rresp(m_axi_1_rresp), - .m_axi_1_rvalid(m_axi_1_rvalid), - .m_axi_1_wdata(m_axi_1_wdata), - .m_axi_1_wlast(m_axi_1_wlast), - .m_axi_1_wready(m_axi_1_wready), - .m_axi_1_wstrb(m_axi_1_wstrb), - .m_axi_1_wvalid(m_axi_1_wvalid), -`elsif DMA_EXTERNAL .m_axis_dma0_s2mm_tdata(m_axis_dma0_s2mm_tdata), .m_axis_dma0_s2mm_tkeep(m_axis_dma0_s2mm_tkeep), .m_axis_dma0_s2mm_tdest(m_axis_dma0_s2mm_tdest), @@ -612,7 +548,6 @@ module ccl_offload .s_axi_data_arqos(s_axi_data_arqos), .s_axi_data_arready(s_axi_data_arready), .s_axi_data_arsize(s_axi_data_arsize), - .s_axi_data_aruser(s_axi_data_aruser), .s_axi_data_arvalid(s_axi_data_arvalid), .s_axi_data_awaddr(s_axi_data_awaddr), 
.s_axi_data_awburst(s_axi_data_awburst), diff --git a/kernels/cclo/hls/build.tcl b/kernels/cclo/hls/build.tcl index dde09e0a..ea2a921f 100644 --- a/kernels/cclo/hls/build.tcl +++ b/kernels/cclo/hls/build.tcl @@ -58,7 +58,7 @@ set seg_dir "[pwd]/../segmenter/" set rx_dir "[pwd]/../rxbuf_offload/" set drv_dir "[pwd]/../../../../driver/hls/" -open_project build_$ipname +open_project build_${ipname}.${device} add_files $ipname.cpp -cflags "-std=c++14 -I. -I../ -I$drv_dir -I$hlslib_dir -I$fw_dir -I$eth_dir -I$seg_dir -I$rx_dir -DACCL_SYNTHESIS" if {$do_sim || $do_cosim} { diff --git a/kernels/cclo/hls/dma_mover/Makefile b/kernels/cclo/hls/dma_mover/Makefile index 2d804673..fe7a524e 100644 --- a/kernels/cclo/hls/dma_mover/Makefile +++ b/kernels/cclo/hls/dma_mover/Makefile @@ -17,7 +17,7 @@ TARGET=ip DEVICE=xcu280-fsvh2892-2L-e -DMA_MOVER_IP=build_dma_mover/sol1/impl/ip/xilinx_com_hls_dma_mover_1_0.zip +DMA_MOVER_IP=build_dma_mover.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_dma_mover_1_0.zip all: $(DMA_MOVER_IP) diff --git a/kernels/cclo/hls/dma_mover/build_dma_mover.tcl b/kernels/cclo/hls/dma_mover/build_dma_mover.tcl index 4372816d..9c7307d3 100644 --- a/kernels/cclo/hls/dma_mover/build_dma_mover.tcl +++ b/kernels/cclo/hls/dma_mover/build_dma_mover.tcl @@ -50,7 +50,7 @@ switch $command { } } -open_project build_dma_mover +open_project build_dma_mover.${device} add_files dma_mover.cpp -cflags "-std=c++14 -I[pwd]/../../../../driver/hls -I[pwd]/../eth_intf/ -I[pwd]/../../../../hlslib/include/hlslib/xilinx -I[pwd]/../segmenter -I[pwd]/../../fw/sw_apps/ccl_offload_control/src -DHLSLIB_SYNTHESIS" add_files -tb tb_dma_mover.cpp -cflags "-std=c++14 -I[pwd]/../../../../driver/hls -I[pwd]/../eth_intf/ -I[pwd]/../../../../hlslib/include/hlslib/xilinx -I[pwd]/../segmenter -I[pwd]/../../fw/sw_apps/ccl_offload_control/src -DHLSLIB_SYNTHESIS" diff --git a/kernels/cclo/hls/dma_mover/dma_mover.cpp b/kernels/cclo/hls/dma_mover/dma_mover.cpp index a2e06839..bede2f43 100644 --- 
a/kernels/cclo/hls/dma_mover/dma_mover.cpp +++ b/kernels/cclo/hls/dma_mover/dma_mover.cpp @@ -648,7 +648,7 @@ void instruction_decode( STREAM_WRITE(eth_insn, pkt_wr); ack_insn.check_eth_tx = true; //if we're not sending to a remote stream, update sequence number - if(!pkt_wr.to_stream){ + if(!pkt_wr.to_stream && !pkt_wr.rendezvous){ if(pkt_wr.len <= pkt_wr.max_seg_len){ nsegments = 1; } else{ diff --git a/kernels/cclo/hls/eth_intf/Makefile b/kernels/cclo/hls/eth_intf/Makefile index d4537a85..f1de0a95 100644 --- a/kernels/cclo/hls/eth_intf/Makefile +++ b/kernels/cclo/hls/eth_intf/Makefile @@ -16,15 +16,15 @@ # *******************************************************************************/ DEVICE=xcu280-fsvh2892-2L-e -TCP_PACKETIZER_IP=build_tcp_packetizer/sol1/impl/ip/xilinx_com_hls_tcp_packetizer_1_0.zip -TCP_DEPACKETIZER_IP=build_tcp_depacketizer/sol1/impl/ip/xilinx_com_hls_tcp_depacketizer_1_0.zip -TCP_TXHANDLER_IP=build_tcp_txHandler//sol1/impl/ip/xilinx_com_hls_tcp_txHandler_1_0.zip -TCP_RXHANDLER_IP=build_tcp_rxHandler//sol1/impl/ip/xilinx_com_hls_tcp_rxHandler_1_0.zip -UDP_PACKETIZER_IP=build_udp_packetizer/sol1/impl/ip/xilinx_com_hls_udp_packetizer_1_0.zip -UDP_DEPACKETIZER_IP=build_udp_depacketizer/sol1/impl/ip/xilinx_com_hls_udp_depacketizer_1_0.zip -RDMA_SQ_HANDLER_IP=build_rdma_sq_handler/sol1/impl/ip/xilinx_com_hls_rdma_sq_handler_1_0.zip -RDMA_PACKETIZER_IP=build_rdma_packetizer/sol1/impl/ip/xilinx_com_hls_rdma_packetizer_1_0.zip -RDMA_DEPACKETIZER_IP=build_rdma_depacketizer/sol1/impl/ip/xilinx_com_hls_rdma_depacketizer_1_0.zip +TCP_PACKETIZER_IP=build_tcp_packetizer.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_tcp_packetizer_1_0.zip +TCP_DEPACKETIZER_IP=build_tcp_depacketizer.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_tcp_depacketizer_1_0.zip +TCP_TXHANDLER_IP=build_tcp_txHandler.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_tcp_txHandler_1_0.zip +TCP_RXHANDLER_IP=build_tcp_rxHandler.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_tcp_rxHandler_1_0.zip 
+UDP_PACKETIZER_IP=build_udp_packetizer.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_udp_packetizer_1_0.zip +UDP_DEPACKETIZER_IP=build_udp_depacketizer.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_udp_depacketizer_1_0.zip +RDMA_SQ_HANDLER_IP=build_rdma_sq_handler.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_rdma_sq_handler_1_0.zip +RDMA_PACKETIZER_IP=build_rdma_packetizer.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_rdma_packetizer_1_0.zip +RDMA_DEPACKETIZER_IP=build_rdma_depacketizer.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_rdma_depacketizer_1_0.zip TARGET=ip diff --git a/kernels/cclo/hls/rxbuf_offload/Makefile b/kernels/cclo/hls/rxbuf_offload/Makefile index d8fdb3fe..9282cffa 100644 --- a/kernels/cclo/hls/rxbuf_offload/Makefile +++ b/kernels/cclo/hls/rxbuf_offload/Makefile @@ -17,10 +17,10 @@ TARGET=ip DEVICE=xcu250-figd2104-2L-e -RXBUF_DEQUEUE_IP=build_rxbuf_dequeue/sol1/impl/ip/xilinx_com_hls_rxbuf_dequeue_1_0.zip -RXBUF_ENQUEUE_IP=build_rxbuf_enqueue/sol1/impl/ip/xilinx_com_hls_rxbuf_enqueue_1_0.zip -RXBUF_SEEK_IP=build_rxbuf_seek/sol1/impl/ip/xilinx_com_hls_rxbuf_seek_1_0.zip -RXBUF_SESSION_IP=build_rxbuf_session/sol1/impl/ip/xilinx_com_hls_rxbuf_session_1_0.zip +RXBUF_DEQUEUE_IP=build_rxbuf_dequeue.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_rxbuf_dequeue_1_0.zip +RXBUF_ENQUEUE_IP=build_rxbuf_enqueue.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_rxbuf_enqueue_1_0.zip +RXBUF_SEEK_IP=build_rxbuf_seek.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_rxbuf_seek_1_0.zip +RXBUF_SESSION_IP=build_rxbuf_session.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_rxbuf_session_1_0.zip all: $(RXBUF_DEQUEUE_IP) $(RXBUF_ENQUEUE_IP) $(RXBUF_SEEK_IP) $(RXBUF_SESSION_IP) diff --git a/kernels/cclo/hls/segmenter/Makefile b/kernels/cclo/hls/segmenter/Makefile index aa06d397..4055fe2a 100644 --- a/kernels/cclo/hls/segmenter/Makefile +++ b/kernels/cclo/hls/segmenter/Makefile @@ -16,8 +16,8 @@ # *******************************************************************************/ DEVICE=xcu250-figd2104-2L-e 
-SEGMENTER_IP=build_stream_segmenter/sol1/impl/ip/xilinx_com_hls_stream_segmenter_1_0.zip -DMA2SEGCMD_IP=build_dma2seg_cmd/sol1/impl/ip/xilinx_com_hls_dma2seg_cmd_1_0.zip +SEGMENTER_IP=build_stream_segmenter.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_stream_segmenter_1_0.zip +DMA2SEGCMD_IP=build_dma2seg_cmd.$(DEVICE)/sol1/impl/ip/xilinx_com_hls_dma2seg_cmd_1_0.zip TARGET=ip all: $(SEGMENTER_IP) $(DMA2SEGCMD_IP) diff --git a/kernels/cclo/tcl/generate_kernel.tcl b/kernels/cclo/tcl/generate_kernel.tcl index 07972299..0ccd5835 100644 --- a/kernels/cclo/tcl/generate_kernel.tcl +++ b/kernels/cclo/tcl/generate_kernel.tcl @@ -34,36 +34,36 @@ update_compile_order -fileset sources_1 create_bd_design ccl_offload_bd # add our own ip to the repo -set_property ip_repo_paths {./hls/} [current_project] +set_property ip_repo_paths {../hls/} [current_project] update_ip_catalog #rebuild bd -source -notrace tcl/rebuild_bd.tcl +source -notrace ../tcl/rebuild_bd.tcl create_root_design $stacktype $en_dma $en_arith $en_compress $en_extkrnl $mb_debug_level $commit_hash #add debug if requested if [string equal $hw_debug_level "dma"] { puts "Adding DMA debug to block design" - source -notrace tcl/debug_dma.tcl + source -notrace ../tcl/debug_dma.tcl } elseif [string equal $hw_debug_level "pkt"] { puts "Adding (de)packetizer debug to block design" - source -notrace tcl/debug_pkt.tcl + source -notrace ../tcl/debug_pkt.tcl } elseif [string equal $hw_debug_level "arith"] { puts "Adding arithmetic debug to block design" - source -notrace tcl/debug_arith.tcl + source -notrace ../tcl/debug_arith.tcl } elseif [string equal $hw_debug_level "control"] { puts "Adding control debug to block design" - source -notrace tcl/debug_control.tcl + source -notrace ../tcl/debug_control.tcl } elseif [string equal $hw_debug_level "all"] { puts "Adding all debug cores to block design" - source -notrace tcl/debug_dma.tcl - source -notrace tcl/debug_pkt.tcl - source -notrace tcl/debug_arith.tcl - source -notrace 
tcl/debug_control.tcl + source -notrace ../tcl/debug_dma.tcl + source -notrace ../tcl/debug_pkt.tcl + source -notrace ../tcl/debug_arith.tcl + source -notrace ../tcl/debug_control.tcl } # add wrapper -add_files -norecurse ./hdl/ccl_offload.v +add_files -norecurse ../hdl/ccl_offload.v update_compile_order -fileset sources_1 update_compile_order -fileset sim_1 generate_target all [get_files ./ccl_offload_ex/ccl_offload_ex.srcs/sources_1/bd/ccl_offload_bd/ccl_offload_bd.bd] diff --git a/kernels/cclo/tcl/generate_sim.tcl b/kernels/cclo/tcl/generate_sim.tcl index 812058e5..185cc990 100644 --- a/kernels/cclo/tcl/generate_sim.tcl +++ b/kernels/cclo/tcl/generate_sim.tcl @@ -38,11 +38,11 @@ open_project ./ccl_offload_ex/ccl_offload_ex.xpr update_compile_order -fileset sim_1 # add plugins to the catalog -set_property ip_repo_paths { ./hls ./../plugins } [current_project] +set_property ip_repo_paths { ../hls ../../plugins } [current_project] update_ip_catalog # add the simulation memory to the project -add_files -norecurse ./hdl/sim_mem.v +add_files -norecurse ../hdl/sim_mem.v update_compile_order -fileset sources_1 update_compile_order -fileset sim_1 @@ -120,22 +120,40 @@ if { $en_dma != 0 } { connect_bd_intf_net [get_bd_intf_pins axi_bram_ctrl_1/BRAM_PORTA] [get_bd_intf_pins sim_mem_1/MEM_PORT_A] connect_bd_intf_net [get_bd_intf_pins axi_bram_ctrl_1/BRAM_PORTB] [get_bd_intf_pins sim_mem_1/MEM_PORT_B] + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_bram_ctrl:4.1 axi_bram_ctrl_2 + set_property -dict [list CONFIG.SINGLE_PORT_BRAM {0} CONFIG.DATA_WIDTH {512} CONFIG.ECC_TYPE {0} CONFIG.READ_LATENCY $latency] [get_bd_cells axi_bram_ctrl_2] + create_bd_cell -type module -reference sim_mem sim_mem_2 + set_property -dict [list CONFIG.MEM_DEPTH_LOG $mem_addr_bits CONFIG.MEM_WIDTH {512} CONFIG.READ_LATENCY $latency] [get_bd_cells sim_mem_2] + connect_bd_intf_net [get_bd_intf_pins axi_bram_ctrl_2/BRAM_PORTA] [get_bd_intf_pins sim_mem_2/MEM_PORT_A] + connect_bd_intf_net 
[get_bd_intf_pins axi_bram_ctrl_2/BRAM_PORTB] [get_bd_intf_pins sim_mem_2/MEM_PORT_B] + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_crossbar:2.1 axi_crossbar_0 set_property -dict [list CONFIG.NUM_SI {3} CONFIG.NUM_MI {2}] [get_bd_cells axi_crossbar_0] connect_bd_intf_net [get_bd_intf_pins axi_crossbar_0/M00_AXI] [get_bd_intf_pins axi_bram_ctrl_0/S_AXI] connect_bd_intf_net [get_bd_intf_pins axi_crossbar_0/M01_AXI] [get_bd_intf_pins axi_bram_ctrl_1/S_AXI] - create_bd_cell -type ip -vlnv Xilinx:ACCL:external_dma:1.0 external_dma_0 + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_crossbar:2.1 axi_crossbar_1 + set_property -dict [list CONFIG.NUM_SI {3} CONFIG.NUM_MI {1}] [get_bd_cells axi_crossbar_1] + connect_bd_intf_net [get_bd_intf_pins axi_crossbar_1/M00_AXI] [get_bd_intf_pins axi_bram_ctrl_2/S_AXI] + + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_crossbar:2.1 axi_crossbar_2 + set_property -dict [list CONFIG.NUM_SI {1} CONFIG.NUM_MI {2}] [get_bd_cells axi_crossbar_2] + connect_bd_intf_net [get_bd_intf_pins axi_crossbar_2/M00_AXI] [get_bd_intf_pins axi_crossbar_0/S02_AXI] + connect_bd_intf_net [get_bd_intf_pins axi_crossbar_2/M01_AXI] [get_bd_intf_pins axi_crossbar_1/S02_AXI] + + create_bd_cell -type ip -vlnv Xilinx:ACCL:external_dma_2port:1.0 external_dma_0 connect_bd_net [get_bd_ports ap_clk] [get_bd_pins external_dma_0/ap_clk] connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins external_dma_0/ap_rst_n] connect_bd_intf_net [get_bd_intf_pins external_dma_0/m_axi_0] [get_bd_intf_pins axi_crossbar_0/S00_AXI] + connect_bd_intf_net [get_bd_intf_pins external_dma_0/m_axi_1] [get_bd_intf_pins axi_crossbar_1/S00_AXI] connect_bd_intf_net [get_bd_intf_pins external_dma_0/s_axis_s2mm] [get_bd_intf_pins cclo/m_axis_dma0_s2mm] connect_bd_intf_net [get_bd_intf_pins external_dma_0/m_axis_mm2s] [get_bd_intf_pins cclo/s_axis_dma0_mm2s] - create_bd_cell -type ip -vlnv Xilinx:ACCL:external_dma:1.0 external_dma_1 + create_bd_cell -type ip -vlnv 
Xilinx:ACCL:external_dma_2port:1.0 external_dma_1 connect_bd_net [get_bd_ports ap_clk] [get_bd_pins external_dma_1/ap_clk] connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins external_dma_1/ap_rst_n] connect_bd_intf_net [get_bd_intf_pins external_dma_1/m_axi_0] [get_bd_intf_pins axi_crossbar_0/S01_AXI] + connect_bd_intf_net [get_bd_intf_pins external_dma_1/m_axi_1] [get_bd_intf_pins axi_crossbar_1/S01_AXI] connect_bd_intf_net [get_bd_intf_pins external_dma_1/s_axis_s2mm] [get_bd_intf_pins cclo/m_axis_dma1_s2mm] connect_bd_intf_net [get_bd_intf_pins external_dma_1/m_axis_mm2s] [get_bd_intf_pins cclo/s_axis_dma1_mm2s] @@ -185,33 +203,49 @@ if { $en_dma != 0 } { set s_axi [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:aximm_rtl:1.0 s_axi_data ] set_property -dict [ list CONFIG.ADDR_WIDTH {64} CONFIG.DATA_WIDTH {512} CONFIG.FREQ_HZ {250000000} CONFIG.HAS_BRESP {0} CONFIG.HAS_BURST {0} CONFIG.HAS_CACHE {0} CONFIG.HAS_LOCK {0} CONFIG.HAS_PROT {0} CONFIG.HAS_QOS {0} CONFIG.HAS_REGION {0} CONFIG.HAS_WSTRB {1} CONFIG.NUM_READ_OUTSTANDING {1} CONFIG.NUM_WRITE_OUTSTANDING {1} CONFIG.PROTOCOL {AXI4} CONFIG.READ_WRITE_MODE {READ_WRITE} ] $s_axi - connect_bd_intf_net [get_bd_intf_ports s_axi_data] [get_bd_intf_pins axi_crossbar_0/S02_AXI] + set_property -dict [list CONFIG.ID_WIDTH.VALUE_SRC USER CONFIG.AWUSER_WIDTH.VALUE_SRC USER CONFIG.ARUSER_WIDTH.VALUE_SRC USER] $s_axi + set_property -dict [list CONFIG.ARUSER_WIDTH {4} CONFIG.AWUSER_WIDTH {4} CONFIG.ID_WIDTH {4} ] $s_axi + connect_bd_intf_net [get_bd_intf_ports s_axi_data] [get_bd_intf_pins axi_crossbar_2/S00_AXI] connect_bd_net [get_bd_ports ap_clk] [get_bd_pins axi_crossbar_0/aclk] connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins axi_crossbar_0/aresetn] + connect_bd_net [get_bd_ports ap_clk] [get_bd_pins axi_crossbar_1/aclk] + connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins axi_crossbar_1/aresetn] + connect_bd_net [get_bd_ports ap_clk] [get_bd_pins axi_crossbar_2/aclk] + connect_bd_net [get_bd_ports 
ap_rst_n] [get_bd_pins axi_crossbar_2/aresetn] connect_bd_net [get_bd_ports ap_clk] [get_bd_pins axi_bram_ctrl_0/s_axi_aclk] connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins axi_bram_ctrl_0/s_axi_aresetn] connect_bd_net [get_bd_ports ap_clk] [get_bd_pins axi_bram_ctrl_1/s_axi_aclk] connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins axi_bram_ctrl_1/s_axi_aresetn] + connect_bd_net [get_bd_ports ap_clk] [get_bd_pins axi_bram_ctrl_2/s_axi_aclk] + connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins axi_bram_ctrl_2/s_axi_aresetn] # #assign addresses and set ranges save_bd_design assign_bd_address - set_property offset $memsize [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_1_Mem0}] - set_property offset $memsize [get_bd_addr_segs {external_dma_0/m_axi_0/SEG_axi_bram_ctrl_1_Mem0}] - set_property offset $memsize [get_bd_addr_segs {external_dma_1/m_axi_0/SEG_axi_bram_ctrl_1_Mem0}] - set_property offset 0x0000000000000000 [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_0_Mem0}] - set_property offset 0x0000000000000000 [get_bd_addr_segs {external_dma_0/m_axi_0/SEG_axi_bram_ctrl_0_Mem0}] - set_property offset 0x0000000000000000 [get_bd_addr_segs {external_dma_1/m_axi_0/SEG_axi_bram_ctrl_0_Mem0}] - set_property range $memsize [get_bd_addr_segs {external_dma_0/m_axi_0/SEG_axi_bram_ctrl_0_Mem0}] - set_property range $memsize [get_bd_addr_segs {external_dma_1/m_axi_0/SEG_axi_bram_ctrl_0_Mem0}] + set_property offset [expr { 0*$memsize }] [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_0_Mem0}] + set_property offset [expr { 1*$memsize }] [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_1_Mem0}] + set_property offset [expr { 2*$memsize }] [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_2_Mem0}] set_property range $memsize [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_0_Mem0}] + set_property range $memsize [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_1_Mem0}] + set_property range $memsize [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_2_Mem0}] + + set_property offset [expr { 
0*$memsize }] [get_bd_addr_segs {external_dma_0/m_axi_0/SEG_axi_bram_ctrl_0_Mem0}] + set_property offset [expr { 1*$memsize }] [get_bd_addr_segs {external_dma_0/m_axi_0/SEG_axi_bram_ctrl_1_Mem0}] + set_property offset [expr { 2*$memsize }] [get_bd_addr_segs {external_dma_0/m_axi_1/SEG_axi_bram_ctrl_2_Mem0}] + set_property range $memsize [get_bd_addr_segs {external_dma_0/m_axi_0/SEG_axi_bram_ctrl_0_Mem0}] set_property range $memsize [get_bd_addr_segs {external_dma_0/m_axi_0/SEG_axi_bram_ctrl_1_Mem0}] + set_property range $memsize [get_bd_addr_segs {external_dma_0/m_axi_1/SEG_axi_bram_ctrl_2_Mem0}] + + set_property offset [expr { 0*$memsize }] [get_bd_addr_segs {external_dma_1/m_axi_0/SEG_axi_bram_ctrl_0_Mem0}] + set_property offset [expr { 1*$memsize }] [get_bd_addr_segs {external_dma_1/m_axi_0/SEG_axi_bram_ctrl_1_Mem0}] + set_property offset [expr { 2*$memsize }] [get_bd_addr_segs {external_dma_1/m_axi_1/SEG_axi_bram_ctrl_2_Mem0}] + set_property range $memsize [get_bd_addr_segs {external_dma_1/m_axi_0/SEG_axi_bram_ctrl_0_Mem0}] set_property range $memsize [get_bd_addr_segs {external_dma_1/m_axi_0/SEG_axi_bram_ctrl_1_Mem0}] - set_property range $memsize [get_bd_addr_segs {s_axi_data/SEG_axi_bram_ctrl_1_Mem0}] + set_property range $memsize [get_bd_addr_segs {external_dma_1/m_axi_1/SEG_axi_bram_ctrl_2_Mem0}] - group_bd_cells external_memory [get_bd_cells axi_bram_ctrl_*] [get_bd_cells sim_mem_*] [get_bd_cells axi_crossbar_0] + group_bd_cells external_memory [get_bd_cells axi_bram_ctrl_*] [get_bd_cells sim_mem_*] [get_bd_cells axi_crossbar_*] group_bd_cells dma [get_bd_cells external_dma_*] [get_bd_cells cyt_dma_0] [get_bd_cells cyt_dma_adapter_0] } @@ -263,7 +297,7 @@ if { $stacktype == "RDMA" } { connect_bd_intf_net [get_bd_intf_pins dummy_cyt_rdma_stack/recv_data] [get_bd_intf_pins cclo/s_axis_eth_rx_data] connect_bd_intf_net [get_bd_intf_pins cclo/m_axis_eth_tx_data] [get_bd_intf_pins dummy_cyt_rdma_stack/send_data] - set_property -dict [list CONFIG.NUM_SI {4}] 
[get_bd_cells external_memory/axi_crossbar_0] + set_property -dict [list CONFIG.NUM_SI {2}] [get_bd_cells external_memory/axi_crossbar_2] create_bd_cell -type ip -vlnv xilinx.com:ip:axi_datamover:5.1 cyt_wr_dma set_property -dict [list CONFIG.c_enable_mm2s {0} CONFIG.c_include_s2mm_dre {true} CONFIG.c_s2mm_support_indet_btt {true} ] [get_bd_cells cyt_wr_dma] @@ -272,7 +306,7 @@ if { $stacktype == "RDMA" } { connect_bd_intf_net [get_bd_intf_pins cyt_wr_dma/S_AXIS_S2MM] [get_bd_intf_pins dummy_cyt_rdma_stack/wr_data] connect_bd_intf_net [get_bd_intf_pins dummy_cyt_rdma_stack/wr_cmd] [get_bd_intf_pins cyt_wr_dma/S_AXIS_S2MM_CMD] connect_bd_intf_net [get_bd_intf_pins dummy_cyt_rdma_stack/wr_sts] [get_bd_intf_pins cyt_wr_dma/M_AXIS_S2MM_STS] - connect_bd_intf_net [get_bd_intf_pins cyt_wr_dma/M_AXI_S2MM] [get_bd_intf_pins external_memory/axi_crossbar_0/S03_AXI] + connect_bd_intf_net [get_bd_intf_pins cyt_wr_dma/M_AXI_S2MM] [get_bd_intf_pins external_memory/axi_crossbar_2/S01_AXI] connect_bd_net [get_bd_ports ap_clk] [get_bd_pins cyt_wr_dma/m_axi_s2mm_aclk] [get_bd_pins cyt_wr_dma/m_axis_s2mm_cmdsts_awclk] connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins cyt_wr_dma/m_axi_s2mm_aresetn] [get_bd_pins cyt_wr_dma/m_axis_s2mm_cmdsts_aresetn] diff --git a/kernels/cclo/tcl/rebuild_bd.tcl b/kernels/cclo/tcl/rebuild_bd.tcl index 559b28c5..259d0145 100644 --- a/kernels/cclo/tcl/rebuild_bd.tcl +++ b/kernels/cclo/tcl/rebuild_bd.tcl @@ -110,7 +110,7 @@ proc create_root_design { netStackType enableDMA enableArithmetic enableCompress set control_xbar [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 control_xbar ] set_property -dict [ list CONFIG.NUM_MI {2} ] $control_xbar - source -notrace ./tcl/control_bd.tcl + source -notrace ../tcl/control_bd.tcl set idcode [expr {$commitHash<<8 | $debugLevel<<6 | $enableExtKrnlStream<<5 | $enableCompression<<4 | $enableArithmetic<<3 | $enableDMA<<2 | ($netStackType == "RDMA" ? 2 : $netStackType == "TCP" ? 
1 : 0) }] create_hier_cell_control [current_bd_instance .] control $debugLevel $idcode @@ -232,8 +232,8 @@ proc create_root_design { netStackType enableDMA enableArithmetic enableCompress save_bd_design # Create network (de)packetizer - source -notrace ./tcl/rx_bd.tcl - source -notrace ./tcl/tx_bd.tcl + source -notrace ../tcl/rx_bd.tcl + source -notrace ../tcl/tx_bd.tcl if { $netStackType == "TCP" } { # TCP interfaces diff --git a/kernels/plugins/Makefile b/kernels/plugins/Makefile index f4ad32ce..da39ea18 100644 --- a/kernels/plugins/Makefile +++ b/kernels/plugins/Makefile @@ -17,29 +17,12 @@ # *******************************************************************************/ PERIPHERAL_IPS = hostctrl loopback reduce_ops hp_compression dummy_tcp_stack client_arbiter vadd_put cyt_adapter external_dma dummy_cyt_rdma_stack dummy_cyt_dma tcp_session_handler +DEVICE=xcu280-fsvh2892-2L-e TARGET=ip -PLATFORM ?= xilinx_u280_xdma_201920_3 -DEBUG ?= none -STACK_TYPE ?= UDP - -ifeq (u250,$(findstring u250, $(PLATFORM))) - FPGAPART=xcu250-figd2104-2L-e -else ifeq (u280,$(findstring u280, $(PLATFORM))) - FPGAPART=xcu280-fsvh2892-2L-e -else ifeq (u55c,$(findstring u55c, $(PLATFORM))) - FPGAPART=xcu55c-fsvh2892-2L-e -else ifeq (u200,$(findstring u200, $(PLATFORM))) - FPGAPART=xcu200-fsgd2104-2-e -else ifeq (u50,$(findstring u50, $(PLATFORM))) - FPGAPART=xcu50-fsvh2104-2-e -else - $(error Unsupported PLATFORM) -endif all: $(PERIPHERAL_IPS) - .PHONY: hostctrl loopback reduce_ops hp_compression dummy_tcp_stack client_arbiter vadd_put cyt_adapter external_dma dummy_cyt_rdma_stack dummy_cyt_dma tcp_session_handler $(PERIPHERAL_IPS): - $(MAKE) -C $@ DEVICE=$(FPGAPART) TARGET=$(TARGET) STACK_TYPE=$(STACK_TYPE) + $(MAKE) -C $@ DEVICE=$(DEVICE) TARGET=$(TARGET) diff --git a/kernels/plugins/client_arbiter/Makefile b/kernels/plugins/client_arbiter/Makefile index 54c94cc2..da1fa056 100644 --- a/kernels/plugins/client_arbiter/Makefile +++ b/kernels/plugins/client_arbiter/Makefile @@ -16,9 
+16,9 @@ # *******************************************************************************/ DEVICE=xcu250-figd2104-2L-e -ARBITER_IP=client_arbiter.xo +ARBITER_IP=client_arbiter_$(DEVICE).xo TARGET=ip -NCLIENTS=3 +NCLIENTS=2 all: $(ARBITER_IP) diff --git a/kernels/plugins/client_arbiter/build_client_arbiter.tcl b/kernels/plugins/client_arbiter/build_client_arbiter.tcl index 7b81765a..c23bb3f8 100644 --- a/kernels/plugins/client_arbiter/build_client_arbiter.tcl +++ b/kernels/plugins/client_arbiter/build_client_arbiter.tcl @@ -41,14 +41,14 @@ switch $command { } -open_project build_client_arbiter +open_project build_client_arbiter.$device add_files client_arbiter.cpp -cflags "-std=c++14 -I../../../driver/hls/ -DNUM_CTRL_STREAMS=$nclients -DACCL_SYNTHESIS" set_top client_arbiter open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/client_arbiter.xo +config_export -format xo -library ACCL -output [pwd]/client_arbiter_${device}.xo if {$do_syn} { set_part $device diff --git a/kernels/plugins/cyt_adapter/Makefile b/kernels/plugins/cyt_adapter/Makefile index d435f07c..be1a85c2 100644 --- a/kernels/plugins/cyt_adapter/Makefile +++ b/kernels/plugins/cyt_adapter/Makefile @@ -17,15 +17,14 @@ TARGET=ip DEVICE=xcu250-figd2104-2L-e -CYT_DMA_ADAPTER=cyt_dma_adapter.xo -CYT_RDMA_ARBITER=cyt_rdma_arbiter.xo -CYT_RDMA_MUX=cyt_rdma_mux.xo -STACK_TYPE ?= UDP +CYT_DMA_ADAPTER=cyt_dma_adapter_$(DEVICE).xo +CYT_RDMA_ARBITER=cyt_rdma_arbiter_$(DEVICE).xo +CYT_RDMA_MUX=cyt_rdma_mux_$(DEVICE).xo all: $(CYT_DMA_ADAPTER) $(CYT_RDMA_ARBITER) $(CYT_RDMA_MUX) $(CYT_DMA_ADAPTER): build_cyt_dma_adapter.tcl cyt_dma_adapter.cpp - vitis_hls $< -tclargs $(TARGET) $(DEVICE) $(STACK_TYPE) + vitis_hls $< -tclargs $(TARGET) $(DEVICE) $(CYT_RDMA_ARBITER): build_cyt_rdma_arbiter.tcl cyt_rdma_arbiter.cpp vitis_hls $< -tclargs $(TARGET) $(DEVICE) diff --git a/kernels/plugins/cyt_adapter/build_cyt_dma_adapter.tcl b/kernels/plugins/cyt_adapter/build_cyt_dma_adapter.tcl index 2de071ec..342b3400 
100644 --- a/kernels/plugins/cyt_adapter/build_cyt_dma_adapter.tcl +++ b/kernels/plugins/cyt_adapter/build_cyt_dma_adapter.tcl @@ -52,7 +52,7 @@ switch $command { } -open_project build_cyt_dma_adapter +open_project build_cyt_dma_adapter.${device} if {$stack eq "RDMA"} { add_files cyt_dma_adapter.cpp -cflags "-std=c++14 -I. -I../../../driver/hls/ -DACCL_SYNTHESIS -DACCL_RDMA" @@ -64,7 +64,7 @@ if {$stack eq "RDMA"} { set_top cyt_dma_adapter open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/cyt_dma_adapter.xo +config_export -format xo -library ACCL -output [pwd]/cyt_dma_adapter_$device.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/cyt_adapter/build_cyt_rdma_arbiter.tcl b/kernels/plugins/cyt_adapter/build_cyt_rdma_arbiter.tcl index 9309459b..d8534621 100644 --- a/kernels/plugins/cyt_adapter/build_cyt_rdma_arbiter.tcl +++ b/kernels/plugins/cyt_adapter/build_cyt_rdma_arbiter.tcl @@ -51,7 +51,7 @@ switch $command { } -open_project build_cyt_rdma_arbiter +open_project build_cyt_rdma_arbiter.${device} add_files cyt_rdma_arbiter.cpp -cflags "-std=c++14 -I. -I../../cclo/hls/eth_intf -I../../../driver/hls/ -DACCL_SYNTHESIS" @@ -60,7 +60,7 @@ add_files cyt_rdma_arbiter.cpp -cflags "-std=c++14 -I. -I../../cclo/hls/eth_intf set_top cyt_rdma_arbiter open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/cyt_rdma_arbiter.xo +config_export -format xo -library ACCL -output [pwd]/cyt_rdma_arbiter_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/cyt_adapter/build_cyt_rdma_mux.tcl b/kernels/plugins/cyt_adapter/build_cyt_rdma_mux.tcl index df908845..ac0e6dde 100644 --- a/kernels/plugins/cyt_adapter/build_cyt_rdma_mux.tcl +++ b/kernels/plugins/cyt_adapter/build_cyt_rdma_mux.tcl @@ -51,7 +51,7 @@ switch $command { } -open_project build_cyt_rdma_mux +open_project build_cyt_rdma_mux.${device} add_files cyt_rdma_mux.cpp -cflags "-std=c++14 -I. 
-I../../cclo/hls/eth_intf -I../../../driver/hls/ -DACCL_SYNTHESIS" @@ -60,7 +60,7 @@ add_files cyt_rdma_mux.cpp -cflags "-std=c++14 -I. -I../../cclo/hls/eth_intf -I. set_top cyt_rdma_mux open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/cyt_rdma_mux.xo +config_export -format xo -library ACCL -output [pwd]/cyt_rdma_mux_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/cyt_adapter/cyt_dma_adapter.cpp b/kernels/plugins/cyt_adapter/cyt_dma_adapter.cpp index b0767dad..300e991a 100644 --- a/kernels/plugins/cyt_adapter/cyt_dma_adapter.cpp +++ b/kernels/plugins/cyt_adapter/cyt_dma_adapter.cpp @@ -71,27 +71,6 @@ void rdma_req_byp_cmd_converter( } -void multiplexor(hls::stream& in0, - hls::stream& in1, - hls::stream& out) -{ -#pragma HLS inline off -#pragma HLS pipeline II=1 - - cyt_req_t currWord; - - if (!STREAM_IS_EMPTY(in0)) - { - currWord = STREAM_READ(in0); - STREAM_WRITE(out, currWord); - } - else if(!STREAM_IS_EMPTY(in1)) - { - currWord = STREAM_READ(in1); - STREAM_WRITE(out, currWord); - } -} - void multiplexor(hls::stream& in0, hls::stream& in1, hls::stream& in2, @@ -184,11 +163,11 @@ void cyt_dma_adapter( hls::stream> &dma1_s2mm_sts, hls::stream> &dma0_mm2s_sts, hls::stream> &dma1_mm2s_sts, -#ifdef ACCL_RDMA + //RDMA rd_req and wr_req hls::stream & rdma_wr_req, hls::stream & rdma_rd_req, -#endif + //Coyote Bypass interface command and status hls::stream &cyt_byp_wr_cmd, hls::stream> &cyt_byp_wr_sts, @@ -213,13 +192,10 @@ void cyt_dma_adapter( #pragma HLS aggregate variable=cyt_byp_wr_cmd compact=bit #pragma HLS aggregate variable=cyt_byp_rd_cmd compact=bit -#ifdef ACCL_RDMA #pragma HLS INTERFACE axis port=rdma_wr_req #pragma HLS INTERFACE axis port=rdma_rd_req #pragma HLS aggregate variable=rdma_wr_req compact=bit #pragma HLS aggregate variable=rdma_rd_req compact=bit -#endif - static hls::stream byp_wr_cmd_0; #pragma HLS stream variable=byp_wr_cmd_0 depth=16 @@ -239,36 +215,21 @@ void cyt_dma_adapter( static 
hls::stream> dma1_s2mm_meta; #pragma HLS stream variable=dma1_s2mm_meta depth=16 -#ifdef ACCL_RDMA static hls::stream byp_wr_cmd_2; #pragma HLS stream variable=byp_wr_cmd_2 depth=16 static hls::stream byp_rd_cmd_2; #pragma HLS stream variable=byp_rd_cmd_2 depth=16 -#endif dm_byp_cmd_converter<0>(dma0_s2mm_cmd, byp_wr_cmd_0, dma0_s2mm_meta); dm_byp_cmd_converter<1>(dma1_s2mm_cmd, byp_wr_cmd_1, dma1_s2mm_meta); -#ifdef ACCL_RDMA rdma_req_byp_cmd_converter<2>(rdma_wr_req, byp_wr_cmd_2); -#endif - -#ifdef ACCL_RDMA multiplexor(byp_wr_cmd_0,byp_wr_cmd_1,byp_wr_cmd_2,cyt_byp_wr_cmd); -#else - multiplexor(byp_wr_cmd_0,byp_wr_cmd_1,cyt_byp_wr_cmd); -#endif + dm_byp_cmd_converter<0>(dma0_mm2s_cmd,byp_rd_cmd_0, dma0_mm2s_meta); dm_byp_cmd_converter<1>(dma1_mm2s_cmd,byp_rd_cmd_1, dma1_mm2s_meta); -#ifdef ACCL_RDMA rdma_req_byp_cmd_converter<2>(rdma_rd_req, byp_rd_cmd_2); -#endif - -#ifdef ACCL_RDMA multiplexor(byp_rd_cmd_0,byp_rd_cmd_1,byp_rd_cmd_2,cyt_byp_rd_cmd); -#else - multiplexor(byp_rd_cmd_0,byp_rd_cmd_1,cyt_byp_rd_cmd); -#endif byp_dm_sts_converter(cyt_byp_wr_sts, dma0_s2mm_sts, dma1_s2mm_sts, dma0_s2mm_meta, dma1_s2mm_meta); byp_dm_sts_converter(cyt_byp_rd_sts, dma0_mm2s_sts, dma1_mm2s_sts, dma0_mm2s_meta, dma1_mm2s_meta); diff --git a/kernels/plugins/dummy_cyt_dma/Makefile b/kernels/plugins/dummy_cyt_dma/Makefile index 7302142f..549ee6aa 100644 --- a/kernels/plugins/dummy_cyt_dma/Makefile +++ b/kernels/plugins/dummy_cyt_dma/Makefile @@ -15,7 +15,7 @@ # # *******************************************************************************/ DEVICE=xcu280-fsvh2892-2L-e -DUMMY_DMA_IP=dummy_cyt_dma.xo +DUMMY_DMA_IP=dummy_cyt_dma_$(DEVICE).xo TARGET=ip diff --git a/kernels/plugins/dummy_cyt_dma/build_dummy_cyt_dma.tcl b/kernels/plugins/dummy_cyt_dma/build_dummy_cyt_dma.tcl index 9abb37cc..b8d1d8e5 100644 --- a/kernels/plugins/dummy_cyt_dma/build_dummy_cyt_dma.tcl +++ b/kernels/plugins/dummy_cyt_dma/build_dummy_cyt_dma.tcl @@ -51,14 +51,14 @@ switch $command { } -open_project 
dummy_cyt_dma +open_project dummy_cyt_dma.${device} add_files dummy_cyt_dma.cpp -cflags "-std=c++14 -I../../../driver/hls/ -I../cyt_adapter -I../../../hlslib/include/hlslib/xilinx -DACCL_SYNTHESIS" set_top cyt_dma open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/dummy_cyt_dma.xo +config_export -format xo -library ACCL -output [pwd]/dummy_cyt_dma_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/dummy_cyt_rdma_stack/Makefile b/kernels/plugins/dummy_cyt_rdma_stack/Makefile index 2f1b5ab8..237dddfb 100644 --- a/kernels/plugins/dummy_cyt_rdma_stack/Makefile +++ b/kernels/plugins/dummy_cyt_rdma_stack/Makefile @@ -15,7 +15,7 @@ # # *******************************************************************************/ DEVICE=xcu280-fsvh2892-2L-e -TCP_STACK_IP=dummy_cyt_rdma_stack.xo +TCP_STACK_IP=dummy_cyt_rdma_stack_$(DEVICE).xo TARGET=ip diff --git a/kernels/plugins/dummy_cyt_rdma_stack/build_cyt_rdma_stack.tcl b/kernels/plugins/dummy_cyt_rdma_stack/build_cyt_rdma_stack.tcl index 7f21e873..02f1d882 100644 --- a/kernels/plugins/dummy_cyt_rdma_stack/build_cyt_rdma_stack.tcl +++ b/kernels/plugins/dummy_cyt_rdma_stack/build_cyt_rdma_stack.tcl @@ -51,14 +51,14 @@ switch $command { } -open_project dummy_cyt_rdma_stack +open_project dummy_cyt_rdma_stack.${device} add_files dummy_cyt_rdma_stack.cpp -cflags "-std=c++14 -I../../../driver/hls/ -I../../cclo/hls/eth_intf/ -I../../../hlslib/include/hlslib/xilinx -DACCL_SYNTHESIS" set_top cyt_rdma open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/dummy_cyt_rdma_stack.xo +config_export -format xo -library ACCL -output [pwd]/dummy_cyt_rdma_stack_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/dummy_tcp_stack/Makefile b/kernels/plugins/dummy_tcp_stack/Makefile index 933dcb08..b577ffe4 100644 --- a/kernels/plugins/dummy_tcp_stack/Makefile +++ b/kernels/plugins/dummy_tcp_stack/Makefile @@ -15,7 +15,7 @@ # # 
*******************************************************************************/ DEVICE=xcu280-fsvh2892-2L-e -TCP_STACK_IP=dummy_tcp_stack.xo +TCP_STACK_IP=dummy_tcp_stack_$(DEVICE).xo TARGET=ip diff --git a/kernels/plugins/dummy_tcp_stack/build_tcp_stack.tcl b/kernels/plugins/dummy_tcp_stack/build_tcp_stack.tcl index 1c04a7ee..6f3af94f 100644 --- a/kernels/plugins/dummy_tcp_stack/build_tcp_stack.tcl +++ b/kernels/plugins/dummy_tcp_stack/build_tcp_stack.tcl @@ -51,14 +51,14 @@ switch $command { } -open_project build_tcp_stack +open_project build_tcp_stack.${device} add_files dummy_tcp_stack.cpp -cflags "-std=c++14 -I../../../driver/hls/ -I../../cclo/hls/eth_intf/ -I../../../hlslib/include/hlslib/xilinx -DACCL_SYNTHESIS" set_top network_krnl open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/dummy_tcp_stack.xo +config_export -format xo -library ACCL -output [pwd]/dummy_tcp_stack_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/external_dma/Makefile b/kernels/plugins/external_dma/Makefile index 704fc53c..174d5753 100755 --- a/kernels/plugins/external_dma/Makefile +++ b/kernels/plugins/external_dma/Makefile @@ -16,12 +16,9 @@ # *******************************************************************************/ DEVICE ?= xcu280-fsvh2892-2L-e -NUM_DMA ?= 1 -all: external_dma.xo +all: external_dma_1port.xo external_dma_2port.xo external_dma_3port.xo external_dma_4port.xo -external_dma.v kernel.xml &: gen_files.py - python3 $< -n $(NUM_DMA) - -external_dma.xo: bd.tcl kernel.xml external_dma.v - vivado -mode batch -source $< -notrace -tclargs $(DEVICE) +external_dma_%port.xo: bd.tcl + python3 gen_files.py -n $* + vivado -mode batch -source $< -notrace -tclargs $(DEVICE) $* diff --git a/kernels/plugins/external_dma/bd.tcl b/kernels/plugins/external_dma/bd.tcl index cc06c074..0d417b7c 100644 --- a/kernels/plugins/external_dma/bd.tcl +++ b/kernels/plugins/external_dma/bd.tcl @@ -15,10 +15,11 @@ # # 
*******************************************************************************/ set fpgapart [lindex $::argv 0] -set num_dma 2 +set num_dma [lindex $::argv 1] +set ipname external_dma_${num_dma}port # create project with correct target -create_project -force external_dma ./external_dma -part $fpgapart +create_project -force external_dma ./${ipname} -part $fpgapart set_property target_language verilog [current_project] set_property simulator_language MIXED [current_project] set_property coreContainer.enable false [current_project] @@ -314,23 +315,23 @@ set_property -dict [ list CONFIG.ASSOCIATED_BUSIF $interfaces ] [get_bd_ports ap validate_bd_design save_bd_design -add_files -norecurse ./external_dma.v +add_files -norecurse ./${ipname}.v update_compile_order -fileset sources_1 update_compile_order -fileset sim_1 -set bdfile [get_files ./external_dma/external_dma.srcs/sources_1/bd/external_dma_bd/external_dma_bd.bd] +set bdfile [get_files ./${ipname}/external_dma.srcs/sources_1/bd/external_dma_bd/external_dma_bd.bd] generate_target all $bdfile export_ip_user_files -of_objects $bdfile -no_script -sync -force -quiet create_ip_run $bdfile update_compile_order -fileset sources_1 -set_property top external_dma [current_fileset] +set_property top ${ipname} [current_fileset] # Package IP -ipx::package_project -root_dir ./packaged_kernel -vendor Xilinx -library ACCL -taxonomy /KernelIP -import_files -set_current false -ipx::unload_core ./packaged_kernel/component.xml +ipx::package_project -root_dir ./${ipname}/packaged_kernel -vendor Xilinx -library ACCL -taxonomy /KernelIP -import_files -set_current false +ipx::unload_core ./${ipname}/packaged_kernel/component.xml -ipx::edit_ip_in_project -upgrade true -name tmp_edit_project -directory ./package ./packaged_kernel/component.xml +ipx::edit_ip_in_project -upgrade true -name tmp_edit_project -directory ./${ipname}/package ./${ipname}/packaged_kernel/component.xml set_property core_revision 1 [ipx::current_core] foreach up 
[ipx::get_user_parameters] { @@ -365,10 +366,10 @@ ipx::update_checksums [ipx::current_core] ipx::save_core [ipx::current_core] ## Generate XO -if {[file exists "external_dma.xo"]} { - file delete -force "external_dma.xo" +if {[file exists "${ipname}.xo"]} { + file delete -force "${ipname}.xo" } -package_xo -xo_path external_dma.xo -kernel_name external_dma -ip_directory ./packaged_kernel -kernel_xml ./kernel.xml +package_xo -xo_path ${ipname}.xo -kernel_name ${ipname} -ip_directory ./${ipname}/packaged_kernel -kernel_xml ./${ipname}.xml close_project -delete diff --git a/kernels/plugins/external_dma/gen_files.py b/kernels/plugins/external_dma/gen_files.py index 260c5e59..999ac90c 100644 --- a/kernels/plugins/external_dma/gen_files.py +++ b/kernels/plugins/external_dma/gen_files.py @@ -25,7 +25,7 @@ `timescale 1 ns / 1 ps -module external_dma +module external_dma_{}port ( input ap_clk, input ap_rst_n, @@ -110,7 +110,7 @@ .s_axis_s2mm_tdata(s_axis_s2mm_tdata), .s_axis_s2mm_tkeep(s_axis_s2mm_tkeep), - .s_axis_s2mm_tdest(s_axis_s2mm_tdest), + {} .s_axis_s2mm_tlast(s_axis_s2mm_tlast), .s_axis_s2mm_tready(s_axis_s2mm_tready), .s_axis_s2mm_tvalid(s_axis_s2mm_tvalid), @@ -124,7 +124,7 @@ .s_axis_mm2s_cmd_tdata(s_axis_mm2s_cmd_tdata), .s_axis_mm2s_cmd_tready(s_axis_mm2s_cmd_tready), .s_axis_mm2s_cmd_tvalid(s_axis_mm2s_cmd_tvalid), - .s_axis_mm2s_cmd_tdest(s_axis_mm2s_cmd_tdest), + {} .m_axis_mm2s_sts_tdata(m_axis_mm2s_sts_tdata), .m_axis_mm2s_sts_tready(m_axis_mm2s_sts_tready), @@ -135,7 +135,7 @@ .s_axis_s2mm_cmd_tdata(s_axis_s2mm_cmd_tdata), .s_axis_s2mm_cmd_tready(s_axis_s2mm_cmd_tready), .s_axis_s2mm_cmd_tvalid(s_axis_s2mm_cmd_tvalid), - .s_axis_s2mm_cmd_tdest(s_axis_s2mm_cmd_tdest), + {} .m_axis_s2mm_sts_tdata(m_axis_s2mm_sts_tdata), .m_axis_s2mm_sts_tready(m_axis_s2mm_sts_tready), @@ -226,13 +226,16 @@ all_axi_declarations += axi_intf_declaration.format(i) all_axi_connections += axi_intf_connection.format(i) -with open("external_dma.v", "w") as f: - 
f.write(verilog_wrapper.format(all_axi_declarations, all_axi_connections)) +with open("external_dma_%dport.v" % args.numdma, "w") as f: + tdest_1 = '\t\t\t\t.s_axis_s2mm_tdest(s_axis_s2mm_tdest),\n' if args.numdma > 1 else '\n' + tdest_2 = '\t\t\t\t.s_axis_mm2s_cmd_tdest(s_axis_mm2s_cmd_tdest),\n' if args.numdma > 1 else '\n' + tdest_3 = '\t\t\t\t.s_axis_s2mm_cmd_tdest(s_axis_s2mm_cmd_tdest),\n' if args.numdma > 1 else '\n' + f.write(verilog_wrapper.format(args.numdma, all_axi_declarations, all_axi_connections, tdest_1, tdest_2, tdest_3)) kernel_xml = """ - + @@ -241,7 +244,7 @@ -{} +{1} @@ -249,7 +252,7 @@ -{} +{2} """ @@ -265,5 +268,5 @@ all_xml_ports += xml_axi_port.format(i) all_xml_args += xml_axi_arg.format(i,i+6) -with open("kernel.xml", "w") as f: - f.write(kernel_xml.format(all_xml_ports, all_xml_args)) +with open("external_dma_%dport.xml" % args.numdma, "w") as f: + f.write(kernel_xml.format(args.numdma, all_xml_ports, all_xml_args)) diff --git a/kernels/plugins/hostctrl/Makefile b/kernels/plugins/hostctrl/Makefile index febf9f68..3d0cb406 100644 --- a/kernels/plugins/hostctrl/Makefile +++ b/kernels/plugins/hostctrl/Makefile @@ -17,7 +17,7 @@ TARGET=ip DEVICE=xcu250-figd2104-2L-e -HOSTCTRL_IP=hostctrl.xo +HOSTCTRL_IP=hostctrl_$(DEVICE).xo all: $(HOSTCTRL_IP) diff --git a/kernels/plugins/hostctrl/build_hostctrl.tcl b/kernels/plugins/hostctrl/build_hostctrl.tcl index 53effa44..96b128ad 100644 --- a/kernels/plugins/hostctrl/build_hostctrl.tcl +++ b/kernels/plugins/hostctrl/build_hostctrl.tcl @@ -51,14 +51,14 @@ switch $command { } -open_project build_hostctrl +open_project build_hostctrl.${device} add_files hostctrl.cpp -cflags "-std=c++14 -I. 
-I../../../driver/hls/ -DACCL_SYNTHESIS" set_top hostctrl open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/hostctrl.xo +config_export -format xo -library ACCL -output [pwd]/hostctrl_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/hp_compression/Makefile b/kernels/plugins/hp_compression/Makefile index f5b26434..a57630ec 100644 --- a/kernels/plugins/hp_compression/Makefile +++ b/kernels/plugins/hp_compression/Makefile @@ -17,7 +17,7 @@ TARGET=ip DEVICE=xcu250-figd2104-2L-e -REDUCE_IP=hp_compression.xo +REDUCE_IP=hp_compression_$(DEVICE).xo all: $(REDUCE_IP) diff --git a/kernels/plugins/hp_compression/build.tcl b/kernels/plugins/hp_compression/build.tcl index 30338e9c..49af1fb4 100644 --- a/kernels/plugins/hp_compression/build.tcl +++ b/kernels/plugins/hp_compression/build.tcl @@ -50,14 +50,14 @@ switch $command { } } -open_project build_hp_compression +open_project build_hp_compression.${device} add_files hp_compression.cpp -cflags "-std=c++14 -I[pwd]/../../../driver/hls/ -DACCL_SYNTHESIS" set_top hp_compression open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/hp_compression.xo +config_export -format xo -library ACCL -output [pwd]/hp_compression_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/loopback/Makefile b/kernels/plugins/loopback/Makefile index 60a35b0a..c3ad9015 100644 --- a/kernels/plugins/loopback/Makefile +++ b/kernels/plugins/loopback/Makefile @@ -17,7 +17,7 @@ # *******************************************************************************/ DEVICE=xcu250-figd2104-2L-e -LOOPBACK_IP=loopback.xo +LOOPBACK_IP=loopback_$(DEVICE).xo TARGET=ip all: $(LOOPBACK_IP) diff --git a/kernels/plugins/loopback/build_loopback.tcl b/kernels/plugins/loopback/build_loopback.tcl index 3b9dbcfc..797b7c83 100644 --- a/kernels/plugins/loopback/build_loopback.tcl +++ b/kernels/plugins/loopback/build_loopback.tcl @@ -50,14 +50,14 @@ switch $command { } } -open_project 
build_loopback +open_project build_loopback.${device} add_files loopback.cpp -cflags "-std=c++14 -I[pwd]/../../../driver/hls/ -DACCL_SYNTHESIS" set_top loopback open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/loopback.xo +config_export -format xo -library ACCL -output [pwd]/loopback_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/reduce_ops/Makefile b/kernels/plugins/reduce_ops/Makefile index 7c941058..0817e372 100644 --- a/kernels/plugins/reduce_ops/Makefile +++ b/kernels/plugins/reduce_ops/Makefile @@ -19,11 +19,11 @@ TARGET=ip DEVICE=xcu250-figd2104-2L-e DTYPES=float half double int32_t int64_t DWIDTH=512 -REDUCE_IP = reduce_ops.xo +REDUCE_IP = reduce_ops_$(DEVICE).xo all: $(REDUCE_IP) -reduce_ops.xo: build.tcl reduce_ops.cpp +$(REDUCE_IP): build.tcl reduce_ops.cpp vitis_hls $< -tclargs $(TARGET) $(DEVICE) diff --git a/kernels/plugins/reduce_ops/build.tcl b/kernels/plugins/reduce_ops/build.tcl index 47e8eba0..5d8bbf17 100644 --- a/kernels/plugins/reduce_ops/build.tcl +++ b/kernels/plugins/reduce_ops/build.tcl @@ -53,14 +53,14 @@ switch $command { } -open_project build_${ipname} +open_project build_${ipname}.${device} add_files reduce_ops.cpp -cflags "-std=c++14 -I[pwd]/ -I[pwd]/../../../driver/hls/ -DACCL_SYNTHESIS" set_top ${ipname} open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/${ipname}.xo +config_export -format xo -library ACCL -output [pwd]/${ipname}_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/tcp_session_handler/Makefile b/kernels/plugins/tcp_session_handler/Makefile index 2ac5aa80..db730ad7 100644 --- a/kernels/plugins/tcp_session_handler/Makefile +++ b/kernels/plugins/tcp_session_handler/Makefile @@ -15,7 +15,7 @@ # # *******************************************************************************/ DEVICE=xcu280-fsvh2892-2L-e -TCP_STACK_IP=tcp_session_handler.xo +TCP_STACK_IP=tcp_session_handler_$(DEVICE).xo TARGET=ip diff --git 
a/kernels/plugins/tcp_session_handler/build_tcp_session_handler.tcl b/kernels/plugins/tcp_session_handler/build_tcp_session_handler.tcl index f59ee89c..903f7a5e 100644 --- a/kernels/plugins/tcp_session_handler/build_tcp_session_handler.tcl +++ b/kernels/plugins/tcp_session_handler/build_tcp_session_handler.tcl @@ -51,14 +51,14 @@ switch $command { } -open_project tcp_session_handler +open_project tcp_session_handler.${device} -add_files tcp_session_handler.cpp -cflags "-std=c++14 -I../../../driver/hls/ -I../../../hlslib/include/hlslib/xilinx -DACCL_SYNTHESIS" +add_files tcp_session_handler.cpp -cflags "-std=c++14 -I../../../driver/hls/ -I../../../driver/xrt/include/accl/ -I../../../hlslib/include/hlslib/xilinx -DACCL_SYNTHESIS" set_top tcp_session_handler open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/tcp_session_handler.xo +config_export -format xo -library ACCL -output [pwd]/tcp_session_handler_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/tcp_session_handler/tcp_session_handler.cpp b/kernels/plugins/tcp_session_handler/tcp_session_handler.cpp index 94d48deb..c5a057fa 100644 --- a/kernels/plugins/tcp_session_handler/tcp_session_handler.cpp +++ b/kernels/plugins/tcp_session_handler/tcp_session_handler.cpp @@ -16,55 +16,56 @@ # *******************************************************************************/ #include "tcp_session_handler.h" -using namespace std; - void tcp_session_handler( - unsigned int ip, - unsigned int port_nr, - bool close, - unsigned int *session_id, - bool *success, + uint32_t ip, + uint16_t port_nr, + volatile uint16_t *session_id, + volatile uint8_t *return_code, + ACCL::tcpSessionHandlerOperation operation, STREAM>& listen_port, STREAM>& port_status, STREAM>& open_connection, STREAM>& close_connection, STREAM>& open_status ){ -#pragma HLS INTERFACE s_axilite port=ip -#pragma HLS INTERFACE s_axilite port=port_nr -#pragma HLS INTERFACE s_axilite port=close -#pragma HLS INTERFACE 
s_axilite port=success -#pragma HLS INTERFACE s_axilite port=session_id -#pragma HLS INTERFACE axis register both port=listen_port -#pragma HLS INTERFACE axis register both port=port_status -#pragma HLS INTERFACE axis register both port=open_connection -#pragma HLS INTERFACE axis register both port=close_connection -#pragma HLS INTERFACE axis register both port=open_status + #pragma HLS INTERFACE s_axilite port=operation + #pragma HLS INTERFACE s_axilite port=ip + #pragma HLS INTERFACE s_axilite port=port_nr + + #pragma HLS INTERFACE s_axilite port=session_id + #pragma HLS INTERFACE ap_none port=session_id + #pragma HLS INTERFACE s_axilite port=return_code + #pragma HLS INTERFACE ap_none port=return_code + #pragma HLS INTERFACE s_axilite port=return - //first open port, unless the instruction is to close - if(!close){ + #pragma HLS INTERFACE axis register both port=listen_port + #pragma HLS INTERFACE axis register both port=port_status + #pragma HLS INTERFACE axis register both port=open_connection + #pragma HLS INTERFACE axis register both port=close_connection + #pragma HLS INTERFACE axis register both port=open_status + + if (operation == ACCL::tcpSessionHandlerOperation::OPEN_PORT) { ap_axiu<16, 0, 0, 0> listen_port_pkt; listen_port_pkt.data(15,0) = port_nr; STREAM_WRITE(listen_port, listen_port_pkt); + while(STREAM_IS_EMPTY(port_status)) {} ap_axiu<8, 0, 0, 0> port_status_pkt = STREAM_READ(port_status); - *success = port_status_pkt.data; - //if we weren't successful setting up port, stop here - if(!port_status_pkt.data) return; - } - //then open or close connection - if(!close){ + *return_code = port_status_pkt.data; + } else if (operation == ACCL::tcpSessionHandlerOperation::OPEN_CONNECTION) { ap_axiu<64, 0, 0, 0> openConnection_pkt; openConnection_pkt.data(31,0) = ip; openConnection_pkt.data(47,32) = port_nr; STREAM_WRITE(open_connection, openConnection_pkt); - ap_axiu<128, 0, 0, 0> open_status_pkt; - open_status_pkt = STREAM_READ(open_status); + 
while(STREAM_IS_EMPTY(open_status)) {} + ap_axiu<128, 0, 0, 0> open_status_pkt = STREAM_READ(open_status); *session_id = open_status_pkt.data(15,0); - *success = open_status_pkt.data(23,16); - } else { + *return_code = open_status_pkt.data(23, 16); + } else if (operation == ACCL::tcpSessionHandlerOperation::CLOSE_CONNECTION) { ap_axiu<16, 0, 0, 0> closeConnection_pkt; closeConnection_pkt.data = *session_id; STREAM_WRITE(close_connection, closeConnection_pkt); + *return_code = 1; + } else { + *return_code = 0; } - } diff --git a/kernels/plugins/tcp_session_handler/tcp_session_handler.h b/kernels/plugins/tcp_session_handler/tcp_session_handler.h index de69d487..638532bb 100644 --- a/kernels/plugins/tcp_session_handler/tcp_session_handler.h +++ b/kernels/plugins/tcp_session_handler/tcp_session_handler.h @@ -15,13 +15,15 @@ # # *******************************************************************************/ #include "accl_hls.h" +#include "stdint.h" +#include "constants.hpp" void tcp_session_handler( - unsigned int ip, - unsigned int port_nr, - bool close, - unsigned int *session_id, - bool *success, + uint32_t ip, + uint16_t port_nr, + volatile uint16_t *session_id, + volatile uint8_t *return_code, + ACCL::tcpSessionHandlerOperation operation, STREAM>& listen_port, STREAM>& port_status, STREAM>& open_connection, diff --git a/kernels/plugins/vadd_put/Makefile b/kernels/plugins/vadd_put/Makefile index 395ca0a8..61d9d6b5 100644 --- a/kernels/plugins/vadd_put/Makefile +++ b/kernels/plugins/vadd_put/Makefile @@ -16,7 +16,7 @@ # *******************************************************************************/ DEVICE=xcu250-figd2104-2L-e -VADD_IP=vadd_put.xo +VADD_IP=vadd_put_$(DEVICE).xo TARGET=ip all: $(VADD_IP) diff --git a/kernels/plugins/vadd_put/build_vadd_put.tcl b/kernels/plugins/vadd_put/build_vadd_put.tcl index 5ff3c6b4..829df0ef 100644 --- a/kernels/plugins/vadd_put/build_vadd_put.tcl +++ b/kernels/plugins/vadd_put/build_vadd_put.tcl @@ -40,14 +40,14 @@ switch 
$command { } -open_project build_vadd_put +open_project build_vadd_put.${device} add_files vadd_put.cpp -cflags "-std=c++14 -I../../../driver/hls/ -I. -DACCL_SYNTHESIS" set_top vadd_put open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/vadd_put.xo +config_export -format xo -library ACCL -output [pwd]/vadd_put_${device}.xo if {$do_syn} { set_part $device diff --git a/test/host/hls/.gitignore b/test/host/hls_simulator/.gitignore similarity index 100% rename from test/host/hls/.gitignore rename to test/host/hls_simulator/.gitignore diff --git a/test/host/hls/CMakeLists.txt b/test/host/hls_simulator/CMakeLists.txt similarity index 100% rename from test/host/hls/CMakeLists.txt rename to test/host/hls_simulator/CMakeLists.txt diff --git a/test/host/hls/test.cpp b/test/host/hls_simulator/test.cpp similarity index 100% rename from test/host/hls/test.cpp rename to test/host/hls_simulator/test.cpp diff --git a/test/host/hls_tcp_vadd/.gitignore b/test/host/hls_tcp_vadd/.gitignore new file mode 100644 index 00000000..ea1f8052 --- /dev/null +++ b/test/host/hls_tcp_vadd/.gitignore @@ -0,0 +1,11 @@ +CMakeCache.txt +CMakeDoxyfile.in +CMakeDoxygenDefaults.cmake +CMakeFiles/ +Makefile +cmake_install.cmake +doxygen.log +bin +docs +lib +build/* \ No newline at end of file diff --git a/test/host/hls_tcp_vadd/CMakeLists.txt b/test/host/hls_tcp_vadd/CMakeLists.txt new file mode 100644 index 00000000..e887a864 --- /dev/null +++ b/test/host/hls_tcp_vadd/CMakeLists.txt @@ -0,0 +1,53 @@ +# /******************************************************************************* +# Copyright (C) 2022 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# *******************************************************************************/ + +cmake_minimum_required(VERSION 3.9) +project(hls_test_vadd_tcp) + +set(CMAKE_CXX_STANDARD 17) + +# Set useful compile warnings +add_compile_options(-Wall -Wextra -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter) + +if (NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib") +endif() + +if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") +endif() + +set(ACCL_REPO_ROOT ${CMAKE_CURRENT_LIST_DIR}/../../../) +set(HLSLIB_INCLUDE ${ACCL_REPO_ROOT}/hlslib/include/hlslib/xilinx/) +set(ACCL_HLSDRV_DIR ${ACCL_REPO_ROOT}/driver/hls/) +set(CCLO_BFM_DIR ${ACCL_REPO_ROOT}/test/model/bfm/) +set(VADD_DIR ${ACCL_REPO_ROOT}/kernels/plugins/vadd_put/) +set(EMU_INCLUDES ${HLSLIB_INCLUDE} $ENV{XILINX_HLS}/include/ ${ACCL_HLSDRV_DIR} ${CCLO_BFM_DIR}) + +set(ACCL_DEBUG 1) +set(ACCL_NETWORK_UTILS_MPI 1) +set(ACCL_NETWORK_UTILS_DEBUG 1) +set(CMAKE_BUILD_TYPE Debug) + +add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../../../driver/utils/accl_network_utils ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/accl_network_utils) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../model/bfm ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cclobfm) + +find_package(MPI REQUIRED) +add_executable(test test.cpp ${CCLO_BFM_DIR}/cclo_bfm.cpp ${VADD_DIR}/vadd_put.cpp) +target_include_directories(test PUBLIC ${MPI_CXX_INCLUDE_PATH} ${ACCL_INCLUDE_PATH} ${EMU_INCLUDES} ${VADD_DIR}) +target_link_libraries(test 
PUBLIC MPI::MPI_CXX zmq zmqpp pthread cclobfm accl_network_utils) +target_compile_options(test PRIVATE -w -fdiagnostics-color=always -g) diff --git a/test/host/hls_tcp_vadd/README.md b/test/host/hls_tcp_vadd/README.md new file mode 100644 index 00000000..ffb3e210 --- /dev/null +++ b/test/host/hls_tcp_vadd/README.md @@ -0,0 +1,41 @@ +# ACCL-distributed Vector Addition over TCP/IP + +This directory contains a minimal working example of an FPGA-driven distributed application. Each FPGA (node) generates a unique vector, of variable size, based on its ID. Then, one is added to each element of the vector and it is transmitted to the next node. Therefore, each node: +1. Performs some floating-point computation +2. Sends a vector to a neighbouring node +3. Receives a vector from a neighbouring node + +## Running in simulation +The following steps describe how to run the example in emulation/simulation. For more information on ACCL simulation/emulation, please see [here](https://ethz.ch/content/dam/ethz/special-interest/infk/inst-cp/inst-cp-dam/research/data-processing-on-modern-hardware/ACCL_Sim_FPGA23_Tutorial.pdf). First launch the emulator/simulator, as described in the [INSTALL.md](https://github.com/Xilinx/ACCL/blob/main/INSTALL.md): +```bash +cd "/test/model/emulator" +source /settings64.sh +/bin/cmake . +python3 run.py -n +``` + +Then, compile the program and run it using mpirun: +```bash +mkdir build && cd build +/bin/cmake .. && make +cd .. +mpirun -n 2 build/bin/test +``` + +## Running in hardware +The following steps describe how to run the example on hardware. The design is deployed in [ETH HACC](https://systems.ethz.ch/research/data-processing-on-modern-hardware/hacc.html) on Alveo U55C boards; however, the scripts can be modified to run on any compatible FPGA cluster. First, create a bitstream (which will take some time...)
+ ```bash +source /settings64.sh +cd "/test/refdesigns" +make MODE=tcp USER_KERNEL=vadd PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 +``` +Once complete, compile the source host code: +```bash +mkdir build && cd build +/bin/cmake .. && make +cd .. +``` +Finally, launch the application using the run script. If needed, modify the script to the target cluster set-up +```bash +bash run_ethz_hacc_alveo_u55c.sh +``` diff --git a/test/host/hls_tcp_vadd/run_ethz_hacc_alveo_u55c.sh b/test/host/hls_tcp_vadd/run_ethz_hacc_alveo_u55c.sh new file mode 100644 index 00000000..8b93f528 --- /dev/null +++ b/test/host/hls_tcp_vadd/run_ethz_hacc_alveo_u55c.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +echo "Enter ETHZ HACC Alveo U55C machine IDs (space separated, e.g. 4 5):" +read -a SERVID + +echo "Enter path to ACCL .xclbin driver after bitstream generation:" +read -a XCLBIN_PATH + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +HOST_FILE=$SCRIPT_DIR/host.txt +FPGA_FILE=$SCRIPT_DIR/fpga.json +rm $HOST_FILE $FPGA_FILE + +# Obtain CPU (needed for launching MPI process) and FPGA (needed for ACCL-EasyNet) IPs +NP=0 +for ID in ${SERVID[@]}; do + echo "10.253.74.$((($ID - 1) * 4 + 66))" >> $HOST_FILE + fpgaip+="\"10.253.74.$((($ID - 1) * 4 + 68))\"," + hostlist+="alveo-u55c-$(printf "%02d" $servid) " + NP=$((NP+1)) +done +echo "{\"ips\": [${fpgaip::-1}]}" >> $FPGA_FILE + +# Run application +mpirun -np $NP -iface ens4f0 -f $HOST_FILE $SCRIPT_DIR/build/bin/test -f -c $FPGA_FILE -x $XCLBIN_PATH & +sleep 30 + +# Kill process, clean-up IP files and reset device +rm $HOST_FILE $FPGA_FILE +parallel-ssh -H "$hostlist" "kill -9 \$(ps -aux | grep test | awk '{print \$2}')" +parallel-ssh -H "$hostlist" "xbutil reset --force --device 0000:c4:00.1" + +# /home/bramhorst/accl_vadd/test/refdesigns/link_tcp_xilinx_u55c_gen3x16_xdma_3_202210_1_1/ccl_offload.xclbin +# /home/bramhorst/accl_vadd/test/refdesigns/link_tcp_xilinx_u55c_gen3x16_xdma_3_202210_1_2/ccl_offload.xclbin \ No 
newline at end of file diff --git a/test/host/hls_tcp_vadd/test.cpp b/test/host/hls_tcp_vadd/test.cpp new file mode 100644 index 00000000..4e762554 --- /dev/null +++ b/test/host/hls_tcp_vadd/test.cpp @@ -0,0 +1,236 @@ +/******************************************************************************* +# Copyright (C) 2022 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +*******************************************************************************/ + +// ACCL-specific includes +#include +#include + +// For initalizing the MPI environment +#include + +// For parsing CLI arguments +#include + +// For storing a list of the ACCL ranks/nodes (incl. IP address, port etc.) +#include + +// For communicating with the FPGA from the host CPU +#include + +// Standard I/O +#include + +// HLS implementation of vector add kernel. Found in ACCL/kernels/plugins/vadd +#include "vadd_put.h" + +// Used only during simulation; bus-functional model (BFM) of ACCL CCLO +#include "cclo_bfm.h" + +struct options_t { + unsigned int start_port; + unsigned int rxbuf_size; + unsigned int segment_size; + unsigned int count; + unsigned int device_index; + bool hardware; + bool rsfec; + std::string xclbin; + std::string config_file; +}; + +options_t parse_options(int argc, char *argv[]) { + TCLAP::CmdLine cmd("Test HLS ACCL C++ driver. 
Performs local vector addition and sends the result to a neighbouring node"); + TCLAP::ValueArg start_port_arg( + "p", "start-port", "Start of range of ports", false, 5500, "positive integer" + ); + cmd.add(start_port_arg); + + TCLAP::ValueArg count_arg( + "s", "count", "How many elements in the vector", false, 16, "positive integer" + ); + cmd.add(count_arg); + + TCLAP::ValueArg bufsize_arg( + "b", "rxbuf-size", "How many KB per RX buffer", false, 1, "positive integer" + ); + cmd.add(bufsize_arg); + + TCLAP::SwitchArg hardware_arg( + "f", "hardware", "Enable hardware mode", cmd, false + ); + + TCLAP::ValueArg xclbin_arg( + "x", "xclbin", "xclbin file of ACCL driver if hardware mode is used", false, "accl.xclbin", "file" + ); + cmd.add(xclbin_arg); + + TCLAP::ValueArg device_index_arg( + "i", "device-index", "device index of FPGA if hardware mode is used", false, 0, "positive integer" + ); + cmd.add(device_index_arg); + + TCLAP::ValueArg config_arg( + "c", "config", "Config file containing IP mapping", false, "", "JSON file" + ); + cmd.add(config_arg); + + TCLAP::SwitchArg rsfec_arg( + "", "rsfec", "Enables RS-FEC in CMAC.", cmd, false + ); + + try { + cmd.parse(argc, argv); + } catch (std::exception &e) { + std::cout << "Error: " << e.what() << std::endl; + MPI_Finalize(); + exit(1); + } + + options_t opts; + opts.start_port = start_port_arg.getValue(); + opts.count = count_arg.getValue(); + opts.rxbuf_size = bufsize_arg.getValue() * 1024; // convert to bytes + opts.segment_size = opts.rxbuf_size; + opts.hardware = hardware_arg.getValue(); + opts.xclbin = xclbin_arg.getValue(); + opts.device_index = device_index_arg.getValue(); + opts.config_file = config_arg.getValue(); + opts.rsfec = rsfec_arg.getValue(); + return opts; +} + +void test_vadd_put(ACCL::ACCL &accl, xrt::device &device, options_t options, int current_rank, int world_size) { + // Allocate float arrays for the HLS function to use + float src[options.count], dst[options.count]; + for(int i = 0; i < 
options.count; i++){ + src[i] = 1.0 * (options.count * current_rank + i); + } + + if (options.hardware) { + // Instantiate vector-addition kernel from hardware + xrt::kernel vadd_ip = xrt::kernel( + device, + device.get_xclbin_uuid(), + "vadd_put:{vadd_0_0}", + xrt::kernel::cu_access_mode::exclusive + ); + + // Allocated buffers for input and output data + // Need to use XRT API because vector-addition kernel might use different HBM banks than ACCL + auto src_bo = xrt::bo(device, sizeof(float) * options.count, vadd_ip.group_id(0)); + auto dst_bo = xrt::bo(device, sizeof(float) * options.count, vadd_ip.group_id(1)); + + // Sync data, run kernel and wait for output + src_bo.write(src); + src_bo.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + xrt::run run = vadd_ip( + src_bo, dst_bo, options.count, + (current_rank + 1) % world_size, + accl.get_communicator_addr(), + accl.get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}) + ); + run.wait(10000); + + dst_bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + dst_bo.read(dst); + + } else { + // Initialize a CCLO BFM (simulation-only) and streams as needed + hlslib::Stream callreq, callack; + hlslib::Stream data_cclo2krnl, data_krnl2cclo; + std::vector dest = {9}; + CCLO_BFM cclo( + options.start_port, current_rank, world_size, + dest, callreq, callack, data_cclo2krnl, data_krnl2cclo + ); + cclo.run(); + std::cout << "CCLO BFM started" << std::endl; + + // Wait for all nodes to initalize the BFM + MPI_Barrier(MPI_COMM_WORLD); + + // Run the HLS function, using the global communicator + vadd_put( + src, dst, options.count, + (current_rank + 1) % world_size, + accl.get_communicator_addr(), + accl.get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), + callreq, callack, + data_krnl2cclo, data_cclo2krnl + ); + + // Stop the BFM + cclo.stop(); + } + + // Check HLS function outputs + unsigned int err_count = 0; + for(int i=0; i < options.count; i++){ + float expected = 1.0 * 
(options.count*((current_rank + world_size - 1) % world_size) + i) + 1; + if(dst[i] != expected){ + err_count++; + std::cout << "Mismatch at [" << i << "]: got " << dst[i] << " vs expected " << expected << std::endl; + } + } + + std::cout << "RANK: " << current_rank << " - TEST FINISHED WITH " << err_count << " ERRORS!" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); +} + +int main(int argc, char *argv[]) { + // Initialize the MPI world, identify the id (current_rank) of the node + MPI_Init(&argc, &argv); + int current_rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &current_rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + std::cout << "current_rank: " << current_rank << " world_size: " << world_size << std::endl; + + // Parse CLI arguments + options_t options = parse_options(argc, argv); + + // Generate a list of ACCL ranks (incl. IP, port, RX buffer size) + std::vector ranks; + if (options.config_file == "") { + ranks = accl_network_utils::generate_ranks( + true, current_rank, world_size, options.start_port, options.rxbuf_size + ); + } else { + ranks = accl_network_utils::generate_ranks( + options.config_file, current_rank, options.start_port, options.rxbuf_size + ); + } + + // Initialize ACCL + xrt::device device{}; + if (options.hardware) device = xrt::device(options.device_index); + accl_network_utils::acclDesign design = accl_network_utils::acclDesign::TCP; + std::unique_ptr accl = accl_network_utils::initialize_accl( + ranks, current_rank, !options.hardware, design, device, options.xclbin, 16, + options.rxbuf_size, options.segment_size, options.rsfec + ); + accl->set_timeout(1e6); + + // Wait until all ranks have finished setting-up and run test + MPI_Barrier(MPI_COMM_WORLD); + test_vadd_put(*accl, device, options, current_rank, world_size); + + // Finalize + MPI_Finalize(); + return 0; +} diff --git a/test/host/xrt/CMakeLists.txt b/test/host/xrt/CMakeLists.txt index affbb8f6..c5dc5012 100644 --- a/test/host/xrt/CMakeLists.txt +++ 
b/test/host/xrt/CMakeLists.txt @@ -19,6 +19,8 @@ cmake_minimum_required(VERSION 3.9) project(xrt_test) set(CMAKE_CXX_STANDARD 17) +set(ACCL_SIM_NUM_BANKS 2 CACHE STRING "Number of ACCL simulator memory banks (must correspond to simdll)") +set(ACCL_SIM_MEM_SIZE_KB 262144 CACHE STRING "Size of ACCL simulator memory, in KB (must correspond to simdll)") #GTest config include(FetchContent) diff --git a/test/host/xrt/include/fixture.hpp b/test/host/xrt/include/fixture.hpp index 6250c4af..14e539e1 100644 --- a/test/host/xrt/include/fixture.hpp +++ b/test/host/xrt/include/fixture.hpp @@ -42,6 +42,7 @@ inline int size; inline pid_t emulator_pid; inline options_t options; inline xrt::device dev; +inline ACCL::CoyoteDevice* cyt_dev; inline std::unique_ptr accl; inline std::ofstream csvstream; @@ -66,9 +67,21 @@ class TestEnvironment : public ::testing::Environment { design = acclDesign::UDP; } else if (options.tcp) { design = acclDesign::TCP; + } else if (options.cyt_tcp) { + design = acclDesign::CYT_TCP; + } else if (options.cyt_rdma) { + design = acclDesign::CYT_RDMA; } - if (options.hardware || options.test_xrt_simulator) { + if(options.hardware){ + if(options.cyt_rdma) { + cyt_dev = new ACCL::CoyoteDevice(::size); + } else if (options.cyt_tcp){ + cyt_dev = new ACCL::CoyoteDevice(); + } else { + dev = xrt::device(options.device_index); + } + } else if (options.test_xrt_simulator) { dev = xrt::device(options.device_index); } @@ -84,12 +97,17 @@ class TestEnvironment : public ::testing::Environment { options.benchmark = false; } - accl = initialize_accl( - ranks, ::rank, !options.hardware, design, dev, options.xclbin, options.rxbuf_count, - options.rxbuf_size, options.segment_size, options.rsfec); - std::cout << "Setting up TestEnvironment" << std::endl; + + if(options.hardware && (options.cyt_rdma || options.cyt_tcp)){ + accl = std::make_unique(cyt_dev); + accl.get()->initialize(ranks, ::rank, options.rxbuf_count, options.rxbuf_size, options.max_eager_count); + } else { + 
accl = initialize_accl( + ranks, ::rank, !options.hardware, design, dev, options.xclbin, options.rxbuf_count, + options.rxbuf_size, options.max_eager_count, options.rsfec); + } + std::cout << "Done setting up TestEnvironment" << std::endl; accl->set_timeout(1e6); - accl->set_rendezvous_threshold(options.max_eager_count); } diff --git a/test/host/xrt/include/utility.hpp b/test/host/xrt/include/utility.hpp index e7a4e17f..5987d83e 100644 --- a/test/host/xrt/include/utility.hpp +++ b/test/host/xrt/include/utility.hpp @@ -31,7 +31,6 @@ struct options_t { int start_port; unsigned int rxbuf_size; unsigned int rxbuf_count; - unsigned int segment_size; unsigned int max_eager_count; unsigned int count; unsigned int device_index; diff --git a/test/host/xrt/src/bench.cpp b/test/host/xrt/src/bench.cpp index 11dd1d93..79017ff0 100644 --- a/test/host/xrt/src/bench.cpp +++ b/test/host/xrt/src/bench.cpp @@ -113,7 +113,7 @@ options_t parse_options(int argc, char *argv[]) { opts.count = count_arg.getValue(); opts.rxbuf_size = 4*opts.count; // 4 MB by default opts.rxbuf_count = 16; - opts.segment_size = opts.rxbuf_size; + opts.max_eager_count = opts.rxbuf_size; opts.axis3 = axis3_arg.getValue(); opts.benchmark = true; diff --git a/test/host/xrt/src/stress.cpp b/test/host/xrt/src/stress.cpp index 5fd395ac..09174e0b 100644 --- a/test/host/xrt/src/stress.cpp +++ b/test/host/xrt/src/stress.cpp @@ -92,7 +92,7 @@ options_t parse_options(int argc, char *argv[]) { opts.count = count_arg.getValue(); opts.rxbuf_count = bufcount_arg.getValue(); opts.rxbuf_size = bufsize_arg.getValue() * 1024; // convert to bytes - opts.segment_size = opts.rxbuf_size; + opts.max_eager_count = opts.rxbuf_size; opts.debug = debug_arg.getValue(); opts.hardware = hardware_arg.getValue(); opts.axis3 = axis3_arg.getValue(); diff --git a/test/host/xrt/src/test.cpp b/test/host/xrt/src/test.cpp index 3bd88827..adacef88 100644 --- a/test/host/xrt/src/test.cpp +++ b/test/host/xrt/src/test.cpp @@ -34,6 +34,8 @@ 
TEST_F(ACCLTest, test_copy){ unsigned int count = options.count; auto op_buf = accl->create_buffer(count, dataType::float32); auto res_buf = accl->create_buffer(count, dataType::float32); + EXPECT_FALSE(op_buf->is_host_only()); + EXPECT_FALSE(res_buf->is_host_only()); random_array(op_buf->buffer(), count); accl->copy(*op_buf, *res_buf, count); @@ -84,6 +86,84 @@ TEST_F(ACCLTest, test_copy_p2p) { } } +TEST_F(ACCLTest, test_copy_d2h) { + if(::size > 1){ + GTEST_SKIP() << "Skipping single-node test on multi-node setup"; + } + unsigned int count = options.count; + auto op_buf = accl->create_buffer(count, dataType::float32); + EXPECT_FALSE(op_buf->is_host_only()); + std::unique_ptr> res_buf; + try { + res_buf = accl->create_buffer_host(count, dataType::float32); + EXPECT_TRUE(res_buf->is_host_only()); + } catch (const std::bad_alloc &e) { + std::cout << "Can't allocate host buffer (" << e.what() << "). " + << "This probably means HOST mem is disabled.\n" + << "Skipping host buffer test..." << std::endl; + return; + } + random_array(op_buf->buffer(), count); + + accl->copy(*op_buf, *res_buf, count); + + for (unsigned int i = 0; i < count; ++i) { + EXPECT_FLOAT_EQ((*op_buf)[i], (*res_buf)[i]); + } +} + +TEST_F(ACCLTest, test_copy_h2d) { + if(::size > 1){ + GTEST_SKIP() << "Skipping single-node test on multi-node setup"; + } + unsigned int count = options.count; + auto res_buf = accl->create_buffer(count, dataType::float32); + EXPECT_FALSE(res_buf->is_host_only()); + std::unique_ptr> op_buf; + try { + op_buf = accl->create_buffer_host(count, dataType::float32); + EXPECT_TRUE(op_buf->is_host_only()); + } catch (const std::bad_alloc &e) { + std::cout << "Can't allocate host buffer (" << e.what() << "). " + << "This probably means HOST mem is disabled.\n" + << "Skipping host buffer test..." 
<< std::endl; + return; + } + random_array(op_buf->buffer(), count); + + accl->copy(*op_buf, *res_buf, count); + + for (unsigned int i = 0; i < count; ++i) { + EXPECT_FLOAT_EQ((*op_buf)[i], (*res_buf)[i]); + } +} + +TEST_F(ACCLTest, test_copy_h2h) { + if(::size > 1){ + GTEST_SKIP() << "Skipping single-node test on multi-node setup"; + } + unsigned int count = options.count; + std::unique_ptr> op_buf, res_buf; + try { + op_buf = accl->create_buffer_host(count, dataType::float32); + EXPECT_TRUE(op_buf->is_host_only()); + res_buf = accl->create_buffer_host(count, dataType::float32); + EXPECT_TRUE(res_buf->is_host_only()); + } catch (const std::bad_alloc &e) { + std::cout << "Can't allocate host buffer (" << e.what() << "). " + << "This probably means HOST mem is disabled.\n" + << "Skipping host buffer test..." << std::endl; + return; + } + random_array(op_buf->buffer(), count); + + accl->copy(*op_buf, *res_buf, count); + + for (unsigned int i = 0; i < count; ++i) { + EXPECT_FLOAT_EQ((*op_buf)[i], (*res_buf)[i]); + } +} + TEST_P(ACCLFuncTest, test_combine) { if(::size > 1){ GTEST_SKIP() << "Skipping single-node test on multi-node setup"; @@ -266,7 +346,7 @@ TEST_P(ACCLSegmentationTest, test_sendrcv_segmentation){ if(::size == 1){ GTEST_SKIP() << "Skipping send/recv test on single-node setup"; } - unsigned int count_per_segment = options.segment_size / (dataTypeSize.at(dataType::float32) / 8); + unsigned int count_per_segment = options.rxbuf_size / (dataTypeSize.at(dataType::float32) / 8); unsigned int multiplier = std::get<0>(GetParam()); int offset = std::get<1>(GetParam()); unsigned int count; @@ -780,6 +860,34 @@ TEST_P(ACCLRootFuncTest, test_reduce) { } } +TEST_F(ACCLTest, test_reduce_h2h) { + int root = 0; + reduceFunction function = reduceFunction::SUM; + + unsigned int count = options.count; + unsigned int count_bytes = count * dataTypeSize.at(dataType::float32) / 8; + + auto op_buf = accl->create_buffer_host(count, dataType::float32); + auto tmp_buf = 
accl->create_buffer_host(count, dataType::float32); + auto res_buf = accl->create_buffer_host(count, dataType::float32); + random_array(op_buf->buffer(), count); + + test_debug("Reduce data to " + std::to_string(root) + "...", options); + accl->reduce(*op_buf, *tmp_buf, count, root, function); + accl->reduce((::rank == root) ? *tmp_buf : *op_buf, *res_buf, count, root, function); + + float res, ref; + if (::rank == root) { + for (unsigned int i = 0; i < count; ++i) { + res = (*res_buf)[i]; + ref = (*op_buf)[i] * (2*::size-1); + EXPECT_FLOAT_EQ(res, ref); + } + } else { + EXPECT_TRUE(true); + } +} + TEST_P(ACCLRootFuncTest, test_reduce_compressed) { int root = std::get<0>(GetParam()); reduceFunction function = std::get<1>(GetParam()); @@ -979,6 +1087,25 @@ TEST_P(ACCLFuncTest, test_allreduce) { } } +TEST_F(ACCLTest, test_allreduce_h2h) { + reduceFunction function = reduceFunction::SUM; + + unsigned int count = options.count; + auto op_buf = accl->create_buffer_host(count, dataType::float32); + auto res_buf = accl->create_buffer_host(count, dataType::float32); + random_array(op_buf->buffer(), count); + + test_debug("Reducing data...", options); + accl->allreduce(*op_buf, *res_buf, count, function); + + float res, ref; + for (unsigned int i = 0; i < count; ++i) { + res = (*res_buf)[i]; + ref = (*op_buf)[i] *::size; + EXPECT_FLOAT_EQ(res, ref); + } +} + TEST_P(ACCLFuncTest, test_allreduce_compressed) { reduceFunction function = GetParam(); if((function != reduceFunction::SUM) && (function != reduceFunction::MAX)){ @@ -1077,7 +1204,7 @@ options_t parse_options(int argc, char *argv[]) { false, "", "string"); TCLAP::ValueArg max_eager_arg("", "max-eager-count", "Maximum byte count for eager mode", false, - 16*1024, "positive integer"); + 3*1024, "positive integer"); cmd.add(max_eager_arg); try { cmd.parse(argc, argv); @@ -1100,7 +1227,6 @@ options_t parse_options(int argc, char *argv[]) { opts.count = count_arg.getValue(); opts.rxbuf_count = bufcount_arg.getValue(); 
opts.rxbuf_size = bufsize_arg.getValue() * 1024; // convert to bytes - opts.segment_size = std::min((unsigned)opts.rxbuf_size, (unsigned)4*1024*1024); //min of rxbuf_size and max_btt opts.debug = debug_arg.getValue(); opts.hardware = hardware_arg.getValue(); opts.axis3 = axis3_arg.getValue(); diff --git a/test/model/bfm/CMakeLists.txt b/test/model/bfm/CMakeLists.txt index 6b022a55..1077c455 100644 --- a/test/model/bfm/CMakeLists.txt +++ b/test/model/bfm/CMakeLists.txt @@ -84,7 +84,7 @@ get_target_property(JSON_INC_PATH jsoncpp_lib INTERFACE_INCLUDE_DIRECTORIES) add_library(cclobfm SHARED ${CCLO_BFM_SOURCES}) target_include_directories(cclobfm PUBLIC ${ACCL_XRTDRV_INCLUDE_PATH} ${ACCL_HLSDRV_INCLUDE_PATH} ${CCLO_BFM_INCLUDE_PATH} ${HLSLIB_INCLUDE} ${HLS_INCLUDES} ${JSON_INC_PATH} $ENV{XILINX_XRT}/include) -target_link_libraries(cclobfm PUBLIC jsoncpp_lib zmqpp zmq pthread) +target_link_libraries(cclobfm PUBLIC jsoncpp_lib zmq pthread) set_target_properties(cclobfm PROPERTIES VERSION ${PROJECT_VERSION} diff --git a/test/model/bfm/cclo_bfm.h b/test/model/bfm/cclo_bfm.h index 58017ecd..0edd5e17 100644 --- a/test/model/bfm/cclo_bfm.h +++ b/test/model/bfm/cclo_bfm.h @@ -20,6 +20,7 @@ #include "accl_hls.h" #include "accl/simbuffer.hpp" #include +#include /** * @brief Class providing a bus-functional model (at HLS Stream level) of the ACCL CCLO kernel. Connects to the emulator/simulator. 
diff --git a/test/model/emulator/CMakeLists.txt b/test/model/emulator/CMakeLists.txt index 5c3bcad2..3d4faaf0 100644 --- a/test/model/emulator/CMakeLists.txt +++ b/test/model/emulator/CMakeLists.txt @@ -75,7 +75,7 @@ add_executable(cclo_emu ${EMU_SOURCES}) get_target_property(JSON_INC_PATH jsoncpp_lib INTERFACE_INCLUDE_DIRECTORIES) target_include_directories(cclo_emu PUBLIC ${JSON_INC_PATH} ${EMU_INCLUDES}) -target_link_libraries(cclo_emu PUBLIC zmq zmqpp pthread jsoncpp_lib) +target_link_libraries(cclo_emu PUBLIC zmq pthread jsoncpp_lib) target_compile_definitions(cclo_emu PUBLIC MB_FW_EMULATION NUM_CTRL_STREAMS=3) if (NOT EXISTS $ENV{XILINX_HLS}) target_compile_definitions(cclo_emu PUBLIC OSS_HALF_PRECISION) diff --git a/test/model/emulator/cclo_emu.cpp b/test/model/emulator/cclo_emu.cpp index 54558063..578cc482 100644 --- a/test/model/emulator/cclo_emu.cpp +++ b/test/model/emulator/cclo_emu.cpp @@ -54,16 +54,18 @@ using namespace hlslib; Log logger; -void dma_read(vector &mem, Stream > &cmd, Stream > &sts, Stream &rdata){ - axi::Command<64, 23> command = axi::Command<64, 23>(cmd.Pop().data); +void dma_read(vector &dmem, vector &hmem, Stream > &cmd, Stream > &sts, Stream &rdata){ + ap_axiu<104,0,0,DEST_WIDTH> cmd_word = cmd.Pop(); + axi::Command<64, 23> command = axi::Command<64, 23>(cmd_word.data); + bool host = (cmd_word.dest == 1); axi::Status status; stream_word tmp; - logger << log_level::verbose << "DMA Read: Command popped. length: " << command.length << " offset: " << command.address << " EOF: " << command.eof << endl; + logger << log_level::verbose << "DMA " << (host ? "host" : "device") << " read: Command popped. length: " << command.length << " offset: " << command.address << " EOF: " << command.eof << endl; int byte_count = 0; while(byte_count < command.length){ tmp.keep = 0; for(int i=0; i<64 && byte_count < command.length; i++){ - tmp.data(8*(i+1)-1, 8*i) = mem.at(command.address+byte_count); + tmp.data(8*(i+1)-1, 8*i) = host ? 
hmem.at(command.address+byte_count) : dmem.at(command.address+byte_count); tmp.keep(i,i) = 1; byte_count++; } @@ -76,17 +78,23 @@ void dma_read(vector &mem, Stream > &cmd, Stre logger("DMA Read: Status pushed\n", log_level::verbose); } -void dma_write(vector &mem, Stream > &cmd, Stream > &sts, Stream &wdata){ - axi::Command<64, 23> command = axi::Command<64, 23>(cmd.Pop().data); +void dma_write(vector &dmem, vector &hmem, Stream > &cmd, Stream > &sts, Stream &wdata){ + ap_axiu<104,0,0,DEST_WIDTH> cmd_word = cmd.Pop(); + axi::Command<64, 23> command = axi::Command<64, 23>(cmd_word.data); + bool host = (cmd_word.dest == 1); axi::Status status; stream_word tmp; - logger << log_level::verbose << "DMA Write: Command popped. length: " << command.length << " offset: " << command.address << " EOF: " << command.eof << endl; + logger << log_level::verbose << "DMA " << (host ? "host" : "device") << " write: Command popped. length: " << command.length << " offset: " << command.address << " EOF: " << command.eof << endl; int byte_count = 0; while(byte_count devicemem; + vector devicemem, hostmem; Stream, 32> host_cmd("host_cmd"); Stream, 32> host_sts("host_sts"); @@ -343,14 +351,56 @@ void sim_bd(zmq_intf_context *ctx, string comm_backend, unsigned int local_rank, unsigned int max_words_per_pkt = (use_cyt_rdma ? 
4096 : MAX_PACKETSIZE)/DATAPATH_WIDTH_BYTES; + SetName(cmd_fifos[CMD_CALL_RETRY], "cmd_call_retry"); + SetName(sts_fifos[STS_CALL_RETRY], "sts_call_retry"); + SetName(cmd_fifos[CMD_DMA_MOVE], "cmd_dma_move"); + SetName(sts_fifos[STS_DMA_MOVE], "sts_dma_move"); + SetName(cmd_fifos[CMD_RNDZV], "cmd_rndzv"); + SetName(sts_fifos[STS_RNDZV], "sts_rndzv"); + SetName(cmd_fifos[CMD_RNDZV_PENDING], "cmd_rndzv_pending"); + SetName(sts_fifos[STS_RNDZV_PENDING], "sts_rndzv_pending"); + SetName(cmd_fifos[CMD_CALL], "cmd_call"); + SetName(sts_fifos[STS_CALL], "sts_call"); + + SetName(dma_write_cmd_int[1], "dma_write_cmd_int_0"); + SetName(dma_write_cmd_int[0], "dma_write_cmd_int_1"); + SetName(dma_read_cmd_int[1], "dma_read_cmd_int_0"); + SetName(dma_read_cmd_int[0], "dma_read_cmd_int_1"); + SetName(dma_write_sts_int[1], "dma_write_sts_int_0"); + SetName(dma_write_sts_int[0], "dma_write_sts_int_1"); + SetName(dma_read_sts_int[1], "dma_read_sts_int_0"); + SetName(dma_read_sts_int[0], "dma_read_sts_int_1"); + SetName(dma_read_data[1], "dma_read_data_0"); + SetName(dma_read_data[0], "dma_read_data_1"); + + SetName(switch_s[7], "switch_s_7"); + SetName(switch_s[6], "switch_s_6"); + SetName(switch_s[5], "switch_s_5"); + SetName(switch_s[4], "switch_s_4"); + SetName(switch_s[3], "switch_s_3"); + SetName(switch_s[2], "switch_s_2"); + SetName(switch_s[1], "switch_s_1"); + SetName(switch_s[0], "switch_s_0"); + + SetName(switch_m[9], "switch_m_9"); + SetName(switch_m[8], "switch_m_8"); + SetName(switch_m[7], "switch_m_7"); + SetName(switch_m[6], "switch_m_6"); + SetName(switch_m[5], "switch_m_5"); + SetName(switch_m[4], "switch_m_4"); + SetName(switch_m[3], "switch_m_3"); + SetName(switch_m[2], "switch_m_2"); + SetName(switch_m[1], "switch_m_1"); + SetName(switch_m[0], "switch_m_0"); + // Dataflow functions running in parallel HLSLIB_DATAFLOW_INIT(); //DMA0 - HLSLIB_FREERUNNING_FUNCTION(dma_write, devicemem, dma_write_cmd_int[0], dma_write_sts_int[0], switch_m[SWITCH_M_DMA0_WRITE]); - 
HLSLIB_FREERUNNING_FUNCTION(dma_read, devicemem, dma_read_cmd_int[0], dma_read_sts_int[0], dma_read_data[0]); + HLSLIB_FREERUNNING_FUNCTION(dma_write, devicemem, hostmem, dma_write_cmd_int[0], dma_write_sts_int[0], switch_m[SWITCH_M_DMA0_WRITE]); + HLSLIB_FREERUNNING_FUNCTION(dma_read, devicemem, hostmem, dma_read_cmd_int[0], dma_read_sts_int[0], dma_read_data[0]); //DMA1 - HLSLIB_FREERUNNING_FUNCTION(dma_write, devicemem, dma_write_cmd_int[1], dma_write_sts_int[1], switch_m[SWITCH_M_DMA1_WRITE]); - HLSLIB_FREERUNNING_FUNCTION(dma_read, devicemem, dma_read_cmd_int[1], dma_read_sts_int[1], dma_read_data[1]); + HLSLIB_FREERUNNING_FUNCTION(dma_write, devicemem, hostmem, dma_write_cmd_int[1], dma_write_sts_int[1], switch_m[SWITCH_M_DMA1_WRITE]); + HLSLIB_FREERUNNING_FUNCTION(dma_read, devicemem, hostmem, dma_read_cmd_int[1], dma_read_sts_int[1], dma_read_data[1]); //RX buffer handling offload HLSLIB_FREERUNNING_FUNCTION(rxbuf_enqueue, enq2sess_dma_cmd, inflight_rxbuf, cfgmem); HLSLIB_FREERUNNING_FUNCTION(rxbuf_dequeue, sess2deq_dma_sts, eth_rx_sts_sess, inflight_rxbuf_sess, eth_rx_notif, cfgmem); @@ -423,7 +473,7 @@ void sim_bd(zmq_intf_context *ctx, string comm_backend, unsigned int local_rank, rdma_wr_data, rdma_wr_cmd, rdma_wr_sts, eth_rx_data, eth_tx_data ); - HLSLIB_FREERUNNING_FUNCTION(dma_write, devicemem, rdma_wr_cmd, rdma_wr_sts, rdma_wr_data); + HLSLIB_FREERUNNING_FUNCTION(dma_write, devicemem, hostmem, rdma_wr_cmd, rdma_wr_sts, rdma_wr_data); } else{ HLSLIB_FREERUNNING_FUNCTION(udp_packetizer, switch_m[SWITCH_M_ETH_TX], eth_tx_data, eth_tx_cmd, eth_tx_sts, max_words_per_pkt); HLSLIB_FREERUNNING_FUNCTION(udp_depacketizer, eth_rx_data, switch_s[SWITCH_S_ETH_RX], eth_rx_sts, eth_notif_out_dpkt); @@ -440,7 +490,7 @@ void sim_bd(zmq_intf_context *ctx, string comm_backend, unsigned int local_rank, HLSLIB_FREERUNNING_FUNCTION(client_arbiter, callreq_arb_host, callack_arb_host, callreq_arb[2], callack_arb[2], sts_fifos[CMD_CALL], cmd_fifos[STS_CALL]); //ZMQ to host 
process - HLSLIB_FREERUNNING_FUNCTION(serve_zmq, ctx, cfgmem, devicemem, callreq_fifos, callack_fifos); + HLSLIB_FREERUNNING_FUNCTION(serve_zmq, ctx, cfgmem, devicemem, hostmem, callreq_fifos, callack_fifos); //ZMQ to other nodes process(es) HLSLIB_FREERUNNING_FUNCTION(eth_endpoint_egress_port, ctx, eth_tx_data, local_rank); HLSLIB_FREERUNNING_FUNCTION(eth_endpoint_ingress_port, ctx, eth_rx_data); diff --git a/test/model/simulator/CMakeLists.txt b/test/model/simulator/CMakeLists.txt index 9f02a71a..4175d9ab 100644 --- a/test/model/simulator/CMakeLists.txt +++ b/test/model/simulator/CMakeLists.txt @@ -19,6 +19,7 @@ cmake_minimum_required(VERSION 3.9) project(accl_simulation) set(CMAKE_CXX_STANDARD 17) +set(ACCL_SIM_NUM_BANKS 2 CACHE STRING "Number of ACCL simulator memory banks (must correspond to simdll)") set(ACCL_SIM_MEM_SIZE_KB 262144 CACHE STRING "Size of ACCL simulator memory, in KB (must correspond to simdll)") # Consider using cmake_path from CMake 3.20 (2021) for better error messages: @@ -51,10 +52,14 @@ target_link_libraries(cclo_sim PUBLIC zmq zmqpp pthread jsoncpp_lib dl rt) target_compile_definitions(cclo_sim PUBLIC ZMQ_CALL_VERBOSE NUM_CTRL_STREAMS=3 ACCL_SIM_MEM_SIZE_KB=${ACCL_SIM_MEM_SIZE_KB}) target_compile_options(cclo_sim PRIVATE -Wno-attributes -fdiagnostics-color=always -g -fmax-errors=3) -set(XSIM_COMPILE_FOLDER ${ACCL_REPO_ROOT}/kernels/cclo/ccl_offload_ex/ccl_offload_ex.sim/sim_1/behav/xsim/) +set(STACK_TYPE "TCP" CACHE STRING "Type of POE") +set_property(CACHE STACK_TYPE PROPERTY STRINGS "TCP" "UDP" "RDMA") + +set(BUILD_FOLDER ${STACK_TYPE}_sim) +set(XSIM_COMPILE_FOLDER ${ACCL_REPO_ROOT}/kernels/cclo/${BUILD_FOLDER}/ccl_offload_ex/ccl_offload_ex.sim/sim_1/behav/xsim/) if (NOT EXISTS ${XSIM_COMPILE_FOLDER}) - message(FATAL_ERROR "Simulation kernel not build. Run make simdll in ${ACCL_REPO_ROOT}/kernels/cclo") + message(FATAL_ERROR "Simulation kernel not built. 
Run make STACK_TYPE=${STACK_TYPE} MODE=simdll in ${ACCL_REPO_ROOT}/kernels/cclo") endif () set(SYMLINKS xsim.dir diff --git a/test/model/zmq/zmq_client.cpp b/test/model/zmq/zmq_client.cpp index de6df440..ec0a1c81 100644 --- a/test/model/zmq/zmq_client.cpp +++ b/test/model/zmq/zmq_client.cpp @@ -19,6 +19,7 @@ #include #include #include +#include using namespace std; @@ -28,7 +29,7 @@ zmq_intf_context zmq_client_intf(unsigned int starting_port, unsigned int local_ zmq_intf_context ctx; const string endpoint_base = "tcp://127.0.0.1:"; - ctx.cmd_socket = std::make_unique(ctx.context, zmqpp::socket_type::request); + ctx.cmd_socket = std::make_unique(ctx.context, zmq::socket_type::req); string cmd_endpoint = endpoint_base + to_string(starting_port + local_rank); cout << "Endpoint: " << cmd_endpoint << endl; @@ -42,8 +43,8 @@ zmq_intf_context zmq_client_intf(unsigned int starting_port, unsigned int local_ cout << "ZMQ Client Command Context established for rank " << local_rank << endl; if(krnl_dest.size() > 0){ - ctx.krnl_tx_socket = std::make_unique(ctx.context, zmqpp::socket_type::sub); - ctx.krnl_rx_socket = std::make_unique(ctx.context, zmqpp::socket_type::pub); + ctx.krnl_tx_socket = std::make_unique(ctx.context, zmq::socket_type::sub); + ctx.krnl_rx_socket = std::make_unique(ctx.context, zmq::socket_type::pub); //bind to tx socket string krnl_endpoint = endpoint_base + to_string(starting_port+2*world_size+local_rank); @@ -54,7 +55,8 @@ zmq_intf_context zmq_client_intf(unsigned int starting_port, unsigned int local_ for(int i=0; i<(int)krnl_dest.size(); i++){ string krnl_subscribe = to_string(krnl_dest.at(i)); cout << "Rank " << local_rank << " subscribing to " << krnl_subscribe << " (KRNL)" << endl; - ctx.krnl_tx_socket->subscribe(krnl_subscribe); + //TODO use non-deprecated set call: ctx.krnl_tx_socket->set(zmq::sockopt::subscribe, krnl_subscribe); + ctx.krnl_tx_socket->setsockopt(ZMQ_SUBSCRIBE, krnl_subscribe.c_str(), krnl_subscribe.length()); 
this_thread::sleep_for(chrono::milliseconds(1000)); } //connect to rx socket @@ -91,13 +93,11 @@ void zmq_client_startcall(zmq_intf_context *ctx, unsigned int scenario, unsigned request_json["addr_2"] = (Json::Value::UInt64)addr_2; //send the message out to the CCLO simulator/emulator - zmqpp::message msg; - to_message(request_json, msg); cmd_socket_mutex.lock(); - ctx->cmd_socket->send(msg); + ctx->cmd_socket->send(to_message(request_json), zmq::send_flags::none); //receive confirmation - zmqpp::message reply; - ctx->cmd_socket->receive(reply); + zmq::message_t reply; + ctx->cmd_socket->recv(reply); cmd_socket_mutex.unlock(); Json::Value status = to_json(reply); if (status["status"].asInt() < 0) { @@ -111,13 +111,11 @@ void zmq_client_retcall(zmq_intf_context *ctx, unsigned int ctrl_id){ Json::Value request_json; request_json["type"] = 6; request_json["ctrl_id"] = ctrl_id; - zmqpp::message msg; - to_message(request_json, msg); cmd_socket_mutex.lock(); - ctx->cmd_socket->send(msg); + ctx->cmd_socket->send(to_message(request_json), zmq::send_flags::none); //check the reply - zmqpp::message reply; - ctx->cmd_socket->receive(reply); + zmq::message_t reply; + ctx->cmd_socket->recv(reply); cmd_socket_mutex.unlock(); Json::Value status = to_json(reply); if (status["status"].asInt() < 0) { @@ -133,12 +131,10 @@ unsigned int zmq_client_cfgread(zmq_intf_context *ctx, unsigned int offset){ request_json["type"] = 0; request_json["addr"] = (Json::Value::UInt)offset; - zmqpp::message request; - to_message(request_json, request); - ctx->cmd_socket->send(request); + ctx->cmd_socket->send(to_message(request_json), zmq::send_flags::none); - zmqpp::message reply; - ctx->cmd_socket->receive(reply); + zmq::message_t reply; + ctx->cmd_socket->recv(reply); Json::Value reply_json = to_json(reply); if (reply_json["status"] != 0) { throw std::runtime_error("ZMQ config read error (" + std::to_string(reply_json["status"].asUInt()) + ")"); @@ -151,29 +147,26 @@ void 
zmq_client_cfgwrite(zmq_intf_context *ctx, unsigned int offset, unsigned in request_json["type"] = 1; request_json["addr"] = (Json::Value::UInt)offset; request_json["wdata"] = (Json::Value::UInt)val; - zmqpp::message request; - to_message(request_json, request); - ctx->cmd_socket->send(request); + ctx->cmd_socket->send(to_message(request_json), zmq::send_flags::none); - zmqpp::message reply; - ctx->cmd_socket->receive(reply); + zmq::message_t reply; + ctx->cmd_socket->recv(reply); Json::Value reply_json = to_json(reply); if (reply_json["status"] != 0) { throw std::runtime_error("ZMQ config write error (" + std::to_string(reply_json["status"].asUInt()) + ")"); } } -void zmq_client_memread(zmq_intf_context *ctx, uint64_t adr, unsigned int size, uint8_t *data){ +void zmq_client_memread(zmq_intf_context *ctx, uint64_t adr, unsigned int size, uint8_t *data, bool host){ Json::Value request_json; request_json["type"] = 2; request_json["addr"] = (Json::Value::UInt64)adr; request_json["len"] = (Json::Value::UInt64)size; - zmqpp::message request; - to_message(request_json, request); - ctx->cmd_socket->send(request); + request_json["host"] = host; + ctx->cmd_socket->send(to_message(request_json), zmq::send_flags::none); - zmqpp::message reply; - ctx->cmd_socket->receive(reply); + zmq::message_t reply; + ctx->cmd_socket->recv(reply); Json::Value reply_json = to_json(reply); if (reply_json["status"] != 0) { throw std::runtime_error("ZMQ mem read error (" + std::to_string(reply_json["status"].asUInt()) + ")"); @@ -185,10 +178,11 @@ void zmq_client_memread(zmq_intf_context *ctx, uint64_t adr, unsigned int size, } } -void zmq_client_memwrite(zmq_intf_context *ctx, uint64_t adr, unsigned int size, uint8_t *data){ +void zmq_client_memwrite(zmq_intf_context *ctx, uint64_t adr, unsigned int size, uint8_t *data, bool host){ Json::Value request_json; request_json["type"] = 3; request_json["addr"] = (Json::Value::UInt64)adr; + request_json["host"] = host; Json::Value array; for (size_t i 
= 0; i < size; ++i) { @@ -196,30 +190,27 @@ void zmq_client_memwrite(zmq_intf_context *ctx, uint64_t adr, unsigned int size, } request_json["wdata"] = array; - zmqpp::message request; - to_message(request_json, request); - ctx->cmd_socket->send(request); + ctx->cmd_socket->send(to_message(request_json), zmq::send_flags::none); - zmqpp::message reply; - ctx->cmd_socket->receive(reply); + zmq::message_t reply; + ctx->cmd_socket->recv(reply); Json::Value reply_json = to_json(reply); if (reply_json["status"] != 0) { throw std::runtime_error("ZMQ mem write error (" + std::to_string(reply_json["status"].asUInt()) + ")"); } } -void zmq_client_memalloc(zmq_intf_context *ctx, uint64_t adr, unsigned int size){ +void zmq_client_memalloc(zmq_intf_context *ctx, uint64_t adr, unsigned int size, bool host){ Json::Value request_json; request_json["type"] = 4; request_json["addr"] = (Json::Value::UInt64)adr; request_json["len"] = (Json::Value::UInt64)size; + request_json["host"] = host; - zmqpp::message request; - to_message(request_json, request); - ctx->cmd_socket->send(request); + ctx->cmd_socket->send(to_message(request_json), zmq::send_flags::none); - zmqpp::message reply; - ctx->cmd_socket->receive(reply); + zmq::message_t reply; + ctx->cmd_socket->recv(reply); Json::Value reply_json = to_json(reply); if (reply_json["status"] != 0) { throw std::runtime_error("ZMQ mem alloc error (" + std::to_string(reply_json["status"].asUInt()) + ")"); @@ -227,16 +218,17 @@ void zmq_client_memalloc(zmq_intf_context *ctx, uint64_t adr, unsigned int size) } std::vector zmq_client_strmread(zmq_intf_context *ctx, bool dont_block){ - zmqpp::message msg; + zmq::message_t msg; Json::Reader reader; Json::Value msg_json; std::string msg_text, dst_text; - if(!ctx->krnl_tx_socket->receive(msg, dont_block)) return std::vector(); + if(!(ctx->krnl_tx_socket->recv(msg, zmq::recv_flags::dontwait)).has_value()) return std::vector(); // decompose the message - msg >> dst_text; - msg >> msg_text; + dst_text = 
msg.to_string(); + ctx->krnl_tx_socket->recv(msg); + msg_text = msg.to_string(); reader.parse(msg_text, msg_json); std::vector ret; @@ -249,14 +241,13 @@ std::vector zmq_client_strmread(zmq_intf_context *ctx, bool dont_block) void zmq_client_strmwrite(zmq_intf_context *ctx, std::vector val, unsigned int dest){ Json::Value data_packet; - zmqpp::message msg; + zmq::message_t msg; Json::StreamWriterBuilder builder; for (int i = 0; i < static_cast(val.size()); ++i) { data_packet["data"][i] = val.at(i); } //first part of the message is the destination, used to filter at receiver - msg << to_string(dest); + ctx->krnl_rx_socket->send(zmq::message_t(to_string(dest)), zmq::send_flags::sndmore); //package the data - msg << Json::writeString(builder, data_packet); - ctx->krnl_rx_socket->send(msg); + ctx->krnl_rx_socket->send(zmq::message_t(Json::writeString(builder, data_packet)), zmq::send_flags::none); } diff --git a/test/model/zmq/zmq_client.h b/test/model/zmq/zmq_client.h index 7a4d4064..1ce5e304 100644 --- a/test/model/zmq/zmq_client.h +++ b/test/model/zmq/zmq_client.h @@ -74,8 +74,9 @@ void zmq_client_cfgwrite(zmq_intf_context *ctx, unsigned int offset, unsigned in * @param adr Address in emulated device memory * @param size Number of bytes to read * @param data Pointer to data + * @param host Flag indicating if targeting a "host" buffer */ -void zmq_client_memread(zmq_intf_context *ctx, uint64_t adr, unsigned int size, uint8_t *data); +void zmq_client_memread(zmq_intf_context *ctx, uint64_t adr, unsigned int size, uint8_t *data, bool host=false); /** * @brief Write to emulated device memory @@ -84,8 +85,9 @@ void zmq_client_memread(zmq_intf_context *ctx, uint64_t adr, unsigned int size, * @param adr Address in emulated device memory * @param size Number of bytes to read * @param data Pointer to data + * @param host Flag indicating if targeting a "host" buffer */ -void zmq_client_memwrite(zmq_intf_context *ctx, uint64_t adr, unsigned int size, uint8_t *data); +void 
zmq_client_memwrite(zmq_intf_context *ctx, uint64_t adr, unsigned int size, uint8_t *data, bool host=false); /** * @brief Allocate memory on emulated device @@ -93,8 +95,9 @@ void zmq_client_memwrite(zmq_intf_context *ctx, uint64_t adr, unsigned int size, * @param ctx Pointer to existing ZMQ context * @param adr Address in emulated device memory * @param size Number of bytes to allocate + * @param host Flag indicating if targeting a "host" buffer */ -void zmq_client_memalloc(zmq_intf_context *ctx, uint64_t adr, unsigned int size); +void zmq_client_memalloc(zmq_intf_context *ctx, uint64_t adr, unsigned int size, bool host=false); /** * @brief Read from CCLO output data stream diff --git a/test/model/zmq/zmq_common.cpp b/test/model/zmq/zmq_common.cpp index 29f1704b..c3490e6b 100644 --- a/test/model/zmq/zmq_common.cpp +++ b/test/model/zmq/zmq_common.cpp @@ -18,18 +18,16 @@ #include "zmq_common.h" #include -Json::Value to_json(zmqpp::message &message) { - std::string message_txt; - message >> message_txt; +Json::Value to_json(zmq::message_t &message) { Json::Reader reader; Json::Value json; - reader.parse(message_txt, json); + reader.parse(message.to_string(), json); return json; } -void to_message(Json::Value &request_json, zmqpp::message &request){ +zmq::message_t to_message(Json::Value &request_json){ Json::StreamWriterBuilder builder; builder["indentation"] = ""; // minimize output const std::string message = Json::writeString(builder, request_json); - request << message; + return zmq::message_t(message); } diff --git a/test/model/zmq/zmq_common.h b/test/model/zmq/zmq_common.h index 83cd1949..12f473b0 100644 --- a/test/model/zmq/zmq_common.h +++ b/test/model/zmq/zmq_common.h @@ -16,7 +16,7 @@ # *******************************************************************************/ #pragma once #include -#include +#include #include /** @@ -24,12 +24,12 @@ * */ struct zmq_intf_context{ - zmqpp::context context; - std::unique_ptr cmd_socket; - std::unique_ptr eth_tx_socket; 
- std::unique_ptr eth_rx_socket; - std::unique_ptr krnl_tx_socket; - std::unique_ptr krnl_rx_socket; + zmq::context_t context; + std::unique_ptr cmd_socket; + std::unique_ptr eth_tx_socket; + std::unique_ptr eth_rx_socket; + std::unique_ptr krnl_tx_socket; + std::unique_ptr krnl_rx_socket; bool stop = false; zmq_intf_context() : context() {} }; @@ -40,12 +40,12 @@ struct zmq_intf_context{ * @param message Reference to the ZMQ message, as received from the socket * @return Json::Value The JSON equivalent */ -Json::Value to_json(zmqpp::message &message); +Json::Value to_json(zmq::message_t &message); /** * @brief Convert a JSON to a ZMQ message * * @param request_json The JSON input - * @param request Reference to the ZMQ message, ready for sending + * @return zmq::message_t The ZMQ message */ -void to_message(Json::Value &request_json, zmqpp::message &request); +zmq::message_t to_message(Json::Value &request_json); diff --git a/test/model/zmq/zmq_server.cpp b/test/model/zmq/zmq_server.cpp index 35677b60..b93abfd7 100644 --- a/test/model/zmq/zmq_server.cpp +++ b/test/model/zmq/zmq_server.cpp @@ -33,11 +33,11 @@ zmq_intf_context zmq_server_intf(unsigned int starting_port, unsigned int local_ zmq_intf_context ctx; logger = &log; - ctx.cmd_socket = std::make_unique(ctx.context, zmqpp::socket_type::reply); - ctx.eth_tx_socket = std::make_unique(ctx.context, zmqpp::socket_type::pub); - ctx.eth_rx_socket = std::make_unique(ctx.context, zmqpp::socket_type::sub); - ctx.krnl_tx_socket = std::make_unique(ctx.context, zmqpp::socket_type::pub); - ctx.krnl_rx_socket = std::make_unique(ctx.context, zmqpp::socket_type::sub); + ctx.cmd_socket = std::make_unique(ctx.context, zmq::socket_type::rep); + ctx.eth_tx_socket = std::make_unique(ctx.context, zmq::socket_type::pub); + ctx.eth_rx_socket = std::make_unique(ctx.context, zmq::socket_type::sub); + ctx.krnl_tx_socket = std::make_unique(ctx.context, zmq::socket_type::pub); + ctx.krnl_rx_socket = std::make_unique(ctx.context, 
zmq::socket_type::sub); const string endpoint_base = "tcp://127.0.0.1:"; @@ -57,6 +57,12 @@ zmq_intf_context zmq_server_intf(unsigned int starting_port, unsigned int local_ this_thread::sleep_for(chrono::milliseconds(1000)); + *logger << log_level::verbose << "Rank " << local_rank << " subscribing to " << local_rank << " (ETH)" << endl; + // Create a padded version of the rank to prevent subscription to + // ranks that have the same starting digits + std::stringstream rank_pad; + rank_pad << std::setw(DEST_PADDING) << std::setfill('0') << local_rank; + // connect to the sockets for(int i=0; isubscribe(rank_pad.str()); + //TODO use non-deprecated set call: ctx.eth_rx_socket->set(zmq::sockopt::subscribe, rank_pad.str()); + ctx.eth_rx_socket->setsockopt(ZMQ_SUBSCRIBE, rank_pad.str().c_str(), rank_pad.str().length()); this_thread::sleep_for(chrono::milliseconds(1000)); @@ -85,14 +87,18 @@ zmq_intf_context zmq_server_intf(unsigned int starting_port, unsigned int local_ if(!kernel_loopback){ krnl_endpoint = endpoint_base + to_string(starting_port+3*world_size+local_rank); } - *logger << log_level::verbose << "Rank " << local_rank << " connecting to " << krnl_endpoint << " (KRNL)" << endl; - ctx.krnl_rx_socket->connect(krnl_endpoint); - this_thread::sleep_for(chrono::milliseconds(1000)); + //subscribing to all (for now) *logger << log_level::verbose << "Rank " << local_rank << " subscribing to all (KRNL)" << endl; - ctx.krnl_rx_socket->subscribe(""); + + *logger << log_level::verbose << "Rank " << local_rank << " connecting to " << krnl_endpoint << " (KRNL)" << endl; + ctx.krnl_rx_socket->connect(krnl_endpoint); this_thread::sleep_for(chrono::milliseconds(1000)); + //TODO use non-deprecated call: ctx.krnl_rx_socket->set(zmq::sockopt::subscribe, ""); + ctx.krnl_rx_socket->setsockopt(ZMQ_SUBSCRIBE, "", 0); + this_thread::sleep_for(chrono::milliseconds(1000)); + *logger << log_level::info << "ZMQ Context established for rank " << local_rank << endl; return ctx; @@ -100,7 
+106,6 @@ zmq_intf_context zmq_server_intf(unsigned int starting_port, unsigned int local_ void eth_endpoint_egress_port(zmq_intf_context *ctx, Stream &in, unsigned int local_rank){ - zmqpp::message message; Json::Value packet; Json::StreamWriterBuilder builder; @@ -125,15 +130,14 @@ void eth_endpoint_egress_port(zmq_intf_context *ctx, Stream &in, u std::stringstream dest_pad; dest_pad << std::setw(DEST_PADDING) << std::setfill('0') << dest; //first part of the message is the destination port ID - message << dest_pad.str(); + ctx->eth_tx_socket->send(zmq::message_t(dest_pad.str()), zmq::send_flags::sndmore); //second part of the message is the local rank of the sender - message << to_string(local_rank); + ctx->eth_tx_socket->send(zmq::message_t(to_string(local_rank)), zmq::send_flags::sndmore); //finally package the data string str = Json::writeString(builder, packet); - message << str; *logger << log_level::verbose << "ETH Send " << idx << " bytes to " << dest << endl; *logger << log_level::debug << str << endl; - ctx->eth_tx_socket->send(message); + ctx->eth_tx_socket->send(zmq::message_t(str), zmq::send_flags::none); //add some spacing to encourage realistic //interleaving between messsages in fabric this_thread::sleep_for(chrono::milliseconds(10)); @@ -144,16 +148,18 @@ void eth_endpoint_ingress_port(zmq_intf_context *ctx, Stream &out) Json::Reader reader; // receive the message - zmqpp::message message; - if(!ctx->eth_rx_socket->receive(message, true)) return; + zmq::message_t message; + if(!(ctx->eth_rx_socket->recv(message, zmq::recv_flags::dontwait)).has_value()) return; // decompose the message - string msg_text, dst_text, src_text, sender_rank_text; + string msg_text, dst_text, sender_rank_text; //get and check destination ID - message >> dst_text; - message >> sender_rank_text; - message >> msg_text; + dst_text = message.to_string(); + ctx->eth_rx_socket->recv(message); + sender_rank_text = message.to_string(); + ctx->eth_rx_socket->recv(message); + 
msg_text = message.to_string(); //parse msg_text as json Json::Value packet, data; @@ -185,7 +191,6 @@ void eth_endpoint_ingress_port(zmq_intf_context *ctx, Stream &out) void krnl_endpoint_egress_port(zmq_intf_context *ctx, Stream &in){ - zmqpp::message message; Json::Value packet; Json::StreamWriterBuilder builder; @@ -208,15 +213,14 @@ void krnl_endpoint_egress_port(zmq_intf_context *ctx, Stream &in){ //first part of the message is the destination port ID dest = tmp.dest; - message << to_string(dest); + ctx->krnl_tx_socket->send(zmq::message_t(to_string(dest)), zmq::send_flags::sndmore); //finally package the data string str = Json::writeString(builder, packet); - message << str; *logger << log_level::verbose << "CCLO to user kernel: push " << idx << " bytes to dest = " << dest << endl; *logger << log_level::debug << str << endl; if(!ctx->stop){ - ctx->krnl_tx_socket->send(message); + ctx->krnl_tx_socket->send(zmq::message_t(str), zmq::send_flags::none); } } @@ -226,15 +230,16 @@ void krnl_endpoint_ingress_port(zmq_intf_context *ctx, Stream &out Json::Reader reader; // receive the message - zmqpp::message message; - if(!ctx->krnl_rx_socket->receive(message, true)) return; + zmq::message_t message; + if(!(ctx->krnl_rx_socket->recv(message, zmq::recv_flags::dontwait)).has_value()) return; // decompose the message string msg_text, dst_text; //get and check destination ID - message >> dst_text; - message >> msg_text; + dst_text = message.to_string(); + ctx->krnl_rx_socket->recv(message); + msg_text = message.to_string(); //parse msg_text as json Json::Value packet, data; @@ -265,18 +270,17 @@ void krnl_endpoint_ingress_port(zmq_intf_context *ctx, Stream &out } -void serve_zmq(zmq_intf_context *ctx, uint32_t *cfgmem, vector &devicemem, Stream cmd[NUM_CTRL_STREAMS], Stream sts[NUM_CTRL_STREAMS]){ +void serve_zmq(zmq_intf_context *ctx, uint32_t *cfgmem, vector &devicemem, vector &hostmem, Stream cmd[NUM_CTRL_STREAMS], Stream sts[NUM_CTRL_STREAMS]){ Json::Reader reader; 
Json::StreamWriterBuilder builder; // receive the message - zmqpp::message message; - if(!ctx->cmd_socket->receive(message, true)) return; + zmq::message_t message; + if(!(ctx->cmd_socket->recv(message, zmq::recv_flags::dontwait)).has_value()) return; // decompose the message - string msg_text; - message >> msg_text; //message now is in a string + string msg_text = message.to_string(); //message now is in a string *logger << log_level::debug << "Received: " << msg_text << endl; @@ -288,6 +292,7 @@ void serve_zmq(zmq_intf_context *ctx, uint32_t *cfgmem, vector &devicemem, Json::Value response; response["status"] = 0; int adr, val, len; + bool host; uint64_t dma_addr; Json::Value dma_wdata; unsigned int ctrl_id; @@ -317,44 +322,76 @@ void serve_zmq(zmq_intf_context *ctx, uint32_t *cfgmem, vector &devicemem, cfgmem[adr/4] = request["wdata"].asUInt(); } break; - // Devicemem read request {"type": 2, "addr": , "len": } + // Devicemem read request {"type": 2, "addr": , "len": , "host": } // Devicemem read response {"status": OK|ERR, "rdata": } case 2: adr = request["addr"].asUInt(); len = request["len"].asUInt(); - *logger << log_level::debug << "Mem read " << adr << " len: " << len << endl; - if((adr+len) > devicemem.size()){ - response["status"] = 1; - response["rdata"][0] = 0; + host = request["host"].asUInt(); + *logger << log_level::debug << (host ? 
"Host " : "Device ") << " mem read " << adr << " len: " << len << endl; + if(host){ + if((adr+len) > hostmem.size()){ + response["status"] = 1; + response["rdata"][0] = 0; + *logger << log_level::error << "Host mem read outside allocated range ("<< hostmem.size()/1024 << "KB) at addr: " << adr << " len: " << len << endl; + } else { + for (int i=0; i devicemem.size()){ + response["status"] = 1; + response["rdata"][0] = 0; + *logger << log_level::error << "Device mem read outside allocated range ("<< devicemem.size()/1024 << "KB) at addr: " << adr << " len: " << len << endl; + } else { + for (int i=0; i, "wdata": } + // Devicemem write request {"type": 3, "addr": , "wdata": , "host": } // Devicemem write response {"status": OK|ERR} case 3: adr = request["addr"].asUInt(); dma_wdata = request["wdata"]; len = dma_wdata.size(); - *logger << log_level::debug << "Mem write " << adr << " len: " << len << endl; - if((adr+len) > devicemem.size()){ - devicemem.resize(adr+len); - } - for(int i=0; i hostmem.size()){ + hostmem.resize(adr+len); + } + for(int i=0; i devicemem.size()){ + devicemem.resize(adr+len); + } + for(int i=0; i, "len": } + // Devicemem allocate request {"type": 4, "addr": , "len": , "host": } // Devicemem allocate response {"status": OK|ERR} case 4: adr = request["addr"].asUInt(); len = request["len"].asUInt(); - *logger << log_level::debug << "Mem allocate " << adr << " len: " << len << endl; - if((adr+len) > devicemem.size()){ - devicemem.resize(adr+len); + host = request["host"].asUInt(); + *logger << log_level::debug << (host ? 
"Host " : "Device ") << " mem allocate " << adr << " len: " << len << endl; + if(host){ + if((adr+len) > hostmem.size()){ + hostmem.resize(adr+len); + } + } else { + if((adr+len) > devicemem.size()){ + devicemem.resize(adr+len); + } } break; // Call request {"type": 5, arg names and values} @@ -407,7 +444,8 @@ void serve_zmq(zmq_intf_context *ctx, uint32_t *cfgmem, vector &devicemem, } //return message to client string str = Json::writeString(builder, response); - ctx->cmd_socket->send(str); + zmq::message_t ret_msg(str); + ctx->cmd_socket->send(ret_msg, zmq::send_flags::none); } @@ -422,12 +460,11 @@ void serve_zmq(zmq_intf_context *ctx, Json::StreamWriterBuilder builder; // receive the message - zmqpp::message message; - if(!ctx->cmd_socket->receive(message, true)) return; + zmq::message_t message; + if(!(ctx->cmd_socket->recv(message, zmq::recv_flags::dontwait)).has_value()) return; // decompose the message - string msg_text; - message >> msg_text;//message now is in a string + string msg_text = message.to_string();//message now is in a string *logger << log_level::debug << "Received: " << msg_text << endl; @@ -439,6 +476,7 @@ void serve_zmq(zmq_intf_context *ctx, Json::Value response; response["status"] = 0; int adr, val, len; + bool host; uint64_t dma_addr; Json::Value dma_wdata; ap_uint<64> mem_addr; @@ -487,17 +525,19 @@ void serve_zmq(zmq_intf_context *ctx, } } break; - // Devicemem read request {"type": 2, "addr": , "len": } + // Devicemem read request {"type": 2, "addr": , "len": , "host": } // Devicemem read response {"status": OK|ERR, "rdata": } case 2: adr = request["addr"].asUInt(); len = request["len"].asUInt(); + host = request["host"].asUInt(); *logger << log_level::debug << "Mem read " << adr << " len: " << len << endl; - if((adr+len) > ACCL_SIM_MEM_SIZE_KB*1024){ + if((adr+len) > (host ? 
1 : ACCL_SIM_NUM_BANKS)*ACCL_SIM_MEM_SIZE_KB*1024){ response["status"] = 1; response["rdata"][0] = 0; *logger << log_level::error << "Mem read outside available range ("<< ACCL_SIM_MEM_SIZE_KB << "KB) at addr: " << adr << " len: " << len << endl; } else { + adr += host ? ACCL_SIM_NUM_BANKS*ACCL_SIM_MEM_SIZE_KB*1024 : 0; aximm_rd_addr.Push(adr); aximm_rd_len.Push(len); unsigned int idx = 0; @@ -513,17 +553,19 @@ void serve_zmq(zmq_intf_context *ctx, } } break; - // Devicemem write request {"type": 3, "addr": , "wdata": } + // Devicemem write request {"type": 3, "addr": , "wdata": , "host": } // Devicemem write response {"status": OK|ERR} case 3: adr = request["addr"].asUInt(); dma_wdata = request["wdata"]; len = dma_wdata.size(); + host = request["host"].asUInt(); *logger << log_level::debug << "Mem write " << adr << " len: " << len << endl; - if((adr+len) > ACCL_SIM_MEM_SIZE_KB*1024){ + if((adr+len) > (host ? 1 : ACCL_SIM_NUM_BANKS)*ACCL_SIM_MEM_SIZE_KB*1024){ response["status"] = 1; *logger << log_level::error << "Mem write outside available range ("<< ACCL_SIM_MEM_SIZE_KB << "KB) at addr: " << adr << " len: " << len << endl; } else{ + adr += host ? ACCL_SIM_NUM_BANKS*ACCL_SIM_MEM_SIZE_KB*1024 : 0; aximm_wr_addr.Push(adr); aximm_wr_len.Push(len); for(int i=0; i, "len": } + // Devicemem allocate request {"type": 4, "addr": , "len": , "host": } // Devicemem allocate response {"status": OK|ERR} case 4: adr = request["addr"].asUInt(); len = request["len"].asUInt(); - *logger << log_level::debug << "Mem allocate " << adr << " len: " << len << endl; - if((adr+len) > ACCL_SIM_MEM_SIZE_KB*1024){ + host = request["host"].asUInt(); + *logger << log_level::debug << (host ? "Host " : "Device ") << " mem allocate " << adr << " len: " << len << endl; + if((adr+len) > (host ? 
1 : ACCL_SIM_NUM_BANKS)*ACCL_SIM_MEM_SIZE_KB*1024){ response["status"] = 1; *logger << log_level::error << "Mem allocate outside available range ("<< ACCL_SIM_MEM_SIZE_KB << "KB) at addr: " << adr << " len: " << len << endl; } @@ -673,7 +716,8 @@ void serve_zmq(zmq_intf_context *ctx, } //return message to client string str = Json::writeString(builder, response); - ctx->cmd_socket->send(str); + zmq::message_t ret_msg(str); + ctx->cmd_socket->send(ret_msg, zmq::send_flags::none); } void zmq_cmd_server(zmq_intf_context *ctx, diff --git a/test/model/zmq/zmq_server.h b/test/model/zmq/zmq_server.h index 0cb02940..5abc292d 100644 --- a/test/model/zmq/zmq_server.h +++ b/test/model/zmq/zmq_server.h @@ -28,6 +28,10 @@ #define NUM_CTRL_STREAMS 1 #endif +#ifndef ACCL_SIM_NUM_BANKS +#define ACCL_SIM_NUM_BANKS 1 +#endif + #ifndef ACCL_SIM_MEM_SIZE_KB #define ACCL_SIM_MEM_SIZE_KB 256 #endif @@ -58,10 +62,11 @@ zmq_intf_context zmq_server_intf(unsigned int starting_port, unsigned int local_ * @param ctx Pointer to existing ZMQ context * @param cfgmem Pointer to emulated configuration memory * @param devicemem Pointer to emulated device memory + * @param hostmem Pointer to emulated host memory * @param cmd Command stream going to emulated CCLO * @param sts Status stream coming from emulated CCLO */ -void serve_zmq(zmq_intf_context *ctx, uint32_t *cfgmem, std::vector &devicemem, hlslib::Stream> cmd[NUM_CTRL_STREAMS], hlslib::Stream> sts[NUM_CTRL_STREAMS]); +void serve_zmq(zmq_intf_context *ctx, uint32_t *cfgmem, std::vector &devicemem, std::vector &hostmem, hlslib::Stream> cmd[NUM_CTRL_STREAMS], hlslib::Stream> sts[NUM_CTRL_STREAMS]); /** * @brief Serve an input Ethernet port diff --git a/test/refdesigns/Makefile b/test/refdesigns/Makefile index 0760089f..a5ff4ed6 100644 --- a/test/refdesigns/Makefile +++ b/test/refdesigns/Makefile @@ -23,6 +23,25 @@ PLATFORM ?= xilinx_u55c_gen3x16_xdma_3_202210_1 # U280: xilinx_u280_gen3x16_xdma_1_202211_1 # U250: 
xilinx_u250_gen3x16_xdma_4_1_202210_1 +ifeq (u250,$(findstring u250, $(PLATFORM))) + FPGAPART=xcu250-figd2104-2L-e + BOARD=u250 +else ifeq (u280,$(findstring u280, $(PLATFORM))) + FPGAPART=xcu280-fsvh2892-2L-e + BOARD=u280 +else ifeq (u55c,$(findstring u55c, $(PLATFORM))) + FPGAPART=xcu55c-fsvh2892-2L-e + BOARD=u55c +else ifeq (u200,$(findstring u200, $(PLATFORM))) + FPGAPART=xcu200-fsgd2104-2-e + BOARD=u200 +else ifeq (u50,$(findstring u50, $(PLATFORM))) + FPGAPART=xcu50-fsvh2104-2-e + BOARD=u50 +else + $(error Unsupported PLATFORM) +endif + XSA := $(strip $(patsubst %.xpfm, % , $(shell basename $(PLATFORM)))) DEBUG ?= none PROFILE ?= none @@ -31,7 +50,6 @@ USE_HOSTMEM ?= 0 VPP_TARGET ?= hw USER_KERNEL ?= none FREQUENCY = 250 -CCLO_XO = ../../kernels/cclo/ccl_offload.xo CCLO_MB_DEBUG_LEVEL ?= 0 VNX=xup_vitis_network_example @@ -45,22 +63,24 @@ FW_SOURCES = $(shell find fw -name '*.c') $(shell find fw -name '*.h') $(shell f ETH_IF=0 CMAC_UDP_XO=$(VNX)/Ethernet/_x.$(PLATFORM)/cmac_$(ETH_IF).xo UDP_XO=$(VNX)/NetLayers/_x.$(PLATFORM)/networklayer.xo -TCP_DUMMY_XO=../../kernels/plugins/dummy_tcp_stack/dummy_tcp_stack.xo -TCP_XO=Vitis_with_100Gbps_TCP-IP/_x.hw.$(XSA)/network_krnl.xo +TCP_DUMMY_XO=../../kernels/plugins/dummy_tcp_stack/dummy_tcp_stack_$(FPGAPART).xo +TCP_XO=tcp_stack.$(XSA)/network_krnl.xo TCP_VIVADO_ROOTDIR ?= $$XILINX_VIVADO TCP_HLS_ROOTDIR ?= $$XILINX_HLS TCP_RX_BYPASS ?= 0 -CMAC_TCP_XO=Vitis_with_100Gbps_TCP-IP/_x.hw.$(XSA)/cmac_krnl.xo -EXTERNAL_DMA_XO=../../kernels/plugins/external_dma/external_dma.xo -CYT_BIT=Coyote/hw/build_$(CCLO_STACK_TYPE)/bitstreams/cyt_top.bit - -HOSTCTRL_XO=../../kernels/plugins/hostctrl/hostctrl.xo -CLIENT_ARB_XO=../../kernels/plugins/client_arbiter/client_arbiter.xo -SUM_XO=../../kernels/plugins/reduce_ops/reduce_ops.xo -COMPRESSION_XO=../../kernels/plugins/hp_compression/hp_compression.xo -LOOPBACK_XO=../../kernels/plugins/loopback/loopback.xo -TCP_SESS_XO=../../kernels/plugins/tcp_session_handler/tcp_session_handler.xo 
-VADD_XO=../../kernels/plugins/vadd_put/vadd_put.xo +CMAC_TCP_XO=tcp_stack.$(XSA)/cmac_krnl.xo + +HOSTCTRL_XO=../../kernels/plugins/hostctrl/hostctrl_$(FPGAPART).xo +CLIENT_ARB_XO=../../kernels/plugins/client_arbiter/client_arbiter_$(FPGAPART).xo +SUM_XO=../../kernels/plugins/reduce_ops/reduce_ops_$(FPGAPART).xo +COMPRESSION_XO=../../kernels/plugins/hp_compression/hp_compression_$(FPGAPART).xo +LOOPBACK_XO=../../kernels/plugins/loopback/loopback_$(FPGAPART).xo +TCP_SESS_XO=../../kernels/plugins/tcp_session_handler/tcp_session_handler_$(FPGAPART).xo +VADD_XO=../../kernels/plugins/vadd_put/vadd_put_$(FPGAPART).xo +CYT_DMA_ADAPTER_XO=../../kernels/plugins/cyt_adapter/cyt_dma_adapter_$(FPGAPART).xo +CYT_RDMA_ARBITER_XO=../../kernels/plugins/cyt_adapter/cyt_rdma_arbiter_$(FPGAPART).xo +CYT_RDMA_MUX_XO=../../kernels/plugins/cyt_adapter/cyt_rdma_mux_$(FPGAPART).xo +CYT_ADAPTER_XO = $(CYT_DMA_ADAPTER_XO) $(CYT_RDMA_ARBITER_XO) $(CYT_RDMA_MUX_XO) HWEMU_MST_XO=$$XILINX_VITIS/data/emulation/XO/sim_ipc_axis_master_512.xo HWEMU_SLV_XO=$$XILINX_VITIS/data/emulation/XO/sim_ipc_axis_slave_512.xo @@ -82,40 +102,25 @@ else $(error Unsupported PROFILE setting) endif -ifeq (u250,$(findstring u250, $(PLATFORM))) - FPGAPART=xcu250-figd2104-2L-e - BOARD=u250 -else ifeq (u280,$(findstring u280, $(PLATFORM))) - FPGAPART=xcu280-fsvh2892-2L-e - BOARD=u280 -else ifeq (u55c,$(findstring u55c, $(PLATFORM))) - FPGAPART=xcu55c-fsvh2892-2L-e - BOARD=u55c -else ifeq (u200,$(findstring u200, $(PLATFORM))) - FPGAPART=xcu200-fsgd2104-2-e - BOARD=u200 -else ifeq (u50,$(findstring u50, $(PLATFORM))) - FPGAPART=xcu50-fsvh2104-2-e - BOARD=u50 -else - $(error Unsupported PLATFORM) -endif - -BUILD_DIR := link_$(MODE)_eth_$(ETH_IF)_debug_$(DEBUG)_$(XSA) +BUILD_DIR := link_$(MODE)_$(XSA) XCLBIN=$(BUILD_DIR)/ccl_offload.xclbin + +CYT_BUILD_DIR=coyote_build_$(CCLO_STACK_TYPE)_$(BOARD) +CYT_BIT=$(CYT_BUILD_DIR)/bitstreams/cyt_top.bit + OUTPUT_PRODUCT := $(XCLBIN) -CONFIGFILE := link_config.ini +CONFIGFILE := 
$(BUILD_DIR).ini VPP_CONFIG := -t $(VPP_TARGET) --config $(CONFIGFILE) ADV_CMD := IPREPO_CMD := -OTHER_XO := $(CCLO_XO) $(HOSTCTRL_XO) $(CLIENT_ARB_XO) $(SUM_XO) $(COMPRESSION_XO) +OTHER_XO := $(HOSTCTRL_XO) $(SUM_XO) $(COMPRESSION_XO) GEN_CONFIG_FLAGS := ifeq (vadd, $(USER_KERNEL)) GEN_CONFIG_FLAGS += --vadd - OTHER_XO += $(VADD_XO) + OTHER_XO += $(VADD_XO) $(CLIENT_ARB_XO) else OTHER_XO += $(LOOPBACK_XO) endif @@ -128,15 +133,6 @@ ifeq (hw_emu, $(VPP_TARGET)) endif endif -CCLO_EN_DMA = 1 -CCLO_EN_EXT_DMA = 0 -ifeq (1, $(USE_HOSTMEM)) - GEN_CONFIG_FLAGS += --host - OTHER_XO += $(EXTERNAL_DMA_XO) - CCLO_EN_DMA = 0 - CCLO_EN_EXT_DMA = 1 -endif - ifeq (udp,$(MODE)) ifeq (u50,$(findstring u50, $(PLATFORM))) HLS_IP_FOLDER = $(shell readlink -f ./$(NETLAYERHLS)/synthesis_results_HBM) @@ -167,14 +163,13 @@ else ifeq (axis3x,$(MODE)) CCLO_STACK_TYPE = TCP CCLO_MB_DEBUG_LEVEL = 2 else ifeq (tcp, $(MODE)) - CMAC_TCP_XO=Vitis_with_100Gbps_TCP-IP/_x.hw.$(XSA)/cmac_krnl.xo NET_XO = $(TCP_XO) ifneq (hw_emu, $(VPP_TARGET)) NET_XO += $(CMAC_TCP_XO) - ADV_CMD += --advanced.param compiler.userPostSysLinkOverlayTcl=$(shell pwd)/Vitis_with_100Gbps_TCP-IP/scripts/post_sys_link.tcl + ADV_CMD += --advanced.param compiler.userPostSysLinkOverlayTcl=$(shell pwd)/tcp_stack.$(XSA)/scripts/post_sys_link.tcl endif OTHER_XO += $(LOOPBACK_XO) $(TCP_SESS_XO) - IPREPO_CMD += --user_ip_repo_paths Vitis_with_100Gbps_TCP-IP/build/fpga-network-stack/iprepo + IPREPO_CMD += --user_ip_repo_paths tcp_stack.$(XSA)/build/fpga-network-stack/iprepo CCLO_STACK_TYPE = TCP else ifeq (coyote_tcp, $(MODE)) N_DDR_CHAN = 0 @@ -184,10 +179,11 @@ else ifeq (coyote_tcp, $(MODE)) ifeq (u250,$(findstring u250, $(PLATFORM))) N_DDR_CHAN = 2 endif - OTHER_XO = coyote_shell + OTHER_XO += $(CYT_ADAPTER_XO) COYOTE_CONFIG = -DFDEV_NAME=$(BOARD) -DEN_MEM=1 -DEN_STRM=1 -DEN_BPSS=1 -DEN_TCP_0=1 -DN_STRM_AXI=2 -DN_CARD_AXI=2 -DEN_HLS=0 -DACLK_F=250 -DN_DDR_CHAN=$(N_DDR_CHAN) CCLO_STACK_TYPE = TCP OUTPUT_PRODUCT = $(CYT_BIT) + 
USE_HOSTMEM = 1 else ifeq (coyote_rdma, $(MODE)) N_DDR_CHAN = 0 ifeq (u200,$(findstring u200, $(PLATFORM))) @@ -196,14 +192,28 @@ else ifeq (coyote_rdma, $(MODE)) ifeq (u250,$(findstring u250, $(PLATFORM))) N_DDR_CHAN = 2 endif - OTHER_XO = coyote_shell + OTHER_XO += $(CYT_ADAPTER_XO) COYOTE_CONFIG = -DFDEV_NAME=$(BOARD) -DEN_MEM=1 -DEN_STRM=1 -DEN_BPSS=1 -DEN_RDMA_0=1 -DEN_RPC=1 -DN_STRM_AXI=3 -DN_CARD_AXI=3 -DEN_HLS=0 -DACLK_F=250 -DTLBL_A=12 -DN_DDR_CHAN=$(N_DDR_CHAN) CCLO_STACK_TYPE = RDMA OUTPUT_PRODUCT = $(CYT_BIT) + USE_HOSTMEM = 1 else $(error Unsupported MODE) endif +CCLO_XO = ../../kernels/cclo/$(CCLO_STACK_TYPE)_1111$(CCLO_MB_DEBUG_LEVEL)_$(FPGAPART)/ccl_offload.xo +OTHER_XO += $(CCLO_XO) + +ifeq (1, $(USE_HOSTMEM)) + GEN_CONFIG_FLAGS += --host + NUM_EXTDMA_AXI = 2 +else + NUM_EXTDMA_AXI = 1 +endif + +EXTERNAL_DMA_XO=../../kernels/plugins/external_dma/external_dma_$(NUM_EXTDMA_AXI)port.xo +OTHER_XO += $(EXTERNAL_DMA_XO) + ifneq (none, $(ETH_IF)) GEN_CONFIG_FLAGS += --ethif $(ETH_IF) endif @@ -213,53 +223,55 @@ all: $(OUTPUT_PRODUCT) .PHONY: vnx vnx: $(CMAC_UDP_XO) $(UDP_XO) -$(CMAC_UDP_XO) &: +$(VNX)/Ethernet/_x.%/cmac_$(ETH_IF).xo: git submodule update --init --recursive xup_vitis_network_example - $(MAKE) -C xup_vitis_network_example/Ethernet DEVICE=$(PLATFORM) INTERFACE=$(ETH_IF) all + $(MAKE) -C xup_vitis_network_example/Ethernet DEVICE=$* INTERFACE=$(ETH_IF) all -$(UDP_XO): +$(VNX)/NetLayers/_x.%/networklayer.xo: git submodule update --init --recursive xup_vitis_network_example - $(MAKE) -C xup_vitis_network_example/NetLayers DEVICE=$(PLATFORM) all - -$(CMAC_TCP_XO): - git submodule update --init --recursive Vitis_with_100Gbps_TCP-IP - $(MAKE) -C Vitis_with_100Gbps_TCP-IP/ cmac_krnl DEVICE=$(PLATFORM) TEMP_DIR=_x.hw.$(XSA)/ XSA=$(XSA) - -$(TCP_DUMMY_XO): - $(MAKE) -C ../../kernels/plugins/dummy_tcp_stack DEVICE=$(FPGAPART) all + $(MAKE) -C xup_vitis_network_example/NetLayers DEVICE=$* all .PHONY: coyote_shell -coyote_shell: 
Coyote/hw/build_$(CCLO_STACK_TYPE)/lynx/lynx.xpr +coyote_shell: $(CYT_BUILD_DIR)/lynx/lynx.xpr -Coyote/hw/build_$(CCLO_STACK_TYPE)/lynx/lynx.xpr: - cd Coyote/hw && mkdir build_$(CCLO_STACK_TYPE) && cd build_$(CCLO_STACK_TYPE) && cmake .. $(COYOTE_CONFIG) - $(MAKE) -C Coyote/hw/build_$(CCLO_STACK_TYPE)/ shell +$(CYT_BUILD_DIR)/lynx/lynx.xpr: + mkdir $(CYT_BUILD_DIR) && cd $(CYT_BUILD_DIR) && cmake ../Coyote/hw $(COYOTE_CONFIG) + $(MAKE) -C $(CYT_BUILD_DIR) shell $(CYT_BIT): coyote_shell - $(MAKE) -C ../../kernels/cclo PLATFORM=$(PLATFORM) STACK_TYPE=$(CCLO_STACK_TYPE) MB_DEBUG_LEVEL=$(CCLO_MB_DEBUG_LEVEL) EN_DMA=0 EN_EXT_DMA=1 - $(MAKE) -C ../../kernels/plugins PLATFORM=$(PLATFORM) DEBUG=$(DEBUG) STACK_TYPE=$(CCLO_STACK_TYPE) - vivado -mode tcl -source tcl/coyote.tcl -tclargs $(CCLO_STACK_TYPE) Coyote/hw/build_$(CCLO_STACK_TYPE) - cp hdl/$(MODE)_top.sv Coyote/hw/build_$(CCLO_STACK_TYPE)/lynx/hdl/config_0/user_logic_c0_0.sv - $(MAKE) -C Coyote/hw/build_$(CCLO_STACK_TYPE)/ compile + $(MAKE) -C ../../kernels/cclo PLATFORM=$(PLATFORM) STACK_TYPE=$(CCLO_STACK_TYPE) MB_DEBUG_LEVEL=$(CCLO_MB_DEBUG_LEVEL) + $(MAKE) -C ../../kernels/plugins DEVICE=$(FPGAPART) + cp -rf $(OTHER_XO) $(CYT_BUILD_DIR)/iprepo && cd $(CYT_BUILD_DIR)/iprepo && unzip -n '*.xo' + vivado -mode tcl -source tcl/coyote.tcl -tclargs $(CCLO_STACK_TYPE) $(CYT_BUILD_DIR) + cp hdl/$(MODE)_top.sv $(CYT_BUILD_DIR)/lynx/hdl/config_0/user_logic_c0_0.sv + $(MAKE) -C $(CYT_BUILD_DIR) compile .PHONY: tcp_stack tcp_stack: $(TCP_XO) $(CMAC_TCP_XO) -$(TCP_XO): +$(CMAC_TCP_XO): git submodule update --init --recursive Vitis_with_100Gbps_TCP-IP - mkdir -p Vitis_with_100Gbps_TCP-IP/build && \ - cd Vitis_with_100Gbps_TCP-IP/build && \ + mkdir -p tcp_stack.$(XSA) + cp -r Vitis_with_100Gbps_TCP-IP/* tcp_stack.$(XSA) + mkdir -p tcp_stack.$(XSA)/build && \ + cd tcp_stack.$(XSA)/build && \ cmake ../ -DFDEV_NAME=$(BOARD) -DVIVADO_HLS_ROOT_DIR=$(TCP_HLS_ROOTDIR) -DVIVADO_ROOT_DIR=$(TCP_VIVADO_ROOTDIR) -DTCP_STACK_EN=1 
-DTCP_STACK_RX_DDR_BYPASS_EN=$(TCP_RX_BYPASS) -DDTCP_STACK_WINDOW_SCALING_EN=0 - $(MAKE) -C Vitis_with_100Gbps_TCP-IP/build installip - $(MAKE) -C Vitis_with_100Gbps_TCP-IP/ network_krnl DEVICE=$(PLATFORM) TEMP_DIR=_x.hw.$(XSA)/ XSA=$(XSA) + $(MAKE) -C tcp_stack.$(XSA)/build installip + $(MAKE) -C tcp_stack.$(XSA)/ cmac_krnl DEVICE=$(PLATFORM) TEMP_DIR=. XSA=$(XSA) + +$(TCP_XO): $(CMAC_TCP_XO) + $(MAKE) -C tcp_stack.$(XSA)/ network_krnl DEVICE=$(PLATFORM) TEMP_DIR=. XSA=$(XSA) + +$(TCP_DUMMY_XO): + $(MAKE) -C ../../kernels/plugins/dummy_tcp_stack DEVICE=$(FPGAPART) all $(CCLO_XO): - $(MAKE) -C ../../kernels/cclo PLATFORM=$(PLATFORM) STACK_TYPE=$(CCLO_STACK_TYPE) MB_DEBUG_LEVEL=$(CCLO_MB_DEBUG_LEVEL) EN_EXT_DMA=$(CCLO_EN_EXT_DMA) EN_DMA=$(CCLO_EN_DMA) + $(MAKE) -C ../../kernels/cclo PLATFORM=$(PLATFORM) STACK_TYPE=$(CCLO_STACK_TYPE) MB_DEBUG_LEVEL=$(CCLO_MB_DEBUG_LEVEL) $(XCLBIN): $(NET_XO) $(CCLO_XO) python3 gen_config.py --board $(BOARD) --poe $(MODE) $(GEN_CONFIG_FLAGS) -o $(CONFIGFILE) - $(MAKE) -C ../../kernels/plugins PLATFORM=$(PLATFORM) DEBUG=$(DEBUG) STACK_TYPE=$(CCLO_STACK_TYPE) - v++ --link --platform $(PLATFORM) --kernel_frequency $(FREQUENCY) --save-temps --temp_dir $(BUILD_DIR) $(VPP_CONFIG) $(ADV_CMD) $(IPREPO_CMD) -o $@ $(NET_XO) $(OTHER_XO) + $(MAKE) -C ../../kernels/plugins DEVICE=$(FPGAPART) + v++ --link --platform $(PLATFORM) $(PROFILE_OPTS) --kernel_frequency $(FREQUENCY) --save-temps --temp_dir $(BUILD_DIR) $(VPP_CONFIG) $(ADV_CMD) $(IPREPO_CMD) -o $@ $(NET_XO) $(OTHER_XO) .PHONY: distclean distclean: diff --git a/test/refdesigns/build_all.sh b/test/refdesigns/build_all.sh new file mode 100755 index 00000000..23354c4b --- /dev/null +++ b/test/refdesigns/build_all.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# /******************************************************************************* +# Copyright (C) 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# *******************************************************************************/ + +MODES=( + axis3x + udp + tcp + coyote_tcp + coyote_rdma +) + +PLATFORMS=( + xilinx_u55c_gen3x16_xdma_3_202210_1 + xilinx_u50_gen3x16_xdma_5_202210_1 + xilinx_u200_gen3x16_xdma_2_202110_1 + xilinx_u280_gen3x16_xdma_1_202211_1 + xilinx_u250_gen3x16_xdma_4_1_202210_1 +) + +# build for any combination of mode and platform +for mode in ${MODES[@]}; do + for platform in ${PLATFORMS[@]}; do + make -j$(nproc) MODE=$mode PLATFORM=$platform > build_${mode}_${platform}.log + done +done diff --git a/test/refdesigns/gen_config.py b/test/refdesigns/gen_config.py index e3dfb2b0..9d536fb2 100755 --- a/test/refdesigns/gen_config.py +++ b/test/refdesigns/gen_config.py @@ -31,7 +31,9 @@ if args.board == "u50" and args.ethif != 0: raise "U50 has a single Ethernet port" +num_extdma_ports = 1 if args.host: + num_extdma_ports = 2 if args.board == "u280" or args.board == "u50": raise "Host memory only supported on U55C/U200/U250" @@ -68,24 +70,19 @@ cclo_instantiation = "nk=ccl_offload:{num_inst}:".format(num_inst=num_cclo) arb_instantiation = "nk=client_arbiter:{num_inst}:".format(num_inst=num_cclo) -hc_instantiation = "nk=hostctrl:{num_inst}:".format(num_inst=2*num_cclo) +hc_instantiation = "nk=hostctrl:{num_inst}:".format(num_inst=num_cclo) reduce_instantiation = "nk=reduce_ops:{num_inst}:".format(num_inst=num_cclo) cast_instantiation = "nk=hp_compression:{num_inst}:".format(num_inst=3*num_cclo) - -if args.host: - extdma_instantiation = 
"nk=external_dma:{num_inst}:".format(num_inst=2*num_cclo) -else: - extdma_instantiation = "" +extdma_instantiation = "nk=external_dma_{num_ports}port:{num_inst}:".format(num_inst=2*num_cclo, num_ports=num_extdma_ports) for i in range(num_cclo): endch = "" if i == num_cclo-1 else "." cclo_instantiation += "ccl_offload_{inst_nr}".format(inst_nr=i) + endch arb_instantiation += "arb_{inst_nr}".format(inst_nr=i) + endch - hc_instantiation += "hostctrl_{inst_nr}_0.hostctrl_{inst_nr}_1".format(inst_nr=i) + endch + hc_instantiation += "hostctrl_{inst_nr}_0".format(inst_nr=i) + endch reduce_instantiation += "arith_{inst_nr}".format(inst_nr=i) + endch cast_instantiation += "compression_{inst_nr}_0.compression_{inst_nr}_1.compression_{inst_nr}_2".format(inst_nr=i) + endch - if args.host: - extdma_instantiation += "extdma_{num_inst}_0.extdma_{num_inst}_1".format(num_inst=i) + endch + extdma_instantiation += "extdma_{num_inst}_0.extdma_{num_inst}_1".format(num_inst=i) + endch if args.axis3x: if args.vadd: @@ -123,11 +120,12 @@ for i in range(num_cclo): target_slr = min(i,num_slr-1) if args.axis3x else cclo_slr - slr_constraints += "slr=arb_{inst_nr}:SLR{slr_nr}\nslr=arith_{inst_nr}:SLR{slr_nr}\nslr=ccl_offload_{inst_nr}:SLR{slr_nr}\n".format(inst_nr=i, slr_nr=target_slr) + slr_constraints += "slr=arith_{inst_nr}:SLR{slr_nr}\nslr=ccl_offload_{inst_nr}:SLR{slr_nr}\n".format(inst_nr=i, slr_nr=target_slr) + if args.vadd: + slr_constraints += "slr=arb_{inst_nr}:SLR{slr_nr}\n".format(inst_nr=i, slr_nr=target_slr) for j in range(3): slr_constraints += "slr=compression_{inst_nr}_{dp_nr}:SLR{slr_nr}\n".format(inst_nr=i, dp_nr=j, slr_nr=target_slr) - for j in range(2): - slr_constraints += "slr=hostctrl_{inst_nr}_{dp_nr}:SLR{slr_nr}\n".format(inst_nr=i, dp_nr=j, slr_nr=target_slr) + slr_constraints += "slr=hostctrl_{inst_nr}_0:SLR{slr_nr}\n".format(inst_nr=i, slr_nr=target_slr) if args.axis3x: slr_constraints += "slr=poe_{inst_nr}:SLR{slr_nr}\n".format(inst_nr=i, slr_nr=target_slr) else: 
@@ -136,9 +134,8 @@ slr_constraints += "slr=lb_user_krnl_{inst_nr}:SLR{slr_nr}\n".format(inst_nr=i, slr_nr=target_slr) else: slr_constraints += "slr=vadd_{inst_nr}_0:SLR{slr_nr}\n".format(inst_nr=i, slr_nr=target_slr) - if args.host: - for j in range(2): - slr_constraints += "slr=extdma_{inst_nr}_{dp_nr}:SLR{slr_nr}\n".format(inst_nr=i, dp_nr=j, slr_nr=target_slr) + for j in range(2): + slr_constraints += "slr=extdma_{inst_nr}_{dp_nr}:SLR{slr_nr}\n".format(inst_nr=i, dp_nr=j, slr_nr=target_slr) if args.poe == "tcp": slr_constraints += "slr=session_handler_0:SLR{slr_nr}\n".format(slr_nr=poe_slr) @@ -154,24 +151,17 @@ for i in range(num_cclo): if mem_type == "DDR": target_bank = i if args.axis3x else cclo_slr - if args.host: - mem_constraints += "sp=extdma_{inst_nr}_0.m_axi_0:DDR[{start_bank}]\n".format(inst_nr=i, start_bank=target_bank) - mem_constraints += "sp=extdma_{inst_nr}_0.m_axi_1:HOST[0]\n".format(inst_nr=i) - mem_constraints += "sp=extdma_{inst_nr}_1.m_axi_0:DDR[{start_bank}]\n".format(inst_nr=i, start_bank=target_bank) - mem_constraints += "sp=extdma_{inst_nr}_1.m_axi_1:HOST[0]\n".format(inst_nr=i) - else: - mem_constraints += "sp=ccl_offload_{inst_nr}.m_axi_0:DDR[{start_bank}]\n".format(inst_nr=i, start_bank=target_bank) - mem_constraints += "sp=ccl_offload_{inst_nr}.m_axi_1:DDR[{start_bank}]\n".format(inst_nr=i, start_bank=target_bank) + mem_constraints += "sp=extdma_{inst_nr}_0.m_axi_0:DDR[{start_bank}]\n".format(inst_nr=i, start_bank=target_bank) + mem_constraints += "sp=extdma_{inst_nr}_1.m_axi_0:DDR[{start_bank}]\n".format(inst_nr=i, start_bank=target_bank) else: - if args.host: - mem_constraints += "sp=extdma_{inst_nr}_0.m_axi_0:HBM[{start_bank}:{end_bank}]\n".format(inst_nr=i, start_bank=bank_ctr, end_bank=bank_ctr+5) - mem_constraints += "sp=extdma_{inst_nr}_0.m_axi_1:HOST[0]\n".format(inst_nr=i) - mem_constraints += "sp=extdma_{inst_nr}_1.m_axi_0:HBM[{start_bank}:{end_bank}]\n".format(inst_nr=i, start_bank=bank_ctr, end_bank=bank_ctr+5) - 
mem_constraints += "sp=extdma_{inst_nr}_1.m_axi_1:HOST[0]\n".format(inst_nr=i) - else: - mem_constraints += "sp=ccl_offload_{inst_nr}.m_axi_0:HBM[{start_bank}:{end_bank}]\n".format(inst_nr=i, start_bank=bank_ctr, end_bank=bank_ctr+5) - mem_constraints += "sp=ccl_offload_{inst_nr}.m_axi_1:HBM[{start_bank}:{end_bank}]\n".format(inst_nr=i, start_bank=bank_ctr, end_bank=bank_ctr+5) - bank_ctr += 6 + mem_constraints += "sp=extdma_{inst_nr}_0.m_axi_0:HBM[{start_bank}:{end_bank}]\n".format(inst_nr=i, start_bank=bank_ctr, end_bank=bank_ctr+5) + mem_constraints += "sp=extdma_{inst_nr}_1.m_axi_0:HBM[{start_bank}:{end_bank}]\n".format(inst_nr=i, start_bank=bank_ctr, end_bank=bank_ctr+5) + bank_ctr += 6 + + if args.host: + mem_constraints += "sp=extdma_{inst_nr}_0.m_axi_1:HOST[0]\n".format(inst_nr=i) + mem_constraints += "sp=extdma_{inst_nr}_1.m_axi_1:HOST[0]\n".format(inst_nr=i) + if args.poe == "tcp": poe_bank = bank_ctr if mem_type == "HBM" else poe_ddr_bank mem_constraints += "sp=poe_{inst_nr}.m00_axi:{mtype}[{start_bank}]\n".format(inst_nr=i, mtype=mem_type, start_bank=poe_bank) @@ -213,12 +203,14 @@ # Connect host controllers to arbiter to CCL Offload, and connect plug-ins for i in range(num_cclo): # Command interfaces - stream_connections += "stream_connect=hostctrl_{inst_nr}_0.cmd:arb_{inst_nr}.cmd_clients_0\n".format(inst_nr=i) - stream_connections += "stream_connect=hostctrl_{inst_nr}_1.cmd:arb_{inst_nr}.cmd_clients_1\n".format(inst_nr=i) - stream_connections += "stream_connect=arb_{inst_nr}.cmd_cclo:ccl_offload_{inst_nr}.s_axis_call_req\n".format(inst_nr=i) - stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_call_ack:arb_{inst_nr}.ack_cclo\n".format(inst_nr=i) - stream_connections += "stream_connect=arb_{inst_nr}.ack_clients_0:hostctrl_{inst_nr}_0.sts\n".format(inst_nr=i) - stream_connections += "stream_connect=arb_{inst_nr}.ack_clients_1:hostctrl_{inst_nr}_1.sts\n".format(inst_nr=i) + if args.vadd: + stream_connections += 
"stream_connect=hostctrl_{inst_nr}_0.cmd:arb_{inst_nr}.cmd_clients_0\n".format(inst_nr=i) + stream_connections += "stream_connect=arb_{inst_nr}.cmd_cclo:ccl_offload_{inst_nr}.s_axis_call_req\n".format(inst_nr=i) + stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_call_ack:arb_{inst_nr}.ack_cclo\n".format(inst_nr=i) + stream_connections += "stream_connect=arb_{inst_nr}.ack_clients_0:hostctrl_{inst_nr}_0.sts\n".format(inst_nr=i) + else: + stream_connections += "stream_connect=hostctrl_{inst_nr}_0.cmd:ccl_offload_{inst_nr}.s_axis_call_req\n".format(inst_nr=i) + stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_call_ack:hostctrl_{inst_nr}_0.sts\n".format(inst_nr=i) # Plugin interfaces stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_arith_op0:arith_{inst_nr}.in0\n".format(inst_nr=i) stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_arith_op1:arith_{inst_nr}.in1\n".format(inst_nr=i) @@ -233,23 +225,24 @@ if args.vadd: stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_krnl:vadd_{inst_nr}_0.data_from_cclo\n".format(inst_nr=i) stream_connections += "stream_connect=vadd_{inst_nr}_0.data_to_cclo:ccl_offload_{inst_nr}.s_axis_krnl\n".format(inst_nr=i) + stream_connections += "stream_connect=arb_{inst_nr}.ack_clients_1:vadd_{inst_nr}_0.sts_from_cclo:512\n".format(inst_nr=i) + stream_connections += "stream_connect=vadd_{inst_nr}_0.cmd_to_cclo:arb_{inst_nr}.cmd_clients_1:512\n".format(inst_nr=i) else: stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_krnl:lb_user_krnl_{inst_nr}.in\n".format(inst_nr=i) stream_connections += "stream_connect=lb_user_krnl_{inst_nr}.out:ccl_offload_{inst_nr}.s_axis_krnl\n".format(inst_nr=i) # External DMA interface - if args.host: - stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma0_s2mm:extdma_{inst_nr}_0.s_axis_s2mm\n".format(inst_nr=i) - stream_connections += 
"stream_connect=ccl_offload_{inst_nr}.m_axis_dma0_mm2s_cmd:extdma_{inst_nr}_0.s_axis_mm2s_cmd\n".format(inst_nr=i) - stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma0_s2mm_cmd:extdma_{inst_nr}_0.s_axis_s2mm_cmd\n".format(inst_nr=i) - stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma1_s2mm:extdma_{inst_nr}_1.s_axis_s2mm\n".format(inst_nr=i) - stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma1_mm2s_cmd:extdma_{inst_nr}_1.s_axis_mm2s_cmd\n".format(inst_nr=i) - stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma1_s2mm_cmd:extdma_{inst_nr}_1.s_axis_s2mm_cmd\n".format(inst_nr=i) - stream_connections += "stream_connect=extdma_{inst_nr}_0.m_axis_mm2s:ccl_offload_{inst_nr}.s_axis_dma0_mm2s\n".format(inst_nr=i) - stream_connections += "stream_connect=extdma_{inst_nr}_0.m_axis_mm2s_sts:ccl_offload_{inst_nr}.s_axis_dma0_mm2s_sts\n".format(inst_nr=i) - stream_connections += "stream_connect=extdma_{inst_nr}_0.m_axis_s2mm_sts:ccl_offload_{inst_nr}.s_axis_dma0_s2mm_sts\n".format(inst_nr=i) - stream_connections += "stream_connect=extdma_{inst_nr}_1.m_axis_mm2s:ccl_offload_{inst_nr}.s_axis_dma1_mm2s\n".format(inst_nr=i) - stream_connections += "stream_connect=extdma_{inst_nr}_1.m_axis_mm2s_sts:ccl_offload_{inst_nr}.s_axis_dma1_mm2s_sts\n".format(inst_nr=i) - stream_connections += "stream_connect=extdma_{inst_nr}_1.m_axis_s2mm_sts:ccl_offload_{inst_nr}.s_axis_dma1_s2mm_sts\n".format(inst_nr=i) + stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma0_s2mm:extdma_{inst_nr}_0.s_axis_s2mm\n".format(inst_nr=i) + stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma0_mm2s_cmd:extdma_{inst_nr}_0.s_axis_mm2s_cmd\n".format(inst_nr=i) + stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma0_s2mm_cmd:extdma_{inst_nr}_0.s_axis_s2mm_cmd\n".format(inst_nr=i) + stream_connections += 
"stream_connect=ccl_offload_{inst_nr}.m_axis_dma1_s2mm:extdma_{inst_nr}_1.s_axis_s2mm\n".format(inst_nr=i) + stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma1_mm2s_cmd:extdma_{inst_nr}_1.s_axis_mm2s_cmd\n".format(inst_nr=i) + stream_connections += "stream_connect=ccl_offload_{inst_nr}.m_axis_dma1_s2mm_cmd:extdma_{inst_nr}_1.s_axis_s2mm_cmd\n".format(inst_nr=i) + stream_connections += "stream_connect=extdma_{inst_nr}_0.m_axis_mm2s:ccl_offload_{inst_nr}.s_axis_dma0_mm2s\n".format(inst_nr=i) + stream_connections += "stream_connect=extdma_{inst_nr}_0.m_axis_mm2s_sts:ccl_offload_{inst_nr}.s_axis_dma0_mm2s_sts\n".format(inst_nr=i) + stream_connections += "stream_connect=extdma_{inst_nr}_0.m_axis_s2mm_sts:ccl_offload_{inst_nr}.s_axis_dma0_s2mm_sts\n".format(inst_nr=i) + stream_connections += "stream_connect=extdma_{inst_nr}_1.m_axis_mm2s:ccl_offload_{inst_nr}.s_axis_dma1_mm2s\n".format(inst_nr=i) + stream_connections += "stream_connect=extdma_{inst_nr}_1.m_axis_mm2s_sts:ccl_offload_{inst_nr}.s_axis_dma1_mm2s_sts\n".format(inst_nr=i) + stream_connections += "stream_connect=extdma_{inst_nr}_1.m_axis_s2mm_sts:ccl_offload_{inst_nr}.s_axis_dma1_s2mm_sts\n".format(inst_nr=i) # Connect CCLOs to POEs if args.poe == "tcp" or args.poe == "tcp_dummy": @@ -278,7 +271,8 @@ f.write("[connectivity]\n") f.write(cclo_instantiation+"\n") f.write(extdma_instantiation+"\n") - f.write(arb_instantiation+"\n") + if args.vadd: + f.write(arb_instantiation+"\n") f.write(hc_instantiation+"\n") f.write(reduce_instantiation+"\n") f.write(cast_instantiation+"\n") diff --git a/test/refdesigns/tcl/coyote.tcl b/test/refdesigns/tcl/coyote.tcl index 3381867d..5225362d 100644 --- a/test/refdesigns/tcl/coyote.tcl +++ b/test/refdesigns/tcl/coyote.tcl @@ -21,7 +21,6 @@ open_project "$build_dir/lynx/lynx.xpr" update_compile_order -fileset sources_1 create_bd_design "accl_bd" update_compile_order -fileset sources_1 -set_property ip_repo_paths "$build_dir ../../kernels" [current_project] 
update_ip_catalog create_bd_cell -type ip -vlnv Xilinx:ACCL:ccl_offload:1.0 ccl_offload_0