Skip to content

Commit

Permalink
Added support for Grace-Hopper platform with 4k and 64k Page support
Browse files Browse the repository at this point in the history
  • Loading branch information
KiranModukuri committed Aug 30, 2023
1 parent 3670962 commit 897f5be
Show file tree
Hide file tree
Showing 9 changed files with 383 additions and 233 deletions.
4 changes: 4 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
nvidia-fs (2.17.4) RELEASE; urgency=low
* Added support for Grace-Hopper platform with 4k and 64k Page support

-- Aug 2023
nvidia-fs (2.17.0) RELEASE; urgency=low
* fixed compilation issues with Linux 6.x kernels
-- June 2023
Expand Down
2 changes: 1 addition & 1 deletion src/GDS_VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.7.0.149
1.7.2.11
97 changes: 54 additions & 43 deletions src/nvfs-core.c
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ static void nvfs_get_pages_free_callback(void *data)
int bkt = 0;
struct pci_dev_mapping *pci_dev_mapping;
nvfs_ioctl_metapage_ptr_t nvfs_ioctl_mpage_ptr;
void *kaddr, *orig_kaddr;

nvfs_stat(&nvfs_n_callbacks);

Expand Down Expand Up @@ -342,16 +343,12 @@ static void nvfs_get_pages_free_callback(void *data)
nvfs_err("Error when freeing page table\n");
}

#if 0
// terminated state is the terminal state
if (unlikely(atomic_cmpxchg(&gpu_info->io_state, IO_TERMINATED,
IO_CALLBACK_END) != IO_TERMINATED))
BUG();
#endif

nvfs_ioctl_mpage_ptr = kmap_atomic(gpu_info->end_fence_page);
kaddr = kmap_atomic(gpu_info->end_fence_page);
orig_kaddr = kaddr;
kaddr = (void*)((char*)kaddr + gpu_info->offset_in_page);
nvfs_ioctl_mpage_ptr = (nvfs_ioctl_metapage_ptr_t) kaddr;
nvfs_ioctl_mpage_ptr->state = NVFS_IO_META_DIED;
kunmap_atomic(nvfs_ioctl_mpage_ptr);
kunmap_atomic(orig_kaddr);
nvfs_dbg("marking end fence state dead\n");

// Reference taken during nvfs_map()
Expand Down Expand Up @@ -563,7 +560,7 @@ int nvfs_get_dma(void *device, struct page *page, void **gpu_base_dma, int dma_l
dma_addr_t dma_base_addr, dma_start_addr;
unsigned long gpu_page_index = ULONG_MAX;
struct nvfs_io* nvfsio;
pgoff_t pgoff;
pgoff_t pgoff = 0;
nvfs_mgroup_ptr_t nvfs_mgroup;
struct nvfs_gpu_args *gpu_info;
uint64_t pdevinfo;
Expand Down Expand Up @@ -619,7 +616,8 @@ int nvfs_get_dma(void *device, struct page *page, void **gpu_base_dma, int dma_l
dma_base_addr = dma_mapping->dma_addresses[gpu_page_index];
BUG_ON(dma_base_addr == 0);
// 4K page-level offset
BUG_ON(pgoff > (GPU_PAGE_SIZE -PAGE_SIZE));
// for 64K page we expect pgoff to be 0
BUG_ON(pgoff > (GPU_PAGE_SIZE - PAGE_SIZE));
dma_start_addr = dma_base_addr + pgoff;

#ifdef SIMULATE_BUG_DMA_DISCONTIG
Expand Down Expand Up @@ -711,9 +709,12 @@ int nvfs_get_dma(void *device, struct page *page, void **gpu_base_dma, int dma_l

nvfs_io_sparse_dptr_t nvfs_io_map_sparse_data(nvfs_mgroup_ptr_t nvfs_mgroup)
{
nvfs_ioctl_metapage_ptr_t nvfs_ioctl_mpage_ptr =
kmap_atomic(nvfs_mgroup->gpu_info.end_fence_page);
nvfs_io_sparse_dptr_t sparse_ptr = &nvfs_ioctl_mpage_ptr->sparse_data;
nvfs_ioctl_metapage_ptr_t nvfs_ioctl_mpage_ptr;
nvfs_io_sparse_dptr_t sparse_ptr;
void *kaddr = kmap_atomic(nvfs_mgroup->gpu_info.end_fence_page);
kaddr = (void*)((char*)kaddr + nvfs_mgroup->gpu_info.offset_in_page);
nvfs_ioctl_mpage_ptr = (nvfs_ioctl_metapage_ptr_t) kaddr;
sparse_ptr = &nvfs_ioctl_mpage_ptr->sparse_data;
sparse_ptr->nvfs_start_magic = NVFS_START_MAGIC;
sparse_ptr->nvfs_meta_version = 1;
sparse_ptr->nholes = 0;
Expand Down Expand Up @@ -783,14 +784,17 @@ void nvfs_io_free(nvfs_io_t* nvfsio, long res)
// For the Async case, it is certain that the mgroup has not been freed, and hence
// we can mark the Async state as Done after the mgroup put as well
if (!sync) {
nvfs_ioctl_metapage_ptr_t mpage_ptr;
void *kaddr = kmap_atomic(gpu_info->end_fence_page);
nvfs_ioctl_metapage_ptr_t mpage_ptr =
(nvfs_ioctl_metapage_ptr_t) kaddr;
void *orig_kaddr = kaddr;
kaddr = (void*)((char*)kaddr + gpu_info->offset_in_page);
mpage_ptr = (nvfs_ioctl_metapage_ptr_t) kaddr;
//User space library is polling on these values
mpage_ptr->result = res;
wmb();
nvfs_dbg("freeing nvfs io end_fence_page: %llx and offset in page : %u in kernel\n", (u64)gpu_info->end_fence_page, gpu_info->offset_in_page);
mpage_ptr->end_fence_val = nvfsio->end_fence_value;
kunmap_atomic(kaddr);
kunmap_atomic(orig_kaddr);
nvfs_dbg("Async - nvfs_io complete. res %ld\n",
res);
}
Expand Down Expand Up @@ -1056,13 +1060,14 @@ static int nvfs_open(struct inode *inode, struct file *file)

mutex_lock(&nvfs_module_mutex);
nvfs_get_ops();

if(nvfs_nvidia_p2p_init()) {
nvfs_err("Could not load nvidia_p2p* symbols\n");
nvfs_put_ops();
ret = -EOPNOTSUPP;
goto out;
}

ret = nvfs_blk_register_dma_ops();
if (ret < 0) {
nvfs_err("nvfs modules probe failed with error :%d\n", ret);
Expand Down Expand Up @@ -1127,7 +1132,7 @@ static int nvfs_get_endfence_page(nvfs_ioctl_map_t *input_param,
goto out;
}

if ((unsigned long) end_fence & (PAGE_SIZE -1)) {
if ((unsigned long) end_fence & (NVFS_BLOCK_SIZE -1)) {
nvfs_err("%s:%d end_fence address not aligned\n",
__func__, __LINE__);
goto out;
Expand Down Expand Up @@ -1155,6 +1160,8 @@ static int nvfs_get_endfence_page(nvfs_ioctl_map_t *input_param,
goto out;
}

gpu_info->offset_in_page = (u32)((u64)end_fence % PAGE_SIZE);
nvfs_dbg("successfully pinned end fence address : %llx, end_fence_page : %llx offset in page : %ux in kernel\n", (u64)end_fence, (u64)gpu_info->end_fence_page, gpu_info->offset_in_page);
return 0;
out:
return ret;
Expand Down Expand Up @@ -1256,20 +1263,21 @@ static int nvfs_pin_gpu_pages(nvfs_ioctl_map_t *input_param,
}

if(gpu_buf_len < GPU_PAGE_SIZE &&
(input_param->sbuf_block * PAGE_SIZE) <
(input_param->sbuf_block * NVFS_BLOCK_SIZE) <
(gpuvaddr - gpu_virt_start + gpu_buf_len))
{
nvfs_err("invalid shadow buf size provided %ld, gpu_buf_len: %lld, gpuvaddr: %llx \n",
input_param->sbuf_block * PAGE_SIZE, gpu_buf_len, gpuvaddr);
nvfs_err("invalid shadow buf size provided %u, gpu_buf_len: %lld, gpuvaddr: %llx \n",
input_param->sbuf_block * NVFS_BLOCK_SIZE, gpu_buf_len, gpuvaddr);
goto error;
}

rounded_size = round_up((gpu_virt_end - gpu_virt_start + 1),
GPU_PAGE_SIZE);

nvfs_dbg("gpu_addr 0x%llx cpu_addr 0x%llx\n",
nvfs_dbg("gpu_addr 0x%llx cpu_addr 0x%llx gpu_buf_len %llu\n",
input_param->gpuvaddr,
input_param->cpuvaddr);
input_param->cpuvaddr,
gpu_buf_len);

gpu_info->gpu_buf_len = gpu_buf_len;
gpu_info->gpuvaddr = gpuvaddr;
Expand Down Expand Up @@ -1453,7 +1461,7 @@ static int nvfs_map(nvfs_ioctl_map_t *input_param)
nvfs_get_ops();

nvfs_mgroup = nvfs_mgroup_pin_shadow_pages(input_param->cpuvaddr,
input_param->sbuf_block * PAGE_SIZE);
input_param->sbuf_block * NVFS_BLOCK_SIZE);
if (!nvfs_mgroup) {
nvfs_err("%s:%d Error nvfs_setup_shadow_buffer\n",
__func__, __LINE__);
Expand Down Expand Up @@ -1546,8 +1554,8 @@ struct nvfs_io* nvfs_io_init(int op, nvfs_ioctl_ioargs_t *ioargs)
return ERR_PTR(ret);
}

if (offset_in_page(ioargs->offset) ||
offset_in_page(ioargs->size)) {
if (ioargs->offset % NVFS_BLOCK_SIZE ||
ioargs->size % NVFS_BLOCK_SIZE) {
nvfs_err("%s:%d offset = %lld size = %llu not sector aligned\n",
__func__, __LINE__,
ioargs->offset,
Expand Down Expand Up @@ -1705,8 +1713,11 @@ struct nvfs_io* nvfs_io_init(int op, nvfs_ioctl_ioargs_t *ioargs)

gpu_virt_start = (gpu_info->gpuvaddr & GPU_PAGE_MASK);
va_offset = ((u64)gpu_info->gpuvaddr - gpu_virt_start) +
file_args->devptroff;
if (offset_in_page(va_offset)) {
file_args->devptroff;
nvfs_dbg("gpuvaddr : %llu, gpu_virt_start : %llu, devptroff : %llu, va_offset : %llu\n",
(u64)gpu_info->gpuvaddr, (u64)gpu_virt_start, (u64) file_args->devptroff, va_offset);

if (va_offset % NVFS_BLOCK_SIZE) {
nvfs_err("gpu_va_offset not aligned va_offset %ld "
"devptroff %ld\n",
(unsigned long)va_offset,
Expand Down Expand Up @@ -1751,7 +1762,7 @@ struct nvfs_io* nvfs_io_init(int op, nvfs_ioctl_ioargs_t *ioargs)
#ifdef NVFS_ENABLE_KERN_RDMA_SUPPORT
//If use_rkey is set, then set the appropriate segments for this IO
if(nvfsio->use_rkeys) {
shadow_buf_size = nvfs_mgroup->nvfs_pages_count * PAGE_SIZE;
shadow_buf_size = nvfs_mgroup->nvfs_blocks_count * NVFS_BLOCK_SIZE;
rdma_seg_offset = va_offset % shadow_buf_size;
nvfsio->rdma_seg_offset = rdma_seg_offset;
nvfs_dbg("%s: set curr rdma segment offset = %lu\n",
Expand Down Expand Up @@ -1912,8 +1923,8 @@ long nvfs_io_start_op(nvfs_io_t* nvfsio)
loff_t fd_offset = nvfsio->fd_offset;
u64 va_offset = 0;
int op = nvfsio->op;
unsigned long shadow_buf_size = (nvfs_mgroup->nvfs_pages_count) *
PAGE_SIZE;
unsigned long shadow_buf_size = (nvfs_mgroup->nvfs_blocks_count) *
NVFS_BLOCK_SIZE;
ssize_t rdma_seg_offset = 0;

nvfs_dbg("Ring %s: m_pDBuffer=%lx BufferSize=%lu TotalRWSize:%ld "
Expand Down Expand Up @@ -1973,7 +1984,7 @@ long nvfs_io_start_op(nvfs_io_t* nvfsio)
nvfs_dbg("%s rdma offset = %lu\n", __func__, rdma_seg_offset);

while (bytes_left) {
int nr_pages;
int nr_blocks;
size_t bytes_issued;

// Check if there are any callbacks or munmaps
Expand All @@ -1988,17 +1999,17 @@ long nvfs_io_start_op(nvfs_io_t* nvfsio)
}

bytes_issued = min((long) bytes_left, (long)shadow_buf_size - (long)rdma_seg_offset);
BUG_ON(offset_in_page(bytes_issued));

nr_pages = DIV_ROUND_UP(bytes_issued, PAGE_SIZE);

nvfs_dbg("Num 4k Pages in process address "
"nr_pages=%d bytes_left=%lu "
"%s bytes_issued=%lu nvfsio 0x%p rdma_seg_offset %lu use_rkey:%d \n",
nr_pages, bytes_left, opstr(op), bytes_issued, nvfsio,
//BUG_ON(offset_in_page(bytes_issued));
BUG_ON(bytes_issued % NVFS_BLOCK_SIZE);

nr_blocks = DIV_ROUND_UP(bytes_issued, NVFS_BLOCK_SIZE);
nvfs_dbg("Num blocks in process address "
"nr_blocks=%d bytes_left=%lu "
"%s bytes_issued=%lu nvfsio 0x%p rdma_seg_offset %lu use_rkey:%d\n",
nr_blocks, bytes_left, opstr(op), bytes_issued, nvfsio,
rdma_seg_offset, nvfsio->use_rkeys);

ret = nvfs_mgroup_fill_mpages(nvfs_mgroup, nr_pages);
ret = nvfs_mgroup_fill_mpages(nvfs_mgroup, nr_blocks);
// Check if there are any callbacks or munmaps
if (ret < 0) {
nvfs_err("%s:%d shadow buffer misaligned for gpu page_offset: 0x%llx bytes_issued: %ld bytes"
Expand Down Expand Up @@ -2100,7 +2111,7 @@ long nvfs_io_start_op(nvfs_io_t* nvfsio)
}

#ifdef SIMULATE_LESS_BYTES
if (bytes_done > 4096) {
if (bytes_done > NVFS_BLOCK_SIZE) {
bytes_done -= 4091;
nvfs_info("truncate request size :%lu\n", bytes_done);
}
Expand Down
2 changes: 1 addition & 1 deletion src/nvfs-core.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ void nvfs_io_process_exiting(nvfs_mgroup_ptr_t nvfs_mgroup);
#define NVFS_IOCTL_BATCH_IO _IOW(NVFS_MAGIC, 8, int)
#endif

#define PAGE_PER_GPU_PAGE_SHIFT 4
#define PAGE_PER_GPU_PAGE_SHIFT ilog2(GPU_PAGE_SIZE / PAGE_SIZE)
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE ((u64)1 << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
Expand Down
37 changes: 27 additions & 10 deletions src/nvfs-dma.c
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,12 @@ static int nvfs_blk_rq_map_sg_internal(struct request_queue *q,
}

curr_page_gpu = (nvfs_mgroup != NULL);
if (nvfs_mgroup != NULL) {
if (nvfs_mgroup_metadata_set_dma_state(bvec.bv_page, nvfs_mgroup, bvec.bv_len, bvec.bv_offset) != 0) {
nvfs_err("%s:%d mgroup_set_dma error\n", __func__, __LINE__);
return NVFS_IO_ERR;
}
}
#endif

/*
Expand Down Expand Up @@ -494,6 +500,7 @@ static int nvfs_dma_map_sg_attrs_internal(struct device *device,
void *gpu_base_dma = NULL;
struct scatterlist *sg = NULL;
struct blk_plug *plug = NULL;
nvfs_mgroup_ptr_t nvfs_mgroup = NULL;

if (unlikely(nents == 0)) {
nvfs_err("%s:%d cannot map empty sglist\n", __func__, __LINE__);
Expand Down Expand Up @@ -526,9 +533,19 @@ static int nvfs_dma_map_sg_attrs_internal(struct device *device,
current->plug = NULL;
ret = nvfs_get_dma(to_pci_dev(device), sg_page(sg), &gpu_base_dma, -1);
current->plug = plug;
}
else
} else {
ret = nvfs_get_dma(to_pci_dev(device), sg_page(sg), &gpu_base_dma, sg->length);
if (ret == 0) {
nvfs_mgroup = nvfs_mgroup_from_page(sg_page(sg));
if(nvfs_mgroup == NULL) {
nvfs_err("%s:%d empty mgroup\n", __func__, __LINE__);
return NVFS_IO_ERR;
}
// We have dma mapping set up
nvfs_mgroup_metadata_set_dma_state(sg_page(sg), nvfs_mgroup, sg->length, sg->offset);
nvfs_mgroup_put(nvfs_mgroup);
}
}

#ifdef SIMULATE_NVFS_IOERR
ret = NVFS_IO_ERR;
Expand Down Expand Up @@ -643,7 +660,7 @@ static int nvfs_dma_unmap_sg(struct device *device,
page = sg_page(sg);
if (unlikely(page == NULL))
continue;
ret = nvfs_check_gpu_page_and_error(page);
ret = nvfs_check_gpu_page_and_error(page, sg->offset, sg->length);
if (!ret) {
cpu_segs++;
} else if (unlikely(ret == -1)) {
Expand Down Expand Up @@ -714,7 +731,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist,
struct scatterlist *sg = NULL;
struct page *page;
nvfs_mgroup_ptr_t nvfs_mgroup = NULL, prev_mgroup = NULL;
int i = 0, npages = 0;
int i = 0, nblocks = 0;
uint64_t shadow_buf_size, total_size = 0;
struct nvfs_io* nvfsio = NULL;

Expand Down Expand Up @@ -759,16 +776,16 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist,
nvfs_mgroup_put(prev_mgroup);
return NVFS_IO_ERR;
}
shadow_buf_size = (prev_mgroup->nvfs_pages_count) * PAGE_SIZE;
shadow_buf_size = (prev_mgroup->nvfs_blocks_count) * NVFS_BLOCK_SIZE;
nvfsio = &prev_mgroup->nvfsio;
memcpy(rdma_infop, &prev_mgroup->rdma_info, sizeof(*rdma_infop));
// get to the base 64K page of the starting address
rdma_infop->rem_vaddr -= (rdma_infop->rem_vaddr & (GPU_PAGE_SIZE -1));
// set to the current address by calculating the number of 64K pages + offset
rdma_infop->rem_vaddr += (nvfsio->cur_gpu_base_index << GPU_PAGE_SHIFT);
rdma_infop->rem_vaddr += (nvfsio->gpu_page_offset);
rdma_infop->size = (nvfsio->nvfs_active_pages_end -
nvfsio->nvfs_active_pages_start + 1) * PAGE_SIZE;
rdma_infop->size = (nvfsio->nvfs_active_blocks_end -
nvfsio->nvfs_active_blocks_start + 1) * NVFS_BLOCK_SIZE;
if ((int32_t) rdma_infop->size > (shadow_buf_size - nvfsio->rdma_seg_offset) ||
(int32_t) rdma_infop->size < 0) {
nvfs_err("%s: wrong rdma_infop->size %d shadow buffer size %llu addr = 0x%llx\n \
Expand All @@ -785,7 +802,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist,
nvfsio->rdma_seg_offset, rdma_infop->rkey);
}

shadow_buf_size = (prev_mgroup->nvfs_pages_count) * PAGE_SIZE;
shadow_buf_size = (prev_mgroup->nvfs_blocks_count) * NVFS_BLOCK_SIZE;
nvfs_mgroup_put(prev_mgroup);

for_each_sg(sglist, sg, nents, i) {
Expand All @@ -795,7 +812,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist,
return NVFS_BAD_REQ;
}
page = sg_page(sg);
npages = DIV_ROUND_UP(sg->length, PAGE_SIZE);
nblocks = DIV_ROUND_UP(sg->length, NVFS_BLOCK_SIZE);

if(page == NULL) {
nvfs_dbg("%s: NULL page passed, page number: %d", __func__, i);
Expand All @@ -807,7 +824,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist,
#ifdef NVFS_TEST_GPFS_CALLBACK
nvfs_mgroup = nvfs_mgroup_get((page->index >> NVFS_MAX_SHADOW_PAGES_ORDER));
#else
nvfs_mgroup = nvfs_mgroup_from_page_range(page, npages);
nvfs_mgroup = nvfs_mgroup_from_page_range(page, nblocks, sg->offset);
#endif
if(nvfs_mgroup == NULL) {
nvfs_dbg("%s: mgroup NULL for page %d for addr 0x%p", __func__, i, page);
Expand Down
Loading

0 comments on commit 897f5be

Please sign in to comment.