From 897f5be31373cd20dae3015aa168637822ef81c9 Mon Sep 17 00:00:00 2001 From: Kiran Modukuri Date: Wed, 30 Aug 2023 15:13:07 -0700 Subject: [PATCH] Added support for Grace-Hopper platform with 4k and 64k Page support --- ChangeLog | 4 + src/GDS_VERSION | 2 +- src/nvfs-core.c | 97 ++++++----- src/nvfs-core.h | 2 +- src/nvfs-dma.c | 37 +++-- src/nvfs-mmap.c | 428 ++++++++++++++++++++++++++++++------------------ src/nvfs-mmap.h | 36 ++-- src/nvfs-rdma.c | 8 +- src/nvfs-vers.h | 2 +- 9 files changed, 383 insertions(+), 233 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4b7a8b4..01910f6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +nvidia-fs (2.17.4) RELEASE; urgency=low + * Added support for Grace-Hopper platform with 4k and 64k Page support + +-- Aug 2023 nvidia-fs (2.17.0) RELEASE; urgency=low * fixed compilations issues wth linux ver 6.x kernels -- June 2023 diff --git a/src/GDS_VERSION b/src/GDS_VERSION index a4bd231..7344df8 100644 --- a/src/GDS_VERSION +++ b/src/GDS_VERSION @@ -1 +1 @@ -1.7.0.149 +1.7.2.11 diff --git a/src/nvfs-core.c b/src/nvfs-core.c index a2abbf3..6c8a533 100644 --- a/src/nvfs-core.c +++ b/src/nvfs-core.c @@ -293,6 +293,7 @@ static void nvfs_get_pages_free_callback(void *data) int bkt = 0; struct pci_dev_mapping *pci_dev_mapping; nvfs_ioctl_metapage_ptr_t nvfs_ioctl_mpage_ptr; + void *kaddr, *orig_kaddr; nvfs_stat(&nvfs_n_callbacks); @@ -342,16 +343,12 @@ static void nvfs_get_pages_free_callback(void *data) nvfs_err("Error when freeing page table\n"); } -#if 0 - // terminated state is the terminal state - if (unlikely(atomic_cmpxchg(&gpu_info->io_state, IO_TERMINATED, - IO_CALLBACK_END) != IO_TERMINATED)) - BUG(); -#endif - - nvfs_ioctl_mpage_ptr = kmap_atomic(gpu_info->end_fence_page); + kaddr = kmap_atomic(gpu_info->end_fence_page); + orig_kaddr = kaddr; + kaddr = (void*)((char*)kaddr + gpu_info->offset_in_page); + nvfs_ioctl_mpage_ptr = (nvfs_ioctl_metapage_ptr_t) kaddr; nvfs_ioctl_mpage_ptr->state = NVFS_IO_META_DIED; - kunmap_atomic(nvfs_ioctl_mpage_ptr); + kunmap_atomic(orig_kaddr); nvfs_dbg("marking end fence state dead\n"); // Reference taken during nvfs_map() @@ -563,7 +560,7 @@ int nvfs_get_dma(void *device, struct page *page, void **gpu_base_dma, int dma_l dma_addr_t dma_base_addr, dma_start_addr; unsigned long gpu_page_index = ULONG_MAX; struct nvfs_io* nvfsio; - pgoff_t pgoff; + pgoff_t pgoff = 0; nvfs_mgroup_ptr_t nvfs_mgroup; struct nvfs_gpu_args *gpu_info; uint64_t pdevinfo; @@ -619,7 +616,8 @@ int nvfs_get_dma(void *device, struct page *page, void **gpu_base_dma, int dma_l dma_base_addr = dma_mapping->dma_addresses[gpu_page_index]; BUG_ON(dma_base_addr == 0); // 4K page-level offset - BUG_ON(pgoff > (GPU_PAGE_SIZE -PAGE_SIZE)); + // for 64K page we expect pgoff to be 0 + BUG_ON(pgoff > (GPU_PAGE_SIZE - PAGE_SIZE)); dma_start_addr = dma_base_addr + pgoff; #ifdef SIMULATE_BUG_DMA_DISCONTIG @@ -711,9 +709,12 @@ int nvfs_get_dma(void *device, struct page *page, void **gpu_base_dma, int dma_l nvfs_io_sparse_dptr_t nvfs_io_map_sparse_data(nvfs_mgroup_ptr_t nvfs_mgroup) { - nvfs_ioctl_metapage_ptr_t nvfs_ioctl_mpage_ptr = - kmap_atomic(nvfs_mgroup->gpu_info.end_fence_page); - nvfs_io_sparse_dptr_t sparse_ptr = &nvfs_ioctl_mpage_ptr->sparse_data; + nvfs_ioctl_metapage_ptr_t nvfs_ioctl_mpage_ptr; + nvfs_io_sparse_dptr_t sparse_ptr; + void *kaddr = kmap_atomic(nvfs_mgroup->gpu_info.end_fence_page); + kaddr = (void*)((char*)kaddr + nvfs_mgroup->gpu_info.offset_in_page); + nvfs_ioctl_mpage_ptr = (nvfs_ioctl_metapage_ptr_t) kaddr; + sparse_ptr = 
&nvfs_ioctl_mpage_ptr->sparse_data; sparse_ptr->nvfs_start_magic = NVFS_START_MAGIC; sparse_ptr->nvfs_meta_version = 1; sparse_ptr->nholes = 0; @@ -783,14 +784,17 @@ void nvfs_io_free(nvfs_io_t* nvfsio, long res) //For Async case, it's certain that mgroup wouldn't have been freed and hence //we can mark the state Async state as Done after mgroup put as well if (!sync) { + nvfs_ioctl_metapage_ptr_t mpage_ptr; void *kaddr = kmap_atomic(gpu_info->end_fence_page); - nvfs_ioctl_metapage_ptr_t mpage_ptr = - (nvfs_ioctl_metapage_ptr_t) kaddr; + void *orig_kaddr = kaddr; + kaddr = (void*)((char*)kaddr + gpu_info->offset_in_page); + mpage_ptr = (nvfs_ioctl_metapage_ptr_t) kaddr; //User space library is polling on these values mpage_ptr->result = res; wmb(); + nvfs_dbg("freeing nvfs io end_fence_page: %llx and offset in page : %u in kernel\n", (u64)gpu_info->end_fence_page, gpu_info->offset_in_page); mpage_ptr->end_fence_val = nvfsio->end_fence_value; - kunmap_atomic(kaddr); + kunmap_atomic(orig_kaddr); nvfs_dbg("Async - nvfs_io complete. res %ld\n", res); } @@ -1056,13 +1060,14 @@ static int nvfs_open(struct inode *inode, struct file *file) mutex_lock(&nvfs_module_mutex); nvfs_get_ops(); - + if(nvfs_nvidia_p2p_init()) { nvfs_err("Could not load nvidia_p2p* symbols\n"); nvfs_put_ops(); ret = -EOPNOTSUPP; goto out; } + ret = nvfs_blk_register_dma_ops(); if (ret < 0) { nvfs_err("nvfs modules probe failed with error :%d\n", ret); @@ -1127,7 +1132,7 @@ static int nvfs_get_endfence_page(nvfs_ioctl_map_t *input_param, goto out; } - if ((unsigned long) end_fence & (PAGE_SIZE -1)) { + if ((unsigned long) end_fence & (NVFS_BLOCK_SIZE -1)) { nvfs_err("%s:%d end_fence address not aligned\n", __func__, __LINE__); goto out; @@ -1155,6 +1160,8 @@ static int nvfs_get_endfence_page(nvfs_ioctl_map_t *input_param, goto out; } + gpu_info->offset_in_page = (u32)((u64)end_fence % PAGE_SIZE); + nvfs_dbg("successfully pinned end fence address : %llx, end_fence_page : %llx offset in page : %ux in kernel\n", (u64)end_fence, (u64)gpu_info->end_fence_page, gpu_info->offset_in_page); return 0; out: return ret; @@ -1256,20 +1263,21 @@ static int nvfs_pin_gpu_pages(nvfs_ioctl_map_t *input_param, } if(gpu_buf_len < GPU_PAGE_SIZE && - (input_param->sbuf_block * PAGE_SIZE) < + (input_param->sbuf_block * NVFS_BLOCK_SIZE) < (gpuvaddr - gpu_virt_start + gpu_buf_len)) { - nvfs_err("invalid shadow buf size provided %ld, gpu_buf_len: %lld, gpuvaddr: %llx \n", - input_param->sbuf_block * PAGE_SIZE, gpu_buf_len, gpuvaddr); + nvfs_err("invalid shadow buf size provided %u, gpu_buf_len: %lld, gpuvaddr: %llx \n", + input_param->sbuf_block * NVFS_BLOCK_SIZE, gpu_buf_len, gpuvaddr); goto error; } rounded_size = round_up((gpu_virt_end - gpu_virt_start + 1), GPU_PAGE_SIZE); - nvfs_dbg("gpu_addr 0x%llx cpu_addr 0x%llx\n", + nvfs_dbg("gpu_addr 0x%llx cpu_addr 0x%llx gpu_buf_len %llu\n", input_param->gpuvaddr, - input_param->cpuvaddr); + input_param->cpuvaddr, + gpu_buf_len); gpu_info->gpu_buf_len = gpu_buf_len; gpu_info->gpuvaddr = gpuvaddr; @@ -1453,7 +1461,7 @@ static int nvfs_map(nvfs_ioctl_map_t *input_param) nvfs_get_ops(); nvfs_mgroup = nvfs_mgroup_pin_shadow_pages(input_param->cpuvaddr, - input_param->sbuf_block * PAGE_SIZE); + input_param->sbuf_block * NVFS_BLOCK_SIZE); if (!nvfs_mgroup) { nvfs_err("%s:%d Error nvfs_setup_shadow_buffer\n", __func__, __LINE__); @@ -1546,8 +1554,8 @@ struct nvfs_io* nvfs_io_init(int op, nvfs_ioctl_ioargs_t *ioargs) return ERR_PTR(ret); } - if (offset_in_page(ioargs->offset) || - offset_in_page(ioargs->size)) { 
+ if (ioargs->offset % NVFS_BLOCK_SIZE || + ioargs->size % NVFS_BLOCK_SIZE) { nvfs_err("%s:%d offset = %lld size = %llu not sector aligned\n", __func__, __LINE__, ioargs->offset, @@ -1705,8 +1713,11 @@ struct nvfs_io* nvfs_io_init(int op, nvfs_ioctl_ioargs_t *ioargs) gpu_virt_start = (gpu_info->gpuvaddr & GPU_PAGE_MASK); va_offset = ((u64)gpu_info->gpuvaddr - gpu_virt_start) + - file_args->devptroff; - if (offset_in_page(va_offset)) { + file_args->devptroff; + nvfs_dbg("gpuvaddr : %llu, gpu_virt_start : %llu, devptroff : %llu, va_offset : %llu\n", + (u64)gpu_info->gpuvaddr, (u64)gpu_virt_start, (u64) file_args->devptroff, va_offset); + + if (va_offset % NVFS_BLOCK_SIZE) { nvfs_err("gpu_va_offset not aligned va_offset %ld " "devptroff %ld\n", (unsigned long)va_offset, @@ -1751,7 +1762,7 @@ struct nvfs_io* nvfs_io_init(int op, nvfs_ioctl_ioargs_t *ioargs) #ifdef NVFS_ENABLE_KERN_RDMA_SUPPORT //If use_rkey is set, then set the appropriate segments for this IO if(nvfsio->use_rkeys) { - shadow_buf_size = nvfs_mgroup->nvfs_pages_count * PAGE_SIZE; + shadow_buf_size = nvfs_mgroup->nvfs_blocks_count * NVFS_BLOCK_SIZE; rdma_seg_offset = va_offset % shadow_buf_size; nvfsio->rdma_seg_offset = rdma_seg_offset; nvfs_dbg("%s: set curr rdma segment offset = %lu\n", @@ -1912,8 +1923,8 @@ long nvfs_io_start_op(nvfs_io_t* nvfsio) loff_t fd_offset = nvfsio->fd_offset; u64 va_offset = 0; int op = nvfsio->op; - unsigned long shadow_buf_size = (nvfs_mgroup->nvfs_pages_count) * - PAGE_SIZE; + unsigned long shadow_buf_size = (nvfs_mgroup->nvfs_blocks_count) * + NVFS_BLOCK_SIZE; ssize_t rdma_seg_offset = 0; nvfs_dbg("Ring %s: m_pDBuffer=%lx BufferSize=%lu TotalRWSize:%ld " @@ -1973,7 +1984,7 @@ long nvfs_io_start_op(nvfs_io_t* nvfsio) nvfs_dbg("%s rdma offset = %lu\n", __func__, rdma_seg_offset); while (bytes_left) { - int nr_pages; + int nr_blocks; size_t bytes_issued; // Check if there are any callbacks or munmaps @@ -1988,17 +1999,17 @@ long nvfs_io_start_op(nvfs_io_t* nvfsio) } bytes_issued = min((long) bytes_left, (long)shadow_buf_size - (long)rdma_seg_offset); - BUG_ON(offset_in_page(bytes_issued)); - - nr_pages = DIV_ROUND_UP(bytes_issued, PAGE_SIZE); - - nvfs_dbg("Num 4k Pages in process address " - "nr_pages=%d bytes_left=%lu " - "%s bytes_issued=%lu nvfsio 0x%p rdma_seg_offset %lu use_rkey:%d \n", - nr_pages, bytes_left, opstr(op), bytes_issued, nvfsio, + //BUG_ON(offset_in_page(bytes_issued)); + BUG_ON(bytes_issued % NVFS_BLOCK_SIZE); + + nr_blocks = DIV_ROUND_UP(bytes_issued, NVFS_BLOCK_SIZE); + nvfs_dbg("Num blocks in process address " + "nr_blocks=%d bytes_left=%lu " + "%s bytes_issued=%lu nvfsio 0x%p rdma_seg_offset %lu use_rkey:%d\n", + nr_blocks, bytes_left, opstr(op), bytes_issued, nvfsio, rdma_seg_offset, nvfsio->use_rkeys); - ret = nvfs_mgroup_fill_mpages(nvfs_mgroup, nr_pages); + ret = nvfs_mgroup_fill_mpages(nvfs_mgroup, nr_blocks); // Check if there are any callbacks or munmaps if (ret < 0) { nvfs_err("%s:%d shadow buffer misaligned for gpu page_offset: 0x%llx bytes_issued: %ld bytes" @@ -2100,7 +2111,7 @@ long nvfs_io_start_op(nvfs_io_t* nvfsio) } #ifdef SIMULATE_LESS_BYTES - if (bytes_done > 4096) { + if (bytes_done > NVFS_BLOCK_SIZE) { bytes_done -= 4091; nvfs_info("truncate request size :%lu\n", bytes_done); } diff --git a/src/nvfs-core.h b/src/nvfs-core.h index 9081c51..0d47a74 100644 --- a/src/nvfs-core.h +++ b/src/nvfs-core.h @@ -229,7 +229,7 @@ void nvfs_io_process_exiting(nvfs_mgroup_ptr_t nvfs_mgroup); #define NVFS_IOCTL_BATCH_IO _IOW(NVFS_MAGIC, 8, int) #endif -#define 
PAGE_PER_GPU_PAGE_SHIFT 4 +#define PAGE_PER_GPU_PAGE_SHIFT ilog2(GPU_PAGE_SIZE / PAGE_SIZE) #define GPU_PAGE_SHIFT 16 #define GPU_PAGE_SIZE ((u64)1 << GPU_PAGE_SHIFT) #define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1) diff --git a/src/nvfs-dma.c b/src/nvfs-dma.c index 5725822..ad2c35e 100644 --- a/src/nvfs-dma.c +++ b/src/nvfs-dma.c @@ -319,6 +319,12 @@ static int nvfs_blk_rq_map_sg_internal(struct request_queue *q, } curr_page_gpu = (nvfs_mgroup != NULL); + if (nvfs_mgroup != NULL) { + if (nvfs_mgroup_metadata_set_dma_state(bvec.bv_page, nvfs_mgroup, bvec.bv_len, bvec.bv_offset) != 0) { + nvfs_err("%s:%d mgroup_set_dma error\n", __func__, __LINE__); + return NVFS_IO_ERR; + } + } #endif /* @@ -494,6 +500,7 @@ static int nvfs_dma_map_sg_attrs_internal(struct device *device, void *gpu_base_dma = NULL; struct scatterlist *sg = NULL; struct blk_plug *plug = NULL; + nvfs_mgroup_ptr_t nvfs_mgroup = NULL; if (unlikely(nents == 0)) { nvfs_err("%s:%d cannot map empty sglist\n", __func__, __LINE__); @@ -526,9 +533,19 @@ static int nvfs_dma_map_sg_attrs_internal(struct device *device, current->plug = NULL; ret = nvfs_get_dma(to_pci_dev(device), sg_page(sg), &gpu_base_dma, -1); current->plug = plug; - } - else + } else { ret = nvfs_get_dma(to_pci_dev(device), sg_page(sg), &gpu_base_dma, sg->length); + if (ret == 0) { + nvfs_mgroup = nvfs_mgroup_from_page(sg_page(sg)); + if(nvfs_mgroup == NULL) { + nvfs_err("%s:%d empty mgroup\n", __func__, __LINE__); + return NVFS_IO_ERR; + } + // We have dma mapping set up + nvfs_mgroup_metadata_set_dma_state(sg_page(sg), nvfs_mgroup, sg->length, sg->offset); + nvfs_mgroup_put(nvfs_mgroup); + } + } #ifdef SIMULATE_NVFS_IOERR ret = NVFS_IO_ERR; @@ -643,7 +660,7 @@ static int nvfs_dma_unmap_sg(struct device *device, page = sg_page(sg); if (unlikely(page == NULL)) continue; - ret = nvfs_check_gpu_page_and_error(page); + ret = nvfs_check_gpu_page_and_error(page, sg->offset, sg->length); if (!ret) { cpu_segs++; } else if (unlikely(ret == -1)) { @@ -714,7 +731,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist, struct scatterlist *sg = NULL; struct page *page; nvfs_mgroup_ptr_t nvfs_mgroup = NULL, prev_mgroup = NULL; - int i = 0, npages = 0; + int i = 0, nblocks = 0; uint64_t shadow_buf_size, total_size = 0; struct nvfs_io* nvfsio = NULL; @@ -759,7 +776,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist, nvfs_mgroup_put(prev_mgroup); return NVFS_IO_ERR; } - shadow_buf_size = (prev_mgroup->nvfs_pages_count) * PAGE_SIZE; + shadow_buf_size = (prev_mgroup->nvfs_blocks_count) * NVFS_BLOCK_SIZE; nvfsio = &prev_mgroup->nvfsio; memcpy(rdma_infop, &prev_mgroup->rdma_info, sizeof(*rdma_infop)); // get to the base 64K page of the starting address @@ -767,8 +784,8 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist, // set to the current address by calulating the number of 64K pages + offset rdma_infop->rem_vaddr += (nvfsio->cur_gpu_base_index << GPU_PAGE_SHIFT); rdma_infop->rem_vaddr += (nvfsio->gpu_page_offset); - rdma_infop->size = (nvfsio->nvfs_active_pages_end - - nvfsio->nvfs_active_pages_start + 1) * PAGE_SIZE; + rdma_infop->size = (nvfsio->nvfs_active_blocks_end - + nvfsio->nvfs_active_blocks_start + 1) * NVFS_BLOCK_SIZE; if ((int32_t) rdma_infop->size > (shadow_buf_size - nvfsio->rdma_seg_offset) || (int32_t) rdma_infop->size < 0) { nvfs_err("%s: wrong rdma_infop->size %d shadow buffer size %llu addr = 0x%llx\n \ @@ -785,7 +802,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist, nvfsio->rdma_seg_offset, rdma_infop->rkey); } - 
shadow_buf_size = (prev_mgroup->nvfs_pages_count) * PAGE_SIZE; + shadow_buf_size = (prev_mgroup->nvfs_blocks_count) * NVFS_BLOCK_SIZE; nvfs_mgroup_put(prev_mgroup); for_each_sg(sglist, sg, nents, i) { @@ -795,7 +812,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist, return NVFS_BAD_REQ; } page = sg_page(sg); - npages = DIV_ROUND_UP(sg->length, PAGE_SIZE); + nblocks = DIV_ROUND_UP(sg->length, NVFS_BLOCK_SIZE); if(page == NULL) { nvfs_dbg("%s: NULL page passed, page number: %d", __func__, i); @@ -807,7 +824,7 @@ int nvfs_get_gpu_sglist_rdma_info(struct scatterlist *sglist, #ifdef NVFS_TEST_GPFS_CALLBACK nvfs_mgroup = nvfs_mgroup_get((page->index >> NVFS_MAX_SHADOW_PAGES_ORDER)); #else - nvfs_mgroup = nvfs_mgroup_from_page_range(page, npages); + nvfs_mgroup = nvfs_mgroup_from_page_range(page, nblocks, sg->offset); #endif if(nvfs_mgroup == NULL) { nvfs_dbg("%s: mgroup NULL for page %d for addr 0x%p", __func__, i, page); diff --git a/src/nvfs-mmap.c b/src/nvfs-mmap.c index 544846a..e3e7221 100644 --- a/src/nvfs-mmap.c +++ b/src/nvfs-mmap.c @@ -126,9 +126,9 @@ static void nvfs_mgroup_free(nvfs_mgroup_ptr_t nvfs_mgroup, bool from_dma) { int i; struct nvfs_gpu_args *gpu_info = NULL; + int nvfs_block_count_per_page = (int) PAGE_SIZE / NVFS_BLOCK_SIZE; gpu_info = &nvfs_mgroup->gpu_info; - if (atomic_read(&gpu_info->io_state) > IO_INIT) { if(nvfs_free_gpu_info(gpu_info, from_dma) != 0) { nvfs_info("nvfs_free_gpu_info failed. for mgroup %p, ref cnt %d\n", @@ -161,12 +161,12 @@ static void nvfs_mgroup_free(nvfs_mgroup_ptr_t nvfs_mgroup, bool from_dma) if(nvfs_mgroup->nvfs_metadata) kfree(nvfs_mgroup->nvfs_metadata); if(nvfs_mgroup->nvfs_ppages) { - for(i=0; i< nvfs_mgroup->nvfs_pages_count; i++) { - if(nvfs_mgroup->nvfs_ppages[i] != NULL) - put_page(nvfs_mgroup->nvfs_ppages[i]); + for(i = 0; i < nvfs_mgroup->nvfs_blocks_count; i = i + nvfs_block_count_per_page) { + if(nvfs_mgroup->nvfs_ppages[i/nvfs_block_count_per_page] != NULL) + put_page(nvfs_mgroup->nvfs_ppages[i/nvfs_block_count_per_page]); } kfree(nvfs_mgroup->nvfs_ppages); - nvfs_mgroup->nvfs_pages_count = 0; + nvfs_mgroup->nvfs_blocks_count = 0; nvfs_mgroup->nvfs_ppages = NULL; } nvfs_mgroup->base_index = 0; @@ -230,6 +230,7 @@ static nvfs_mgroup_ptr_t nvfs_get_mgroup_from_vaddr_internal(u64 cpuvaddr) unsigned long cur_base_index = 0; nvfs_mgroup_ptr_t nvfs_mgroup = NULL; nvfs_mgroup_page_ptr_t nvfs_mpage; + int nvfs_block_count_per_page = (int) PAGE_SIZE / NVFS_BLOCK_SIZE; if (!cpuvaddr) { nvfs_err("%s:%d Invalid shadow buffer address\n", @@ -237,7 +238,8 @@ static nvfs_mgroup_ptr_t nvfs_get_mgroup_from_vaddr_internal(u64 cpuvaddr) goto out; } - if (offset_in_page(cpuvaddr)) { +// if (offset_in_page(cpuvaddr)) { + if (cpuvaddr % NVFS_BLOCK_SIZE) { nvfs_err("%s:%d Shadow buffer allocation not aligned\n", __func__, __LINE__); goto out; @@ -272,7 +274,7 @@ static nvfs_mgroup_ptr_t nvfs_get_mgroup_from_vaddr_internal(u64 cpuvaddr) } - nvfs_mpage = &nvfs_mgroup->nvfs_metadata[page->index % NVFS_MAX_SHADOW_PAGES]; + nvfs_mpage = &nvfs_mgroup->nvfs_metadata[(page->index % NVFS_MAX_SHADOW_PAGES) * nvfs_block_count_per_page]; if (nvfs_mpage == NULL || nvfs_mpage->nvfs_start_magic != NVFS_START_MAGIC || nvfs_mpage->page != page) { nvfs_err("%s:%d found invalid page %p\n", @@ -320,7 +322,7 @@ nvfs_mgroup_ptr_t nvfs_get_mgroup_from_vaddr(u64 cpuvaddr) nvfs_mgroup_put(nvfs_mgroup_s); // Check the last page - page_count = nvfs_mgroup_s->nvfs_pages_count; + page_count = nvfs_mgroup_s->nvfs_blocks_count; addr = (((char *)cpuvaddr) + 
((page_count - 1) * PAGE_SIZE)); @@ -347,7 +349,7 @@ nvfs_mgroup_ptr_t nvfs_mgroup_pin_shadow_pages(u64 cpuvaddr, unsigned long lengt { int ret = 0; struct page** pages = NULL; - unsigned long count, j, cur_base_index = 0; + unsigned long count, block_count, j, cur_base_index = 0; nvfs_mgroup_ptr_t nvfs_mgroup = NULL; if (!cpuvaddr) { @@ -356,7 +358,8 @@ nvfs_mgroup_ptr_t nvfs_mgroup_pin_shadow_pages(u64 cpuvaddr, unsigned long lengt goto out; } - if (!(cpuvaddr) && offset_in_page(cpuvaddr)) { + //if (!(cpuvaddr) && offset_in_page(cpuvaddr)) { + if (cpuvaddr % NVFS_BLOCK_SIZE) { nvfs_err("%s:%d Shadow buffer allocation not aligned\n", __func__, __LINE__); goto out; @@ -366,6 +369,7 @@ nvfs_mgroup_ptr_t nvfs_mgroup_pin_shadow_pages(u64 cpuvaddr, unsigned long lengt cpuvaddr, length); count = DIV_ROUND_UP(length, PAGE_SIZE); + block_count = DIV_ROUND_UP(length, NVFS_BLOCK_SIZE); pages = (struct page **) kmalloc(count * sizeof(struct page *), GFP_KERNEL); if (!pages) { @@ -410,7 +414,12 @@ nvfs_mgroup_ptr_t nvfs_mgroup_pin_shadow_pages(u64 cpuvaddr, unsigned long lengt nvfs_mgroup = nvfs_mgroup_get(cur_base_index); if(nvfs_mgroup == NULL || unlikely(IS_ERR(nvfs_mgroup))) goto out; - BUG_ON((nvfs_mgroup->nvfs_pages_count != count)); + + if ((nvfs_mgroup->nvfs_blocks_count != block_count)) { + nvfs_dbg("Mgroup Block count: %lu, block count:%lu\n", nvfs_mgroup->nvfs_blocks_count, block_count); + nvfs_dbg("Mgroup page: %p, page:%p\n", nvfs_mgroup->nvfs_ppages[j], pages[j]); + BUG_ON(nvfs_mgroup->nvfs_blocks_count < block_count); + } } BUG_ON((nvfs_mgroup->base_index != cur_base_index)); BUG_ON(j != (pages[j]->index % NVFS_MAX_SHADOW_PAGES)); @@ -608,23 +617,27 @@ static const struct vm_operations_struct nvfs_mmap_ops = { static int nvfs_mgroup_mmap_internal(struct file *filp, struct vm_area_struct *vma) { - int ret = -EINVAL, i, tries = 10; + int ret = -EINVAL, i,j, tries = 10; unsigned long length = vma->vm_end - vma->vm_start; unsigned long base_index; - unsigned long nvfs_pages_count; + unsigned long nvfs_blocks_count; + int nvfs_block_count_per_page = (int) PAGE_SIZE / NVFS_BLOCK_SIZE; nvfs_mgroup_ptr_t nvfs_mgroup, nvfs_new_mgroup; struct nvfs_gpu_args *gpu_info; + int os_pages_count; nvfs_stat64(&nvfs_n_mmap); /* check length - do not allow larger mappings than the number of pages allocated */ if (length > NVFS_MAX_SHADOW_PAGES * PAGE_SIZE) goto error; + /* if the length is less than 64K, check for 4K alignment */ - if (length < GPU_PAGE_SIZE && (length % PAGE_SIZE)) { + if ((length < GPU_PAGE_SIZE) && (length % NVFS_BLOCK_SIZE)) { nvfs_err("mmap size not a multiple of 4K for size < 64K : 0x%lx \n", length); goto error; } + /* if the length is greater than 64K, check for 64K alignment */ if (length > GPU_PAGE_SIZE && (length % GPU_PAGE_SIZE)) { nvfs_err("mmap size not a multiple of 64K: 0x%lx for size >64k \n", length); @@ -695,8 +708,11 @@ static int nvfs_mgroup_mmap_internal(struct file *filp, struct vm_area_struct *v goto error; } - nvfs_pages_count = DIV_ROUND_UP(length, PAGE_SIZE); - nvfs_mgroup->nvfs_ppages = (struct page**)kzalloc(nvfs_pages_count * + nvfs_blocks_count = DIV_ROUND_UP(length, NVFS_BLOCK_SIZE); + + // Draw from nvfs_block_count and get the correct page index for every for e.g. 
16 blocks + os_pages_count = DIV_ROUND_UP(length, PAGE_SIZE); + nvfs_mgroup->nvfs_ppages = (struct page**)kzalloc(os_pages_count * sizeof(struct page*), GFP_KERNEL); if (!nvfs_mgroup->nvfs_ppages) { nvfs_mgroup_put(nvfs_mgroup); @@ -704,7 +720,7 @@ static int nvfs_mgroup_mmap_internal(struct file *filp, struct vm_area_struct *v goto error; } - nvfs_mgroup->nvfs_metadata = (struct nvfs_io_metadata*)kzalloc(nvfs_pages_count * + nvfs_mgroup->nvfs_metadata = (struct nvfs_io_metadata*)kzalloc(nvfs_blocks_count * sizeof(struct nvfs_io_metadata), GFP_KERNEL); if (!nvfs_mgroup->nvfs_metadata) { nvfs_mgroup_put(nvfs_mgroup); @@ -720,48 +736,52 @@ static int nvfs_mgroup_mmap_internal(struct file *filp, struct vm_area_struct *v BUG_ON(vma->vm_private_data != NULL); } - for (i = 0; i < nvfs_pages_count; i++) { - nvfs_mgroup->nvfs_ppages[i] = alloc_page(GFP_USER|__GFP_ZERO); - if (nvfs_mgroup->nvfs_ppages[i]) { - nvfs_mgroup->nvfs_ppages[i]->index = (base_index * NVFS_MAX_SHADOW_PAGES) + i; + j = 0; + for (i = 0; i < nvfs_blocks_count; i++) { + j = i / nvfs_block_count_per_page; + if (nvfs_mgroup->nvfs_ppages[j] == NULL) { + nvfs_mgroup->nvfs_ppages[j] = alloc_page(GFP_USER|__GFP_ZERO); + if (nvfs_mgroup->nvfs_ppages[j]) { + nvfs_mgroup->nvfs_ppages[j]->index = (base_index * NVFS_MAX_SHADOW_PAGES) + j; #ifdef CONFIG_FAULT_INJECTION - if (nvfs_fault_trigger(&nvfs_vm_insert_page_error)) { - ret = -EFAULT; - } - else + if (nvfs_fault_trigger(&nvfs_vm_insert_page_error)) { + ret = -EFAULT; + } + else #endif - { - // This will take a page reference which is released in mgroup_put - ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, - nvfs_mgroup->nvfs_ppages[i]); - } + { + // This will take a page reference which is released in mgroup_put + ret = vm_insert_page(vma, vma->vm_start + j * PAGE_SIZE, + nvfs_mgroup->nvfs_ppages[j]); + } - nvfs_dbg("vm_insert_page : %d pages: %lx mapping: %p, " - "index: %lx (%lx - %lx) ret: %d \n", - i, (unsigned long)nvfs_mgroup->nvfs_ppages[i], - nvfs_mgroup->nvfs_ppages[i]->mapping, - nvfs_mgroup->nvfs_ppages[i]->index, - vma->vm_start + (i * PAGE_SIZE) , - vma->vm_start + (i + 1) * PAGE_SIZE, - ret); - if (ret) { - nvfs_mgroup->nvfs_pages_count = i+1; - nvfs_mgroup_put(nvfs_mgroup); + nvfs_dbg("vm_insert_page : %d pages: %lx mapping: %p, " + "index: %lx (%lx - %lx) ret: %d \n", + j, (unsigned long)nvfs_mgroup->nvfs_ppages[j], + nvfs_mgroup->nvfs_ppages[j]->mapping, + nvfs_mgroup->nvfs_ppages[j]->index, + vma->vm_start + (j * PAGE_SIZE) , + vma->vm_start + (j + 1) * PAGE_SIZE, + ret); + if (ret) { + nvfs_mgroup->nvfs_blocks_count = (j+1) * nvfs_block_count_per_page; + nvfs_mgroup_put(nvfs_mgroup); + ret = -ENOMEM; + goto error; + } + } else { + nvfs_mgroup->nvfs_blocks_count = j * nvfs_block_count_per_page; + nvfs_mgroup_put(nvfs_mgroup); ret = -ENOMEM; goto error; - } - } else { - nvfs_mgroup->nvfs_pages_count = i; - nvfs_mgroup_put(nvfs_mgroup); - ret = -ENOMEM; - goto error; - } + } + } //fill the nvfs metadata header nvfs_mgroup->nvfs_metadata[i].nvfs_start_magic = NVFS_START_MAGIC; nvfs_mgroup->nvfs_metadata[i].nvfs_state = NVFS_IO_ALLOC; - nvfs_mgroup->nvfs_metadata[i].page = nvfs_mgroup->nvfs_ppages[i]; + nvfs_mgroup->nvfs_metadata[i].page = nvfs_mgroup->nvfs_ppages[j]; } - nvfs_mgroup->nvfs_pages_count = nvfs_pages_count; + nvfs_mgroup->nvfs_blocks_count = nvfs_blocks_count; gpu_info = &nvfs_mgroup->gpu_info; atomic_set(&gpu_info->io_state, IO_FREE); nvfs_stat64_add(length, &nvfs_n_active_shadow_buf_sz); @@ -801,45 +821,51 @@ void nvfs_mgroup_init() 
hash_init(nvfs_io_mgroup_hash); } -void nvfs_mgroup_check_and_set(nvfs_mgroup_ptr_t nvfs_mgroup, enum nvfs_page_state state, bool validate, +void nvfs_mgroup_check_and_set(nvfs_mgroup_ptr_t nvfs_mgroup, enum nvfs_block_state state, bool validate, bool update_nvfsio) { struct nvfs_io_metadata *nvfs_mpages = nvfs_mgroup->nvfs_metadata; nvfs_io_sparse_dptr_t sparse_ptr = NULL; int last_sparse_index = -1; struct nvfs_io* nvfsio = &nvfs_mgroup->nvfsio; - unsigned done_pages = DIV_ROUND_UP(nvfsio->ret, PAGE_SIZE); // roundup to next 4K page - unsigned issued_pages = (nvfsio->nvfs_active_pages_end - nvfsio->nvfs_active_pages_start +1); + unsigned done_blocks = DIV_ROUND_UP(nvfsio->ret, NVFS_BLOCK_SIZE); + unsigned issued_blocks = (nvfsio->nvfs_active_blocks_end - nvfsio->nvfs_active_blocks_start + 1); int i, nholes = -1; - int last_done_page = 0; // needs to be int to handle 0 bytes done. + int last_done_block = 0; // needs to be int to handle 0 bytes done. int sparse_read_bytes_limit = 0; // set only if we reach max hole regions int ret = 0; + int cur_block_num = nvfsio->nvfs_active_blocks_start; + int last_block_num = nvfsio->nvfs_active_blocks_end; - if (validate && (state == NVFS_IO_DONE)) { - BUG_ON(nvfsio->ret < 0); - BUG_ON(nvfsio->ret > nvfsio->length); + if (validate && (state == NVFS_IO_DONE)) { + BUG_ON(nvfsio->ret < 0); + BUG_ON(nvfsio->ret > nvfsio->length); - /* setup sparse metadata structure */ - if(nvfsio->op == READ && nvfsio->check_sparse == true) { - sparse_ptr = nvfs_io_map_sparse_data(nvfs_mgroup); - } + /* setup sparse metadata structure */ + if(nvfsio->op == READ && nvfsio->check_sparse == true) { + sparse_ptr = nvfs_io_map_sparse_data(nvfs_mgroup); + } - /*setup the last page IO was seen based on the ret value */ - if(done_pages < issued_pages) { - last_done_page = nvfsio->nvfs_active_pages_start + done_pages - 1; - nvfs_dbg("EOF detected, sparse: %p, done_pages:%d issued_pages:%d start:%ld last_done:%d end:%ld \n", - sparse_ptr, - done_pages, issued_pages, - nvfsio->nvfs_active_pages_start, - last_done_page, - nvfsio->nvfs_active_pages_end); - } else { - last_done_page = nvfsio->nvfs_active_pages_end; - } - } + /*setup the last block IO was seen based on the ret value */ + if(done_blocks < issued_blocks) { + last_done_block = nvfsio->nvfs_active_blocks_start + done_blocks - 1; + nvfs_dbg("EOF detected, sparse: %p, done_blocks:%d issued_blocks:%d start:%ld last_done:%d end:%ld \n", + sparse_ptr, + done_blocks, issued_blocks, + nvfsio->nvfs_active_blocks_start, + last_done_block, + nvfsio->nvfs_active_blocks_end); + } else { + last_done_block = nvfsio->nvfs_active_blocks_end; + } + } - // check that every page has seen the dma mapping call on success - for(i=0; i < nvfs_mgroup->nvfs_pages_count ; i++) { + if (state == NVFS_IO_INIT) { + cur_block_num = 0; + last_block_num = nvfs_mgroup->nvfs_blocks_count - 1; + } + // check that every block has seen the dma mapping call on success + for(i = cur_block_num; i <= last_block_num; i++) { if(state == NVFS_IO_FREE) { WARN_ON_ONCE(validate && nvfs_mpages[i].nvfs_state != NVFS_IO_INIT && nvfs_mpages[i].nvfs_state != NVFS_IO_ALLOC @@ -855,17 +881,17 @@ void nvfs_mgroup_check_and_set(nvfs_mgroup_ptr_t nvfs_mgroup, enum nvfs_page_sta WARN_ON_ONCE(validate && nvfs_mpages[i].nvfs_state != NVFS_IO_QUEUED && nvfs_mpages[i].nvfs_state != NVFS_IO_DMA_START); } else if(state == NVFS_IO_DONE - && i>= nvfsio->nvfs_active_pages_start && i <= nvfsio->nvfs_active_pages_end) { + && i >= nvfsio->nvfs_active_blocks_start && i <= 
nvfsio->nvfs_active_blocks_end) { if(validate && nvfs_mpages[i].nvfs_state != NVFS_IO_DMA_START) { - // This page was not issued to block layer as the file ended - if(i > last_done_page) { + // This block was not issued to block layer as the file ended + if (i > last_done_block) { if (validate && nvfs_mpages[i].nvfs_state != NVFS_IO_QUEUED) { ret = -EIO; WARN_ON_ONCE(1); } - // This page was not issued to block layer and the file is not sparse, BUG - }else { - if(nvfsio->op == READ) { + // This block was not issued to block layer and the file is not sparse, BUG + } else { + if (nvfsio->op == READ) { // handle fallocate case with unwritten extents if (sparse_ptr == false) { BUG_ON(nvfsio->check_sparse == true); @@ -876,19 +902,19 @@ void nvfs_mgroup_check_and_set(nvfs_mgroup_ptr_t nvfs_mgroup, enum nvfs_page_sta if(last_sparse_index < 0 || (last_sparse_index + 1) != i) { if (sparse_read_bytes_limit) { last_sparse_index = i; - // we stop further hole processing, and record the current page index for + // we stop further hole processing, and record the current block index for // mimicking a partial read to nvfs_io_complete } else if (nholes + 1 >= NVFS_MAX_HOLE_REGIONS) { - sparse_read_bytes_limit = (i - nvfsio->nvfs_active_pages_start) * PAGE_SIZE; + sparse_read_bytes_limit = (i - nvfsio->nvfs_active_blocks_start) * NVFS_BLOCK_SIZE; last_sparse_index = i; nvfs_info("detected max hole region count: %u", nholes); - nvfs_info("sparse read current page index: %u, read_bytes: %d", i, + nvfs_info("sparse read current BLOCK index: %u, read_bytes: %d", i, sparse_read_bytes_limit); } else { // start a new sparse region nholes++; BUG_ON(nholes >= NVFS_MAX_HOLE_REGIONS); - sparse_ptr->hole[nholes].start = i - nvfsio->nvfs_active_pages_start; + sparse_ptr->hole[nholes].start = i - nvfsio->nvfs_active_blocks_start; sparse_ptr->hole[nholes].npages = 1; last_sparse_index = i; } @@ -898,32 +924,33 @@ void nvfs_mgroup_check_and_set(nvfs_mgroup_ptr_t nvfs_mgroup, enum nvfs_page_sta } } else { //WARN_ON(validate && nvfs_mpages[i].nvfs_state != NVFS_IO_DMA_START); - nvfs_dbg("WRITE: page index: %d, expected NVFS_IO_DMA_START," + nvfs_dbg("WRITE: block index: %d, expected NVFS_IO_DMA_START," "current state: %x\n", i, nvfs_mpages[i].nvfs_state); ret = -EIO; } } } } else if(state == NVFS_IO_DONE && - (i > nvfsio->nvfs_active_pages_end || i < nvfsio->nvfs_active_pages_start)) { - // We shouldn't be seeing a page which are out of bounds - if (validate && nvfs_mpages[i].nvfs_state != NVFS_IO_INIT) - BUG_ON(1); - // don't update the state to DONE. - continue; - } else { + (i > nvfsio->nvfs_active_blocks_end || i < nvfsio->nvfs_active_blocks_start)) { + if (validate && nvfs_mpages[i].nvfs_state != NVFS_IO_INIT) { + // We shouldn't be seeing a page which are out of bounds + BUG_ON(1); + } + // don't update the state to DONE. 
+ continue; + } else { WARN_ON_ONCE(1); ret = -EIO; } - // Donot transition an active page to IO_DONE state, + // Do not transition an active block to IO_DONE state, // if process is exiting or the thread is interrupted if (state == NVFS_IO_DONE && - (i>= nvfsio->nvfs_active_pages_start && i <= nvfsio->nvfs_active_pages_end) && + (i >= nvfsio->nvfs_active_blocks_start && i <= nvfsio->nvfs_active_blocks_end) && ((!in_interrupt() && current->flags & PF_EXITING) || nvfsio->ret == -ERESTARTSYS )) { if(nvfs_mpages[i].nvfs_state < NVFS_IO_QUEUED || nvfs_mpages[i].nvfs_state > NVFS_IO_DMA_START) { - nvfs_err("page %d in unexpected state: %d \n", i, nvfs_mpages[i].nvfs_state); + nvfs_err("block %d in unexpected state: %d \n", i, nvfs_mpages[i].nvfs_state); } } else { nvfs_mpages[i].nvfs_state = state; @@ -933,9 +960,9 @@ void nvfs_mgroup_check_and_set(nvfs_mgroup_ptr_t nvfs_mgroup, enum nvfs_page_sta if(state == NVFS_IO_DONE) { // skip cleaning the page metadata if exiting if ((nvfsio->ret != -ERESTARTSYS) && - !(current->flags & PF_EXITING)) { - nvfsio->nvfs_active_pages_start = 0; - nvfsio->nvfs_active_pages_end = 0; + !(current->flags & PF_EXITING)) { + nvfsio->nvfs_active_blocks_start = 0; + nvfsio->nvfs_active_blocks_end = 0; } } @@ -980,53 +1007,54 @@ static void nvfs_mgroup_fill_mpage(struct page* page, nvfs_mgroup_page_ptr_t nvf } -int nvfs_mgroup_fill_mpages(nvfs_mgroup_ptr_t nvfs_mgroup, unsigned nr_pages) +int nvfs_mgroup_fill_mpages(nvfs_mgroup_ptr_t nvfs_mgroup, unsigned nr_blocks) { struct nvfs_io* nvfsio = &nvfs_mgroup->nvfsio; int j; - unsigned long pgoff = 0; + unsigned long blockoff = 0; + int nvfs_block_count_per_page = (int) PAGE_SIZE / NVFS_BLOCK_SIZE; - if (unlikely(nr_pages > nvfs_mgroup->nvfs_pages_count)) { - nvfs_err("nr_pages :%u nvfs_pages_count :%lu\n", nr_pages, nvfs_mgroup->nvfs_pages_count); + if (unlikely(nr_blocks > nvfs_mgroup->nvfs_blocks_count)) { + nvfs_err("nr_blocks :%u nvfs_blocks_count :%lu\n", nr_blocks, nvfs_mgroup->nvfs_blocks_count); return -EIO; } if (nvfsio->gpu_page_offset) { // check page offset is less than or equal to 60K - if (nvfsio->gpu_page_offset > (GPU_PAGE_SIZE - PAGE_SIZE)) + if (nvfsio->gpu_page_offset > (GPU_PAGE_SIZE - KiB4)) return -EIO; // check page offset is 4K aligned - if (nvfsio->gpu_page_offset % PAGE_SIZE) + if (nvfsio->gpu_page_offset % KiB4) return -EIO; // check total io size is less than or equal to 60K - if ((nvfsio->gpu_page_offset + ((loff_t)nr_pages << PAGE_SHIFT)) > GPU_PAGE_SIZE) + if ((nvfsio->gpu_page_offset + ((loff_t)nr_blocks << NVFS_BLOCK_SHIFT)) > GPU_PAGE_SIZE) return -EIO; - pgoff = nvfsio->gpu_page_offset >> PAGE_SHIFT; + blockoff = nvfsio->gpu_page_offset >> NVFS_BLOCK_SHIFT; // check shadow buffer pages are big enough to map the (gpu base address + offset) - if (((pgoff + nr_pages) > nvfs_mgroup->nvfs_pages_count)) + if (((blockoff + nr_blocks) > nvfs_mgroup->nvfs_blocks_count)) return -EIO; - for (j = 0; j < pgoff; ++j) { + for (j = 0; j < blockoff; ++j) { nvfs_mgroup->nvfs_metadata[j].nvfs_state = NVFS_IO_INIT; } } - nvfsio->nvfs_active_pages_start = pgoff; - for (j = pgoff; j < nr_pages + pgoff; ++j) { - nvfs_mgroup_fill_mpage(nvfs_mgroup->nvfs_ppages[j], + nvfsio->nvfs_active_blocks_start = blockoff; + for (j = blockoff; j < nr_blocks + blockoff; ++j) { + nvfs_mgroup_fill_mpage(nvfs_mgroup->nvfs_ppages[j/nvfs_block_count_per_page], &nvfs_mgroup->nvfs_metadata[j], nvfsio); } - nvfsio->nvfs_active_pages_end = j-1; + nvfsio->nvfs_active_blocks_end = j-1; // clear the state for unqueued pages - for (; j < 
nvfs_mgroup->nvfs_pages_count ; j++) { + for (; j < nvfs_mgroup->nvfs_blocks_count ; j++) { nvfs_mgroup->nvfs_metadata[j].nvfs_state = NVFS_IO_INIT; } - nvfsio->cpuvaddr += nvfsio->nvfs_active_pages_start << PAGE_SHIFT; - nvfs_dbg("cpuvaddr: %llx active shadow pages range set to (%ld - %ld) \n", + nvfsio->cpuvaddr += nvfsio->nvfs_active_blocks_start << NVFS_BLOCK_SHIFT; + nvfs_dbg("cpuvaddr: %llx active shadow blocks range set to (%ld - %ld) \n", (u64)nvfsio->cpuvaddr, - nvfsio->nvfs_active_pages_start, - nvfsio->nvfs_active_pages_end); + nvfsio->nvfs_active_blocks_start, + nvfsio->nvfs_active_blocks_end); return 0; } @@ -1036,14 +1064,15 @@ void nvfs_mgroup_get_gpu_index_and_off(nvfs_mgroup_ptr_t nvfs_mgroup, struct pag { unsigned long rel_page_index = (page->index % NVFS_MAX_SHADOW_PAGES); *gpu_index = nvfs_mgroup->nvfsio.cur_gpu_base_index + (rel_page_index >> PAGE_PER_GPU_PAGE_SHIFT); - *offset = (rel_page_index % GPU_PAGE_SHIFT) << PAGE_SHIFT; + if (PAGE_SIZE < GPU_PAGE_SIZE) + *offset = (rel_page_index % GPU_PAGE_SHIFT) << PAGE_SHIFT; } uint64_t nvfs_mgroup_get_gpu_physical_address(nvfs_mgroup_ptr_t nvfs_mgroup, struct page* page) { struct nvfs_gpu_args *gpu_info = &nvfs_mgroup->gpu_info; unsigned long gpu_page_index = ULONG_MAX; - pgoff_t pgoff; + pgoff_t pgoff = 0; dma_addr_t phys_base_addr, phys_start_addr; nvfs_mgroup_get_gpu_index_and_off(nvfs_mgroup, page, @@ -1060,6 +1089,10 @@ static nvfs_mgroup_ptr_t __nvfs_mgroup_from_page(struct page* page, bool check_d nvfs_mgroup_ptr_t nvfs_mgroup = NULL; nvfs_mgroup_page_ptr_t nvfs_mpage; struct nvfs_io* nvfsio = NULL; + int i = 0; + int nvfs_block_count_per_page = (int) PAGE_SIZE / NVFS_BLOCK_SIZE; + unsigned block_idx = 0; + bool found_first_block = false; // bailout if page mapping is not NULL if(page == NULL || page->mapping != NULL) { @@ -1081,45 +1114,64 @@ static nvfs_mgroup_ptr_t __nvfs_mgroup_from_page(struct page* page, bool check_d if (unlikely(IS_ERR(nvfs_mgroup))) return ERR_PTR(-EIO); + nvfsio = &nvfs_mgroup->nvfsio; + // check if this is a valid metadata pointing to same page - nvfs_mpage = &nvfs_mgroup->nvfs_metadata[page->index % NVFS_MAX_SHADOW_PAGES]; - if (nvfs_mpage == NULL || nvfs_mpage->nvfs_start_magic != NVFS_START_MAGIC || - nvfs_mpage->page != page) { + block_idx = (page->index % NVFS_MAX_SHADOW_PAGES) * nvfs_block_count_per_page; + for (i = block_idx; i < block_idx + nvfs_block_count_per_page; i++) { + nvfs_mpage = &nvfs_mgroup->nvfs_metadata[i]; + if (nvfs_mpage == NULL || nvfs_mpage->nvfs_start_magic != NVFS_START_MAGIC) { + nvfs_mgroup_put(nvfs_mgroup); + WARN_ON_ONCE(1); + return NULL; + } else { + if (nvfs_mpage->page == page) { + found_first_block = true; + break; + } + } + } + + if (!found_first_block) { nvfs_mgroup_put(nvfs_mgroup); WARN_ON_ONCE(1); return NULL; } - nvfsio = &nvfs_mgroup->nvfsio; - // check if the page start offset is correct within the group - if(nvfsio->nvfs_active_pages_start > (page->index % NVFS_MAX_SHADOW_PAGES)) { + if((nvfsio->nvfs_active_blocks_start/nvfs_block_count_per_page) > (page->index % NVFS_MAX_SHADOW_PAGES)) { nvfs_mgroup_put(nvfs_mgroup); return ERR_PTR(-EIO); } // check if the page end offset is correct within the group - if(nvfsio->nvfs_active_pages_end < (page->index % NVFS_MAX_SHADOW_PAGES)) { + if((nvfsio->nvfs_active_blocks_end/nvfs_block_count_per_page) < (page->index % NVFS_MAX_SHADOW_PAGES)) { nvfs_mgroup_put(nvfs_mgroup); return ERR_PTR(-EIO); } - if (check_dma_error && nvfs_mpage->nvfs_state == NVFS_IO_DMA_ERROR) { - nvfs_mgroup_put(nvfs_mgroup); - return 
ERR_PTR(-EIO); + for (i = block_idx; i < block_idx + nvfs_block_count_per_page; i++) { + nvfs_mpage = &nvfs_mgroup->nvfs_metadata[i]; + if (check_dma_error && nvfs_mpage->nvfs_state == NVFS_IO_DMA_ERROR) { + nvfs_mgroup_put(nvfs_mgroup); + return ERR_PTR(-EIO); + } } return nvfs_mgroup; } -nvfs_mgroup_ptr_t nvfs_mgroup_from_page_range(struct page* page, int npages) +nvfs_mgroup_ptr_t nvfs_mgroup_from_page_range(struct page* page, int nblocks, unsigned int start_offset) { nvfs_mgroup_ptr_t nvfs_mgroup = NULL; nvfs_mgroup_page_ptr_t nvfs_mpage = NULL, prev_mpage = NULL; struct nvfs_io* nvfsio = NULL; unsigned i = 0; + int nvfs_block_count_per_page = (int) PAGE_SIZE / NVFS_BLOCK_SIZE; + unsigned block_idx; + unsigned cur_page; - nvfs_dbg("setting for %d npages from page: %p \n", npages, page); + nvfs_dbg("setting for %d nblocks from page: %p and start offset :%u\n", nblocks, page, start_offset); nvfs_mgroup = __nvfs_mgroup_from_page(page, false); if (!nvfs_mgroup) return NULL; @@ -1127,19 +1179,25 @@ nvfs_mgroup_ptr_t nvfs_mgroup_from_page_range(struct page* page, int npages) if (unlikely(IS_ERR(nvfs_mgroup))) return ERR_PTR(-EIO); - for (i = 0; i < npages ; i++) { + block_idx = (page->index % NVFS_MAX_SHADOW_PAGES) * nvfs_block_count_per_page; + block_idx += ((start_offset) / NVFS_BLOCK_SIZE); + for (i = 0; i < nblocks ; i++) { // check the page range is not beyond the issued range nvfsio = &nvfs_mgroup->nvfsio; - if(((page->index + i) % NVFS_MAX_SHADOW_PAGES) > nvfsio->nvfs_active_pages_end) { + cur_page = i / nvfs_block_count_per_page; + if(((page->index + cur_page) % NVFS_MAX_SHADOW_PAGES) > (nvfsio->nvfs_active_blocks_end/nvfs_block_count_per_page)) { WARN_ON_ONCE(1); + nvfs_dbg("page index: %lu cur_page: %u, blockend: %lu\n", page->index, cur_page, + nvfsio->nvfs_active_blocks_end); goto err; } - nvfs_mpage = &nvfs_mgroup->nvfs_metadata[(page->index +i) % NVFS_MAX_SHADOW_PAGES]; + nvfs_mpage = &nvfs_mgroup->nvfs_metadata[block_idx + i]; - // check the pages are indeed contiguous - if (prev_mpage && page_to_pfn(nvfs_mpage->page) != - (page_to_pfn(prev_mpage->page) + 1)) { + // check the blocks are in same page or in indeed contiguous pages + if (prev_mpage && + (page_to_pfn(nvfs_mpage->page) != page_to_pfn(prev_mpage->page) + 1) && + (page_to_pfn(nvfs_mpage->page) != page_to_pfn(prev_mpage->page))) { WARN_ON_ONCE(1); goto err; @@ -1152,7 +1210,8 @@ nvfs_mgroup_ptr_t nvfs_mgroup_from_page_range(struct page* page, int npages) goto err; } - nvfs_dbg("%ld page dma start %p\n", (page->index + i), nvfs_mpage); + nvfs_dbg("%u block dma start %p\n", block_idx + i, nvfs_mpage); + // Updating block metadata state nvfs_mpage->nvfs_state = NVFS_IO_DMA_START; prev_mpage = nvfs_mpage; } @@ -1167,11 +1226,61 @@ nvfs_mgroup_ptr_t nvfs_mgroup_from_page_range(struct page* page, int npages) return ERR_PTR(-EIO); } +int nvfs_mgroup_metadata_set_dma_state(struct page* page, + struct nvfs_io_mgroup *nvfs_mgroup, + unsigned int bv_len, + unsigned int bv_offset) +{ + unsigned int start_block = 0; + unsigned int end_block = 0; + nvfs_mgroup_page_ptr_t nvfs_mpage; + int nvfs_block_count_per_page = (int) PAGE_SIZE / NVFS_BLOCK_SIZE; + int block_idx = 0; + int i; + + if (!nvfs_mgroup) + return -EIO; + + if (unlikely(IS_ERR(nvfs_mgroup))) + return -EIO; + + start_block = METADATA_BLOCK_START_INDEX(bv_offset); + end_block = METADATA_BLOCK_END_INDEX(bv_offset, bv_len); + block_idx = (page->index % NVFS_MAX_SHADOW_PAGES) * nvfs_block_count_per_page; + + // For each + for (i = block_idx + start_block; i <= block_idx + 
end_block; i++) { + nvfs_mpage = &nvfs_mgroup->nvfs_metadata[i]; + + if(nvfs_mpage->nvfs_state != NVFS_IO_QUEUED && + nvfs_mpage->nvfs_state != NVFS_IO_DMA_START) + { + nvfs_err("%s: found page in wrong state: %d, page->index: %ld at block: %d\n", + __func__, nvfs_mpage->nvfs_state, page->index % NVFS_MAX_SHADOW_PAGES, i); + nvfs_mpage->nvfs_state = NVFS_IO_DMA_ERROR; + nvfs_mgroup_put(nvfs_mgroup); + WARN_ON_ONCE(1); + return (-EIO); + } + + if (nvfs_mpage->nvfs_state == NVFS_IO_QUEUED) { + nvfs_mpage->nvfs_state = NVFS_IO_DMA_START; + nvfs_dbg("%s : setting page in IO_QUEUED, page->index: %ld at block: %d\n", + __func__, page->index % NVFS_MAX_SHADOW_PAGES, i); + } else if (nvfs_mpage->nvfs_state == NVFS_IO_DMA_START) { + nvfs_dbg("%s : setting page in IO_DMA_START, page->index: %ld at block: %d\n", + __func__, page->index % NVFS_MAX_SHADOW_PAGES, i); + } + } + + // success + return 0; +} + nvfs_mgroup_ptr_t nvfs_mgroup_from_page(struct page* page) { nvfs_mgroup_ptr_t nvfs_mgroup = NULL; nvfs_mgroup_page_ptr_t nvfs_mpage; - nvfs_mgroup = __nvfs_mgroup_from_page(page, false); if (!nvfs_mgroup) return NULL; @@ -1179,28 +1288,19 @@ nvfs_mgroup_ptr_t nvfs_mgroup_from_page(struct page* page) if (unlikely(IS_ERR(nvfs_mgroup))) return ERR_PTR(-EIO); - nvfs_mpage = &nvfs_mgroup->nvfs_metadata[page->index % NVFS_MAX_SHADOW_PAGES]; - - if(nvfs_mpage->nvfs_state != NVFS_IO_QUEUED && - nvfs_mpage->nvfs_state != NVFS_IO_DMA_START) - { - nvfs_err("%s: found page in wrong state: %d, page->index: %ld \n", - __func__, nvfs_mpage->nvfs_state, page->index % NVFS_MAX_SHADOW_PAGES); - nvfs_mpage->nvfs_state = NVFS_IO_DMA_ERROR; - nvfs_mgroup_put(nvfs_mgroup); - WARN_ON_ONCE(1); - return ERR_PTR(-EIO); - } - - if (nvfs_mpage->nvfs_state == NVFS_IO_QUEUED) { - nvfs_mpage->nvfs_state = NVFS_IO_DMA_START; - nvfs_dbg("%s : setting page in IO_QUEUED, page->index: %ld \n", - __func__, page->index % NVFS_MAX_SHADOW_PAGES); - } else if (nvfs_mpage->nvfs_state == NVFS_IO_DMA_START) { - nvfs_dbg("%s : setting page in IO_DMA_START, page->index: %ld \n", - __func__, page->index % NVFS_MAX_SHADOW_PAGES); + if (PAGE_SIZE < GPU_PAGE_SIZE) { + nvfs_mpage = &nvfs_mgroup->nvfs_metadata[page->index % NVFS_MAX_SHADOW_PAGES]; + if(nvfs_mpage->nvfs_state != NVFS_IO_QUEUED && + nvfs_mpage->nvfs_state != NVFS_IO_DMA_START) + { + nvfs_err("%s: found page in wrong state: %d, page->index: %ld \n", + __func__, nvfs_mpage->nvfs_state, page->index % NVFS_MAX_SHADOW_PAGES); + nvfs_mpage->nvfs_state = NVFS_IO_DMA_ERROR; + nvfs_mgroup_put(nvfs_mgroup); + WARN_ON_ONCE(1); + return ERR_PTR(-EIO); + } } - return nvfs_mgroup; } @@ -1229,12 +1329,14 @@ bool nvfs_is_gpu_page(struct page *page) /* nvfs_check_gpu_page_and_error : checks if a page belongs to a GPU request and if it has any gpu dma mapping error * @page (in) : start page pointer + * @offset(in) : offset in page + * @len(in) : len starting at offset * @nr_pages (in) : number of pages from the start page * @returns : 1 on GPU page without error * -1 on GPU page with dma mapping error * 0 on a non-GPU page */ -int nvfs_check_gpu_page_and_error(struct page *page) +int nvfs_check_gpu_page_and_error(struct page *page, unsigned int offset, unsigned int len) { nvfs_mgroup_ptr_t nvfs_mgroup; diff --git a/src/nvfs-mmap.h b/src/nvfs-mmap.h index 99e3f2a..fafae4d 100644 --- a/src/nvfs-mmap.h +++ b/src/nvfs-mmap.h @@ -25,10 +25,20 @@ #include #include #include +#include #include "nv-p2p.h" +#define KiB4 (4096) +#define NVFS_BLOCK_SIZE (4096) +#define NVFS_BLOCK_SHIFT (12) +#define 
METADATA_BLOCK_INDEX(bv_offset) ((bv_offset) / NVFS_BLOCK_SIZE) +#define METADATA_BLOCK_START_INDEX(bv_offset) (METADATA_BLOCK_INDEX(bv_offset)) +#define METADATA_BLOCK_END_INDEX(bv_offset, bv_len) (METADATA_BLOCK_INDEX((bv_offset) + (bv_len) - 1)) #define NVFS_MIN_BASE_INDEX ((unsigned long)1L<<32) -#define NVFS_MAX_SHADOW_PAGES_ORDER 12 +#ifndef NVFS_PAGE_TO_BLOCK_ORDER +#define NVFS_PAGE_TO_BLOCK_ORDER ((int)ilog2(PAGE_SIZE / NVFS_BLOCK_SIZE)) +#endif +#define NVFS_MAX_SHADOW_PAGES_ORDER (12 - NVFS_PAGE_TO_BLOCK_ORDER) #define NVFS_MAX_SHADOW_ALLOCS_ORDER 12 #define NVFS_MAX_SHADOW_PAGES (1 << NVFS_MAX_SHADOW_PAGES_ORDER) @@ -38,7 +48,7 @@ struct nvfs_gpu_args; -enum nvfs_page_state { +enum nvfs_block_state { NVFS_IO_FREE = 0, // set on init NVFS_IO_ALLOC, NVFS_IO_INIT, @@ -98,8 +108,8 @@ typedef struct nvfs_io { bool check_sparse; // set if file is sparse bool rw_stats_enabled; unsigned long cur_gpu_base_index; // starting gpu index in this op - unsigned long nvfs_active_pages_start; - unsigned long nvfs_active_pages_end; + unsigned long nvfs_active_blocks_start; + unsigned long nvfs_active_blocks_end; nvfs_metastate_enum state; // set if the io encountered sparse data int retrycnt; // retry count for retriable errors wait_queue_head_t rw_wq; // wait queue for serializing parallel dma req @@ -121,6 +131,7 @@ struct nvfs_gpu_args { u64 gpuvaddr; // GPU Buffer address u64 gpu_buf_len; // length of gpu buffer struct page *end_fence_page; // end fence addr pinned page + u32 offset_in_page; // end_fence_addr byte offset in end_fence_page atomic_t io_state; // IO state transitions atomic_t dma_mapping_in_progress; // Mapping in progress for a specific PCI device atomic_t callback_invoked; @@ -134,7 +145,7 @@ struct nvfs_gpu_args { struct nvfs_io_metadata { u64 nvfs_start_magic; // start magic of metadata - enum nvfs_page_state nvfs_state; + enum nvfs_block_state nvfs_state; struct page *page; } __attribute__((packed, aligned(8))); @@ -161,7 +172,7 @@ struct nvfs_io_mgroup { struct hlist_node hash_link; u64 cpu_base_vaddr; unsigned long base_index; - unsigned long nvfs_pages_count; + unsigned long nvfs_blocks_count; struct page **nvfs_ppages; struct nvfs_io_metadata *nvfs_metadata; struct nvfs_gpu_args gpu_info; @@ -183,15 +194,15 @@ int nvfs_mgroup_mmap(struct file *filp, struct vm_area_struct *vma); nvfs_mgroup_ptr_t nvfs_mgroup_get(unsigned long base_index); void nvfs_mgroup_put(nvfs_mgroup_ptr_t nvfs_mgroup); void nvfs_mgroup_put_dma(nvfs_mgroup_ptr_t nvfs_mgroup); -void nvfs_mgroup_check_and_set(nvfs_mgroup_ptr_t nvfs_mgroup, enum nvfs_page_state state, bool validate, bool update_nvfsio); +void nvfs_mgroup_check_and_set(nvfs_mgroup_ptr_t nvfs_mgroup, enum nvfs_block_state state, bool validate, bool update_nvfsio); nvfs_mgroup_ptr_t nvfs_mgroup_from_page(struct page* page); -nvfs_mgroup_ptr_t nvfs_mgroup_from_page_range(struct page* page, int npages); +nvfs_mgroup_ptr_t nvfs_mgroup_from_page_range(struct page* page, int nblocks, unsigned int start_offset); bool nvfs_is_gpu_page(struct page *page); unsigned int nvfs_gpu_index(struct page *page); -int nvfs_check_gpu_page_and_error(struct page *page); +int nvfs_check_gpu_page_and_error(struct page *page, unsigned int offset, unsigned int len); unsigned int nvfs_device_priority(struct device *dev, unsigned int gpu_index); -int nvfs_mgroup_fill_mpages(nvfs_mgroup_ptr_t nvfs_mgroup, unsigned nr_pages); +int nvfs_mgroup_fill_mpages(nvfs_mgroup_ptr_t nvfs_mgroup, unsigned nr_blocks); nvfs_mgroup_ptr_t nvfs_mgroup_pin_shadow_pages(u64 cpuvaddr, 
unsigned long length); void nvfs_mgroup_unpin_shadow_pages(nvfs_mgroup_ptr_t nvfs_mgroup); nvfs_mgroup_ptr_t nvfs_get_mgroup_from_vaddr(u64 cpuvaddr); @@ -200,4 +211,9 @@ uint64_t nvfs_mgroup_get_gpu_physical_address(nvfs_mgroup_ptr_t nvfs_mgroup, str void nvfs_mgroup_put_pending_mgroups(void); void nvfs_mgroup_get_ref(nvfs_mgroup_ptr_t mgroup); bool nvfs_mgroup_put_ref(nvfs_mgroup_ptr_t mgroup); +int is_nvfs_metadata_block_fill_needed(void); +int nvfs_mgroup_metadata_set_dma_state(struct page* page, + struct nvfs_io_mgroup *nvfs_mgroup, + unsigned int bv_len, + unsigned int bv_offset); #endif /* NVFS_MMAP_H */ diff --git a/src/nvfs-rdma.c b/src/nvfs-rdma.c index 27c9f8a..9ba2992 100644 --- a/src/nvfs-rdma.c +++ b/src/nvfs-rdma.c @@ -55,7 +55,7 @@ int nvfs_set_rdma_reg_info_to_mgroup( goto error; } - shadow_buf_size = (nvfs_mgroup->nvfs_pages_count) * PAGE_SIZE; + shadow_buf_size = (nvfs_mgroup->nvfs_blocks_count) * NVFS_BLOCK_SIZE; nvfs_dbg("SG: %s nvfs_mgroup = %p\n GPU vaddr: %llx", __func__, @@ -133,7 +133,7 @@ int nvfs_get_rdma_reg_info_from_mgroup( printk("SG Error: nvfs_mgroup NULL\n"); return -EINVAL; } - shadow_buf_size = (nvfs_mgroup->nvfs_pages_count) * PAGE_SIZE; + shadow_buf_size = (nvfs_mgroup->nvfs_blocks_count) * NVFS_BLOCK_SIZE; nvfs_dbg("%s nvfs_mgroup = %p sbuf size = %llu\n", __func__, nvfs_mgroup, shadow_buf_size); @@ -162,8 +162,8 @@ int nvfs_get_rdma_reg_info_from_mgroup( tmp_vaddr = rdma_reg_info_args->nvfs_rdma_info.rem_vaddr; i = 0; for_each_sg(sgl, sg, tmp_nents, i) { - tmp_offset = tmp_vaddr % PAGE_SIZE; - tmp_size = PAGE_SIZE - tmp_offset; + tmp_offset = tmp_vaddr % NVFS_BLOCK_SIZE; + tmp_size = NVFS_BLOCK_SIZE - tmp_offset; #ifdef HAVE_PIN_USER_PAGES_FAST if(pin_user_pages_fast(tmp_vaddr, 1, 1, &tmp_page) < 0) { #else diff --git a/src/nvfs-vers.h b/src/nvfs-vers.h index 9145a41..1af6f51 100644 --- a/src/nvfs-vers.h +++ b/src/nvfs-vers.h @@ -29,7 +29,7 @@ #define NVFS_DRIVER_MINOR_VERSION 17 //2-bytes // template for build version -#define NVFS_DRIVER_PATCH_VERSION 0 +#define NVFS_DRIVER_PATCH_VERSION 4 static inline unsigned int nvfs_driver_version(void) { return (NVFS_DRIVER_MAJOR_VERSION << 16) | NVFS_DRIVER_MINOR_VERSION;
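For reference, below is a minimal userspace sketch (not part of the diff above) of the page-to-block index arithmetic this patch introduces for 64K-page kernels. It reuses the NVFS_BLOCK_SIZE and METADATA_BLOCK_* definitions added to nvfs-mmap.h; the max_shadow_pages()/show() helpers and the sample page index and bio_vec offsets are made up purely for illustration and do not exist in the driver.

/*
 * Illustrative only: mirrors how nvfs_mgroup_metadata_set_dma_state()
 * maps a shadow page index plus a bio_vec (offset, len) onto the
 * per-4K-block metadata slots, for both 4K and 64K PAGE_SIZE.
 */
#include <stdio.h>

#define NVFS_BLOCK_SIZE 4096
#define METADATA_BLOCK_INDEX(bv_offset)              ((bv_offset) / NVFS_BLOCK_SIZE)
#define METADATA_BLOCK_START_INDEX(bv_offset)        (METADATA_BLOCK_INDEX(bv_offset))
#define METADATA_BLOCK_END_INDEX(bv_offset, bv_len)  (METADATA_BLOCK_INDEX((bv_offset) + (bv_len) - 1))

/* stand-in for NVFS_MAX_SHADOW_PAGES = 1 << (12 - ilog2(PAGE_SIZE / NVFS_BLOCK_SIZE)) */
static unsigned long max_shadow_pages(unsigned long page_size)
{
        return 1UL << (12 - __builtin_ctzl(page_size / NVFS_BLOCK_SIZE));
}

static void show(unsigned long page_size, unsigned long page_index,
                 unsigned int bv_offset, unsigned int bv_len)
{
        unsigned long blocks_per_page = page_size / NVFS_BLOCK_SIZE;
        unsigned long rel_page = page_index % max_shadow_pages(page_size);
        unsigned long block_idx = rel_page * blocks_per_page;

        printf("PAGE_SIZE=%lu: page %lu, bvec off=%u len=%u -> metadata blocks %lu..%lu\n",
               page_size, page_index, bv_offset, bv_len,
               block_idx + METADATA_BLOCK_START_INDEX(bv_offset),
               block_idx + METADATA_BLOCK_END_INDEX(bv_offset, bv_len));
}

int main(void)
{
        show(4096, 5, 0, 4096);      /* 4K-page kernel: one block per shadow page   */
        show(65536, 5, 8192, 16384); /* 64K-page kernel: 16 blocks per shadow page  */
        return 0;
}

On a 4K-page kernel the mapping degenerates to one metadata slot per shadow page (blocks 5..5 in the first call), while on a 64K-page kernel the same shadow page spans 16 slots (blocks 82..85 in the second call), which is why nvfs_mgroup_metadata_set_dma_state() and nvfs_mgroup_from_page_range() now take the bio_vec/scatterlist offset into account instead of assuming one 4K page per metadata entry.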