From bd984fbb1b6e4bb42d7567dd3dd0e5d27406e93d Mon Sep 17 00:00:00 2001
From: XuZhengguo
Date: Mon, 11 Nov 2024 09:36:26 +0800
Subject: [PATCH] [Decode] Add cache bucket to optimization resource allocation

This is to support bo recycle on xe drm
---
 .../common/os/mos_context_specific_next.cpp   |   3 +-
 .../linux/common/os/xe/mos_bufmgr_xe.c        | 239 ++++++++++++++++++
 2 files changed, 241 insertions(+), 1 deletion(-)

diff --git a/media_softlet/linux/common/os/mos_context_specific_next.cpp b/media_softlet/linux/common/os/mos_context_specific_next.cpp
index 15a255e583d..fae0cc82990 100644
--- a/media_softlet/linux/common/os/mos_context_specific_next.cpp
+++ b/media_softlet/linux/common/os/mos_context_specific_next.cpp
@@ -136,7 +136,8 @@ MOS_STATUS OsContextSpecificNext::Init(DDI_DEVICE_CONTEXT ddiDriverContext)
         }
 
         if (m_platformInfo.eProductFamily == IGFX_METEORLAKE ||
-            m_platformInfo.eProductFamily == IGFX_ARROWLAKE)
+            m_platformInfo.eProductFamily == IGFX_ARROWLAKE ||
+            m_platformInfo.eProductFamily == IGFX_LUNARLAKE)
         {
             ReadUserSetting(
                 userSettingPtr,
diff --git a/media_softlet/linux/common/os/xe/mos_bufmgr_xe.c b/media_softlet/linux/common/os/xe/mos_bufmgr_xe.c
index 285ac729173..1246087691c 100644
--- a/media_softlet/linux/common/os/xe/mos_bufmgr_xe.c
+++ b/media_softlet/linux/common/os/xe/mos_bufmgr_xe.c
@@ -86,6 +87,7 @@ typedef struct MOS_OCA_EXEC_LIST_INFO mos_oca_exec_list_info;
 
 #define PAGE_SIZE_4K (1ull << 12)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
 //mos_xe_mem_class currently used as index of default_alignment
 enum mos_xe_mem_class
@@ -159,6 +160,36 @@ typedef struct mos_xe_device {
     struct drm_xe_query_uc_fw_version uc_versions[UC_TYPE_MAX];
 } mos_xe_device;
 
+struct mos_xe_gem_bo_bucket {
+    drmMMListHead sys_head;
+    drmMMListHead vram_head;
+    unsigned long size;
+};
+
+struct mos_xe_bucket_lab {
+/**
+ * 4K aligned for all
+ */
+#define CACHE_BUCKET_MODE_DEFAULT 0
+/**
+ * 64K aligned for size <= 1M and 4K aligned for size > 1M
+ */
+#define CACHE_BUCKET_MODE_64K 1
+/**
+ * 2M aligned for size > 1M and 4K aligned for size <= 1M
+ */
+#define CACHE_BUCKET_MODE_2M 2
+/**
+ * 64K aligned for size <= 1M and 2M aligned for size > 1M
+ */
+#define CACHE_BUCKET_MODE_64K_2M 3
+    struct mos_xe_gem_bo_bucket cache_buckets[64];  // entries [0, num_buckets) are valid and are binary-searched by size
+    int num_buckets;  // number of valid entries in cache_buckets
+    uint64_t max_cache_size;
+    uint8_t cache_mode;  // one of CACHE_BUCKET_MODE_*
+    bool enable_bo_reuse;
+};
+
 typedef struct mos_xe_bufmgr_gem {
     struct mos_bufmgr bufmgr;
 
@@ -210,6 +241,8 @@ typedef struct mos_xe_bufmgr_gem {
     uint32_t default_alignment[MOS_XE_MEM_CLASS_MAX] = {PAGE_SIZE_4K, PAGE_SIZE_4K};
     //End of Note
 
+    struct mos_xe_bucket_lab bucket_lab;
+
     /**
      * Indicates whether gpu-gpu and cpu-gpu synchronization is disabled.
      * This is mainly for debug purpose, and synchronizarion should be always enabled by default.
@@ -421,6 +454,11 @@ int mos_query_engines_xe(struct mos_bufmgr *bufmgr,
                       void *engine_map);
 static void mos_gem_bo_wait_rendering_xe(struct mos_linux_bo *bo);
 
+static struct mos_xe_gem_bo_bucket*
+__mos_gem_find_bucket_xe(
+    struct mos_xe_gem_bo_bucket *buckets,
+    int num_buckets, uint64_t alloc_size);
+
 static struct mos_xe_bufmgr_gem *
 mos_bufmgr_gem_find(int fd)
 {
@@ -1247,6 +1285,8 @@ mos_bo_alloc_xe(struct mos_bufmgr *bufmgr,
     struct mos_xe_bo_gem *bo_gem;
     struct drm_xe_gem_create create;
     uint32_t bo_align = alloc->alignment;
+    struct mos_xe_bucket_lab *lab = &bufmgr_gem->bucket_lab;
+    struct mos_xe_gem_bo_bucket *bucket = nullptr;
     int ret;
 
     /**
@@ -1289,6 +1329,22 @@ mos_bo_alloc_xe(struct mos_bufmgr *bufmgr,
     create.vm_id = 0;
     create.size = ALIGN(alloc->size, bo_align);
 
+    /**
+     * Find a bucket
+     * 1, 4K cache bucket:
+     *    1) 4K aligned size -> all buckets workable
+     *    2) 64K aligned size -> only bucket whose bucket.size % 64k == 0 workable
+     * 2, 64K cache bucket:
+     *    1) 4K/64K aligned size -> all buckets workable
+     * 3, 2M cache bucket:
+     *    1) 4K/64K aligned size -> all bucket workable
+     */
+    bucket = __mos_gem_find_bucket_xe(lab->cache_buckets, lab->num_buckets, create.size);
+    if (bucket) // no bucket (request larger than the biggest one): keep the aligned size as is
+    {
+        create.size = bucket->size;
+    }
+
     /**
      * Note: current, it only supports WB/ WC while UC and other cache are not allowed.
      */
@@ -3296,6 +3352,186 @@ mos_get_driver_info_xe(struct mos_bufmgr *bufmgr, struct LinuxDriverInfo *drvInf
     return MOS_XE_SUCCESS;
 }
 
+static void
+__mos_gem_add_bucket_xe(struct mos_xe_bufmgr_gem *bufmgr_gem, int size)
+{
+    mos_xe_bucket_lab *lab = &bufmgr_gem->bucket_lab;
+    unsigned int i = lab->num_buckets; // next free slot; buckets are appended in call order
+
+    if (i < ARRAY_SIZE(lab->cache_buckets))
+    {
+        DRMINITLISTHEAD(&lab->cache_buckets[i].sys_head);
+        DRMINITLISTHEAD(&lab->cache_buckets[i].vram_head);
+        lab->cache_buckets[i].size = size;
+        lab->num_buckets++;
+    }
+    else
+    {
+        MOS_DRM_ASSERTMESSAGE("Unable to add more bucket because of cache bucket full");
+    }
+}
+
+static struct mos_xe_gem_bo_bucket*
+__mos_gem_find_bucket_xe(
+    struct mos_xe_gem_bo_bucket *buckets,
+    int num_buckets, uint64_t alloc_size)
+{
+    int l = 0, r = num_buckets; // lower-bound search: requires buckets[] in ascending size order
+    while (l < r)
+    {
+        int mid = l + (r - l) / 2;
+        if (buckets[mid].size < alloc_size)
+        {
+            l = mid + 1;
+        }
+        else
+        {
+            r = mid;
+        }
+    }
+    if (l < num_buckets && buckets[l].size >= alloc_size) // l == num_buckets: nothing fits, do not read buckets[l]
+    {
+        return &buckets[l]; // smallest bucket whose size >= alloc_size
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+static void
+__mos_gem_init_cache_buckets(struct mos_xe_bufmgr_gem *bufmgr_gem)
+{
+    unsigned long size, max_cache_size = 64 * 1024 * 1024;
+    struct mos_xe_bucket_lab *lab = &bufmgr_gem->bucket_lab;
+    lab->cache_mode = CACHE_BUCKET_MODE_DEFAULT;
+    lab->max_cache_size = max_cache_size;
+
+    /* OK, so power of two buckets was too wasteful of memory.
+     * Give 3 other sizes between each power of two, to hopefully
+     * cover things accurately enough. (The alternative is
+     * probably to just go for exact matching of sizes, and assume
+     * that for things like composited window resize the tiled
+     * width/height alignment and rounding of sizes to pages will
+     * get us useful cache hit rates anyway)
+     */
+    __mos_gem_add_bucket_xe(bufmgr_gem, 4096);
+    __mos_gem_add_bucket_xe(bufmgr_gem, 4096 * 2);
+    __mos_gem_add_bucket_xe(bufmgr_gem, 4096 * 3);
+
+    /* Initialize the linked lists for BO reuse cache. */
+    for (size = 4 * 4096; size <= max_cache_size; size *= 2)
+    {
+        __mos_gem_add_bucket_xe(bufmgr_gem, size);
+
+        __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 1 / 4);
+        __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 2 / 4);
+        __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 3 / 4);
+    }
+}
+
+static void
+__mos_gem_cleanup_cache_bucket_xe(struct mos_xe_bufmgr_gem *bufmgr_gem)
+{
+    struct mos_xe_bucket_lab *lab = &bufmgr_gem->bucket_lab;
+    for (int i = 0; i < lab->num_buckets; i++) {
+        struct mos_xe_gem_bo_bucket *bucket =
+            &lab->cache_buckets[i];
+
+        bucket->size = 0;
+    }
+    lab->num_buckets = 0;
+    lab->cache_mode = CACHE_BUCKET_MODE_DEFAULT;
+    lab->max_cache_size = 0;
+}
+
+static void
+mos_gem_realloc_cache_bucket_xe(struct mos_bufmgr *bufmgr, uint8_t alloc_mode)
+{
+    unsigned long size, max_cache_size = 64 * 1024 * 1024, unit_size;
+    struct mos_xe_bufmgr_gem *bufmgr_gem = (struct mos_xe_bufmgr_gem *)bufmgr;
+    struct mos_xe_bucket_lab *lab = &bufmgr_gem->bucket_lab;
+
+    __mos_gem_cleanup_cache_bucket_xe(bufmgr_gem);
+
+    lab->cache_mode = alloc_mode;
+    lab->max_cache_size = max_cache_size;
+
+
+    /* OK, so power of two buckets was too wasteful of memory.
+     * Give 3 other sizes between each power of two, to hopefully
+     * cover things accurately enough. (The alternative is
+     * probably to just go for exact matching of sizes, and assume
+     * that for things like composited window resize the tiled
+     * width/height alignment and rounding of sizes to pages will
+     * get us useful cache hit rates anyway)
+     */
+    /* alloc_mode 0 is default alloc_mode
+     * alloc_mode 1 rounding up to 64K for all <= 1M
+     * alloc_mode 2 rounding up to 2M for size > 1M
+     * alloc_mode 3 rounding up to 2M for size > 1M and 64K for size <= 1M */
+    if ( alloc_mode > 3 ) // unknown mode: fall back to default and record the mode actually used
+        lab->cache_mode = alloc_mode = CACHE_BUCKET_MODE_DEFAULT;
+
+    if ( CACHE_BUCKET_MODE_DEFAULT == alloc_mode
+        || CACHE_BUCKET_MODE_2M == alloc_mode)
+    {
+        // < 1M normal alloc_mode
+        __mos_gem_add_bucket_xe(bufmgr_gem, 4096);
+        __mos_gem_add_bucket_xe(bufmgr_gem, 4096 * 2);
+        __mos_gem_add_bucket_xe(bufmgr_gem, 4096 * 3);
+        /* Initialize the linked lists for BO reuse cache. */
+        for (size = 4 * 4096; size < 1024 * 1024; size *= 2)
+        {
+            __mos_gem_add_bucket_xe(bufmgr_gem, size);
+            __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 1 / 4);
+            __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 2 / 4);
+            __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 3 / 4);
+        }
+
+        __mos_gem_add_bucket_xe(bufmgr_gem, 1024 * 1024);
+    }
+    if (CACHE_BUCKET_MODE_64K == alloc_mode
+        || CACHE_BUCKET_MODE_64K_2M == alloc_mode)
+    {
+        // <= 1M 64k alignment
+        unit_size = 64 * 1024;
+        for (size = unit_size; size <= 1024 * 1024; size += unit_size)
+        {
+            __mos_gem_add_bucket_xe(bufmgr_gem, size);
+        }
+    }
+    if ( CACHE_BUCKET_MODE_DEFAULT == alloc_mode
+        || CACHE_BUCKET_MODE_64K == alloc_mode)
+    {
+        //> 1M is normal alloc_mode
+        __mos_gem_add_bucket_xe(bufmgr_gem, 1280 * 1024);
+        __mos_gem_add_bucket_xe(bufmgr_gem, 1536 * 1024);
+        __mos_gem_add_bucket_xe(bufmgr_gem, 1792 * 1024);
+
+        for (size = 2 * 1024 * 1024; size < max_cache_size; size *= 2)
+        {
+            __mos_gem_add_bucket_xe(bufmgr_gem, size);
+            __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 1 / 4);
+            __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 2 / 4);
+            __mos_gem_add_bucket_xe(bufmgr_gem, size + size * 3 / 4);
+        }
+    }
+    if ( CACHE_BUCKET_MODE_2M == alloc_mode
+        || CACHE_BUCKET_MODE_64K_2M == alloc_mode)
+    {
+        //> 1M rolling to 2M
+        unit_size = 2 * 1024 * 1024;
+        __mos_gem_add_bucket_xe(bufmgr_gem, unit_size);
+        __mos_gem_add_bucket_xe(bufmgr_gem, 3 * 1024 * 1024);
+
+        for (size = 4 * 1024 * 1024; size <= max_cache_size; size += unit_size)
+        {
+            __mos_gem_add_bucket_xe(bufmgr_gem, size);
+        }
+    }
+}
+
 /**
  * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
  * and manage map buffer objections.
@@ -3356,6 +3592,7 @@ mos_bufmgr_gem_init_xe(int fd, int batch_size)
     bufmgr_gem->bufmgr.bo_unmap_wc = mos_bo_unmap_wc_xe;
     bufmgr_gem->bufmgr.bo_create_from_prime = mos_bo_create_from_prime_xe;
     bufmgr_gem->bufmgr.bo_export_to_prime = mos_bo_export_to_prime_xe;
+    bufmgr_gem->bufmgr.realloc_cache = mos_gem_realloc_cache_bucket_xe;
     bufmgr_gem->bufmgr.get_devid = mos_get_devid_xe;
     bufmgr_gem->bufmgr.query_engines_count = mos_query_engines_count_xe;
     bufmgr_gem->bufmgr.query_engines = mos_query_engines_xe;
@@ -3419,6 +3656,8 @@ mos_bufmgr_gem_init_xe(int fd, int batch_size)
     DRMLISTADD(&bufmgr_gem->managers, &bufmgr_list);
     DRMINITLISTHEAD(&bufmgr_gem->named);
 
+    __mos_gem_init_cache_buckets(bufmgr_gem);
+
     mos_vma_heap_init(&bufmgr_gem->vma_heap[MEMZONE_SYS], MEMZONE_SYS_START, MEMZONE_SYS_SIZE);
     mos_vma_heap_init(&bufmgr_gem->vma_heap[MEMZONE_DEVICE], MEMZONE_DEVICE_START, MEMZONE_DEVICE_SIZE);
     mos_vma_heap_init(&bufmgr_gem->vma_heap[MEMZONE_PRIME], MEMZONE_PRIME_START, MEMZONE_PRIME_SIZE);