
Commit 5b070d1

Alexander Stetsenko authored and allanjude committed on Sep 4, 2024
Implement parallel dbuf eviction
In the previous code, dbuf_evict_thread() would call dbuf_evict_one() in a loop while dbuf_cache_above_lowater(). dbuf_evict_one() would select a random sublist from the dbuf cache, then walk it from the tail forward, attempting to acquire the lock on each object until it succeeded, then evict that object and return. As the name suggests, it would evict only a single object from the cache. However, evicting one object is unlikely to bring us below the desired low water mark, so dbuf_evict_one() would be called again, looping over the same busy objects until it finds one it can evict.

This has been replaced with dbuf_evict_many(), which takes a specific sublist as a parameter, as well as a desired amount of data to evict. It walks the sublist from the tail forward, evicting what it can until the number of bytes evicted satisfies the input parameter or the head of the sublist is reached.

The dbuf_evict_thread now runs in parallel as well, allowing it to keep up with demand more easily. For the dbuf cache, if the single thread was not able to keep up, ZFS would shift the work of evicting some items to each incoming I/O thread. While that is still the case, it should be seen much less often now that dbuf eviction is more efficient and no longer bottlenecked on a single thread.

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Co-authored-by: Allan Jude <[email protected]>
Signed-off-by: Alexander Stetsenko <[email protected]>
Signed-off-by: Allan Jude <[email protected]>
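Below is a minimal, self-contained userspace sketch of the core idea in dbuf_evict_many(): walk a list from the tail and keep evicting until a byte target is met, instead of evicting one entry per call. The struct buf list and the evict_many() helper here are hypothetical stand-ins, not ZFS structures, and the sketch ignores the locking and the marker-based resumption the real code uses.

/*
 * Illustrative sketch only: evict from the tail of a list until a byte
 * target is reached (compare with dbuf_evict_many() in the diff below).
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct buf {
        uint64_t size;          /* payload size in bytes */
        struct buf *prev;       /* toward the head (newer entries) */
};

/* Walk from the tail toward the head, evicting until `bytes` are freed. */
static uint64_t
evict_many(struct buf **tail, uint64_t bytes)
{
        uint64_t evicted = 0;

        while (*tail != NULL && evicted < bytes) {
                struct buf *victim = *tail;

                *tail = victim->prev;   /* unlink the oldest entry */
                evicted += victim->size;
                free(victim);
        }
        return (evicted);
}

int
main(void)
{
        /* Build a tiny cache of three 128 KiB buffers. */
        struct buf *tail = NULL;
        for (int i = 0; i < 3; i++) {
                struct buf *b = malloc(sizeof (*b));
                b->size = 128 * 1024;
                b->prev = tail;
                tail = b;
        }

        /* A 200 KiB target requires evicting two buffers, not just one. */
        printf("evicted %llu bytes\n",
            (unsigned long long)evict_many(&tail, 200 * 1024));

        while (tail != NULL) {  /* free what remains */
                struct buf *b = tail;
                tail = b->prev;
                free(b);
        }
        return (0);
}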
1 parent d4d7945 commit 5b070d1

File tree

2 files changed (+186, -15 lines)

 

man/man4/zfs.4 (+11, -1)
@@ -16,7 +16,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd June 27, 2024
+.Dd August 28, 2024
 .Dt ZFS 4
 .Os
 .
@@ -73,6 +73,16 @@ When set to
 .Sy 0
 the array is dynamically sized based on total system memory.
 .
+.It Sy dbuf_evict_parallel Ns = Ns Sy 0 Pq uint
+When set to 1, ZFS will use up to
+.Sy dbuf_evict_threads
+threads to evict dbuf data in parallel, improving the responsiveness
+of ZFS to memory pressure.
+.
+.It Sy dbuf_evict_threads Ns = Ns Sy 0 Pq uint
+Sets the maximum number of dbuf eviction threads to be used.
+When set to 0, ZFS uses half the available CPUs or 16, whichever is less.
+.
 .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
 dnode slots allocated in a single operation as a power of 2.
 The default value minimizes lock contention for the bulk operation performed.
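The man page text above states that a dbuf_evict_threads value of 0 resolves to half the available CPUs, capped at 16. The small program below is purely illustrative (not ZFS code); it only reproduces that rule, matching the MIN(16, max_ncpus >> 1) expression added to dbuf_init() further down.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Default thread count when dbuf_evict_threads == 0 (illustrative helper). */
static unsigned int
default_evict_threads(unsigned int ncpus)
{
        return (MIN(16, ncpus >> 1));
}

int
main(void)
{
        /* 8 CPUs -> 4 threads, 24 CPUs -> 12 threads, 64 CPUs -> capped at 16. */
        printf("%u %u %u\n", default_evict_threads(8),
            default_evict_threads(24), default_evict_threads(64));
        return (0);
}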

module/zfs/dbuf.c (+175, -14)
@@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
  */
 static kmem_cache_t *dbuf_kmem_cache;
 static taskq_t *dbu_evict_taskq;
+static taskq_t *dbuf_evict_taskq;
 
 static kthread_t *dbuf_cache_evict_thread;
 static kmutex_t dbuf_evict_lock;
@@ -237,6 +238,20 @@ static uint_t dbuf_metadata_cache_shift = 6;
 /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
 static uint_t dbuf_mutex_cache_shift = 0;
 
+/*
+ * Number of dbuf_evict threads
+ */
+static uint_t dbuf_evict_threads = 0;
+
+/*
+ * The minimum number of bytes we can evict at once is a block size.
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define DBUF_MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT)
+
+static uint_t dbuf_evict_parallel = 0;
+
 static unsigned long dbuf_cache_target_bytes(void);
 static unsigned long dbuf_metadata_cache_target_bytes(void);
 
@@ -762,26 +777,47 @@ dbuf_cache_above_lowater(void)
 }
 
 /*
- * Evict the oldest eligible dbuf from the dbuf cache.
+ * Evict the oldest eligible dbufs from the dbuf cache.
+ * Use the multilist sublist (mls) with the provided index #idx.
  */
 static void
-dbuf_evict_one(void)
+dbuf_evict_many(uint64_t bytes, unsigned int idx)
 {
-        int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
+        int64_t evicted = 0;
+        dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
+        marker->db_objset = NULL;
+
+        ASSERT3U(idx, <, multilist_get_num_sublists(
+            &dbuf_caches[DB_DBUF_CACHE].cache));
+
         multilist_sublist_t *mls = multilist_sublist_lock_idx(
             &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
         ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
         dmu_buf_impl_t *db = multilist_sublist_tail(mls);
-        while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
-                db = multilist_sublist_prev(mls, db);
-        }
+        multilist_sublist_insert_after(mls, db, marker);
+
+        while (db != NULL && evicted < bytes) {
+                int skip = 0;
+                while (db != NULL && (db->db_objset == NULL ||
+                    mutex_tryenter(&db->db_mtx) == 0)) {
+                        db = multilist_sublist_prev(mls, db);
+                        if (skip == 0)
+                                skip = 1;
+                }
 
-        DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
-            multilist_sublist_t *, mls);
+                if (db == NULL)
+                        break;
+
+                if (skip) {
+                        multilist_sublist_remove(mls, marker);
+                        multilist_sublist_insert_before(mls, db, marker);
+                }
+
+                DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+                    multilist_sublist_t *, mls);
 
-        if (db != NULL) {
                 multilist_sublist_remove(mls, db);
                 multilist_sublist_unlock(mls);
                 uint64_t size = db->db.db_size;
@@ -797,9 +833,121 @@ dbuf_evict_one(void)
                 db->db_caching_status = DB_NO_CACHE;
                 dbuf_destroy(db);
                 DBUF_STAT_BUMP(cache_total_evicts);
-        } else {
-                multilist_sublist_unlock(mls);
+                evicted += size + usize;
+
+                mls = multilist_sublist_lock_idx(
+                    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
+                db = multilist_sublist_prev(mls, marker);
         }
+
+        multilist_sublist_remove(mls, marker);
+        multilist_sublist_unlock(mls);
+        kmem_cache_free(dbuf_kmem_cache, marker);
+}
+
+typedef struct evict_arg {
+        taskq_ent_t tqe;
+        unsigned idx;
+        uint64_t bytes;
+} evict_arg_t;
+
+static void
+dbuf_evict_task(void *arg)
+{
+        evict_arg_t *eva = arg;
+        dbuf_evict_many(eva->bytes, eva->idx);
+}
+
+static void
+dbuf_evict(void)
+{
+        int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) -
+            dbuf_cache_lowater_bytes());
+
+        if (bytes <= 0)
+                return;
+
+        unsigned idx = multilist_get_random_index(
+            &dbuf_caches[DB_DBUF_CACHE].cache);
+
+        if (!dbuf_evict_parallel)
+                return (dbuf_evict_many(bytes, idx));
+
+        /*
+         * Go to the parallel eviction.
+         */
+        unsigned int num_sublists = multilist_get_num_sublists(
+            &dbuf_caches[DB_DBUF_CACHE].cache);
+        evict_arg_t *evarg = kmem_zalloc(sizeof (*evarg) * num_sublists,
+            KM_SLEEP);
+        /*
+         * How we scale
+         *
+         * Example 1, # of chunks less than # of tasks.
+         * We have:
+         * - 4 tasks
+         * - 3 chunks
+         * - 3 full col
+         * - 0 low cols.
+         *
+         * The first low col index is 3.
+         * The tasks #0-#2 evict 1 chunk each.
+         *
+         *   0 | 1 | 2 | 3 |
+         * +===+===+===+===+
+         * | x | x | x |   |
+         * +---+---+---+---+
+         *
+         * Example 2, # of chunks more than # of tasks.
+         * We have:
+         * - 4 tasks
+         * - 9 chunks
+         * - 1 full col
+         * - 3 low cols
+         *
+         * The first low col index is 1.
+         * The task #0 evicts 3 chunks, the others evict 2 chunks each.
+         *
+         *   0 | 1 | 2 | 3 |
+         * +===+===+===+===+
+         * | x | x | x | x |
+         * +---+---+---+---+
+         * | x | x | x | x |
+         * +---+---+---+---+
+         * | x |   |   |   |
+         * +---+---+---+---+
+         */
+
+        /*
+         * Compute number of tasks to run (n), first low col index (k),
+         * normal and low bytes per task.
+         */
+        uint64_t nchunks = ((bytes - 1) >> DBUF_MIN_EVICT_PERTASK_SHIFT) + 1;
+        unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
+        uint64_t fullrows = nchunks / n;
+        unsigned lastrowcols = nchunks % n;
+        unsigned k = (lastrowcols ? lastrowcols : n);
+
+        uint64_t bytes_pertask_low = fullrows << DBUF_MIN_EVICT_PERTASK_SHIFT;
+        uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
+            (1 << DBUF_MIN_EVICT_PERTASK_SHIFT) : 0);
+
+        for (unsigned i = 0; i < n; i++) {
+                uint64_t evict = i < k ? bytes_pertask : bytes_pertask_low;
+
+                evarg[i].idx = idx;
+                evarg[i].bytes = evict;
+
+                taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task,
+                    &evarg[i], 0, &evarg[i].tqe);
+
+                /* wrap idx */
+                if (++idx >= num_sublists)
+                        idx = 0;
+        }
+
+        taskq_wait(dbuf_evict_taskq);
+        kmem_free(evarg, sizeof (*evarg) * num_sublists);
 }
 
 /*
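To make the scaling comment above concrete, the standalone program below reproduces the per-task byte computation from dbuf_evict() for the two examples in that comment. SPA_MAXBLOCKSHIFT is hard-coded to 24 (16 MiB, its usual OpenZFS value) only to keep the sketch self-contained, and the split() helper is a hypothetical wrapper that simply mirrors the arithmetic in the hunk above.

#include <stdio.h>
#include <stdint.h>

#define DBUF_MIN_EVICT_PERTASK_SHIFT 24 /* assumes SPA_MAXBLOCKSHIFT == 24 */

/* Split an eviction target of `bytes` across up to `num_sublists` tasks. */
static void
split(uint64_t bytes, unsigned num_sublists)
{
        /* Round the deficit up to whole 16 MiB chunks. */
        uint64_t nchunks = ((bytes - 1) >> DBUF_MIN_EVICT_PERTASK_SHIFT) + 1;
        unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
        uint64_t fullrows = nchunks / n;
        unsigned lastrowcols = nchunks % n;
        unsigned k = (lastrowcols ? lastrowcols : n);

        uint64_t bytes_pertask_low = fullrows << DBUF_MIN_EVICT_PERTASK_SHIFT;
        uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
            (1ULL << DBUF_MIN_EVICT_PERTASK_SHIFT) : 0);

        printf("%llu chunks over %u tasks: tasks 0..%u get %llu MiB each, "
            "the rest get %llu MiB each\n",
            (unsigned long long)nchunks, n, k - 1,
            (unsigned long long)(bytes_pertask >> 20),
            (unsigned long long)(bytes_pertask_low >> 20));
}

int
main(void)
{
        split(3ULL << 24, 4);   /* example 1: 3 chunks, 4 sublists */
        split(9ULL << 24, 4);   /* example 2: 9 chunks, 4 sublists */
        return (0);
}

For example 1 this prints that 3 tasks evict 16 MiB each; for example 2, task 0 gets 48 MiB and tasks 1 through 3 get 32 MiB each, matching the tables in the comment.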
@@ -833,7 +981,7 @@ dbuf_evict_thread(void *unused)
                 * minimize lock contention.
                 */
                while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
-                        dbuf_evict_one();
+                        dbuf_evict();
                }
 
                mutex_enter(&dbuf_evict_lock);
@@ -860,7 +1008,7 @@ dbuf_evict_notify(uint64_t size)
         */
        if (size > dbuf_cache_target_bytes()) {
                if (size > dbuf_cache_hiwater_bytes())
-                        dbuf_evict_one();
+                        dbuf_evict();
                cv_signal(&dbuf_evict_cv);
        }
 }
@@ -965,11 +1113,16 @@ dbuf_init(void)
 
        dbuf_stats_init(h);
 
+        if (dbuf_evict_threads == 0)
+                dbuf_evict_threads = MIN(16, max_ncpus >> 1);
        /*
         * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
         * configuration is not required.
         */
        dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
+        dbuf_evict_taskq = taskq_create("dbuf_evict",
+            MIN(dbuf_evict_threads, max_ncpus), defclsyspri,
+            MIN(dbuf_evict_threads, max_ncpus), max_ncpus, TASKQ_PREPOPULATE);
 
        for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
                multilist_create(&dbuf_caches[dcs].cache,
@@ -1035,6 +1188,8 @@ dbuf_fini(void)
 
        kmem_cache_destroy(dbuf_kmem_cache);
        taskq_destroy(dbu_evict_taskq);
+        taskq_wait(dbuf_evict_taskq);
+        taskq_destroy(dbuf_evict_taskq);
 
        mutex_enter(&dbuf_evict_lock);
        dbuf_evict_thread_exit = B_TRUE;
@@ -3963,7 +4118,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
 *      dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
 *      ^                                               |
 *      |                                               |
-*      +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+*      +-----dbuf_destroy()<--dbuf_evict()<------------+
 *
 */
 void
@@ -5282,3 +5437,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
        "Set size of dbuf cache mutex array as log2 shift.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_parallel, UINT, ZMOD_RW,
+       "Evict from the dbuf cache in parallel using a taskq");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_threads, UINT, ZMOD_RW,
+       "Maximum number of dbuf_evict threads");
