@@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
183183static kmem_cache_t * dbuf_kmem_cache ;
184184kmem_cache_t * dbuf_dirty_kmem_cache ;
185185static taskq_t * dbu_evict_taskq ;
186+ static taskq_t * dbuf_evict_taskq ;
186187
187188static kthread_t * dbuf_cache_evict_thread ;
188189static kmutex_t dbuf_evict_lock ;
@@ -237,6 +238,24 @@ static uint_t dbuf_metadata_cache_shift = 6;
237238/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
238239static uint_t dbuf_mutex_cache_shift = 0 ;
239240
241+ /*
242+ * Controls the number of dbuf eviction threads.
243+ * Possible values:
244+ * 0 (auto) compute the number of threads using a logarithmic formula.
245+ * 1 (disabled) one thread - parallel eviction is disabled.
246+ * 2+ (manual) set the number manually, limited by dbuf_evict_threads_max.
247+ */
248+ static uint_t dbuf_evict_threads = 0 ;
249+
250+ /*
251+ * The number of allocated dbuf eviction threads. This limits the maximum value
252+ * of dbuf_evict_threads.
253+ * The number is set up at module load time and depends on the initial value of
254+ * dbuf_evict_threads. If dbuf_evict_threads is set to auto, a logarithmic
255+ * function is used to compute this value. Otherwise, it is max_ncpus / 2.
256+ */
257+ static uint_t dbuf_evict_threads_max ;
258+
240259static unsigned long dbuf_cache_target_bytes (void );
241260static unsigned long dbuf_metadata_cache_target_bytes (void );
242261
@@ -768,26 +787,47 @@ dbuf_cache_above_lowater(void)
768787}
769788
770789/*
771- * Evict the oldest eligible dbuf from the dbuf cache.
790+ * Evict the oldest eligible dbufs from the dbuf cache.
791+ * Use the multilist sublist (mls) with the provided index #idx.
772792 */
773793static void
774- dbuf_evict_one ( void )
794+ dbuf_evict_many ( uint64_t bytes , unsigned int idx )
775795{
776- int idx = multilist_get_random_index (& dbuf_caches [DB_DBUF_CACHE ].cache );
796+ int64_t evicted = 0 ;
797+ dmu_buf_impl_t * marker = kmem_cache_alloc (dbuf_kmem_cache , KM_SLEEP );
798+ marker -> db_objset = NULL ;
799+
800+ ASSERT3U (idx , < , multilist_get_num_sublists (
801+ & dbuf_caches [DB_DBUF_CACHE ].cache ));
802+
777803 multilist_sublist_t * mls = multilist_sublist_lock_idx (
778804 & dbuf_caches [DB_DBUF_CACHE ].cache , idx );
779805
780806 ASSERT (!MUTEX_HELD (& dbuf_evict_lock ));
781807
782808 dmu_buf_impl_t * db = multilist_sublist_tail (mls );
783- while (db != NULL && mutex_tryenter (& db -> db_mtx ) == 0 ) {
784- db = multilist_sublist_prev (mls , db );
785- }
809+ multilist_sublist_insert_after (mls , db , marker );
810+
811+ while (db != NULL && evicted < bytes ) {
812+ int skip = 0 ;
813+ while (db != NULL && (db -> db_objset == NULL ||
814+ mutex_tryenter (& db -> db_mtx ) == 0 )) {
815+ db = multilist_sublist_prev (mls , db );
816+ if (skip == 0 )
817+ skip = 1 ;
818+ }
786819
787- DTRACE_PROBE2 (dbuf__evict__one , dmu_buf_impl_t * , db ,
788- multilist_sublist_t * , mls );
820+ if (db == NULL )
821+ break ;
822+
823+ if (skip ) {
824+ multilist_sublist_remove (mls , marker );
825+ multilist_sublist_insert_before (mls , db , marker );
826+ }
827+
828+ DTRACE_PROBE2 (dbuf__evict__one , dmu_buf_impl_t * , db ,
829+ multilist_sublist_t * , mls );
789830
790- if (db != NULL ) {
791831 multilist_sublist_remove (mls , db );
792832 multilist_sublist_unlock (mls );
793833 uint64_t size = db -> db .db_size ;
@@ -803,9 +843,97 @@ dbuf_evict_one(void)
803843 db -> db_caching_status = DB_NO_CACHE ;
804844 dbuf_destroy (db );
805845 DBUF_STAT_BUMP (cache_total_evicts );
846+ evicted += size + usize ;
847+
848+ mls = multilist_sublist_lock_idx (
849+ & dbuf_caches [DB_DBUF_CACHE ].cache , idx );
850+ db = multilist_sublist_prev (mls , marker );
851+ }
852+
853+ multilist_sublist_remove (mls , marker );
854+ multilist_sublist_unlock (mls );
855+ kmem_cache_free (dbuf_kmem_cache , marker );
856+ }
857+
858+ typedef struct evict_arg {
859+ taskq_ent_t tqe ;
860+ unsigned idx ;
861+ uint64_t bytes ;
862+ } evict_arg_t ;
863+
864+ static void
865+ dbuf_evict_task (void * arg )
866+ {
867+ evict_arg_t * eva = arg ;
868+ dbuf_evict_many (eva -> bytes , eva -> idx );
869+ }
870+
871+ /*
872+ * The smallest amount we can evict at once is a single block, so
873+ * SPA_MAXBLOCKSIZE is a reasonable minimum amount of work per eviction task.
874+ */
875+ #define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)
876+
877+ static void
878+ dbuf_evict (void )
879+ {
880+ int64_t bytes = (zfs_refcount_count (& dbuf_caches [DB_DBUF_CACHE ].size ) -
881+ dbuf_cache_lowater_bytes ());
882+
883+ if (bytes <= 0 )
884+ return ;
885+
886+ unsigned int num_sublists = multilist_get_num_sublists (
887+ & dbuf_caches [DB_DBUF_CACHE ].cache );
888+ uint_t nthreads = MIN (num_sublists , (dbuf_evict_threads == 0 ?
889+ dbuf_evict_threads_max :
890+ MIN (dbuf_evict_threads , dbuf_evict_threads_max )));
891+ boolean_t use_evcttq = nthreads > 1 ;
892+ evict_arg_t * evarg = NULL ;
893+
894+ if (use_evcttq ) {
895+ evarg = kmem_zalloc (sizeof (* evarg ) * nthreads , KM_NOSLEEP );
896+ /*
897+ * Fall back to a regular single-threaded eviction.
898+ */
899+ if (evarg == NULL )
900+ use_evcttq = B_FALSE ;
901+ }
902+
903+ unsigned idx = multilist_get_random_index (
904+ & dbuf_caches [DB_DBUF_CACHE ].cache );
905+
906+ if (!use_evcttq )
907+ return (dbuf_evict_many (bytes , idx ));
908+
909+ /*
910+ * Go to the parallel eviction.
911+ */
912+ uint64_t evict ;
913+ uint_t ntasks ;
914+
915+ if (bytes > nthreads * MIN_EVICT_SIZE ) {
916+ evict = DIV_ROUND_UP (bytes , nthreads );
917+ ntasks = nthreads ;
806918 } else {
807- multilist_sublist_unlock (mls );
919+ evict = MIN_EVICT_SIZE ;
920+ ntasks = DIV_ROUND_UP (bytes , MIN_EVICT_SIZE );
921+ }
922+
923+ for (unsigned i = 0 ; i < ntasks ; i ++ ) {
924+ evarg [i ].idx = idx ;
925+ evarg [i ].bytes = evict ;
926+
927+ taskq_dispatch_ent (dbuf_evict_taskq , dbuf_evict_task ,
928+ & evarg [i ], 0 , & evarg [i ].tqe );
929+
930+ /* wrap idx */
931+ if (++ idx >= num_sublists )
932+ idx = 0 ;
808933 }
934+
935+ taskq_wait (dbuf_evict_taskq );
936+ kmem_free (evarg , sizeof (* evarg ) * nthreads );
809937}
810938
811939/*
@@ -839,7 +967,7 @@ dbuf_evict_thread(void *unused)
839967 * minimize lock contention.
840968 */
841969 while (dbuf_cache_above_lowater () && !dbuf_evict_thread_exit ) {
842- dbuf_evict_one ();
970+ dbuf_evict ();
843971 }
844972
845973 mutex_enter (& dbuf_evict_lock );
@@ -866,7 +994,7 @@ dbuf_evict_notify(uint64_t size)
866994 */
867995 if (size > dbuf_cache_target_bytes ()) {
868996 if (size > dbuf_cache_hiwater_bytes ())
869- dbuf_evict_one ();
997+ dbuf_evict ();
870998 cv_signal (& dbuf_evict_cv );
871999 }
8721000}
@@ -980,6 +1108,27 @@ dbuf_init(void)
9801108 * configuration is not required.
9811109 */
9821110 dbu_evict_taskq = taskq_create ("dbu_evict" , 1 , defclsyspri , 0 , 0 , 0 );
1111+ if (max_ncpus > 1 ) {
1112+ if (dbuf_evict_threads == 0 ) {
1113+ /*
1114+ * Limit the maximum number of threads to 16.
1115+ * We reach the limit when max_ncpus == 256.
1116+ */
1117+ uint_t nthreads = MIN ((highbit64 (max_ncpus ) - 1 ) +
1118+ max_ncpus / 32 , 16 );
1119+ dbuf_evict_threads_max = max_ncpus < 4 ? 1 :
1120+ nthreads ;
1121+ } else {
1122+ dbuf_evict_threads_max = max_ncpus / 2 ;
1123+ }
1124+
1125+ if (dbuf_evict_threads_max > 1 ) {
1126+ dbuf_evict_taskq = taskq_create ("dbuf_evict" ,
1127+ dbuf_evict_threads_max ,
1128+ defclsyspri , 0 , INT_MAX , TASKQ_PREPOPULATE );
1129+ }
1130+ }
1131+
9831132
9841133 for (dbuf_cached_state_t dcs = 0 ; dcs < DB_CACHE_MAX ; dcs ++ ) {
9851134 multilist_create (& dbuf_caches [dcs ].cache ,
@@ -1047,6 +1196,10 @@ dbuf_fini(void)
10471196 kmem_cache_destroy (dbuf_kmem_cache );
10481197 kmem_cache_destroy (dbuf_dirty_kmem_cache );
10491198 taskq_destroy (dbu_evict_taskq );
1199+ if (dbuf_evict_taskq != NULL ) {
1200+ taskq_wait (dbuf_evict_taskq );
1201+ taskq_destroy (dbuf_evict_taskq );
1202+ }
10501203
10511204 mutex_enter (& dbuf_evict_lock );
10521205 dbuf_evict_thread_exit = B_TRUE ;
@@ -4106,7 +4259,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
41064259 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
41074260 * ^ |
41084261 * | |
4109- * +-----dbuf_destroy()<--dbuf_evict_one ()<--------+
4262+ * +-----dbuf_destroy()<--dbuf_evict ()<---- --------+
41104263 *
41114264 */
41124265void
@@ -5440,3 +5593,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
54405593
54415594ZFS_MODULE_PARAM (zfs_dbuf , dbuf_ , mutex_cache_shift , UINT , ZMOD_RD ,
54425595 "Set size of dbuf cache mutex array as log2 shift." );
5596+
5597+ ZFS_MODULE_PARAM (zfs_arc , dbuf_ , evict_threads , UINT , ZMOD_RW ,
5598+ "Controls the number of dbuf eviction threads" );
5599+
5600+ ZFS_MODULE_PARAM (zfs_arc , dbuf_ , evict_threads_max , UINT , ZMOD_RD ,
5601+ "The number of allocated dbuf eviction threads" );
0 commit comments