@@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
183
183
*/
184
184
static kmem_cache_t * dbuf_kmem_cache ;
185
185
static taskq_t * dbu_evict_taskq ;
186
+ static taskq_t * dbuf_evict_taskq ;
186
187
187
188
static kthread_t * dbuf_cache_evict_thread ;
188
189
static kmutex_t dbuf_evict_lock ;
@@ -237,6 +238,20 @@ static uint_t dbuf_metadata_cache_shift = 6;
237
238
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
238
239
static uint_t dbuf_mutex_cache_shift = 0 ;
239
240
241
+ /*
242
+ * Number of dbuf_evict threads
243
+ */
244
+ static uint_t dbuf_evict_threads = 0 ;
245
+
246
+ /*
247
+ * The minimum number of bytes we can evict at once is a block size.
248
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
249
+ * We use this value to compute a scaling factor for the eviction tasks.
250
+ */
251
+ #define DBUF_MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT)
252
+
253
+ static uint_t dbuf_evict_parallel = 0 ;
254
+
240
255
static unsigned long dbuf_cache_target_bytes (void );
241
256
static unsigned long dbuf_metadata_cache_target_bytes (void );
242
257
@@ -762,26 +777,47 @@ dbuf_cache_above_lowater(void)
762
777
}
763
778
764
779
/*
765
- * Evict the oldest eligible dbuf from the dbuf cache.
780
+ * Evict the oldest eligible dbufs from the dbuf cache.
781
+ * Use the multilist sublist (mls) with the provided index #idx.
766
782
*/
767
783
static void
768
- dbuf_evict_one ( void )
784
+ dbuf_evict_many ( uint64_t bytes , unsigned int idx )
769
785
{
770
- int idx = multilist_get_random_index (& dbuf_caches [DB_DBUF_CACHE ].cache );
786
+ int64_t evicted = 0 ;
787
+ dmu_buf_impl_t * marker = kmem_cache_alloc (dbuf_kmem_cache , KM_SLEEP );
788
+ marker -> db_objset = NULL ;
789
+
790
+ ASSERT3U (idx , < , multilist_get_num_sublists (
791
+ & dbuf_caches [DB_DBUF_CACHE ].cache ));
792
+
771
793
multilist_sublist_t * mls = multilist_sublist_lock_idx (
772
794
& dbuf_caches [DB_DBUF_CACHE ].cache , idx );
773
795
774
796
ASSERT (!MUTEX_HELD (& dbuf_evict_lock ));
775
797
776
798
dmu_buf_impl_t * db = multilist_sublist_tail (mls );
777
- while (db != NULL && mutex_tryenter (& db -> db_mtx ) == 0 ) {
778
- db = multilist_sublist_prev (mls , db );
779
- }
799
+ multilist_sublist_insert_after (mls , db , marker );
800
+
801
+ while (db != NULL && evicted < bytes ) {
802
+ int skip = 0 ;
803
+ while (db != NULL && (db -> db_objset == NULL ||
804
+ mutex_tryenter (& db -> db_mtx ) == 0 )) {
805
+ db = multilist_sublist_prev (mls , db );
806
+ if (skip == 0 )
807
+ skip = 1 ;
808
+ }
780
809
781
- DTRACE_PROBE2 (dbuf__evict__one , dmu_buf_impl_t * , db ,
782
- multilist_sublist_t * , mls );
810
+ if (db == NULL )
811
+ break ;
812
+
813
+ if (skip ) {
814
+ multilist_sublist_remove (mls , marker );
815
+ multilist_sublist_insert_before (mls , db , marker );
816
+ }
817
+
818
+ DTRACE_PROBE2 (dbuf__evict__one , dmu_buf_impl_t * , db ,
819
+ multilist_sublist_t * , mls );
783
820
784
- if (db != NULL ) {
785
821
multilist_sublist_remove (mls , db );
786
822
multilist_sublist_unlock (mls );
787
823
uint64_t size = db -> db .db_size ;
@@ -797,9 +833,121 @@ dbuf_evict_one(void)
797
833
db -> db_caching_status = DB_NO_CACHE ;
798
834
dbuf_destroy (db );
799
835
DBUF_STAT_BUMP (cache_total_evicts );
800
- } else {
801
- multilist_sublist_unlock (mls );
836
+ evicted += size + usize ;
837
+
838
+ mls = multilist_sublist_lock_idx (
839
+ & dbuf_caches [DB_DBUF_CACHE ].cache , idx );
840
+ db = multilist_sublist_prev (mls , marker );
802
841
}
842
+
843
+ multilist_sublist_remove (mls , marker );
844
+ multilist_sublist_unlock (mls );
845
+ kmem_cache_free (dbuf_kmem_cache , marker );
846
+ }
847
+
848
+ typedef struct evict_arg {
849
+ taskq_ent_t tqe ;
850
+ unsigned idx ;
851
+ uint64_t bytes ;
852
+ } evict_arg_t ;
853
+
854
+ static void
855
+ dbuf_evict_task (void * arg )
856
+ {
857
+ evict_arg_t * eva = arg ;
858
+ dbuf_evict_many (eva -> bytes , eva -> idx );
859
+ }
860
+
861
+ static void
862
+ dbuf_evict (void )
863
+ {
864
+ int64_t bytes = (zfs_refcount_count (& dbuf_caches [DB_DBUF_CACHE ].size ) -
865
+ dbuf_cache_lowater_bytes ());
866
+
867
+ if (bytes <= 0 )
868
+ return ;
869
+
870
+ unsigned idx = multilist_get_random_index (
871
+ & dbuf_caches [DB_DBUF_CACHE ].cache );
872
+
873
+ if (!dbuf_evict_parallel )
874
+ return (dbuf_evict_many (bytes , idx ));
875
+
876
+ /*
877
+ * Go to the parallel eviction.
878
+ */
879
+ unsigned int num_sublists = multilist_get_num_sublists (
880
+ & dbuf_caches [DB_DBUF_CACHE ].cache );
881
+ evict_arg_t * evarg = kmem_zalloc (sizeof (* evarg ) * num_sublists ,
882
+ KM_SLEEP );
883
+ /*
884
+ * How we scale
885
+ *
886
+ * Example 1, # of chunks less than # of tasks.
887
+ * We have:
888
+ * - 4 tasks
889
+ * - 3 chunks
890
+ * - 3 full col
891
+ * - 0 low cols.
892
+ *
893
+ * The first low col index is 3.
894
+ * The tasks #0-#2 evict 1 chunk each.
895
+ *
896
+ * 0 | 1 | 2 | 3 |
897
+ * +===+===+===+===+
898
+ * | x | x | x | |
899
+ * +---+---+---+---+
900
+ *
901
+ * Example 2, # of chunks more than # of tasks.
902
+ * We have:
903
+ * - 4 tasks
904
+ * - 9 chunks
905
+ * - 1 full col
906
+ * - 3 low cols
907
+ *
908
+ * The first low col index is 1.
909
+ * The task #0 evicts 3 chunks, the others evict 2 chunks each.
910
+ *
911
+ * 0 | 1 | 2 | 3 |
912
+ * +===+===+===+===+
913
+ * | x | x | x | x |
914
+ * +---+---+---+---+
915
+ * | x | x | x | x |
916
+ * +---+---+---+---+
917
+ * | x | | | |
918
+ * +---+---+---+---+
919
+ */
920
+
921
+ /*
922
+ * Compute number of tasks to run (n), first low col index (k),
923
+ * normal and low bytes per task.
924
+ */
925
+ uint64_t nchunks = ((bytes - 1 ) >> DBUF_MIN_EVICT_PERTASK_SHIFT ) + 1 ;
926
+ unsigned n = nchunks < num_sublists ? nchunks : num_sublists ;
927
+ uint64_t fullrows = nchunks / n ;
928
+ unsigned lastrowcols = nchunks % n ;
929
+ unsigned k = (lastrowcols ? lastrowcols : n );
930
+
931
+ uint64_t bytes_pertask_low = fullrows << DBUF_MIN_EVICT_PERTASK_SHIFT ;
932
+ uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
933
+ (1 << DBUF_MIN_EVICT_PERTASK_SHIFT ) : 0 );
934
+
935
+ for (unsigned i = 0 ; i < n ; i ++ ) {
936
+ uint64_t evict = i < k ? bytes_pertask : bytes_pertask_low ;
937
+
938
+ evarg [i ].idx = idx ;
939
+ evarg [i ].bytes = evict ;
940
+
941
+ taskq_dispatch_ent (dbuf_evict_taskq , dbuf_evict_task ,
942
+ & evarg [i ], 0 , & evarg [i ].tqe );
943
+
944
+ /* wrap idx */
945
+ if (++ idx >= num_sublists )
946
+ idx = 0 ;
947
+ }
948
+
949
+ taskq_wait (dbuf_evict_taskq );
950
+ kmem_free (evarg , sizeof (* evarg ) * num_sublists );
803
951
}
804
952
805
953
/*
@@ -833,7 +981,7 @@ dbuf_evict_thread(void *unused)
833
981
* minimize lock contention.
834
982
*/
835
983
while (dbuf_cache_above_lowater () && !dbuf_evict_thread_exit ) {
836
- dbuf_evict_one ();
984
+ dbuf_evict ();
837
985
}
838
986
839
987
mutex_enter (& dbuf_evict_lock );
@@ -860,7 +1008,7 @@ dbuf_evict_notify(uint64_t size)
860
1008
*/
861
1009
if (size > dbuf_cache_target_bytes ()) {
862
1010
if (size > dbuf_cache_hiwater_bytes ())
863
- dbuf_evict_one ();
1011
+ dbuf_evict ();
864
1012
cv_signal (& dbuf_evict_cv );
865
1013
}
866
1014
}
@@ -965,11 +1113,16 @@ dbuf_init(void)
965
1113
966
1114
dbuf_stats_init (h );
967
1115
1116
+ if (dbuf_evict_threads == 0 )
1117
+ dbuf_evict_threads = MIN (16 , max_ncpus >> 1 );
968
1118
/*
969
1119
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
970
1120
* configuration is not required.
971
1121
*/
972
1122
dbu_evict_taskq = taskq_create ("dbu_evict" , 1 , defclsyspri , 0 , 0 , 0 );
1123
+ dbuf_evict_taskq = taskq_create ("dbuf_evict" ,
1124
+ MIN (dbuf_evict_threads , max_ncpus ), defclsyspri ,
1125
+ MIN (dbuf_evict_threads , max_ncpus ), max_ncpus , TASKQ_PREPOPULATE );
973
1126
974
1127
for (dbuf_cached_state_t dcs = 0 ; dcs < DB_CACHE_MAX ; dcs ++ ) {
975
1128
multilist_create (& dbuf_caches [dcs ].cache ,
@@ -1035,6 +1188,8 @@ dbuf_fini(void)
1035
1188
1036
1189
kmem_cache_destroy (dbuf_kmem_cache );
1037
1190
taskq_destroy (dbu_evict_taskq );
1191
+ taskq_wait (dbuf_evict_taskq );
1192
+ taskq_destroy (dbuf_evict_taskq );
1038
1193
1039
1194
mutex_enter (& dbuf_evict_lock );
1040
1195
dbuf_evict_thread_exit = B_TRUE ;
@@ -3963,7 +4118,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
3963
4118
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
3964
4119
* ^ |
3965
4120
* | |
3966
- * +-----dbuf_destroy()<--dbuf_evict_one ()<--------+
4121
+ * +-----dbuf_destroy()<--dbuf_evict()<------------+
3967
4122
*
3968
4123
*/
3969
4124
void
@@ -5282,3 +5437,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
5282
5437
5283
5438
ZFS_MODULE_PARAM (zfs_dbuf , dbuf_ , mutex_cache_shift , UINT , ZMOD_RD ,
5284
5439
"Set size of dbuf cache mutex array as log2 shift." );
5440
+
5441
+ ZFS_MODULE_PARAM (zfs_dbuf , dbuf_ , evict_parallel , UINT , ZMOD_RW ,
5442
+ "Evict from the dbuf cache in parallel using a taskq" );
5443
+
5444
+ ZFS_MODULE_PARAM (zfs_dbuf , dbuf_ , evict_threads , UINT , ZMOD_RW ,
5445
+ "Maximum number of dbuf_evict threads" );
0 commit comments