@@ -28,8 +28,9 @@ use common_meta::rpc::router::RegionRoute;
2828use common_telemetry:: { debug, error, info, warn} ;
2929use futures:: TryStreamExt ;
3030use futures:: stream:: { FuturesUnordered , StreamExt } ;
31+ use ordered_float:: OrderedFloat ;
3132use snafu:: { OptionExt as _, ResultExt } ;
32- use store_api:: storage:: { RegionId , TableFileRefsManifest } ;
33+ use store_api:: storage:: { FileRefsManifest , RegionId } ;
3334use table:: metadata:: TableId ;
3435use tokio:: sync:: mpsc:: { Receiver , Sender } ;
3536use tokio:: time:: sleep;
@@ -63,6 +64,8 @@ pub struct GcConfig {
6364 pub file_removal_rate_weight : f64 ,
6465 /// Cooldown period between GC operations on the same region.
6566 pub gc_cooldown_period : Duration ,
67+ /// Maximum number of regions to select for GC per table.
68+ pub regions_per_table_threshold : usize ,
6669}
6770
6871impl Default for GcConfig {
@@ -75,6 +78,7 @@ impl Default for GcConfig {
7578 sst_count_weight : 1.0 ,
7679 file_removal_rate_weight : 0.5 ,
7780 gc_cooldown_period : Duration :: from_secs ( 60 * 30 ) , // 30 minutes
81+ regions_per_table_threshold : 20 , // Select top 20 regions per table
7882 }
7983 }
8084}
@@ -83,21 +87,21 @@ impl Default for GcConfig {
/// A region considered for garbage collection, carrying the score used to rank it
/// and the region statistics it was built from.
// NOTE(review): this span is a diff hunk. The `-` line is the removed field
// declaration and the `+` line is its replacement.
8387#[ derive( Debug , Clone , PartialEq , Eq , Hash ) ]
8488struct GcCandidate {
8589 region_id : RegionId ,
// Removed: the score used to be stored as an integer so the derives above could apply.
86- score : u64 , // Changed to u64 for Hash/Eq implementation
// Added: the float score is wrapped in `OrderedFloat`; a bare `f64` implements
// neither `Eq` nor `Hash`, which the derive list on this struct requires.
90+ score : OrderedFloat < f64 > ,
8791 region_stat : RegionStat ,
8892}
8993
// Constructor and score accessor for `GcCandidate`.
// NOTE(review): this span is a diff hunk. Each `-` line is removed code and the
// following `+` line is its replacement.
9094impl GcCandidate {
/// Builds a candidate from a region id, a raw `f64` score and the region's stats.
9195 fn new ( region_id : RegionId , score : f64 , region_stat : RegionStat ) -> Self {
9296 Self {
9397 region_id,
// Removed: the score was multiplied by 1000 and cast to `u64`, discarding any
// precision beyond three decimal places.
94- score : ( score * 1000.0 ) as u64 , // Convert to u64 for hashing
// Added: the score is wrapped as-is, with no scaling.
98+ score : OrderedFloat ( score) ,
9599 region_stat,
96100 }
97101 }
98102
/// Returns the candidate's score as a plain `f64`.
99103 fn score_f64 ( & self ) -> f64 {
// Removed: undid the x1000 integer scaling applied by the old constructor.
100- self . score as f64 / 1000.0
// Added: unwraps the `OrderedFloat` back to the `f64` it holds.
104+ self . score . into_inner ( )
101105 }
102106}
103107
@@ -247,7 +251,7 @@ impl GcTrigger {
247251 let gc_tracker = self . region_gc_tracker . lock ( ) . await ;
248252
249253 for ( table_id, region_stats) in table_to_region_stats {
250- let mut candidates = HashSet :: new ( ) ;
254+ let mut candidates = Vec :: new ( ) ;
251255
252256 for region_stat in region_stats {
253257 // Skip regions that are too small
@@ -267,17 +271,25 @@ impl GcTrigger {
267271
268272 // Only consider regions with a meaningful score
269273 if score > 0.0 {
270- candidates. insert ( GcCandidate :: new ( region_stat. id , score, region_stat. clone ( ) ) ) ;
274+ candidates. push ( GcCandidate :: new ( region_stat. id , score, region_stat. clone ( ) ) ) ;
271275 }
272276 }
273277
274- if !candidates. is_empty ( ) {
278+ // Sort candidates by score in descending order and take top N
279+ candidates. sort_by ( |a, b| b. score . cmp ( & a. score ) ) ;
280+ let top_candidates: HashSet < GcCandidate > = candidates
281+ . into_iter ( )
282+ . take ( self . config . regions_per_table_threshold )
283+ . collect ( ) ;
284+
285+ if !top_candidates. is_empty ( ) {
275286 info ! (
276- "Selected {} GC candidates for table {}" ,
277- candidates. len( ) ,
278- table_id
287+ "Selected {} GC candidates for table {} (top {} out of all qualified)" ,
288+ top_candidates. len( ) ,
289+ table_id,
290+ self . config. regions_per_table_threshold
279291 ) ;
280- table_candidates. insert ( * table_id, candidates ) ;
292+ table_candidates. insert ( * table_id, top_candidates ) ;
281293 }
282294 }
283295
@@ -388,9 +400,38 @@ impl GcTrigger {
388400 . get_file_references ( & related_region_ids, & table_peer)
389401 . await ?;
390402
391- // Step 4: Process each candidate region with retry logic
392- let mut successful_regions = 0 ;
403+ // Step 4: Filter out candidates that don't have file references available
404+ let total_candidates = candidates. len ( ) ;
405+ let mut valid_candidates = Vec :: new ( ) ;
393406 for candidate in candidates {
407+ // Check if we have file references for this region
408+ if let Some ( region_route) = table_peer
409+ . region_routes
410+ . iter ( )
411+ . find ( |r| r. region . id == candidate. region_id )
412+ {
413+ if let Some ( peer) = & region_route. leader_peer {
414+ // Check if this peer's file references were successfully obtained
415+ if file_refs_manifest
416+ . manifest_version
417+ . contains_key ( & candidate. region_id )
418+ {
419+ valid_candidates. push ( candidate) ;
420+ } else {
421+ warn ! (
422+ "Skipping region {} due to missing file references (datanode {} may be unavailable)" ,
423+ candidate. region_id, peer
424+ ) ;
425+ }
426+ }
427+ }
428+ }
429+
430+ // Step 5: Process each valid candidate region with retry logic
431+ let valid_candidates_count = valid_candidates. len ( ) ;
432+ let mut successful_regions = 0 ;
433+
434+ for candidate in valid_candidates {
394435 let region_id = candidate. region_id ;
395436 match self
396437 . process_region_gc_with_retry ( candidate, & file_refs_manifest, & table_peer)
@@ -409,10 +450,11 @@ impl GcTrigger {
409450 }
410451
411452 info ! (
412- "Completed GC for table {}: {}/{} regions successful" ,
453+ "Completed GC for table {}: {}/{} regions successful ({} skipped due to missing file references) " ,
413454 table_id,
414455 successful_regions,
415- candidate_region_ids. len( )
456+ valid_candidates_count,
457+ total_candidates - valid_candidates_count
416458 ) ;
417459
418460 Ok ( successful_regions)
@@ -434,7 +476,7 @@ impl GcTrigger {
434476 & self ,
435477 region_ids : & [ RegionId ] ,
436478 table_peer : & PhysicalTableRouteValue ,
437- ) -> Result < TableFileRefsManifest > {
479+ ) -> Result < FileRefsManifest > {
438480 info ! ( "Getting file references for {} regions" , region_ids. len( ) ) ;
439481
440482 // Group regions by datanode to minimize RPC calls
@@ -466,13 +508,17 @@ impl GcTrigger {
466508 all_manifest_versions. extend ( manifest. manifest_version ) ;
467509 }
468510 Err ( e) => {
469- error ! ( "Failed to get file refs from datanode {}: {}" , peer, e) ;
470- return Err ( e) ;
511+ warn ! (
512+ "Failed to get file refs from datanode {}: {}. Skipping regions on this datanode." ,
513+ peer, e
514+ ) ;
515+ // Continue processing other datanodes instead of failing the entire operation
516+ continue ;
471517 }
472518 }
473519 }
474520
475- Ok ( TableFileRefsManifest {
521+ Ok ( FileRefsManifest {
476522 file_refs : all_file_refs,
477523 manifest_version : all_manifest_versions,
478524 } )
@@ -482,7 +528,7 @@ impl GcTrigger {
482528 async fn process_region_gc_with_retry (
483529 & self ,
484530 candidate : GcCandidate ,
485- file_refs_manifest : & TableFileRefsManifest ,
531+ file_refs_manifest : & FileRefsManifest ,
486532 table_peer : & PhysicalTableRouteValue ,
487533 ) -> Result < ( ) > {
488534 let region_id = candidate. region_id ;
@@ -548,7 +594,7 @@ impl GcTrigger {
548594 & self ,
549595 peer : & Peer ,
550596 region_ids : & [ RegionId ] ,
551- ) -> Result < TableFileRefsManifest > {
597+ ) -> Result < FileRefsManifest > {
552598 info ! (
553599 "Sending GetFileRefs instruction to datanode {} for {} regions" ,
554600 peer,
@@ -618,7 +664,7 @@ impl GcTrigger {
618664 & self ,
619665 peer : Peer ,
620666 region_id : RegionId ,
621- file_refs_manifest : & TableFileRefsManifest ,
667+ file_refs_manifest : & FileRefsManifest ,
622668 ) -> Result < ( ) > {
623669 info ! (
624670 "Sending GC instruction to datanode {} for region {}" ,
0 commit comments