@@ -767,6 +767,7 @@ def test_handle_health_check(
767
767
region = "region" ,
768
768
boto3_config = None ,
769
769
fleet_config = {},
770
+ ec2_instance_missing_max_count = 0 ,
770
771
)
771
772
772
773
cluster_manager = ClusterManager (mock_sync_config )
@@ -831,6 +832,7 @@ def test_update_static_nodes_in_replacement(current_replacing_nodes, slurm_nodes
831
832
region = "region" ,
832
833
boto3_config = None ,
833
834
fleet_config = {},
835
+ ec2_instance_missing_max_count = 0 ,
834
836
)
835
837
cluster_manager = ClusterManager (mock_sync_config )
836
838
cluster_manager ._static_nodes_in_replacement = current_replacing_nodes
@@ -2646,43 +2648,61 @@ def initialize_console_logger_mock(mocker):
2646
2648
2647
2649
2648
2650
@pytest .mark .parametrize (
2649
- "current_replacing_nodes, node, instance, current_time, expected_result" ,
2651
+ "current_replacing_nodes, node, instance, current_time, max_count, expected_result" ,
2650
2652
[
2651
2653
(
2652
2654
set (),
2653
2655
StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "IDLE+CLOUD" , "queue1" ),
2654
2656
EC2Instance ("id-1" , "ip-1" , "hostname" , {"ip-1" }, datetime (2020 , 1 , 1 , 0 , 0 , 0 )),
2655
2657
datetime (2020 , 1 , 1 , 0 , 0 , 29 ),
2658
+ 0 ,
2656
2659
False ,
2657
2660
),
2658
2661
(
2659
2662
{"queue1-st-c5xlarge-1" },
2660
2663
StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "IDLE+CLOUD" , "queue1" ),
2661
2664
None ,
2662
2665
datetime (2020 , 1 , 1 , 0 , 0 , 29 ),
2666
+ 0 ,
2663
2667
False ,
2664
2668
),
2665
2669
(
2666
2670
{"queue1-st-c5xlarge-1" },
2667
2671
StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "DOWN+CLOUD" , "queue1" ),
2668
2672
EC2Instance ("id-1" , "ip-1" , "hostname" , {"ip-1" }, datetime (2020 , 1 , 1 , 0 , 0 , 0 )),
2669
2673
datetime (2020 , 1 , 1 , 0 , 0 , 29 ),
2674
+ 0 ,
2670
2675
True ,
2671
2676
),
2672
2677
(
2673
2678
{"queue1-st-c5xlarge-1" },
2674
2679
StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "IDLE+CLOUD" , "queue1" ),
2675
2680
EC2Instance ("id-1" , "ip-1" , "hostname" , {"ip-1" }, datetime (2020 , 1 , 1 , 0 , 0 , 0 )),
2676
2681
datetime (2020 , 1 , 1 , 0 , 0 , 30 ),
2682
+ 0 ,
2677
2683
False ,
2678
2684
),
2685
+ (
2686
+ {"queue1-st-c5xlarge-1" },
2687
+ StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "IDLE+CLOUD" , "queue1" ),
2688
+ None ,
2689
+ datetime (2020 , 1 , 1 , 0 , 0 , 30 ),
2690
+ 1 ,
2691
+ True ,
2692
+ ),
2693
+ ],
2694
+ ids = [
2695
+ "not_in_replacement" ,
2696
+ "no-backing-instance" ,
2697
+ "in_replacement" ,
2698
+ "timeout" ,
2699
+ "no-backing-instance-with-max-count" ,
2679
2700
],
2680
- ids = ["not_in_replacement" , "no-backing-instance" , "in_replacement" , "timeout" ],
2681
2701
)
2682
2702
@pytest .mark .usefixtures (
2683
2703
"initialize_instance_manager_mock" , "initialize_executor_mock" , "initialize_console_logger_mock"
2684
2704
)
2685
- def test_is_node_being_replaced (current_replacing_nodes , node , instance , current_time , expected_result ):
2705
+ def test_is_node_being_replaced (current_replacing_nodes , node , instance , current_time , max_count , expected_result ):
2686
2706
mock_sync_config = SimpleNamespace (
2687
2707
node_replacement_timeout = 30 ,
2688
2708
insufficient_capacity_timeout = 3 ,
@@ -2691,6 +2711,7 @@ def test_is_node_being_replaced(current_replacing_nodes, node, instance, current
2691
2711
region = "region" ,
2692
2712
boto3_config = None ,
2693
2713
fleet_config = {},
2714
+ ec2_instance_missing_max_count = max_count ,
2694
2715
)
2695
2716
cluster_manager = ClusterManager (mock_sync_config )
2696
2717
cluster_manager ._current_time = current_time
@@ -2700,24 +2721,34 @@ def test_is_node_being_replaced(current_replacing_nodes, node, instance, current
2700
2721
2701
2722
2702
2723
@pytest .mark .parametrize (
2703
- "node, instance, current_node_in_replacement, is_replacement_timeout" ,
2724
+ "node, instance, current_node_in_replacement, max_count, is_replacement_timeout" ,
2704
2725
[
2705
2726
(
2706
2727
StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "DOWN+CLOUD+NOT_RESPONDING" , "queue1" ),
2707
2728
None ,
2708
2729
{"queue1-st-c5xlarge-1" },
2730
+ 0 ,
2731
+ False ,
2732
+ ),
2733
+ (
2734
+ StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "DOWN+CLOUD+NOT_RESPONDING" , "queue1" ),
2735
+ None ,
2736
+ {"queue1-st-c5xlarge-1" },
2737
+ 1 ,
2709
2738
False ,
2710
2739
),
2711
2740
(
2712
2741
StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "DOWN+CLOUD+NOT_RESPONDING" , "queue1" ),
2713
2742
EC2Instance ("id-1" , "ip-1" , "hostname" , {"ip-1" }, datetime (2020 , 1 , 1 , 0 , 0 , 0 )),
2714
2743
{"queue1-st-c5xlarge-1" },
2744
+ 0 ,
2715
2745
True ,
2716
2746
),
2717
2747
(
2718
2748
DynamicNode ("queue1-dy-c5xlarge-1" , "ip-1" , "hostname" , "MIXED+CLOUD+NOT_RESPONDING+POWERING_UP" , "queue1" ),
2719
2749
None ,
2720
2750
{"some_node_in_replacement" },
2751
+ 0 ,
2721
2752
False ,
2722
2753
),
2723
2754
(
@@ -2730,20 +2761,22 @@ def test_is_node_being_replaced(current_replacing_nodes, node, instance, current
2730
2761
),
2731
2762
EC2Instance ("id-1" , "ip-1" , "hostname" , {"ip-1" }, datetime (2020 , 1 , 1 , 0 , 0 , 0 )),
2732
2763
{"some_node_in_replacement" },
2764
+ 0 ,
2733
2765
False ,
2734
2766
),
2735
2767
(
2736
2768
StaticNode ("queue1-st-c5xlarge-1" , "ip-1" , "hostname" , "DOWN+CLOUD+NOT_RESPONDING" , "queue1" ),
2737
2769
EC2Instance ("id-1" , "ip-1" , "hostname" , {"ip-1" }, datetime (2020 , 1 , 1 , 0 , 0 , 0 )),
2738
2770
{"some_node_in_replacement" },
2771
+ 0 ,
2739
2772
False ,
2740
2773
),
2741
2774
],
2742
2775
)
2743
2776
@pytest .mark .usefixtures (
2744
2777
"initialize_instance_manager_mock" , "initialize_executor_mock" , "initialize_console_logger_mock"
2745
2778
)
2746
- def test_is_node_replacement_timeout (node , current_node_in_replacement , is_replacement_timeout , instance ):
2779
+ def test_is_node_replacement_timeout (node , current_node_in_replacement , max_count , is_replacement_timeout , instance ):
2747
2780
node .instance = instance
2748
2781
mock_sync_config = SimpleNamespace (
2749
2782
node_replacement_timeout = 30 ,
@@ -2753,6 +2786,7 @@ def test_is_node_replacement_timeout(node, current_node_in_replacement, is_repla
2753
2786
region = "region" ,
2754
2787
boto3_config = None ,
2755
2788
fleet_config = {},
2789
+ ec2_instance_missing_max_count = 0 ,
2756
2790
)
2757
2791
cluster_manager = ClusterManager (mock_sync_config )
2758
2792
cluster_manager ._current_time = datetime (2020 , 1 , 2 , 0 , 0 , 0 )
0 commit comments