@@ -436,34 +436,30 @@ def is_backing_instance_valid(
436
436
return self .ec2_backing_instance_valid
437
437
# Set ec2_backing_instance_valid to True since it will be the result most often
438
438
self .ec2_backing_instance_valid = True
439
- if self .is_nodeaddr_set ():
440
- if not self .instance :
439
+ if self .is_nodeaddr_set () and not self .instance :
440
+ if log_warn_if_unhealthy :
441
+ logger .warning (
442
+ "Node state check: no corresponding instance in EC2 for node %s, node state: %s" ,
443
+ self ,
444
+ self .state_string ,
445
+ )
446
+ # Allow a few iterations for the eventual consistency of EC2 data
447
+ logger .debug (f"Map of slurm nodes without backing instances { nodes_without_backing_instance_count_map } " )
448
+ missing_instance_loop_count = nodes_without_backing_instance_count_map .get (self .name , 0 )
449
+ # If the loop count has been reached, the instance is unhealthy and will be terminated
450
+ if missing_instance_loop_count >= ec2_instance_missing_max_count :
451
+ if log_warn_if_unhealthy :
452
+ logger .warning (f"EC2 instance availability for node { self .name } has timed out." )
453
+ # Remove the slurm node from the map since a new instance will be launched
454
+ nodes_without_backing_instance_count_map .pop (self .name , None )
455
+ self .ec2_backing_instance_valid = False
456
+ else :
457
+ nodes_without_backing_instance_count_map [self .name ] = missing_instance_loop_count + 1
441
458
if log_warn_if_unhealthy :
442
459
logger .warning (
443
- "Node state check: no corresponding instance in EC2 for node %s, node state: %s" ,
444
- self ,
445
- self .state_string ,
460
+ f"Incrementing missing EC2 instance count for node { self .name } to "
461
+ f"{ nodes_without_backing_instance_count_map [self .name ]} ."
446
462
)
447
- # Allow a few iterations for the eventual consistency of EC2 data
448
- logger .debug (f"Map of slurm nodes without backing instances { nodes_without_backing_instance_count_map } " )
449
- missing_instance_loop_count = nodes_without_backing_instance_count_map .get (self .name , 0 )
450
- # If the loop count has been reached, the instance is unhealthy and will be terminated
451
- if missing_instance_loop_count >= ec2_instance_missing_max_count :
452
- if log_warn_if_unhealthy :
453
- logger .warning (f"EC2 instance availability for node { self .name } has timed out." )
454
- # Remove the slurm node from the map since a new instance will be launched
455
- nodes_without_backing_instance_count_map .pop (self .name , None )
456
- self .ec2_backing_instance_valid = False
457
- else :
458
- nodes_without_backing_instance_count_map [self .name ] = missing_instance_loop_count + 1
459
- if log_warn_if_unhealthy :
460
- logger .warning (
461
- f"Incrementing missing EC2 instance count for node { self .name } to "
462
- f"{ nodes_without_backing_instance_count_map [self .name ]} ."
463
- )
464
- else :
465
- # Remove the slurm node from the map since the instance is healthy
466
- nodes_without_backing_instance_count_map .pop (self .name , None )
467
463
else :
468
464
# Remove the slurm node from the map since the instance is healthy
469
465
nodes_without_backing_instance_count_map .pop (self .name , None )
0 commit comments