Skip to content

Commit ed8a7ec

Browse files
authored
Merge pull request #288 from fabric-testbed/287.maint
Maintenance Mode Issues
2 parents e16684c + a4ae45a commit ed8a7ec

File tree

2 files changed

+75
-4
lines changed

2 files changed

+75
-4
lines changed

fabric_cf/actor/core/container/maintenance.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#
2525
# Author: Komal Thareja ([email protected])
2626
from datetime import datetime, timezone
27-
from typing import List, Dict, Tuple
27+
from typing import List, Dict, Tuple, Union
2828

2929
from fim.slivers.maintenance_mode import MaintenanceInfo, MaintenanceState
3030

@@ -92,16 +92,56 @@ def is_worker_in_maintenance(self, *, worker: str) -> bool:
9292
def __str__(self):
    """
    Return a one-line, human-readable summary of this object
    @return string containing the name, maintenance info and properties
    """
    return f"Name: {self.name} MaintInfo: {self.maintenance_info} Properties: {self.properties}"
9494

95+
def clone_maintenance_info(self) -> Union[MaintenanceInfo, None]:
    """
    Return a copy of the maintenance information held by this object
    @return result of maintenance_info.copy(); None when no maintenance info is set
    """
    # NOTE: the annotation must be Union[X, None]; "X or None" is evaluated
    # first and collapses to X, hiding the None case from type checkers.
    if self.maintenance_info is None:
        return None
    return self.maintenance_info.copy()
99+
100+
def update_maintenance_info(self, maint_info: MaintenanceInfo):
    """
    Replace the maintenance information held by this object
    @param maint_info new maintenance information; the reference is stored as-is (no copy is made)
    """
    self.maintenance_info = maint_info
102+
95103

96104
class Maintenance:
97105
@staticmethod
def update_maintenance_mode(*, database: ABCDatabase, properties: Dict[str, str], sites: List[Site] = None):
    """
    Update Maintenance Mode at Testbed/Site/Worker Level
    - Testbed level Maintenance - single Site object is passed with Name = ALL
    - Site level Maintenance - single Site object per site is passed with Name = SiteName
    - Worker level Maintenance - single Site object per site with one entry per worker
    @param database database
    @param properties properties containing project ids/ user emails
    @param sites Maintenance information for the sites
    """
    # sites defaults to None; nothing to update in that case (or for an empty list)
    if not sites:
        return

    for s in sites:
        # Set the list of allowed projects/users at the site level
        if properties is not None:
            s.set_properties(properties=properties)

        site_name = s.get_name()

        # Get Current Maintenance mode for the Site
        existing_site = database.get_site(site_name=site_name)

        # Adding Maintenance State First Time
        if existing_site is None:
            database.add_site(site=s)
            continue

        incoming_info = s.get_maintenance_info()

        # Site level Maintenance Update: the incoming info carries an entry keyed
        # by the site's own name, so it replaces the stored site entry as a whole
        if incoming_info.get(site_name) is not None:
            database.update_site(site=s)
            continue

        # Worker level Maintenance Update: merge the incoming worker entries
        # into a copy of what is already stored for the site
        # NOTE(review): properties were set on `s` above, but `existing_site` is what
        # gets persisted on this path - confirm whether its properties should be refreshed too
        new_maint_info = existing_site.clone_maintenance_info()
        if new_maint_info is None:
            # Stored site has no maintenance info yet; start from an empty container
            new_maint_info = MaintenanceInfo()
        elif new_maint_info.get(site_name):
            # Drop a stale site-wide entry; worker entries supersede it
            new_maint_info.rem(site_name)

        for worker_name, entry in incoming_info.list_details():
            # Remove existing entry
            if new_maint_info.get(worker_name):
                new_maint_info.rem(worker_name)

            # Add worker entry using the new information only if worker is in Maintenance
            if entry.state != MaintenanceState.Active:
                new_maint_info.add(worker_name, entry)

        new_maint_info.finalize()
        existing_site.update_maintenance_info(maint_info=new_maint_info)
        database.update_site(site=existing_site)
107147

@@ -137,7 +177,7 @@ def is_sliver_provisioning_allowed(*, database: ABCDatabase, project: str, email
137177
"""
138178
status, site = Maintenance.is_site_in_maintenance(database=database, site_name=site)
139179

140-
if not status:
180+
if not status and site is None:
141181
return True, None
142182

143183
projects = site.get_properties().get(Constants.PROJECT_ID)

fabric_cf/actor/core/policy/broker_simpler_units_policy.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from fabric_cf.actor.core.apis.abc_delegation import ABCDelegation
4848
from fabric_cf.actor.core.apis.abc_reservation_mixin import ABCReservationMixin
4949
from fabric_cf.actor.core.common.constants import Constants
50+
from fabric_cf.actor.core.container.maintenance import Maintenance
5051
from fabric_cf.actor.core.delegation.resource_ticket import ResourceTicketFactory
5152
from fabric_cf.actor.core.common.exceptions import BrokerException, ExceptionErrorCode
5253
from fabric_cf.actor.core.kernel.reservation_states import ReservationStates
@@ -508,6 +509,32 @@ def __candidate_nodes(self, *, sliver: NodeSliver) -> List[str]:
508509

509510
return result
510511

512+
def __prune_nodes_in_maintenance(self, node_id_list: List[str], site: str, reservation: ABCBrokerReservation):
    """
    Prune the candidate node list to exclude the workers in Maintenance
    @param node_id_list: Candidate Node List identified to allocate the reservation
    @param site: Site Name
    @param reservation: Reservation to be allocated
    @return the same list object passed in as node_id_list, pruned in place
    """
    slice_obj = reservation.get_slice()
    project_id = slice_obj.get_project_id()
    email = slice_obj.get_owner().get_email()

    # Loop-invariant: resolve the database handle once instead of once per node
    database = self.actor.get_plugin().get_database()

    allowed_nodes = []
    for node_id in node_id_list:
        worker_name = self.get_network_node_from_graph(node_id=node_id).get_name()
        status, error_message = Maintenance.is_sliver_provisioning_allowed(database=database,
                                                                           project=project_id, site=site,
                                                                           worker=worker_name,
                                                                           email=email)
        if status:
            allowed_nodes.append(node_id)
        else:
            self.logger.info(f"Excluding {worker_name} as allocation candidate due to {error_message}")

    # Slice assignment keeps the caller's list object (in-place semantics)
    # while avoiding one O(n) list.remove() per excluded node
    node_id_list[:] = allowed_nodes
    return node_id_list
537+
511538
def __find_first_fit(self, node_id_list: List[str], node_id_to_reservations: dict, inv: InventoryForType,
512539
reservation: ABCBrokerReservation) -> Tuple[str, BaseSliver, Any]:
513540
"""
@@ -571,6 +598,10 @@ def __allocate_nodes(self, *, reservation: ABCBrokerReservation, inv: NetworkNod
571598
delegation_id = None
572599
node_id_list = self.__candidate_nodes(sliver=sliver)
573600

601+
node_id_list = self.__prune_nodes_in_maintenance(node_id_list=node_id_list,
602+
site=sliver.site,
603+
reservation=reservation)
604+
574605
# no candidate nodes found
575606
if len(node_id_list) == 0:
576607
error_msg = f'Insufficient resources: No candidates nodes found to serve {reservation}'

0 commit comments

Comments
 (0)