Skip to content

Management Server - Prepare for Maintenance and Cancel Maintenance improvements #10995

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions agent/src/main/java/com/cloud/agent/Agent.java
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) {
return new SetupCertificateAnswer(true);
}

private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) {
private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final boolean triggerHostLB) {
if (CollectionUtils.isNotEmpty(msList) && StringUtils.isNotEmpty(lbAlgorithm)) {
try {
final String newMSHosts = String.format("%s%s%s", com.cloud.utils.StringUtils.toCSVList(msList), IAgentShell.hostLbAlgorithmSeparator, lbAlgorithm);
Expand All @@ -941,6 +941,12 @@ private void processManagementServerList(final List<String> msList, final List<S
}
}
shell.setAvoidHosts(avoidMsList);
if (triggerHostLB) {
    logger.info("Triggering preferred host task");
    // NOTE(review): this replaces hostLbCheckExecutor without shutting down any executor it previously
    // referenced, and the new instance is not used in this branch — confirm the scheduling call that
    // follows takes care of the old/new executor, otherwise this assignment can be dropped.
    hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory("HostLB-Executor")));
    // One-shot executor used only to run the preferred host check immediately.
    ScheduledExecutorService hostLbExecutor = Executors.newScheduledThreadPool(1);
    hostLbExecutor.schedule(new PreferredHostCheckerTask(), 0, TimeUnit.MILLISECONDS);
    // Release the worker thread once the already-submitted task has run; without this every trigger
    // leaves an idle, never-terminated pool thread behind. By default an orderly shutdown still
    // executes delayed tasks that were submitted before it.
    hostLbExecutor.shutdown();
}
if ("shuffle".equals(lbAlgorithm)) {
scheduleHostLBCheckerTask(0);
} else {
Expand All @@ -949,14 +955,14 @@ private void processManagementServerList(final List<String> msList, final List<S
}

private Answer setupManagementServerList(final SetupMSListCommand cmd) {
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval(), cmd.getTriggerHostLb());
return new SetupMSListAnswer(true);
}

private Answer migrateAgentToOtherMS(final MigrateAgentConnectionCommand cmd) {
try {
if (CollectionUtils.isNotEmpty(cmd.getMsList())) {
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval(), false);
}
Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("MigrateAgentConnection-Job")).schedule(() -> {
migrateAgentConnection(cmd.getAvoidMsList());
Expand Down Expand Up @@ -1046,7 +1052,7 @@ public void processReadyCommand(final Command cmd) {
}

verifyAgentArch(ready.getArch());
processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval());
processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval(), false);

logger.info("Ready command is processed for agent [id: {}, uuid: {}, name: {}]", getId(), getUuid(), getName());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public class OperationTimedoutException extends CloudException {
boolean _isActive;

public OperationTimedoutException(Command[] cmds, long agentId, long seqId, int time, boolean isActive) {
super("Commands " + seqId + " to Host " + agentId + " timed out after " + time);
super("Commands " + seqId + " to Host " + agentId + " timed out after " + time + " secs");
_agentId = agentId;
_seqId = seqId;
_time = time;
Expand Down
7 changes: 5 additions & 2 deletions api/src/main/java/com/cloud/resource/ResourceState.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ public static Event toEvent(String e) {
}
}

/**
 * Resource states that count as "in maintenance": {@code Maintenance}, {@code ErrorInMaintenance},
 * {@code PrepareForMaintenance} and {@code ErrorInPrepareForMaintenance}.
 * The list is immutable ({@link List#of}) and the reference is final so it cannot be swapped at runtime.
 */
public static final List<ResourceState> s_maintenanceStates = List.of(ResourceState.Maintenance,
        ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance,
        ResourceState.ErrorInPrepareForMaintenance);

/**
 * Looks up the state that follows this one for the given event by delegating to {@code s_fsm}.
 *
 * @param a the event to apply to this state
 * @return whatever {@code s_fsm.getNextState(this, a)} returns
 */
public ResourceState getNextState(Event a) {
return s_fsm.getNextState(this, a);
}
Expand All @@ -98,8 +102,7 @@ public static String[] toString(ResourceState... states) {
}

public static boolean isMaintenanceState(ResourceState state) {
return Arrays.asList(ResourceState.Maintenance, ResourceState.ErrorInMaintenance,
ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance).contains(state);
return s_maintenanceStates.contains(state);
}

public static boolean canAttemptMaintenance(ResourceState state) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,7 @@ public class ApiConstants {
public static final String PUBLIC_END_PORT = "publicendport";
public static final String PUBLIC_ZONE = "publiczone";
public static final String PURGE_RESOURCES = "purgeresources";
public static final String REBALANCE = "rebalance";
public static final String RECEIVED_BYTES = "receivedbytes";
public static final String RECONNECT = "reconnect";
public static final String RECOVER = "recover";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public class PatchSystemVMCmd extends BaseAsyncCmd {
// The two description fragments are concatenated, so the first one must end with a space.
@Parameter(name = ApiConstants.FORCED, type = CommandType.BOOLEAN,
description = "If true, initiates copy of scripts and restart of the agent, even if the scripts version matches. " +
"To be used with ID parameter only")
private Boolean force;
private Boolean forced;

/////////////////////////////////////////////////////
/////////////////// Accessors ///////////////////////
Expand All @@ -58,7 +58,7 @@ public Long getId() {
}

public boolean isForced() {
return force != null && force;
return forced != null && forced;
}

/////////////////////////////////////////////////////
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,15 @@ public class SetupMSListCommand extends Command {
private List<String> avoidMsList;
private String lbAlgorithm;
private Long lbCheckInterval;
private Boolean triggerHostLb;

public SetupMSListCommand(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) {
public SetupMSListCommand(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final Boolean triggerHostLb) {
super();
this.msList = msList;
this.avoidMsList = avoidMsList;
this.lbAlgorithm = lbAlgorithm;
this.lbCheckInterval = lbCheckInterval;
this.triggerHostLb = triggerHostLb;
}

public List<String> getMsList() {
Expand All @@ -54,9 +56,12 @@ public Long getLbCheckInterval() {
return lbCheckInterval;
}

/**
 * Tells whether the receiver should trigger the host load-balancing check.
 * The backing field is a nullable {@code Boolean}; an unset value is treated as {@code false}
 * instead of failing with a {@link NullPointerException} on unboxing.
 *
 * @return {@code true} only when the flag is set and true
 */
public boolean getTriggerHostLb() {
    return triggerHostLb != null && triggerHostLb;
}

/**
 * Always returns {@code false} for this command.
 *
 * @return {@code false}
 */
@Override
public boolean executeInSequence() {
return false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -171,5 +171,5 @@ enum TapAgentsAction {

void propagateChangeToAgents(Map<String, String> params);

boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs);
boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance);
}
Original file line number Diff line number Diff line change
Expand Up @@ -273,8 +273,6 @@ public boolean configure(final String name, final Map<String, Object> params) th

_executor = new ThreadPoolExecutor(agentTaskThreads, agentTaskThreads, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue<>(), new NamedThreadFactory("AgentTaskPool"));

initConnectExecutor();

maxConcurrentNewAgentConnections = RemoteAgentMaxConcurrentNewConnections.value();

_connection = new NioServer("AgentManager", Port.value(), Workers.value() + 10,
Expand Down Expand Up @@ -828,6 +826,7 @@ public boolean start() {
return true;
}

initConnectExecutor();
startDirectlyConnectedHosts(false);

if (_connection != null) {
Expand Down Expand Up @@ -2193,7 +2192,7 @@ public void propagateChangeToAgents(Map<String, String> params) {
}

@Override
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) {
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) {
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLEngine;

import com.cloud.resource.ResourceState;
import org.apache.cloudstack.ca.CAManager;
import org.apache.cloudstack.framework.config.ConfigDepot;
import org.apache.cloudstack.framework.config.ConfigKey;
Expand Down Expand Up @@ -431,10 +432,10 @@ public boolean routeToPeer(final String peer, final byte[] bytes) {
ch = connectToPeer(peer, ch);
if (ch == null) {
try {
logD(bytes, "Unable to route to peer: " + Request.parse(bytes));
logD(bytes, "Unable to establish connection to route to peer: " + Request.parse(bytes));
} catch (ClassNotFoundException | UnsupportedVersionException e) {
// Request.parse thrown exception when we try to log it, log as much as we can
logD(bytes, "Unable to route to peer, and Request.parse further caught exception" + e.getMessage());
logD(bytes, "Unable to establish connection to route to peer, and Request.parse further caught exception" + e.getMessage());
}
return false;
}
Expand Down Expand Up @@ -643,7 +644,6 @@ protected void doTask(final Task task) throws TaskExecutionException {
final Link link = task.getLink();

if (Request.fromServer(data)) {

final AgentAttache agent = findAttache(hostId);

if (Request.isControl(data)) {
Expand Down Expand Up @@ -691,7 +691,6 @@ protected void doTask(final Task task) throws TaskExecutionException {
cancel(Long.toString(Request.getManagementServerId(data)), hostId, Request.getSequence(data), e.getMessage());
}
} else {

final long mgmtId = Request.getManagementServerId(data);
if (mgmtId != -1 && mgmtId != _nodeId) {
routeToPeer(Long.toString(mgmtId), data);
Expand Down Expand Up @@ -1352,7 +1351,7 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS
if (cmd instanceof PrepareForMaintenanceManagementServerHostCommand) {
logger.debug("Received PrepareForMaintenanceManagementServerHostCommand - preparing for maintenance");
try {
managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm());
managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm(), ((PrepareForMaintenanceManagementServerHostCommand) cmd).isForced());
return "Successfully prepared for maintenance";
} catch(CloudRuntimeException e) {
return e.getMessage();
Expand Down Expand Up @@ -1399,14 +1398,14 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS
}

@Override
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) {
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) {
if (timeoutDurationInMs <= 0) {
logger.debug("Not transferring direct agents from management server node {} (id: {}) to other nodes, invalid timeout duration", fromMsId, fromMsUuid);
return false;
}

long transferStartTimeInMs = System.currentTimeMillis();
if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId))) {
if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId, excludeHostsInMaintenance))) {
logger.info("No direct agent hosts available on management server node {} (id: {}), to transfer", fromMsId, fromMsUuid);
return true;
}
Expand All @@ -1421,7 +1420,7 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long
int agentTransferFailedCount = 0;
List<DataCenterVO> dataCenterList = dcDao.listAll();
for (DataCenterVO dc : dataCenterList) {
List<HostVO> directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId());
List<HostVO> directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId(), excludeHostsInMaintenance);
if (CollectionUtils.isEmpty(directAgentHostsInDc)) {
continue;
}
Expand Down Expand Up @@ -1455,9 +1454,10 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long
return (agentTransferFailedCount == 0);
}

private List<HostVO> getDirectAgentHosts(long msId) {
private List<HostVO> getDirectAgentHosts(long msId, boolean excludeHostsInMaintenance) {
List<HostVO> directAgentHosts = new ArrayList<>();
List<HostVO> hosts = _hostDao.listHostsByMs(msId);
List<ResourceState> statesToExclude = excludeHostsInMaintenance ? ResourceState.s_maintenanceStates : List.of();
List<HostVO> hosts = _hostDao.listHostsByMsResourceState(msId, statesToExclude);
for (HostVO host : hosts) {
AgentAttache agent = findAttache(host.getId());
if (agent instanceof DirectAgentAttache) {
Expand All @@ -1468,9 +1468,11 @@ private List<HostVO> getDirectAgentHosts(long msId) {
return directAgentHosts;
}

private List<HostVO> getDirectAgentHostsInDc(long msId, long dcId) {
private List<HostVO> getDirectAgentHostsInDc(long msId, long dcId, boolean excludeHostsInMaintenance) {
List<HostVO> directAgentHosts = new ArrayList<>();
List<HostVO> hosts = _hostDao.listHostsByMsAndDc(msId, dcId);
// To exclude maintenance states use values from ResourceState as source of truth
List<ResourceState> statesToExclude = excludeHostsInMaintenance ? ResourceState.s_maintenanceStates : List.of();
List<HostVO> hosts = _hostDao.listHostsByMsDcResourceState(msId, dcId, statesToExclude);
for (HostVO host : hosts) {
AgentAttache agent = findAttache(host.getId());
if (agent instanceof DirectAgentAttache) {
Expand Down Expand Up @@ -1506,6 +1508,10 @@ public void onManagementServerPreparingForMaintenance() {
// Invoked when a management server's "prepare for maintenance" is cancelled: logs the event,
// calls into the parent implementation and clears the _agentLbHappened flag.
public void onManagementServerCancelPreparingForMaintenance() {
logger.debug("Management server cancel preparing for maintenance");
// NOTE(review): this cancel handler calls the parent's onManagementServerPreparingForMaintenance()
// rather than a cancel counterpart — looks like a possible copy-paste slip; confirm against the parent class.
super.onManagementServerPreparingForMaintenance();

// needed for the case when Management Server in Preparing For Maintenance but didn't go to Maintenance state
// (where this variable will be reset)
_agentLbHappened = false;
}

@Override
Expand Down
18 changes: 14 additions & 4 deletions engine/schema/src/main/java/com/cloud/host/dao/HostDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -177,14 +177,24 @@ public interface HostDao extends GenericDao<HostVO, Long>, StateDao<Status, Stat

List<HostVO> listHostsByMsAndDc(long msId, long dcId);

/**
 * Lists the hosts of the given management server and data center, leaving out hosts whose
 * resource state is one of the excluded states.
 *
 * @param msId Management Server Id
 * @param dcId Data Center Id
 * @param excludedResourceStates Resource States to be excluded
 * @return matching hosts
 */
List<HostVO> listHostsByMsDcResourceState(long msId, long dcId, List<ResourceState> excludedResourceStates);

List<HostVO> listHostsByMs(long msId);

/**
 * Lists the hosts of the given management server, leaving out hosts whose resource state is
 * one of the excluded states.
 *
 * @param msId Management Server Id
 * @param excludedResourceStates Resource States to be excluded
 * @return matching hosts
 */
List<HostVO> listHostsByMsResourceState(long msId, List<ResourceState> excludedResourceStates);

/**
* Retrieves the number of hosts/agents this {@see ManagementServer} has responsibility over.
* @param msId the id of the {@see ManagementServer}
* @return the number of hosts/agents this {@see ManagementServer} has responsibility over
* Count Hosts by given Management Server, Host and Hypervisor Types,
* and exclude Hosts with given Resource States.
*
* @param msId Management Server Id
* @param excludedResourceStates Resource States to be excluded
* @param hostTypes Host Types
* @param hypervisorTypes Hypervisor Types
* @return Hosts count
*/
int countByMs(long msId);
int countHostsByMsResourceStateTypeAndHypervisorType(long msId, List<ResourceState> excludedResourceStates,
List<Type> hostTypes, List<HypervisorType> hypervisorTypes);

/**
* Retrieves the host ids/agents this {@see ManagementServer} has responsibility over.
Expand Down
42 changes: 38 additions & 4 deletions engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
import com.cloud.utils.db.GenericSearchBuilder;
import com.cloud.utils.db.JoinBuilder;
import com.cloud.utils.db.JoinBuilder.JoinType;
import com.cloud.utils.db.QueryBuilder;
import com.cloud.utils.db.SearchBuilder;
import com.cloud.utils.db.SearchCriteria;
import com.cloud.utils.db.SearchCriteria.Func;
Expand Down Expand Up @@ -1600,6 +1601,17 @@ public List<HostVO> listHostsByMsAndDc(long msId, long dcId) {
return listBy(sc);
}

/**
 * Lists hosts owned by the given management server within the given data center,
 * skipping hosts whose resource state is in {@code excludedResourceStates}.
 * A null or empty exclusion list applies no resource-state filter.
 */
@Override
public List<HostVO> listHostsByMsDcResourceState(long msId, long dcId, List<ResourceState> excludedResourceStates) {
    final QueryBuilder<HostVO> query = QueryBuilder.create(HostVO.class);
    query.and(query.entity().getManagementServerId(), Op.EQ, msId);
    query.and(query.entity().getDataCenterId(), Op.EQ, dcId);

    // Only add the NOT-IN clause when there is something to exclude.
    if (CollectionUtils.isNotEmpty(excludedResourceStates)) {
        final Object[] excluded = excludedResourceStates.toArray();
        query.and(query.entity().getResourceState(), Op.NIN, excluded);
    }

    final SearchCriteria<HostVO> criteria = query.create();
    return listBy(criteria);
}

@Override
public List<HostVO> listHostsByMs(long msId) {
SearchCriteria<HostVO> sc = ResponsibleMsSearch.create();
Expand All @@ -1608,10 +1620,32 @@ public List<HostVO> listHostsByMs(long msId) {
}

@Override
public int countByMs(long msId) {
SearchCriteria<HostVO> sc = ResponsibleMsSearch.create();
sc.setParameters("managementServerId", msId);
return getCount(sc);
public List<HostVO> listHostsByMsResourceState(long msId, List<ResourceState> excludedResourceStates) {
    // Hosts owned by the given management server ...
    final QueryBuilder<HostVO> query = QueryBuilder.create(HostVO.class);
    query.and(query.entity().getManagementServerId(), Op.EQ, msId);

    // ... minus those in any of the excluded resource states (no filter when the list is null/empty).
    if (CollectionUtils.isNotEmpty(excludedResourceStates)) {
        final Object[] excluded = excludedResourceStates.toArray();
        query.and(query.entity().getResourceState(), Op.NIN, excluded);
    }

    final SearchCriteria<HostVO> criteria = query.create();
    return listBy(criteria);
}

/**
 * Counts hosts owned by the given management server. Each list argument adds a filter only
 * when it is non-empty: resource states are excluded (NOT IN), while host types and
 * hypervisor types are matched (IN).
 */
@Override
public int countHostsByMsResourceStateTypeAndHypervisorType(long msId,
        List<ResourceState> excludedResourceStates,
        List<Type> hostTypes,
        List<HypervisorType> hypervisorTypes) {
    final QueryBuilder<HostVO> query = QueryBuilder.create(HostVO.class);
    query.and(query.entity().getManagementServerId(), Op.EQ, msId);

    if (CollectionUtils.isNotEmpty(excludedResourceStates)) {
        final Object[] excludedStates = excludedResourceStates.toArray();
        query.and(query.entity().getResourceState(), Op.NIN, excludedStates);
    }

    if (CollectionUtils.isNotEmpty(hostTypes)) {
        final Object[] allowedHostTypes = hostTypes.toArray();
        query.and(query.entity().getType(), Op.IN, allowedHostTypes);
    }

    if (CollectionUtils.isNotEmpty(hypervisorTypes)) {
        final Object[] allowedHypervisors = hypervisorTypes.toArray();
        query.and(query.entity().getHypervisorType(), Op.IN, allowedHypervisors);
    }

    final SearchCriteria<HostVO> criteria = query.create();
    return getCount(criteria);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,11 @@ public interface IndirectAgentLB {
*/
Long getLBPreferredHostCheckInterval(Long clusterId);

void propagateMSListToAgents();
void propagateMSListToAgents(boolean triggerHostLB);

boolean haveAgentBasedHosts(long msId);
void propagateMSListToAgentsInCluster(Long clusterId);

boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs);
boolean haveAgentBasedHosts(long msId, boolean excludeHostsInMaintenance);

boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs, boolean excludeHostsInMaintenance);
}
Loading
Loading