diff --git a/agent/src/main/java/com/cloud/agent/Agent.java b/agent/src/main/java/com/cloud/agent/Agent.java index fcd4234a1361..b7c24e5126cb 100644 --- a/agent/src/main/java/com/cloud/agent/Agent.java +++ b/agent/src/main/java/com/cloud/agent/Agent.java @@ -453,22 +453,30 @@ private void scheduleCertificateRenewalTask() { certExecutor.schedule(new PostCertificateRenewalTask(this), 5, TimeUnit.SECONDS); } - private void scheduleHostLBCheckerTask(final long checkInterval) { + private void scheduleHostLBCheckerTask(final String lbAlgorithm, final long checkInterval) { String name = "HostLBCheckerTask"; if (hostLbCheckExecutor != null && !hostLbCheckExecutor.isShutdown()) { + logger.info("Shutting down the preferred host checker task {}", name); hostLbCheckExecutor.shutdown(); try { if (!hostLbCheckExecutor.awaitTermination(1, TimeUnit.SECONDS)) { hostLbCheckExecutor.shutdownNow(); } } catch (InterruptedException e) { - logger.debug("Forcing {} shutdown as it did not shutdown in the desired time due to: {}", + logger.debug("Forcing the preferred host checker task {} shutdown as it did not shutdown in the desired time due to: {}", name, e.getMessage()); hostLbCheckExecutor.shutdownNow(); } } if (checkInterval > 0L) { - logger.info("Scheduling preferred host task with host.lb.interval={}ms", checkInterval); + if ("shuffle".equalsIgnoreCase(lbAlgorithm)) { + logger.info("Scheduling the preferred host checker task to trigger once (to apply lb algorithm '{}') after host.lb.interval={} ms", lbAlgorithm, checkInterval); + hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory(name))); + hostLbCheckExecutor.schedule(new PreferredHostCheckerTask(), checkInterval, TimeUnit.MILLISECONDS); + return; + } + + logger.info("Scheduling a recurring preferred host checker task with lb algorithm '{}' and host.lb.interval={} ms", lbAlgorithm, checkInterval); hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory(name))); 
hostLbCheckExecutor.scheduleAtFixedRate(new PreferredHostCheckerTask(), checkInterval, checkInterval, TimeUnit.MILLISECONDS); @@ -928,7 +936,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) { return new SetupCertificateAnswer(true); } - private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) { + private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final boolean triggerHostLB) { if (CollectionUtils.isNotEmpty(msList) && StringUtils.isNotEmpty(lbAlgorithm)) { try { final String newMSHosts = String.format("%s%s%s", com.cloud.utils.StringUtils.toCSVList(msList), IAgentShell.hostLbAlgorithmSeparator, lbAlgorithm); @@ -941,22 +949,24 @@ private void processManagementServerList(final List<String> msList, final List<S } } shell.setAvoidHosts(avoidMsList); - if ("shuffle".equals(lbAlgorithm)) { - scheduleHostLBCheckerTask(0); - } else { - scheduleHostLBCheckerTask(shell.getLbCheckerInterval(lbCheckInterval)); + if (triggerHostLB) { + logger.info("Triggering the preferred host checker task now"); + ScheduledExecutorService hostLbExecutor = Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("HostLB-Executor")); + hostLbExecutor.schedule(new PreferredHostCheckerTask(), 0, TimeUnit.MILLISECONDS); + hostLbExecutor.shutdown(); } + scheduleHostLBCheckerTask(lbAlgorithm, shell.getLbCheckerInterval(lbCheckInterval)); } private Answer setupManagementServerList(final SetupMSListCommand cmd) { - processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval()); + processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval(), cmd.getTriggerHostLb()); return new SetupMSListAnswer(true); } private Answer migrateAgentToOtherMS(final 
MigrateAgentConnectionCommand cmd) { try { if (CollectionUtils.isNotEmpty(cmd.getMsList())) { - processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval()); + processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval(), false); } Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("MigrateAgentConnection-Job")).schedule(() -> { migrateAgentConnection(cmd.getAvoidMsList()); @@ -1046,7 +1056,7 @@ public void processReadyCommand(final Command cmd) { } verifyAgentArch(ready.getArch()); - processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval()); + processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval(), false); logger.info("Ready command is processed for agent [id: {}, uuid: {}, name: {}]", getId(), getUuid(), getName()); } diff --git a/api/src/main/java/com/cloud/exception/OperationTimedoutException.java b/api/src/main/java/com/cloud/exception/OperationTimedoutException.java index fe27408eb4e3..66b607100d97 100644 --- a/api/src/main/java/com/cloud/exception/OperationTimedoutException.java +++ b/api/src/main/java/com/cloud/exception/OperationTimedoutException.java @@ -40,7 +40,7 @@ public class OperationTimedoutException extends CloudException { boolean _isActive; public OperationTimedoutException(Command[] cmds, long agentId, long seqId, int time, boolean isActive) { - super("Commands " + seqId + " to Host " + agentId + " timed out after " + time); + super("Commands " + seqId + " to Host " + agentId + " timed out after " + time + " secs"); _agentId = agentId; _seqId = seqId; _time = time; diff --git a/api/src/main/java/com/cloud/resource/ResourceState.java b/api/src/main/java/com/cloud/resource/ResourceState.java index 70738c7921bc..e91cf820b081 100644 --- 
a/api/src/main/java/com/cloud/resource/ResourceState.java +++ b/api/src/main/java/com/cloud/resource/ResourceState.java @@ -76,6 +76,10 @@ public static Event toEvent(String e) { } } + public static final List<ResourceState> s_maintenanceStates = List.of(ResourceState.Maintenance, + ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance, + ResourceState.ErrorInPrepareForMaintenance); /* final + List.of: shared list of maintenance states can be neither reassigned nor mutated */ + public ResourceState getNextState(Event a) { return s_fsm.getNextState(this, a); } @@ -98,8 +102,7 @@ public static String[] toString(ResourceState... states) { } public static boolean isMaintenanceState(ResourceState state) { - return Arrays.asList(ResourceState.Maintenance, ResourceState.ErrorInMaintenance, - ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance).contains(state); + return s_maintenanceStates.contains(state); } public static boolean canAttemptMaintenance(ResourceState state) { diff --git a/api/src/main/java/org/apache/cloudstack/api/ApiConstants.java b/api/src/main/java/org/apache/cloudstack/api/ApiConstants.java index 304e43009f26..36929e74a54d 100644 --- a/api/src/main/java/org/apache/cloudstack/api/ApiConstants.java +++ b/api/src/main/java/org/apache/cloudstack/api/ApiConstants.java @@ -432,6 +432,7 @@ public class ApiConstants { public static final String PUBLIC_END_PORT = "publicendport"; public static final String PUBLIC_ZONE = "publiczone"; public static final String PURGE_RESOURCES = "purgeresources"; + public static final String REBALANCE = "rebalance"; public static final String RECEIVED_BYTES = "receivedbytes"; public static final String RECONNECT = "reconnect"; public static final String RECOVER = "recover"; diff --git a/api/src/main/java/org/apache/cloudstack/api/command/admin/systemvm/PatchSystemVMCmd.java b/api/src/main/java/org/apache/cloudstack/api/command/admin/systemvm/PatchSystemVMCmd.java index 4f4b26316673..eafee7424ffc 100644 --- 
a/api/src/main/java/org/apache/cloudstack/api/command/admin/systemvm/PatchSystemVMCmd.java +++ b/api/src/main/java/org/apache/cloudstack/api/command/admin/systemvm/PatchSystemVMCmd.java @@ -46,7 +46,7 @@ public class PatchSystemVMCmd extends BaseAsyncCmd { @Parameter(name = ApiConstants.FORCED, type = CommandType.BOOLEAN, description = "If true, initiates copy of scripts and restart of the agent, even if the scripts version matches." + "To be used with ID parameter only") - private Boolean force; + private Boolean forced; ///////////////////////////////////////////////////// /////////////////// Accessors /////////////////////// @@ -58,7 +58,7 @@ public Long getId() { } public boolean isForced() { - return force != null && force; + return forced != null && forced; } ///////////////////////////////////////////////////// diff --git a/core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java b/core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java index 32f436434c17..864a3e22eb3e 100644 --- a/core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java +++ b/core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java @@ -29,13 +29,15 @@ public class SetupMSListCommand extends Command { private List<String> avoidMsList; private String lbAlgorithm; private Long lbCheckInterval; + private Boolean triggerHostLb; - public SetupMSListCommand(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) { + public SetupMSListCommand(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final Boolean triggerHostLb) { super(); this.msList = msList; this.avoidMsList = avoidMsList; this.lbAlgorithm = lbAlgorithm; this.lbCheckInterval = lbCheckInterval; + this.triggerHostLb = triggerHostLb; } public List<String> getMsList() { @@ -54,9 +56,12 @@ public Long getLbCheckInterval() { return lbCheckInterval; } + 
public boolean getTriggerHostLb() { + return Boolean.TRUE.equals(triggerHostLb); /* null-safe: an unset (null) flag is treated as false instead of throwing on unboxing */ + } + @Override public boolean executeInSequence() { return false; } - } diff --git a/engine/components-api/src/main/java/com/cloud/agent/AgentManager.java b/engine/components-api/src/main/java/com/cloud/agent/AgentManager.java index dd388d2a2d8a..0aa5805b1601 100644 --- a/engine/components-api/src/main/java/com/cloud/agent/AgentManager.java +++ b/engine/components-api/src/main/java/com/cloud/agent/AgentManager.java @@ -171,5 +171,5 @@ enum TapAgentsAction { void propagateChangeToAgents(Map<String, String> params); - boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs); + boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance); } diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java index 585c479f65f8..2b8eb3cc5c74 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java @@ -273,8 +273,6 @@ public boolean configure(final String name, final Map<String, Object> params) th _executor = new ThreadPoolExecutor(agentTaskThreads, agentTaskThreads, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue<>(), new NamedThreadFactory("AgentTaskPool")); - initConnectExecutor(); - maxConcurrentNewAgentConnections = RemoteAgentMaxConcurrentNewConnections.value(); _connection = new NioServer("AgentManager", Port.value(), Workers.value() + 10, @@ -828,6 +826,7 @@ public boolean start() { return true; } + initConnectExecutor(); startDirectlyConnectedHosts(false); if (_connection != null) { @@ -2193,7 +2192,7 @@ public void propagateChangeToAgents(Map<String, String> params) { } @Override - public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long 
timeoutDurationInMs) { + public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) { return true; } diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java index 8795c8d428fd..a7dca34f0321 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java @@ -42,6 +42,7 @@ import javax.net.ssl.SSLContext; import javax.net.ssl.SSLEngine; +import com.cloud.resource.ResourceState; import org.apache.cloudstack.ca.CAManager; import org.apache.cloudstack.framework.config.ConfigDepot; import org.apache.cloudstack.framework.config.ConfigKey; @@ -431,10 +432,10 @@ public boolean routeToPeer(final String peer, final byte[] bytes) { ch = connectToPeer(peer, ch); if (ch == null) { try { - logD(bytes, "Unable to route to peer: " + Request.parse(bytes)); + logD(bytes, "Unable to establish connection to route to peer: " + Request.parse(bytes)); } catch (ClassNotFoundException | UnsupportedVersionException e) { // Request.parse thrown exception when we try to log it, log as much as we can - logD(bytes, "Unable to route to peer, and Request.parse further caught exception" + e.getMessage()); + logD(bytes, "Unable to establish connection to route to peer, and Request.parse further caught exception" + e.getMessage()); } return false; } @@ -643,7 +644,6 @@ protected void doTask(final Task task) throws TaskExecutionException { final Link link = task.getLink(); if (Request.fromServer(data)) { - final AgentAttache agent = findAttache(hostId); if (Request.isControl(data)) { @@ -691,7 +691,6 @@ protected void doTask(final Task task) throws TaskExecutionException { cancel(Long.toString(Request.getManagementServerId(data)), hostId, 
Request.getSequence(data), e.getMessage()); } } else { - final long mgmtId = Request.getManagementServerId(data); if (mgmtId != -1 && mgmtId != _nodeId) { routeToPeer(Long.toString(mgmtId), data); @@ -1352,7 +1351,7 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS if (cmd instanceof PrepareForMaintenanceManagementServerHostCommand) { logger.debug("Received PrepareForMaintenanceManagementServerHostCommand - preparing for maintenance"); try { - managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm()); + managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm(), ((PrepareForMaintenanceManagementServerHostCommand) cmd).isForced()); return "Successfully prepared for maintenance"; } catch(CloudRuntimeException e) { return e.getMessage(); @@ -1399,14 +1398,14 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS } @Override - public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) { + public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) { if (timeoutDurationInMs <= 0) { logger.debug("Not transferring direct agents from management server node {} (id: {}) to other nodes, invalid timeout duration", fromMsId, fromMsUuid); return false; } long transferStartTimeInMs = System.currentTimeMillis(); - if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId))) { + if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId, excludeHostsInMaintenance))) { logger.info("No direct agent hosts available on management server node {} (id: {}), to transfer", fromMsId, fromMsUuid); return true; } @@ -1421,7 +1420,7 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long int agentTransferFailedCount = 0; List<DataCenterVO> 
dataCenterList = dcDao.listAll(); for (DataCenterVO dc : dataCenterList) { - List<HostVO> directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId()); + List<HostVO> directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId(), excludeHostsInMaintenance); if (CollectionUtils.isEmpty(directAgentHostsInDc)) { continue; } @@ -1455,9 +1454,10 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long return (agentTransferFailedCount == 0); } - private List<HostVO> getDirectAgentHosts(long msId) { + private List<HostVO> getDirectAgentHosts(long msId, boolean excludeHostsInMaintenance) { List<HostVO> directAgentHosts = new ArrayList<>(); - List<HostVO> hosts = _hostDao.listHostsByMs(msId); + List<ResourceState> statesToExclude = excludeHostsInMaintenance ? ResourceState.s_maintenanceStates : List.of(); + List<HostVO> hosts = _hostDao.listHostsByMsResourceState(msId, statesToExclude); for (HostVO host : hosts) { AgentAttache agent = findAttache(host.getId()); if (agent instanceof DirectAgentAttache) { @@ -1468,9 +1468,11 @@ private List<HostVO> getDirectAgentHosts(long msId) { return directAgentHosts; } - private List<HostVO> getDirectAgentHostsInDc(long msId, long dcId) { + private List<HostVO> getDirectAgentHostsInDc(long msId, long dcId, boolean excludeHostsInMaintenance) { List<HostVO> directAgentHosts = new ArrayList<>(); - List<HostVO> hosts = _hostDao.listHostsByMsAndDc(msId, dcId); + // To exclude maintenance states use values from ResourceState as source of truth + List<ResourceState> statesToExclude = excludeHostsInMaintenance ? 
ResourceState.s_maintenanceStates : List.of(); + List<HostVO> hosts = _hostDao.listHostsByMsDcResourceState(msId, dcId, statesToExclude); for (HostVO host : hosts) { AgentAttache agent = findAttache(host.getId()); if (agent instanceof DirectAgentAttache) { @@ -1506,6 +1508,10 @@ public void onManagementServerPreparingForMaintenance() { public void onManagementServerCancelPreparingForMaintenance() { logger.debug("Management server cancel preparing for maintenance"); super.onManagementServerPreparingForMaintenance(); + + // needed for the case when Management Server in Preparing For Maintenance but didn't go to Maintenance state + // (where this variable will be reset) + _agentLbHappened = false; } @Override diff --git a/engine/schema/src/main/java/com/cloud/host/dao/HostDao.java b/engine/schema/src/main/java/com/cloud/host/dao/HostDao.java index 2b8a23a1b510..090b019334f4 100644 --- a/engine/schema/src/main/java/com/cloud/host/dao/HostDao.java +++ b/engine/schema/src/main/java/com/cloud/host/dao/HostDao.java @@ -177,14 +177,24 @@ public interface HostDao extends GenericDao<HostVO, Long>, StateDao<Status, Stat List<HostVO> listHostsByMsAndDc(long msId, long dcId); + List<HostVO> listHostsByMsDcResourceState(long msId, long dcId, List<ResourceState> excludedResourceStates); + List<HostVO> listHostsByMs(long msId); + List<HostVO> listHostsByMsResourceState(long msId, List<ResourceState> excludedResourceStates); + /** - * Retrieves the number of hosts/agents this {@see ManagementServer} has responsibility over. - * @param msId the id of the {@see ManagementServer} - * @return the number of hosts/agents this {@see ManagementServer} has responsibility over + * Count Hosts by given Management Server, Host and Hypervisor Types, + * and exclude Hosts with given Resource States. 
+ * + * @param msId Management Server Id + * @param excludedResourceStates Resource States to be excluded + * @param hostTypes Host Types + * @param hypervisorTypes Hypervisor Types + * @return Hosts count */ - int countByMs(long msId); + int countHostsByMsResourceStateTypeAndHypervisorType(long msId, List<ResourceState> excludedResourceStates, + List<Type> hostTypes, List<HypervisorType> hypervisorTypes); /** * Retrieves the host ids/agents this {@see ManagementServer} has responsibility over. diff --git a/engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java b/engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java index 61fa3edcf227..8f218841b074 100644 --- a/engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java +++ b/engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java @@ -72,6 +72,7 @@ import com.cloud.utils.db.GenericSearchBuilder; import com.cloud.utils.db.JoinBuilder; import com.cloud.utils.db.JoinBuilder.JoinType; +import com.cloud.utils.db.QueryBuilder; import com.cloud.utils.db.SearchBuilder; import com.cloud.utils.db.SearchCriteria; import com.cloud.utils.db.SearchCriteria.Func; @@ -1600,6 +1601,17 @@ public List<HostVO> listHostsByMsAndDc(long msId, long dcId) { return listBy(sc); } + @Override + public List<HostVO> listHostsByMsDcResourceState(long msId, long dcId, List<ResourceState> excludedResourceStates) { + QueryBuilder<HostVO> sc = QueryBuilder.create(HostVO.class); + sc.and(sc.entity().getManagementServerId(), Op.EQ, msId); + sc.and(sc.entity().getDataCenterId(), Op.EQ, dcId); + if (CollectionUtils.isNotEmpty(excludedResourceStates)) { + sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray()); + } + return listBy(sc.create()); + } + @Override public List<HostVO> listHostsByMs(long msId) { SearchCriteria<HostVO> sc = ResponsibleMsSearch.create(); @@ -1608,10 +1620,32 @@ public List<HostVO> listHostsByMs(long msId) { } @Override - public int countByMs(long msId) { - 
SearchCriteria<HostVO> sc = ResponsibleMsSearch.create(); - sc.setParameters("managementServerId", msId); - return getCount(sc); + public List<HostVO> listHostsByMsResourceState(long msId, List<ResourceState> excludedResourceStates) { + QueryBuilder<HostVO> sc = QueryBuilder.create(HostVO.class); + sc.and(sc.entity().getManagementServerId(), Op.EQ, msId); + if (CollectionUtils.isNotEmpty(excludedResourceStates)) { + sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray()); + } + return listBy(sc.create()); + } + + @Override + public int countHostsByMsResourceStateTypeAndHypervisorType(long msId, + List<ResourceState> excludedResourceStates, + List<Type> hostTypes, + List<HypervisorType> hypervisorTypes) { + QueryBuilder<HostVO> sc = QueryBuilder.create(HostVO.class); + sc.and(sc.entity().getManagementServerId(), Op.EQ, msId); + if (CollectionUtils.isNotEmpty(excludedResourceStates)) { + sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray()); + } + if (CollectionUtils.isNotEmpty(hostTypes)) { + sc.and(sc.entity().getType(), Op.IN, hostTypes.toArray()); + } + if (CollectionUtils.isNotEmpty(hypervisorTypes)) { + sc.and(sc.entity().getHypervisorType(), Op.IN, hypervisorTypes.toArray()); + } + return getCount(sc.create()); } @Override diff --git a/framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java b/framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java index b136b8e842b8..780a09b883e0 100644 --- a/framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java +++ b/framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java @@ -70,9 +70,11 @@ public interface IndirectAgentLB { */ Long getLBPreferredHostCheckInterval(Long clusterId); - void propagateMSListToAgents(); + void propagateMSListToAgents(boolean triggerHostLB); - boolean haveAgentBasedHosts(long msId); + void propagateMSListToAgentsInCluster(Long 
clusterId); - boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs); + boolean haveAgentBasedHosts(long msId, boolean excludeHostsInMaintenance); + + boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs, boolean excludeHostsInMaintenance); } diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java index a0f091ef1e4a..ab3f900b693f 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java @@ -18,12 +18,15 @@ package org.apache.cloudstack.api.command; import org.apache.cloudstack.api.APICommand; +import org.apache.cloudstack.api.ApiConstants; import org.apache.cloudstack.api.BaseCmd; import com.cloud.user.Account; +import org.apache.cloudstack.api.Parameter; import org.apache.cloudstack.api.response.ManagementServerMaintenanceResponse; import org.apache.cloudstack.acl.RoleType; +import org.apache.commons.lang3.BooleanUtils; @APICommand(name = CancelMaintenanceCmd.APINAME, description = "Cancels maintenance of the management server", @@ -36,6 +39,13 @@ public class CancelMaintenanceCmd extends BaseMSMaintenanceActionCmd { public static final String APINAME = "cancelMaintenance"; + @Parameter(name = ApiConstants.REBALANCE, type = CommandType.BOOLEAN, description = "Rebalance agents (applicable for indirect agents, ensure the settings 'host' and 'indirect.agent.lb.algorithm' are properly configured) after cancelling maintenance, default is true") + private Boolean rebalance; + + public boolean getRebalance() { + return BooleanUtils.toBooleanDefaultIfNull(rebalance, true); + } + @Override public String getCommandName() { return APINAME.toLowerCase() + BaseCmd.RESPONSE_SUFFIX; diff --git 
a/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/PrepareForMaintenanceCmd.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/PrepareForMaintenanceCmd.java index 3c036c4c35f2..2b63b28e0c5b 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/PrepareForMaintenanceCmd.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/PrepareForMaintenanceCmd.java @@ -26,6 +26,7 @@ import org.apache.cloudstack.api.response.ManagementServerMaintenanceResponse; import org.apache.cloudstack.acl.RoleType; +import org.apache.commons.lang3.BooleanUtils; @APICommand(name = PrepareForMaintenanceCmd.APINAME, description = "Prepares management server for maintenance by preventing new jobs from being accepted after completion of active jobs and migrating the agents", @@ -40,6 +41,9 @@ public class PrepareForMaintenanceCmd extends BaseMSMaintenanceActionCmd { " when this is not set, already configured algorithm from setting 'indirect.agent.lb.algorithm' is considered") private String algorithm; + @Parameter(name = ApiConstants.FORCED, type = CommandType.BOOLEAN, description = "Force management server to maintenance after the maintenance window timeout, default is false") + private Boolean forced; + public String getAlgorithm() { return algorithm; } @@ -48,6 +52,10 @@ public void setAlgorithm(String algorithm) { this.algorithm = algorithm; } + public boolean isForced() { + return BooleanUtils.toBooleanDefaultIfNull(forced, false); + } + @Override public String getCommandName() { return APINAME.toLowerCase() + BaseCmd.RESPONSE_SUFFIX; diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManager.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManager.java index 3af19164cc93..b7b68b065c0c 100644 --- 
a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManager.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManager.java @@ -40,6 +40,15 @@ public interface ManagementServerMaintenanceManager { ConfigKey.Scope.Global, null); + ConfigKey<Boolean> ManagementServerMaintenanceIgnoreMaintenanceHosts = new ConfigKey<>(Boolean.class, + "management.server.maintenance.ignore.maintenance.hosts", + "Advanced", + String.valueOf(Boolean.FALSE), + "Host in Maintenance state can sometimes block Management Server to go to Maintenance; this setting skips Host(s) in Maintenance state during Management Server Maintenance, default: false.", + true, + ConfigKey.Scope.Global, + null); + void registerListener(ManagementServerMaintenanceListener listener); void unregisterListener(ManagementServerMaintenanceListener listener); @@ -76,14 +85,14 @@ public interface ManagementServerMaintenanceManager { // Indicates whether the current management server is preparing to maintenance boolean isPreparingForMaintenance(); - void resetPreparingForMaintenance(); + void resetMaintenanceParams(); long getMaintenanceStartTime(); String getLbAlgorithm(); // Prepares the current management server for maintenance by migrating the agents and not accepting any more async jobs - void prepareForMaintenance(String lbAlorithm); + void prepareForMaintenance(String lbAlorithm, boolean forced); // Cancels maintenance of the current management server void cancelMaintenance(); diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImpl.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImpl.java index fcfa32d6ce88..16cf14e1fb12 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImpl.java +++ 
b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImpl.java @@ -26,7 +26,9 @@ import javax.inject.Inject; +import com.cloud.resource.ResourceState; import org.apache.cloudstack.agent.lb.IndirectAgentLB; +import org.apache.cloudstack.agent.lb.IndirectAgentLBServiceImpl; import org.apache.cloudstack.api.command.CancelMaintenanceCmd; import org.apache.cloudstack.api.command.CancelShutdownCmd; import org.apache.cloudstack.api.command.PrepareForMaintenanceCmd; @@ -39,6 +41,7 @@ import org.apache.cloudstack.framework.config.Configurable; import org.apache.cloudstack.framework.jobs.AsyncJobManager; import org.apache.cloudstack.managed.context.ManagedContextRunnable; +import org.apache.cloudstack.management.ManagementServerHost; import org.apache.cloudstack.management.ManagementServerHost.State; import org.apache.cloudstack.maintenance.command.CancelMaintenanceManagementServerHostCommand; import org.apache.cloudstack.maintenance.command.CancelShutdownManagementServerHostCommand; @@ -196,13 +199,20 @@ public boolean isPreparingForShutdown() { return preparingForShutdown; } + private void resetShutdownParams() { + logger.debug("Resetting shutdown params"); + preparingForShutdown = false; + shutdownTriggered = false; + } + @Override public boolean isPreparingForMaintenance() { return preparingForMaintenance; } @Override - public void resetPreparingForMaintenance() { + public void resetMaintenanceParams() { + logger.debug("Resetting maintenance params"); preparingForMaintenance = false; maintenanceStartTime = 0; lbAlgorithm = null; @@ -235,6 +245,11 @@ public void triggerShutdown() { } this.shutdownTriggered = true; prepareForShutdown(true); + ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + throw new CloudRuntimeException("Invalid node id for the management server"); + } + msHostDao.updateState(msHost.getId(), State.ShuttingDown); } private void 
prepareForShutdown(boolean postTrigger) { @@ -251,29 +266,38 @@ private void prepareForShutdown(boolean postTrigger) { this.preparingForShutdown = true; jobManager.disableAsyncJobs(); - waitForPendingJobs(); + waitForPendingJobs(false); } @Override public void prepareForShutdown() { prepareForShutdown(false); + ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + throw new CloudRuntimeException("Invalid node id for the management server"); + } + msHostDao.updateState(msHost.getId(), State.PreparingForShutDown); } @Override public void cancelShutdown() { - if (!this.preparingForShutdown) { + ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + throw new CloudRuntimeException("Invalid node id for the management server"); + } + if (!this.preparingForShutdown && !(State.PreparingForShutDown.equals(msHost.getState()) || State.ReadyToShutDown.equals(msHost.getState()))) { throw new CloudRuntimeException("Shutdown has not been triggered"); } - this.preparingForShutdown = false; - this.shutdownTriggered = false; - resetPreparingForMaintenance(); + resetShutdownParams(); + resetMaintenanceParams(); jobManager.enableAsyncJobs(); cancelWaitForPendingJobs(); + msHostDao.updateState(msHost.getId(), State.Up); } @Override - public void prepareForMaintenance(String lbAlorithm) { + public void prepareForMaintenance(String lbAlorithm, boolean forced) { if (this.preparingForShutdown) { throw new CloudRuntimeException("Shutdown has already been triggered, cancel shutdown and try again"); } @@ -281,41 +305,57 @@ public void prepareForMaintenance(String lbAlorithm) { if (this.preparingForMaintenance) { throw new CloudRuntimeException("Maintenance has already been initiated"); } + + ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + throw new 
CloudRuntimeException("Invalid node id for the management server"); + } this.preparingForMaintenance = true; this.maintenanceStartTime = System.currentTimeMillis(); this.lbAlgorithm = lbAlorithm; jobManager.disableAsyncJobs(); onPreparingForMaintenance(); - waitForPendingJobs(); + waitForPendingJobs(forced); + msHostDao.updateState(msHost.getId(), State.PreparingForMaintenance); } @Override public void cancelMaintenance() { - if (!this.preparingForMaintenance) { + ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + throw new CloudRuntimeException("Invalid node id for the management server"); + } + if (!this.preparingForMaintenance && !(State.Maintenance.equals(msHost.getState()) || State.PreparingForMaintenance.equals(msHost.getState()))) { throw new CloudRuntimeException("Maintenance has not been initiated"); } - resetPreparingForMaintenance(); - this.preparingForShutdown = false; - this.shutdownTriggered = false; + resetMaintenanceParams(); + resetShutdownParams(); jobManager.enableAsyncJobs(); cancelWaitForPendingJobs(); - ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); - if (msHost != null) { - if (State.PreparingForMaintenance.equals(msHost.getState())) { - onCancelPreparingForMaintenance(); - } - if (State.Maintenance.equals(msHost.getState())) { - onCancelMaintenance(); - } + msHostDao.updateState(msHost.getId(), State.Up); + ScheduledExecutorService cancelMaintenanceService = Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("CancelMaintenance-Job")); + cancelMaintenanceService.schedule(() -> { + cancelMaintenanceTask(msHost.getState()); + }, 0, TimeUnit.SECONDS); + cancelMaintenanceService.shutdown(); + } + + private void cancelMaintenanceTask(ManagementServerHost.State msState) { + if (State.PreparingForMaintenance.equals(msState)) { + onCancelPreparingForMaintenance(); + } + if (State.Maintenance.equals(msState)) { 
+ onCancelMaintenance(); } } - private void waitForPendingJobs() { + private void waitForPendingJobs(boolean forceMaintenance) { cancelWaitForPendingJobs(); pendingJobsCheckTask = Executors.newScheduledThreadPool(1, new NamedThreadFactory("PendingJobsCheck")); long pendingJobsCheckDelayInSecs = 1L; // 1 sec long pendingJobsCheckPeriodInSecs = 3L; // every 3 secs, check more frequently for pending jobs - pendingJobsCheckTask.scheduleAtFixedRate(new CheckPendingJobsTask(this), pendingJobsCheckDelayInSecs, pendingJobsCheckPeriodInSecs, TimeUnit.SECONDS); + boolean ignoreMaintenanceHosts = ManagementServerMaintenanceIgnoreMaintenanceHosts.value(); + pendingJobsCheckTask.scheduleAtFixedRate(new CheckPendingJobsTask(this, ignoreMaintenanceHosts, forceMaintenance), pendingJobsCheckDelayInSecs, pendingJobsCheckPeriodInSecs, TimeUnit.SECONDS); } @Override @@ -349,7 +389,6 @@ public ManagementServerMaintenanceResponse prepareForShutdown(PrepareForShutdown cmds[0] = new PrepareForShutdownManagementServerHostCommand(msHost.getMsid()); executeCmd(msHost, cmds); - msHostDao.updateState(msHost.getId(), State.PreparingForShutDown); return prepareMaintenanceResponse(cmd.getManagementServerId()); } @@ -375,7 +414,6 @@ public ManagementServerMaintenanceResponse triggerShutdown(TriggerShutdownCmd cm cmds[0] = new TriggerShutdownManagementServerHostCommand(msHost.getMsid()); executeCmd(msHost, cmds); - msHostDao.updateState(msHost.getId(), State.ShuttingDown); return prepareMaintenanceResponse(cmd.getManagementServerId()); } @@ -395,7 +433,6 @@ public ManagementServerMaintenanceResponse cancelShutdown(CancelShutdownCmd cmd) cmds[0] = new CancelShutdownManagementServerHostCommand(msHost.getMsid()); executeCmd(msHost, cmds); - msHostDao.updateState(msHost.getId(), State.Up); return prepareMaintenanceResponse(cmd.getManagementServerId()); } @@ -426,7 +463,8 @@ public ManagementServerMaintenanceResponse prepareForMaintenance(PrepareForMaint checkAnyMsInPreparingStates("prepare for 
maintenance"); - if (indirectAgentLB.haveAgentBasedHosts(msHost.getMsid())) { + boolean ignoreMaintenanceHosts = ManagementServerMaintenanceIgnoreMaintenanceHosts.value(); + if (indirectAgentLB.haveAgentBasedHosts(msHost.getMsid(), ignoreMaintenanceHosts)) { List<String> indirectAgentMsList = indirectAgentLB.getManagementServerList(); indirectAgentMsList.remove(msHost.getServiceIP()); List<String> nonUpMsList = msHostDao.listNonUpStateMsIPs(); @@ -437,10 +475,9 @@ public ManagementServerMaintenanceResponse prepareForMaintenance(PrepareForMaint } final Command[] cmds = new Command[1]; - cmds[0] = new PrepareForMaintenanceManagementServerHostCommand(msHost.getMsid(), cmd.getAlgorithm()); + cmds[0] = new PrepareForMaintenanceManagementServerHostCommand(msHost.getMsid(), cmd.getAlgorithm(), cmd.isForced()); executeCmd(msHost, cmds); - msHostDao.updateState(msHost.getId(), State.PreparingForMaintenance); return prepareMaintenanceResponse(cmd.getManagementServerId()); } @@ -460,7 +497,11 @@ public ManagementServerMaintenanceResponse cancelMaintenance(CancelMaintenanceCm cmds[0] = new CancelMaintenanceManagementServerHostCommand(msHost.getMsid()); executeCmd(msHost, cmds); - msHostDao.updateState(msHost.getId(), State.Up); + if (cmd.getRebalance()) { + logger.info("Propagate MS list and rebalance indirect agents"); + indirectAgentLB.propagateMSListToAgents(true); + } + return prepareMaintenanceResponse(cmd.getManagementServerId()); } @@ -485,12 +526,14 @@ private void executeCmd(ManagementServerHostVO msHost, Command[] cmds) { @Override public void cancelPreparingForMaintenance(ManagementServerHostVO msHost) { - resetPreparingForMaintenance(); - this.preparingForShutdown = false; - this.shutdownTriggered = false; + resetMaintenanceParams(); + resetShutdownParams(); jobManager.enableAsyncJobs(); if (msHost == null) { msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + throw new CloudRuntimeException("Invalid node id for 
the management server"); + } } onCancelPreparingForMaintenance(); msHostDao.updateState(msHost.getId(), State.Up); @@ -546,17 +589,21 @@ public String getConfigComponentName() { @Override public ConfigKey<?>[] getConfigKeys() { return new ConfigKey<?>[]{ - ManagementServerMaintenanceTimeoutInMins + ManagementServerMaintenanceTimeoutInMins, ManagementServerMaintenanceIgnoreMaintenanceHosts }; } private final class CheckPendingJobsTask extends ManagedContextRunnable { private ManagementServerMaintenanceManager managementServerMaintenanceManager; + private boolean ignoreMaintenanceHosts = false; private boolean agentsTransferTriggered = false; + private boolean forceMaintenance = false; - public CheckPendingJobsTask(ManagementServerMaintenanceManager managementServerMaintenanceManager) { + public CheckPendingJobsTask(ManagementServerMaintenanceManager managementServerMaintenanceManager, boolean ignoreMaintenanceHosts, boolean forceMaintenance) { this.managementServerMaintenanceManager = managementServerMaintenanceManager; + this.ignoreMaintenanceHosts = ignoreMaintenanceHosts; + this.forceMaintenance = forceMaintenance; } @Override @@ -570,6 +617,19 @@ protected void runInContext() { } if (managementServerMaintenanceManager.isPreparingForMaintenance() && isMaintenanceWindowExpired()) { + if (forceMaintenance) { + logger.debug("Maintenance window timeout, MS is forced to Maintenance Mode"); + ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + logger.warn("Unable to find the management server, invalid node id"); + return; + } + msHostDao.updateState(msHost.getId(), State.Maintenance); + managementServerMaintenanceManager.onMaintenance(); + managementServerMaintenanceManager.cancelWaitForPendingJobs(); + return; + } + logger.debug("Maintenance window timeout, terminating the pending jobs check timer task"); managementServerMaintenanceManager.cancelPreparingForMaintenance(null); 
managementServerMaintenanceManager.cancelWaitForPendingJobs(); @@ -577,9 +637,11 @@ protected void runInContext() { } long totalPendingJobs = managementServerMaintenanceManager.countPendingJobs(ManagementServerNode.getManagementServerId()); - int totalAgents = hostDao.countByMs(ManagementServerNode.getManagementServerId()); - String msg = String.format("Checking for triggered maintenance or shutdown... shutdownTriggered [%b] AllowAsyncJobs [%b] PendingJobCount [%d] AgentsCount [%d]", - managementServerMaintenanceManager.isShutdownTriggered(), managementServerMaintenanceManager.isAsyncJobsEnabled(), totalPendingJobs, totalAgents); + + long totalAgents = totalAgentsInMs(); + + String msg = String.format("Checking for triggered maintenance or shutdown... shutdownTriggered [%b] preparingForShutdown[%b] preparingForMaintenance[%b] AllowAsyncJobs [%b] PendingJobCount [%d] AgentsCount [%d]", + managementServerMaintenanceManager.isShutdownTriggered(), managementServerMaintenanceManager.isPreparingForShutdown(), managementServerMaintenanceManager.isPreparingForMaintenance(), managementServerMaintenanceManager.isAsyncJobsEnabled(), totalPendingJobs, totalAgents); logger.debug(msg); if (totalPendingJobs > 0) { @@ -594,6 +656,10 @@ protected void runInContext() { } if (managementServerMaintenanceManager.isPreparingForMaintenance()) { ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + logger.warn("Unable to find the management server, invalid node id"); + return; + } if (totalAgents == 0) { logger.info("MS is in Maintenance Mode"); msHostDao.updateState(msHost.getId(), State.Maintenance); @@ -609,7 +675,7 @@ protected void runInContext() { agentsTransferTriggered = true; logger.info(String.format("Preparing for maintenance - migrating agents from management server node %d (id: %s)", ManagementServerNode.getManagementServerId(), msHost.getUuid())); - boolean agentsMigrated = 
indirectAgentLB.migrateAgents(msHost.getUuid(), ManagementServerNode.getManagementServerId(), managementServerMaintenanceManager.getLbAlgorithm(), remainingMaintenanceWindowInMs()); + boolean agentsMigrated = indirectAgentLB.migrateAgents(msHost.getUuid(), ManagementServerNode.getManagementServerId(), managementServerMaintenanceManager.getLbAlgorithm(), remainingMaintenanceWindowInMs(), ignoreMaintenanceHosts); if (!agentsMigrated) { logger.warn(String.format("Unable to prepare for maintenance, cannot migrate indirect agents on this management server node %d (id: %s)", ManagementServerNode.getManagementServerId(), msHost.getUuid())); managementServerMaintenanceManager.cancelPreparingForMaintenance(msHost); @@ -617,18 +683,20 @@ protected void runInContext() { return; } - if(!agentMgr.transferDirectAgentsFromMS(msHost.getUuid(), ManagementServerNode.getManagementServerId(), remainingMaintenanceWindowInMs())) { + if(!agentMgr.transferDirectAgentsFromMS(msHost.getUuid(), ManagementServerNode.getManagementServerId(), remainingMaintenanceWindowInMs(), ignoreMaintenanceHosts)) { logger.warn(String.format("Unable to prepare for maintenance, cannot transfer direct agents on this management server node %d (id: %s)", ManagementServerNode.getManagementServerId(), msHost.getUuid())); managementServerMaintenanceManager.cancelPreparingForMaintenance(msHost); managementServerMaintenanceManager.cancelWaitForPendingJobs(); - return; } } else if (managementServerMaintenanceManager.isPreparingForShutdown()) { logger.info("MS is Ready To Shutdown"); ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + if (msHost == null) { + logger.warn("Unable to find the management server, invalid node id"); + return; + } msHostDao.updateState(msHost.getId(), State.ReadyToShutDown); managementServerMaintenanceManager.cancelWaitForPendingJobs(); - return; } } catch (final Exception e) { logger.error("Error trying to check/run pending jobs task", e); @@ 
-648,5 +716,14 @@ private long remainingMaintenanceWindowInMs() { long remainingMaintenanceWindowTimeInMs = (ManagementServerMaintenanceTimeoutInMins.value().longValue() * 60 * 1000) - maintenanceElapsedTimeInMs; return (remainingMaintenanceWindowTimeInMs > 0) ? remainingMaintenanceWindowTimeInMs : 0; } + + private long totalAgentsInMs() { + /* Any Host in Maintenance state could block moving Management Server to Maintenance state, exclude those Hosts from total agents count + * To exclude maintenance states use values from ResourceState as source of truth + */ + List<ResourceState> statesToExclude = ignoreMaintenanceHosts ? ResourceState.s_maintenanceStates : List.of(); + return hostDao.countHostsByMsResourceStateTypeAndHypervisorType(ManagementServerNode.getManagementServerId(), statesToExclude, + IndirectAgentLBServiceImpl.agentValidHostTypes, null); + } } } diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/PrepareForMaintenanceManagementServerHostCommand.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/PrepareForMaintenanceManagementServerHostCommand.java index 8f2a4e62b32d..ad96454b0542 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/PrepareForMaintenanceManagementServerHostCommand.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/PrepareForMaintenanceManagementServerHostCommand.java @@ -20,17 +20,23 @@ public class PrepareForMaintenanceManagementServerHostCommand extends BaseShutdownManagementServerHostCommand { String lbAlgorithm; + boolean forced; public PrepareForMaintenanceManagementServerHostCommand(long msId) { super(msId); } - public PrepareForMaintenanceManagementServerHostCommand(long msId, String lbAlgorithm) { + public PrepareForMaintenanceManagementServerHostCommand(long msId, String lbAlgorithm, boolean forced) { super(msId); this.lbAlgorithm = lbAlgorithm; + this.forced = forced; } public String 
getLbAlgorithm() { return lbAlgorithm; } + + public boolean isForced() { + return forced; + } } diff --git a/plugins/maintenance/src/test/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImplTest.java b/plugins/maintenance/src/test/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImplTest.java index 9fe33aa6c547..280d1eaf9eb9 100644 --- a/plugins/maintenance/src/test/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImplTest.java +++ b/plugins/maintenance/src/test/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImplTest.java @@ -92,6 +92,8 @@ public void countPendingJobs() { @Test public void prepareForShutdown() { Mockito.doNothing().when(jobManagerMock).disableAsyncJobs(); + ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); + Mockito.when(msHostDao.findByMsid(anyLong())).thenReturn(msHost); spy.prepareForShutdown(); Mockito.verify(jobManagerMock).disableAsyncJobs(); @@ -106,6 +108,9 @@ public void prepareForShutdown() { @Test public void cancelShutdown() { + ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); + Mockito.when(msHost.getState()).thenReturn(ManagementServerHost.State.Up); + Mockito.when(msHostDao.findByMsid(anyLong())).thenReturn(msHost); Assert.assertThrows(CloudRuntimeException.class, () -> { spy.cancelShutdown(); }); @@ -115,6 +120,8 @@ public void cancelShutdown() { public void triggerShutdown() { Mockito.doNothing().when(jobManagerMock).disableAsyncJobs(); Mockito.lenient().when(spy.isShutdownTriggered()).thenReturn(false); + ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); + Mockito.when(msHostDao.findByMsid(anyLong())).thenReturn(msHost); spy.triggerShutdown(); Mockito.verify(jobManagerMock).disableAsyncJobs(); @@ -305,43 +312,44 @@ public void triggerShutdownCmd() { @Test public void prepareForMaintenanceAndCancelFromMaintenanceState() { 
Mockito.doNothing().when(jobManagerMock).disableAsyncJobs(); - spy.prepareForMaintenance("static"); + ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); + Mockito.when(msHostDao.findByMsid(anyLong())).thenReturn(msHost); + spy.prepareForMaintenance("static", false); Mockito.verify(jobManagerMock).disableAsyncJobs(); Assert.assertThrows(CloudRuntimeException.class, () -> { - spy.prepareForMaintenance("static"); + spy.prepareForMaintenance("static", false); }); - ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); Mockito.when(msHost.getState()).thenReturn(ManagementServerHost.State.Maintenance); - Mockito.when(msHostDao.findByMsid(anyLong())).thenReturn(msHost); Mockito.doNothing().when(jobManagerMock).enableAsyncJobs(); spy.cancelMaintenance(); Mockito.verify(jobManagerMock).enableAsyncJobs(); - Mockito.verify(spy, Mockito.times(1)).onCancelMaintenance(); } @Test public void prepareForMaintenanceAndCancelFromPreparingForMaintenanceState() { Mockito.doNothing().when(jobManagerMock).disableAsyncJobs(); - spy.prepareForMaintenance("static"); + ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); + Mockito.when(msHostDao.findByMsid(anyLong())).thenReturn(msHost); + spy.prepareForMaintenance("static", false); Mockito.verify(jobManagerMock).disableAsyncJobs(); Assert.assertThrows(CloudRuntimeException.class, () -> { - spy.prepareForMaintenance("static"); + spy.prepareForMaintenance("static", false); }); - ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); Mockito.when(msHost.getState()).thenReturn(ManagementServerHost.State.PreparingForMaintenance); - Mockito.when(msHostDao.findByMsid(anyLong())).thenReturn(msHost); Mockito.doNothing().when(jobManagerMock).enableAsyncJobs(); spy.cancelMaintenance(); Mockito.verify(jobManagerMock).enableAsyncJobs(); - Mockito.verify(spy, Mockito.times(1)).onCancelPreparingForMaintenance(); } @Test public void cancelMaintenance() { + ManagementServerHostVO msHost = 
mock(ManagementServerHostVO.class); + Mockito.when(msHost.getState()).thenReturn(ManagementServerHost.State.Up); + Mockito.when(msHostDao.findByMsid(anyLong())).thenReturn(msHost); Assert.assertThrows(CloudRuntimeException.class, () -> { spy.cancelMaintenance(); }); @@ -455,7 +463,7 @@ public void prepareForMaintenanceCmdNoIndirectMsHosts() { Mockito.when(msHostDao.listNonUpStateMsIPs()).thenReturn(new ArrayList<>()); PrepareForMaintenanceCmd cmd = mock(PrepareForMaintenanceCmd.class); Mockito.when(cmd.getManagementServerId()).thenReturn(1L); - Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong())).thenReturn(true); + Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong(), anyBoolean())).thenReturn(true); Mockito.when(indirectAgentLBMock.getManagementServerList()).thenReturn(new ArrayList<>()); Assert.assertThrows(CloudRuntimeException.class, () -> { @@ -476,7 +484,7 @@ public void prepareForMaintenanceCmdNullResponseFromClusterManager() { Mockito.when(msHostDao.findById(1L)).thenReturn(msHost1); PrepareForMaintenanceCmd cmd = mock(PrepareForMaintenanceCmd.class); Mockito.when(cmd.getManagementServerId()).thenReturn(1L); - Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong())).thenReturn(false); + Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong(), anyBoolean())).thenReturn(false); Mockito.when(clusterManagerMock.execute(anyString(), anyLong(), anyString(), anyBoolean())).thenReturn(null); Assert.assertThrows(CloudRuntimeException.class, () -> { @@ -497,7 +505,7 @@ public void prepareForMaintenanceCmdFailedResponseFromClusterManager() { Mockito.when(msHostDao.findById(1L)).thenReturn(msHost1); PrepareForMaintenanceCmd cmd = mock(PrepareForMaintenanceCmd.class); Mockito.when(cmd.getManagementServerId()).thenReturn(1L); - Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong())).thenReturn(false); + Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong(), anyBoolean())).thenReturn(false); 
Mockito.when(clusterManagerMock.execute(anyString(), anyLong(), anyString(), anyBoolean())).thenReturn("Failed"); Assert.assertThrows(CloudRuntimeException.class, () -> { @@ -518,7 +526,7 @@ public void prepareForMaintenanceCmdSuccessResponseFromClusterManager() { Mockito.when(msHostDao.findById(1L)).thenReturn(msHost1); PrepareForMaintenanceCmd cmd = mock(PrepareForMaintenanceCmd.class); Mockito.when(cmd.getManagementServerId()).thenReturn(1L); - Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong())).thenReturn(false); + Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong(), anyBoolean())).thenReturn(false); Mockito.when(hostDao.listByMs(anyLong())).thenReturn(new ArrayList<>()); Mockito.when(clusterManagerMock.execute(anyString(), anyLong(), anyString(), anyBoolean())).thenReturn("Success"); diff --git a/server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java b/server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java index 908f3d7dad07..747416bf26c0 100644 --- a/server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java +++ b/server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java @@ -287,6 +287,7 @@ import com.cloud.user.dao.UserDao; import com.cloud.utils.NumbersUtil; import com.cloud.utils.Pair; +import com.cloud.utils.Ternary; import com.cloud.utils.UriUtils; import com.cloud.utils.component.ManagerBase; import com.cloud.utils.crypt.DBEncryptionUtil; @@ -630,21 +631,30 @@ protected void populateConfigKeysAllowedOnlyForDefaultAdmin() { private void initMessageBusListener() { messageBus.subscribe(EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, new MessageSubscriber() { @Override - public void onPublishMessage(String serderAddress, String subject, Object args) { - String globalSettingUpdated = (String) args; - if (StringUtils.isEmpty(globalSettingUpdated)) { + public void onPublishMessage(String senderAddress, String subject, Object args) { + Ternary<String, ConfigKey.Scope, 
Long> settingUpdated = (Ternary<String, ConfigKey.Scope, Long>) args; + String settingNameUpdated = settingUpdated.first(); + if (StringUtils.isEmpty(settingNameUpdated)) { return; } - if (globalSettingUpdated.equals(ApiServiceConfiguration.ManagementServerAddresses.key()) || - globalSettingUpdated.equals(IndirectAgentLBServiceImpl.IndirectAgentLBAlgorithm.key())) { - _indirectAgentLB.propagateMSListToAgents(); - } else if (globalSettingUpdated.equals(Config.RouterAggregationCommandEachTimeout.toString()) - || globalSettingUpdated.equals(Config.MigrateWait.toString())) { + if (settingNameUpdated.equals(ApiServiceConfiguration.ManagementServerAddresses.key()) || + settingNameUpdated.equals(IndirectAgentLBServiceImpl.IndirectAgentLBAlgorithm.key())) { + _indirectAgentLB.propagateMSListToAgents(false); + } else if (settingNameUpdated.equals(Config.RouterAggregationCommandEachTimeout.toString()) + || settingNameUpdated.equals(Config.MigrateWait.toString())) { Map<String, String> params = new HashMap<String, String>(); params.put(Config.RouterAggregationCommandEachTimeout.toString(), _configDao.getValue(Config.RouterAggregationCommandEachTimeout.toString())); params.put(Config.MigrateWait.toString(), _configDao.getValue(Config.MigrateWait.toString())); _agentManager.propagateChangeToAgents(params); - } else if (VMLeaseManager.InstanceLeaseEnabled.key().equals(globalSettingUpdated)) { + } else if (settingNameUpdated.equals(IndirectAgentLBServiceImpl.IndirectAgentLBCheckInterval.key())) { + ConfigKey.Scope scope = settingUpdated.second(); + if (scope == ConfigKey.Scope.Global) { + _indirectAgentLB.propagateMSListToAgents(false); + } else if (scope == ConfigKey.Scope.Cluster) { + Long clusterId = settingUpdated.third(); + _indirectAgentLB.propagateMSListToAgentsInCluster(clusterId); + } + } else if (VMLeaseManager.InstanceLeaseEnabled.key().equals(settingNameUpdated)) { vmLeaseManager.onLeaseFeatureToggle(); } } @@ -833,6 +843,7 @@ public String updateConfiguration(final 
long userId, final String name, final St CallContext.current().setEventDetails(String.format(" Name: %s, New Value: %s, Scope: %s", name, value, scope.name())); _configDepot.invalidateConfigCache(name, scope, resourceId); + messageBus.publish(_name, EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, PublishScope.GLOBAL, new Ternary<>(name, scope, resourceId)); return valueEncrypted ? DBEncryptionUtil.decrypt(value) : value; } @@ -927,7 +938,7 @@ public String updateConfiguration(final long userId, final String name, final St } txn.commit(); - messageBus.publish(_name, EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, PublishScope.GLOBAL, name); + messageBus.publish(_name, EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, PublishScope.GLOBAL, new Ternary<>(name, ConfigKey.Scope.Global, resourceId)); return _configDao.getValue(name); } diff --git a/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java b/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java index 3336d44dba81..fc893a7ef50c 100644 --- a/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java +++ b/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java @@ -30,6 +30,7 @@ import javax.inject.Inject; import javax.naming.ConfigurationException; +import com.cloud.dc.ClusterVO; import org.apache.cloudstack.agent.lb.algorithm.IndirectAgentLBRoundRobinAlgorithm; import org.apache.cloudstack.agent.lb.algorithm.IndirectAgentLBShuffleAlgorithm; import org.apache.cloudstack.agent.lb.algorithm.IndirectAgentLBStaticAlgorithm; @@ -62,7 +63,8 @@ public class IndirectAgentLBServiceImpl extends ComponentLifecycleBase implement public static final ConfigKey<String> IndirectAgentLBAlgorithm = new ConfigKey<>(String.class, "indirect.agent.lb.algorithm", "Advanced", "static", - "The algorithm to be applied on the provided management server list in the 'host' config that that is sent to indirect agents. 
Allowed values are: static, roundrobin and shuffle.", + "The algorithm to be applied on the provided management server list in the 'host' config that that is sent to indirect agents. Allowed values are: static, roundrobin and shuffle. " + + "Note: The lb algorithm 'shuffle' disables the indirect agent lb check background task once the algorithm is applied on the agent.", true, ConfigKey.Scope.Global, null, null, null, null, null, ConfigKey.Kind.Select, "static,roundrobin,shuffle"); public static final ConfigKey<Long> IndirectAgentLBCheckInterval = new ConfigKey<>("Advanced", Long.class, @@ -89,7 +91,9 @@ public class IndirectAgentLBServiceImpl extends ComponentLifecycleBase implement private static final List<ResourceState> agentValidResourceStates = List.of( ResourceState.Enabled, ResourceState.Maintenance, ResourceState.Disabled, ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance); - private static final List<Host.Type> agentValidHostTypes = List.of(Host.Type.Routing, Host.Type.ConsoleProxy, + private static final List<ResourceState> agentNonMaintenanceResourceStates = List.of( + ResourceState.Enabled, ResourceState.Disabled); + public static final List<Host.Type> agentValidHostTypes = List.of(Host.Type.Routing, Host.Type.ConsoleProxy, Host.Type.SecondaryStorage, Host.Type.SecondaryStorageVM); private static final List<Host.Type> agentNonRoutingHostTypes = List.of(Host.Type.ConsoleProxy, Host.Type.SecondaryStorage, Host.Type.SecondaryStorageVM); @@ -132,7 +136,7 @@ public List<String> getManagementServerList(final Long hostId, final Long dcId, final org.apache.cloudstack.agent.lb.IndirectAgentLBAlgorithm algorithm = getAgentMSLBAlgorithm(lbAlgorithm); List<Long> hostIdList = orderedHostIdList; if (hostIdList == null) { - hostIdList = algorithm.isHostListNeeded() ? getOrderedHostIdList(dcId) : new ArrayList<>(); + hostIdList = algorithm.isHostListNeeded() ? 
getOrderedHostIdList(dcId, false) : new ArrayList<>(); } // just in case we have a host in creating state make sure it is in the list: @@ -167,8 +171,8 @@ public Long getLBPreferredHostCheckInterval(final Long clusterId) { return IndirectAgentLBCheckInterval.valueIn(clusterId); } - List<Long> getOrderedHostIdList(final Long dcId) { - final List<Long> hostIdList = getAllAgentBasedHostsFromDB(dcId, null); + List<Long> getOrderedHostIdList(final Long dcId, boolean excludeHostsInMaintenance) { + final List<Long> hostIdList = getAllAgentBasedHostsFromDB(dcId, null, null, excludeHostsInMaintenance); hostIdList.sort(Comparator.comparingLong(x -> x)); return hostIdList; } @@ -259,19 +263,25 @@ private List<Long> getAllAgentBasedNonRoutingHostsFromDB(final Long zoneId, fina agentValidResourceStates, agentNonRoutingHostTypes, agentValidHypervisorTypes); } - private List<Long> getAllAgentBasedRoutingHostsFromDB(final Long zoneId, final Long clusterId, final Long msId) { + private List<Long> getAllAgentBasedRoutingHostsFromDB(final Long zoneId, final Long clusterId, final Long msId, boolean excludeHostsInMaintenance) { + List<ResourceState> validResourceStates = excludeHostsInMaintenance ? agentNonMaintenanceResourceStates : agentValidResourceStates; return hostDao.findHostIdsByZoneClusterResourceStateTypeAndHypervisorType(zoneId, clusterId, msId, - agentValidResourceStates, List.of(Host.Type.Routing), agentValidHypervisorTypes); + validResourceStates, List.of(Host.Type.Routing), agentValidHypervisorTypes); } - private List<Long> getAllAgentBasedHostsFromDB(final Long zoneId, final Long clusterId) { + private List<Long> getAllAgentBasedHostsFromDB(final Long zoneId, final Long clusterId, final Long msId, boolean excludeHostsInMaintenance) { + List<ResourceState> validResourceStates = excludeHostsInMaintenance ? 
agentNonMaintenanceResourceStates : agentValidResourceStates; return hostDao.findHostIdsByZoneClusterResourceStateTypeAndHypervisorType(zoneId, clusterId, null, - agentValidResourceStates, agentValidHostTypes, agentValidHypervisorTypes); + validResourceStates, agentValidHostTypes, agentValidHypervisorTypes); + } + + private List<Long> getAllAgentBasedHosts(long msId, boolean excludeHostsInMaintenance) { + return getAllAgentBasedHostsFromDB(null, null, msId, excludeHostsInMaintenance); } @Override - public boolean haveAgentBasedHosts(long msId) { - return CollectionUtils.isNotEmpty(getAllAgentBasedHosts(msId)); + public boolean haveAgentBasedHosts(long msId, boolean excludeHostsInMaintenance) { + return CollectionUtils.isNotEmpty(getAllAgentBasedHosts(msId, excludeHostsInMaintenance)); } private org.apache.cloudstack.agent.lb.IndirectAgentLBAlgorithm getAgentMSLBAlgorithm() { @@ -303,8 +313,8 @@ public void checkLBAlgorithmName(String lbAlgorithm) { //////////////////////////////////////////////////////////// @Override - public void propagateMSListToAgents() { - logger.debug("Propagating management server list update to agents"); + public void propagateMSListToAgents(boolean triggerHostLB) { + logger.debug("Propagating management server list update to the agents"); ExecutorService setupMSListExecutorService = Executors.newFixedThreadPool(10, new NamedThreadFactory("SetupMSList-Worker")); final String lbAlgorithm = getLBAlgorithmName(); final Long globalLbCheckInterval = getLBPreferredHostCheckInterval(null); @@ -316,20 +326,20 @@ public void propagateMSListToAgents() { Map<Long, List<Long>> clusterHostIdsMap = new HashMap<>(); List<Long> clusterIds = clusterDao.listAllClusterIds(zone.getId()); for (Long clusterId : clusterIds) { - List<Long> hostIds = getAllAgentBasedRoutingHostsFromDB(zone.getId(), clusterId, null); + List<Long> hostIds = getAllAgentBasedRoutingHostsFromDB(zone.getId(), clusterId, null, false); clusterHostIdsMap.put(clusterId, hostIds); 
zoneHostIds.addAll(hostIds); } zoneHostIds.sort(Comparator.comparingLong(x -> x)); final List<String> avoidMsList = mshostDao.listNonUpStateMsIPs(); for (Long nonRoutingHostId : nonRoutingHostIds) { - setupMSListExecutorService.submit(new SetupMSListTask(nonRoutingHostId, zone.getId(), zoneHostIds, avoidMsList, lbAlgorithm, globalLbCheckInterval)); + setupMSListExecutorService.submit(new SetupMSListTask(nonRoutingHostId, zone.getId(), zoneHostIds, avoidMsList, lbAlgorithm, globalLbCheckInterval, triggerHostLB)); } for (Long clusterId : clusterIds) { final Long clusterLbCheckInterval = getLBPreferredHostCheckInterval(clusterId); List<Long> hostIds = clusterHostIdsMap.get(clusterId); for (Long hostId : hostIds) { - setupMSListExecutorService.submit(new SetupMSListTask(hostId, zone.getId(), zoneHostIds, avoidMsList, lbAlgorithm, clusterLbCheckInterval)); + setupMSListExecutorService.submit(new SetupMSListTask(hostId, zone.getId(), zoneHostIds, avoidMsList, lbAlgorithm, clusterLbCheckInterval, triggerHostLB)); } } } @@ -345,6 +355,45 @@ public void propagateMSListToAgents() { } } + @Override + public void propagateMSListToAgentsInCluster(Long clusterId) { + if (clusterId == null) { + return; + } + + logger.debug("Propagating management server list update to the agents in cluster " + clusterId); + ClusterVO cluster = clusterDao.findById(clusterId); + if (cluster == null) { + logger.warn("Unable to propagate management server list, couldn't find cluster " + clusterId); + return; + } + DataCenterVO zone = dataCenterDao.findById(cluster.getDataCenterId()); + if (zone == null) { + logger.warn("Unable to propagate management server list, couldn't find zone of the cluster " + clusterId); + return; + } + + ExecutorService setupMSListInClusterExecutorService = Executors.newFixedThreadPool(10, new NamedThreadFactory("SetupMSListInCluster-Worker")); + final String lbAlgorithm = getLBAlgorithmName(); + List<Long> clusterHostIds = getAllAgentBasedRoutingHostsFromDB(zone.getId(), 
clusterId, null, false); + clusterHostIds.sort(Comparator.comparingLong(x -> x)); + final List<String> avoidMsList = mshostDao.listNonUpStateMsIPs(); + final Long clusterLbCheckInterval = getLBPreferredHostCheckInterval(clusterId); + for (Long hostId : clusterHostIds) { + setupMSListInClusterExecutorService.submit(new SetupMSListTask(hostId, zone.getId(), clusterHostIds, avoidMsList, lbAlgorithm, clusterLbCheckInterval, false)); + } + + setupMSListInClusterExecutorService.shutdown(); + try { + if (!setupMSListInClusterExecutorService.awaitTermination(300, TimeUnit.SECONDS)) { + setupMSListInClusterExecutorService.shutdownNow(); + } + } catch (InterruptedException e) { + setupMSListInClusterExecutorService.shutdownNow(); + logger.debug(String.format("Force shutdown setup ms list in cluster service as it did not shutdown in the desired time due to: %s", e.getMessage())); + } + } + private final class SetupMSListTask extends ManagedContextRunnable { private Long hostId; private Long dcId; @@ -352,21 +401,23 @@ private final class SetupMSListTask extends ManagedContextRunnable { private List<String> avoidMsList; private String lbAlgorithm; private Long lbCheckInterval; + private Boolean triggerHostLb; public SetupMSListTask(Long hostId, Long dcId, List<Long> orderedHostIdList, List<String> avoidMsList, - String lbAlgorithm, Long lbCheckInterval) { + String lbAlgorithm, Long lbCheckInterval, Boolean triggerHostLb) { this.hostId = hostId; this.dcId = dcId; this.orderedHostIdList = orderedHostIdList; this.avoidMsList = avoidMsList; this.lbAlgorithm = lbAlgorithm; this.lbCheckInterval = lbCheckInterval; + this.triggerHostLb = triggerHostLb; } @Override protected void runInContext() { final List<String> msList = getManagementServerList(hostId, dcId, orderedHostIdList); - final SetupMSListCommand cmd = new SetupMSListCommand(msList, avoidMsList, lbAlgorithm, lbCheckInterval); + final SetupMSListCommand cmd = new SetupMSListCommand(msList, avoidMsList, lbAlgorithm, 
lbCheckInterval, triggerHostLb); cmd.setWait(60); final Answer answer = agentManager.easySend(hostId, cmd); if (answer == null || !answer.getResult()) { @@ -419,9 +470,9 @@ protected boolean migrateNonRoutingHostAgentsInZone(String fromMsUuid, long from protected boolean migrateRoutingHostAgentsInCluster(long clusterId, String fromMsUuid, long fromMsId, DataCenter dc, long migrationStartTimeInMs, long timeoutDurationInMs, final List<String> avoidMsList, String lbAlgorithm, - boolean lbAlgorithmChanged, List<Long> orderedHostIdList) { + boolean lbAlgorithmChanged, List<Long> orderedHostIdList, boolean excludeHostsInMaintenance) { - List<Long> agentBasedHostsOfMsInDcAndCluster = getAllAgentBasedRoutingHostsFromDB(dc.getId(), clusterId, fromMsId); + List<Long> agentBasedHostsOfMsInDcAndCluster = getAllAgentBasedRoutingHostsFromDB(dc.getId(), clusterId, fromMsId, excludeHostsInMaintenance); if (CollectionUtils.isEmpty(agentBasedHostsOfMsInDcAndCluster)) { return true; } @@ -461,7 +512,7 @@ protected boolean migrateRoutingHostAgentsInCluster(long clusterId, String fromM } @Override - public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs) { + public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs, boolean excludeHostsInMaintenance) { if (timeoutDurationInMs <= 0) { logger.debug(String.format("Not migrating indirect agents from management server node %d (id: %s) to other nodes, invalid timeout duration", fromMsId, fromMsUuid)); return false; @@ -469,7 +520,7 @@ public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorith logger.debug(String.format("Migrating indirect agents from management server node %d (id: %s) to other nodes", fromMsId, fromMsUuid)); long migrationStartTimeInMs = System.currentTimeMillis(); - if (!haveAgentBasedHosts(fromMsId)) { + if (!haveAgentBasedHosts(fromMsId, excludeHostsInMaintenance)) { logger.info(String.format("No 
indirect agents available on management server node %d (id: %s), to migrate", fromMsId, fromMsUuid)); return true; } @@ -489,7 +540,7 @@ public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorith List<DataCenterVO> dataCenterList = dcDao.listAll(); for (DataCenterVO dc : dataCenterList) { if (!migrateAgentsInZone(dc, fromMsUuid, fromMsId, avoidMsList, lbAlgorithm, lbAlgorithmChanged, - migrationStartTimeInMs, timeoutDurationInMs)) { + migrationStartTimeInMs, timeoutDurationInMs, excludeHostsInMaintenance)) { return false; } } @@ -498,8 +549,8 @@ public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorith } private boolean migrateAgentsInZone(DataCenterVO dc, String fromMsUuid, long fromMsId, List<String> avoidMsList, - String lbAlgorithm, boolean lbAlgorithmChanged, long migrationStartTimeInMs, long timeoutDurationInMs) { - List<Long> orderedHostIdList = getOrderedHostIdList(dc.getId()); + String lbAlgorithm, boolean lbAlgorithmChanged, long migrationStartTimeInMs, long timeoutDurationInMs, boolean excludeHostsInMaintenance) { + List<Long> orderedHostIdList = getOrderedHostIdList(dc.getId(), excludeHostsInMaintenance); if (!migrateNonRoutingHostAgentsInZone(fromMsUuid, fromMsId, dc, migrationStartTimeInMs, timeoutDurationInMs, avoidMsList, lbAlgorithm, lbAlgorithmChanged, orderedHostIdList)) { return false; @@ -507,7 +558,7 @@ private boolean migrateAgentsInZone(DataCenterVO dc, String fromMsUuid, long fro List<Long> clusterIds = clusterDao.listAllClusterIds(dc.getId()); for (Long clusterId : clusterIds) { if (!migrateRoutingHostAgentsInCluster(clusterId, fromMsUuid, fromMsId, dc, migrationStartTimeInMs, - timeoutDurationInMs, avoidMsList, lbAlgorithm, lbAlgorithmChanged, orderedHostIdList)) { + timeoutDurationInMs, avoidMsList, lbAlgorithm, lbAlgorithmChanged, orderedHostIdList, excludeHostsInMaintenance)) { return false; } } @@ -547,7 +598,9 @@ protected void runInContext() { final MigrateAgentConnectionCommand cmd = new 
MigrateAgentConnectionCommand(msList, avoidMsList, lbAlgorithm, lbCheckInterval); cmd.setWait(60); final Answer answer = agentManager.easySend(hostId, cmd); //may not receive answer when the agent disconnects immediately and try reconnecting to other ms host - if (answer != null && !answer.getResult()) { + if (answer == null) { + logger.warn(String.format("Got empty answer while initiating migration of agent connection for host agent ID: %d", hostId)); + } else if (!answer.getResult()) { logger.warn(String.format("Error while initiating migration of agent connection for host agent ID: %d - %s", hostId, answer.getDetails())); } updateLastManagementServer(hostId, fromMsId); diff --git a/server/src/test/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImplTest.java b/server/src/test/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImplTest.java index 1b9923ad3ea1..9cdcce8008e9 100644 --- a/server/src/test/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImplTest.java +++ b/server/src/test/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImplTest.java @@ -204,7 +204,7 @@ public void testExceptionOnEmptyHostSetting() throws NoSuchFieldException, Illeg public void testGetOrderedRunningHostIdsEmptyList() { doReturn(Collections.emptyList()).when(hostDao).findHostIdsByZoneClusterResourceStateTypeAndHypervisorType( Mockito.eq(DC_1_ID), Mockito.eq(null), Mockito.eq(null), Mockito.anyList(), Mockito.anyList(), Mockito.anyList()); - Assert.assertTrue(agentMSLB.getOrderedHostIdList(DC_1_ID).isEmpty()); + Assert.assertTrue(agentMSLB.getOrderedHostIdList(DC_1_ID, false).isEmpty()); } @Test @@ -213,6 +213,6 @@ public void testGetOrderedRunningHostIdsOrderList() { .findHostIdsByZoneClusterResourceStateTypeAndHypervisorType(Mockito.eq(DC_1_ID), Mockito.eq(null), Mockito.eq(null), Mockito.anyList(), Mockito.anyList(), Mockito.anyList()); Assert.assertEquals(Arrays.asList(host1.getId(), host2.getId(), host3.getId(), host4.getId()), - 
agentMSLB.getOrderedHostIdList(DC_1_ID)); + agentMSLB.getOrderedHostIdList(DC_1_ID, false)); } } diff --git a/ui/public/locales/en.json b/ui/public/locales/en.json index b3d5536e1f15..753fb84ba028 100644 --- a/ui/public/locales/en.json +++ b/ui/public/locales/en.json @@ -1876,6 +1876,7 @@ "label.read.io": "Read (IO)", "label.readonly": "Read-Only", "label.reason": "Reason", +"label.rebalance": "Rebalance", "label.reboot": "Reboot", "label.recent.deliveries": "Recent deliveries", "label.receivedbytes": "Bytes received", diff --git a/ui/src/config/section/infra/managementServers.js b/ui/src/config/section/infra/managementServers.js index bd17a4b8d5aa..d2d11d5b25d0 100644 --- a/ui/src/config/section/infra/managementServers.js +++ b/ui/src/config/section/infra/managementServers.js @@ -75,6 +75,7 @@ export default { message: 'message.cancel.maintenance', dataView: true, popup: true, + args: ['rebalance'], show: (record, store) => { return ['PreparingForMaintenance', 'Maintenance'].includes(record.state) }, mapping: { managementserverid: { @@ -109,7 +110,6 @@ export default { icon: 'close-circle-outlined', label: 'label.cancel.shutdown', message: 'message.cancel.shutdown', - docHelp: 'installguide/configuration.html#adding-a-zone', dataView: true, popup: true, show: (record, store) => { return ['PreparingForShutDown', 'ReadyToShutDown', 'ShuttingDown'].includes(record.state) }, diff --git a/ui/src/views/AutogenView.vue b/ui/src/views/AutogenView.vue index 9a625b7e0f75..136b7a096aed 100644 --- a/ui/src/views/AutogenView.vue +++ b/ui/src/views/AutogenView.vue @@ -1189,7 +1189,7 @@ export default { this.getFirstIndexFocus() this.showAction = true - const listIconForFillValues = ['copy-outlined', 'CopyOutlined', 'edit-outlined', 'EditOutlined', 'share-alt-outlined', 'ShareAltOutlined'] + const listIconForFillValues = ['copy-outlined', 'CopyOutlined', 'edit-outlined', 'EditOutlined', 'share-alt-outlined', 'ShareAltOutlined', 'minus-square-outlined'] for (const param of 
this.currentAction.paramFields) { if (param.type === 'list' && ['tags', 'hosttags', 'storagetags', 'storageaccessgroups', 'files'].includes(param.name)) { param.type = 'string' @@ -1415,6 +1415,8 @@ export default { fieldValue = this.resource[fieldName] ? this.resource[fieldName] : null if (fieldValue) { this.form[field.name] = fieldValue + } else if (field.type === 'boolean' && field.name === 'rebalance' && this.currentAction.api === 'cancelMaintenance') { + this.form[field.name] = true } }) }, @@ -1571,6 +1573,10 @@ export default { } } + if (['cancelMaintenance'].includes(action.api) && (params.rebalance === undefined || params.rebalance === null || params.rebalance === '')) { + params.rebalance = true + } + for (const key in values) { const input = values[key] for (const param of action.params) { diff --git a/ui/src/views/infra/Confirmation.vue b/ui/src/views/infra/Confirmation.vue index ea166ac32191..ea7b841522db 100644 --- a/ui/src/views/infra/Confirmation.vue +++ b/ui/src/views/infra/Confirmation.vue @@ -45,6 +45,12 @@ </a-select-option> </a-select> </a-form-item> + <a-form-item name="forced" ref="forced"> + <template #label> + <tooltip-label :title="$t('label.forced')" :tooltip="prepareForMaintenanceApiParams.forced.description"/> + </template> + <a-switch v-model:checked="form.forced" /> + </a-form-item> <a-divider/> <a-alert type="error"> <template #message> @@ -135,6 +141,7 @@ export default { if (this.isPrepareForMaintenance && this.form.algorithm !== '') { params.algorithm = this.form.algorithm } + params.forced = this.form.forced api(this.action.currentAction.api, params).then(() => { this.$message.success(this.$t(this.action.currentAction.label) + ' : ' + this.resource.name) this.closeAction() diff --git a/utils/src/main/java/com/cloud/utils/nio/NioConnection.java b/utils/src/main/java/com/cloud/utils/nio/NioConnection.java index e2ecbb3210e4..8e1a208a164e 100644 --- a/utils/src/main/java/com/cloud/utils/nio/NioConnection.java +++ 
b/utils/src/main/java/com/cloud/utils/nio/NioConnection.java @@ -323,7 +323,7 @@ protected void read(final SelectionKey key) throws IOException { logger.trace("Reading from: {}", socketChannel.socket().toString()); final byte[] data = link.read(socketChannel); if (data == null) { - logger.trace("Packet is incomplete. Waiting for more."); + logger.trace("Packet is incomplete. Waiting for more."); return; } final Task task = _factory.create(Task.Type.DATA, link, data);