Skip to content

Commit 6802fe0

Browse files
jtnordCarroll Chiou
andauthored
[JENKINS-69277] relaunch an agent if the connection is broken (#152)
* [JENKINS-69277] relaunch an agent if the connection is broken Even though this is a OnceRetentionStrategy we expect a durable task to be able to run to completion even if the connection is broken. So we relaunch the agent if the agent is offline and it is not done. We are not forcing a reconnect, as this allows any existing connect operation (which could be the first one, to continue running). * [JENKINS-69277] avoid spotbugs warning Whilst spotbugs complained about the lack of synchronization on terminating we do not want to put the connect call inside the synchronized block as it may be long running, or cause some form of re-entry on a different thread. whilst the locking is not perfect, we may try to relaunch and agent just as it was done, we have set the computer to not accept any tasks so the agent should become idle, and be terminated (either in the current termination block) or the next time the check runs. * check if launch is supported early on * [JENKINS-69277] Add a unit test to cover OnceRetentionStrategy iAdds a unit test to cover both the happy case of a cloud agent using OnceRetentionStrategy and the unhappy case where the agent dies and needs restarting. --------- Co-authored-by: Carroll Chiou <[email protected]>
1 parent c8635e9 commit 6802fe0

File tree

3 files changed

+267
-1
lines changed

3 files changed

+267
-1
lines changed

pom.xml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,33 @@
6161
<artifactId>ssh-slaves</artifactId>
6262
<scope>test</scope>
6363
</dependency>
64+
<dependency>
65+
<groupId>org.jenkins-ci.plugins.workflow</groupId>
66+
<artifactId>workflow-job</artifactId>
67+
<scope>test</scope>
68+
</dependency>
69+
<dependency>
70+
<groupId>org.jenkins-ci.plugins.workflow</groupId>
71+
<artifactId>workflow-durable-task-step</artifactId>
72+
<scope>test</scope>
73+
</dependency>
74+
<dependency>
75+
<groupId>org.jenkins-ci.plugins.workflow</groupId>
76+
<artifactId>workflow-cps</artifactId>
77+
<scope>test</scope>
78+
</dependency>
79+
<dependency>
80+
<groupId>org.hamcrest</groupId>
81+
<artifactId>hamcrest</artifactId>
82+
<version>2.2</version>
83+
<scope>test</scope>
84+
</dependency>
85+
<dependency>
86+
<groupId>org.awaitility</groupId>
87+
<artifactId>awaitility</artifactId>
88+
<version>4.2.0</version>
89+
<scope>test</scope>
90+
</dependency>
6491
</dependencies>
6592
<dependencyManagement>
6693
<dependencies>

src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,19 @@ public long check(final AbstractCloudComputer c) {
7373
done(c);
7474
}
7575
}
76-
76+
// if the agent is not done and it is offline we should trigger a relaunch
77+
// do not do this whilst we hold the lock, as connect is documented only as "This method may return immediately"
78+
// so lets not presume any asynchronous nature and instead that this could be a long blocking op.
79+
boolean shouldRestart = false;
80+
synchronized (this) {
81+
if (!terminating && c.isOffline() && c.isLaunchSupported() && !disabled) {
82+
shouldRestart = true;
83+
}
84+
}
85+
if (shouldRestart) {
86+
LOGGER.log(Level.FINE, "Attempting relaunch of {0}", c.getName());
87+
c.connect(false);
88+
}
7789
// Return one because we want to check every minute if idle.
7890
return 1;
7991
}
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
package org.jenkinsci.plugins.durabletask.executors;
2+
3+
import java.io.File;
4+
import java.io.IOException;
5+
import java.time.Duration;
6+
import java.util.Arrays;
7+
import java.util.Collection;
8+
import java.util.Collections;
9+
import java.util.HashSet;
10+
import java.util.Set;
11+
import java.util.logging.Level;
12+
import java.util.logging.Logger;
13+
import org.jenkinsci.plugins.durabletask.BourneShellScript;
14+
import org.jenkinsci.plugins.durabletask.WindowsBatchScript;
15+
import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition;
16+
import org.jenkinsci.plugins.workflow.job.WorkflowJob;
17+
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
18+
import org.junit.BeforeClass;
19+
import org.junit.Rule;
20+
import org.junit.Test;
21+
import org.junit.rules.TemporaryFolder;
22+
import org.jvnet.hudson.test.BuildWatcher;
23+
import org.jvnet.hudson.test.Issue;
24+
import org.jvnet.hudson.test.JenkinsRule;
25+
import org.jvnet.hudson.test.LoggerRule;
26+
import org.jvnet.hudson.test.SimpleCommandLauncher;
27+
import hudson.Functions;
28+
import hudson.model.Computer;
29+
import hudson.model.LoadStatistics;
30+
import hudson.model.TaskListener;
31+
import hudson.model.Descriptor.FormException;
32+
import hudson.model.labels.LabelAtom;
33+
import hudson.remoting.RequestAbortedException;
34+
import hudson.remoting.Which;
35+
import hudson.slaves.AbstractCloudComputer;
36+
import hudson.slaves.AbstractCloudSlave;
37+
import hudson.slaves.Cloud;
38+
import hudson.slaves.NodeProvisioner.PlannedNode;
39+
import hudson.util.RemotingDiagnostics;
40+
import jenkins.model.Jenkins;
41+
42+
import static org.awaitility.Awaitility.await;
43+
import static org.hamcrest.MatcherAssert.assertThat;
44+
import static org.hamcrest.Matchers.arrayWithSize;
45+
import static org.hamcrest.Matchers.empty;
46+
import static org.hamcrest.Matchers.is;
47+
48+
public class OnceRetentionStrategyTest {
49+
50+
private final static Logger LOGGER = Logger.getLogger(OnceRetentionStrategyTest.class.getName());
51+
52+
@Rule
53+
public LoggerRule lr = new LoggerRule().record(LOGGER, Level.ALL);
54+
55+
@Rule
56+
public JenkinsRule jr = new JenkinsRule();
57+
58+
@Rule
59+
public TemporaryFolder temp = new TemporaryFolder();
60+
61+
@Rule
62+
public BuildWatcher bw = new BuildWatcher();
63+
64+
@BeforeClass
65+
public static void setupStatics() {
66+
LoadStatistics.CLOCK = 100; // 100ms for the LoadStatistics so we have quick provisioning
67+
// deadConnectionsShouldReLaunched seems to fail always when run as part of the suite - but passes always when run individually without the binary wrapper
68+
// (because the ping command and its parent cmd.exe get killed :-o )
69+
WindowsBatchScript.USE_BINARY_WRAPPER=true;
70+
BourneShellScript.USE_BINARY_WRAPPER=true;
71+
}
72+
73+
@Test
74+
public void withoutRestartNodesAreCleaned() throws Exception {
75+
MySimpleCloud sc = new MySimpleCloud("simples", temp.newFolder());
76+
77+
jr.jenkins.clouds.add(sc);
78+
jr.jenkins.setNumExecutors(0);
79+
//jr.jenkins.getLabel("simples").nodeProvisioner. = new ImmediateNodeProvisioner();
80+
81+
WorkflowJob foo = jr.jenkins.createProject(WorkflowJob.class, "foo");
82+
foo.setDefinition(new CpsFlowDefinition(String.join(System.lineSeparator(),
83+
"node('simples') {",
84+
Functions.isWindows() ? " bat 'echo hello'" : " sh 'echo hello'",
85+
"}"),
86+
true));
87+
WorkflowRun run = foo.scheduleBuild2(0).waitForStart();
88+
89+
jr.waitForCompletion(run);
90+
jr.assertBuildStatusSuccess(run);
91+
92+
// agent should have been removed and we should have no agents but this happens on a different Thread..
93+
await().atMost(Duration.ofSeconds(5)).until(() -> jr.jenkins.getNodes().isEmpty());
94+
assertThat(jr.jenkins.getNodes(), is(empty()));
95+
assertThat(jr.jenkins.getComputers(), arrayWithSize(1)); // the Jenkins computer itself
96+
}
97+
98+
99+
@Test
100+
@Issue("JENKINS-69277")
101+
public void deadConnectionsShouldReLaunched() throws Exception {
102+
MySimpleCloud sc = new MySimpleCloud("simples", temp.newFolder());
103+
104+
jr.jenkins.clouds.add(sc);
105+
jr.jenkins.setNumExecutors(0);
106+
//jr.jenkins.getLabel("simples").nodeProvisioner. = new ImmediateNodeProvisioner();
107+
108+
WorkflowJob foo = jr.jenkins.createProject(WorkflowJob.class, "foo");
109+
foo.setDefinition(new CpsFlowDefinition(String.join(System.lineSeparator(),
110+
"node('simples') {",
111+
Functions.isWindows() ? " bat 'echo hello & ping -w 1000 -n 30 localhost'"
112+
: " sh 'echo hello & sleep 30'",
113+
"}"),
114+
true));
115+
WorkflowRun run = foo.scheduleBuild2(0).waitForStart();
116+
LOGGER.log(Level.FINE, "waiting for hello");
117+
118+
jr.waitForMessage("hello", run);
119+
LOGGER.log(Level.FINE, "killing da agent");
120+
121+
killAgent();
122+
// retention strategy will kick in but may take up to 60 seconds.
123+
jr.waitForCompletion(run);
124+
jr.assertBuildStatusSuccess(run);
125+
126+
// agent should have been removed and we should have no agents but this happens on a different Thread..
127+
await().atMost(Duration.ofSeconds(5)).until(() -> jr.jenkins.getNodes().isEmpty());
128+
assertThat(jr.jenkins.getNodes(), is(empty()));
129+
assertThat(jr.jenkins.getComputers(), arrayWithSize(1)); // the Jenkins computer itself
130+
}
131+
132+
private void killAgent() throws IOException, InterruptedException {
133+
Computer[] computers = Jenkins.get().getComputers();
134+
for (Computer c : computers) {
135+
if (c instanceof MySimpleCloudComputer) {
136+
LOGGER.log(Level.FINE, "Asking {0} to commit suicide", c);
137+
try {
138+
RemotingDiagnostics.executeGroovy("Runtime.getRuntime().halt(1)", c.getChannel());
139+
} catch (RequestAbortedException ignored) {
140+
// we have just asked the Computer to commit suicide so this is expected.
141+
}
142+
}
143+
}
144+
}
145+
146+
public static class MySimpleCloud extends Cloud {
147+
148+
private final LabelAtom label;
149+
private final File agentRootDir;
150+
private int count = 0;
151+
152+
public MySimpleCloud(String name, File agentRootDir) {
153+
super(name);
154+
this.agentRootDir = agentRootDir;
155+
label = Jenkins.get().getLabelAtom(name);
156+
}
157+
158+
@Override
159+
public boolean canProvision(CloudState state) {
160+
boolean retVal = state.getLabel().matches(Collections.singleton(label));
161+
LOGGER.log(Level.FINE, "SimpleCloud.canProvision({0},{1}) -> {2}", new Object[] {state.getLabel(), state.getAdditionalPlannedCapacity(), retVal});
162+
return retVal;
163+
}
164+
165+
@Override
166+
public Collection<PlannedNode> provision(CloudState state, int excessWorkload) {
167+
LOGGER.log(Level.FINE, "SimpleCloud.provision(({0}, {1}), {2}", new Object[] {state.getLabel(), state.getAdditionalPlannedCapacity(), excessWorkload});
168+
Collection<PlannedNode> retVal = new HashSet<PlannedNode>();
169+
for (int i =0; i < excessWorkload - state.getAdditionalPlannedCapacity(); ++i) {
170+
String agentName;
171+
synchronized (this) {
172+
if (count != 0) {
173+
// sometimes we end up with 2 agents due to the lovely cloud API.
174+
LOGGER.log(Level.FINE, "not provisioning another agent as we have already provisioned.");
175+
return Collections.emptyList();
176+
}
177+
agentName = "cloud-"+name+"-"+(++count);
178+
}
179+
180+
PlannedNode n = new PlannedNode(agentName,
181+
Computer.threadPoolForRemoting.submit( () -> new MySimpleCloudSlave(agentName, new File(agentRootDir, agentName), label)),
182+
1);
183+
LOGGER.log(Level.FINE, "SimpleCloud.provision() -> Added planned node for {0}", name);
184+
retVal.add(n);
185+
}
186+
return retVal;
187+
}
188+
}
189+
190+
public static class MySimpleCloudSlave extends AbstractCloudSlave {
191+
192+
private final LabelAtom label;
193+
194+
public MySimpleCloudSlave(String name, File remoteFS, LabelAtom label) throws FormException, IOException {
195+
super(name,
196+
remoteFS.getAbsolutePath(),
197+
new SimpleCommandLauncher("java -Xmx512m -Djava.awt.headless=true -jar " + Which.jarFile(hudson.remoting.Launcher.class) + " -jar-cache " + remoteFS + " -workDir " + remoteFS));
198+
this.setRetentionStrategy(new OnceRetentionStrategy(1));
199+
LOGGER.log(Level.FINE, "SimpleCloudSlave()");
200+
this.label = label;
201+
}
202+
203+
@Override
204+
public AbstractCloudComputer createComputer() {
205+
LOGGER.log(Level.FINE, "SimpleCloudSlave.createComputer()");
206+
return new MySimpleCloudComputer(this);
207+
}
208+
209+
@Override
210+
protected void _terminate(TaskListener listener) throws IOException, InterruptedException {
211+
// nothing to do here
212+
}
213+
214+
@Override
215+
public Set<LabelAtom> getAssignedLabels() {
216+
return new HashSet<LabelAtom>(Arrays.asList(label, getSelfLabel()));
217+
}
218+
}
219+
220+
public static class MySimpleCloudComputer extends AbstractCloudComputer<MySimpleCloudSlave> {
221+
222+
public MySimpleCloudComputer(MySimpleCloudSlave slave) {
223+
super(slave);
224+
}
225+
}
226+
227+
}

0 commit comments

Comments
 (0)