Skip to content

Commit 25b5415

Browse files
committed
Debug dc5 deployment process
1 parent 8142605 commit 25b5415

File tree

5 files changed

+63
-46
lines changed

5 files changed

+63
-46
lines changed

building/build-debs/homeworld-admin-tools/debian/changelog

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
homeworld-admin-tools (0.1.17) stretch; urgency=medium
2+
3+
* Updated debian release
4+
5+
-- Cel Skeggs <[email protected]> Tue, 10 Oct 2017 04:55:00 -0400
6+
7+
homeworld-admin-tools (0.1.16) stretch; urgency=medium
8+
9+
* Updated debian release
10+
11+
-- Cel Skeggs <[email protected]> Tue, 10 Oct 2017 03:05:00 -0400
12+
113
homeworld-admin-tools (0.1.15) stretch; urgency=medium
214

315
* Updated debian release

building/build-debs/homeworld-admin-tools/resources/clustered/flannel.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ spec:
4545
serviceAccountName: flannel
4646
containers:
4747
- name: kube-flannel
48-
image: homeworld.mit.edu/flannel:0.8.0-2
48+
image: homeworld.mit.edu/flannel:0.8.0-4
4949
command: [ "/usr/bin/flanneld", "--ip-masq", "--kube-subnet-mgr" ]
5050
securityContext:
5151
privileged: true

building/build-debs/homeworld-admin-tools/src/verify.py

Lines changed: 43 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import query
2+
import time
23
import threading
34
import tempfile
45
import os
@@ -92,15 +93,15 @@ def check_ssh_with_certs(hostname=None):
9293
result = subprocess.check_output(["ssh", "-i", keypath, "root@%s.%s" % (hostname, config.external_domain), "echo confirmed"], env=env)
9394
except subprocess.CalledProcessError as e:
9495
command.fail("ssh check failed: %s" % e)
95-
if result != "confirmed":
96-
command.fail("unexpected result from ssh check")
96+
if result != b"confirmed\n":
97+
command.fail("unexpected result from ssh check: '%s'" % result.decode())
9798
print("ssh access confirmed!")
9899

99100

100101
def check_etcd_health():
101102
config = configuration.Config.load_from_project()
102103
result = access.call_etcdctl(["cluster-health"], return_result=True)
103-
lines = result.strip().split("\n")
104+
lines = result.strip().decode().split("\n")
104105
if lines.pop() != "cluster is healthy":
105106
command.fail("cluster did not report as healthy!")
106107
member_ids = []
@@ -118,7 +119,7 @@ def check_etcd_health():
118119
command.fail("did not find expected healthy result info for: %s" % server_name)
119120
member_ids.append(line.split(" ")[1])
120121

121-
result = access.call_etcdctl(["member", "list"], return_result=True)
122+
result = access.call_etcdctl(["member", "list"], return_result=True).decode()
122123
found_member_ids = []
123124
servers = []
124125
leader_count = 0
@@ -143,8 +144,8 @@ def check_etcd_health():
143144
if leader_count != 1:
144145
command.fail("wrong number of leaders")
145146

146-
if sorted(servers) != sorted(node.hostname for node in config.nodes):
147-
command.fail("invalid detected set of servers")
147+
if sorted(servers) != sorted(node.hostname for node in config.nodes if node.kind == "master"):
148+
command.fail("invalid detected set of servers: %s" % servers)
148149

149150
if member_ids != found_member_ids:
150151
command.fail("member id list mismatch")
@@ -154,11 +155,11 @@ def check_etcd_health():
154155

155156
def get_kubectl_json(*params: str):
156157
raw = access.call_kubectl(list(params) + ["-o", "json"], return_result=True)
157-
return json.loads(raw)
158+
return json.loads(raw.decode())
158159

159160

160161
def check_kube_health():
161-
expected_kubernetes_version = "v1.7.2"
162+
expected_kubernetes_version = "v1.8.0"
162163
config = configuration.Config.load_from_project()
163164

164165
# verify nodes
@@ -178,16 +179,16 @@ def check_kube_health():
178179
nodeID = node["spec"]["externalID"]
179180
if nodeID not in nodes_remaining:
180181
command.fail("invalid or duplicate node: %s" % nodeID)
181-
node = nodes_remaining[nodeID]
182+
node_obj = nodes_remaining[nodeID]
182183
del nodes_remaining[nodeID]
183-
if node.kind == "master":
184+
if node_obj.kind == "master":
184185
if node["spec"].get("unschedulable", None) is not True:
185186
command.fail("expected master node to be unschedulable")
186187
else:
187-
assert node.kind == "worker"
188+
assert node_obj.kind == "worker"
188189
if node["spec"].get("unschedulable", None):
189190
command.fail("expected worker node to be schedulable")
190-
conditions = {condobj.type: condobj.status for condobj in node["status"]["conditions"]}
191+
conditions = {condobj["type"]: condobj["status"] for condobj in node["status"]["conditions"]}
191192
if conditions["DiskPressure"] != "False":
192193
command.fail("expected no disk pressure")
193194
if conditions["MemoryPressure"] != "False":
@@ -250,14 +251,14 @@ def check_aci_pull():
250251
container_command = "ping -c 1 8.8.8.8 && echo 'PING RESULT SUCCESS' || echo 'PING RESULT FAIL'"
251252
server_command = ["rkt", "run", "--pull-policy=update", "homeworld.mit.edu/debian", "--exec", "/bin/bash", "--", "-c",
252253
setup.escape_shell(container_command)]
253-
results = subprocess.check_output(["ssh", "root@%s.%s" % (worker, config.external_domain), "--"] + server_command)
254-
last_line = results.strip().split(b"\n")[-1]
254+
results = subprocess.check_output(["ssh", "root@%s.%s" % (worker.hostname, config.external_domain), "--"] + server_command)
255+
last_line = results.replace(b"\r\n",b"\n").replace(b"\0",b'').strip().split(b"\n")[-1]
255256
if b"PING RESULT FAIL" in last_line:
256257
if b"PING RESULT SUCCESS" in last_line:
257258
command.fail("should not have seen both success and failure markers in last line")
258259
command.fail("cluster network probably not up (could not ping 8.8.8.8)")
259260
elif b"PING RESULT SUCCESS" not in last_line:
260-
command.fail("container does not seem to have launched properly; container launches are likely broken")
261+
command.fail("container does not seem to have launched properly; container launches are likely broken (line = %s)" % repr(last_line))
261262
print("container seems to be launched, with the correct network!")
262263

263264

@@ -288,13 +289,11 @@ def check_flannel_kubeinfo():
288289
if pod["status"]["phase"] != "Running":
289290
command.fail("pod was not running: %s: %s" % (name, pod["status"]["phase"]))
290291

291-
conditions = {condobj.type: condobj.status for condobj in pod["status"]["conditions"]}
292+
conditions = {condobj["type"]: condobj["status"] for condobj in pod["status"]["conditions"]}
292293
if conditions["Initialized"] != "True":
293294
command.fail("pod not yet initialized")
294295
if conditions["Ready"] != "True":
295296
command.fail("pod not yet ready")
296-
if conditions["PodScheduled"] != "True":
297-
command.fail("pod not yet scheduled")
298297

299298
if len(pod["status"]["containerStatuses"]) != 1:
300299
command.fail("expected only one container")
@@ -330,41 +329,41 @@ def check_flannel_function():
330329

331330
def listen():
332331
try:
333-
container_command = "ip -o addr show dev eth0 to 172.18/16 primary && sleep 5"
334-
server_command = ["rkt", "run", "--net=rkt.kubernetes.io", "homeworld.mit.edu/debian", "--", "-c", container_command]
335-
cmd = ["ssh", "root@%s.%s" % (worker_listener, config.external_domain), "--"] + server_command
336-
with subprocess.Popen(cmd) as process:
337-
stdout, stderr = process.communicate(None, timeout=1)
338-
if stderr:
339-
command.fail("found data on stderr from trying to run ip addr: '%s'" % repr(stderr.decode()))
340-
if b"scope" not in stdout:
341-
command.fail("could not find scope line in ip addr output")
342-
parts = stdout.split(b" ")
343-
if b"inet" not in parts:
332+
container_command = "ip -o addr show dev eth0 to 172.18/16 primary && sleep 15"
333+
server_command = ["rkt", "run", "--net=rkt.kubernetes.io", "homeworld.mit.edu/debian", "--", "-c", setup.escape_shell(container_command)]
334+
cmd = ["ssh", "root@%s.%s" % (worker_listener.hostname, config.external_domain), "--"] + server_command
335+
with subprocess.Popen(cmd, stdout=subprocess.PIPE, bufsize=1, universal_newlines=True) as process:
336+
stdout = process.stdout.readline()
337+
if "scope" not in stdout:
338+
command.fail("could not find scope line in ip addr output (%s)" % repr(stdout))
339+
parts = stdout.split(" ")
340+
if "inet" not in parts:
344341
command.fail("could not find inet address in ip addr output")
345-
address = parts[parts.index(b"inet") + 1]
346-
if not address.endswith(b"/32"):
347-
command.fail("expected address that ended in /32, not '%s'" % address.decode())
348-
if address.count(b".") != 3:
349-
command.fail("expected valid IPv4 address")
350-
if not address.decode().replace(".", "").isdigit():
351-
command.fail("expected valid IPv4 address")
342+
address = parts[parts.index("inet") + 1]
343+
if not address.endswith("/24"):
344+
command.fail("expected address that ended in /24, not '%s'" % address)
345+
address = address[:-3]
346+
if address.count(".") != 3:
347+
command.fail("expected valid IPv4 address, not '%s'" % address)
348+
if not address.replace(".", "").isdigit():
349+
command.fail("expected valid IPv4 address, not '%s'" % address)
352350
found_address[0] = address
353351
event.set()
352+
process.communicate(timeout=20)
354353
finally:
355354
event.set()
356355
return True
357356

358357
def talk():
359-
if not event.wait(5):
358+
if not event.wait(25):
360359
command.fail("timed out while waiting for IPv4 address of listener")
361360
address = found_address[0]
362361
if address is None:
363362
command.fail("no address was specified by listener")
364363
container_command = "ping -c 1 %s && echo 'PING RESULT SUCCESS' || echo 'PING RESULT FAIL'" % address
365364
server_command = ["rkt", "run", "homeworld.mit.edu/debian", "--exec", "/bin/bash", "--", "-c", setup.escape_shell(container_command)]
366-
results = subprocess.check_output(["ssh", "root@%s.%s" % (worker_talker, config.external_domain), "--"] + server_command)
367-
last_line = results.strip().split(b"\n")[-1]
365+
results = subprocess.check_output(["ssh", "root@%s.%s" % (worker_talker.hostname, config.external_domain), "--"] + server_command)
366+
last_line = results.replace(b"\r\n",b"\n").replace(b"\0",b'').strip().split(b"\n")[-1]
368367
if b"PING RESULT FAIL" in last_line:
369368
command.fail("was not able to ping the target container; is flannel working?")
370369
elif b"PING RESULT SUCCESS" not in last_line:
@@ -401,7 +400,7 @@ def check_dns_kubeinfo():
401400
if pod["status"]["phase"] != "Running":
402401
command.fail("pod was not running: %s: %s" % (name, pod["status"]["phase"]))
403402

404-
conditions = {condobj.type: condobj.status for condobj in pod["status"]["conditions"]}
403+
conditions = {condobj["type"]: condobj["status"] for condobj in pod["status"]["conditions"]}
405404
if conditions["Initialized"] != "True":
406405
command.fail("pod not yet initialized")
407406
if conditions["Ready"] != "True":
@@ -434,10 +433,10 @@ def check_dns_function():
434433

435434
container_command = "nslookup kubernetes.default.svc.hyades.local 172.28.0.2"
436435
server_command = ["rkt", "run", "homeworld.mit.edu/debian", "--exec", "/bin/bash", "--", "-c", setup.escape_shell(container_command)]
437-
results = subprocess.check_output(["ssh", "root@%s.%s" % (worker, config.external_domain), "--"] + server_command)
438-
last_line = results.strip().split(b"\n")[-1]
439-
if last_line != b"Address: 172.28.0.1":
440-
command.fail("unexpected last line: '%s'" % repr(last_line.decode()))
436+
results = subprocess.check_output(["ssh", "root@%s.%s" % (worker.hostname, config.external_domain), "--"] + server_command)
437+
last_line = results.replace(b"\r\n",b"\n").replace(b"\0",b'').strip().split(b"\n")[-1]
438+
if not last_line.endswith(b"Address: 172.28.0.1"):
439+
command.fail("unexpected last line: %s" % repr(last_line.decode()))
441440

442441
print("dns-addon seems to work!")
443442

building/build-debs/homeworld-keysystem/debian/changelog

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
homeworld-keysystem (0.1.11) stretch; urgency=medium
2+
3+
* Update debian version
4+
5+
-- Cel Skeggs <[email protected]> Tue, 10 Oct 2017 02:27:00 -0400
6+
17
homeworld-keysystem (0.1.10) stretch; urgency=medium
28

39
* Update debian version

docs/deploy.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ Deploy flannel into the cluster:
141141
Wait a bit for propagation... (if this doesn't work, keep trying for a bit)
142142

143143
$ spire verify flannel-run
144-
$ spire verify flannel-listen
144+
$ spire verify flannel-ping
145145

146146
## Core cluster service: dns-addon
147147

0 commit comments

Comments
 (0)