Skip to content

Commit

Permalink
flakiness - rework the validate-purgelogs flow
Browse files Browse the repository at this point in the history
This change attempts to avoid failures such as the following:

```
2023-11-28 14:52:52.779734 | TASK [health-check/validate-purgelogs : Changing Logs modified time]
2023-11-28 09:52:53.432837 | controller | touch: cannot touch '/home/data/rsync/a8/a8dab0a4be04839ac4801e8dc4da995a50c250b0/post/config-update/e486495/zuul-info': No such file or directory
2023-11-28 09:52:53.434637 | controller | find: '/home/data/rsync/a8/a8dab0a4be04839ac4801e8dc4da995a50c250b0/post/config-update/e486495/zuul-info': No such file or directory
2023-11-28 09:52:53.436455 | controller | touch: cannot touch '/home/data/rsync/a8/a8dab0a4be04839ac4801e8dc4da995a50c250b0/post/config-update/e486495/zuul-manifest.json': No such file or directory
2023-11-28 09:52:53.436477 | controller | touch: cannot touch '/home/data/rsync/a8/a8dab0a4be04839ac4801e8dc4da995a50c250b0/post/config-update/e486495/job-output.txt.gz': No such file or directory
2023-11-28 09:52:53.438198 | controller | touch: cannot touch '/home/data/rsync/a8/a8dab0a4be04839ac4801e8dc4da995a50c250b0/post/config-update/e486495/job-output.json.gz': No such file or directory
2023-11-28 09:52:53.606151 | controller | command terminated with exit code 1
2023-11-28 14:52:53.821139 | controller | ERROR
2023-11-28 14:52:53.821494 | controller | {
2023-11-28 14:52:53.821568 | controller |   "delta": "0:00:00.524195",
2023-11-28 14:52:53.821620 | controller |   "end": "2023-11-28 09:52:53.620098",
2023-11-28 14:52:53.821668 | controller |   "msg": "non-zero return code",
2023-11-28 14:52:53.821716 | controller |   "rc": 1,
2023-11-28 14:52:53.821761 | controller |   "start": "2023-11-28 09:52:53.095903"
2023-11-28 14:52:53.821804 | controller | }
```

I don't know the internals of `find`, but my guess is that while it was
walking the directory to build the list of entries to `touch`, purgelogs
ran and purged some of those entries.

The idea here is to run the `find -exec` step before we restart purgelogs
with the new loop delay.

Change-Id: I8c0cbec2bb30c72345969d50df4cb0b7b55851e9
  • Loading branch information
morucci committed Nov 29, 2023
1 parent 6d51379 commit 1be7899
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 33 deletions.
2 changes: 1 addition & 1 deletion playbooks/files/sf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ spec:
executor:
logLevel: DEBUG
logserver:
loopDelay: 5
loopDelay: 3600
retentionDays: 30
storage:
size: 2Gi
Expand Down
2 changes: 1 addition & 1 deletion roles/health-check/validate-purgelogs/defaults/main.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
---
pod_label: "run=logserver"
pod_name: "logserver-0"
container_name: "logserver"
sshd_container_name: "logserver-sshd"
60 changes: 29 additions & 31 deletions roles/health-check/validate-purgelogs/tasks/main.yaml
Original file line number Diff line number Diff line change
@@ -1,58 +1,56 @@
- name: Change logserver/purgelogs loop delay
ansible.builtin.include_role:
name: "update-custom-resource"
vars:
cr_spec:
logserver:
loopDelay: 5

- name: Ensure job results exists in Logserver
ansible.builtin.include_role:
name: "health-check/ensure-job-result-artifacts"
vars:
log_url: "{{ zuul_config_update_build_log_url }}"

- name: Ensure logserver/purgelog restarted with correct loopDelay
command: >
kubectl get pods -l "{{ pod_label }}"
-o jsonpath='{range .items[*]}{range .spec.containers[*]}{.name}{" "}{.command}{"\n"}{end}{end}'
register: logserver_containers
until: logserver_containers is not failed and "5" in logserver_containers.stdout
delay: 5
retries: 6

- name: Get logserver pod name
- name: Create a local archive of the logserver content
ansible.builtin.shell: >
kubectl get pods -l "{{ pod_label }}"
--field-selector status.phase=Running --no-headers
-o custom-columns=":metadata.name"
register: _logserver_pod_name
# We wait until we get only one output line because even with the
# 'Running' phase selector we might get the previous pod (logserver replicaset is reconfigured
# two tasks above) in the command output.
until: _logserver_pod_name.stdout_lines | length == 1
kubectl exec {{ pod_name }} -c {{ container_name }} -- bash -c "cd /opt/rh/httpd24/root/var/www/logs/; tar -czf /tmp/logserver.tgz ."
- name: Save logserver content before changing the date
ansible.builtin.shell: >
kubectl exec {{ _logserver_pod_name.stdout }} -c {{ sshd_container_name }}
-- rsync -r /home/data/rsync/ /tmp/rsync-back/
kubectl cp {{ pod_name }}:/tmp/logserver.tgz -c {{ container_name }} /tmp/logserver.tgz
- name: Changing Logs modified time
ansible.builtin.shell: >
kubectl exec {{ _logserver_pod_name.stdout }} -c {{ sshd_container_name }}
kubectl exec {{ pod_name }} -c {{ sshd_container_name }}
-- find /home/data/rsync/ -mindepth 1 -exec touch --date="1970-01-01" {} \;
- name: Change logserver/purgelogs loop delay
ansible.builtin.include_role:
name: "update-custom-resource"
vars:
cr_spec:
logserver:
loopDelay: 5

- name: Ensure logserver/purgelog restarted with correct loopDelay
command: kubectl get pods {{ pod_name }} -o jsonpath="{.spec.containers[?(@.name=='purgelogs')].command}"
register: purgelogs_command
until: purgelogs_command is not failed and "5" in purgelogs_command.stdout
delay: 5
retries: 6

- name: Sleep for few seconds to let purgelogs purge old logs
ansible.builtin.wait_for:
timeout: 10

- name: Ensure job results do not exist in Logserver after purging
ansible.builtin.include_role:
name: "health-check/ensure-job-result-artifacts"
vars:
log_url: "{{ zuul_config_update_build_log_url }}"
status: 404

- name: Upload logserver backup content to pod
ansible.builtin.shell: >
kubectl cp /tmp/logserver.tgz {{ pod_name }}:/tmp/ -c {{ container_name }}
- name: Restore logserver content
ansible.builtin.shell: >
kubectl exec {{ _logserver_pod_name.stdout }} -c {{ sshd_container_name }}
-- rsync -r --delete /tmp/rsync-back/ /home/data/rsync/
kubectl exec {{ pod_name }} -c {{ container_name }} --
bash -c "mkdir -p /tmp/logs && tar -xvf /tmp/logserver.tgz -C /tmp/logs/ && cp -Rf /tmp/logs/* /opt/rh/httpd24/root/var/www/logs/"
- name: Ensure logserver content restoring worked
ansible.builtin.include_role:
Expand Down

0 comments on commit 1be7899

Please sign in to comment.