From effc31635953fc5da3435728f4ff94da133fd0dd Mon Sep 17 00:00:00 2001
From: Eduardo Benzecri
Date: Mon, 18 Mar 2024 19:32:43 -0300
Subject: [PATCH] scylla-ansible-roles: Adds example playbook "kernel_version_enforcer"

"kernel_version_enforcer" playbook allows the user to:
- Pin a specific kernel version (and ensure it will be picked in the next reboot) if required
- Upgrade kernel version to the latest available
- Purge all old kernel versions
- Upgrade all upgradable packages

Signed-off-by: Eduardo Benzecri
---
Notes (this section is not part of the commit message):

  kernel_version_enforcer.yml
      Entry point. Processes the "scylla" hosts one at a time (serial: 1),
      defines every tunable under "vars" and includes ubuntu/main.yml on
      hosts whose ansible_distribution is "Ubuntu".
  ubuntu/main.yml
      Installs or upgrades the kernel packages, toggles the dpkg hold on
      kernel_related_packages and includes the three task files below.
  ubuntu/grub.yml
      Finds the first non-recovery GRUB menuentry that mentions
      image_version and selects it for the next boot with grub-reboot.
  ubuntu/stop_reboot_start.yml
      Drains and stops Scylla (SIGKILL as a fallback), reboots, checks the
      running kernel release and waits until nodetool reports the node UN.
  ubuntu/kernel_enforce_cleanup.yml
      Purges the other kernel images, reconfigures the retained one and
      fails unless /boot/vmlinuz and /boot/initrd.img are symlinks to the
      files of final_image_version.

  The blob ids on the "index" lines were not regenerated; they are only
  informational for newly created files.

 .../kernel_version_enforcer.yml               |  34 +++++
 .../kernel_version_enforcer/ubuntu/grub.yml   |  30 +++++
 .../ubuntu/kernel_enforce_cleanup.yml         |  75 +++++++++++
 .../kernel_version_enforcer/ubuntu/main.yml   | 118 +++++++++++++++++
 .../ubuntu/stop_reboot_start.yml              | 122 ++++++++++++++++++
 5 files changed, 379 insertions(+)
 create mode 100644 example-playbooks/kernel_version_enforcer/kernel_version_enforcer.yml
 create mode 100644 example-playbooks/kernel_version_enforcer/ubuntu/grub.yml
 create mode 100644 example-playbooks/kernel_version_enforcer/ubuntu/kernel_enforce_cleanup.yml
 create mode 100644 example-playbooks/kernel_version_enforcer/ubuntu/main.yml
 create mode 100644 example-playbooks/kernel_version_enforcer/ubuntu/stop_reboot_start.yml

diff --git a/example-playbooks/kernel_version_enforcer/kernel_version_enforcer.yml b/example-playbooks/kernel_version_enforcer/kernel_version_enforcer.yml
new file mode 100644
index 00000000..0576c6a1
--- /dev/null
+++ b/example-playbooks/kernel_version_enforcer/kernel_version_enforcer.yml
@@ -0,0 +1,34 @@
+---
+
+- name: Kernel Version Enforcer
+  hosts: scylla
+  gather_facts: true
+  serial: 1
+  vars:
+    api_address: 127.0.0.1
+    api_delay: 10
+    api_port: 10000
+    api_retries: 360
+    api_timeout: 300
+    cql_timeout: 86400
+    reboot_timeout: 600
+    systemd_unit_retries: 5
+    systemd_unit_delay: 30
+    grub_config_file: /boot/grub/grub.cfg
+    image_package_prefix: linux-image
+    image_version: 5.15.0-1051-gcp
+    kernel_related_packages:
+      - linux-gcp
+      - linux-image-gcp
+      - linux-headers-gcp
+    pid_kill_delay: 12
+    pid_kill_retries: 5
+    pause_time: 15
+    pin_kernel_version: false
+    purge_older_images: false
+    upgrade_all_packages: false
+    upgrade_latest_kernel: false
+  tasks:
+    - name: Enforce kernel version for Ubuntu
+      ansible.builtin.include_tasks: ubuntu/main.yml
+      when: ansible_distribution == "Ubuntu"
diff --git a/example-playbooks/kernel_version_enforcer/ubuntu/grub.yml b/example-playbooks/kernel_version_enforcer/ubuntu/grub.yml
new file mode 100644
index 00000000..ae33e079
--- /dev/null
+++ b/example-playbooks/kernel_version_enforcer/ubuntu/grub.yml
@@ -0,0 +1,30 @@
+---
+
+- name: Get {{ grub_config_file }} metadata
+  ansible.builtin.stat:
+    path: "{{ grub_config_file }}"
+  register: grub_config
+
+- name: Fail if GRUB config file doesn't exist
+  ansible.builtin.fail:
+    msg: "{{ grub_config_file }} doesn't exist"
+  when: not grub_config.stat.exists
+
+- name: Get GRUB entries
+  ansible.builtin.command: grep -E "^\smenuentry" {{ grub_config_file }}
+  register: grub_entries
+
+- name: Get GRUB index for '{{ image_package_prefix }}-{{ image_version }}'
+  ansible.builtin.set_fact:
+    target_grub_index: "{{ grub_index }}"
+  when:
+    - image_version in item
+    - not "recovery mode" in item
+    - target_grub_index is not defined
+  loop: "{{ grub_entries.stdout_lines }}"
+  loop_control:
+    index_var: grub_index
+
+- name: Set index '1>{{ target_grub_index }}' to be used in the next reboot
+  ansible.builtin.command: grub-reboot "1>{{ target_grub_index }}"
+  become: true
diff --git a/example-playbooks/kernel_version_enforcer/ubuntu/kernel_enforce_cleanup.yml b/example-playbooks/kernel_version_enforcer/ubuntu/kernel_enforce_cleanup.yml
new file mode 100644
index 00000000..83656b2d
--- /dev/null
+++ b/example-playbooks/kernel_version_enforcer/ubuntu/kernel_enforce_cleanup.yml
@@ -0,0 +1,75 @@
+---
+
+- name: Purge all kernel images newer than '{{ final_image_version }}'
+  ansible.builtin.apt:
+    name: "{{ image_package_prefix }}-{{ item }}"
+    state: absent
+    purge: true
+  become: true
+  when: item is version(final_image_version, '>')
+  loop: "{{ vmlinuz_versions.stdout_lines }}"
+
+- name: Erase all kernel images related files newer than '{{ final_image_version }}'
+  ansible.builtin.shell: rm -f /boot/*-{{ item }}
+  become: true
+  when: item is version(final_image_version, '>')
+  loop: "{{ vmlinuz_versions.stdout_lines }}"
+
+- name: Purge all kernel images older than '{{ final_image_version }}'
+  ansible.builtin.apt:
+    name: "{{ image_package_prefix }}-{{ item }}"
+    state: absent
+    purge: true
+  become: true
+  when:
+    - purge_older_images
+    - item is version(final_image_version, '<')
+  loop: "{{ vmlinuz_versions.stdout_lines }}"
+
+- name: Erase all kernel images related files older than '{{ final_image_version }}'
+  ansible.builtin.shell: rm -f /boot/*-{{ item }}
+  become: true
+  when:
+    - purge_older_images
+    - item is version(final_image_version, '<')
+  loop: "{{ vmlinuz_versions.stdout_lines }}"
+
+- name: Remove useless packages from the cache
+  ansible.builtin.apt:
+    autoclean: true
+  become: true
+
+- name: Reconfigure '{{ image_package_prefix }}-{{ final_image_version }}' package
+  ansible.builtin.command: dpkg-reconfigure {{ image_package_prefix }}-{{ final_image_version }} -f noninteractive -p critical
+  become: true
+
+- name: Get /boot/vmlinuz metadata
+  ansible.builtin.stat:
+    path: /boot/vmlinuz
+  register: vmlinuz
+
+- name: Fail if /boot/vmlinuz is not a symbolic link of /boot/vmlinuz-{{ final_image_version }}
+  ansible.builtin.fail:
+    msg: "/boot/vmlinuz is not a symbolic link of /boot/vmlinuz-{{ final_image_version }}"
+  when: >-
+    not (vmlinuz.stat.islnk | default(false)) or
+    vmlinuz.stat.lnk_source != ('/boot/vmlinuz-' ~ final_image_version)
+
+- name: Get /boot/initrd.img metadata
+  ansible.builtin.stat:
+    path: /boot/initrd.img
+  register: initrd
+
+- name: Fail if /boot/initrd.img is not a symbolic link of /boot/initrd.img-{{ final_image_version }}
+  ansible.builtin.fail:
+    msg: "/boot/initrd.img is not a symbolic link of /boot/initrd.img-{{ final_image_version }}"
+  when: >-
+    not (initrd.stat.islnk | default(false)) or
+    initrd.stat.lnk_source != ('/boot/initrd.img-' ~ final_image_version)
+
+- name: Clean all non-required packages
+  ansible.builtin.apt:
+    autoclean: true
+    autoremove: true
+    force_apt_get: true
+  become: true
diff --git a/example-playbooks/kernel_version_enforcer/ubuntu/main.yml b/example-playbooks/kernel_version_enforcer/ubuntu/main.yml
new file mode 100644
index 00000000..562b601c
--- /dev/null
+++ b/example-playbooks/kernel_version_enforcer/ubuntu/main.yml
@@ -0,0 +1,118 @@
+---
+
+- name: Get current kernel image version
+  ansible.builtin.command: uname --kernel-release
+  register: uname_pre_output
+
+- name: Save kernel image version
+  ansible.builtin.set_fact:
+    detected_image_version: "{{ uname_pre_output.stdout_lines | first }}"
+
+- name: Define if the kernel image should be installed
+  ansible.builtin.set_fact:
+    kernel_image_required: "{{ image_version is version(detected_image_version, 'ne') or upgrade_latest_kernel }}"
+
+- name: Mark to unhold kernel-related packages
+  ansible.builtin.dpkg_selections:
+    name: "{{ item }}"
+    selection: install
+  loop: "{{ kernel_related_packages }}"
+  become: true
+  when: kernel_image_required
+
+- name: Ensure kernel image '{{ image_package_prefix }}-{{ image_version }}' is installed
+  ansible.builtin.apt:
+    name: "{{ image_package_prefix }}-{{ image_version }}"
+    state: present
+  become: true
+  when:
+    - kernel_image_required
+    - not upgrade_latest_kernel
+
+- name: Upgrade kernel-related packages to the latest version available
+  ansible.builtin.apt:
+    name: "{{ item }}"
+    state: latest
+    update_cache: true
+    autoclean: true
+    autoremove: true
+    force_apt_get: true
+  loop: "{{ kernel_related_packages }}"
+  become: true
+  when: upgrade_latest_kernel
+
+- name: Mark to hold kernel-related packages
+  ansible.builtin.dpkg_selections:
+    name: "{{ item }}"
+    selection: hold
+  loop: "{{ kernel_related_packages }}"
+  become: true
+  when: pin_kernel_version
+
+- name: Upgrade all upgradable packages
+  ansible.builtin.apt:
+    name: "*"
+    state: latest
+    update_cache: true
+    autoclean: true
+    autoremove: true
+    force_apt_get: true
+  become: true
+  when: upgrade_all_packages
+
+- name: Get all vmlinuz files available
+  ansible.builtin.shell: ls /boot/vmlinuz-* | sed 's/\/boot\/vmlinuz-*//'
+  register: vmlinuz_versions
+
+- name: Define if reconfiguration is required due to the presence of several vmlinuz files
+  ansible.builtin.set_fact:
+    reconfiguration_required: "{{ vmlinuz_versions.stdout_lines | length > 1 }}"
+
+- name: Mark to unhold kernel-related packages
+  ansible.builtin.dpkg_selections:
+    name: "{{ item }}"
+    selection: install
+  loop: "{{ kernel_related_packages }}"
+  become: true
+  when: reconfiguration_required
+
+- name: Prepare GRUB modifications
+  ansible.builtin.include_tasks: grub.yml
+  when:
+    - reconfiguration_required
+    - not upgrade_latest_kernel
+
+- name: Stop, reboot and start each node (if required)
+  ansible.builtin.include_tasks: stop_reboot_start.yml
+  when: reconfiguration_required
+
+- name: Set final kernel image version if '{{ image_version }}' was installed
+  ansible.builtin.set_fact:
+    final_image_version: "{{ image_version }}"
+  when: not upgrade_latest_kernel
+
+- name: Set final kernel image version if the latest one was installed
+  ansible.builtin.set_fact:
+    final_image_version: "{{ target_image_version }}"
+  when:
+    - reconfiguration_required
+    - upgrade_latest_kernel
+
+- name: Enforce kernel version '{{ final_image_version }}' usage
+  ansible.builtin.include_tasks: kernel_enforce_cleanup.yml
+  when: reconfiguration_required
+
+- name: Mark to hold kernel-related packages
+  ansible.builtin.dpkg_selections:
+    name: "{{ item }}"
+    selection: hold
+  loop: "{{ kernel_related_packages }}"
+  become: true
+  when:
+    - pin_kernel_version
+    - reconfiguration_required
+
+- name: Make a pause of {{ pause_time }} seconds
+  ansible.builtin.wait_for:
+    timeout: "{{ pause_time | int }}"
+  when: kernel_image_required
diff --git a/example-playbooks/kernel_version_enforcer/ubuntu/stop_reboot_start.yml b/example-playbooks/kernel_version_enforcer/ubuntu/stop_reboot_start.yml
new file mode 100644
index 00000000..8c64648c
--- /dev/null
+++ b/example-playbooks/kernel_version_enforcer/ubuntu/stop_reboot_start.yml
@@ -0,0 +1,122 @@
+---
+- name: Populate service facts
+  ansible.builtin.service_facts:
+
+- name: Check if Scylla is installed
+  ansible.builtin.set_fact:
+    scylla_installation: "{{ ansible_facts.services['scylla-server.service'] is defined }}"
+
+- name: Stop Scylla
+  block:
+    - name: Mask scylla-server service
+      ansible.builtin.systemd:
+        name: scylla-server
+        masked: true
+      become: true
+
+    - name: Drain node
+      ansible.builtin.uri:
+        url: "http://{{ api_address }}:{{ api_port }}/storage_service/drain"
+        method: POST
+      retries: "{{ api_retries }}"
+      delay: "{{ api_delay }}"
+      timeout: "{{ api_timeout }}"
+
+    - name: Check if the node is fully drained
+      ansible.builtin.uri:
+        url: "http://{{ api_address }}:{{ api_port }}/storage_service/operation_mode"
+        method: GET
+      retries: "{{ api_retries }}"
+      delay: "{{ api_delay }}"
+      timeout: "{{ api_timeout }}"
+      register: node_drain_status
+      until: "'DRAINED' in (node_drain_status.json | default(''))"
+
+    - name: Stop scylla-manager-agent service (if exists)
+      ansible.builtin.systemd:
+        name: scylla-manager-agent
+        enabled: true
+        state: stopped
+      become: true
+      when: ansible_facts.services['scylla-manager-agent.service'] is defined
+
+    - name: Stop scylla-server service
+      ansible.builtin.systemd:
+        name: scylla-server
+        state: stopped
+      become: true
+      when: "'DRAINED' in node_drain_status.json"
+  rescue:
+    - name: Send a SIGKILL to Scylla PID
+      ansible.builtin.shell: kill -9 $(pidof scylla)
+      register: scylla_kill_pid
+      retries: "{{ pid_kill_retries }}"
+      delay: "{{ pid_kill_delay }}"
+      until: scylla_kill_pid.rc == 2
+      failed_when: scylla_kill_pid.rc != 2
+      become: true
+  always:
+    - name: Unmask scylla-server service
+      ansible.builtin.systemd:
+        name: scylla-server
+        masked: false
+      become: true
+  when:
+    - scylla_installation
+    - kernel_image_required
+
+- name: Reboot and post-reboot checks
+  block:
+    - name: Reboot the node
+      ansible.builtin.reboot:
+        reboot_timeout: "{{ reboot_timeout }}"
+      become: true
+
+    - name: Get current kernel image version
+      ansible.builtin.command: uname --kernel-release
+      register: uname_post_output
+
+    - name: Save kernel image version
+      ansible.builtin.set_fact:
+        target_image_version: "{{ uname_post_output.stdout_lines | first }}"
+
+    - name: Fail if kernel image version '{{ image_version }}' is not currently in use
+      ansible.builtin.fail:
+        msg: "'{{ image_version }}' is not currently used"
+      when:
+        - target_image_version is version(image_version, 'ne')
+        - not upgrade_latest_kernel
+  when: kernel_image_required
+
+- name: Start Scylla
+  block:
+    - name: Get listen address
+      ansible.builtin.shell: grep '^listen_address:' /etc/scylla/scylla.yaml | awk '{ print $2 }'
+      register: listen_address
+
+    - name: Start scylla-server service
+      ansible.builtin.systemd:
+        name: scylla-server
+        state: started
+      retries: "{{ systemd_unit_retries }}"
+      delay: "{{ systemd_unit_delay }}"
+      become: true
+      when:
+        - ansible_facts.services['scylla-server.service'] is defined
+        - ansible_facts.services['scylla-server.service'].status == "disabled"
+
+    - name: Wait for CQL port on {{ listen_address.stdout }}
+      ansible.builtin.wait_for:
+        port: 9042
+        host: "{{ listen_address.stdout }}"
+        timeout: "{{ cql_timeout }}"
+
+    - name: Wait for the cluster to become healthy
+      ansible.builtin.shell: nodetool status | grep "{{ listen_address.stdout }}" | grep '^UN'
+      register: node_status
+      until: node_status.rc == 0
+      retries: "{{ api_retries }}"
+      delay: "{{ api_delay }}"
+  when:
+    - scylla_installation
+    - kernel_image_required