
Merge branch 'improvement/ZENKO-4879' into w/2.7/improvement/ZENKO-4879
francoisferrand committed Aug 30, 2024
2 parents 326d7e7 + 2911d08 commit 17c333a
Showing 12 changed files with 222 additions and 107 deletions.
2 changes: 2 additions & 0 deletions .github/scripts/end2end/install-kind-dependencies.sh
@@ -72,6 +72,8 @@ kubectl apply --validate=false -f https://github.com/jetstack/cert-manager/relea
# so if apply fails, replace can work
prom_url=https://raw.githubusercontent.com/coreos/prometheus-operator/${PROMETHEUS_VERSION}/bundle.yaml
kubectl create -f $prom_url || kubectl replace -f $prom_url --wait
# wait for the resource to exist
kubectl wait --for=condition=established --timeout=10m crd/alertmanagers.monitoring.coreos.com
envsubst < configs/prometheus.yaml | kubectl apply -f -

# zookeeper
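The kubectl wait added above closes a race: "kubectl create -f $prom_url" only registers the prometheus-operator CRDs, and the API server establishes them asynchronously, so applying configs/prometheus.yaml (which instantiates those CRDs) could fail when it ran first. A sketch of the same guard generalized to the operator's other CRDs — which CRDs this cluster actually needs is an assumption:

# wait until the API server serves each monitoring resource type
for crd in alertmanagers prometheuses prometheusrules servicemonitors; do
    kubectl wait --for=condition=established --timeout=10m "crd/${crd}.monitoring.coreos.com"
done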
15 changes: 15 additions & 0 deletions .github/workflows/alerts.yaml
@@ -63,3 +63,18 @@ jobs:
replicas=3
quorum=3
github_token: ${{ secrets.GIT_ACCESS_TOKEN }}

- name: Render and test dr alerts
uses: scality/[email protected]
with:
alert_file_path: monitoring/dr/alerts.yaml
test_file_path: monitoring/dr/alerts.test.yaml
alert_inputs: |
namespace=zenko
kafka_connect_src_job=artesca-data-dr-source-base-queue-connector-metrics
kafka_connect_sink_job=artesca-data-dr-base-queue-connector-metrics
dr_sink_instance=artesca-data-dr
rto_alert_threshold=30
mongo_jobs=zenko/data-db-mongodb-sharded-shard.*
lifecycle_jobs=artesca-data-backbeat-lifecycle-.*-headless
github_token: ${{ secrets.GIT_ACCESS_TOKEN }}
83 changes: 15 additions & 68 deletions .github/workflows/end2end.yaml
@@ -265,26 +265,7 @@ jobs:

- name: Extract environment
run: |-
sudo curl --fail -L https://github.com/mikefarah/yq/releases/download/${{ env.YQ_VERSION}}/${{ env.YQ_BINARY }} -o /usr/bin/yq
sudo chmod +x /usr/bin/yq
get_image_from_deps() {
local dep_name=$1
yq eval ".$dep_name | (.sourceRegistry // \"docker.io\") + \"/\" + .image" deps.yaml
}
cd solution/
cat <<EOF >> $GITHUB_ENV
scala_version=$(yq eval '.kafka.tag | split("-").[0]' deps.yaml)
kafka_version=$(yq eval '.kafka.tag | split("-").[1]' deps.yaml)
KAFKA_IMAGE=$(get_image_from_deps kafka)
KAFKA_TAG=$(yq eval '.kafka.tag' deps.yaml)
KAFKA_CONNECT_IMAGE=$(get_image_from_deps kafka-connect)
KAFKA_CONNECT_TAG=$(yq eval '.kafka-connect.tag' deps.yaml)
JMX_JAVAAGENT_IMAGE=$(get_image_from_deps jmx-javaagent)
JMX_JAVAAGENT_TAG=$(yq eval '.jmx-javaagent.tag' deps.yaml)
MONGODB_CONNECTOR_TAG=$(yq eval '.mongodb-connector.tag' deps.yaml)
EOF
solution/kafka_build_vars.sh >> $GITHUB_ENV
- name: Check kafka & kafka-connect versions match
run: |-
@@ -298,9 +279,9 @@ jobs:
build-args: |-
scala_version=${{ env.scala_version }}
kafka_version=${{ env.kafka_version }}
tags: "${{ env.KAFKA_IMAGE }}:${{ env.KAFKA_TAG }}"
cache-from: type=gha,scope=$GITHUB_REF_NAME-kafka
cache-to: type=gha,mode=max,scope=$GITHUB_REF_NAME-kafka
tags: "${{ env.KAFKA_IMAGE }}:${{ env.KAFKA_TAG }}-${{ env.BUILD_TREE_HASH }}"
cache-from: type=gha,scope=kafka-${{ env.KAFKA_TAG }}
cache-to: type=gha,mode=max,scope=kafka-${{ env.KAFKA_TAG }}

- name: Build and push kafka-connect
uses: docker/build-push-action@v5
@@ -311,11 +292,11 @@ jobs:
JMX_JAVAAGENT_IMAGE=${{ env.JMX_JAVAAGENT_IMAGE }}
JMX_JAVAAGENT_TAG=${{ env.JMX_JAVAAGENT_TAG }}
KAFKA_IMAGE=${{ env.KAFKA_IMAGE }}
KAFKA_TAG=${{ env.KAFKA_TAG }}
KAFKA_TAG=${{ env.KAFKA_TAG }}-${{ env.BUILD_TREE_HASH }}
MONGODB_CONNECTOR_TAG=${{ env.MONGODB_CONNECTOR_TAG }}
tags: "${{ env.KAFKA_CONNECT_IMAGE }}:${{ env.KAFKA_CONNECT_TAG }}"
cache-from: type=gha,scope=$GITHUB_REF_NAME-kafka-connect
cache-to: type=gha,mode=max,scope=$GITHUB_REF_NAME-kafka-connect
tags: "${{ env.KAFKA_CONNECT_IMAGE }}:${{ env.KAFKA_CONNECT_TAG }}-${{ env.BUILD_TREE_HASH }}"
cache-from: type=gha,scope=kafka-connect-${{ env.KAFKA_CONNECT_TAG }}
cache-to: type=gha,mode=max,scope=kafka-connect-${{ env.KAFKA_CONNECT_TAG }}

build-test-image:
runs-on: ubuntu-20.04
@@ -738,35 +719,7 @@ jobs:
source: /tmp/artifacts
if: always()

write-final-failed-status:
runs-on: ubuntu-latest
needs:
- check-dashboard-versions
- build-doc
- build-iso
- build-kafka
- build-test-image
- end2end-http
- end2end-https
- end2end-sharded
- end2end-pra
- ctst-end2end-sharded
if: failure()
steps:
- name: write failure status
run: |
mkdir -p artifacts
echo -n "FAILED" > artifacts/.final_status
- name: Upload artifacts
uses: scality/action-artifacts@v4
with:
method: upload
url: https://artifacts.scality.net
user: ${{ secrets.ARTIFACTS_USER }}
password: ${{ secrets.ARTIFACTS_PASSWORD }}
source: artifacts

write-final-success-status:
write-final-status:
runs-on: ubuntu-latest
needs:
- check-dashboard-versions
@@ -780,17 +733,11 @@ jobs:
- end2end-sharded
- end2end-pra
- ctst-end2end-sharded
if: success()
steps:
- name: write success status
run: |
mkdir -p artifacts
echo -n "SUCCESSFUL" > artifacts/.final_status
- name: Upload artifacts
uses: scality/action-artifacts@v4
- name: Upload final status
if: always()
uses: scality/actions/[email protected]
with:
method: upload
url: https://artifacts.scality.net
user: ${{ secrets.ARTIFACTS_USER }}
password: ${{ secrets.ARTIFACTS_PASSWORD }}
source: artifacts
ARTIFACTS_USER: ${{ secrets.ARTIFACTS_USER }}
ARTIFACTS_PASSWORD: ${{ secrets.ARTIFACTS_PASSWORD }}
JOBS_RESULTS: ${{ join(needs.*.result) }}
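The inline yq extraction removed above now lives in solution/kafka_build_vars.sh, which build.sh below also sources, so CI and the ISO build derive identical image names and tags. The script itself is not part of this diff; a plausible sketch, assuming it mirrors the removed logic and additionally emits BUILD_TREE_HASH (the derivation shown is an assumption):

#!/bin/bash
# Hypothetical reconstruction of solution/kafka_build_vars.sh -- not the committed script.
cd "$(dirname "$0")"

get_image_from_deps() {
    local dep_name=$1
    yq eval ".$dep_name | (.sourceRegistry // \"docker.io\") + \"/\" + .image" deps.yaml
}

# emit VAR=value lines: the workflow appends them to $GITHUB_ENV,
# and build.sh sources them directly
cat <<EOF
scala_version=$(yq eval '.kafka.tag | split("-").[0]' deps.yaml)
kafka_version=$(yq eval '.kafka.tag | split("-").[1]' deps.yaml)
KAFKA_IMAGE=$(get_image_from_deps kafka)
KAFKA_TAG=$(yq eval '.kafka.tag' deps.yaml)
KAFKA_CONNECT_IMAGE=$(get_image_from_deps kafka-connect)
KAFKA_CONNECT_TAG=$(yq eval '.kafka-connect.tag' deps.yaml)
JMX_JAVAAGENT_IMAGE=$(get_image_from_deps jmx-javaagent)
JMX_JAVAAGENT_TAG=$(yq eval '.jmx-javaagent.tag' deps.yaml)
MONGODB_CONNECTOR_TAG=$(yq eval '.mongodb-connector.tag' deps.yaml)
# assumption: a hash over the kafka build inputs, so CI tags change only when they do
BUILD_TREE_HASH=$(git rev-parse HEAD:solution)
EOF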
88 changes: 88 additions & 0 deletions monitoring/dr/alerts.test.yaml
@@ -0,0 +1,88 @@
# These tests are meant to go hand-in-hand with the rendered alert rule.
# Use the github.com/scality/action-prom-render-test@python-renderer Python module
#
# Render the alerts file with
# gen-alert render alerts.yaml
evaluation_interval: 1m
rule_files:
- alerts.rendered.yaml

tests:
- interval: 1m
input_series:
- series: up{job="artesca-data-dr-source-base-queue-connector-metrics", namespace="zenko"}
values: _x7
- series: up{drSinkInstance="artesca-data-dr", job="artesca-data-dr-base-queue-connector-metrics", namespace="zenko"}
values: 1x7
- series: kafka_connect_task_error_total_record_errors{job="artesca-data-dr-source-base-queue-connector-metrics"}
values: 0 1 2x5
- series: kafka_connect_task_error_total_record_failures{job="artesca-data-dr-source-base-queue-connector-metrics"}
values: 0x7
- series: kafka_connect_task_error_total_record_errors{drSinkInstance="artesca-data-dr", job="artesca-data-dr-base-queue-connector-metrics"}
values: 0 1 2x5
- series: kafka_connect_task_error_total_record_failures{drSinkInstance="artesca-data-dr", job="artesca-data-dr-base-queue-connector-metrics"}
values: 0 1x6
- series: s3_lifecycle_last_timestamp_ms{job="artesca-data-backbeat-lifecycle-.*-headless", namespace="zenko"}
values: 10000000x3
- series: mongodb_ss_repl_lastWrite_lastWriteDate{drSinkInstance="artesca-data-dr", job="zenko/data-db-mongodb-sharded-shard.*", namespace="zenko"}
values: 5000000x3
alert_rule_test:
- alertname: DrResourcePausedWhileOtherRunning
eval_time: 1m
exp_alerts:
- exp_labels:
severity: warning
exp_annotations:
summary: 'DR Resource Paused While Other Running'
description: 'One site''s DR resource is paused while the other is not. This could lead to data inconsistency between sites.'
- alertname: DrResourcePausedForTooLong
eval_time: 4m59s
exp_alerts: []
- alertname: DrResourcePausedForTooLong
eval_time: 5m
exp_alerts:
- exp_labels:
severity: "warning"
exp_annotations:
summary: 'DR resource paused for too long'
description: 'The DR resource has been paused for more than 5 minutes.'
- alertname: DrResourcePausedForTooLong
eval_time: 6m # Grace period check
exp_alerts:
- exp_labels:
severity: "warning"
exp_annotations:
summary: 'DR resource paused for too long'
description: 'The DR resource has been paused for more than 5 minutes.'
- alertname: KafkaConnectOutageSource
eval_time: 1m
exp_alerts: []
- alertname: KafkaConnectOutageSource
eval_time: 2m
exp_alerts:
- exp_labels:
severity: critical
exp_annotations:
description: >-
Kafka-connect on source is not working nominally. The rate of errors or failures has exceeded 0. This could lead DR to get out of sync if not addressed promptly.
summary: 'Kafka Connect not working'
- alertname: KafkaConnectOutageSink
eval_time: 1m
exp_alerts: []
- alertname: KafkaConnectOutageSink
eval_time: 2m
exp_alerts:
- exp_labels:
severity: critical
exp_annotations:
description: >-
Kafka-connect on sink is not working nominally. The rate of errors or failures has exceeded 0. This could lead DR to get out of sync if not addressed promptly.
summary: 'Kafka Connect not working'
- alertname: WriteTimesLatency
eval_time: 1m
exp_alerts:
- exp_labels:
severity: critical
exp_annotations:
summary: 'Write times latency'
description: 'The difference in write times between the source and protected sites is more than half of the Recovery Time Objective (12 hours). This could lead to data inconsistency between sites.'
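These unit tests run against the rendered rule file, so the ${...} placeholders in alerts.yaml must be substituted first. A sketch of the local workflow, assuming the scality/action-prom-render-test module named in the header installs a gen-alert CLI this way (promtool is the standard Prometheus rule tester):

pip install git+https://github.com/scality/action-prom-render-test@python-renderer  # assumption: installable from the branch
gen-alert render alerts.yaml            # writes alerts.rendered.yaml, listed in rule_files above
promtool test rules alerts.test.yaml    # replays the input_series and checks the alert expectations

On timing: with samples "0 1 2x5" at one-minute spacing, rate(...[2m]) turns positive at t=1m and the rules' "for: 1m" delays firing to t=2m, which is why the Kafka Connect outage tests expect no alert at 1m and a critical alert at 2m.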
12 changes: 6 additions & 6 deletions monitoring/dr/alerts.yaml
@@ -35,7 +35,7 @@ groups:
severity: warning
annotations:
summary: 'DR Resource Paused While Other Running'
description: 'One site s DR resource is paused while the other is not. This could lead to data inconsistency between sites.'
description: 'One site''s DR resource is paused while the other is not. This could lead to data inconsistency between sites.'

- alert: DrResourcePausedForTooLong
expr: |
@@ -49,9 +49,9 @@ groups:

- alert: KafkaConnectOutageSource
expr: |
sum(rate(kafka_connect_task_error_total_record_errors{drSinkInstance="", job="${kafka_connect_src_job}"}[$__rate_interval])) > 0
sum(rate(kafka_connect_task_error_total_record_errors{drSinkInstance="", job="${kafka_connect_src_job}"}[2m])) > 0
or
sum(rate(kafka_connect_task_error_total_record_failures{drSinkInstance="", job="${kafka_connect_src_job}"}[$__rate_interval])) > 0
sum(rate(kafka_connect_task_error_total_record_failures{drSinkInstance="", job="${kafka_connect_src_job}"}[2m])) > 0
for: 1m
labels:
severity: critical
@@ -62,15 +62,15 @@ groups:

- alert: KafkaConnectOutageSink
expr: |
sum(rate(kafka_connect_task_error_total_record_errors{drSinkInstance="${dr_sink_instance}", job="${kafka_connect_sink_job}"}[$__rate_interval])) > 0
sum(rate(kafka_connect_task_error_total_record_errors{drSinkInstance="${dr_sink_instance}", job="${kafka_connect_sink_job}"}[2m])) > 0
or
sum(rate(kafka_connect_task_error_total_record_failures{drSinkInstance="${dr_sink_instance}", job="${kafka_connect_sink_job}"}[$__rate_interval])) > 0
sum(rate(kafka_connect_task_error_total_record_failures{drSinkInstance="${dr_sink_instance}", job="${kafka_connect_sink_job}"}[2m])) > 0
for: 1m
labels:
severity: critical
annotations:
description: >-
Kafka-connect on sink is not working nominally. The rate of errors or failures has exceeded 0. This could lead to data loss if not addressed promptly.
Kafka-connect on sink is not working nominally. The rate of errors or failures has exceeded 0. This could lead DR to get out of sync if not addressed promptly.
summary: 'Kafka Connect not working'

- alert: WriteTimesLatency
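The $__rate_interval → [2m] change is the substantive fix in this file: $__rate_interval is a Grafana dashboard variable that nothing expands when the rendered rules are loaded into Prometheus, so the expressions were not valid PromQL. A check that catches this class of error early, assuming the rendered file name used by the tests:

promtool check rules alerts.rendered.yaml   # a leftover "[$__rate_interval]" fails here as an invalid duration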
42 changes: 19 additions & 23 deletions solution/build.sh
@@ -115,29 +115,6 @@ function copy_docker_image()
dir:${FULL_PATH}
}

function build_image()
{
local path="${1##*/}"
local image="$1:$2"
shift 2

local argumentNames
argumentNames="$(sed -n 's/ARG \([^=]*\).*/\1/p' "$path/Dockerfile" | sort -u)"

local -a buildArgs
readarray -t buildArgs < <(
{
yq eval '.[] | .envsubst + "=" + .tag' deps.yaml ;
yq eval '.[] | .envsubst + "=" + (.sourceRegistry // "docker.io") + "/" + .image' deps.yaml |
sed 's/_TAG=/_IMAGE=/g'
} | grep -F "$argumentNames" | sed 's/\(.*\)/--build-arg\n\1/'
)

# Work around bad expansion of empty array in bash 4.4- (c.f. https://stackoverflow.com/a/7577209)
docker build -t "$image" ${buildArgs[@]+"${buildArgs[@]}"} "$@" "$path/"
copy_docker_image "$image" 'docker-daemon:'
}

function copy_oci_image()
{
IMAGE_NAME=${1##*/}
@@ -323,12 +300,31 @@ function download_tools()
done
}

function retag()
{
local image=$1
local tag=$2
local suffix=$3
${DOCKER} image inspect "${image}:${tag}-${suffix}" > /dev/null 2>&1 || \
${DOCKER} ${DOCKER_OPTS} pull "${image}:${tag}-${suffix}"
${DOCKER} tag "${image}:${tag}-${suffix}" "${image}:${tag}"
}

function prepare_kafka_images()
(
source <( ${REPOSITORY_DIR}/solution/kafka_build_vars.sh )

retag "$KAFKA_IMAGE" "$KAFKA_TAG" "$BUILD_TREE_HASH"
retag "$KAFKA_CONNECT_IMAGE" "$KAFKA_CONNECT_TAG" "$BUILD_TREE_HASH"
)

# run everything in order
clean
mkdirs
download_tools
gen_manifest_yaml
copy_yamls
prepare_kafka_images
flatten_source_images | while read img ; do
# only pull if the image isn't already local
${DOCKER} image inspect ${img} > /dev/null 2>&1 || ${DOCKER} ${DOCKER_OPTS} pull ${img}
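The new retag helper bridges CI image naming back to the plain tags the manifests expect: end2end.yaml above pushes kafka images tagged <tag>-<build tree hash>, and the ISO build needs them as <tag>. An illustration with hypothetical values:

# hypothetical values, for illustration only
retag "ghcr.io/scality/kafka" "3.7.0" "abc1234"
# 1. pulls ghcr.io/scality/kafka:3.7.0-abc1234 unless it is already present locally
# 2. tags it locally as ghcr.io/scality/kafka:3.7.0 for the rest of the build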
10 changes: 5 additions & 5 deletions solution/deps.yaml
@@ -16,12 +16,12 @@ cloudserver:
sourceRegistry: ghcr.io/scality
dashboard: cloudserver/cloudserver-dashboards
image: cloudserver
tag: 8.8.30
tag: 8.8.31
envsubst: CLOUDSERVER_TAG
drctl:
sourceRegistry: ghcr.io/scality
image: zenko-drctl
tag: v1.0.3
tag: v1.0.4
envsubst: DRCTL_TAG
toolName: zenko-drctl
fubectl:
@@ -118,7 +118,7 @@ sorbet:
policy: sorbet/sorbet-policies
dashboard: sorbet/sorbet-dashboards
image: sorbet
tag: v1.1.11
tag: v1.1.12
envsubst: SORBET_TAG
stern: # tail any pod logs with pattern matching
tag: 1.30.0
@@ -136,12 +136,12 @@ vault:
dashboard: vault2/vault-dashboards
policy: vault2/vault-policies
image: vault2
tag: 8.8.8
tag: 8.8.9
envsubst: VAULT_TAG
zenko-operator:
sourceRegistry: ghcr.io/scality
image: zenko-operator
tag: 1.6.0
tag: 1.6.1
envsubst: ZENKO_OPERATOR_TAG
zenko-ui:
sourceRegistry: ghcr.io/scality
