Skip to content

Commit

Permalink
Build and host algorithm images on GitHub (base images) (#20)
Browse files Browse the repository at this point in the history
* wip: first version of PR test workflow

* feat: adapt first algorithm to new infrastructure

* fix: job generation script

* feat: adapt second algorithm to new infrastructure

* fix: python version


* feat: allow matrix generation script to run in different contexts (folders)

* refactor: split up base images and intermediate images; rename folders

* feat: adjust workflow to build images in order

* fix: build matrix computation script

* test empty matrix

* feat: prepare image publishing and adapt docker images

* chore: restore lof and sublof algorithms and use kmeans; also add licenses to base images

* feat: adjust r base image

* chore: cleanup workflow definition

* refactor: revert changes to intermediate images and algos (later PR)

* feat: test image push

* feat: test image push again

* fix: image license information

* feat: fix version information in image labels and finish PR
  • Loading branch information
CodeLionX authored Nov 20, 2023
1 parent 60814e3 commit 96af897
Show file tree
Hide file tree
Showing 25 changed files with 563 additions and 105 deletions.
96 changes: 96 additions & 0 deletions .ci/check_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# CI helper: validate the output files produced by a TimeEval algorithm
# Docker image after a test run.
import json
import sys

from pathlib import Path

# Result files that the algorithm container is expected to write.
MODEL_FILEPATH = Path("./results/model.pkl")
SCORES_FILEPATH = Path("./results/scores.csv")


def parse_manifest(algorithm: str) -> dict:
    """Load and return the algorithm's ``manifest.json`` as a dict.

    The manifest is looked up relative to the current working directory
    under ``./<algorithm>/manifest.json``.
    """
    path = Path(".") / algorithm / "manifest.json"
    with path.open("r") as f:
        return json.load(f)


def is_readable(filename: Path) -> bool:
    """Return True iff *filename* is owned by uid 1000 and gid 1000.

    NOTE(review): despite the name, this checks file ownership (presumably
    the TimeEval container user), not read permission — confirm intent.
    """
    info = filename.stat()
    return (info.st_uid, info.st_gid) == (1000, 1000)


def has_postprocessing(algorithm: str) -> bool:
    """Return True if the algorithm's README.md contains a TimeEval
    post-processing marker (begin or end tag); False if no README exists
    or no marker is found.
    """
    readme_path = Path(".") / algorithm / "README.md"
    if not readme_path.exists():
        return False

    with readme_path.open("r") as fh:
        content = fh.readlines()

    markers = ("<!--BEGIN:timeeval-post-->", "<!--END:timeeval-post-->")
    for line in content:
        if any(marker in line for marker in markers):
            return True
    return False


def main(algorithm):
    """Validate the output files of a test run of *algorithm*.

    Checks ownership of the produced model/scoring files and the format
    and length of the anomaly scoring (``scores.csv``). All problems are
    printed to stderr; exits with status 1 if any problem was found.
    """
    manifest = parse_manifest(algorithm)
    errors = []

    if manifest["learningType"].lower() in ["supervised", "semi-supervised"]:
        # check model.pkl: only (semi-)supervised algorithms write a model
        if not is_readable(MODEL_FILEPATH):
            errors.append("Model file was written with the wrong user and/or group. Do you use a TimeEval base image?")

    # check scores.csv
    if not is_readable(SCORES_FILEPATH):
        errors.append("Scoring was written with the wrong user and/or group. Do you use a TimeEval base image?")

    with SCORES_FILEPATH.open("r") as fh:
        lines = fh.readlines()

    # if not post-processing, check length
    if has_postprocessing(algorithm):
        print("Skipping scoring (scores.csv) check, because algorithm uses post-processing!")
    else:
        # only a single column/dimension:
        if any("," in line for line in lines):
            errors.append("Scoring contains multiple dimensions (found a ',' in the file). "
                          "Only a single anomaly score is allowed per time step!")

        # there should be no header
        try:
            float(lines[0])
        except ValueError as e:
            errors.append(f"No header allowed for the scoring file! First value is not a number! {e}")

        # same length as dataset
        if manifest["inputDimensionality"].lower() == "univariate":
            data_path = Path("./data/dataset.csv")
        else:
            data_path = Path("./data/multi-dataset.csv")

        # count data lines without loading the whole file into memory
        n_data = 0
        with data_path.open("r") as fh:
            for _ in fh:
                n_data += 1
        # subtract header
        n_data -= 1

        if len(lines) != n_data:
            errors.append("Scoring has wrong length; each input time step needs an anomaly score "
                          f"(expected={n_data}, found={len(lines)})!")

    for error in errors:
        print(error, file=sys.stderr)

    if len(errors) > 0:
        # use sys.exit instead of the site-injected exit() builtin, which is
        # not guaranteed to be available in all execution contexts
        sys.exit(1)


if __name__ == "__main__":
    args = sys.argv
    # expect exactly one positional argument: the algorithm folder name
    if len(args) != 2:
        # fix: typo "spacify" -> "specify" (matches get_dataset_name.py)
        raise ValueError("You have to specify an algorithm name (directory / docker image name)!")

    main(args[1])
58 changes: 58 additions & 0 deletions .ci/generate-build-matrix.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Generate a JSON build matrix of changed algorithm folders for GitHub
# Actions. The matrix is printed to stdout; all diagnostics go to stderr.

set -e

default_branch=main
folder="${1:-.}"
# files/folders that are never algorithm images and must not trigger a build
ignore_pattern="0-base-images|1-intermediate-images|2-scripts|data|results|Dockerfile|README.md|\..*|.*\.py|.*\.yml|.*\.sh|.*\.png"
changes_in_basedir=""

# print all arguments to stderr (stdout is reserved for the JSON matrix)
function echoerr () {
    echo "$@" >&2
}

# Relevant variables set by GitHub Actions:
# GITHUB_EVENT_NAME=pull_request
# GITHUB_BASE_REF=PR target branch (probably default branch; only set for PRs)
# GITHUB_HEAD_REF=PR source branch
# GITHUB_REF=refs/pull/<pr_number>/merge
# GITHUB_REF_TYPE=tag or branch
# GITHUB_REF_NAME=branch or tag name of the pushed ref
# RUNNER_ARCH=X86, X64, ARM, or ARM64
# RUNNER_OS=Linux, Windows, or macOS

# if this is a workflow for a PR targeting the default branch
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "$GITHUB_BASE_REF" == "$default_branch" ]]; then
  # build diff to main
  echoerr "Detected pipeline for a non-default branch (assuming pull request with target $GITHUB_BASE_REF)"
  git fetch origin || echoerr "Could not update remote 'origin'! Repository might be out of date."
  changes_in_basedir=$( git diff --name-only "refs/remotes/origin/$GITHUB_BASE_REF..HEAD" -- "$folder" | sed "s#${folder//\./\\.}/##" | cut -d '/' -f 1 )
  #changes_in_basedir=$( git diff --name-only "$GITHUB_BASE_REF..HEAD" | cut -d '/' -f 1 )

# if this is a workflow for the default branch
# fix: GITHUB_BASE_REF is only set for pull_request events, so it can never
# match on a push; use GITHUB_REF_NAME to detect a push to the default branch
elif [[ "$GITHUB_EVENT_NAME" == "push" ]] && [[ "$GITHUB_REF_NAME" == "$default_branch" ]]; then
  # build latest commit for the default branch
  echoerr "Detected pipeline for default branch"
  #changes_in_basedir=$( git diff --name-only "$CI_COMMIT_BEFORE_SHA..$CI_COMMIT_SHA" )
  changes_in_basedir=$( git diff --name-only HEAD~1..HEAD -- "$folder" | sed "s#${folder//\./\\.}/##" | cut -d '/' -f 1 )

# if this is a tag-workflow: build all algorithm images
elif [[ "$GITHUB_EVENT_NAME" == "push" ]] && [[ "$GITHUB_REF_TYPE" == "tag" ]]; then
  echoerr "Detected pipeline for a tag"
  changes_in_basedir=$( ls -1 )

else
  echoerr "Cannot determine algorithm images to build! Please check the environment variables:"
  env | grep "GITHUB" >&2 && true
  echoerr ""
fi

# filter changes: remove non-algorithm-files/-folders and allow grep to find nothing (exit code 1)
changed_algos=$( echo "$changes_in_basedir" | sort | uniq | grep -x -v -E "${ignore_pattern}" || [[ $? == 1 ]] )
# filter changes: remove non-existing algos (e.g. when branch is not up-to-date with default branch or an algorithm was removed)
changed_algos=$( echo "$changed_algos" | while read -r f; do [[ -d "$folder/$f" ]] && echo "$f" || true; done )

if [[ -z "$changed_algos" ]]; then
  echoerr "No algorithm changed!"
fi

echoerr "Generating pipeline for algorithms: $(xargs <<<$changed_algos)"
(jq -Rc '[.]' | jq -sc '{"algorithm_name": add}') <<<"${changed_algos}"
31 changes: 31 additions & 0 deletions .ci/get-image-version.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Determine the Docker image version for an algorithm folder: read it from
# <folder>/version.txt or, as a fallback, from the "version" key of
# <folder>/manifest.json; validate it against SemVer and print it to stdout.

set -e

folder="${1:-}"
# SemVer 2.0.0 grammar: MAJOR.MINOR.PATCH with optional -PRERELEASE and +BUILD
SEMVER_REGEX="^(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)(\\-[0-9A-Za-z-]+(\\.[0-9A-Za-z-]+)*)?(\\+[0-9A-Za-z-]+(\\.[0-9A-Za-z-]+)*)?$"

# trim-and-validate VERSION_STRING...
# Strips surrounding whitespace from the joined arguments and prints the
# result iff it is a valid SemVer string; otherwise reports the error on
# stderr and exits with status 1.
trim-and-validate() {
  local var="$*"
  # remove leading whitespace characters
  var="${var#"${var%%[![:space:]]*}"}"
  # remove trailing whitespace characters
  var="${var%"${var##*[![:space:]]}"}"

  # validate semver version string
  if [[ "$var" =~ $SEMVER_REGEX ]]; then
    printf '%s' "$var"
  else
    echo "Version $var is not a proper version string according to SemVer 'X.Y.Z(-PRERELEASE)(+BUILD)'!" >&2
    exit 1
  fi
}

# prefer version.txt over the manifest's "version" key
if [[ -f "$folder/version.txt" ]]; then
  trim-and-validate "$( cat "$folder/version.txt" )"
elif [[ -f "$folder/manifest.json" ]]; then
  trim-and-validate "$( jq -r '.version' "$folder/manifest.json" )"
else
  echo "No version.txt or manifest.json present. Cannot determine Docker image version!" >&2
  exit 1
fi
23 changes: 23 additions & 0 deletions .ci/get_dataset_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env python3
# CI helper: print the test dataset path for an algorithm based on its
# manifest's input dimensionality.
import json
import sys

from pathlib import Path


def get_dataset_name(algorithm: str) -> str:
    """Return the dataset path for *algorithm*.

    Reads ``./<algorithm>/manifest.json`` and maps its
    ``inputDimensionality`` to the corresponding test dataset file.

    Raises:
        ValueError: if the input dimensionality is neither "univariate"
            nor "multivariate" (case-insensitive).
    """
    manifest_path = Path(".") / algorithm / "manifest.json"
    with manifest_path.open("r") as fh:
        manifest = json.load(fh)

    value = manifest["inputDimensionality"]
    if value.lower() == "univariate":
        return "data/dataset.csv"
    elif value.lower() == "multivariate":
        return "data/multi-dataset.csv"
    else:
        raise ValueError(f"Input dimensionality ({value}) of {algorithm}'s manifest is unknown!")


if __name__ == "__main__":
    args = sys.argv
    if len(args) != 2:
        raise ValueError("You have to specify an algorithm name (directory / docker image name)!")

    print(get_dataset_name(args[1]))
Loading

0 comments on commit 96af897

Please sign in to comment.