Skip to content

Commit

Permalink
Build and host algorithm images on GitHub (base images) (#20)
Browse files Browse the repository at this point in the history
* wip: first version of PR test workflow

* feat: adapt first algorithm to new infrastructure

* fix: job generation script

* feat: adapt second algorithm to new infrastructure

* fix: python version


* feat: allow matrix generation script to run in different contexts (folders)

* refactor: split up base images and intermediate images; rename folders

* feat: adjust workflow to build images in order

* fix: build matrix computation script

* test empty matrix

* feat: prepare image publishing and adapt docker images

* chore: restore lof and sublof algorithms and use kmeans; also add licenses to base images

* feat: adjust r base image

* chore: cleanup workflow definition

* refactor: revert changes to intermediate images and algos (later PR)

* feat: test image push

* feat: test image push again

* fix: image license information

* feat: fix version information in image labels and finish PR
  • Loading branch information
CodeLionX authored Nov 20, 2023
1 parent 60814e3 commit 96af897
Show file tree
Hide file tree
Showing 25 changed files with 563 additions and 105 deletions.
96 changes: 96 additions & 0 deletions .ci/check_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# CI helper: validate the output files produced by a TimeEval algorithm
# Docker image after a test run.
import json
import sys

from pathlib import Path

# Result files that the algorithm container is expected to write.
MODEL_FILEPATH = Path("./results/model.pkl")
SCORES_FILEPATH = Path("./results/scores.csv")


def parse_manifest(algorithm: str) -> dict:
    """Load and return the algorithm's ``manifest.json`` as a dict.

    The manifest is looked up relative to the current working directory
    under ``./<algorithm>/manifest.json``.
    """
    path = Path(".") / algorithm / "manifest.json"
    with path.open("r") as f:
        return json.load(f)


def is_readable(filename: Path) -> bool:
    """Return True iff *filename* is owned by uid 1000 and gid 1000.

    NOTE(review): despite the name, this checks file ownership (presumably
    the TimeEval container user), not read permission — confirm intent.
    """
    info = filename.stat()
    return (info.st_uid, info.st_gid) == (1000, 1000)


def has_postprocessing(algorithm: str) -> bool:
    """Return True if the algorithm's README.md contains a TimeEval
    post-processing marker (begin or end tag); False if no README exists
    or no marker is found.
    """
    readme_path = Path(".") / algorithm / "README.md"
    if not readme_path.exists():
        return False

    with readme_path.open("r") as fh:
        content = fh.readlines()

    markers = ("<!--BEGIN:timeeval-post-->", "<!--END:timeeval-post-->")
    for line in content:
        if any(marker in line for marker in markers):
            return True
    return False


def main(algorithm):
    """Validate the output files of a test run of *algorithm*.

    Checks ownership of the produced model/scoring files and the format
    and length of the anomaly scoring (``scores.csv``). All problems are
    printed to stderr; exits with status 1 if any problem was found.
    """
    manifest = parse_manifest(algorithm)
    errors = []

    if manifest["learningType"].lower() in ["supervised", "semi-supervised"]:
        # check model.pkl: only (semi-)supervised algorithms write a model
        if not is_readable(MODEL_FILEPATH):
            errors.append("Model file was written with the wrong user and/or group. Do you use a TimeEval base image?")

    # check scores.csv
    if not is_readable(SCORES_FILEPATH):
        errors.append("Scoring was written with the wrong user and/or group. Do you use a TimeEval base image?")

    with SCORES_FILEPATH.open("r") as fh:
        lines = fh.readlines()

    # if not post-processing, check length
    if has_postprocessing(algorithm):
        print("Skipping scoring (scores.csv) check, because algorithm uses post-processing!")
    else:
        # only a single column/dimension:
        if any("," in line for line in lines):
            errors.append("Scoring contains multiple dimensions (found a ',' in the file). "
                          "Only a single anomaly score is allowed per time step!")

        # there should be no header
        try:
            float(lines[0])
        except ValueError as e:
            errors.append(f"No header allowed for the scoring file! First value is not a number! {e}")

        # same length as dataset
        if manifest["inputDimensionality"].lower() == "univariate":
            data_path = Path("./data/dataset.csv")
        else:
            data_path = Path("./data/multi-dataset.csv")

        # count data lines without loading the whole file into memory
        n_data = 0
        with data_path.open("r") as fh:
            for _ in fh:
                n_data += 1
        # subtract header
        n_data -= 1

        if len(lines) != n_data:
            errors.append("Scoring has wrong length; each input time step needs an anomaly score "
                          f"(expected={n_data}, found={len(lines)})!")

    for error in errors:
        print(error, file=sys.stderr)

    if len(errors) > 0:
        # use sys.exit instead of the site-injected exit() builtin, which is
        # not guaranteed to be available in all execution contexts
        sys.exit(1)


if __name__ == "__main__":
    args = sys.argv
    # expect exactly one positional argument: the algorithm folder name
    if len(args) != 2:
        # fix: typo "spacify" -> "specify" (matches get_dataset_name.py)
        raise ValueError("You have to specify an algorithm name (directory / docker image name)!")

    main(args[1])
58 changes: 58 additions & 0 deletions .ci/generate-build-matrix.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Generate a JSON build matrix of changed algorithm folders for GitHub
# Actions. The matrix is printed to stdout; all diagnostics go to stderr.

set -e

default_branch=main
folder="${1:-.}"
# files/folders that are never algorithm images and must not trigger a build
ignore_pattern="0-base-images|1-intermediate-images|2-scripts|data|results|Dockerfile|README.md|\..*|.*\.py|.*\.yml|.*\.sh|.*\.png"
changes_in_basedir=""

# print all arguments to stderr (stdout is reserved for the JSON matrix)
function echoerr () {
    echo "$@" >&2
}

# Relevant variables set by GitHub Actions:
# GITHUB_EVENT_NAME=pull_request
# GITHUB_BASE_REF=PR target branch (probably default branch; only set for PRs)
# GITHUB_HEAD_REF=PR source branch
# GITHUB_REF=refs/pull/<pr_number>/merge
# GITHUB_REF_TYPE=tag or branch
# GITHUB_REF_NAME=branch or tag name of the pushed ref
# RUNNER_ARCH=X86, X64, ARM, or ARM64
# RUNNER_OS=Linux, Windows, or macOS

# if this is a workflow for a PR targeting the default branch
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "$GITHUB_BASE_REF" == "$default_branch" ]]; then
  # build diff to main
  echoerr "Detected pipeline for a non-default branch (assuming pull request with target $GITHUB_BASE_REF)"
  git fetch origin || echoerr "Could not update remote 'origin'! Repository might be out of date."
  changes_in_basedir=$( git diff --name-only "refs/remotes/origin/$GITHUB_BASE_REF..HEAD" -- "$folder" | sed "s#${folder//\./\\.}/##" | cut -d '/' -f 1 )
  #changes_in_basedir=$( git diff --name-only "$GITHUB_BASE_REF..HEAD" | cut -d '/' -f 1 )

# if this is a workflow for the default branch
# fix: GITHUB_BASE_REF is only set for pull_request events, so it can never
# match on a push; use GITHUB_REF_NAME to detect a push to the default branch
elif [[ "$GITHUB_EVENT_NAME" == "push" ]] && [[ "$GITHUB_REF_NAME" == "$default_branch" ]]; then
  # build latest commit for the default branch
  echoerr "Detected pipeline for default branch"
  #changes_in_basedir=$( git diff --name-only "$CI_COMMIT_BEFORE_SHA..$CI_COMMIT_SHA" )
  changes_in_basedir=$( git diff --name-only HEAD~1..HEAD -- "$folder" | sed "s#${folder//\./\\.}/##" | cut -d '/' -f 1 )

# if this is a tag-workflow: build all algorithm images
elif [[ "$GITHUB_EVENT_NAME" == "push" ]] && [[ "$GITHUB_REF_TYPE" == "tag" ]]; then
  echoerr "Detected pipeline for a tag"
  changes_in_basedir=$( ls -1 )

else
  echoerr "Cannot determine algorithm images to build! Please check the environment variables:"
  env | grep "GITHUB" >&2 && true
  echoerr ""
fi

# filter changes: remove non-algorithm-files/-folders and allow grep to find nothing (exit code 1)
changed_algos=$( echo "$changes_in_basedir" | sort | uniq | grep -x -v -E "${ignore_pattern}" || [[ $? == 1 ]] )
# filter changes: remove non-existing algos (e.g. when branch is not up-to-date with default branch or an algorithm was removed)
changed_algos=$( echo "$changed_algos" | while read -r f; do [[ -d "$folder/$f" ]] && echo "$f" || true; done )

if [[ -z "$changed_algos" ]]; then
  echoerr "No algorithm changed!"
fi

echoerr "Generating pipeline for algorithms: $(xargs <<<$changed_algos)"
(jq -Rc '[.]' | jq -sc '{"algorithm_name": add}') <<<"${changed_algos}"
31 changes: 31 additions & 0 deletions .ci/get-image-version.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Determine the Docker image version for an algorithm folder: read it from
# <folder>/version.txt or, as a fallback, from the "version" key of
# <folder>/manifest.json; validate it against SemVer and print it to stdout.

set -e

folder="${1:-}"
# SemVer 2.0.0 grammar: MAJOR.MINOR.PATCH with optional -PRERELEASE and +BUILD
SEMVER_REGEX="^(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)(\\-[0-9A-Za-z-]+(\\.[0-9A-Za-z-]+)*)?(\\+[0-9A-Za-z-]+(\\.[0-9A-Za-z-]+)*)?$"

# trim-and-validate VERSION_STRING...
# Strips surrounding whitespace from the joined arguments and prints the
# result iff it is a valid SemVer string; otherwise reports the error on
# stderr and exits with status 1.
trim-and-validate() {
  local var="$*"
  # remove leading whitespace characters
  var="${var#"${var%%[![:space:]]*}"}"
  # remove trailing whitespace characters
  var="${var%"${var##*[![:space:]]}"}"

  # validate semver version string
  if [[ "$var" =~ $SEMVER_REGEX ]]; then
    printf '%s' "$var"
  else
    echo "Version $var is not a proper version string according to SemVer 'X.Y.Z(-PRERELEASE)(+BUILD)'!" >&2
    exit 1
  fi
}

# prefer version.txt over the manifest's "version" key
if [[ -f "$folder/version.txt" ]]; then
  trim-and-validate "$( cat "$folder/version.txt" )"
elif [[ -f "$folder/manifest.json" ]]; then
  trim-and-validate "$( jq -r '.version' "$folder/manifest.json" )"
else
  echo "No version.txt or manifest.json present. Cannot determine Docker image version!" >&2
  exit 1
fi
23 changes: 23 additions & 0 deletions .ci/get_dataset_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env python3
# CI helper: print the test dataset path for an algorithm based on its
# manifest's input dimensionality.
import json
import sys

from pathlib import Path


def get_dataset_name(algorithm: str) -> str:
    """Return the dataset path for *algorithm*.

    Reads ``./<algorithm>/manifest.json`` and maps its
    ``inputDimensionality`` to the corresponding test dataset file.

    Raises:
        ValueError: if the input dimensionality is neither "univariate"
            nor "multivariate" (case-insensitive).
    """
    manifest_path = Path(".") / algorithm / "manifest.json"
    with manifest_path.open("r") as fh:
        manifest = json.load(fh)

    value = manifest["inputDimensionality"]
    if value.lower() == "univariate":
        return "data/dataset.csv"
    elif value.lower() == "multivariate":
        return "data/multi-dataset.csv"
    else:
        raise ValueError(f"Input dimensionality ({value}) of {algorithm}'s manifest is unknown!")


if __name__ == "__main__":
    args = sys.argv
    if len(args) != 2:
        raise ValueError("You have to specify an algorithm name (directory / docker image name)!")

    print(get_dataset_name(args[1]))
Loading

0 comments on commit 96af897

Please sign in to comment.