Skip to content

Commit

Permalink
Merge pull request #114 from NearNodeFlash/release-v0.0.3
Browse files Browse the repository at this point in the history
Release v0.0.3
  • Loading branch information
bdevcich authored Jun 7, 2023
2 parents 2860b02 + 2bc974d commit 5b24a02
Show file tree
Hide file tree
Showing 222 changed files with 3,652 additions and 2,294 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ name: Docker build and push
on:
push:
branches:
- 'master'
- 'releases/v*'
- '*'
tags:
- 'v*'
pull_request:
Expand Down
45 changes: 42 additions & 3 deletions .github/workflows/rpm_build.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,36 @@
name: RPM Build
on: push
on:
push:
branches:
- '*'
tags:
- 'v*'
pull_request:
branches:
- 'master'
- 'releases/v*'

jobs:
repo_version:
runs-on: ubuntu-latest
outputs:
version_output: ${{ steps.step1.outputs.version }}
steps:
- name: Verify context
run: |
echo "ref is ${{ github.ref }}"
echo "ref_type is ${{ github.ref_type }}"
- uses: actions/checkout@v3
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Get Version
id: step1
run: echo "version=$(./git-version-gen)" >> $GITHUB_OUTPUT

rpm_build:
runs-on: ubuntu-latest
needs: repo_version
container:
image: centos:8
env:
Expand All @@ -12,17 +39,29 @@ jobs:
- 80
options: --cpus 1
steps:
- name: "Build context"
env:
VERSION_OUTPUT: ${{ needs.repo_version.outputs.version_output }}
run: |
echo "ref is ${{ github.ref }}"
echo "ref_type is ${{ github.ref_type }}"
echo "head.sha is ${{ github.event.pull_request.head.sha }}"
echo "git-version-gen is $VERSION_OUTPUT"
- name: checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: environment setup
env:
VERSION_OUTPUT: ${{ needs.repo_version.outputs.version_output }}
run: |
dnf -y --disablerepo '*' --enablerepo=extras swap centos-linux-repos centos-stream-repos
dnf -y distro-sync
dnf -y makecache --refresh
dnf install -y rpm-build rpmdevtools git make
dnf module -y install go-toolset
rpmdev-setuptree
echo $GITHUB_SHA | cut -c1-8 > .commit
echo $VERSION_OUTPUT > .rpmversion
cat .rpmversion
tar -czf /github/home/rpmbuild/SOURCES/nnf-datamovement-1.0.tar.gz --transform 's,^,nnf-datamovement-1.0/,' .
- name: build rpms
run: rpmbuild -ba daemons/compute/server/nnf-dm.spec
Expand Down
16 changes: 14 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2020, 2021, 2022 Hewlett Packard Enterprise Development LP
# Copyright 2020-2023 Hewlett Packard Enterprise Development LP
# Other additional copyright holders may be indicated within.
#
# The entirety of this work is licensed under the Apache License,
Expand All @@ -15,6 +15,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# These ARGs must be before the first FROM. This allows them to be valid for
# use in FROM instructions.
ARG NNFMFU_TAG_BASE=ghcr.io/nearnodeflash/nnf-mfu
ARG NNFMFU_VERSION=master

# Build the manager binary
FROM golang:1.19-alpine as builder

Expand Down Expand Up @@ -51,7 +56,7 @@ ENV CGO_ENABLED=0
ENTRYPOINT [ "make", "test" ]

###############################################################################
FROM ghcr.io/nearnodeflash/nnf-mfu:latest
FROM $NNFMFU_TAG_BASE:$NNFMFU_VERSION

RUN apt update

Expand All @@ -70,3 +75,10 @@ WORKDIR /
COPY --from=builder /workspace/manager .

ENTRYPOINT ["/manager"]

# Make it easy to figure out which nnf-mfu was used.
# docker inspect --format='{{json .Config.Labels}}' image:tag
ARG NNFMFU_TAG_BASE
ARG NNFMFU_VERSION
LABEL nnf-mfu="$NNFMFU_TAG_BASE:$NNFMFU_VERSION"

22 changes: 14 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2021, 2022 Hewlett Packard Enterprise Development LP
# Copyright 2021-2023 Hewlett Packard Enterprise Development LP
# Other additional copyright holders may be indicated within.
#
# The entirety of this work is licensed under the Apache License,
Expand Down Expand Up @@ -53,6 +53,12 @@ BUNDLE_METADATA_OPTS ?= $(BUNDLE_CHANNELS) $(BUNDLE_DEFAULT_CHANNEL)
# cray.hpe.com/nnf-dm-bundle:$VERSION and cray.hpe.com/nnf-dm-catalog:$VERSION.
IMAGE_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-dm

# The NNF-MFU container image to use in NNFContainerProfile resources.
NNFMFU_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-mfu
NNFMFU_VERSION ?= master

DOCKER_BUILDARGS=--build-arg NNFMFU_TAG_BASE=$(NNFMFU_TAG_BASE) --build-arg NNFMFU_VERSION=$(NNFMFU_VERSION)

# BUNDLE_IMG defines the image:tag used for the bundle.
# You can use it as an arg. (E.g make bundle-build BUNDLE_IMG=<some-registry>/<project-name-bundle>:<tag>)
BUNDLE_IMG ?= $(IMAGE_TAG_BASE)-bundle:v$(VERSION)
Expand Down Expand Up @@ -115,15 +121,15 @@ test: manifests generate fmt vet envtest ## Run tests.

container-unit-test: VERSION ?= $(shell cat .version)
container-unit-test: .version ## Run tests inside a container image
$(DOCKER) build -f Dockerfile --label $(IMAGE_TAG_BASE)-$@:$(VERSION)-$@ -t $(IMAGE_TAG_BASE)-$@:$(VERSION) --target testing .
$(DOCKER) build -f Dockerfile --label $(IMAGE_TAG_BASE)-$@:$(VERSION)-$@ -t $(IMAGE_TAG_BASE)-$@:$(VERSION) --target testing $(DOCKER_BUILDARGS) .
$(DOCKER) run --rm -t --name $@-nnf-dm $(IMAGE_TAG_BASE)-$@:$(VERSION)

##@ Build

build-daemon: COMMIT_HASH?=$(shell git rev-parse --short HEAD)
build-daemon: RPM_VERSION ?= $(shell ./git-version-gen)
build-daemon: PACKAGE = github.com/NearNodeFlash/nnf-dm/daemons/compute/server/version
build-daemon: manifests generate fmt vet ## Build standalone nnf-datamovement daemon
GOOS=linux GOARCH=amd64 go build -ldflags="-X '$(PACKAGE).commitHash=$(COMMIT_HASH)'" -o bin/nnf-dm daemons/compute/server/main.go
GOOS=linux GOARCH=amd64 go build -ldflags="-X '$(PACKAGE).version=$(RPM_VERSION)'" -o bin/nnf-dm daemons/compute/server/main.go

build: generate fmt vet ## Build manager binary.
go build -o bin/manager main.go
Expand All @@ -133,7 +139,7 @@ run: manifests generate fmt vet ## Run a controller from your host.

docker-build: VERSION ?= $(shell cat .version)
docker-build: .version ## Build docker image with the manager.
$(DOCKER) build -t $(IMAGE_TAG_BASE):$(VERSION) .
$(DOCKER) build -t $(IMAGE_TAG_BASE):$(VERSION) $(DOCKER_BUILDARGS) .

docker-push: VERSION ?= $(shell cat .version)
docker-push: .version ## Push docker image with the manager.
Expand All @@ -146,7 +152,7 @@ kind-push: .version ## Push docker image to kind
# the nnf-dm-rsyncnode daemonset that is created by that deployment.
kind load docker-image $(IMAGE_TAG_BASE):$(VERSION)
${DOCKER} pull gcr.io/kubebuilder/kube-rbac-proxy:v0.13.0
kind load docker-image --nodes `kubectl get node -l cray.nnf.manager=true --no-headers -o custom-columns=":metadata.name" | paste -d, -s -` gcr.io/kubebuilder/kube-rbac-proxy:v0.13.0
kind load docker-image gcr.io/kubebuilder/kube-rbac-proxy:v0.13.0

minikube-push: VERSION ?= $(shell cat .version)
minikube-push: .version
Expand All @@ -162,7 +168,7 @@ uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified

deploy: VERSION ?= $(shell cat .version)
deploy: .version kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
./deploy.sh deploy $(KUSTOMIZE) $(IMAGE_TAG_BASE):$(VERSION)
./deploy.sh deploy $(KUSTOMIZE) $(IMAGE_TAG_BASE):$(VERSION) $(NNFMFU_TAG_BASE):$(NNFMFU_VERSION)

undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config.
./deploy.sh undeploy $(KUSTOMIZE)
Expand Down Expand Up @@ -214,7 +220,7 @@ bundle: manifests kustomize ## Generate bundle manifests and metadata, then vali
bundle-build: VERSION ?= $(shell cat .version)
bundle-build: BUNDLE_IMG ?= $(IMAGE_TAG_BASE)-bundle:v$(VERSION)
bundle-build: .version ## Build the bundle image.
$(DOCKER) build -f bundle.Dockerfile -t $(BUNDLE_IMG) .
$(DOCKER) build -f bundle.Dockerfile -t $(BUNDLE_IMG) $(DOCKER_BUILDARGS) .

.PHONY: bundle-push
bundle-push: VERSION ?= $(shell cat .version)
Expand Down
2 changes: 0 additions & 2 deletions _setup-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ echo "$(tput bold)Creating kind cluster with two worker nodes and /nnf mount $(t
kind create cluster --wait 60s --image=kindest/node:v1.22.5 --config kind-config.yaml

kubectl taint node kind-control-plane node-role.kubernetes.io/master:NoSchedule-
kubectl label node kind-control-plane cray.nnf.manager=true
kubectl label node kind-control-plane cray.wlm.manager=true

# Label the kind-workers as rabbit nodes for the NLCMs.
NODES=$(kubectl get nodes --no-headers | grep --invert-match "control-plane" | awk '{print $1}')
Expand Down
4 changes: 1 addition & 3 deletions _setup-minikube.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ minikube start --nodes 3


kubectl taint node minikube node-role.kubernetes.io/master:NoSchedule-
kubectl label node minikube cray.nnf.manager=true
kubectl label node minikube cray.wlm.manager=true

NODES=$(kubectl get nodes --no-headers | grep --invert-match "control-plane" | awk '{print $1}')
for NODE in $NODES; do
Expand All @@ -43,4 +41,4 @@ done
kubectl taint nodes "$(echo $NODES | paste -d" " -s -)" cray.nnf.node=true:NoSchedule

certver="v1.7.0"
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/"$certver"/cert-manager.yaml
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/"$certver"/cert-manager.yaml
12 changes: 10 additions & 2 deletions config/dm_config/nnf-dm-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,16 @@ profiles:
# GID: Group ID that is inherited from the Workflow
# SRC: source for the data movement
# DEST: destination for the data movement
# default: command: mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --uid $UID --gid $GID $SRC $DEST
command: mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --uid $UID --gid $GID $SRC $DEST
# default: command: ulimit -n 2048 && mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --uid $UID --gid $GID $SRC $DEST
command: ulimit -n 2048 && mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --uid $UID --gid $GID $SRC $DEST

# If true, enable the command's stdout to be saved in the log when the command completes
# successfully. On failure, the output is always logged.
logStdout: false

# Similar to logStdout, store the command's stdout in Status.Message when the command
# completes successfully. On failure, the output is always stored.
storeStdout: false

# NnfDataMovement resources have the ability to collect and store the progress percentage and the
# last few lines of output in the CommandStatus field. This number is used for the interval to collect
Expand Down
3 changes: 3 additions & 0 deletions config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,6 @@ images:
- name: controller
newName: ghcr.io/nearnodeflash/nnf-dm
newTag: 0.0.1
- name: nnf-mfu
newName: ghcr.io/nearnodeflash/nnf-mfu
newTag: master
11 changes: 1 addition & 10 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ spec:
labels:
control-plane: controller-manager
spec:
nodeSelector:
cray.nnf.manager: "true"
containers:
- command:
- /manager
Expand All @@ -46,13 +44,6 @@ spec:
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
resources:
limits:
cpu: 200m
memory: 100Mi
requests:
cpu: 100m
memory: 20Mi
serviceAccountName: controller-manager
terminationGracePeriodSeconds: 10
---
Expand All @@ -78,7 +69,7 @@ spec:
shareProcessNamespace: true
containers:
- name: worker
image: ghcr.io/nearnodeflash/nnf-mfu:latest
image: nnf-mfu:latest
command:
- /usr/sbin/sshd
args:
Expand Down
4 changes: 2 additions & 2 deletions config/manager/manager_imagepullsecret_patch.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: controller-manager
name: manager-controller-manager
namespace: system
spec:
template:
Expand All @@ -12,7 +12,7 @@ spec:
apiVersion: dm.cray.hpe.com/v1alpha1
kind: DataMovementManager
metadata:
name: datamovementmanager
name: manager-controller-manager
namespace: system
spec:
template:
Expand Down
40 changes: 31 additions & 9 deletions controllers/datamovement_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/yaml"

dmv1alpha1 "github.com/NearNodeFlash/nnf-dm/api/v1alpha1"
Expand Down Expand Up @@ -81,6 +82,8 @@ type DataMovementReconciler struct {
// We maintain a map of active operations which allows us to process cancel requests
// This is a thread safe map since multiple data movement reconcilers and go routines will be executing at the same time.
contexts sync.Map

WatchNamespace string
}

// Keep track of the context and its cancel function so that we can track
Expand All @@ -98,9 +101,11 @@ type dmConfig struct {

// Each profile can have different settings
type dmConfigProfile struct {
Slots int `yaml:"slots,omitempty"`
MaxSlots int `yaml:"maxSlots,omitempty"`
Command string `yaml:"command"`
Slots int `yaml:"slots,omitempty"`
MaxSlots int `yaml:"maxSlots,omitempty"`
Command string `yaml:"command"`
LogStdout bool `yaml:"logStdout"`
StoreStdout bool `yaml:"storeStdout"`
}

// Invalid error is a non-recoverable error type that implies the Data Movement resource is invalid
Expand Down Expand Up @@ -240,14 +245,13 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
log.Info("Config map not found - requeueing", "name", configMapName, "namespace", configMapNamespace)
return ctrl.Result{}, handleInvalidError(err)
}
log.Info("Config map found", "data", configMap.Data)

cfg := dmConfig{}
if err := yaml.Unmarshal([]byte(configMap.Data[configMapKeyData]), &cfg); err != nil {
log.Error(err, "error reading config map data")
return ctrl.Result{}, handleInvalidError(err)
}
log.Info("Config map unmarshalled", "config", cfg)
log.Info("Using config map", "config", cfg)

// TODO: Allow use of non-default dm config profiles - for now only use the default. For copy
// offload API, we could create "fake" profiles and store those in the DM object based on the
Expand All @@ -269,7 +273,7 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
log.Info("MPI Hostfile preview", "first line", peekMpiHostfile(mpiHostfile))
}

cmd := exec.CommandContext(ctxCancel, cmdArgs[0], cmdArgs[1:]...)
cmd := exec.CommandContext(ctxCancel, "/bin/bash", "-c", strings.Join(cmdArgs, " "))

// Record the start of the data movement operation
now := metav1.NowMicro()
Expand Down Expand Up @@ -392,17 +396,28 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
dm.Status.State = nnfv1alpha1.DataMovementConditionTypeFinished
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonSuccess

// On cancellation or failure, log the output. On failure, also store the output in the
// Status.Message. When successful, check the profile/UserConfig config options to log
// and/or store the output.
if errors.Is(ctxCancel.Err(), context.Canceled) {
log.Error(err, "Data movement operation cancelled", "output", combinedOutBuf.String())
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonCancelled
} else if err != nil {
log.Error(err, "Data movement operation failed", "output", combinedOutBuf.String())
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonFailed
dm.Status.Message = fmt.Sprintf("%s: %s", err.Error(), combinedOutBuf.String())

// TODO: Enhanced error capture: parse error response and provide useful message
} else {
log.Info("Completed Command", "cmdStatus", cmdStatus)
log.Info("Data movement operation completed", "cmdStatus", cmdStatus)

// Profile or DM request has enabled stdout logging
if profile.LogStdout || (dm.Spec.UserConfig != nil && dm.Spec.UserConfig.LogStdout) {
log.Info("Data movement operation output", "output", combinedOutBuf.String())
}

// Profile or DM request has enabled storing stdout
if profile.StoreStdout || (dm.Spec.UserConfig != nil && dm.Spec.UserConfig.StoreStdout) {
dm.Status.Message = combinedOutBuf.String()
}
}

os.RemoveAll(filepath.Dir(mpiHostfile))
Expand Down Expand Up @@ -704,10 +719,17 @@ func (r *DataMovementReconciler) getWorkerHostnames(ctx context.Context, nodes [
return hostnames, nil
}

// filterByNamespace builds a predicate that accepts an object only when the
// object's namespace is exactly equal to the supplied namespace.
func filterByNamespace(namespace string) predicate.Predicate {
	inNamespace := func(obj client.Object) bool {
		return obj.GetNamespace() == namespace
	}

	return predicate.NewPredicateFuncs(inNamespace)
}

// SetupWithManager registers the reconciler with the supplied Manager. The
// controller is built for NnfDataMovement resources, permits up to 128
// concurrent reconciles, and passes events through the predicate returned by
// filterByNamespace(r.WatchNamespace).
func (r *DataMovementReconciler) SetupWithManager(mgr ctrl.Manager) error {
	opts := controller.Options{MaxConcurrentReconciles: 128}
	namespaceFilter := filterByNamespace(r.WatchNamespace)

	bldr := ctrl.NewControllerManagedBy(mgr)
	bldr = bldr.For(&nnfv1alpha1.NnfDataMovement{})
	bldr = bldr.WithOptions(opts)
	bldr = bldr.WithEventFilter(namespaceFilter)

	return bldr.Complete(r)
}
Loading

0 comments on commit 5b24a02

Please sign in to comment.