diff --git a/go.mod b/go.mod index 49223b4fb4..57cd0e4db9 100644 --- a/go.mod +++ b/go.mod @@ -69,7 +69,8 @@ replace ( k8s.io/sample-controller => k8s.io/sample-controller v0.18.1 ) -// Temporary fork based on 1.18.1 with two additional PRs: +// Temporary fork based on 1.18.1 with additional PRs: // - https://github.com/kubernetes/kubernetes/pull/89819 // - https://github.com/kubernetes/kubernetes/pull/90214 -replace k8s.io/kubernetes => github.com/pohly/kubernetes v1.18.1-pmem-csi-20200416 +// - https://github.com/kubernetes/kubernetes/pull/90335 +replace k8s.io/kubernetes => github.com/pohly/kubernetes v1.18.1-pmem-csi-20200421-2 diff --git a/go.sum b/go.sum index 367554ed9f..8bf1b6a880 100644 --- a/go.sum +++ b/go.sum @@ -498,8 +498,8 @@ github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/pohly/kubernetes v1.18.1-pmem-csi-20200416 h1:hx31zWWCNLZCH5+mnBgwp18m46HxLXDOMvDvoDsk1kc= -github.com/pohly/kubernetes v1.18.1-pmem-csi-20200416/go.mod h1:z8xjOOO1Ljz+TaHpOxVGC7cxtF32TesIamoQ+BZrVS0= +github.com/pohly/kubernetes v1.18.1-pmem-csi-20200421-2 h1:AQ2rM97cDjKIrTMVFkFtMFbMzu174j3MnII/3b2IFno= +github.com/pohly/kubernetes v1.18.1-pmem-csi-20200421-2/go.mod h1:z8xjOOO1Ljz+TaHpOxVGC7cxtF32TesIamoQ+BZrVS0= github.com/pquerna/cachecontrol v0.0.0-20171018203845-0dec1b30a021/go.mod h1:prYjPmNq4d1NPVmpShWobRqXY3q7Vp+80DqgxxUrUIA= github.com/pquerna/ffjson v0.0.0-20180717144149-af8b230fcd20 h1:7sBb9iOkeq+O7AXlVoH/8zpIcRXX523zMkKKspHjjx8= github.com/pquerna/ffjson v0.0.0-20180717144149-af8b230fcd20/go.mod h1:YARuvh7BUWHNhzDq2OM5tzR2RiCcN2D7sapiKyCel/M= diff --git a/test/e2e/deploy/deploy.go b/test/e2e/deploy/deploy.go index bd5ac99d2d..b37c4eca08 100644 --- 
a/test/e2e/deploy/deploy.go +++ b/test/e2e/deploy/deploy.go @@ -392,10 +392,15 @@ func EnsureDeployment(deploymentName string) *Deployment { f := framework.NewDefaultFramework("cluster") f.SkipNamespaceCreation = true + var prevVol map[string][]string ginkgo.BeforeEach(func() { ginkgo.By(fmt.Sprintf("preparing for test %q", ginkgo.CurrentGinkgoTestDescription().FullTestText)) c, err := NewCluster(f.ClientSet) + + // Remember list of volumes before test, using out-of-band host commands (i.e. not CSI API). + prevVol = GetHostVolumes(deployment) + framework.ExpectNoError(err, "get cluster information") running, err := FindPMEMDriver(c) framework.ExpectNoError(err, "check for PMEM-CSI driver") @@ -428,6 +433,11 @@ func EnsureDeployment(deploymentName string) *Deployment { } }) + ginkgo.AfterEach(func() { + // Check list of volumes after test to detect left-overs + CheckForLeftoverVolumes(deployment, prevVol) + }) + return deployment } diff --git a/test/e2e/deploy/volumeleaks.go b/test/e2e/deploy/volumeleaks.go new file mode 100644 index 0000000000..3a5c574308 --- /dev/null +++ b/test/e2e/deploy/volumeleaks.go @@ -0,0 +1,62 @@ +/* +Copyright 2020 Intel Corporation. + +SPDX-License-Identifier: Apache-2.0 +*/ + +package deploy + +import ( + "fmt" + "os" + "os/exec" + "strings" + + pmemcsidriver "github.com/intel/pmem-csi/pkg/pmem-csi-driver" + + . "github.com/onsi/gomega" +) + +// GetHostVolumes lists the volumes found on each host, keyed per node, using out-of-band host commands (i.e. not CSI API). +func GetHostVolumes(d *Deployment) map[string][]string { + var cmd string + var hdr string + switch d.Mode { + case pmemcsidriver.LVM: + // lvs adds many space (0x20) chars at end, we could squeeze + // repetitions using tr here, but TrimSpace() below strips those away + cmd = "sudo lvs --foreign --noheadings" + hdr = "LVM Volumes" + case pmemcsidriver.Direct: + // ndctl produces multiline block. We want one line per namespace. + // Pick uuid, mode, size for comparison. 
Note that sorting changes the order so lines + // are not grouped by volume, but keeping volume order would need more complex parsing + // and this is not meant to be pretty-printed for humans, just to detect the change. + cmd = "sudo ndctl list |tr -d '\"' |egrep 'uuid|mode|^ *size' |sort |tr -d ' \n'" + hdr = "Namespaces" + } + result := make(map[string][]string) + // Instead of trying to find out number of hosts, we trust the set of + // ssh.N helper scripts matches running hosts, which should be the case in + // correctly running tester system. We run ssh.N commands until an ssh.N + // script appears to be "no such file". + for worker := 1; ; worker++ { + sshcmd := fmt.Sprintf("%s/_work/%s/ssh.%d", os.Getenv("REPO_ROOT"), os.Getenv("CLUSTER"), worker) + ssh := exec.Command(sshcmd, cmd) + // Intentional Output instead of CombinedOutput to dismiss warnings from stderr. + // lvs may emit lvmetad-related WARNING msg which can't be silenced using -q option. + out, err := ssh.Output() + if err != nil && os.IsNotExist(err) { + break + } + buf := fmt.Sprintf("%s on Node %d", hdr, worker) + result[buf] = strings.Split(strings.TrimSpace(string(out)), "\n") + } + return result +} + +// CheckForLeftoverVolumes lists volumes again after test, any difference means leftovers. 
+func CheckForLeftoverVolumes(d *Deployment, volBefore map[string][]string) { + volNow := GetHostVolumes(d) + Expect(volNow).To(Equal(volBefore), "same volumes before and after the test") +} diff --git a/test/e2e/storage/csi_volumes.go b/test/e2e/storage/csi_volumes.go index 3ef25c816f..f97656b7f4 100644 --- a/test/e2e/storage/csi_volumes.go +++ b/test/e2e/storage/csi_volumes.go @@ -119,7 +119,6 @@ var _ = deploy.DescribeForAll("E2E", func(d *deploy.Deployment) { var ( storageClassLateBindingName = "pmem-csi-sc-late-binding" // from deploy/common/pmem-storageclass-late-binding.yaml claim v1.PersistentVolumeClaim - prevVol map[string][]string ) f := framework.NewDefaultFramework("latebinding") BeforeEach(func() { @@ -129,8 +128,6 @@ var _ = deploy.DescribeForAll("E2E", func(d *deploy.Deployment) { skipper.Skipf("storage class %s not found, late binding not supported", storageClassLateBindingName) } framework.ExpectNoError(err, "get storage class %s", storageClassLateBindingName) - // Register list of volumes before test, using out-of-band host commands (i.e. not CSI API). - prevVol = GetHostVolumes(d) claim = v1.PersistentVolumeClaim{ ObjectMeta: metav1.ObjectMeta{ @@ -151,11 +148,6 @@ var _ = deploy.DescribeForAll("E2E", func(d *deploy.Deployment) { } }) - AfterEach(func() { - // Check list of volumes after test to detect left-overs - CheckForLeftoverVolumes(d, prevVol) - }) - It("works", func() { TestDynamicLateBindingProvisioning(f.ClientSet, &claim, "latebinding") }) diff --git a/test/e2e/storage/sanity.go b/test/e2e/storage/sanity.go index da702c40a3..b240e56830 100644 --- a/test/e2e/storage/sanity.go +++ b/test/e2e/storage/sanity.go @@ -97,7 +97,6 @@ var _ = deploy.DescribeForSome("sanity", func(d *deploy.Deployment) bool { f.SkipNamespaceCreation = true // We don't need a per-test namespace and skipping it makes the tests run faster. 
var execOnTestNode func(args ...string) string var cleanup func() - var prevVol map[string][]string var cluster *deploy.Cluster const socatPort = 9735 @@ -181,13 +180,9 @@ var _ = deploy.DescribeForSome("sanity", func(d *deploy.Deployment) bool { config.CreateStagingDir = mkdir config.RemoveTargetPath = rmdir config.RemoveStagingPath = rmdir - // Register list of volumes before test, using out-of-band host commands (i.e. not CSI API). - prevVol = GetHostVolumes(d) }) AfterEach(func() { - // Check list of volumes after test to detect left-overs - CheckForLeftoverVolumes(d, prevVol) if cleanup != nil { cleanup() } @@ -1026,47 +1021,3 @@ func WaitForPodsWithLabelRunningReady(c clientset.Interface, ns string, label la }) return pods, err } - -// Register list of volumes before test, using out-of-band host commands (i.e. not CSI API). -func GetHostVolumes(d *deploy.Deployment) map[string][]string { - var cmd string - var hdr string - switch d.Mode { - case pmemcsidriver.LVM: - // lvs adds many space (0x20) chars at end, we could squeeze - // repetitions using tr here, but TrimSpace() below strips those away - cmd = "sudo lvs --foreign --noheadings" - hdr = "LVM Volumes" - case pmemcsidriver.Direct: - // ndctl produces multiline block. We want one line per namespace. - // Pick uuid, mode, size for comparison. Note that sorting changes the order so lines - // are not grouped by volume, but keeping volume order would need more complex parsing - // and this is not meant to be pretty-printed for human, just to detect the change. - cmd = "sudo ndctl list |tr -d '\"' |egrep 'uuid|mode|^ *size' |sort |tr -d ' \n'" - hdr = "Namespaces" - } - result := make(map[string][]string) - // Instead of trying to find out number of hosts, we trust the set of - // ssh.N helper scripts matches running hosts, which should be the case in - // correctly running tester system. We run ssh.N commands until a ssh.N - // script appears to be "no such file". 
- for worker := 1; ; worker++ { - sshcmd := fmt.Sprintf("%s/_work/%s/ssh.%d", os.Getenv("REPO_ROOT"), os.Getenv("CLUSTER"), worker) - ssh := exec.Command(sshcmd, cmd) - // Intentional Output instead of CombinedOutput to dismiss warnings from stderr. - // lvs may emit lvmetad-related WARNING msg which can't be silenced using -q option. - out, err := ssh.Output() - if err != nil && os.IsNotExist(err) { - break - } - buf := fmt.Sprintf("%s on Node %d", hdr, worker) - result[buf] = strings.Split(strings.TrimSpace(string(out)), "\n") - } - return result -} - -// CheckForLeftovers lists volumes again after test, diff means leftovers. -func CheckForLeftoverVolumes(d *deploy.Deployment, volBefore map[string][]string) { - volNow := GetHostVolumes(d) - Expect(volNow).To(Equal(volBefore), "same volumes before and after the test") -} diff --git a/test/e2e/storage/scheduler/scheduler.go b/test/e2e/storage/scheduler/scheduler.go index 568aa0965d..c49e650d3b 100644 --- a/test/e2e/storage/scheduler/scheduler.go +++ b/test/e2e/storage/scheduler/scheduler.go @@ -45,11 +45,9 @@ var _ testsuites.TestSuite = &schedulerTestSuite{} // webhook work. func InitSchedulerTestSuite() testsuites.TestSuite { // We test with an ephemeral inline volume and a PVC with late - // binding. The webhook works reliably only for the inline - // volume. With PVCs there are race conditions (PVC created, - // but controller not informed yet when webhook is called), so - // we may have to wait until eventually it works. + // binding. 
lateBinding := testpatterns.DefaultFsDynamicPV + lateBinding.Name = "Dynamic PV with late binding" lateBinding.BindingMode = storagev1.VolumeBindingWaitForFirstConsumer suite := &schedulerTestSuite{ diff --git a/test/setup-deployment.sh b/test/setup-deployment.sh index d85724ee9d..613b0f9f05 100755 --- a/test/setup-deployment.sh +++ b/test/setup-deployment.sh @@ -16,8 +16,7 @@ REPO_DIRECTORY="${REPO_DIRECTORY:-$(dirname $(dirname $(readlink -f $0)))}" CLUSTER_DIRECTORY="${CLUSTER_DIRECTORY:-${REPO_DIRECTORY}/_work/${CLUSTER}}" SSH="${CLUSTER_DIRECTORY}/ssh.0" KUBECTL="${SSH} kubectl" # Always use the kubectl installed in the cluster. -KUBERNETES_VERSION="$(${KUBECTL} version --short | grep 'Server Version' | \ - sed -e 's/.*: v\([0-9]*\)\.\([0-9]*\)\..*/\1.\2/')" +KUBERNETES_VERSION="$(cat "$CLUSTER_DIRECTORY/kubernetes.version")" DEPLOYMENT_DIRECTORY="${REPO_DIRECTORY}/deploy/kubernetes-$KUBERNETES_VERSION" case ${TEST_DEPLOYMENTMODE} in testing) @@ -73,7 +72,6 @@ data: tls.key: ${NODE_KEY} EOF -echo "$KUBERNETES_VERSION" > $CLUSTER_DIRECTORY/kubernetes.version case "$KUBERNETES_VERSION" in 1.1[01234]) # We cannot exclude the PMEM-CSI pods from the webhook because objectSelector @@ -113,12 +111,34 @@ patchesJson6902: version: v1 kind: StatefulSet name: pmem-csi-controller - path: scheduler-patch.yaml + path: controller-patch.yaml EOF - ${SSH} "cat >'$tmpdir/my-deployment/scheduler-patch.yaml'" <'$tmpdir/my-deployment/controller-patch.yaml'" <"${CLUSTER_DIRECTORY}/kubernetes.version" + kubernetes_usage ) diff --git a/test/test.make b/test/test.make index 2a952d15fe..9991a24d20 100644 --- a/test/test.make +++ b/test/test.make @@ -143,9 +143,39 @@ TEST_E2E_SKIP_ALL = $(TEST_E2E_SKIP) # https://github.com/kubernetes/kubernetes/blob/25ffbe633810609743944edd42d164cd7990071c/test/e2e/storage/testsuites/provisioning.go#L175-L181 TEST_E2E_SKIP_ALL += should.access.volume.from.different.nodes +# This is a test for behavior of kubelet which Kubernetes <= 1.15 doesn't pass. 
+TEST_E2E_SKIP_1.14 += volumeMode.should.not.mount.*map.unused.volumes.in.a.pod +TEST_E2E_SKIP_1.15 += volumeMode.should.not.mount.*map.unused.volumes.in.a.pod + +# It looks like Kubernetes <= 1.15 does not wait for +# NodeUnpublishVolume to complete before deleting the pod: +# +# Apr 21 17:33:12.743: INFO: Wait up to 5m0s for pod "dax-volume-test" to be fully deleted +# pmem-csi-node-4dsmr/pmem-driver@pmem..ker2: I0421 17:33:34.491659 1 tracing.go:19] GRPC call: /csi.v1.Node/NodeGetCapabilities +# pmem-csi-node-4dsmr/pmem-driver@pmem..ker2: I0421 17:33:45.549013 1 tracing.go:19] GRPC call: /csi.v1.Node/NodeUnpublishVolume +# pmem-csi-node-4dsmr/pmem-driver@pmem..ker2: I0421 17:33:45.549189 1 nodeserver.go:295] NodeUnpublishVolume: unmount /var/lib/kubelet/pods/1c5f1fec-b08b-4264-8c55-40a22c1b3d16/volumes/kubernetes.io~csi/vol1/mount +# STEP: delete the pod +# Apr 21 17:33:46.769: INFO: Waiting for pod dax-volume-test to disappear +# Apr 21 17:33:46.775: INFO: Pod dax-volume-test no longer exists +# +# That breaks our volume leak detection because the test continues +# before the volume is truly removed. As a workaround, we disable +# ephemeral volume tests on Kubernetes <= 1.15. That's okay because the feature +# was alpha in those releases and shouldn't be used. +TEST_E2E_SKIP_1.14 += Testpattern:.Ephemeral-volume Testpattern:.inline.ephemeral.CSI.volume +TEST_E2E_SKIP_1.15 += Testpattern:.Ephemeral-volume Testpattern:.inline.ephemeral.CSI.volume + +# Add all Kubernetes version-specific suppressions. +TEST_E2E_SKIP_ALL += $(TEST_E2E_SKIP_$(shell cat _work/$(CLUSTER)/kubernetes.version)) + # E2E tests which are to be executed (space separated list of regular expressions, default is all that aren't skipped). TEST_E2E_FOCUS = +foobar: + echo TEST_E2E_SKIP_$(shell cat _work/$(CLUSTER)/kubernetes.version) + echo $(TEST_E2E_SKIP_$(shell cat _work/$(CLUSTER)/kubernetes.version)) + echo $(TEST_E2E_SKIP_ALL) + # E2E Junit output directory (default empty = none). 
junit_.xml files will be written there, # i.e. usually just junit_01.xml. TEST_E2E_REPORT_DIR=