diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000000..2a94d68b05 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,36 @@ +name: Publish + +on: + push: + branches: + - devel + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + - name: Install dependencies + run: | + export PATH="$HOME/.local/bin:$PATH" + sudo apt-get install -y python3-setuptools + pip3 install --user -r docs/requirements.txt + - name: Build the docs + run: | + export PATH="$HOME/.local/bin:$PATH" + make html + - name: Deploy the docs + run: | + mkdir $HOME/output + mv _output/html $HOME/output/latest + touch $HOME/output/.nojekyll + mv docs/html/index2.html $HOME/output/index.html + cd $HOME/output + git init + git config --global user.name "${GITHUB_ACTOR}" + git config --global user.email "${GITHUB_ACTOR}@github.com" + git add . + git commit -m "latest html output" + git push -f https://${GITHUB_ACTOR}:${{secrets.ACCESS_TOKEN}}@github.com/intelkevinputnam/pmem-csi.git HEAD:gh-pages diff --git a/.gitignore b/.gitignore index 2d434329cf..404a50f9c9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ /vendor /_output /_work +/.tox +Manifest +/_build + diff --git a/Jenkinsfile b/Jenkinsfile index 5c46aa4027..b5f68b3804 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -171,7 +171,8 @@ pipeline { // Install additional tools: // - ssh client for govm - sh "docker exec ${env.BUILD_CONTAINER} swupd bundle-add openssh-client" + // - python3 for Sphinx (i.e. make html) + sh "docker exec ${env.BUILD_CONTAINER} swupd bundle-add openssh-client python3-basic" // Now commit those changes to ensure that the result of "swupd bundle add" gets cached. sh "docker commit ${env.BUILD_CONTAINER} ${env.BUILD_IMAGE}" @@ -213,6 +214,13 @@ pipeline { } } + stage('docsite') { + steps { + sh "${RunInBuilder()} ${env.BUILD_CONTAINER} make vhtml" + publishHTML([allowMissing: false, alwaysLinkToLastBuild: false, keepAll: false, reportDir: '_output/html', reportFiles: 'index.html', reportName: 'Doc Site', reportTitles: '']) + } + } + stage('make test') { options { timeout(time: 20, unit: "MINUTES") diff --git a/Makefile b/Makefile index 4852533db5..5df4dcd4f2 100644 --- a/Makefile +++ b/Makefile @@ -209,3 +209,28 @@ $(addprefix test-kustomize-,$(KUSTOMIZE_OUTPUT)): test-kustomize-%: _work/kustom .PHONY: check-go-version-% check-go-version-%: @ hack/verify-go-version.sh "$*" + +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = _output + +# Generate doc site under _build/html with Sphinx. +vhtml: _work/venv/.stamp + . _work/venv/bin/activate && \ + $(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) && \ + cp docs/html/index.html $(BUILDDIR)/html/index.html + +html: + $(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) && \ + cp docs/html/index.html $(BUILDDIR)/html/index.html + +clean-html: + rm -rf _output/html + +# Set up a Python3 environment with the necessary tools for document creation. +_work/venv/.stamp: docs/requirements.txt + rm -rf ${@D} + python3 -m venv ${@D} + . ${@D}/bin/activate && pip install -r $< + touch $@ diff --git a/README.md b/README.md index 72f25f0d72..1ce47a2834 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PMEM-CSI for Kubernetes +# Introduction to PMEM-CSI for Kubernetes **Note: This is Alpha code and not production ready.** @@ -12,7 +12,7 @@ library](https://github.com/pmem/ndctl). In this readme, we use module (NVDIMM). 
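For reference, the new `vhtml` target added to the Makefile above boils down to the following manual steps; this is a minimal sketch that assumes `docs/requirements.txt` pulls in Sphinx and the extensions listed in `conf.json`:

```sh
# Roughly what `make vhtml` does: create a throwaway virtualenv with the
# documentation tools and run Sphinx, placing the generated site in _output/html.
python3 -m venv _work/venv
. _work/venv/bin/activate
pip install -r docs/requirements.txt
sphinx-build -M html . _output
cp docs/html/index.html _output/html/index.html
```

The resulting `_output/html` directory is what the Jenkins `docsite` stage publishes and what the GitHub publish workflow pushes to the `gh-pages` branch.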
The [v0.6.0 release](https://github.com/intel/pmem-csi/releases/tag/v0.6.0) -is the latest feature release and is [regularly updated](./DEVELOPMENT.md#release-management) with newer base images +is the latest feature release and is [regularly updated](docs/DEVELOPMENT.md#release-management) with newer base images and bug fixes. Older versions are no longer supported. The PMEM-CSI driver follows the [CSI @@ -20,576 +20,15 @@ specification](https://github.com/container-storage-interface/spec) by listening for API requests and provisioning volumes accordingly. - [PMEM-CSI for Kubernetes](#pmem-csi-for-kubernetes) - - [Design](#design) - - [Architecture and Operation](#architecture-and-operation) - - [LVM device mode](#lvm-device-mode) - - [Direct device mode](#direct-device-mode) - - [Driver modes](#driver-modes) - - [Driver Components](#driver-components) - - [Communication between components](#communication-between-components) - - [Security](#security) - - [Volume Persistency](#volume-persistency) - - [Capacity-aware pod scheduling](#capacity-aware-pod-scheduling) - - [Prerequisites](#prerequisites) - - [Software required](#software-required) - - [Hardware required](#hardware-required) - - [Persistent memory pre-provisioning](#persistent-memory-pre-provisioning) - [Supported Kubernetes versions](#supported-kubernetes-versions) - - [Setup](#setup) - - [Get source code](#get-source-code) - - [Run PMEM-CSI on Kubernetes](#run-pmem-csi-on-kubernetes) - - [Automated testing](#automated-testing) - - [Unit testing and code quality](#unit-testing-and-code-quality) - - [QEMU and Kubernetes](#qemu-and-kubernetes) - - [Starting and stopping a test cluster](#starting-and-stopping-a-test-cluster) - - [Running commands on test cluster nodes over ssh](#running-commands-on-test-cluster-nodes-over-ssh) - - [Configuration options](#configuration-options) - - [Running E2E tests](#running-e2e-tests) - - [Application examples](#application-examples) - - [Communication and contribution](#communication-and-contribution) - -## Design - -### Architecture and Operation - -The PMEM-CSI driver can operate in two different device modes: *LVM* and -*direct*. This table contains an overview and comparison of those modes. -There is a more detailed explanation in the following paragraphs. - -| |`LVM` |`direct` | -|:-- |:-- |:-- | -|Main advantage |avoids free space fragmentation1 |simpler, somewhat faster, but free space may get fragmented1 | -|What is served |LVM logical volume |pmem block device | -|Region affinity2 |yes: one LVM volume group is created per region, and a volume has to be in one volume group |yes: namespace can belong to one region only | -|Startup |two extra stages: pmem-ns-init (creates namespaces), vgm (creates volume groups) |no extra steps at startup | -|Namespace modes |`fsdax` mode3 namespaces pre-created as pools |namespace in `fsdax` mode created directly, no need to pre-create pools | -|Limiting space usage | can leave part of device unused during pools creation |no limits, creates namespaces on device until runs out of space | -| *Name* field in namespace | *Name* gets set to 'pmem-csi' to achieve own vs. foreign marking | *Name* gets set to VolumeID, without attempting own vs. foreign marking | -|Minimum volume size| 4 MB | 1 GB (see also alignment adjustment below) | -|Alignment requirements |LVM creation aligns size up to next 4MB boundary |driver aligns size up to next alignment boundary. The default alignment step is 1 GB. 
Device(s) in interleaved mode will require larger minimum as size has to be at least one alignment step. The possibly bigger alignment step is calculated as interleave-set-size multiplied by 1 GB | - -1 **Free space fragmentation** is a problem when there appears to -be enough free capacity for a new namespace, but there isn't a contiguous -region big enough to allocate it. The PMEM-CSI driver is only capable of -allocating continguous memory to a namespace and cannot de-fragment or combine -smaller blocks. For example, this could happen when you create a 63 GB -namespace, followed by a 1 GB namespace, and then delete the 63 GB namespace. -Eventhough there is 127 GB available, the driver cannot create a namespace -larger than 64 GB. - -``` ---------------------------------------------------------------------- -| 63 GB free | 1GB used | 64 GB free | ---------------------------------------------------------------------- -``` - -2 **Region affinity** means that all parts of a provisioned file -system are physically located on device(s) that belong to same PMEM region. -This is important on multi-socket systems where media access time may vary -based on where the storage device(s) are physically attached. - -3 **fsdax mode** is required for NVDIMM -namespaces. See [Persistent Memory -Programming](https://pmem.io/ndctl/ndctl-create-namespace.html) for -details. `devdax` mode is not supported. Though a -raw block volume would be useful when a filesystem isn't needed, Kubernetes -cannot handle [binding a character device to a loop device](https://github.com/kubernetes/kubernetes/blob/7c87b5fb55ca096c007c8739d4657a5a4e29fb09/pkg/volume/util/util.go#L531-L534). - -### LVM device mode - -In Logical Volume Management (LVM) mode the PMEM-CSI driver -uses LVM for logical volume Management to avoid the risk of fragmentation. The -LVM logical volumes are served to satisfy API requests. There is one volume -group created per region, ensuring the region-affinity of served volumes. - -![devicemode-lvm diagram](/docs/images/devicemodes/pmem-csi-lvm.png) - -The driver consists of three separate binaries that form two -initialization stages and a third API-serving stage. - -During startup, the driver scans persistent memory for regions and -namespaces, and tries to create more namespaces using all or part -(selectable via option) of the remaining available space. This first -stage is performed by a separate entity `pmem-ns-init`. - -The second stage of initialization arranges physical volumes provided -by namespaces into LVM volume groups. This is performed by a separate -binary `pmem-vgm`. - -After two initialization stages, the third binary `pmem-csi-driver` -starts serving CSI API requests. - -#### Namespace modes in LVM device mode - -The PMEM-CSI driver pre-creates namespaces in `fsdax` mode forming -the corresponding LVM volume group. The amount of space to be -used is determined using the option `-useforfsdax` given to `pmem-ns-init`. -This options specifies an integer presenting limit as percentage. -The default value is `useforfsdax=100`. - -#### Using limited amount of total space in LVM device mode - -The PMEM-CSI driver can leave space on devices for others, and -recognize "own" namespaces. Leaving space for others can be achieved -by specifying lower-than-100 value to `-useforfsdax` options -The distinction "own" vs. "foreign" is -implemented by setting the _Name_ field in namespace to a static -string "pmem-csi" during namespace creation. 
When adding physical -volumes to volume groups, only those physical volumes that are based on -namespaces with the name "pmem-csi" are considered. - -### Direct device mode - -The following diagram illustrates the operation in Direct device mode: -![devicemode-direct diagram](/docs/images/devicemodes/pmem-csi-direct.png) - -In direct device mode PMEM-CSI driver allocates namespaces directly -from the storage device. This creates device space fragmentation risk, -but reduces complexity and run-time overhead by avoiding additional -device mapping layer. Direct mode also ensures the region-affinity of -served volumes, because provisioned volume can belong to one region -only. - -In Direct mode, the two preparation stages used in LVM mode, are not -needed. - -#### Namespace modes in direct device mode - -The PMEM-CSI driver creates a namespace directly in the mode which is -asked by volume creation request, thus bypassing the complexity of -pre-allocated pools that are used in LVM device mode. - -#### Using limited amount of total space in direct device mode - -In direct device mode, the driver does not attempt to limit space -use. It also does not mark "own" namespaces. The _Name_ field of a -namespace gets value of the VolumeID. - -### Driver modes - -The PMEM-CSI driver supports running in different modes, which can be -controlled by passing one of the below options to the driver's -'_-mode_' command line option. In each mode, it starts a different set -of open source Remote Procedure Call (gRPC) -[servers](#driver-components) on given driver endpoint(s). - -* **_Controller_** should run as a single instance in cluster level. When the - driver is running in _Controller_ mode, it forwards the pmem volume - create/delete requests to the registered node controller servers - running on the worker node. In this mode, the driver starts the - following gRPC servers: - - * [IdentityServer](#identity-server) - * [NodeRegistryServer](#node-registry-server) - * [MasterControllerServer](#master-controller-server) - -* One **_Node_** instance should run on each - worker node that has persistent memory devices installed. When the - driver starts in such mode, it registers with the _Controller_ - driver running on a given _-registryEndpoint_. In this mode, the - driver starts the following servers: - - * [IdentityServer](#identity-server) - * [NodeControllerServer](#node-controller-server) - * [NodeServer](#node-server) - -### Driver Components - -#### Identity Server - -This gRPC server operates on a given endpoint in all driver modes and -implements the CSI [Identity -interface](https://github.com/container-storage-interface/spec/blob/master/spec.md#identity-service-rpc). - -#### Node Registry Server - -When the PMEM-CSI driver runs in _Controller_ mode, it starts a gRPC -server on a given endpoint(_-registryEndpoint_) and serves the -[RegistryServer](pkg/pmem-registry/pmem-registry.proto) interface. The -driver(s) running in _Node_ mode can register themselves with node -specific information such as node id, -[NodeControllerServer](#node-controller-server) endpoint, and their -available persistent memory capacity. - -#### Master Controller Server - -This gRPC server is started by the PMEM-CSI driver running in -_Controller_ mode and serves the -[Controller](https://github.com/container-storage-interface/spec/blob/master/spec.md#controller-service-rpc) -interface defined by the CSI specification. 
The server responds to -CreateVolume(), DeleteVolume(), ControllerPublishVolume(), -ControllerUnpublishVolume(), and ListVolumes() calls coming from -[external-provisioner]() and [external-attacher]() sidecars. It -forwards the publish and unpublish volume requests to the appropriate -[Node controller server](#node-controller-server) running on a worker -node that was registered with the driver. - -#### Node Controller Server - -This gRPC server is started by the PMEM-CSI driver running in _Node_ -mode and implements the -[ControllerPublishVolume](https://github.com/container-storage-interface/spec/blob/master/spec.md#controllerpublishvolume) -and -[ControllerUnpublishVolume](https://github.com/container-storage-interface/spec/blob/master/spec.md#controllerunpublishvolume) -methods of the [Controller -service](https://github.com/container-storage-interface/spec/blob/master/spec.md#controller-service-rpc) -interface defined by the CSI specification. It serves the -ControllerPublishVolume() and ControllerUnpublish() requests coming -from the [Master controller server](#master-controller-server) and -creates/deletes persistent memory devices. - -#### Node Server - -This gRPC server is started by the driver running in _Node_ mode and -implements the [Node -service](https://github.com/container-storage-interface/spec/blob/master/spec.md#node-service-rpc) -interface defined in the CSI specification. It serves the -NodeStageVolume(), NodeUnstageVolume(), NodePublishVolume(), and -NodeUnpublishVolume() requests coming from the Container Orchestrator -(CO). - -### Communication between components - -The following diagram illustrates the communication channels between driver components: -![communication diagram](/docs/images/communication/pmem-csi-communication-diagram.png) - -### Security - -All PMEM-CSI specific communication [shown in above -section](#communication-channels) between Master -Controller([RegistryServer](#node-registry-server), -[MasterControllerServer](#master-controller-server)) and -NodeControllers([NodeControllerServer](#node-controller-server)) is -protected by mutual TLS. Both client and server must identify -themselves and the certificate they present must be trusted. The -common name in each certificate is used to identify the different -components. The following common names have a special meaning: - -- `pmem-registry` is used by the [RegistryServer](#node-registry-server). -- `pmem-node-controller` is used by [NodeControllerServers](#node-controller-server) - -The [`test/setup-ca.sh`](test/setup-ca.sh) -script shows how to generate self-signed certificates. The test cluster is set -up using certificates created by that script, with secrets prepared by -[`test/setup-deployment.sh`](test/setup-deployment.sh) before -deploying the driver using the provided [deployment files](deploy/). - -Beware that these are just examples. Administrators of a cluster must -ensure that they choose key lengths and algorithms of sufficient -strength for their purposes and manage certificate distribution. - -A production deployment can improve upon that by using some other key -delivery mechanism, like for example -[Vault](https://www.vaultproject.io/). - - - -### Volume Persistency - -In a typical CSI deployment, volumes are provided by a storage backend -that is independent of a particular node. When a node goes offline, -the volume can be mounted elsewhere. But PMEM volumes are *local* to -node and thus can only be used on the node where they were -created. 
This means the applications using PMEM volume cannot freely -move between nodes. This limitation needs to be considered when -designing and deploying applications that are to use *local storage*. - -These are the volume persistency models considered for implementation -in PMEM-CSI to serve different application use cases: - -* **Persistent volumes** -A volume gets created independently of the application, on some node -where there is enough free space. Applications using such a volume are -then forced to run on that node and cannot run when the node is -down. Data is retained until the volume gets deleted. - -* **Ephemeral volumes** -Each time an application starts to run on a node, a new volume is -created for it on that node. When the application stops, the volume is -deleted. The volume cannot be shared with other applications. Data on -this volume is retained only while the application runs. - -* **Cache volumes** -Volumes are pre-created on a certain set of nodes, each with its own -local data. Applications are started on those nodes and then get to -use the volume on their node. Data persists across application -restarts. This is useful when the data is only cached information that -can be discarded and reconstructed at any time *and* the application -can reuse existing local data when restarting. - -Volume | Kubernetes | PMEM-CSI | Limitations ---- | --- | --- | --- -Persistent | supported | supported | topology aware scheduling1 -Ephemeral | supported2 | supported | resource constraints3 -Cache | supported | supported | topology aware scheduling1 - -1 [Topology aware -scheduling](https://github.com/kubernetes/enhancements/issues/490) -ensures that an application runs on a node where the volume was -created. For CSI-based drivers like PMEM-CSI, Kubernetes >= 1.13 is -needed. On older Kubernetes releases, pods must be scheduled manually -onto the right node(s). - -2 [CSI ephemeral volumes](https://kubernetes.io/docs/concepts/storage/volumes/#csi-ephemeral-volumes) -feature support is alpha in Kubernetes v1.15, and beta in v1.16. - -3 The upstream design for ephemeral volumes currently does -not take [resource -constraints](https://github.com/kubernetes/enhancements/pull/716#discussion_r250536632) -into account. If an application gets scheduled onto a node and then -creating the ephemeral volume on that node fails, the application on -the node cannot start until resources become available. - -#### Usage on Kubernetes - -Kubernetes cluster administrators can expose above mentioned persistent and cache volumes -to applications using -[`StorageClass -Parameters`](https://kubernetes.io/docs/concepts/storage/storage-classes/#parameters). An -optional `persistencyModel` parameter differentiates how the -provisioned volume can be used: - -* no `persistencyModel` parameter or `persistencyModel: normal` in `StorageClass` - A normal Kubernetes persistent volume. In this case - PMEM-CSI creates PMEM volume on a node and the application that - claims to use this volume is supposed to be scheduled onto this node - by Kubernetes. Choosing of node is depend on StorageClass - `volumeBindingMode`. In case of `volumeBindingMode: Immediate` - PMEM-CSI chooses a node randomly, and in case of `volumeBindingMode: - WaitForFirstConsumer` (also known as late binding) Kubernetes first chooses a node for scheduling - the application, and PMEM-CSI creates the volume on that - node. Applications which claim a normal persistent volume has to use - `ReadOnlyOnce` access mode in its `accessModes` list. 
This - [diagram](/docs/images/sequence/pmem-csi-persistent-sequence-diagram.png) - illustrates how a normal persistent volume gets provisioned in - Kubernetes using PMEM-CSI driver. - -* `persistencyModel: cache` -Volumes of this type shall be used in combination with -`volumeBindingMode: Immediate`. In this case, PMEM-CSI creates a set -of PMEM volumes each volume on different node. The number of PMEM -volumes to create can be specified by `cacheSize` StorageClass -parameter. Applications which claim a `cache` volume can use -`ReadWriteMany` in its `accessModes` list. Check with provided [cache -StorageClass](deploy/common/pmem-storageclass-cache.yaml) -example. This -[diagram](/docs/images/sequence/pmem-csi-cache-sequence-diagram.png) -illustrates how a cache volume gets provisioned in Kubernetes using -PMEM-CSI driver. - -**NOTE**: Cache volumes are associated with a node, not a pod. Multiple -pods using the same cache volume on the same node will not get their -own instance but will end up sharing the same PMEM volume instead. -Application deployment has to consider this and use available Kubernetes -mechanisms like [node -anti-affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity). -Check with the provided [cache -application](deploy/common/pmem-app-cache.yaml) example. - -**WARNING**: late binding (`volumeBindingMode:WaitForFirstConsume`) has some caveats: -* Pod creation may get stuck when there isn't enough capacity left for - the volumes; see the next section for details. -* A node is only chosen the first time a pod starts. After that it will always restart - on that node, because that is where the persistent volume was created. - -Volume requests embedded in Pod spec are provisioned as ephemeral volumes. The volume request could use below fields as [`volumeAttributes`](https://kubernetes.io/docs/concepts/storage/volumes/#csi): - -|key|meaning|optional|values| -|---|-------|--------|-------------| -|`size`|Size of the requested ephemeral volume as [Kubernetes memory string](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) ("1Mi" = 1024*1024 bytes, "1e3K = 1000000 bytes)|No|| -|`eraseAfter`|Clear all data after use and before
deleting the volume|Yes|`true` (default),
`false`| - -Check with provided [example application](deploy/kubernetes-1.15/pmem-app-ephemeral.yaml) for -ephemeral volume usage. - -### Capacity-aware pod scheduling - -PMEM-CSI implements the CSI `GetCapacity` call, but Kubernetes -currently doesn't call that and schedules pods onto nodes without -being aware of available storage capacity on the nodes. The effect is -that pods using volumes with late binding may get tentatively assigned -to a node and then get stuck because that decision is not reconsidered -when the volume cannot be created there ([a -bug](https://github.com/kubernetes/kubernetes/issues/72031)). Even if -that decision is reconsidered, the same node may get selected again -because Kubernetes does not get informed about the insufficient -storage. Pods with ephemeral inline volumes always get stuck because -the decision to use the node [is final](https://github.com/kubernetes-sigs/descheduler/issues/62). - -Work is [under -way](https://github.com/kubernetes/enhancements/pull/1353) to enhance -scheduling in Kubernetes. In the meantime, PMEM-CSI provides two components -that help with pod scheduling: - -#### Scheduler extender - -When a pod requests the special [extended -resource](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#extended-resources) -called `pmem-csi.intel.com/scheduler`, the Kubernetes scheduler calls -a [scheduler -extender](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/scheduling/scheduler_extender.md) -provided by PMEM-CSI with a list of nodes that a pod might run -on. This extender is implemented in the master controller and thus can -connect to the controller on each of these nodes to check for -capacity. PMEM-CSI then filters out all nodes which currently do not -have enough storage left for the volumes that still need to be -created. This considers inline ephemeral volumes and all unbound -volumes, regardless whether they use late binding or immediate -binding. - -This special scheduling can be requested manually by adding this snippet -to one container in the pod spec: -``` -containers: -- name: some-container - ... - resources: - limits: - pmem-csi.intel.com/scheduler: "1" - requests: - pmem-csi.intel.com/scheduler: "1" -``` - -This scheduler extender is optional and not necessarily installed in -all clusters that have PMEM-CSI. Don't add this extended resource -unless the scheduler extender is installed, otherwise the pod won't -start! - -#### Pod admission webhook - -Having to add `pmem-csi.intel.com/scheduler` manually is not -user-friendly. To simplify this, PMEM-CSI provides a [mutating -admission -webhook](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/) -which intercepts the creation of all pods. If that pod uses inline -ephemeral volumes or volumes with late binding that are provided by -PMEM-CSI, the webhook transparently adds the extended resource -request. PMEM-CSI volumes with immediate binding are ignored because -for those the normal topology support ensures that unsuitable nodes -are filtered out. - -The webhook can only do that if the persistent volume claim (PVC) and -its storage class have been created already. This is normally not -required: it's okay to create the pod first, then later add the -PVC. The pod simply won't start in the meantime. - -The webhook deals with this uncertainty by allowing the creation of -the pod without adding the extended resource when it lacks the -necessary information. 
The alternative would be to reject the pod, but -that would be a change of behavior of the cluster that may affect also pods -that don't use PMEM-CSI at all. - -Users must take care to create PVCs first, then the pods if they want -to use the webhook. In practice, that is often already done because it -is more natural, so it is not a big limitation. - -## Prerequisites - -### Software required - -The recommended mimimum Linux kernel version for running the PMEM-CSI driver is 4.15. See [Persistent Memory Programming](https://pmem.io/2018/05/15/using_persistent_memory_devices_with_the_linux_device_mapper.html) for more details about supported kernel versions. - -### Hardware required - -Persistent memory device(s) are required for operation. However, some -development and testing can be done using QEMU-emulated persistent -memory devices. See the ["QEMU and Kubernetes"](#qemu-and-kubernetes) -section for the commands that create such a virtual test cluster. - -### Persistent memory pre-provisioning - -The PMEM-CSI driver needs pre-provisioned regions on the NVDIMM -device(s). The PMEM-CSI driver itself intentionally leaves that to the -administrator who then can decide how much and how PMEM is to be used -for PMEM-CSI. - -Beware that the PMEM-CSI driver will run without errors on a node -where PMEM was not prepared for it. It will then report zero local -storage for that node, something that currently is only visible in the -log files. - -When running the Kubernetes cluster and PMEM-CSI on bare metal, -the [ipmctl](https://github.com/intel/ipmctl) utility can be used to create regions. -App Direct Mode has two configuration options - interleaved or non-interleaved. -One region per each NVDIMM is created in non-interleaved configuration. -In such a configuration, a PMEM-CSI volume cannot be larger than one NVDIMM. - -Example of creating regions without interleaving, using all NVDIMMs: -```sh -# ipmctl create -goal PersistentMemoryType=AppDirectNotInterleaved -``` - -Alternatively, multiple NVDIMMs can be combined to form an interleaved set. -This causes the data to be striped over multiple NVDIMM devices -for improved read/write performance and allowing one region (also, PMEM-CSI volume) -to be larger than single NVDIMM. - -Example of creating regions in interleaved mode, using all NVDIMMs: -```sh -# ipmctl create -goal PersistentMemoryType=AppDirect -``` - -When running inside virtual machines, each virtual machine typically -already gets access to one region and `ipmctl` is not needed inside -the virtual machine. 
Instead, that region must be made available for -use with PMEM-CSI because when the virtual machine comes up for the -first time, the entire region is already allocated for use as a single -block device: -``` sh -# ndctl list -RN -{ - "regions":[ - { - "dev":"region0", - "size":34357641216, - "available_size":0, - "max_available_extent":0, - "type":"pmem", - "persistence_domain":"unknown", - "namespaces":[ - { - "dev":"namespace0.0", - "mode":"raw", - "size":34357641216, - "sector_size":512, - "blockdev":"pmem0" - } - ] - } - ] -} -# ls -l /dev/pmem* -brw-rw---- 1 root disk 259, 0 Jun 4 16:41 /dev/pmem0 -``` - -Labels must be initialized in such a region, which must be performed -once after the first boot: -``` sh -# ndctl disable-region region0 -disabled 1 region -# ndctl init-labels nmem0 -initialized 1 nmem -# ndctl enable-region region0 -enabled 1 region -# ndctl list -RN -[ - { - "dev":"region0", - "size":34357641216, - "available_size":34357641216, - "max_available_extent":34357641216, - "type":"pmem", - "iset_id":10248187106440278, - "persistence_domain":"unknown" - } -] -# ls -l /dev/pmem* -ls: cannot access '/dev/pmem*': No such file or directory -``` + - [Design and architecture](docs/design.md) + - [Instructions for Admins and Users](docs/install.md) + - [Prerequisites](docs/install.md#prerequisites) + - [Installation and setup](docs/install.md#installation-and-setup) + - [Filing issues and contributing](docs/install.md#filing-issues-and-contributing) + - [Develop and contribute](docs/DEVELOPMENT.md) + - [Automated testing](docs/autotest.md) + - [Application examples](examples/readme.rst) ## Supported Kubernetes versions @@ -613,634 +52,3 @@ available in later versions. The external-provisioner v1.0.1 for Kubernetes 1.13 lacks the `--strict-topology` flag and therefore late binding is unreliable. It's also a release that is not supported officially by upstream anymore. - - -## Setup - -### Get source code - -PMEM-CSI uses Go modules and thus can be checked out and (if that should be desired) -built anywhere in the filesystem. Pre-built container images are available and thus -users don't need to build from source, but they will still need some additional files. -To get the source code, use: - -``` -git clone https://github.com/intel/pmem-csi -``` - -### Run PMEM-CSI on Kubernetes - -This section assumes that a Kubernetes cluster is already available -with at least one node that has persistent memory device(s). For development or -testing, it is also possible to use a cluster that runs on QEMU virtual -machines, see the ["QEMU and Kubernetes"](#qemu-and-kubernetes) section below. - -- **Make sure that the alpha feature gates CSINodeInfo and CSIDriverRegistry are enabled** - -The method to configure alpha feature gates may vary, depending on the Kubernetes deployment. -It may not be necessary anymore when the feature has reached beta state, which depends -on the Kubernetes version. - -- **Label the cluster nodes that provide persistent memory device(s)** - -```sh - $ kubectl label node storage=pmem -``` - -- **Set up certificates** - -Certificates are required as explained in [Security](#security). -If you are not using the test cluster described in -[Starting and stopping a test cluster](#starting-and-stopping-a-test-cluster) -where certificates are created automatically, you must set up certificates manually. -This can be done by running the `./test/setup-ca-kubernetes.sh` script for your cluster. -This script requires "cfssl" tools which can be downloaded. 
-These are the steps for manual set-up of certificates: - -- Download cfssl tools - -```sh - $ curl -L https://pkg.cfssl.org/R1.2/cfssl_linux-amd64 -o _work/bin/cfssl --create-dirs - $ curl -L https://pkg.cfssl.org/R1.2/cfssljson_linux-amd64 -o _work/bin/cfssljson --create-dirs - $ chmod a+x _work/bin/cfssl _work/bin/cfssljson -``` - -- Run certificates set-up script - -```sh - $ KUBCONFIG="<> PATH="$PATH:$PWD/_work/bin" ./test/setup-ca-kubernetes.sh -``` - -- **Deploy the driver to Kubernetes** - -The `deploy/kubernetes-` directory contains -`pmem-csi*.yaml` files which can be used to deploy the driver on that -Kubernetes version. The files in the directory with the highest -Kubernetes version might also work for more recent Kubernetes -releases. All of these deployments use images published by Intel on -[Docker Hub](https://hub.docker.com/u/intel). - -For each Kubernetes version, four different deployment variants are provided: - - - `direct` or `lvm`: one uses direct device mode, the other LVM device mode. - - `testing`: the variants with `testing` in the name enable debugging - features and shouldn't be used in production. - -For example, to deploy for production with LVM device mode onto Kubernetes 1.14, use: - -```sh - $ kubectl create -f deploy/kubernetes-1.14/pmem-csi-lvm.yaml -``` - -The PMEM-CSI [scheduler extender](#scheduler-extender) and -[webhook](#pod-admission-webhook) are not enabled in this basic -installation. See [below](#enable-scheduler-extensions) for -instructions about that. - -These variants were generated with -[`kustomize`](https://github.com/kubernetes-sigs/kustomize). -`kubectl` >= 1.14 includes some support for that. The sub-directories -of `deploy/kubernetes-` can be used as bases -for `kubectl kustomize`. For example: - - - Change namespace: - ``` - $ mkdir -p my-pmem-csi-deployment - $ cat >my-pmem-csi-deployment/kustomization.yaml <my-pmem-csi-deployment/kustomization.yaml <my-pmem-csi-deployment/lvm-parameters-patch.yaml <,storage=pmem -``` - -If **storage=pmem** is missing, label manually as described above. If -**pmem-csi.intel.com/node** is missing, then double-check that the -alpha feature gates are enabled, that the CSI driver is running on the node, -and that the driver's log output doesn't contain errors. - -- **Define two storage classes using the driver** - -```sh - $ kubectl create -f deploy/kubernetes-/pmem-storageclass-ext4.yaml - $ kubectl create -f deploy/kubernetes-/pmem-storageclass-xfs.yaml -``` - -- **Provision two pmem-csi volumes** - -```sh - $ kubectl create -f deploy/kubernetes-/pmem-pvc.yaml -``` - -- **Verify two Persistent Volume Claims have 'Bound' status** - -```sh - $ kubectl get pvc - NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE - pmem-csi-pvc-ext4 Bound pvc-f70f7b36-6b36-11e9-bf09-deadbeef0100 4Gi RWO pmem-csi-sc-ext4 16s - pmem-csi-pvc-xfs Bound pvc-f7101fd2-6b36-11e9-bf09-deadbeef0100 4Gi RWO pmem-csi-sc-xfs 16s -``` - -- **Start two applications requesting one provisioned volume each** - -```sh - $ kubectl create -f deploy/kubernetes-/pmem-app.yaml -``` - -These applications use **storage: pmem** in the nodeSelector -list to ensure scheduling to a node supporting pmem device, and each requests a mount of a volume, -one with ext4-format and another with xfs-format file system. 
- -- **Verify two application pods reach 'Running' status** - -```sh - $ kubectl get po my-csi-app-1 my-csi-app-2 - NAME READY STATUS RESTARTS AGE - my-csi-app-1 1/1 Running 0 6m5s - NAME READY STATUS RESTARTS AGE - my-csi-app-2 1/1 Running 0 6m1s -``` - -- **Check that applications have a pmem volume mounted with added dax option** - -```sh - $ kubectl exec my-csi-app-1 -- df /data - Filesystem 1K-blocks Used Available Use% Mounted on - /dev/ndbus0region0fsdax/5ccaa889-551d-11e9-a584-928299ac4b17 - 4062912 16376 3820440 0% /data - $ kubectl exec my-csi-app-2 -- df /data - Filesystem 1K-blocks Used Available Use% Mounted on - /dev/ndbus0region0fsdax/5cc9b19e-551d-11e9-a584-928299ac4b17 - 4184064 37264 4146800 1% /data - - $ kubectl exec my-csi-app-1 -- mount |grep /data - /dev/ndbus0region0fsdax/5ccaa889-551d-11e9-a584-928299ac4b17 on /data type ext4 (rw,relatime,dax) - $ kubectl exec my-csi-app-2 -- mount |grep /data - /dev/ndbus0region0fsdax/5cc9b19e-551d-11e9-a584-928299ac4b17 on /data type xfs (rw,relatime,attr2,dax,inode64,noquota) -``` - -#### Note about raw block volumes - -Applications can use volumes provisioned by PMEM-CSI as [raw block -devices](https://kubernetes.io/blog/2019/03/07/raw-block-volume-support-to-beta/). Such -volumes use the same "fsdax" namespace mode as filesystem volumes -and therefore are block devices. That mode only supports dax (= -`mmap(MAP_SYNC)`) through a filesystem. Pages mapped on the raw block -device go through the Linux page cache. Applications have to format -and mount the raw block volume themselves if they want dax. The -advantage then is that they have full control over that part. - -For provisioning a PMEM volume as raw block device, one has to create a -`PersistentVolumeClaim` with `volumeMode: Block`. See example [PVC]( -deploy/common/pmem-pvc-block-volume.yaml) and -[application](deploy/common/pmem-app-block-volume.yaml) for usage reference. - -That example demonstrates how to handle some details: -- `mkfs.ext4` needs `-b 4096` to produce volumes that support dax; - without it, the automatic block size detection may end up choosing - an unsuitable value depending on the volume size. -- [Kubernetes bug #85624](https://github.com/kubernetes/kubernetes/issues/85624) - must be worked around to format and mount the raw block device. - -#### Enable scheduler extensions - -The PMEM-CSI scheduler extender and admission webhook are provided by -the PMEM-CSI controller. They need to be enabled during deployment via -the `--schedulerListen=[]:` parameter. The -listen address is optional and can be left out. The port is where a -HTTPS server will run. It uses the same certificates as the internal -gRPC service. When using the CA creation script described above, they -will contain alternative names for the URLs described in this section -(service names, `127.0.0.1` IP address). - -This parameter can be added to one of the existing deployment files -with `kustomize`. All of the following examples assume that the -current directory contains the `deploy` directory from the PMEM-CSI -repository. It is also possible to reference the base via a -[URL](https://github.com/kubernetes-sigs/kustomize/blob/master/examples/remoteBuild.md). 
- -``` sh -mkdir my-pmem-csi-deployment - -cat >my-pmem-csi-deployment/kustomization.yaml <my-pmem-csi-deployment/scheduler-patch.yaml <my-scheduler/kustomization.yaml <my-scheduler/node-port-patch.yaml </var/lib/scheduler/scheduler-policy.cfg' <:", - "filterVerb": "filter", - "prioritizeVerb": "prioritize", - "nodeCacheCapable": false, - "weight": 1, - "managedResources": - [{ - "name": "pmem-csi.intel.com/scheduler", - "ignoredByScheduler": true - }] - }] -} -EOF - -cat >kubeadm.config <= -1.15, it can also be used to let individual pods bypass the webhook by -adding that label. The CA gets configured explicitly, which is -supported for webhooks. - -``` sh -mkdir my-webhook - -cat >my-webhook/kustomization.yaml <my-webhook/webhook-patch.yaml < - -## Automated testing - -### Unit testing and code quality - -Use the `make test` command. - -### QEMU and Kubernetes - -E2E testing relies on a cluster running inside multiple QEMU virtual -machines deployed by [GoVM](https://github.com/govm-project/govm). The -same cluster can also be used interactively when real hardware is not -available. - -E2E testing is known to work on a Linux development host system. The user -must be allowed to use Docker. - -KVM must be enabled. Usually this is the case when `/dev/kvm` exists. -The current user does not need the privileges to use KVM and QEMU -doesn't have to be installed because GoVM will run QEMU inside a -container with root privileges. - -Note that cloud providers often don't offer KVM support on their -regular machines. Search for "nested virtualization" for your provider -to determine whether and how it supports KVM. - -Nested virtualization is also needed when using Kata Containers inside -the cluster. On Intel-based machines it can be enabled by loading the -`kvm_intel` module with `nested=1` (see -https://wiki.archlinux.org/index.php/KVM#Nested_virtualization). At -this time, Kata Containers up to and including 1.9.1 is [not -compatible with -PMEM-CSI](https://github.com/intel/pmem-csi/issues/303) because -volumes are not passed in as PMEM, but Kata Containers [can be -installed](https://github.com/kata-containers/packaging/tree/master/kata-deploy#kubernetes-quick-start) -and used for applications that are not using PMEM. - -The `clear-cloud` image is downloaded automatically. By default, -four different virtual machines are prepared. Each image is pre-configured -with its own hostname and with network. - -The images will contain the latest -[Clear Linux OS](https://clearlinux.org/) and have the Kubernetes -version supported by Clear Linux installed. - -PMEM-CSI images must have been created and published in some Docker -registry, as described earlier in [build PMEM-CSI](#build-pmem-csi). -In addition, that registry must be accessible from inside the -cluster. That works for the default (a local registry in the build -host) but may require setting additional [configuration -options](#configuration-options) for other scenarios. - -### Starting and stopping a test cluster - -`make start` will bring up a Kubernetes test cluster inside four QEMU -virtual machines. -The first node is the Kubernetes master without -persistent memory. -The other three nodes are worker nodes with one emulated 32GB NVDIMM each. -After the cluster has been formed, `make start` adds `storage=pmem` label -to the worker nodes and deploys the PMEM-CSI driver. -Once `make start` completes, the cluster is ready for interactive use via -`kubectl` inside the virtual machine. 
Alternatively, you can also -set `KUBECONFIG` as shown at the end of the `make start` output -and use `kubectl` binary on the host running VMs. - -Use `make stop` to stop and remove the virtual machines. - -`make restart` can be used to cleanly reboot all virtual -machines. This is useful during development after a `make push-images` -to ensure that the cluster runs those rebuilt images. - -### Running commands on test cluster nodes over ssh - -`make start` generates ssh wrapper scripts `_work/pmem-govm/ssh.N` for each -test cluster node which are handy for running a single command or to -start an interactive shell. Examples: - -`_work/pmem-govm/ssh.0 kubectl get pods` runs a kubectl command on -the master node. - -`_work/pmem-govm/ssh.1` starts a shell on the first worker node. - -### Deploying PMEM-CSI on a test cluster - -After `make start`, PMEM-CSI is *not* installed yet. Either install -manually as [described for a normal -cluster](#run-pmem-csi-on-kubernetes) or use the -[setup-deployment.sh](./test/setup-deployment.sh) script. - -### Configuration options - -Several aspects of the cluster and build setup can be configured by overriding -the settings in the [test-config.sh](test/test-config.sh) file. See -that file for a description of all options. Options can be set as -environment variables of `make start` on a case-by-case basis or -permanently by creating a file like `test/test-config.d/my-config.sh`. - -Multiple different clusters can be brought up in parallel by changing -the default `pmem-govm` cluster name via the `CLUSTER` env variable. - -For example, this invocation sets up a cluster using the non-default -Fedora distro: - -``` sh -TEST_DISTRO=fedora CLUSTER=fedora-govm make start -``` - -### Running E2E tests - -`make test_e2e` will run [csi-test -sanity](https://github.com/kubernetes-csi/csi-test/tree/master/pkg/sanity) -tests and some [Kubernetes storage -tests](https://github.com/kubernetes/kubernetes/tree/master/test/e2e/storage/testsuites) -against the PMEM-CSI driver. - -When [ginkgo](https://onsi.github.io/ginkgo/) is installed, then it -can be used to run individual tests and to control additional aspects -of the test run. For example, to run just the E2E provisioning test -(create PVC, write data in one pod, read it in another) in verbose mode: - -``` sh -$ KUBECONFIG=$(pwd)/_work/pmem-govm/kube.config REPO_ROOT=$(pwd) ginkgo -v -focus=pmem-csi.*should.provision.storage.with.defaults ./test/e2e/ -Nov 26 11:21:28.805: INFO: The --provider flag is not set. Treating as a conformance test. Some tests may not be run. -Running Suite: PMEM E2E suite -============================= -Random Seed: 1543227683 - Will randomize all specs -Will run 1 of 61 specs - -Nov 26 11:21:28.812: INFO: checking config -Nov 26 11:21:28.812: INFO: >>> kubeConfig: /nvme/gopath/src/github.com/intel/pmem-csi/_work/pmem-govm/kube.config -Nov 26 11:21:28.817: INFO: Waiting up to 30m0s for all (but 0) nodes to be schedulable -... -Ran 1 of 61 Specs in 58.465 seconds -SUCCESS! -- 1 Passed | 0 Failed | 0 Pending | 60 Skipped -PASS - -Ginkgo ran 1 suite in 1m3.850672246s -Test Suite Passed -``` - -It is also possible to run just the sanity tests until one of them fails: - -``` sh -$ REPO_ROOT=`pwd` ginkgo '-focus=sanity' -failFast ./test/e2e/ -... 
-``` - -## Application examples - -Information about specific usages of PMEM-CSI are described in separate documents: - -* Deploying a Redis cluster through the redis-operator using QEMU-emulated persistent memory devices ([examples/redis-operator.md](examples/redis-operator.md)). -* Installing Kubernetes and PMEM-CSI on Google Cloud machines. ([examples/gce.md](examples/gce.md)). - -## Communication and contribution - -Report a bug by [filing a new issue](https://github.com/intel/pmem-csi/issues). - -Before making your first contribution, be sure to read the [development documentation](DEVELOPMENT.md) -for guidance on code quality and branches. - -Contribute by [opening a pull request](https://github.com/intel/pmem-csi/pulls). - -Learn [about pull requests](https://help.github.com/articles/using-pull-requests/). - -**Reporting a Potential Security Vulnerability:** If you have discovered potential security vulnerability in PMEM-CSI, please send an e-mail to secure@intel.com. For issues related to Intel Products, please visit [Intel Security Center](https://security-center.intel.com). - -It is important to include the following details: - -- The projects and versions affected -- Detailed description of the vulnerability -- Information on known exploits - -Vulnerability information is extremely sensitive. Please encrypt all security vulnerability reports using our [PGP key](https://www.intel.com/content/www/us/en/security-center/pgp-public-key.html). - -A member of the Intel Product Security Team will review your e-mail and contact you to collaborate on resolving the issue. For more information on how Intel works to resolve security issues, see: [vulnerability handling guidelines](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). - - diff --git a/conf.json b/conf.json new file mode 100644 index 0000000000..0d922c9e15 --- /dev/null +++ b/conf.json @@ -0,0 +1,32 @@ +{ + "author": "", + "copyright": "2019,", + "exclude_patterns": [ + "_output", + "Thumbs.db", + ".DS_Store", + ".tox", + "_work", + "deploy/kustomize", + "test/test-config.d", + "pkg/scheduler" + ], + "extensions": [ + "recommonmark", + "sphinx_markdown_tables" + ], + "html_static_path": [ + "_static" + ], + "html_theme": "sphinx_rtd_theme", + "project": "PMEM-CSI", + "templates_path": [ + "_templates" + ], + "html_copy_source": false, + "rst_epilog": ".. include:: /docs/substitutions.txt", + "source_suffix": { + ".rst": "restructuredtext", + ".md": "markdown" + } +} diff --git a/conf.py b/conf.py new file mode 100644 index 0000000000..d0e82f59a8 --- /dev/null +++ b/conf.py @@ -0,0 +1,169 @@ + +import json +from docutils import nodes +from os.path import isdir, isfile, join, basename, dirname +from os import makedirs, getenv +from shutil import copyfile + +############################################################################## +# +# This section determines the behavior of links to local items in .md files. +# +# if useGitHubURL == True: +# +# links to local files and directories will be turned into github URLs +# using either the baseBranch defined here or using the commit SHA. +# +# if useGitHubURL == False: +# +# local files will be moved to the website directory structure when built +# local directories will still be links to github URLs +# +# if built with GitHub workflows: +# +# the GitHub URLs will use the commit SHA (GITHUB_SHA environment variable +# is defined by GitHub workflows) to link to the specific commit. 
+# +############################################################################## + +baseBranch = "devel" +useGitHubURL = True +commitSHA = getenv('GITHUB_SHA') +githubBaseURL = "https://github.com/intelkevinputnam/pmem-csi/" +githubFileURL = githubBaseURL + "blob/" +githubDirURL = githubBaseURL + "tree/" +if commitSHA: + githubFileURL = githubFileURL + commitSHA + "/" + githubDirURL = githubDirURL + commitSHA + "/" +else: + githubFileURL = githubFileURL + baseBranch + "/" + githubDirURL = githubDirURL + baseBranch + "/" + +# End GitHub URL section + +with open('conf.json') as jsonFile: + conf = json.load(jsonFile) + +for item in conf: + globals()[item] = (conf[item]) + +def setup(app): + app.connect('doctree-resolved',fixLocalMDAnchors) + app.connect('missing-reference',fixRSTLinkInMD) + +############################################################################## +# +# This section defines callbacks that make markdown specific tweaks to +# either: +# +# 1. Fix something that recommonmark does wrong. +# 2. Provide support for .md files that are written as READMEs in a GitHub +# repo. +# +# Only use these changes if using the extension ``recommonmark``. +# +############################################################################## + + +# Callback registerd with 'missing-reference'. +def fixRSTLinkInMD(app, env, node, contnode): + refTarget = node.get('reftarget') + filePath = refTarget.lstrip("/") + if '.rst' in refTarget and "://" not in refTarget: + # This occurs when a .rst file is referenced from a .md file + # Currently unable to check if file exists as no file + # context is provided and links are relative. + # + # Example: [Application examples](examples/readme.rst) + # + contnode['refuri'] = contnode['refuri'].replace('.rst','.html') + contnode['internal'] = "True" + return contnode + else: + # This occurs when a file is referenced for download from an .md file. + # Construct a list of them and short-circuit the warning. The files + # are moved later (need file location context). To avoid warnings, + # write .md files, make the links absolute. This only marks them fixed + # if it can verify that they exist. + # + # Example: [Makefile](/Makefile) + # + if isfile(filePath) or isdir(filePath): + return contnode + + +def normalizePath(docPath,uriPath): + if uriPath == "": + return uriPath + if "#" in uriPath: + # Strip out anchors + uriPath = uriPath.split("#")[0] + if uriPath.startswith("/"): + # It's an absolute path + return uriPath.lstrip("/") #path to file from project directory + else: + # It's a relative path + docDir = dirname(docPath) + return join(docDir,uriPath) #path to file from referencing file + + +# Callback registerd with 'doctree-resolved'. +def fixLocalMDAnchors(app, doctree, docname): + for node in doctree.traverse(nodes.reference): + uri = node.get('refuri') + filePath = normalizePath(docname,uri) + if isfile(filePath): + # Only do this if the file exists. + # + # TODO: Pop a warning if the file doesn't exist. + # + if '.md' in uri and '://' not in uri: + # Make sure .md file links that weren't caught are converted. + # These occur when creating an explicit link to an .md file + # from an .rst file. By default these are not validated by Sphinx + # or recommonmark. Only toctree references are validated. recommonmark + # also fails to convert links to local Markdown files that include + # anchors. This fixes that as well. 
+ # + # Only include this code if .md files are being converted to html + # + # Example: `Google Cloud Engine `__ + # [configuration options](autotest.md#configuration-options) + # + node['refuri'] = node['refuri'].replace('.md','.html') + else: + # Handle the case where markdown is referencing local files in the repo + # + # Example: [Makefile](/Makefile) + # + if useGitHubURL: + # Replace references to local files with links to the GitHub repo + # + newURI = githubFileURL + filePath + print("new url: ", newURI) + node['refuri']=newURI + else: + # If there are links to local files other than .md (.rst files are caught + # when warnings are fired), move the files into the Sphinx project, so + # they can be accessed. + newFileDir = join(app.outdir,dirname(filePath)) # where to move the file in Sphinx output. + newFilePath = join(app.outdir,filePath) + newURI = uri # if the path is relative no need to change it. + if uri.startswith("/"): + # It's an absolute path. Need to make it relative. + uri = uri.lstrip("/") + docDirDepth = len(docname.split("/")) - 1 + newURI = "../"*docDirDepth + uri + if not isdir(newFileDir): + makedirs(newFileDir) + copyfile(filePath,newFilePath) + node['refuri'] = newURI + elif "#" not in uri: # ignore anchors + # turn links to directories into links to the repo + if isdir(filePath): + newURI = githubDirURL + filePath + node['refuri']=newURI + + + + diff --git a/DEVELOPMENT.md b/docs/DEVELOPMENT.md similarity index 77% rename from DEVELOPMENT.md rename to docs/DEVELOPMENT.md index 51734b0c3a..4488507c75 100644 --- a/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -1,4 +1,4 @@ -# Develop and Contribute +# Develop and contribute - [Setup](#setup) - [Build PMEM-CSI](#build-pmem-csi) @@ -20,10 +20,10 @@ - [Specific arguments to pmem-csi-driver](#specific-arguments-to-pmem-csi-driver) - [Environment variables](#environment-variables) - [Logging](#logging) -- [Notes about switching device mode](#notes-about-switching-device-mode) +- [Switching device mode](#switching-device-mode) - [Going from LVM device mode to direct device mode](#going-from-lvm-device-mode-to-direct-device-mode) - [Going from direct device mode to LVM device mode](#going-from-direct-device-mode-to-lvm-device-mode) -- [Notes about accessing system directories in a container](#notes-about-accessing-system-directories-in-a-container) +- [Accessing system directories in a container](#accessing-system-directories-in-a-container) - [Read-only access to /sys](#read-only-access-to-sys) - [Access to /dev of host](#access-to-dev-of-host) - [Repository elements which are generated or created separately](#repository-elements-which-are-generated-or-created-separately) @@ -32,6 +32,7 @@ - [Diagrams describing provisioning sequence](#diagrams-describing-provisioning-sequence) - [RegistryServer spec](#registryserver-spec) - [Table of Contents in README and DEVELOPMENT](#table-of-contents-in-readme-and-development) +- [Edit, build, and deploy the Read the Docs site](#build-edit-and-deploy-the-read-the-docs-site) ## Setup @@ -42,14 +43,14 @@ 2. Use `make push-images` to push Docker container images to a Docker image registry. The default is to push to a local [Docker registry](https://docs.docker.com/registry/deploying/). 
Some other registry can be configured by setting the variables described in - in the [test-config.sh](test/test-config.sh) file, see the [configuration options](#configuration-options) + in the [test-config.sh](/test/test-config.sh) file, see the [configuration options](autotest.md#configuration-options) section below. Alternatively, the registry can also be set with a make variable: `make push-images REGISTRY_NAME=my-registry:5000` -See the [Makefile](Makefile) for additional make targets and possible make variables. +See the [Makefile](/Makefile) for additional make targets and possible make variables. The source code gets developed and tested using the version of Go that -is set with `GO_VERSION` in the [Dockerfile](Dockerfile). Some other +is set with `GO_VERSION` in the [Dockerfile](/Dockerfile). Some other version may or may not work. In particular, `test_fmt` and `test_vendor` are known to be sensitive to the version of Go. @@ -174,7 +175,7 @@ Network ports are opened as configured in manifest files: - registry endpoint: typical port value 10000, used for PMEM-CSI internal communication - controller endpoint: typical port value 10001, used for serving CSI API -- webhook endpoint: disabled by default, port chosen when [enabling the scheduler extensions](./README.md#enable-scheduler-extensions) +- webhook endpoint: disabled by default, port chosen when [enabling the scheduler extensions](../README.md#enable-scheduler-extensions) ### Local sockets @@ -250,7 +251,7 @@ The klog.Info statements are used via the verbosity checker using the following There are also messages using klog.Warning, klog.Error and klog.Fatal, and their formatted counterparts. -## Notes about switching device mode +## Switching device mode If device mode is switched between LVM and direct(aka ndctl), please keep in mind that PMEM-CSI driver does not clean up or reclaim namespaces, @@ -264,7 +265,7 @@ will create trouble in another device mode. - examine LV physical volumes state on a node: `pvs` - delete LV groups before deleting namespaces to avoid orphaned volume groups: `vgremove VGNAME` -NOTE: The next **WILL DELETE ALL NAMESPACES** so be careful! +NOTE: The following **WILL DELETE ALL NAMESPACES** so be careful! - Delete namespaces on a node using CLI: `ndctl destroy-namespace all --force` @@ -279,7 +280,7 @@ those (LVM device mode does honor "foreign" namespaces and leaves those alone) if you have enough space, or you can choose to delete those using `ndctl` on node. -## Notes about accessing system directories in a container +## Accessing system directories in a container The PMEM-CSI driver will run as container, but it needs access to system directories /sys and /dev. Two related potential problems have @@ -345,7 +346,8 @@ $ git clone https://github.com/golang/protobuf.git && cd protobuf $ make # installs needed binary in $GOPATH/bin/protoc-gen-go ``` -- generate by running in ~/go/src/github.com/intel/pmem-csi/pkg/pmem-registry: +- generate by running in \~/go/src/github.com/intel/pmem-csi/pkg/pmem-registry: + ```sh protoc --plugin=protoc-gen-go=$GOPATH/bin/protoc-gen-go --go_out=plugins=grpc:./ pmem-registry.proto ``` @@ -364,3 +366,117 @@ Note that pandoc is known to produce incorrect TOC entries if headers contain sp means TOC generation will be more reliable if we avoid non-letter-or-number characters in the headers. - Another method is to use emacs command markdown-toc-generate-toc and manually check and edit the generated part: we do not show generated 3rd-level headings in README.md. 
+ +## Build, edit, and deploy the Read the Docs site + +The PMEM-CSI documentation is available as in-repo READMEs and as a GitHub\* +hosted [website](https://intel.github.io/pmem-csi). The website is created +using the [Sphinx](https://www.sphinx-doc.org/) documentation generator and +the well-known [Read the Docs](https://sphinx-rtd-theme.readthedocs.io/) +theme. + +### Build + +Building the documentation requires Python 3.x and venv. + +```bash +make vhtml +``` + +### Edit + +Sphinx uses [reStructuredText](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html) (reST) as the primary document source type but can be +extended to use Markdown by adding the ``recommonmark`` and +``sphinx_markdown_tables`` extensions (see [conf.json](/conf.json)). + +Change the navigation tree or add documents by updating the ``toctree``. The +main ``toctree`` is in ``index.rst``: + +``` rst +.. toctree:: + :maxdepth: 2 + + README.md + docs/design.md + docs/install.md + docs/DEVELOPMENT.md + docs/autotest.md + examples/readme.rst + Project GitHub repository +``` + +reST files, Markdown files, and URLs can be added to a ``toctree``. The +``:maxdepth:`` argument dictates the number of header levels that will be +displayed on that page. This website replaces the ``index.html`` output of +this project with a redirect to ``README.html`` (the conversion of the top +level README) to closer match the in-repo documentation. + +Any reST or Markdown file not referenced by a ``toctree`` will generate a +warning in the build. This document has a ``toctree`` in: + +1. ``index.rst`` +2. ``examples/readme.rst`` + +NOTE: Though GitHub can parse reST files, the ``toctree`` directive is Sphinx +specific, so it is not understood by GitHub. ``examples/readme.rst`` is a good +example. Adding the ``:hidden:`` argument to the ``toctree`` directive means +that the ``toctree`` is not displayed in the Sphinx built version of the page. + +### Custom link handling + +This project has some custom capabilities added to the [conf.py](/conf.py) to +fix or improve how Sphinx generates the HTML site. + +1. Markdown files: Converts references to Markdown files that include anchors. + ``` md + [configuration options](autotest.md#configuration-options) + ``` +2. reST files: Fixes explicit links to Markdown files. + ``` rst + `Google Cloud Engine `__ + ``` +3. Markdown files: Fixes references to reST files. + ``` md + [Application examples](examples/readme.rst) + ``` +4. Markdown files: Fixes links to files and directories within the GitHub repo. + ``` md + [Makefile](/Makefile) + [deploy/kustomize](/deploy/kustomize) + ``` + Links to files can be fixed one of two ways, which can be set in the + [conf.py](/conf.py). + + ``` python + baseBranch = "devel" + useGitHubURL = True + commitSHA = getenv('GITHUB_SHA') + githubBaseURL = "https://github.com/intelkevinputnam/pmem-csi/" + ``` + + If ``useGitHubURL`` is set to True, it will try to create links based on + your ``githubBaseURL`` and the SHA for the commit to the GitHub repo + determined by the GitHub workflow on merge). If there is no SHA available, + it will use the value of ``baseBranch``. + + If ``useGitHubURL`` is set to False, it will copy the files to the HTML + output directory and provide links to that location. + + NOTE: Links to files and directories should use absolute paths relative to + the repo (see Makefile and deploy/kustomize above). This will work both for + the Sphinx build and when viewing in the GitHub repo. 
+ + Links to directories are always converted to links to the GitHub repository. + +### Deploying with GitHub actions + +The publish [workflow](/.github/workflows/publish.yml) is run each time a commit is made to the designated branch and pushes the rendered HTML to the gh-pages branch. Other rules can be created for other branches. + +``` yaml +on: + push: + branches: + - devel +``` + +NOTE: Create a secret called ``ACCESS_TOKEN`` in repo>settings>secrets with a [token](https://help.github.com/en/articles/creating-a-personal-access-token-for-the-command-line) generated by a user with write privileges to enable the automated push to the gh-pages branch. \ No newline at end of file diff --git a/docs/autotest.md b/docs/autotest.md new file mode 100644 index 0000000000..e75097c562 --- /dev/null +++ b/docs/autotest.md @@ -0,0 +1,156 @@ +# Automated testing + +- [Automated testing](#automated-testing) + - [Unit testing and code quality](#unit-testing-and-code-quality) + - [QEMU and Kubernetes](#qemu-and-kubernetes) + - [Starting and stopping a test cluster](#starting-and-stopping-a-test-cluster) + - [Running commands on test cluster nodes over ssh](#running-commands-on-test-cluster-nodes-over-ssh) + - [Configuration options](#configuration-options) + - [Running E2E tests](#running-e2e-tests) + +## Unit testing and code quality + +Use the `make test` command. + +## QEMU and Kubernetes + +E2E testing relies on a cluster running inside multiple QEMU virtual +machines deployed by [GoVM](https://github.com/govm-project/govm). The +same cluster can also be used interactively when real hardware is not +available. + +E2E testing is known to work on a Linux development host system. The user +must be allowed to use Docker. + +KVM must be enabled. Usually this is the case when `/dev/kvm` exists. +The current user does not need the privileges to use KVM and QEMU +doesn't have to be installed because GoVM will run QEMU inside a +container with root privileges. + +Note that cloud providers often don't offer KVM support on their +regular machines. Search for "nested virtualization" for your provider +to determine whether and how it supports KVM. + +Nested virtualization is also needed when using Kata Containers inside +the cluster. On Intel-based machines it can be enabled by loading the +`kvm_intel` module with `nested=1` (see +https://wiki.archlinux.org/index.php/KVM#Nested_virtualization). At +this time, Kata Containers up to and including 1.9.1 is [not +compatible with +PMEM-CSI](https://github.com/intel/pmem-csi/issues/303) because +volumes are not passed in as PMEM, but Kata Containers [can be +installed](https://github.com/kata-containers/packaging/tree/master/kata-deploy#kubernetes-quick-start) +and used for applications that are not using PMEM. + +The `clear-cloud` image is downloaded automatically. By default, +four different virtual machines are prepared. Each image is pre-configured +with its own hostname and with network. + +The images will contain the latest +[Clear Linux OS](https://clearlinux.org/) and have the Kubernetes +version supported by Clear Linux installed. + +PMEM-CSI images must have been created and published in some Docker +registry, as described earlier in [build PMEM-CSI](DEVELOPMENT.md#build-pmem-csi). +In addition, that registry must be accessible from inside the +cluster. That works for the default (a local registry in the build +host) but may require setting additional [configuration +options](#configuration-options) for other scenarios. 
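+
+Before bringing up the test cluster, it can help to sanity-check the KVM
+prerequisites described above. The following is only an illustrative check on
+an Intel-based Linux host; the paths are standard kernel/KVM locations and are
+not specific to PMEM-CSI:
+
+``` sh
+# KVM is usually available when this device node exists:
+ls -l /dev/kvm
+
+# Check whether nested virtualization is enabled (prints "Y" or "1" if it is):
+cat /sys/module/kvm_intel/parameters/nested
+
+# Enable nested virtualization persistently; takes effect after the kvm_intel
+# module is reloaded or the host is rebooted:
+echo "options kvm_intel nested=1" | sudo tee /etc/modprobe.d/kvm-nested.conf
+```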
+ +## Starting and stopping a test cluster + +`make start` will bring up a Kubernetes test cluster inside four QEMU +virtual machines. +The first node is the Kubernetes master without +persistent memory. +The other three nodes are worker nodes with one emulated 32GB NVDIMM each. +After the cluster has been formed, `make start` adds `storage=pmem` label +to the worker nodes and deploys the PMEM-CSI driver. +Once `make start` completes, the cluster is ready for interactive use via +`kubectl` inside the virtual machine. Alternatively, you can also +set `KUBECONFIG` as shown at the end of the `make start` output +and use `kubectl` binary on the host running VMs. + +Use `make stop` to stop and remove the virtual machines. + +`make restart` can be used to cleanly reboot all virtual +machines. This is useful during development after a `make push-images` +to ensure that the cluster runs those rebuilt images. + +## Running commands on test cluster nodes over ssh + +`make start` generates ssh wrapper scripts `_work/pmem-govm/ssh.N` for each +test cluster node which are handy for running a single command or to +start an interactive shell. Examples: + +`_work/pmem-govm/ssh.0 kubectl get pods` runs a kubectl command on +the master node. + +`_work/pmem-govm/ssh.1` starts a shell on the first worker node. + +## Deploying PMEM-CSI on a test cluster + +After `make start`, PMEM-CSI is *not* installed yet. Either install +manually as [described for a normal +cluster](#run-pmem-csi-on-kubernetes) or use the +[setup-deployment.sh](/test/setup-deployment.sh) script. + +## Configuration options + +Several aspects of the cluster and build setup can be configured by overriding +the settings in the [test-config.sh](/test/test-config.sh) file. See +that file for a description of all options. Options can be set as +environment variables of `make start` on a case-by-case basis or +permanently by creating a file like `test/test-config.d/my-config.sh`. + +Multiple different clusters can be brought up in parallel by changing +the default `pmem-govm` cluster name via the `CLUSTER` env variable. + +For example, this invocation sets up a cluster using the non-default +Fedora distro: + +``` sh +TEST_DISTRO=fedora CLUSTER=fedora-govm make start +``` + +See additional details in [test/test-config.d](/test/test-config.d). + +## Running E2E tests + +`make test_e2e` will run [csi-test +sanity](https://github.com/kubernetes-csi/csi-test/tree/master/pkg/sanity) +tests and some [Kubernetes storage +tests](https://github.com/kubernetes/kubernetes/tree/master/test/e2e/storage/testsuites) +against the PMEM-CSI driver. + +When [ginkgo](https://onsi.github.io/ginkgo/) is installed, then it +can be used to run individual tests and to control additional aspects +of the test run. For example, to run just the E2E provisioning test +(create PVC, write data in one pod, read it in another) in verbose mode: + +``` sh +$ KUBECONFIG=$(pwd)/_work/pmem-govm/kube.config REPO_ROOT=$(pwd) ginkgo -v -focus=pmem-csi.*should.provision.storage.with.defaults ./test/e2e/ +Nov 26 11:21:28.805: INFO: The --provider flag is not set. Treating as a conformance test. Some tests may not be run. 
+Running Suite: PMEM E2E suite +============================= +Random Seed: 1543227683 - Will randomize all specs +Will run 1 of 61 specs + +Nov 26 11:21:28.812: INFO: checking config +Nov 26 11:21:28.812: INFO: >>> kubeConfig: /nvme/gopath/src/github.com/intel/pmem-csi/_work/pmem-govm/kube.config +Nov 26 11:21:28.817: INFO: Waiting up to 30m0s for all (but 0) nodes to be schedulable +... +Ran 1 of 61 Specs in 58.465 seconds +SUCCESS! -- 1 Passed | 0 Failed | 0 Pending | 60 Skipped +PASS + +Ginkgo ran 1 suite in 1m3.850672246s +Test Suite Passed +``` + +It is also possible to run just the sanity tests until one of them fails: + +``` sh +$ REPO_ROOT=`pwd` ginkgo '-focus=sanity' -failFast ./test/e2e/ +... +``` \ No newline at end of file diff --git a/docs/design.md b/docs/design.md new file mode 100644 index 0000000000..25225b5263 --- /dev/null +++ b/docs/design.md @@ -0,0 +1,391 @@ +# Design and architecture + +- [Design](#design) + - [Architecture and Operation](#architecture-and-operation) + - [LVM device mode](#lvm-device-mode) + - [Direct device mode](#direct-device-mode) + - [Driver modes](#driver-modes) + - [Driver Components](#driver-components) + - [Communication between components](#communication-between-components) + - [Security](#security) + - [Volume Persistency](#volume-persistency) + - [Capacity-aware pod scheduling](#capacity-aware-pod-scheduling) + +## Architecture and Operation + +The PMEM-CSI driver can operate in two different device modes: *LVM* and +*direct*. This table contains an overview and comparison of those modes. +There is a more detailed explanation in the following paragraphs. + +| |`LVM` |`direct` | +|:-- |:-- |:-- | +|Main advantage |avoids free space fragmentation1 |simpler, somewhat faster, but free space may get fragmented1 | +|What is served |LVM logical volume |pmem block device | +|Region affinity2 |yes: one LVM volume group is created per region, and a volume has to be in one volume group |yes: namespace can belong to one region only | +|Startup |two extra stages: pmem-ns-init (creates namespaces), vgm (creates volume groups) |no extra steps at startup | +|Namespace modes |`fsdax` mode3 namespaces pre-created as pools |namespace in `fsdax` mode created directly, no need to pre-create pools | +|Limiting space usage | can leave part of device unused during pools creation |no limits, creates namespaces on device until runs out of space | +| *Name* field in namespace | *Name* gets set to 'pmem-csi' to achieve own vs. foreign marking | *Name* gets set to VolumeID, without attempting own vs. foreign marking | +|Minimum volume size| 4 MB | 1 GB (see also alignment adjustment below) | +|Alignment requirements |LVM creation aligns size up to next 4MB boundary |driver aligns size up to next alignment boundary. The default alignment step is 1 GB. Device(s) in interleaved mode will require larger minimum as size has to be at least one alignment step. The possibly bigger alignment step is calculated as interleave-set-size multiplied by 1 GB | + +1 **Free space fragmentation** is a problem when there appears to +be enough free capacity for a new namespace, but there isn't a contiguous +region big enough to allocate it. The PMEM-CSI driver is only capable of +allocating continguous memory to a namespace and cannot de-fragment or combine +smaller blocks. For example, this could happen when you create a 63 GB +namespace, followed by a 1 GB namespace, and then delete the 63 GB namespace. 
+Even though there is 127 GB available, the driver cannot create a namespace
+larger than 64 GB.
+
+```
+---------------------------------------------------------------------
+| 63 GB free | 1GB used | 64 GB free |
+---------------------------------------------------------------------
+```
+
+2 **Region affinity** means that all parts of a provisioned file
+system are physically located on device(s) that belong to the same PMEM region.
+This is important on multi-socket systems where media access time may vary
+based on where the storage device(s) are physically attached.
+
+3 **fsdax mode** is required for NVDIMM
+namespaces. See [Persistent Memory
+Programming](https://pmem.io/ndctl/ndctl-create-namespace.html) for
+details. `devdax` mode is not supported. Though a
+raw block volume would be useful when a filesystem isn't needed, Kubernetes
+cannot handle [binding a character device to a loop device](https://github.com/kubernetes/kubernetes/blob/7c87b5fb55ca096c007c8739d4657a5a4e29fb09/pkg/volume/util/util.go#L531-L534).
+
+## LVM device mode
+
+In Logical Volume Management (LVM) mode, the PMEM-CSI driver uses LVM to
+avoid the risk of free space fragmentation. The LVM logical volumes are
+served to satisfy API requests. There is one volume group created per
+region, ensuring the region-affinity of served volumes.
+
+![devicemode-lvm diagram](/docs/images/devicemodes/pmem-csi-lvm.png)
+
+The driver consists of three separate binaries that form two
+initialization stages and a third API-serving stage.
+
+During startup, the driver scans persistent memory for regions and
+namespaces, and tries to create more namespaces using all or part
+(selectable via option) of the remaining available space. This first
+stage is performed by a separate entity `pmem-ns-init`.
+
+The second stage of initialization arranges the physical volumes provided
+by namespaces into LVM volume groups. This is performed by a separate
+binary `pmem-vgm`.
+
+After the two initialization stages, the third binary `pmem-csi-driver`
+starts serving CSI API requests.
+
+### Namespace modes in LVM device mode
+
+The PMEM-CSI driver pre-creates namespaces in `fsdax` mode, which form
+the corresponding LVM volume group. The amount of space to be
+used is determined by the option `-useforfsdax` given to `pmem-ns-init`.
+This option takes an integer that specifies the limit as a percentage.
+The default value is `useforfsdax=100`.
+
+### Using limited amount of total space in LVM device mode
+
+The PMEM-CSI driver can leave space on devices for others, and
+recognize "own" namespaces. Leaving space for others can be achieved
+by specifying a value lower than 100 for the `-useforfsdax` option.
+The distinction "own" vs. "foreign" is
+implemented by setting the _Name_ field in the namespace to the static
+string "pmem-csi" during namespace creation. When adding physical
+volumes to volume groups, only those physical volumes that are based on
+namespaces with the name "pmem-csi" are considered.
+
+## Direct device mode
+
+The following diagram illustrates the operation in direct device mode:
+![devicemode-direct diagram](/docs/images/devicemodes/pmem-csi-direct.png)
+
+In direct device mode, the PMEM-CSI driver allocates namespaces directly
+from the storage device. This creates a risk of device space fragmentation,
+but reduces complexity and run-time overhead by avoiding an additional
+device mapping layer. Direct mode also ensures the region-affinity of
+served volumes, because a provisioned volume can belong to only one
+region.
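+
+As a rough illustration (all values below are invented), a volume created in
+direct device mode is visible on the node as a plain `fsdax` namespace whose
+*Name* field carries the volume ID:
+
+``` sh
+# ndctl list -N
+[
+  {
+    "dev":"namespace0.1",
+    "mode":"fsdax",
+    "size":2147483648,
+    "name":"eac9e039-551d-11e9-a584-928299ac4b17",
+    "blockdev":"pmem0.1"
+  }
+]
+```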
+
+In direct device mode, the two preparation stages used in LVM device
+mode are not needed.
+
+### Namespace modes in direct device mode
+
+The PMEM-CSI driver creates a namespace directly in the mode requested
+by the volume creation request, thus bypassing the complexity of
+pre-allocated pools that are used in LVM device mode.
+
+### Using limited amount of total space in direct device mode
+
+In direct device mode, the driver does not attempt to limit space
+use. It also does not mark "own" namespaces. The _Name_ field of a
+namespace gets the value of the VolumeID.
+
+## Driver modes
+
+The PMEM-CSI driver supports running in different modes, which can be
+controlled by passing one of the values below to the driver's
+'_-mode_' command line option. In each mode, it starts a different set
+of open source Remote Procedure Call (gRPC)
+[servers](#driver-components) on the given driver endpoint(s).
+
+* **_Controller_** should run as a single instance at cluster level. When the
+  driver is running in _Controller_ mode, it forwards the pmem volume
+  create/delete requests to the registered node controller servers
+  running on the worker nodes. In this mode, the driver starts the
+  following gRPC servers:
+
+  * [IdentityServer](#identity-server)
+  * [NodeRegistryServer](#node-registry-server)
+  * [MasterControllerServer](#master-controller-server)
+
+* One **_Node_** instance should run on each
+  worker node that has persistent memory devices installed. When the
+  driver starts in this mode, it registers with the _Controller_
+  driver running on the given _-registryEndpoint_. In this mode, the
+  driver starts the following servers:
+
+  * [IdentityServer](#identity-server)
+  * [NodeControllerServer](#node-controller-server)
+  * [NodeServer](#node-server)
+
+## Driver Components
+
+### Identity Server
+
+This gRPC server operates on a given endpoint in all driver modes and
+implements the CSI [Identity
+interface](https://github.com/container-storage-interface/spec/blob/master/spec.md#identity-service-rpc).
+
+### Node Registry Server
+
+When the PMEM-CSI driver runs in _Controller_ mode, it starts a gRPC
+server on a given endpoint (_-registryEndpoint_) and serves the
+[RegistryServer](/pkg/pmem-registry/pmem-registry.proto) interface. The
+driver(s) running in _Node_ mode can register themselves with
+node-specific information such as the node ID, the
+[NodeControllerServer](#node-controller-server) endpoint, and their
+available persistent memory capacity.
+
+### Master Controller Server
+
+This gRPC server is started by the PMEM-CSI driver running in
+_Controller_ mode and serves the
+[Controller](https://github.com/container-storage-interface/spec/blob/master/spec.md#controller-service-rpc)
+interface defined by the CSI specification. The server responds to
+CreateVolume(), DeleteVolume(), ControllerPublishVolume(),
+ControllerUnpublishVolume(), and ListVolumes() calls coming from the
+external-provisioner and external-attacher sidecars. It
+forwards the publish and unpublish volume requests to the appropriate
+[Node controller server](#node-controller-server) running on a worker
+node that was registered with the driver.
+ +### Node Controller Server + +This gRPC server is started by the PMEM-CSI driver running in _Node_ +mode and implements the +[ControllerPublishVolume](https://github.com/container-storage-interface/spec/blob/master/spec.md#controllerpublishvolume) +and +[ControllerUnpublishVolume](https://github.com/container-storage-interface/spec/blob/master/spec.md#controllerunpublishvolume) +methods of the [Controller +service](https://github.com/container-storage-interface/spec/blob/master/spec.md#controller-service-rpc) +interface defined by the CSI specification. It serves the +ControllerPublishVolume() and ControllerUnpublish() requests coming +from the [Master controller server](#master-controller-server) and +creates/deletes persistent memory devices. + +### Node Server + +This gRPC server is started by the driver running in _Node_ mode and +implements the [Node +service](https://github.com/container-storage-interface/spec/blob/master/spec.md#node-service-rpc) +interface defined in the CSI specification. It serves the +NodeStageVolume(), NodeUnstageVolume(), NodePublishVolume(), and +NodeUnpublishVolume() requests coming from the Container Orchestrator +(CO). + +## Communication between components + +The following diagram illustrates the communication channels between driver components: +![communication diagram](/docs/images/communication/pmem-csi-communication-diagram.png) + +## Security + +All PMEM-CSI specific communication [shown in above +section](#communication-between-components) between Master +Controller([RegistryServer](#node-registry-server), +[MasterControllerServer](#master-controller-server)) and +NodeControllers([NodeControllerServer](#node-controller-server)) is +protected by mutual TLS. Both client and server must identify +themselves and the certificate they present must be trusted. The +common name in each certificate is used to identify the different +components. The following common names have a special meaning: + +- `pmem-registry` is used by the [RegistryServer](#node-registry-server). +- `pmem-node-controller` is used by [NodeControllerServers](#node-controller-server) + +The [`test/setup-ca.sh`](/test/setup-ca.sh) +script shows how to generate self-signed certificates. The test cluster is set +up using certificates created by that script, with secrets prepared by +[`test/setup-deployment.sh`](/test/setup-deployment.sh) before +deploying the driver using the provided [deployment files](/deploy/). + +Beware that these are just examples. Administrators of a cluster must +ensure that they choose key lengths and algorithms of sufficient +strength for their purposes and manage certificate distribution. + +A production deployment can improve upon that by using some other key +delivery mechanism, like for example +[Vault](https://www.vaultproject.io/). + + + +## Volume Persistency + +In a typical CSI deployment, volumes are provided by a storage backend +that is independent of a particular node. When a node goes offline, +the volume can be mounted elsewhere. But PMEM volumes are *local* to +node and thus can only be used on the node where they were +created. This means the applications using PMEM volume cannot freely +move between nodes. This limitation needs to be considered when +designing and deploying applications that are to use *local storage*. 
+ +These are the volume persistency models considered for implementation +in PMEM-CSI to serve different application use cases: + +* **Persistent volumes** +A volume gets created independently of the application, on some node +where there is enough free space. Applications using such a volume are +then forced to run on that node and cannot run when the node is +down. Data is retained until the volume gets deleted. + +* **Ephemeral volumes** +Each time an application starts to run on a node, a new volume is +created for it on that node. When the application stops, the volume is +deleted. The volume cannot be shared with other applications. Data on +this volume is retained only while the application runs. + +* **Cache volumes** +Volumes are pre-created on a certain set of nodes, each with its own +local data. Applications are started on those nodes and then get to +use the volume on their node. Data persists across application +restarts. This is useful when the data is only cached information that +can be discarded and reconstructed at any time *and* the application +can reuse existing local data when restarting. + +Volume | Kubernetes | PMEM-CSI | Limitations +--- | --- | --- | --- +Persistent | supported | supported | topology aware scheduling1 +Ephemeral | supported2 | supported | resource constraints3 +Cache | supported | supported | topology aware scheduling1 + +1 [Topology aware +scheduling](https://github.com/kubernetes/enhancements/issues/490) +ensures that an application runs on a node where the volume was +created. For CSI-based drivers like PMEM-CSI, Kubernetes >= 1.13 is +needed. On older Kubernetes releases, pods must be scheduled manually +onto the right node(s). + +2 [CSI ephemeral volumes](https://kubernetes.io/docs/concepts/storage/volumes/#csi-ephemeral-volumes) +feature support is alpha in Kubernetes v1.15, and beta in v1.16. + +3 The upstream design for ephemeral volumes currently does +not take [resource +constraints](https://github.com/kubernetes/enhancements/pull/716#discussion_r250536632) +into account. If an application gets scheduled onto a node and then +creating the ephemeral volume on that node fails, the application on +the node cannot start until resources become available. + +See [exposing persistent and cache volumes](install.md#expose-persistent-and-cache-volumes-to-applications) for configuration information. + +## Capacity-aware pod scheduling + +PMEM-CSI implements the CSI `GetCapacity` call, but Kubernetes +currently doesn't call that and schedules pods onto nodes without +being aware of available storage capacity on the nodes. The effect is +that pods using volumes with late binding may get tentatively assigned +to a node and then get stuck because that decision is not reconsidered +when the volume cannot be created there ([a +bug](https://github.com/kubernetes/kubernetes/issues/72031)). Even if +that decision is reconsidered, the same node may get selected again +because Kubernetes does not get informed about the insufficient +storage. Pods with ephemeral inline volumes always get stuck because +the decision to use the node [is final](https://github.com/kubernetes-sigs/descheduler/issues/62). + +Work is [under +way](https://github.com/kubernetes/enhancements/pull/1353) to enhance +scheduling in Kubernetes. 
In the meantime, PMEM-CSI provides two components +that help with pod scheduling: + +### Scheduler extender + +When a pod requests the special [extended +resource](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#extended-resources) +called `pmem-csi.intel.com/scheduler`, the Kubernetes scheduler calls +a [scheduler +extender](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/scheduling/scheduler_extender.md) +provided by PMEM-CSI with a list of nodes that a pod might run +on. This extender is implemented in the master controller and thus can +connect to the controller on each of these nodes to check for +capacity. PMEM-CSI then filters out all nodes which currently do not +have enough storage left for the volumes that still need to be +created. This considers inline ephemeral volumes and all unbound +volumes, regardless whether they use late binding or immediate +binding. + +This special scheduling can be requested manually by adding this snippet +to one container in the pod spec: +``` +containers: +- name: some-container + ... + resources: + limits: + pmem-csi.intel.com/scheduler: "1" + requests: + pmem-csi.intel.com/scheduler: "1" +``` + +This scheduler extender is optional and not necessarily installed in +all clusters that have PMEM-CSI. Don't add this extended resource +unless the scheduler extender is installed, otherwise the pod won't +start! + +See our [implementation](http://github.com/intel/pmem-csi/tree/devel/pkg/scheduler) of a scheduler extender. + +### Pod admission webhook + +Having to add `pmem-csi.intel.com/scheduler` manually is not +user-friendly. To simplify this, PMEM-CSI provides a [mutating +admission +webhook](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/) +which intercepts the creation of all pods. If that pod uses inline +ephemeral volumes or volumes with late binding that are provided by +PMEM-CSI, the webhook transparently adds the extended resource +request. PMEM-CSI volumes with immediate binding are ignored because +for those the normal topology support ensures that unsuitable nodes +are filtered out. + +The webhook can only do that if the persistent volume claim (PVC) and +its storage class have been created already. This is normally not +required: it's okay to create the pod first, then later add the +PVC. The pod simply won't start in the meantime. + +The webhook deals with this uncertainty by allowing the creation of +the pod without adding the extended resource when it lacks the +necessary information. The alternative would be to reject the pod, but +that would be a change of behavior of the cluster that may affect also pods +that don't use PMEM-CSI at all. + +Users must take care to create PVCs first, then the pods if they want +to use the webhook. In practice, that is often already done because it +is more natural, so it is not a big limitation. 
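+
+For illustration, a pod that would be intercepted by this webhook is one that
+declares an inline ephemeral PMEM-CSI volume. The sketch below assumes the
+driver is registered under the `pmem-csi.intel.com` name used elsewhere in
+this document; the container image and names are placeholders, and the
+authoritative example is
+[pmem-app-ephemeral.yaml](/deploy/kubernetes-1.15/pmem-app-ephemeral.yaml):
+
+``` yaml
+kind: Pod
+apiVersion: v1
+metadata:
+  name: my-ephemeral-app
+spec:
+  containers:
+  - name: app
+    image: busybox
+    command: ["sleep", "infinity"]
+    volumeMounts:
+    - name: scratch
+      mountPath: /data
+  volumes:
+  - name: scratch
+    csi:
+      driver: pmem-csi.intel.com
+      fsType: ext4
+      volumeAttributes:
+        size: 2Gi
+```
+
+When the webhook is active, it would transparently add the
+`pmem-csi.intel.com/scheduler` extended resource request shown earlier to
+such a pod.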
\ No newline at end of file diff --git a/docs/html/index.html b/docs/html/index.html new file mode 100644 index 0000000000..5f62e3d9be --- /dev/null +++ b/docs/html/index.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/html/index2.html b/docs/html/index2.html new file mode 100644 index 0000000000..c19cf06e84 --- /dev/null +++ b/docs/html/index2.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 0000000000..783ac37e4d --- /dev/null +++ b/docs/install.md @@ -0,0 +1,655 @@ +# Instructions for Admins and Users + +- [Prerequisites](#prerequisites) + - [Software required](#software-required) + - [Hardware required](#hardware-required) + - [Persistent memory pre-provisioning](#persistent-memory-pre-provisioning) +- [Installation and setup](#installation-and-setup) + - [Get source code](#get-source-code) + - [Run PMEM-CSI on Kubernetes](#run-pmem-csi-on-kubernetes) + - [Expose persistent and cache volumes to applications](#expose-persistent-and-cache-volumes-to-applications) + - [Raw block volumes](#raw-block-volumes) + - [Enable scheduler extensions](#enable-scheduler-extensions) +- [Filing issues and contributing](#filing-issues-and-contributing) + +## Prerequisites + +### Software required + +The recommended mimimum Linux kernel version for running the PMEM-CSI driver is 4.15. See [Persistent Memory Programming](https://pmem.io/2018/05/15/using_persistent_memory_devices_with_the_linux_device_mapper.html) for more details about supported kernel versions. + +### Hardware required + +Persistent memory device(s) are required for operation. However, some +development and testing can be done using QEMU-emulated persistent +memory devices. See the ["QEMU and Kubernetes"](autotest.md#qemu-and-kubernetes) +section for the commands that create such a virtual test cluster. + +### Persistent memory pre-provisioning + +The PMEM-CSI driver needs pre-provisioned regions on the NVDIMM +device(s). The PMEM-CSI driver itself intentionally leaves that to the +administrator who then can decide how much and how PMEM is to be used +for PMEM-CSI. + +Beware that the PMEM-CSI driver will run without errors on a node +where PMEM was not prepared for it. It will then report zero local +storage for that node, something that currently is only visible in the +log files. + +When running the Kubernetes cluster and PMEM-CSI on bare metal, +the [ipmctl](https://github.com/intel/ipmctl) utility can be used to create regions. +App Direct Mode has two configuration options - interleaved or non-interleaved. +One region per each NVDIMM is created in non-interleaved configuration. +In such a configuration, a PMEM-CSI volume cannot be larger than one NVDIMM. + +Example of creating regions without interleaving, using all NVDIMMs: +```sh +# ipmctl create -goal PersistentMemoryType=AppDirectNotInterleaved +``` + +Alternatively, multiple NVDIMMs can be combined to form an interleaved set. +This causes the data to be striped over multiple NVDIMM devices +for improved read/write performance and allowing one region (also, PMEM-CSI volume) +to be larger than single NVDIMM. + +Example of creating regions in interleaved mode, using all NVDIMMs: +```sh +# ipmctl create -goal PersistentMemoryType=AppDirect +``` + +When running inside virtual machines, each virtual machine typically +already gets access to one region and `ipmctl` is not needed inside +the virtual machine. 
Instead, that region must be made available for +use with PMEM-CSI because when the virtual machine comes up for the +first time, the entire region is already allocated for use as a single +block device: +``` sh +# ndctl list -RN +{ + "regions":[ + { + "dev":"region0", + "size":34357641216, + "available_size":0, + "max_available_extent":0, + "type":"pmem", + "persistence_domain":"unknown", + "namespaces":[ + { + "dev":"namespace0.0", + "mode":"raw", + "size":34357641216, + "sector_size":512, + "blockdev":"pmem0" + } + ] + } + ] +} +# ls -l /dev/pmem* +brw-rw---- 1 root disk 259, 0 Jun 4 16:41 /dev/pmem0 +``` + +Labels must be initialized in such a region, which must be performed +once after the first boot: +``` sh +# ndctl disable-region region0 +disabled 1 region +# ndctl init-labels nmem0 +initialized 1 nmem +# ndctl enable-region region0 +enabled 1 region +# ndctl list -RN +[ + { + "dev":"region0", + "size":34357641216, + "available_size":34357641216, + "max_available_extent":34357641216, + "type":"pmem", + "iset_id":10248187106440278, + "persistence_domain":"unknown" + } +] +# ls -l /dev/pmem* +ls: cannot access '/dev/pmem*': No such file or directory +``` + +## Installation and setup + +### Get source code + +PMEM-CSI uses Go modules and thus can be checked out and (if that should be desired) +built anywhere in the filesystem. Pre-built container images are available and thus +users don't need to build from source, but they will still need some additional files. +To get the source code, use: + +``` +git clone https://github.com/intel/pmem-csi +``` + +### Run PMEM-CSI on Kubernetes + +This section assumes that a Kubernetes cluster is already available +with at least one node that has persistent memory device(s). For development or +testing, it is also possible to use a cluster that runs on QEMU virtual +machines, see the ["QEMU and Kubernetes"](autotest.md#qemu-and-kubernetes). + +- **Make sure that the alpha feature gates CSINodeInfo and CSIDriverRegistry are enabled** + +The method to configure alpha feature gates may vary, depending on the Kubernetes deployment. +It may not be necessary anymore when the feature has reached beta state, which depends +on the Kubernetes version. + +- **Label the cluster nodes that provide persistent memory device(s)** + +```sh + $ kubectl label node storage=pmem +``` + +- **Set up certificates** + +Certificates are required as explained in [Security](design.md#security). +If you are not using the test cluster described in +[Starting and stopping a test cluster](autotest.md#starting-and-stopping-a-test-cluster) +where certificates are created automatically, you must set up certificates manually. +This can be done by running the `./test/setup-ca-kubernetes.sh` script for your cluster. +This script requires "cfssl" tools which can be downloaded. +These are the steps for manual set-up of certificates: + +- Download cfssl tools + +```sh + $ curl -L https://pkg.cfssl.org/R1.2/cfssl_linux-amd64 -o _work/bin/cfssl --create-dirs + $ curl -L https://pkg.cfssl.org/R1.2/cfssljson_linux-amd64 -o _work/bin/cfssljson --create-dirs + $ chmod a+x _work/bin/cfssl _work/bin/cfssljson +``` + +- Run certificates set-up script + +```sh + $ KUBCONFIG="<> PATH="$PATH:$PWD/_work/bin" ./test/setup-ca-kubernetes.sh +``` + +- **Deploy the driver to Kubernetes** + +The `deploy/kubernetes-` directory contains +`pmem-csi*.yaml` files which can be used to deploy the driver on that +Kubernetes version. 
The files in the directory with the highest +Kubernetes version might also work for more recent Kubernetes +releases. All of these deployments use images published by Intel on +[Docker Hub](https://hub.docker.com/u/intel). + +For each Kubernetes version, four different deployment variants are provided: + + - `direct` or `lvm`: one uses direct device mode, the other LVM device mode. + - `testing`: the variants with `testing` in the name enable debugging + features and shouldn't be used in production. + +For example, to deploy for production with LVM device mode onto Kubernetes 1.14, use: + +```sh + $ kubectl create -f deploy/kubernetes-1.14/pmem-csi-lvm.yaml +``` + +The PMEM-CSI [scheduler extender](design.md#scheduler-extender) and +[webhook](design.md#pod-admission-webhook) are not enabled in this basic +installation. See [below](#enable-scheduler-extensions) for +instructions about that. + +These variants were generated with +[`kustomize`](https://github.com/kubernetes-sigs/kustomize). +`kubectl` >= 1.14 includes some support for that. The sub-directories +of [deploy/kustomize](/deploy/kustomize)`-` can be used as bases +for `kubectl kustomize`. For example: + + - Change namespace: + ``` + $ mkdir -p my-pmem-csi-deployment + $ cat >my-pmem-csi-deployment/kustomization.yaml <my-pmem-csi-deployment/kustomization.yaml <my-pmem-csi-deployment/lvm-parameters-patch.yaml <,storage=pmem +``` + +If **storage=pmem** is missing, label manually as described above. If +**pmem-csi.intel.com/node** is missing, then double-check that the +alpha feature gates are enabled, that the CSI driver is running on the node, +and that the driver's log output doesn't contain errors. + +- **Define two storage classes using the driver** + +```sh + $ kubectl create -f deploy/kubernetes-/pmem-storageclass-ext4.yaml + $ kubectl create -f deploy/kubernetes-/pmem-storageclass-xfs.yaml +``` + +- **Provision two pmem-csi volumes** + +```sh + $ kubectl create -f deploy/kubernetes-/pmem-pvc.yaml +``` + +- **Verify two Persistent Volume Claims have 'Bound' status** + +```sh + $ kubectl get pvc + NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE + pmem-csi-pvc-ext4 Bound pvc-f70f7b36-6b36-11e9-bf09-deadbeef0100 4Gi RWO pmem-csi-sc-ext4 16s + pmem-csi-pvc-xfs Bound pvc-f7101fd2-6b36-11e9-bf09-deadbeef0100 4Gi RWO pmem-csi-sc-xfs 16s +``` + +- **Start two applications requesting one provisioned volume each** + +```sh + $ kubectl create -f deploy/kubernetes-/pmem-app.yaml +``` + +These applications use **storage: pmem** in the nodeSelector +list to ensure scheduling to a node supporting pmem device, and each requests a mount of a volume, +one with ext4-format and another with xfs-format file system. 
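+
+For reference, the relevant parts of such an application pod look roughly like
+the sketch below (condensed; the container image and names are placeholders,
+and the authoritative manifest is the `pmem-app.yaml` file in the deployment
+directory):
+
+``` yaml
+kind: Pod
+apiVersion: v1
+metadata:
+  name: my-csi-app-1
+spec:
+  nodeSelector:
+    storage: pmem
+  containers:
+  - name: my-frontend
+    image: busybox
+    command: ["sleep", "infinity"]
+    volumeMounts:
+    - name: my-csi-volume
+      mountPath: /data
+  volumes:
+  - name: my-csi-volume
+    persistentVolumeClaim:
+      claimName: pmem-csi-pvc-ext4
+```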
+ +- **Verify two application pods reach 'Running' status** + +```sh + $ kubectl get po my-csi-app-1 my-csi-app-2 + NAME READY STATUS RESTARTS AGE + my-csi-app-1 1/1 Running 0 6m5s + NAME READY STATUS RESTARTS AGE + my-csi-app-2 1/1 Running 0 6m1s +``` + +- **Check that applications have a pmem volume mounted with added dax option** + +```sh + $ kubectl exec my-csi-app-1 -- df /data + Filesystem 1K-blocks Used Available Use% Mounted on + /dev/ndbus0region0fsdax/5ccaa889-551d-11e9-a584-928299ac4b17 + 4062912 16376 3820440 0% /data + $ kubectl exec my-csi-app-2 -- df /data + Filesystem 1K-blocks Used Available Use% Mounted on + /dev/ndbus0region0fsdax/5cc9b19e-551d-11e9-a584-928299ac4b17 + 4184064 37264 4146800 1% /data + + $ kubectl exec my-csi-app-1 -- mount |grep /data + /dev/ndbus0region0fsdax/5ccaa889-551d-11e9-a584-928299ac4b17 on /data type ext4 (rw,relatime,dax) + $ kubectl exec my-csi-app-2 -- mount |grep /data + /dev/ndbus0region0fsdax/5cc9b19e-551d-11e9-a584-928299ac4b17 on /data type xfs (rw,relatime,attr2,dax,inode64,noquota) +``` + +#### Expose persistent and cache volumes to applications + +Kubernetes cluster administrators can expose persistent and cache volumes +to applications using +[`StorageClass +Parameters`](https://kubernetes.io/docs/concepts/storage/storage-classes/#parameters). An +optional `persistencyModel` parameter differentiates how the +provisioned volume can be used: + +* no `persistencyModel` parameter or `persistencyModel: normal` in `StorageClass` + + A normal Kubernetes persistent volume. In this case + PMEM-CSI creates PMEM volume on a node and the application that + claims to use this volume is supposed to be scheduled onto this node + by Kubernetes. Choosing of node is depend on StorageClass + `volumeBindingMode`. In case of `volumeBindingMode: Immediate` + PMEM-CSI chooses a node randomly, and in case of `volumeBindingMode: + WaitForFirstConsumer` (also known as late binding) Kubernetes first chooses a node for scheduling + the application, and PMEM-CSI creates the volume on that + node. Applications which claim a normal persistent volume has to use + `ReadOnlyOnce` access mode in its `accessModes` list. This + [diagram](/docs/images/sequence/pmem-csi-persistent-sequence-diagram.png) + illustrates how a normal persistent volume gets provisioned in + Kubernetes using PMEM-CSI driver. + +* `persistencyModel: cache` + + Volumes of this type shall be used in combination with + `volumeBindingMode: Immediate`. In this case, PMEM-CSI creates a set + of PMEM volumes each volume on different node. The number of PMEM + volumes to create can be specified by `cacheSize` StorageClass + parameter. Applications which claim a `cache` volume can use + `ReadWriteMany` in its `accessModes` list. Check with provided + [cacheStorageClass](/deploy/common/pmem-storageclass-cache.yaml) + example. This + [diagram](/docs/images/sequence/pmem-csi-cache-sequence-diagram.png) + illustrates how a cache volume gets provisioned in Kubernetes using + PMEM-CSI driver. + +**NOTE**: Cache volumes are associated with a node, not a pod. Multiple +pods using the same cache volume on the same node will not get their +own instance but will end up sharing the same PMEM volume instead. +Application deployment has to consider this and use available Kubernetes +mechanisms like [node +anti-affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity). +Check with the provided +[cacheapplication](/deploy/common/pmem-app-cache.yaml) example. 
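+
+For illustration, a cache storage class combines these parameters roughly as
+shown below. This is only a sketch: the provisioner name and exact parameter
+spelling should be taken from the provided
+[cacheStorageClass](/deploy/common/pmem-storageclass-cache.yaml) example.
+
+``` yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: pmem-csi-sc-cache
+provisioner: pmem-csi.intel.com
+volumeBindingMode: Immediate
+parameters:
+  persistencyModel: cache
+  cacheSize: "2"
+```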
+ +**WARNING**: late binding (`volumeBindingMode:WaitForFirstConsume`) has some caveats: +* Pod creation may get stuck when there isn't enough capacity left for + the volumes; see the next section for details. +* A node is only chosen the first time a pod starts. After that it will always restart + on that node, because that is where the persistent volume was created. + +Volume requests embedded in Pod spec are provisioned as ephemeral volumes. The volume request could use below fields as [`volumeAttributes`](https://kubernetes.io/docs/concepts/storage/volumes/#csi): + +|key|meaning|optional|values| +|---|-------|--------|-------------| +|`size`|Size of the requested ephemeral volume as [Kubernetes memory string](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) ("1Mi" = 1024*1024 bytes, "1e3K = 1000000 bytes)|No|| +|`eraseAfter`|Clear all data after use and before
deleting the volume|Yes|`true` (default),
`false`| + +Check with provided [example application](/deploy/kubernetes-1.15/pmem-app-ephemeral.yaml) for +ephemeral volume usage. + +#### Raw block volumes + +Applications can use volumes provisioned by PMEM-CSI as [raw block +devices](https://kubernetes.io/blog/2019/03/07/raw-block-volume-support-to-beta/). Such +volumes use the same "fsdax" namespace mode as filesystem volumes +and therefore are block devices. That mode only supports dax (= +`mmap(MAP_SYNC)`) through a filesystem. Pages mapped on the raw block +device go through the Linux page cache. Applications have to format +and mount the raw block volume themselves if they want dax. The +advantage then is that they have full control over that part. + +For provisioning a PMEM volume as raw block device, one has to create a +`PersistentVolumeClaim` with `volumeMode: Block`. See example [PVC]( +/deploy/common/pmem-pvc-block-volume.yaml) and +[application](/deploy/common/pmem-app-block-volume.yaml) for usage reference. + +That example demonstrates how to handle some details: +- `mkfs.ext4` needs `-b 4096` to produce volumes that support dax; + without it, the automatic block size detection may end up choosing + an unsuitable value depending on the volume size. +- [Kubernetes bug #85624](https://github.com/kubernetes/kubernetes/issues/85624) + must be worked around to format and mount the raw block device. + +#### Enable scheduler extensions + +The PMEM-CSI scheduler extender and admission webhook are provided by +the PMEM-CSI controller. They need to be enabled during deployment via +the `--schedulerListen=[]:` parameter. The +listen address is optional and can be left out. The port is where a +HTTPS server will run. It uses the same certificates as the internal +gRPC service. When using the CA creation script described above, they +will contain alternative names for the URLs described in this section +(service names, `127.0.0.1` IP address). + +This parameter can be added to one of the existing deployment files +with `kustomize`. All of the following examples assume that the +current directory contains the `deploy` directory from the PMEM-CSI +repository. It is also possible to reference the base via a +[URL](https://github.com/kubernetes-sigs/kustomize/blob/master/examples/remoteBuild.md). + +``` sh +mkdir my-pmem-csi-deployment + +cat >my-pmem-csi-deployment/kustomization.yaml <my-pmem-csi-deployment/scheduler-patch.yaml <my-scheduler/kustomization.yaml <my-scheduler/node-port-patch.yaml </var/lib/scheduler/scheduler-policy.cfg' <:", + "filterVerb": "filter", + "prioritizeVerb": "prioritize", + "nodeCacheCapable": false, + "weight": 1, + "managedResources": + [{ + "name": "pmem-csi.intel.com/scheduler", + "ignoredByScheduler": true + }] + }] +} +EOF + +cat >kubeadm.config <= +1.15, it can also be used to let individual pods bypass the webhook by +adding that label. The CA gets configured explicitly, which is +supported for webhooks. + +``` sh +mkdir my-webhook + +cat >my-webhook/kustomization.yaml <my-webhook/webhook-patch.yaml < + +## Filing issues and contributing + +Report a bug by [filing a new issue](https://github.com/intel/pmem-csi/issues). + +Before making your first contribution, be sure to read the [development documentation](DEVELOPMENT.md) +for guidance on code quality and branches. + +Contribute by [opening a pull request](https://github.com/intel/pmem-csi/pulls). + +Learn [about pull requests](https://help.github.com/articles/using-pull-requests/). 
+ +**Reporting a Potential Security Vulnerability:** If you have discovered potential security vulnerability in PMEM-CSI, please send an e-mail to secure@intel.com. For issues related to Intel Products, please visit [Intel Security Center](https://security-center.intel.com). + +It is important to include the following details: + +- The projects and versions affected +- Detailed description of the vulnerability +- Information on known exploits + +Vulnerability information is extremely sensitive. Please encrypt all security vulnerability reports using our [PGP key](https://www.intel.com/content/www/us/en/security-center/pgp-public-key.html). + +A member of the Intel Product Security Team will review your e-mail and contact you to collaborate on resolving the issue. For more information on how Intel works to resolve security issues, see: [vulnerability handling guidelines](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). + + diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000..b6f6a3a327 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +sphinx +sphinx_rtd_theme +recommonmark +sphinx-markdown-tables \ No newline at end of file diff --git a/docs/substitutions.txt b/docs/substitutions.txt new file mode 100644 index 0000000000..3eef4529a3 --- /dev/null +++ b/docs/substitutions.txt @@ -0,0 +1 @@ +.. |PR| replace:: Project Name \ No newline at end of file diff --git a/examples/gce.md b/examples/gce.md index d26ea7da25..03cbb43f85 100644 --- a/examples/gce.md +++ b/examples/gce.md @@ -206,7 +206,7 @@ To stop the cluster, use the same env variables for the After the previous step, `kubectl` works and is configured to use the new cluster. What follows next are the steps explained in more details in the top-level README's [Run PMEM-CSI on -Kubernetes](../run-pmem-csi-on-kubernetes) section. +Kubernetes](../docs/install.md#run-pmem-csi-on-kubernetes) section. First the worker nodes need to be labeled: diff --git a/examples/readme.rst b/examples/readme.rst new file mode 100644 index 0000000000..72db86de6c --- /dev/null +++ b/examples/readme.rst @@ -0,0 +1,15 @@ +Application examples +#################### + +`Redis-pmem operator `__ + Deploy a Redis cluster through the redis-operator using QEMU-emulated persistent memory devices + +`Google Cloud Engine `__ + Install Kubernetes and PMEM-CSI on Google Cloud machines. + +.. toctree:: + :hidden: + + redis-operator.md + gce.md + \ No newline at end of file diff --git a/examples/redis-operator.md b/examples/redis-operator.md index 92e2cab3d4..87b5554cd8 100644 --- a/examples/redis-operator.md +++ b/examples/redis-operator.md @@ -1,4 +1,5 @@ # Redis-pmem operator + This readme describes a complete example to deploy a Redis cluster through the [redis-operator](https://github.com/spotahome/redis-operator) using QEMU-emulated persistent memory devices. ## Prerequisites diff --git a/index.rst b/index.rst new file mode 100644 index 0000000000..6109596696 --- /dev/null +++ b/index.rst @@ -0,0 +1,18 @@ +.. Project Name documentation master file, created by + sphinx-quickstart on Tue Nov 5 14:52:28 2019. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +PMEM-CSI +========= + +.. 
toctree:: + :maxdepth: 2 + + README.md + docs/design.md + docs/install.md + docs/DEVELOPMENT.md + docs/autotest.md + examples/readme.rst + Project GitHub repository diff --git a/make.bat b/make.bat new file mode 100644 index 0000000000..455cbe3888 --- /dev/null +++ b/make.bat @@ -0,0 +1,43 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_output + +if "%1" == "" goto help + +if "%1" == "html" goto html + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:html +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +python.exe .\fix-refs.py +copy index.html %BUILDDIR%\html\index.html +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000..e15c9ea6f3 --- /dev/null +++ b/setup.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +from distutils.core import setup + +setup(name='Sphinx GUI Utility', + version='0.1', + description='Build Sphinx docs from a GUI', + author='Kevin Putnam', + author_email='kevin.putnam@intel.com', + url='https://github.com/intel/pmem-csi', + ) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000..ffeb174d91 --- /dev/null +++ b/tox.ini @@ -0,0 +1,13 @@ +[tox] +envlist = py3-{mylinux,mywindows} + +[testenv] +platform = mylinux: linux + mywindows: win32 +whitelist_externals = make.bat + /usr/bin/make +deps = -rrequirements.txt +commands = + mylinux: make {posargs} + mywindows: make.bat {posargs} +