cybozu-go
diff --git a/‎CHANGELOG.md
Lines changed: 8 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 8 additions & 0 deletions
diff --git a/‎constraints.go
Lines changed: 2 additions & 0 deletions b/‎constraints.go
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/ckecli.md
Lines changed: 28 additions & 2 deletions b/‎docs/ckecli.md
Lines changed: 28 additions & 2 deletions
diff --git a/‎docs/constraints.md
Lines changed: 1 addition & 0 deletions b/‎docs/constraints.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/sabakan-integration.md
Lines changed: 2 additions & 2 deletions b/‎docs/sabakan-integration.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/sabakan-triggered-repair.md
Lines changed: 86 additions & 0 deletions b/‎docs/sabakan-triggered-repair.md
Lines changed: 86 additions & 0 deletions
diff --git a/‎mtest/ckecli_test.go
Lines changed: 9 additions & 0 deletions b/‎mtest/ckecli_test.go
Lines changed: 9 additions & 0 deletions
diff --git a/‎pkg/ckecli/cmd/auto_repair.go
Lines changed: 16 additions & 0 deletions b/‎pkg/ckecli/cmd/auto_repair.go
Lines changed: 16 additions & 0 deletions
diff --git a/‎pkg/ckecli/cmd/auto_repair_disable.go
Lines changed: 26 additions & 0 deletions b/‎pkg/ckecli/cmd/auto_repair_disable.go
Lines changed: 26 additions & 0 deletions
diff --git a/‎pkg/ckecli/cmd/auto_repair_enable.go
Lines changed: 26 additions & 0 deletions b/‎pkg/ckecli/cmd/auto_repair_enable.go
Lines changed: 26 additions & 0 deletions
@@ -5,6 +5,14 @@ This project employs a versioning scheme described in [RELEASE.md](RELEASE.md#ve
 
 ## [Unreleased]
 
+### Added
+
+- Add sabakan-triggered automatic repair functionality in [#725](https://github.com/cybozu-go/cke/pull/725)
+
+### Fixed
+
+- Stop sending unassigned query parameters in Sabakan integration in [#725](https://github.com/cybozu-go/cke/pull/725)
+
 ## [1.28.0]
 
 ### Changed
 
@@ -8,6 +8,7 @@ type Constraints struct {
 	MinimumWorkers           int `json:"minimum-workers"`
 	MaximumWorkers           int `json:"maximum-workers"`
 	RebootMaximumUnreachable int `json:"maximum-unreachable-nodes-for-reboot"`
+	MaximumRepairs           int `json:"maximum-repair-queue-entries"`
 }
 
 // Check checks the cluster satisfies the constraints
@@ -41,5 +42,6 @@ func DefaultConstraints() *Constraints {
 		MinimumWorkers:           1,
 		MaximumWorkers:           0,
 		RebootMaximumUnreachable: 0,
+		MaximumRepairs:           0,
 	}
 }
@@ -67,6 +67,11 @@ $ ckecli [--config FILE] <subcommand> args...
   - [`ckecli sabakan get-template`](#ckecli-sabakan-get-template)
   - [`ckecli sabakan set-variables FILE`](#ckecli-sabakan-set-variables-file)
   - [`ckecli sabakan get-variables`](#ckecli-sabakan-get-variables)
+- [`ckecli auto-repair`](#ckecli-auto-repair)
+  - [`ckecli auto-repair enable|disable`](#ckecli-auto-repair-enabledisable)
+  - [`ckecli auto-repair is-enabled`](#ckecli-auto-repair-is-enabled)
+  - [`ckecli auto-repair set-variables FILE`](#ckecli-auto-repair-set-variables-file)
+  - [`ckecli auto-repair get-variables`](#ckecli-auto-repair-get-variables)
 - [`ckecli status`](#ckecli-status)
 
 ## `ckecli cluster`
@@ -91,6 +96,7 @@ Set a constraint on the cluster configuration.
 - `minimum-workers`
 - `maximum-workers`
 - `maximum-unreachable-nodes-for-reboot`
+- `maximum-repair-queue-entries`
 
 ### `ckecli constraints show`
 
@@ -408,12 +414,32 @@ Get the cluster configuration template.
 
 ### `ckecli sabakan set-variables FILE`
 
-Set the query variables to search machines in sabakan.
+Set the query variables to search available machines in sabakan.
 `FILE` should contain JSON as described in [sabakan integration](sabakan-integration.md#variables).
 
 ### `ckecli sabakan get-variables`
 
-Get the query variables to search machines in sabakan.
+Get the query variables to search available machines in sabakan.
+
+## `ckecli auto-repair`
+
+### `ckecli auto-repair enable|disable`
+
+Enable/Disable [sabakan-triggered automatic repair](sabakan-triggered-repair.md).
+
+### `ckecli auto-repair is-enabled`
+
+Show whether sabakan-triggered automatic repair is enabled or disabled.
+It displays `true` or `false`.
+
+### `ckecli auto-repair set-variables FILE`
+
+Set the query variables to search non-healthy machines in sabakan.
+`FILE` should contain JSON as described in [sabakan-triggered automatic repair](sabakan-triggered-repair.md#query).
+
+### `ckecli auto-repair get-variables`
+
+Get the query variables to search non-healthy machines in sabakan.
 
 ## `ckecli status`
 
 
@@ -12,3 +12,4 @@ Cluster should satisfy these constraints.
 | `minimum-workers`                      | int  | 1       | The minimum number of worker nodes                                    |
 | `maximum-workers`                      | int  | 0       | The maximum number of worker nodes. 0 means unlimited.                |
 | `maximum-unreachable-nodes-for-reboot` | int  | 0       | The maximum number of unreachable nodes allowed for operating reboot. |
+| `maximum-repair-queue-entries`         | int  | 0       | The maximum number of repair queue entries                            |
@@ -345,7 +345,7 @@ Following Machine fields are translated to Node annotations:
 
 
 [sabakan]: https://github.com/cybozu-go/sabakan
-[schema]: https://github.com/cybozu-go/sabakan/blob/master/gql/schema.graphql
+[schema]: https://github.com/cybozu-go/sabakan/blob/main/gql/graph/schema.graphqls
 [taint]: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
-[lifecycle]: https://github.com/cybozu-go/sabakan/blob/master/docs/lifecycle.md#transition-diagram
+[lifecycle]: https://github.com/cybozu-go/sabakan/blob/main/docs/lifecycle.md#transition-diagram
 [well-known taints]: https://kubernetes.io/docs/reference/labels-annotations-taints/
@@ -0,0 +1,86 @@
+Automatic repair triggered by sabakan
+=====================================
+
+[Sabakan][sabakan] is management software for server machines in a data center.
+It stores the status information of machines as well as their spec information.
+By referring to machines' status information in sabakan, CKE can initiate the repair of a non-healthy machine.
+
+This functionality is similar to [sabakan integration](sabakan-integration.md).
+
+How it works
+------------
+
+CKE periodically queries sabakan to retrieve machines' status information in a data center.
+If CKE finds non-healthy machines, it creates [repair queue entries](repair.md) for those machines.
+
+The fields of a repair queue entry are determined based on the [information of the non-healthy machine](https://github.com/cybozu-go/sabakan/blob/main/docs/machine.md).
+* `address`: `.spec.ipv4[0]`
+* `machine_type`: `.spec.bmc.type`
+* `operation`: `.status.state`
+
+Users can configure the query to choose non-healthy machines.
+The queries are executed via sabakan [GraphQL `searchMachines`](https://github.com/cybozu-go/sabakan/blob/main/docs/graphql.md) API.
+
+Query
+-----
+
+CKE uses the following GraphQL query to retrieve machine information from sabakan.
+
+```
+query ckeSearch($having: MachineParams, $notHaving: MachineParams) {
+  searchMachines(having: $having, notHaving: $notHaving) {
+    # snip
+  }
+}
+```
+
+The following values are used for `$having` and `$notHaving` variables by default.
+Users can change these values by [specifying a JSON object](ckecli.md#ckecli-auto-repair-set-variables-file).
+
+```json
+{
+  "having": {
+    "states": ["UNHEALTHY", "UNREACHABLE"]
+  },
+  "notHaving": {
+    "roles": ["boot"]
+  }
+}
+```
+
+The type of `$having` and `$notHaving` is `MachineParams`.
+Consult [GraphQL schema][schema] for the definition of `MachineParams`.
+
+Enqueue limiters
+----------------
+
+### Limiter for a single machine
+
+In order not to repeat repair operations too quickly for a single unstable machine, CKE checks recent repair queue entries before enqueueing.
+If it finds a recent entry for the machine in question, no matter whether the entry has finished or not, it refrains from creating an additional entry.
+
+CKE considers all persisting queue entries as "recent" for simplicity.
+A user should delete a finished repair queue entry for a machine once they consider the machine repaired.
+* If a repair queue entry has finished with success and a user considers the machine stable, they should delete the finished entry.
+* If a repair queue entry has finished with failure or a user considers the machine unstable, they should repair the machine manually. After the machine gets repaired, they should delete the finished entry.
+
+### Limiter for a cluster
+
+Sabakan may occasionally report false-positive non-healthy machines.
+If CKE believes all of the failure reports and initiates a lot of repair operations, the Kubernetes cluster will be stuck -- or worse, corrupted.
+
+Even when the failure reports are correct, it would be good for CKE to refrain from repairing too many machines.
+For example, the failure of many servers might be caused by the temporary power failure of a whole server rack.
+In that case, CKE should not mark the machines unrepairable as a result of pointless repair operations.
+Once the machines are marked unrepairable, sabakan will delete all data on those machines.
+
+In order not to initiate too many repair operations, CKE checks the number of recent repair queue entries plus the number of new failure reports before enqueueing.
+If it finds excessive numbers of entries/reports, no matter whether the entries have finished or not, it refrains from creating an additional entry.
+
+The maximum number of recent repair queue entries and new failure reports is [configurable](ckecli.md#ckecli-constraints-set-name-value) as a [constraint `maximum-repair-queue-entries`](constraints.md).
+
+As stated above, CKE considers all persisting queue entries as "recent" for simplicity.
+
+
+[sabakan]: https://github.com/cybozu-go/sabakan
+[schema]: https://github.com/cybozu-go/sabakan/blob/main/gql/graph/schema.graphqls
@@ -137,4 +137,13 @@ func testCKECLI() {
 		ckecliSafe("sabakan", "enable")
 		ckecliSafe("sabakan", "get-url")
 	})
+
+	It("should invoke auto-repair subcommand successfully", func() {
+		ckecliSafe("auto-repair", "is-enabled")
+		ckecliSafe("auto-repair", "disable")
+		ckecliSafe("auto-repair", "enable")
+		f := remoteTempFile(`{"having":{"states":["UNHEALTHY","UNREACHABLE"]},"notHaving":{"roles":["boot"]}}`)
+		ckecliSafe("auto-repair", "set-variables", f)
+		ckecliSafe("auto-repair", "get-variables")
+	})
 }
@@ -0,0 +1,16 @@
+package cmd
+
+import (
+	"github.com/spf13/cobra"
+)
+
+// autoRepairCmd represents the auto-repair command
+var autoRepairCmd = &cobra.Command{
+	Use:   "auto-repair",
+	Short: "auto-repair subcommand",
+	Long:  `auto-repair subcommand`,
+}
+
+func init() {
+	rootCmd.AddCommand(autoRepairCmd)
+}
@@ -0,0 +1,26 @@
+package cmd
+
+import (
+	"context"
+
+	"github.com/cybozu-go/well"
+	"github.com/spf13/cobra"
+)
+
+var autoRepairDisableCmd = &cobra.Command{
+	Use:   "disable",
+	Short: "disable sabakan-triggered automatic repair",
+	Long:  `Disable sabakan-triggered automatic repair.`,
+
+	RunE: func(cmd *cobra.Command, args []string) error {
+		well.Go(func(ctx context.Context) error {
+			return storage.EnableAutoRepair(ctx, false)
+		})
+		well.Stop()
+		return well.Wait()
+	},
+}
+
+func init() {
+	autoRepairCmd.AddCommand(autoRepairDisableCmd)
+}
@@ -0,0 +1,26 @@
+package cmd
+
+import (
+	"context"
+
+	"github.com/cybozu-go/well"
+	"github.com/spf13/cobra"
+)
+
+var autoRepairEnableCmd = &cobra.Command{
+	Use:   "enable",
+	Short: "enable sabakan-triggered automatic repair",
+	Long:  `Enable sabakan-triggered automatic repair.`,
+
+	RunE: func(cmd *cobra.Command, args []string) error {
+		well.Go(func(ctx context.Context) error {
+			return storage.EnableAutoRepair(ctx, true)
+		})
+		well.Stop()
+		return well.Wait()
+	},
+}
+
+func init() {
+	autoRepairCmd.AddCommand(autoRepairEnableCmd)
+}
Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,7 @@ type Constraints struct {`
`8`	`8`	MinimumWorkers int `json:"minimum-workers"`
`9`	`9`	MaximumWorkers int `json:"maximum-workers"`
`10`	`10`	RebootMaximumUnreachable int `json:"maximum-unreachable-nodes-for-reboot"`
	`11`	+ MaximumRepairs int `json:"maximum-repair-queue-entries"`
`11`	`12`	`}`
`12`	`13`
`13`	`14`	`// Check checks the cluster satisfies the constraints`
`@@ -41,5 +42,6 @@ func DefaultConstraints() *Constraints {`
`41`	`42`	`MinimumWorkers: 1,`
`42`	`43`	`MaximumWorkers: 0,`
`43`	`44`	`RebootMaximumUnreachable: 0,`
	`45`	`+ MaximumRepairs: 0,`
`44`	`46`	`}`
`45`	`47`	`}`