Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(auto-cancel): server-side logic for auto canceling obsolete builds #911

Merged
merged 13 commits into from
Oct 20, 2023
4 changes: 1 addition & 3 deletions api/auth/validate_oauth.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
// Copyright (c) 2023 Target Brands, Inc. All rights reserved.
//
// Use of this source code is governed by the LICENSE file in this repository.
// SPDX-License-Identifier: Apache-2.0

package auth

Expand Down
182 changes: 182 additions & 0 deletions api/build/auto_cancel.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
// SPDX-License-Identifier: Apache-2.0

package build

import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"

"github.com/gin-gonic/gin"
"github.com/go-vela/server/database"
"github.com/go-vela/server/internal/token"
"github.com/go-vela/types/constants"
"github.com/go-vela/types/library"
)

// AutoCancel is a helper function that checks to see if any pending or running
// builds for the repo can be replaced by the current build.
func AutoCancel(c *gin.Context, b *library.Build, rBs []*library.Build, r *library.Repo, pending, running bool) error {
ecrupper marked this conversation as resolved.
Show resolved Hide resolved
// iterate through pending and running builds
for _, rB := range rBs {
// if build is the current build, continue
if rB.GetID() == b.GetID() {
continue
}

// ensure criteria is met before auto canceling
if (strings.EqualFold(rB.GetEvent(), constants.EventPush) &&
strings.EqualFold(b.GetEvent(), constants.EventPush) &&
strings.EqualFold(b.GetBranch(), rB.GetBranch())) ||
(strings.EqualFold(rB.GetEvent(), constants.EventPull) &&
strings.EqualFold(b.GetEventAction(), rB.GetEventAction()) &&
strings.EqualFold(b.GetHeadRef(), rB.GetHeadRef())) {
switch {
case strings.EqualFold(rB.GetStatus(), constants.StatusPending) && pending:
// pending build will be handled gracefully by worker once pulled off queue
rB.SetStatus(constants.StatusCanceled)

_, err := database.FromContext(c).UpdateBuild(c, rB)
if err != nil {
return err
}
case strings.EqualFold(rB.GetStatus(), constants.StatusRunning) && running:
// call cancelRunning routine for builds already running on worker
err := cancelRunning(c, rB, r)
if err != nil {
return err
}
default:
continue
}

// set error message that references current build
rB.SetError(fmt.Sprintf("build was auto canceled in favor of build %d", b.GetNumber()))

_, err := database.FromContext(c).UpdateBuild(c, rB)
if err != nil {
return err
}
}
}

return nil
}

// cancelRunning is a helper function that determines the executor currently running a build and sends an API call
// to that executor's worker to cancel the build.
func cancelRunning(c *gin.Context, b *library.Build, r *library.Repo) error {
e := new([]library.Executor)
// retrieve the worker
w, err := database.FromContext(c).GetWorkerForHostname(c, b.GetHost())
if err != nil {
return err
}

// prepare the request to the worker to retrieve executors
client := http.DefaultClient
client.Timeout = 30 * time.Second
endpoint := fmt.Sprintf("%s/api/v1/executors", w.GetAddress())

req, err := http.NewRequestWithContext(context.Background(), "GET", endpoint, nil)
if err != nil {
return err
}

tm := c.MustGet("token-manager").(*token.Manager)

// set mint token options
mto := &token.MintTokenOpts{
Hostname: "vela-server",
TokenType: constants.WorkerAuthTokenType,
TokenDuration: time.Minute * 1,
}

// mint token
tkn, err := tm.MintToken(mto)
if err != nil {
return err
}

// add the token to authenticate to the worker
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", tkn))

// make the request to the worker and check the response
resp, err := client.Do(req)
if err != nil {
return err
}

defer resp.Body.Close()

// Read Response Body
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return err
}

// parse response and validate at least one item was returned
err = json.Unmarshal(respBody, e)
if err != nil {
return err
}

for _, executor := range *e {
// check each executor on the worker running the build to see if it's running the build we want to cancel
if strings.EqualFold(executor.Repo.GetFullName(), r.GetFullName()) && *executor.GetBuild().Number == b.GetNumber() {
// prepare the request to the worker
client := http.DefaultClient
client.Timeout = 30 * time.Second

// set the API endpoint path we send the request to
u := fmt.Sprintf("%s/api/v1/executors/%d/build/cancel", w.GetAddress(), executor.GetID())

req, err := http.NewRequestWithContext(context.Background(), "DELETE", u, nil)
if err != nil {
return err
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what are your thoughts on attempting the other cancels in the event of an error on 1 of them? maybe track an error in the outer scope and check that at the end? idk

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean like if there are multiple running builds to auto-cancel, and one of those attempts fails?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, i was looking at the loop over executors

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only one build will match, so there should only be one attempt at cancellation.

But we do loop over pending/running builds: https://github.com/go-vela/server/pull/911/files#diff-2e48a4018b70aac91235c0a2f5ec50d7ae186de39e3bd603d7e0fcfe6fd5ab07R25-R65.

I was unsure as to whether or not to immediately error out upon failing one of those (provided they match all the other criteria). What do you think? If this policy was implemented, there really should only be one build at a time that gets the boot. Each new build will supersede the older build, which superseded a previous build, and so on.

All's that to say, even though there are a couple loops here, we're really just performing one operation overall — I think... 😅

}

tm := c.MustGet("token-manager").(*token.Manager)

// set mint token options
mto := &token.MintTokenOpts{
Hostname: "vela-server",
TokenType: constants.WorkerAuthTokenType,
TokenDuration: time.Minute * 1,
}

// mint token
tkn, err := tm.MintToken(mto)
if err != nil {
return err
}

// add the token to authenticate to the worker
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", tkn))

// perform the request to the worker
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()

// Read Response Body
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return err
}

err = json.Unmarshal(respBody, b)
if err != nil {
return err
}
}
}

return nil
}
13 changes: 11 additions & 2 deletions api/build/cancel.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func CancelBuild(c *gin.Context) {
e := executors.Retrieve(c)
o := org.Retrieve(c)
r := repo.Retrieve(c)
u := user.Retrieve(c)
user := user.Retrieve(c)
ctx := c.Request.Context()

entry := fmt.Sprintf("%s/%d", r.GetFullName(), b.GetNumber())
Expand All @@ -89,7 +89,7 @@ func CancelBuild(c *gin.Context) {
"build": b.GetNumber(),
"org": o,
"repo": r.GetName(),
"user": u.GetName(),
"user": user.GetName(),
}).Infof("canceling build %s", entry)

switch b.GetStatus() {
Expand Down Expand Up @@ -171,6 +171,15 @@ func CancelBuild(c *gin.Context) {

c.JSON(resp.StatusCode, b)

b.SetError(fmt.Sprintf("build was canceled by %s", user.GetName()))
b, err = database.FromContext(c).UpdateBuild(ctx, b)
ecrupper marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
ecrupper marked this conversation as resolved.
Show resolved Hide resolved
retErr := fmt.Errorf("unable to update status for build %s: %w", entry, err)
util.HandleError(c, http.StatusInternalServerError, retErr)

return
}

return
}
}
Expand Down
4 changes: 1 addition & 3 deletions api/queue/queue.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
// Copyright (c) 2023 Target Brands, Inc. All rights reserved.
//
// Use of this source code is governed by the LICENSE file in this repository.
// SPDX-License-Identifier: Apache-2.0

package queue

Expand Down
27 changes: 27 additions & 0 deletions api/webhook/post.go
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,33 @@ func PostWebhook(c *gin.Context) {
logrus.Errorf("unable to set commit status for %s/%d: %v", repo.GetFullName(), b.GetNumber(), err)
}

// if anything is provided in the auto_cancel metadata, then we start with true
runAutoCancel := p.Metadata.AutoCancel.Running || p.Metadata.AutoCancel.Pending || p.Metadata.AutoCancel.DefaultBranch

// if the event is a push to the default branch and the AutoCancel.DefaultBranch value is false, bypass auto cancel
if strings.EqualFold(b.GetEvent(), constants.EventPush) && strings.EqualFold(b.GetBranch(), repo.GetBranch()) && !p.Metadata.AutoCancel.DefaultBranch {
runAutoCancel = false
}

// if event is push or pull_request:synchronize, there is a chance this build could be superceding a stale build
//
// fetch pending and running builds for this repo in order to validate their merit to continue running.
if runAutoCancel &&
((strings.EqualFold(b.GetEvent(), constants.EventPull) && strings.EqualFold(b.GetEventAction(), constants.ActionSynchronize)) ||
strings.EqualFold(b.GetEvent(), constants.EventPush)) {
// fetch pending and running builds
rBs, err := database.FromContext(c).ListPendingAndRunningBuildsForRepo(c, repo)
if err != nil {
logrus.Errorf("unable to fetch pending and running builds for %s: %v", repo.GetFullName(), err)
}

// call auto cancel routine
err = build.AutoCancel(c, b, rBs, repo, p.Metadata.AutoCancel.Pending, p.Metadata.AutoCancel.Running)
if err != nil {
logrus.Errorf("unable to cancel running build: %v", err)
}
}

// publish the build to the queue
go build.PublishToQueue(
ctx,
Expand Down
Loading
Loading