Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(auto-cancel): server-side logic for auto canceling obsolete builds #911

Merged
merged 13 commits into from
Oct 20, 2023
4 changes: 1 addition & 3 deletions api/auth/validate_oauth.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
// Copyright (c) 2023 Target Brands, Inc. All rights reserved.
//
// Use of this source code is governed by the LICENSE file in this repository.
// SPDX-License-Identifier: Apache-2.0

package auth

Expand Down
181 changes: 181 additions & 0 deletions api/build/auto_cancel.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
// SPDX-License-Identifier: Apache-2.0

package build

import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"

"github.com/gin-gonic/gin"
"github.com/go-vela/server/database"
"github.com/go-vela/server/internal/token"
"github.com/go-vela/types/constants"
"github.com/go-vela/types/library"
"github.com/go-vela/types/pipeline"
)

// AutoCancel is a helper function that checks to see if any pending or running
// builds for the repo can be replaced by the current build.
func AutoCancel(c *gin.Context, b *library.Build, rB *library.Build, r *library.Repo, cancelOpts *pipeline.CancelOptions) (bool, error) {
// if build is the current build, continue
if rB.GetID() == b.GetID() {
return false, nil
}

// ensure criteria is met before auto canceling (push to same branch, or pull with same action from same head_ref)
if (strings.EqualFold(rB.GetEvent(), constants.EventPush) &&
strings.EqualFold(b.GetEvent(), constants.EventPush) &&
strings.EqualFold(b.GetBranch(), rB.GetBranch())) ||
(strings.EqualFold(rB.GetEvent(), constants.EventPull) &&
strings.EqualFold(b.GetEventAction(), rB.GetEventAction()) &&
strings.EqualFold(b.GetHeadRef(), rB.GetHeadRef())) {
switch {
case strings.EqualFold(rB.GetStatus(), constants.StatusPending) && cancelOpts.Pending:
// pending build will be handled gracefully by worker once pulled off queue
rB.SetStatus(constants.StatusCanceled)

_, err := database.FromContext(c).UpdateBuild(c, rB)
if err != nil {
return false, err
}
case strings.EqualFold(rB.GetStatus(), constants.StatusRunning) && cancelOpts.Running:
// call cancelRunning routine for builds already running on worker
err := cancelRunning(c, rB, r)
if err != nil {
return false, err
}
default:
return false, nil
}

// set error message that references current build
rB.SetError(fmt.Sprintf("build was auto canceled in favor of build %d", b.GetNumber()))

_, err := database.FromContext(c).UpdateBuild(c, rB)
if err != nil {
// if this call fails, we still canceled the build, so return true
return true, err
}
}

return true, nil
}

// cancelRunning is a helper function that determines the executor currently running a build and sends an API call
// to that executor's worker to cancel the build.
func cancelRunning(c *gin.Context, b *library.Build, r *library.Repo) error {
e := new([]library.Executor)
// retrieve the worker
w, err := database.FromContext(c).GetWorkerForHostname(c, b.GetHost())
if err != nil {
return err
}

// prepare the request to the worker to retrieve executors
client := http.DefaultClient
client.Timeout = 30 * time.Second
endpoint := fmt.Sprintf("%s/api/v1/executors", w.GetAddress())

req, err := http.NewRequestWithContext(context.Background(), "GET", endpoint, nil)
if err != nil {
return err
}

tm := c.MustGet("token-manager").(*token.Manager)

// set mint token options
mto := &token.MintTokenOpts{
Hostname: "vela-server",
TokenType: constants.WorkerAuthTokenType,
TokenDuration: time.Minute * 1,
}

// mint token
tkn, err := tm.MintToken(mto)
if err != nil {
return err
}

// add the token to authenticate to the worker
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", tkn))

// make the request to the worker and check the response
resp, err := client.Do(req)
if err != nil {
return err
}

defer resp.Body.Close()

// Read Response Body
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return err
}

// parse response and validate at least one item was returned
err = json.Unmarshal(respBody, e)
if err != nil {
return err
}

for _, executor := range *e {
// check each executor on the worker running the build to see if it's running the build we want to cancel
if strings.EqualFold(executor.Repo.GetFullName(), r.GetFullName()) && *executor.GetBuild().Number == b.GetNumber() {
// prepare the request to the worker
client := http.DefaultClient
client.Timeout = 30 * time.Second

// set the API endpoint path we send the request to
u := fmt.Sprintf("%s/api/v1/executors/%d/build/cancel", w.GetAddress(), executor.GetID())

req, err := http.NewRequestWithContext(context.Background(), "DELETE", u, nil)
if err != nil {
return err
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what are your thoughts on attempting the other cancels in the event of an error on 1 of them? maybe track an error in the outer scope and check that at the end? idk

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean like if there are multiple running builds to auto-cancel, and one of those attempts fails?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, i was looking at the loop over executors

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only one build will match, so there should only be one attempt at cancellation.

But we do loop over pending/running builds: https://github.com/go-vela/server/pull/911/files#diff-2e48a4018b70aac91235c0a2f5ec50d7ae186de39e3bd603d7e0fcfe6fd5ab07R25-R65.

I was unsure as to whether or not to immediately error out upon failing one of those (provided they match all the other criteria). What do you think? If this policy was implemented, there really should only be one build at a time that gets the boot. Each new build will supersede the older build, which superseded a previous build, and so on.

All's that to say, even though there are a couple loops here, we're really just performing one operation overall — I think... 😅

}

tm := c.MustGet("token-manager").(*token.Manager)

// set mint token options
mto := &token.MintTokenOpts{
Hostname: "vela-server",
TokenType: constants.WorkerAuthTokenType,
TokenDuration: time.Minute * 1,
}

// mint token
tkn, err := tm.MintToken(mto)
if err != nil {
return err
}

// add the token to authenticate to the worker
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", tkn))

// perform the request to the worker
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()

// Read Response Body
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return err
}

err = json.Unmarshal(respBody, b)
if err != nil {
return err
}
}
}

return nil
}
14 changes: 12 additions & 2 deletions api/build/cancel.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func CancelBuild(c *gin.Context) {
e := executors.Retrieve(c)
o := org.Retrieve(c)
r := repo.Retrieve(c)
u := user.Retrieve(c)
user := user.Retrieve(c)
ctx := c.Request.Context()

entry := fmt.Sprintf("%s/%d", r.GetFullName(), b.GetNumber())
Expand All @@ -89,7 +89,7 @@ func CancelBuild(c *gin.Context) {
"build": b.GetNumber(),
"org": o,
"repo": r.GetName(),
"user": u.GetName(),
"user": user.GetName(),
}).Infof("canceling build %s", entry)

switch b.GetStatus() {
Expand Down Expand Up @@ -169,6 +169,16 @@ func CancelBuild(c *gin.Context) {
return
}

b.SetError(fmt.Sprintf("build was canceled by %s", user.GetName()))

b, err = database.FromContext(c).UpdateBuild(ctx, b)
ecrupper marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
ecrupper marked this conversation as resolved.
Show resolved Hide resolved
retErr := fmt.Errorf("unable to update status for build %s: %w", entry, err)
util.HandleError(c, http.StatusInternalServerError, retErr)

return
}

c.JSON(resp.StatusCode, b)

return
Expand Down
4 changes: 1 addition & 3 deletions api/queue/queue.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
// Copyright (c) 2023 Target Brands, Inc. All rights reserved.
//
// Use of this source code is governed by the LICENSE file in this repository.
// SPDX-License-Identifier: Apache-2.0

package queue

Expand Down
34 changes: 34 additions & 0 deletions api/webhook/post.go
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,40 @@ func PostWebhook(c *gin.Context) {
repo,
u,
)

// if anything is provided in the auto_cancel metadata, then we start with true
runAutoCancel := p.Metadata.AutoCancel.Running || p.Metadata.AutoCancel.Pending || p.Metadata.AutoCancel.DefaultBranch

// if the event is a push to the default branch and the AutoCancel.DefaultBranch value is false, bypass auto cancel
if strings.EqualFold(b.GetEvent(), constants.EventPush) && strings.EqualFold(b.GetBranch(), repo.GetBranch()) && !p.Metadata.AutoCancel.DefaultBranch {
runAutoCancel = false
}

// if event is push or pull_request:synchronize, there is a chance this build could be superceding a stale build
//
// fetch pending and running builds for this repo in order to validate their merit to continue running.
if runAutoCancel &&
((strings.EqualFold(b.GetEvent(), constants.EventPull) && strings.EqualFold(b.GetEventAction(), constants.ActionSynchronize)) ||
strings.EqualFold(b.GetEvent(), constants.EventPush)) {
// fetch pending and running builds
rBs, err := database.FromContext(c).ListPendingAndRunningBuildsForRepo(c, repo)
if err != nil {
logrus.Errorf("unable to fetch pending and running builds for %s: %v", repo.GetFullName(), err)
}

for _, rB := range rBs {
// call auto cancel routine
canceled, err := build.AutoCancel(c, b, rB, repo, p.Metadata.AutoCancel)
if err != nil {
// continue cancel loop if error, but log based on type of error
if canceled {
logrus.Errorf("unable to update canceled build error message: %v", err)
} else {
logrus.Errorf("unable to cancel running build: %v", err)
}
}
}
}
}

// handleRepositoryEvent is a helper function that processes repository events from the SCM and updates
Expand Down
2 changes: 2 additions & 0 deletions database/build/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ type BuildInterface interface {
ListBuildsForRepo(context.Context, *library.Repo, map[string]interface{}, int64, int64, int, int) ([]*library.Build, int64, error)
// ListPendingAndRunningBuilds defines a function that gets a list of pending and running builds.
ListPendingAndRunningBuilds(context.Context, string) ([]*library.BuildQueue, error)
// ListPendingAndRunningBuildsForRepo defines a function that gets a list of pending and running builds for a repo.
ListPendingAndRunningBuildsForRepo(context.Context, *library.Repo) ([]*library.Build, error)
// UpdateBuild defines a function that updates an existing build.
UpdateBuild(context.Context, *library.Build) (*library.Build, error)
}
45 changes: 45 additions & 0 deletions database/build/list_pending_running_repo.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// SPDX-License-Identifier: Apache-2.0

package build

import (
"context"

"github.com/go-vela/types/constants"
"github.com/go-vela/types/database"
"github.com/go-vela/types/library"
)

// ListPendingAndRunningBuilds gets a list of all pending and running builds in the provided timeframe from the database.
func (e *engine) ListPendingAndRunningBuildsForRepo(ctx context.Context, repo *library.Repo) ([]*library.Build, error) {
e.logger.Trace("listing all pending and running builds from the database")

// variables to store query results and return value
b := new([]database.Build)
builds := []*library.Build{}

// send query to the database and store result in variable
err := e.client.
Table(constants.TableBuild).
Select("*").
Where("repo_id = ?", repo.GetID()).
Where("status = 'running' OR status = 'pending'").
Find(&b).
Error
if err != nil {
return nil, err
}

// iterate through all query results
for _, build := range *b {
// https://golang.org/doc/faq#closures_and_goroutines
tmp := build

// convert query result to library type
//
// https://pkg.go.dev/github.com/go-vela/types/database#Build.ToLibrary
builds = append(builds, tmp.ToLibrary())
}

return builds, nil
}
Loading