Skip to content

Commit

Permalink
cmd/coordinator: consolidate and increase global VM deletion timeout
Browse files Browse the repository at this point in the history
We had a lot of flexibility over timeouts, making their maintenance
harder. Consolidate it to a single timeout in the pool package, and
modify it from 45 minutes to 2 hours.

There's room for improvement in how we maintain this timeout,
but I'm leaving that for future work (with a tracking issue).

Fixes golang/go#52591.
Updates golang/go#52929.
Updates golang/go#49666.
Updates golang/go#42699.

Change-Id: I2ad92648d89a714397bd8b0e1ec490fc9f6d6790
Reviewed-on: https://go-review.googlesource.com/c/build/+/406216
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Heschi Kreinick <heschi@google.com>
  • Loading branch information
dmitshur committed May 16, 2022
1 parent acd0c19 commit 7413a13
Show file tree
Hide file tree
Showing 10 changed files with 61 additions and 66 deletions.
2 changes: 1 addition & 1 deletion buildlet/buildlet.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ type VMOpts struct {

// DeleteIn optionally specifies a duration at which
// to delete the VM.
// If zero, a reasonable default is used.
// If zero, a short default is used (not enough for longtest builders).
// Negative means no deletion timeout.
DeleteIn time.Duration

Expand Down
1 change: 1 addition & 0 deletions buildlet/ec2.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ func (c *EC2Client) StartNewVM(ctx context.Context, buildEnv *buildenv.Environme
opts.Description = fmt.Sprintf("Go Builder for %s", hostType)
}
if opts.DeleteIn == 0 {
// Note: This implements a short default in the rare case the caller doesn't care.
opts.DeleteIn = 30 * time.Minute
}

Expand Down
9 changes: 1 addition & 8 deletions cmd/coordinator/coordinator.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,6 @@ var maintnerClient apipb.MaintnerServiceClient

const (
maxStatusDone = 30

// vmDeleteTimeout and podDeleteTimeout is how long before we delete a VM.
// In practice this need only be as long as the slowest
// builder (plan9 currently), because on startup this program
// already deletes all buildlets it doesn't know about
// (i.e. ones from a previous instance of the coordinator).
vmDeleteTimeout = 45 * time.Minute
)

// Fake keys signed by a fake CA.
Expand Down Expand Up @@ -266,7 +259,7 @@ func main() {
// a shared package.
pool.SetBuilderMasterKey(masterKey())

err := pool.InitGCE(sc, vmDeleteTimeout, testFiles, &basePinErr, isGCERemoteBuildlet, *buildEnvName, *mode)
err := pool.InitGCE(sc, testFiles, &basePinErr, isGCERemoteBuildlet, *buildEnvName, *mode)
if err != nil {
if *mode == "" {
*mode = "dev"
Expand Down
24 changes: 13 additions & 11 deletions dashboard/builders.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,6 @@ var Hosts = map[string]*HostConfig{
goBootstrapURLTmpl: "https://storage.googleapis.com/$BUCKET/gobootstrap-openbsd-386-go1_12.tar.gz",
Notes: "OpenBSD 6.8 (with 009_exit syspatch); GCE VM is built from script in build/env/openbsd-386",
SSHUsername: "gopher",
DeleteTimeout: 60 * time.Minute, // See golang/go#49666.
},
"host-openbsd-amd64-70": &HostConfig{
VMImage: "openbsd-amd64-70",
Expand All @@ -254,7 +253,6 @@ var Hosts = map[string]*HostConfig{
goBootstrapURLTmpl: "https://storage.googleapis.com/$BUCKET/gobootstrap-openbsd-386-go1_12.tar.gz",
Notes: "OpenBSD 7.0; GCE VM is built from script in build/env/openbsd-386. n2-highcpu host.",
SSHUsername: "gopher",
DeleteTimeout: 60 * time.Minute, // See golang/go#49666.
},
"host-openbsd-386-70-n2d": &HostConfig{
// This host config is only for the runtime team to use investigating golang/go#49209.
Expand Down Expand Up @@ -689,13 +687,13 @@ var Hosts = map[string]*HostConfig{
env: []string{"GOROOT_BOOTSTRAP=/usr/lib/go"},
},
"host-linux-amd64-perf": &HostConfig{
Notes: "Cascade Lake performance testing machines",
machineType: "c2-standard-8", // C2 has precisely defined, consistent server architecture.
ContainerImage: "linux-x86-bullseye:latest",
buildletURLTmpl: "https://storage.googleapis.com/$BUCKET/buildlet.linux-amd64",
env: []string{"GOROOT_BOOTSTRAP=/go1.4"},
SSHUsername: "root",
DeleteTimeout: 8 * time.Hour,
Notes: "Cascade Lake performance testing machines",
machineType: "c2-standard-8", // C2 has precisely defined, consistent server architecture.
ContainerImage: "linux-x86-bullseye:latest",
buildletURLTmpl: "https://storage.googleapis.com/$BUCKET/buildlet.linux-amd64",
env: []string{"GOROOT_BOOTSTRAP=/go1.4"},
SSHUsername: "root",
CustomDeleteTimeout: 8 * time.Hour,
},
}

Expand Down Expand Up @@ -786,8 +784,12 @@ type HostConfig struct {
// EC2 options
isEC2 bool // if true, the instance is configured to run on EC2

// GCE or EC2 options
DeleteTimeout time.Duration // Timeout after which the VM is destroyed. Zero duration uses global default.
// GCE or EC2 options:
//
// CustomDeleteTimeout is an optional custom timeout after which the VM is forcibly destroyed and the build is retried.
// Zero duration defers decision to internal/coordinator/pool package, which is enough for longtest post-submit builds.
// (This is generally an internal implementation detail, currently left behind only for the -perf builder.)
CustomDeleteTimeout time.Duration

// Reverse options
ExpectNum int // expected number of reverse buildlets of this type
Expand Down
12 changes: 1 addition & 11 deletions internal/coordinator/pool/ec2.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,6 @@ type awsClient interface {
// EC2Opt is optional configuration for the the buildlet.
type EC2Opt func(*EC2Buildlet)

// WithVMDeleteTimeout sets the VM deletion timeout for all EC2 VMs.
func WithVMDeleteTimeout(timeout time.Duration) EC2Opt {
return func(eb *EC2Buildlet) {
eb.vmDeleteTimeout = timeout
}
}

// EC2Buildlet manages a pool of AWS EC2 buildlets.
type EC2Buildlet struct {
// awsClient is the client used to interact with AWS services.
Expand All @@ -92,8 +85,6 @@ type EC2Buildlet struct {
ledger *ledger
// cancelPoll will signal to the pollers to discontinue polling.
cancelPoll context.CancelFunc
// vmDeleteTimeout contains the timeout used to determine if a VM should be deleted.
vmDeleteTimeout time.Duration
// pollWait waits for all pollers to terminate polling.
pollWait sync.WaitGroup
}
Expand Down Expand Up @@ -121,7 +112,6 @@ func NewEC2Buildlet(client *cloud.AWSClient, buildEnv *buildenv.Environment, hos
hosts: hosts,
isRemoteBuildlet: fn,
ledger: newLedger(),
vmDeleteTimeout: 45 * time.Minute, // default VM delete timeout
}
for _, opt := range opts {
opt(b)
Expand Down Expand Up @@ -195,7 +185,7 @@ func (eb *EC2Buildlet) GetBuildlet(ctx context.Context, hostType string, lg Logg
Zone: "", // allow the EC2 api pick an availability zone with capacity
TLS: kp,
Meta: make(map[string]string),
DeleteIn: determineDeleteTimeout(hconf, eb.vmDeleteTimeout),
DeleteIn: determineDeleteTimeout(hconf),
OnInstanceRequested: func() {
log.Printf("EC2 VM %q now booting", instName)
},
Expand Down
1 change: 1 addition & 0 deletions internal/coordinator/pool/ec2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,7 @@ func (f *fakeEC2BuildletClient) StartNewVM(ctx context.Context, buildEnv *builde
return nil, fmt.Errorf("invalid vmName: %q and hostType: %q", vmName, hostType)
}
if opts.DeleteIn == 0 {
// Note: This implements a short default in the rare case the caller doesn't care.
opts.DeleteIn = 30 * time.Minute
}
if !f.createVMRequestSuccess {
Expand Down
6 changes: 2 additions & 4 deletions internal/coordinator/pool/gce.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,14 @@ var (

// values created due to separating the buildlet pools into a separate package
gceMode string
deleteTimeout time.Duration
testFiles map[string]string
basePinErr *atomic.Value
isGCERemoteBuildlet IsRemoteBuildletFunc
)

// InitGCE initializes the GCE buildlet pool.
func InitGCE(sc *secret.Client, vmDeleteTimeout time.Duration, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
func InitGCE(sc *secret.Client, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
gceMode = mode
deleteTimeout = vmDeleteTimeout
testFiles = tFiles
basePinErr = basePin
isGCERemoteBuildlet = fn
Expand Down Expand Up @@ -417,7 +415,7 @@ func (p *GCEBuildlet) GetBuildlet(ctx context.Context, hostType string, lg Logge

log.Printf("Creating GCE VM %q for %s at %s", instName, hostType, zone)
bc, err = buildlet.StartNewVM(gcpCreds, buildEnv, instName, hostType, buildlet.VMOpts{
DeleteIn: determineDeleteTimeout(hconf, deleteTimeout),
DeleteIn: determineDeleteTimeout(hconf),
OnInstanceRequested: func() {
log.Printf("GCE VM %q now booting", instName)
},
Expand Down
13 changes: 2 additions & 11 deletions internal/coordinator/pool/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,6 @@ import (
This file implements the Kubernetes-based buildlet pool.
*/

const (
// podDeleteTimeout is how long before we delete a VM.
// In practice this need only be as long as the slowest
// builder (plan9 currently), because on startup this program
// already deletes all buildlets it doesn't know about
// (i.e. ones from a previous instance of the coordinator).
podDeleteTimeout = 45 * time.Minute
)

// Initialized by initKube:
var (
buildletsKubeClient *kubernetes.Client // for "buildlets" cluster
Expand Down Expand Up @@ -270,7 +261,7 @@ func (p *kubeBuildletPool) GetBuildlet(ctx context.Context, hostType string, lg
ProjectID: NewGCEConfiguration().BuildEnv().ProjectName,
ImageRegistry: registryPrefix,
Description: fmt.Sprintf("Go Builder for %s", hostType),
DeleteIn: determineDeleteTimeout(hconf, podDeleteTimeout),
DeleteIn: determineDeleteTimeout(hconf),
OnPodCreating: func() {
lg.LogEventTime("pod_creating")
p.setPodUsed(podName, true)
Expand Down Expand Up @@ -406,7 +397,7 @@ func (p *kubeBuildletPool) String() string {
return fmt.Sprintf("Kubernetes pool capacity: %d/%d", inUse, total)
}

// CleanUpOldPods loops forever and periodically enumerates pods
// CleanUpOldPodsLoop loops forever and periodically enumerates pods
// and deletes those which have expired.
//
// A Pod is considered expired if it has a "delete-at" metadata
Expand Down
39 changes: 31 additions & 8 deletions internal/coordinator/pool/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,40 @@ func instanceName(hostType string, length int) string {
return fmt.Sprintf("buildlet-%s-rn%s", strings.TrimPrefix(hostType, "host-"), randHex(length))
}

// determineDeleteTimeout determines the buildlet timeout duration with the
// following priority:
// determineDeleteTimeout reports the buildlet delete timeout duration
// with the following priority:
//
// 1. Host type default from host config.
// 2. Global default from 'timeout'.
func determineDeleteTimeout(host *dashboard.HostConfig, timeout time.Duration) time.Duration {
if host.DeleteTimeout != 0 {
return host.DeleteTimeout
// 1. Host type override from host config.
// 2. Global default.
func determineDeleteTimeout(host *dashboard.HostConfig) time.Duration {
if host.CustomDeleteTimeout != 0 {
return host.CustomDeleteTimeout
}

return timeout
// The value we return below is effectively a global default.
//
// The comment of CleanUpOldVMs (and CleanUpOldPodsLoop) includes:
//
// This is the safety mechanism to delete VMs which stray from the
// normal deleting process. VMs are created to run a single build and
// should be shut down by a controlling process. Due to various types
// of failures, they might get stranded. To prevent them from getting
// stranded and wasting resources forever, we instead set the
// "delete-at" metadata attribute on them when created to some time
// that's well beyond their expected lifetime.
//
// Issue go.dev/issue/52929 tracks what to do about this global
// timeout in the long term. Unless something changes,
// it needs to be maintained manually so that it's always
// "well beyond their expected lifetime" of each builder that doesn't
// otherwise override this timeout—otherwise it'll cause even more
// resources to be used due the automatic (and unlimited) retrying
// as described in go.dev/issue/42699.
//
// A global timeout of 45 minutes was chosen in 2015.
// Longtest builders were added in 2018 started to reach 45 mins by 2021-2022.
// Try 2 hours next, which might last some years (depending on test volume and test speed).
return 2 * time.Hour
}

// isBuildlet checks the name string in order to determine if the name is for a buildlet.
Expand Down
20 changes: 8 additions & 12 deletions internal/coordinator/pool/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,25 @@ func TestPoolDetermineDeleteTimeout(t *testing.T) {
testCases := []struct {
desc string
hostValue time.Duration
timeout time.Duration
wantTimeout time.Duration
}{
{
desc: "from-host",
hostValue: time.Minute,
timeout: time.Second,
wantTimeout: time.Minute,
desc: "default",
wantTimeout: 2 * time.Hour,
},
{
desc: "from-argument",
hostValue: 0,
timeout: time.Second,
wantTimeout: time.Second,
desc: "from-host",
hostValue: 8 * time.Hour,
wantTimeout: 8 * time.Hour,
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
h := &dashboard.HostConfig{
DeleteTimeout: tc.hostValue,
CustomDeleteTimeout: tc.hostValue,
}
if got := determineDeleteTimeout(h, tc.timeout); got != tc.wantTimeout {
t.Errorf("determineDeleteTimeout(%+v, %s) = %s; want %s", h, tc.timeout, got, tc.wantTimeout)
if got := determineDeleteTimeout(h); got != tc.wantTimeout {
t.Errorf("determineDeleteTimeout(%+v) = %s; want %s", h, got, tc.wantTimeout)
}
})
}
Expand Down

0 comments on commit 7413a13

Please sign in to comment.