From 7413a137a563c0980f96f80dbc5c9b5d1e676616 Mon Sep 17 00:00:00 2001 From: Dmitri Shuralyov Date: Fri, 13 May 2022 12:08:32 -0400 Subject: [PATCH] cmd/coordinator: consolidate and increase global VM deletion timeout We had a lot of flexibility over timeouts, making their maintenance harder. Consolidate it to a single timeout in the pool package, and modify it from 45 minutes to 2 hours. There's room for improvement in how we maintain this timeout, but I'm leaving that for future work (with a tracking issue). Fixes golang/go#52591. Updates golang/go#52929. Updates golang/go#49666. Updates golang/go#42699. Change-Id: I2ad92648d89a714397bd8b0e1ec490fc9f6d6790 Reviewed-on: https://go-review.googlesource.com/c/build/+/406216 Run-TryBot: Dmitri Shuralyov TryBot-Result: Gopher Robot Reviewed-by: Dmitri Shuralyov Reviewed-by: Carlos Amedee Reviewed-by: Heschi Kreinick --- buildlet/buildlet.go | 2 +- buildlet/ec2.go | 1 + cmd/coordinator/coordinator.go | 9 +----- dashboard/builders.go | 24 ++++++++-------- internal/coordinator/pool/ec2.go | 12 +------- internal/coordinator/pool/ec2_test.go | 1 + internal/coordinator/pool/gce.go | 6 ++-- internal/coordinator/pool/kube.go | 13 ++------- internal/coordinator/pool/pool.go | 39 ++++++++++++++++++++------ internal/coordinator/pool/pool_test.go | 20 ++++++------- 10 files changed, 61 insertions(+), 66 deletions(-) diff --git a/buildlet/buildlet.go b/buildlet/buildlet.go index c9711cb663..08d2914444 100644 --- a/buildlet/buildlet.go +++ b/buildlet/buildlet.go @@ -40,7 +40,7 @@ type VMOpts struct { // DeleteIn optionally specifies a duration at which // to delete the VM. - // If zero, a reasonable default is used. + // If zero, a short default is used (not enough for longtest builders). // Negative means no deletion timeout. DeleteIn time.Duration diff --git a/buildlet/ec2.go b/buildlet/ec2.go index 1b2a6119da..02a3efa33f 100644 --- a/buildlet/ec2.go +++ b/buildlet/ec2.go @@ -59,6 +59,7 @@ func (c *EC2Client) StartNewVM(ctx context.Context, buildEnv *buildenv.Environme opts.Description = fmt.Sprintf("Go Builder for %s", hostType) } if opts.DeleteIn == 0 { + // Note: This implements a short default in the rare case the caller doesn't care. opts.DeleteIn = 30 * time.Minute } diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go index d5ba79f348..3b811010e2 100644 --- a/cmd/coordinator/coordinator.go +++ b/cmd/coordinator/coordinator.go @@ -131,13 +131,6 @@ var maintnerClient apipb.MaintnerServiceClient const ( maxStatusDone = 30 - - // vmDeleteTimeout and podDeleteTimeout is how long before we delete a VM. - // In practice this need only be as long as the slowest - // builder (plan9 currently), because on startup this program - // already deletes all buildlets it doesn't know about - // (i.e. ones from a previous instance of the coordinator). - vmDeleteTimeout = 45 * time.Minute ) // Fake keys signed by a fake CA. @@ -266,7 +259,7 @@ func main() { // a shared package. pool.SetBuilderMasterKey(masterKey()) - err := pool.InitGCE(sc, vmDeleteTimeout, testFiles, &basePinErr, isGCERemoteBuildlet, *buildEnvName, *mode) + err := pool.InitGCE(sc, testFiles, &basePinErr, isGCERemoteBuildlet, *buildEnvName, *mode) if err != nil { if *mode == "" { *mode = "dev" diff --git a/dashboard/builders.go b/dashboard/builders.go index 13b57fa4b4..ecdf6bd3e4 100644 --- a/dashboard/builders.go +++ b/dashboard/builders.go @@ -237,7 +237,6 @@ var Hosts = map[string]*HostConfig{ goBootstrapURLTmpl: "https://storage.googleapis.com/$BUCKET/gobootstrap-openbsd-386-go1_12.tar.gz", Notes: "OpenBSD 6.8 (with 009_exit syspatch); GCE VM is built from script in build/env/openbsd-386", SSHUsername: "gopher", - DeleteTimeout: 60 * time.Minute, // See golang/go#49666. }, "host-openbsd-amd64-70": &HostConfig{ VMImage: "openbsd-amd64-70", @@ -254,7 +253,6 @@ var Hosts = map[string]*HostConfig{ goBootstrapURLTmpl: "https://storage.googleapis.com/$BUCKET/gobootstrap-openbsd-386-go1_12.tar.gz", Notes: "OpenBSD 7.0; GCE VM is built from script in build/env/openbsd-386. n2-highcpu host.", SSHUsername: "gopher", - DeleteTimeout: 60 * time.Minute, // See golang/go#49666. }, "host-openbsd-386-70-n2d": &HostConfig{ // This host config is only for the runtime team to use investigating golang/go#49209. @@ -689,13 +687,13 @@ var Hosts = map[string]*HostConfig{ env: []string{"GOROOT_BOOTSTRAP=/usr/lib/go"}, }, "host-linux-amd64-perf": &HostConfig{ - Notes: "Cascade Lake performance testing machines", - machineType: "c2-standard-8", // C2 has precisely defined, consistent server architecture. - ContainerImage: "linux-x86-bullseye:latest", - buildletURLTmpl: "https://storage.googleapis.com/$BUCKET/buildlet.linux-amd64", - env: []string{"GOROOT_BOOTSTRAP=/go1.4"}, - SSHUsername: "root", - DeleteTimeout: 8 * time.Hour, + Notes: "Cascade Lake performance testing machines", + machineType: "c2-standard-8", // C2 has precisely defined, consistent server architecture. + ContainerImage: "linux-x86-bullseye:latest", + buildletURLTmpl: "https://storage.googleapis.com/$BUCKET/buildlet.linux-amd64", + env: []string{"GOROOT_BOOTSTRAP=/go1.4"}, + SSHUsername: "root", + CustomDeleteTimeout: 8 * time.Hour, }, } @@ -786,8 +784,12 @@ type HostConfig struct { // EC2 options isEC2 bool // if true, the instance is configured to run on EC2 - // GCE or EC2 options - DeleteTimeout time.Duration // Timeout after which the VM is destroyed. Zero duration uses global default. + // GCE or EC2 options: + // + // CustomDeleteTimeout is an optional custom timeout after which the VM is forcibly destroyed and the build is retried. + // Zero duration defers decision to internal/coordinator/pool package, which is enough for longtest post-submit builds. + // (This is generally an internal implementation detail, currently left behind only for the -perf builder.) + CustomDeleteTimeout time.Duration // Reverse options ExpectNum int // expected number of reverse buildlets of this type diff --git a/internal/coordinator/pool/ec2.go b/internal/coordinator/pool/ec2.go index 07ce2cbfbb..7780a86adc 100644 --- a/internal/coordinator/pool/ec2.go +++ b/internal/coordinator/pool/ec2.go @@ -64,13 +64,6 @@ type awsClient interface { // EC2Opt is optional configuration for the the buildlet. type EC2Opt func(*EC2Buildlet) -// WithVMDeleteTimeout sets the VM deletion timeout for all EC2 VMs. -func WithVMDeleteTimeout(timeout time.Duration) EC2Opt { - return func(eb *EC2Buildlet) { - eb.vmDeleteTimeout = timeout - } -} - // EC2Buildlet manages a pool of AWS EC2 buildlets. type EC2Buildlet struct { // awsClient is the client used to interact with AWS services. @@ -92,8 +85,6 @@ type EC2Buildlet struct { ledger *ledger // cancelPoll will signal to the pollers to discontinue polling. cancelPoll context.CancelFunc - // vmDeleteTimeout contains the timeout used to determine if a VM should be deleted. - vmDeleteTimeout time.Duration // pollWait waits for all pollers to terminate polling. pollWait sync.WaitGroup } @@ -121,7 +112,6 @@ func NewEC2Buildlet(client *cloud.AWSClient, buildEnv *buildenv.Environment, hos hosts: hosts, isRemoteBuildlet: fn, ledger: newLedger(), - vmDeleteTimeout: 45 * time.Minute, // default VM delete timeout } for _, opt := range opts { opt(b) @@ -195,7 +185,7 @@ func (eb *EC2Buildlet) GetBuildlet(ctx context.Context, hostType string, lg Logg Zone: "", // allow the EC2 api pick an availability zone with capacity TLS: kp, Meta: make(map[string]string), - DeleteIn: determineDeleteTimeout(hconf, eb.vmDeleteTimeout), + DeleteIn: determineDeleteTimeout(hconf), OnInstanceRequested: func() { log.Printf("EC2 VM %q now booting", instName) }, diff --git a/internal/coordinator/pool/ec2_test.go b/internal/coordinator/pool/ec2_test.go index cb40d5331e..eb7e94374b 100644 --- a/internal/coordinator/pool/ec2_test.go +++ b/internal/coordinator/pool/ec2_test.go @@ -578,6 +578,7 @@ func (f *fakeEC2BuildletClient) StartNewVM(ctx context.Context, buildEnv *builde return nil, fmt.Errorf("invalid vmName: %q and hostType: %q", vmName, hostType) } if opts.DeleteIn == 0 { + // Note: This implements a short default in the rare case the caller doesn't care. opts.DeleteIn = 30 * time.Minute } if !f.createVMRequestSuccess { diff --git a/internal/coordinator/pool/gce.go b/internal/coordinator/pool/gce.go index 1858a0ec60..34e42635f8 100644 --- a/internal/coordinator/pool/gce.go +++ b/internal/coordinator/pool/gce.go @@ -84,16 +84,14 @@ var ( // values created due to separating the buildlet pools into a separate package gceMode string - deleteTimeout time.Duration testFiles map[string]string basePinErr *atomic.Value isGCERemoteBuildlet IsRemoteBuildletFunc ) // InitGCE initializes the GCE buildlet pool. -func InitGCE(sc *secret.Client, vmDeleteTimeout time.Duration, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error { +func InitGCE(sc *secret.Client, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error { gceMode = mode - deleteTimeout = vmDeleteTimeout testFiles = tFiles basePinErr = basePin isGCERemoteBuildlet = fn @@ -417,7 +415,7 @@ func (p *GCEBuildlet) GetBuildlet(ctx context.Context, hostType string, lg Logge log.Printf("Creating GCE VM %q for %s at %s", instName, hostType, zone) bc, err = buildlet.StartNewVM(gcpCreds, buildEnv, instName, hostType, buildlet.VMOpts{ - DeleteIn: determineDeleteTimeout(hconf, deleteTimeout), + DeleteIn: determineDeleteTimeout(hconf), OnInstanceRequested: func() { log.Printf("GCE VM %q now booting", instName) }, diff --git a/internal/coordinator/pool/kube.go b/internal/coordinator/pool/kube.go index 3e762b2b45..b67a015aca 100644 --- a/internal/coordinator/pool/kube.go +++ b/internal/coordinator/pool/kube.go @@ -33,15 +33,6 @@ import ( This file implements the Kubernetes-based buildlet pool. */ -const ( - // podDeleteTimeout is how long before we delete a VM. - // In practice this need only be as long as the slowest - // builder (plan9 currently), because on startup this program - // already deletes all buildlets it doesn't know about - // (i.e. ones from a previous instance of the coordinator). - podDeleteTimeout = 45 * time.Minute -) - // Initialized by initKube: var ( buildletsKubeClient *kubernetes.Client // for "buildlets" cluster @@ -270,7 +261,7 @@ func (p *kubeBuildletPool) GetBuildlet(ctx context.Context, hostType string, lg ProjectID: NewGCEConfiguration().BuildEnv().ProjectName, ImageRegistry: registryPrefix, Description: fmt.Sprintf("Go Builder for %s", hostType), - DeleteIn: determineDeleteTimeout(hconf, podDeleteTimeout), + DeleteIn: determineDeleteTimeout(hconf), OnPodCreating: func() { lg.LogEventTime("pod_creating") p.setPodUsed(podName, true) @@ -406,7 +397,7 @@ func (p *kubeBuildletPool) String() string { return fmt.Sprintf("Kubernetes pool capacity: %d/%d", inUse, total) } -// CleanUpOldPods loops forever and periodically enumerates pods +// CleanUpOldPodsLoop loops forever and periodically enumerates pods // and deletes those which have expired. // // A Pod is considered expired if it has a "delete-at" metadata diff --git a/internal/coordinator/pool/pool.go b/internal/coordinator/pool/pool.go index f4ac79f20a..b1e67a1df6 100644 --- a/internal/coordinator/pool/pool.go +++ b/internal/coordinator/pool/pool.go @@ -68,17 +68,40 @@ func instanceName(hostType string, length int) string { return fmt.Sprintf("buildlet-%s-rn%s", strings.TrimPrefix(hostType, "host-"), randHex(length)) } -// determineDeleteTimeout determines the buildlet timeout duration with the -// following priority: +// determineDeleteTimeout reports the buildlet delete timeout duration +// with the following priority: // -// 1. Host type default from host config. -// 2. Global default from 'timeout'. -func determineDeleteTimeout(host *dashboard.HostConfig, timeout time.Duration) time.Duration { - if host.DeleteTimeout != 0 { - return host.DeleteTimeout +// 1. Host type override from host config. +// 2. Global default. +func determineDeleteTimeout(host *dashboard.HostConfig) time.Duration { + if host.CustomDeleteTimeout != 0 { + return host.CustomDeleteTimeout } - return timeout + // The value we return below is effectively a global default. + // + // The comment of CleanUpOldVMs (and CleanUpOldPodsLoop) includes: + // + // This is the safety mechanism to delete VMs which stray from the + // normal deleting process. VMs are created to run a single build and + // should be shut down by a controlling process. Due to various types + // of failures, they might get stranded. To prevent them from getting + // stranded and wasting resources forever, we instead set the + // "delete-at" metadata attribute on them when created to some time + // that's well beyond their expected lifetime. + // + // Issue go.dev/issue/52929 tracks what to do about this global + // timeout in the long term. Unless something changes, + // it needs to be maintained manually so that it's always + // "well beyond their expected lifetime" of each builder that doesn't + // otherwise override this timeout—otherwise it'll cause even more + // resources to be used due the automatic (and unlimited) retrying + // as described in go.dev/issue/42699. + // + // A global timeout of 45 minutes was chosen in 2015. + // Longtest builders were added in 2018 started to reach 45 mins by 2021-2022. + // Try 2 hours next, which might last some years (depending on test volume and test speed). + return 2 * time.Hour } // isBuildlet checks the name string in order to determine if the name is for a buildlet. diff --git a/internal/coordinator/pool/pool_test.go b/internal/coordinator/pool/pool_test.go index 266d1cd20e..7bceb61ab6 100644 --- a/internal/coordinator/pool/pool_test.go +++ b/internal/coordinator/pool/pool_test.go @@ -18,29 +18,25 @@ func TestPoolDetermineDeleteTimeout(t *testing.T) { testCases := []struct { desc string hostValue time.Duration - timeout time.Duration wantTimeout time.Duration }{ { - desc: "from-host", - hostValue: time.Minute, - timeout: time.Second, - wantTimeout: time.Minute, + desc: "default", + wantTimeout: 2 * time.Hour, }, { - desc: "from-argument", - hostValue: 0, - timeout: time.Second, - wantTimeout: time.Second, + desc: "from-host", + hostValue: 8 * time.Hour, + wantTimeout: 8 * time.Hour, }, } for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { h := &dashboard.HostConfig{ - DeleteTimeout: tc.hostValue, + CustomDeleteTimeout: tc.hostValue, } - if got := determineDeleteTimeout(h, tc.timeout); got != tc.wantTimeout { - t.Errorf("determineDeleteTimeout(%+v, %s) = %s; want %s", h, tc.timeout, got, tc.wantTimeout) + if got := determineDeleteTimeout(h); got != tc.wantTimeout { + t.Errorf("determineDeleteTimeout(%+v) = %s; want %s", h, got, tc.wantTimeout) } }) }