Skip to content

Commit

Permalink
buildlet: fix Exec to return ErrTimeout on timeout
Browse files Browse the repository at this point in the history
The coordinator relies on Exec reporting that the given timeout was
exceeded in order to mark a build as failed instead of retrying it.
A refactor resulted in Exec no longer doing that, despite what its
documentation promises, so fix that.

Also add a test since evidence shows that catching a regression can
be helpful.

For golang/go#42699.
Updates golang/go#35707.

Change-Id: Iacef90b83e7b81fad88a33baa6489d5157e3528f
Reviewed-on: https://go-review.googlesource.com/c/build/+/407555
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
  • Loading branch information
dmitshur authored and gopherbot committed May 27, 2022
1 parent a3f1e41 commit 33d38b8
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 8 deletions.
24 changes: 16 additions & 8 deletions buildlet/buildletclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,8 @@ type ExecOpts struct {
OnStartExec func()
}

// ErrTimeout is a sentinel error that represents that waiting
// for a command to complete has exceeded the given timeout.
var ErrTimeout = errors.New("buildlet: timeout waiting for command to complete")

// Exec runs cmd on the buildlet.
Expand All @@ -519,8 +521,8 @@ var ErrTimeout = errors.New("buildlet: timeout waiting for command to complete")
// seen to completition. If execErr is non-nil, the remoteErr is
// meaningless.
//
// If the context's deadline is exceeded, the returned execErr is
// ErrTimeout.
// If the context's deadline is exceeded while waiting for the command
// to complete, the returned execErr is ErrTimeout.
func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr, execErr error) {
var mode string
if opts.SystemLevel {
Expand Down Expand Up @@ -553,10 +555,11 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
// (Atlanta, Paris, Sydney, etc.) the reverse buildlet is:
res, err := c.doHeaderTimeout(req, 20*time.Second)
if err == errHeaderTimeout {
// If we don't see headers after all that time,
// consider the buildlet to be unhealthy.
c.MarkBroken()
return nil, errors.New("buildlet: timeout waiting for exec header response")
}
if err != nil {
} else if err != nil {
return nil, err
}
defer res.Body.Close()
Expand All @@ -577,7 +580,7 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
out = ioutil.Discard
}
if _, err := io.Copy(out, res.Body); err != nil {
resc <- errs{execErr: fmt.Errorf("error copying response: %v", err)}
resc <- errs{execErr: fmt.Errorf("error copying response: %w", err)}
return
}

Expand All @@ -600,10 +603,15 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
select {
case res := <-resc:
if res.execErr != nil {
// Note: We've historically marked the buildlet as unhealthy after
// reaching any kind of execution error, even when it's a remote command
// execution timeout (see use of ErrTimeout below).
// This is certainly on the safer side of avoiding false positive signal,
// but maybe someday we'll want to start to rely on the buildlet to report
// such a condition and not mark it as unhealthy.

c.MarkBroken()
if res.execErr == context.DeadlineExceeded {
// Historical pre-context value.
// TODO: update docs & callers to just use the context value.
if errors.Is(res.execErr, context.DeadlineExceeded) {
res.execErr = ErrTimeout
}
}
Expand Down
54 changes: 54 additions & 0 deletions buildlet/buildletclient_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ package buildlet
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"net"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"testing"
)
Expand Down Expand Up @@ -171,3 +173,55 @@ func createKeyPair(t *testing.T) KeyPair {
}
return kp
}

// Test that Exec returns ErrTimeout upon reaching the context timeout
// during command execution, as its documentation promises.
func TestExecTimeoutError(t *testing.T) {
mux := http.NewServeMux()
mux.HandleFunc("/status", func(w http.ResponseWriter, req *http.Request) {
json.NewEncoder(w).Encode(Status{})
})
mux.HandleFunc("/exec", func(w http.ResponseWriter, req *http.Request) {
w.Write([]byte("."))
w.(http.Flusher).Flush() // /exec needs to flush headers right away.
<-req.Context().Done() // Simulate that execution hangs, so no more output.
})
ts := httptest.NewServer(mux)
defer ts.Close()
u, err := url.Parse(ts.URL)
if err != nil {
t.Fatalf("unable to parse http server url %s", err)
}
cl := NewClient(u.Host, NoKeyPair)
defer cl.Close()

// Use a custom context that reports context.DeadlineExceeded
// after Exec starts command execution. (context.WithTimeout
// requires us to select an arbitrary duration, which might
// not be long enough or will make the test take too long.)
ctx := deadlineOnDemandContext{
Context: context.Background(),
done: make(chan struct{}),
}
_, execErr := cl.Exec(ctx, "./bin/test", ExecOpts{
OnStartExec: func() { close(ctx.done) },
})
if execErr != ErrTimeout {
t.Errorf("cl.Exec error = %v; want %v", execErr, ErrTimeout)
}
}

type deadlineOnDemandContext struct {
context.Context
done chan struct{}
}

func (c deadlineOnDemandContext) Done() <-chan struct{} { return c.done }
func (c deadlineOnDemandContext) Err() error {
select {
default:
return nil
case <-c.done:
return context.DeadlineExceeded
}
}

0 comments on commit 33d38b8

Please sign in to comment.