Skip to content

Commit

Permalink
internal: fix GO_AWAY deadlock
Browse files Browse the repository at this point in the history
A deadlock can occur when a GO_AWAY is followed by a connection closure. This
happens because onClose needlessly closes the current ac.transport: if a
GO_AWAY already occured, and the transport was already reset, then the later
closure (of the original address) sets ac.transport - which is now healthy -
to nil.

The manifestation of this problem is that picker_wrapper spins forever trying
to use a READY connection whose ac.transport is nil.
  • Loading branch information
jeanbza committed Oct 19, 2018
1 parent 0ee1544 commit 258cbc5
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 3 deletions.
3 changes: 0 additions & 3 deletions clientconn.go
Original file line number Diff line number Diff line change
Expand Up @@ -1066,9 +1066,6 @@ func (ac *addrConn) createTransport(backoffNum int, addr resolver.Address, copts
case <-skipReset: // The outer resetTransport loop will handle reconnection.
return
case <-allowedToReset: // We're in the clear to reset.
ac.mu.Lock()
ac.transport = nil
ac.mu.Unlock()
oneReset.Do(func() { ac.resetTransport(false) })
}
}
Expand Down
96 changes: 96 additions & 0 deletions test/end2end_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ import (
healthgrpc "google.golang.org/grpc/health/grpc_health_v1"
healthpb "google.golang.org/grpc/health/grpc_health_v1"
"google.golang.org/grpc/internal/channelz"
"google.golang.org/grpc/internal/grpcsync"
"google.golang.org/grpc/internal/leakcheck"
"google.golang.org/grpc/internal/testutils"
"google.golang.org/grpc/keepalive"
Expand Down Expand Up @@ -6994,3 +6995,98 @@ func testLargeTimeout(t *testing.T, e env) {
}
}
}

// Proxies typically send GO_AWAY followed by connection closure a minute or so later. This
// test ensures that the connection is re-created after GO_AWAY and not affected by the
// subsequent (old) connection closure.
func TestGoAwayThenClose(t *testing.T) {
defer leakcheck.Check(t)

ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
defer cancel()

lis1, err := net.Listen("tcp", "localhost:0")
if err != nil {
t.Fatalf("Error while listening. Err: %v", err)
}
s1 := grpc.NewServer()
defer s1.Stop()
ts1 := &funcServer{
unaryCall: func(ctx context.Context, in *testpb.SimpleRequest) (*testpb.SimpleResponse, error) {
return &testpb.SimpleResponse{}, nil
},
fullDuplexCall: func(stream testpb.TestService_FullDuplexCallServer) error {
// Wait forever.
_, err := stream.Recv()
if err == nil {
t.Error("expected to never receive any message")
}
return err
},
}
testpb.RegisterTestServiceServer(s1, ts1)
go s1.Serve(lis1)

lis2, err := net.Listen("tcp", "localhost:0")
if err != nil {
t.Fatalf("Error while listening. Err: %v", err)
}
s2 := grpc.NewServer()
defer s2.Stop()
conn2Ready := grpcsync.NewEvent()
ts2 := &funcServer{unaryCall: func(ctx context.Context, in *testpb.SimpleRequest) (*testpb.SimpleResponse, error) {
conn2Ready.Fire()
return &testpb.SimpleResponse{}, nil
}}
testpb.RegisterTestServiceServer(s2, ts2)
go s2.Serve(lis2)

r, rcleanup := manual.GenerateAndRegisterManualResolver()
defer rcleanup()
r.InitialAddrs([]resolver.Address{
{Addr: lis1.Addr().String()},
{Addr: lis2.Addr().String()},
})
cc, err := grpc.DialContext(ctx, r.Scheme()+":///", grpc.WithInsecure())
if err != nil {
t.Fatalf("Error creating client: %v", err)
}
defer cc.Close()

client := testpb.NewTestServiceClient(cc)

// Should go on connection 1.
_, err = client.FullDuplexCall(ctx)
if err != nil {
t.Fatalf("UnaryCall(_) = _, %v; want _, nil", err)
}

// Send GO_AWAY to connection 1.
go s1.GracefulStop()

timeout := time.After(5 * time.Second)
// Wait for connection 2 to be established.
for done := false; !done; {
select {
case <-conn2Ready.Done():
done = true
case <-timeout:
t.Fatal("timed out waiting for connection to server 2 to be established")
default:
// Might fail because conn is draining, or not because race condition between this and GracefulStop being
// called.
client.UnaryCall(ctx, &testpb.SimpleRequest{})
}
}

// Close connection 1.
s1.Stop()

// Wait for client to close.
time.Sleep(100 * time.Millisecond)

// Should go on connection 2.
if _, err := client.UnaryCall(ctx, &testpb.SimpleRequest{}); err != nil {
t.Fatalf("UnaryCall(_) = _, %v; want _, nil", err)
}
}

0 comments on commit 258cbc5

Please sign in to comment.