From 36f05bc8fb4ee3aa4bc0992a00e4ea66605bbf09 Mon Sep 17 00:00:00 2001 From: cskh Date: Thu, 1 Dec 2022 10:39:09 -0500 Subject: [PATCH] integ-test: test consul upgrade from the snapshot of a running cluster (#15595) * integ-test: test consul upgrade from the snapshot of a running cluster * use Target version as default Co-authored-by: Dan Stough --- agent/config/config.go | 2 +- test/integration/consul-container/go.mod | 2 + test/integration/consul-container/go.sum | 2 + .../consul-container/libs/agent/agent.go | 2 + .../consul-container/libs/agent/builder.go | 14 +- .../consul-container/libs/agent/container.go | 54 +++++--- .../consul-container/libs/cluster/cluster.go | 60 +++++++- .../test/basic/connect_service_test.go | 8 +- .../test/upgrade/fullstopupgrade_test.go | 129 ++++++++++++++++++ .../test/upgrade/healthcheck_test.go | 12 +- 10 files changed, 257 insertions(+), 28 deletions(-) create mode 100644 test/integration/consul-container/test/upgrade/fullstopupgrade_test.go diff --git a/agent/config/config.go b/agent/config/config.go index a82f83916e94..83d0548feefc 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -198,7 +198,7 @@ type Config struct { NodeID *string `mapstructure:"node_id" json:"node_id,omitempty"` NodeMeta map[string]string `mapstructure:"node_meta" json:"node_meta,omitempty"` NodeName *string `mapstructure:"node_name" json:"node_name,omitempty"` - Peering Peering `mapstructure:"peering" json:"peering,omitempty"` + Peering Peering `mapstructure:"peering" json:"-"` Performance Performance `mapstructure:"performance" json:"-"` PidFile *string `mapstructure:"pid_file" json:"pid_file,omitempty"` Ports Ports `mapstructure:"ports" json:"ports,omitempty"` diff --git a/test/integration/consul-container/go.mod b/test/integration/consul-container/go.mod index 2fe937e55ec6..bdfa9bdb7cfc 100644 --- a/test/integration/consul-container/go.mod +++ b/test/integration/consul-container/go.mod @@ -11,9 +11,11 @@ require ( github.com/hashicorp/serf v0.10.1 github.com/itchyny/gojq v0.12.9 github.com/pkg/errors v0.9.1 + github.com/rogpeppe/go-internal v1.3.0 github.com/stretchr/testify v1.8.0 github.com/teris-io/shortid v0.0.0-20220617161101-71ec9f2aa569 github.com/testcontainers/testcontainers-go v0.13.0 + golang.org/x/mod v0.4.2 ) require ( diff --git a/test/integration/consul-container/go.sum b/test/integration/consul-container/go.sum index 08eee6c20e2f..ca83b20fc48f 100644 --- a/test/integration/consul-container/go.sum +++ b/test/integration/consul-container/go.sum @@ -857,6 +857,7 @@ github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40T github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.2.2/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/rogpeppe/go-internal v1.3.0 h1:RR9dF3JtopPvtkroDZuVD7qquD0bnHlKSqaQhgwt8yk= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= @@ -1037,6 +1038,7 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= diff --git a/test/integration/consul-container/libs/agent/agent.go b/test/integration/consul-container/libs/agent/agent.go index e964757797f3..e9dad59ac7a0 100644 --- a/test/integration/consul-container/libs/agent/agent.go +++ b/test/integration/consul-container/libs/agent/agent.go @@ -17,6 +17,8 @@ type Agent interface { RegisterTermination(func() error) Terminate() error Upgrade(ctx context.Context, config Config, index int) error + Exec(ctx context.Context, cmd []string) (int, error) + DataDir() string } // Config is a set of configurations required to create a Agent diff --git a/test/integration/consul-container/libs/agent/builder.go b/test/integration/consul-container/libs/agent/builder.go index 831e1399020c..ccb51b161d71 100644 --- a/test/integration/consul-container/libs/agent/builder.go +++ b/test/integration/consul-container/libs/agent/builder.go @@ -5,6 +5,7 @@ import ( "path/filepath" "github.com/pkg/errors" + "golang.org/x/mod/semver" agentconfig "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/test/integration/consul-container/libs/utils" @@ -26,6 +27,7 @@ type BuildContext struct { injectAutoEncryption bool // initialize the built-in CA and set up agents to use auto-encrpt injectCerts bool // initializes the built-in CA and distributes client certificates to agents injectGossipEncryption bool // setup the agents to use a gossip encryption key + consulVersion string } // BuildOptions define the desired automated test setup overrides that are @@ -35,6 +37,7 @@ type BuildOptions struct { InjectCerts bool // Provides a CA for all agents and (future) agent certs. InjectAutoEncryption bool // Configures auto-encrypt for TLS and sets up certs. Overrides InjectCerts. InjectGossipEncryption bool // Provides a gossip encryption key for all agents. + ConsulVersion string // The default Consul version for agents in the cluster when none is specified. } func NewBuildContext(opts BuildOptions) (*BuildContext, error) { @@ -43,6 +46,11 @@ func NewBuildContext(opts BuildOptions) (*BuildContext, error) { injectAutoEncryption: opts.InjectAutoEncryption, injectCerts: opts.InjectCerts, injectGossipEncryption: opts.InjectGossipEncryption, + consulVersion: opts.ConsulVersion, + } + + if opts.ConsulVersion == "" { + ctx.consulVersion = *utils.TargetVersion } if opts.InjectGossipEncryption { @@ -105,12 +113,16 @@ func NewConfigBuilder(ctx *BuildContext) *Builder { HTTP: nil, HTTPS: utils.IntToPointer(8501), GRPC: utils.IntToPointer(8502), - GRPCTLS: utils.IntToPointer(8503), SerfLAN: utils.IntToPointer(8301), SerfWAN: utils.IntToPointer(8302), Server: utils.IntToPointer(8300), } + if ctx != nil && (ctx.consulVersion == "local" || semver.Compare("v"+ctx.consulVersion, "v1.14.0") >= 0) { + // Enable GRPCTLS for version after v1.14.0 + b.conf.Ports.GRPCTLS = utils.IntToPointer(8503) + } + return b } diff --git a/test/integration/consul-container/libs/agent/container.go b/test/integration/consul-container/libs/agent/container.go index 3fe4b70bdcf3..0601c25cf0f7 100644 --- a/test/integration/consul-container/libs/agent/container.go +++ b/test/integration/consul-container/libs/agent/container.go @@ -109,11 +109,6 @@ func NewConsulContainer(ctx context.Context, config Config, network string, inde return nil, err } - localIP, err := podContainer.Host(ctx) - if err != nil { - return nil, err - } - mappedPort, err := podContainer.MappedPort(ctx, "8500") if err != nil { return nil, err @@ -136,7 +131,10 @@ func NewConsulContainer(ctx context.Context, config Config, network string, inde Prefix: name, }) - uri := fmt.Sprintf("http://%s:%s", localIP, mappedPort.Port()) + uri, err := podContainer.Endpoint(ctx, "http") + if err != nil { + return nil, err + } apiConfig := api.DefaultConfig() apiConfig.Address = uri apiClient, err := api.NewClient(apiConfig) @@ -197,6 +195,10 @@ func (c *consulContainerNode) RegisterTermination(f func() error) { c.terminateFuncs = append(c.terminateFuncs, f) } +func (c *consulContainerNode) Exec(ctx context.Context, cmd []string) (int, error) { + return c.container.Exec(ctx, cmd) +} + func (c *consulContainerNode) Upgrade(ctx context.Context, config Config, index int) error { pc, err := readSomeConfigFileFields(config.JSON) if err != nil { @@ -237,14 +239,18 @@ func (c *consulContainerNode) Upgrade(ctx context.Context, config Config, index _, consulReq2 := newContainerRequest(config, opts) consulReq2.Env = c.consulReq.Env // copy license - _ = c.container.StopLogProducer() - if err := c.container.Terminate(ctx); err != nil { - return err + if c.container != nil { + _ = c.container.StopLogProducer() + if err := c.container.Terminate(c.ctx); err != nil { + return err + } } c.consulReq = consulReq2 container, err := startContainer(ctx, c.consulReq) + c.ctx = ctx + c.container = container if err != nil { return err } @@ -256,8 +262,6 @@ func (c *consulContainerNode) Upgrade(ctx context.Context, config Config, index Prefix: name, }) - c.container = container - return nil } @@ -265,13 +269,12 @@ func (c *consulContainerNode) Upgrade(ctx context.Context, config Config, index // This might also include running termination functions for containers associated with the agent. // On failure, an error will be returned and the reaper process (RYUK) will handle cleanup. func (c *consulContainerNode) Terminate() error { - // Services might register a termination function that should also fire // when the "agent" is cleaned up for _, f := range c.terminateFuncs { err := f() if err != nil { - + continue } } @@ -279,10 +282,17 @@ func (c *consulContainerNode) Terminate() error { return nil } - err := c.container.StopLogProducer() - - if err1 := c.container.Terminate(c.ctx); err == nil { - err = err1 + state, err := c.container.State(context.Background()) + if err == nil && state.Running { + // StopLogProducer can only be called on running containers + err = c.container.StopLogProducer() + if err1 := c.container.Terminate(c.ctx); err == nil { + err = err1 + } + } else { + if err1 := c.container.Terminate(c.ctx); err == nil { + err = err1 + } } c.container = nil @@ -290,7 +300,13 @@ func (c *consulContainerNode) Terminate() error { return err } +func (c *consulContainerNode) DataDir() string { + return c.dataDir +} + func startContainer(ctx context.Context, req testcontainers.ContainerRequest) (testcontainers.Container, error) { + ctx, cancel := context.WithTimeout(ctx, time.Second*40) + defer cancel() return testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ ContainerRequest: req, Started: true, @@ -313,12 +329,14 @@ type containerOpts struct { func newContainerRequest(config Config, opts containerOpts) (podRequest, consulRequest testcontainers.ContainerRequest) { skipReaper := isRYUKDisabled() + httpPort := "8500" + pod := testcontainers.ContainerRequest{ Image: pauseImage, AutoRemove: false, Name: opts.name + "-pod", SkipReaper: skipReaper, - ExposedPorts: []string{"8500/tcp"}, + ExposedPorts: []string{httpPort + "/tcp"}, Hostname: opts.hostname, Networks: opts.addtionalNetworks, } diff --git a/test/integration/consul-container/libs/cluster/cluster.go b/test/integration/consul-container/libs/cluster/cluster.go index 81eb8052302c..1e37af3cf883 100644 --- a/test/integration/consul-container/libs/cluster/cluster.go +++ b/test/integration/consul-container/libs/cluster/cluster.go @@ -3,6 +3,8 @@ package cluster import ( "context" "fmt" + "io/ioutil" + "path/filepath" "strings" "testing" "time" @@ -121,6 +123,60 @@ func (c *Cluster) Remove(n libagent.Agent) error { return nil } +// StandardUpgrade upgrades a running consul cluster following the steps from +// +// https://developer.hashicorp.com/consul/docs/upgrading#standard-upgrades +// +// - takes a snapshot +// - terminate and rejoin the new version of consul +func (c *Cluster) StandardUpgrade(t *testing.T, ctx context.Context, targetVersion string) error { + execCode, err := c.Agents[0].Exec(context.Background(), []string{"consul", "snapshot", "save", "backup.snap"}) + if execCode != 0 { + return fmt.Errorf("error taking snapshot of the cluster, returned code %d", execCode) + } + if err != nil { + return err + } + + // verify only the leader can take a snapshot + snapshotCount := 0 + for _, agent := range c.Agents { + files, err := ioutil.ReadDir(filepath.Join(agent.DataDir(), "raft", "snapshots")) + if err != nil { + return err + } + if len(files) >= 1 { + snapshotCount++ + } + } + + if snapshotCount != 1 { + return fmt.Errorf("only leader agent can have a snapshot file, got %d", snapshotCount) + } + + // Upgrade individual agent to the target version + client := c.Agents[0].GetClient() + for _, agent := range c.Agents { + agent.Terminate() + if len(c.Agents) > 3 { + WaitForLeader(t, c, client) + } else { + time.Sleep(1 * time.Second) + } + config := agent.GetConfig() + config.Version = targetVersion + err = agent.Upgrade(context.Background(), config, 1) + if err != nil { + return err + } + + // wait until the agent rejoin + WaitForLeader(t, c, client) + WaitForMembers(t, client, len(c.Agents)) + } + return nil +} + // Terminate will attempt to terminate all agents in the cluster and its network. If any agent // termination fails, Terminate will abort and return an error. func (c *Cluster) Terminate() error { @@ -213,7 +269,7 @@ func (c *Cluster) Clients() ([]libagent.Agent, error) { return clients, nil } -const retryTimeout = 20 * time.Second +const retryTimeout = 90 * time.Second const retryFrequency = 500 * time.Millisecond func LongFailer() *retry.Timer { @@ -250,6 +306,6 @@ func WaitForMembers(t *testing.T, client *api.Client, expectN int) { } } require.NoError(r, err) - require.Equal(r, activeMembers, expectN) + require.Equal(r, expectN, activeMembers) }) } diff --git a/test/integration/consul-container/test/basic/connect_service_test.go b/test/integration/consul-container/test/basic/connect_service_test.go index 099b7d728d82..2ee6bc7af8eb 100644 --- a/test/integration/consul-container/test/basic/connect_service_test.go +++ b/test/integration/consul-container/test/basic/connect_service_test.go @@ -17,10 +17,10 @@ import ( // A simulated client (a direct HTTP call) talks to it's upstream proxy through the // // Steps: -// * Create a single agent cluster. -// * Create the example static-server and sidecar containers, then register them both with Consul -// * Create an example static-client sidecar, then register both the service and sidecar with Consul -// * Make sure a call to the client sidecar local bind port returns a response from the upstream, static-server +// - Create a single agent cluster. +// - Create the example static-server and sidecar containers, then register them both with Consul +// - Create an example static-client sidecar, then register both the service and sidecar with Consul +// - Make sure a call to the client sidecar local bind port returns a response from the upstream, static-server func TestBasicConnectService(t *testing.T) { cluster := createCluster(t) defer terminate(t, cluster) diff --git a/test/integration/consul-container/test/upgrade/fullstopupgrade_test.go b/test/integration/consul-container/test/upgrade/fullstopupgrade_test.go new file mode 100644 index 000000000000..49973c813f8f --- /dev/null +++ b/test/integration/consul-container/test/upgrade/fullstopupgrade_test.go @@ -0,0 +1,129 @@ +package upgrade + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/hashicorp/consul/api" + + "github.com/hashicorp/consul/sdk/testutil/retry" + libagent "github.com/hashicorp/consul/test/integration/consul-container/libs/agent" + libcluster "github.com/hashicorp/consul/test/integration/consul-container/libs/cluster" + "github.com/hashicorp/consul/test/integration/consul-container/libs/utils" +) + +// Test upgrade a cluster of latest version to the target version +func TestStandardUpgradeToTarget_fromLatest(t *testing.T) { + type testcase struct { + oldversion string + targetVersion string + expectErr bool + } + tcs := []testcase{ + // Use the case of "1.12.3" ==> "1.13.0" to verify the test can + // catch the upgrade bug found in snapshot of 1.13.0 + { + oldversion: "1.12.3", + targetVersion: "1.13.0", + expectErr: true, + }, + { + oldversion: "1.13", + targetVersion: *utils.TargetVersion, + }, + { + oldversion: "1.14", + targetVersion: *utils.TargetVersion, + }, + } + + run := func(t *testing.T, tc testcase) { + + var configs []libagent.Config + + configCtx, err := libagent.NewBuildContext(libagent.BuildOptions{ + ConsulVersion: tc.oldversion, + }) + require.NoError(t, err) + numServers := 1 + leaderConf, err := libagent.NewConfigBuilder(configCtx). + Bootstrap(numServers). + ToAgentConfig() + require.NoError(t, err) + t.Logf("Cluster config:\n%s", leaderConf.JSON) + leaderConf.Version = tc.oldversion + for i := 0; i < numServers; i++ { + configs = append(configs, *leaderConf) + } + + cluster, err := libcluster.New(configs) + require.NoError(t, err) + defer terminate(t, cluster) + + server := cluster.Agents[0] + client := server.GetClient() + libcluster.WaitForLeader(t, cluster, client) + libcluster.WaitForMembers(t, client, numServers) + + // Create a service to be stored in the snapshot + serviceName := "api" + index := serviceCreate(t, client, serviceName) + ch := make(chan []*api.ServiceEntry) + errCh := make(chan error) + go func() { + service, q, err := client.Health().Service(serviceName, "", false, &api.QueryOptions{WaitIndex: index}) + if err == nil && q.QueryBackend != api.QueryBackendStreaming { + err = fmt.Errorf("invalid backend for this test %s", q.QueryBackend) + } + if err != nil { + errCh <- err + } else { + ch <- service + } + }() + require.NoError(t, client.Agent().ServiceRegister( + &api.AgentServiceRegistration{Name: serviceName, Port: 9998}, + )) + timer := time.NewTimer(1 * time.Second) + select { + case err := <-errCh: + require.NoError(t, err) + case service := <-ch: + require.Len(t, service, 1) + require.Equal(t, serviceName, service[0].Service.Service) + require.Equal(t, 9998, service[0].Service.Port) + case <-timer.C: + t.Fatalf("test timeout") + } + + // upgrade the cluster to the Target version + err = cluster.StandardUpgrade(t, context.Background(), tc.targetVersion) + if !tc.expectErr { + require.NoError(t, err) + libcluster.WaitForLeader(t, cluster, client) + libcluster.WaitForMembers(t, client, numServers) + + // Verify service is restored from the snapshot + retry.RunWith(&retry.Timer{Timeout: 5 * time.Second, Wait: 500 * time.Microsecond}, t, func(r *retry.R) { + service, _, err := client.Catalog().Service(serviceName, "", &api.QueryOptions{}) + require.NoError(r, err) + require.Len(r, service, 1) + require.Equal(r, serviceName, service[0].ServiceName) + }) + } else { + require.Error(t, fmt.Errorf("context deadline exceeded")) + } + } + + for _, tc := range tcs { + t.Run(fmt.Sprintf("upgrade from %s to %s", tc.oldversion, tc.targetVersion), + func(t *testing.T) { + run(t, tc) + }) + time.Sleep(5 * time.Second) + } +} diff --git a/test/integration/consul-container/test/upgrade/healthcheck_test.go b/test/integration/consul-container/test/upgrade/healthcheck_test.go index f318faf60ed4..b3d49523ed3a 100644 --- a/test/integration/consul-container/test/upgrade/healthcheck_test.go +++ b/test/integration/consul-container/test/upgrade/healthcheck_test.go @@ -256,7 +256,15 @@ func clientsCreate(t *testing.T, numClients int, image string, version string, c } func serviceCreate(t *testing.T, client *api.Client, serviceName string) uint64 { - err := client.Agent().ServiceRegister(&api.AgentServiceRegistration{Name: serviceName, Port: 9999}) + err := client.Agent().ServiceRegister(&api.AgentServiceRegistration{ + Name: serviceName, + Port: 9999, + Connect: &api.AgentServiceConnect{ + SidecarService: &api.AgentServiceRegistration{ + Port: 22005, + }, + }, + }) require.NoError(t, err) service, meta, err := client.Catalog().Service(serviceName, "", &api.QueryOptions{}) @@ -272,7 +280,7 @@ func serversCluster(t *testing.T, numServers int, version string, image string) var configs []libagent.Config conf, err := libagent.NewConfigBuilder(nil). - Bootstrap(3). + Bootstrap(numServers). ToAgentConfig() require.NoError(t, err)