From 99a64341f25c3a3b0fb14ca7e2359c95e270ee71 Mon Sep 17 00:00:00 2001 From: Siyuan Zhang Date: Fri, 3 May 2024 06:20:50 -0700 Subject: [PATCH] Add cluster downgrade test. Signed-off-by: Siyuan Zhang --- tests/e2e/cluster_downgrade_test.go | 299 ++++++++++++++++++++++++++++ tests/e2e/ctl_v3_grpc_test.go | 18 +- tests/e2e/v3_curl_test.go | 5 +- tests/framework/e2e/curl.go | 11 +- tests/framework/e2e/etcd_process.go | 25 +++ tests/framework/e2e/etcdctl.go | 15 ++ tests/go.mod | 2 +- tests/testutils/execute.go | 37 ++++ 8 files changed, 387 insertions(+), 25 deletions(-) create mode 100644 tests/e2e/cluster_downgrade_test.go create mode 100644 tests/testutils/execute.go diff --git a/tests/e2e/cluster_downgrade_test.go b/tests/e2e/cluster_downgrade_test.go new file mode 100644 index 00000000000..8729bbda57f --- /dev/null +++ b/tests/e2e/cluster_downgrade_test.go @@ -0,0 +1,299 @@ +// Copyright 2024 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package e2e + +import ( + "encoding/json" + "fmt" + "strings" + "testing" + "time" + + "github.com/coreos/go-semver/semver" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "go.etcd.io/etcd/api/v3/version" + "go.etcd.io/etcd/client/pkg/v3/fileutil" + clientv3 "go.etcd.io/etcd/client/v3" + "go.etcd.io/etcd/server/v3/datadir" + "go.etcd.io/etcd/server/v3/etcdserver" + "go.etcd.io/etcd/server/v3/etcdserver/api/snap" + "go.etcd.io/etcd/tests/v3/framework/e2e" + "go.etcd.io/etcd/tests/v3/testutils" +) + +func TestDowngradeUpgradeClusterOf1(t *testing.T) { + testDowngradeUpgrade(t, 1, false) +} + +func TestDowngradeUpgradeClusterOf3(t *testing.T) { + testDowngradeUpgrade(t, 3, false) +} + +func TestDowngradeUpgradeClusterOf1WithSnapshot(t *testing.T) { + testDowngradeUpgrade(t, 1, true) +} + +func TestDowngradeUpgradeClusterOf3WithSnapshot(t *testing.T) { + testDowngradeUpgrade(t, 3, true) +} + +func testDowngradeUpgrade(t *testing.T, clusterSize int, triggerSnapshot bool) { + currentEtcdBinary := e2e.BinPath + lastReleaseBinary := e2e.BinPathLastRelease + if !fileutil.Exist(lastReleaseBinary) { + t.Skipf("%q does not exist", lastReleaseBinary) + } + + currentVersion, err := e2e.GetVersionFromBinary(currentEtcdBinary) + require.NoError(t, err) + // wipe any pre-release suffix like -alpha.0 we see commonly in builds + currentVersion.PreRelease = "" + + lastVersion, err := e2e.GetVersionFromBinary(lastReleaseBinary) + require.NoError(t, err) + + require.Equalf(t, lastVersion.Minor, currentVersion.Minor-1, "unexpected minor version difference") + currentVersionStr := currentVersion.String() + lastVersionStr := lastVersion.String() + + currentClusterVersion := semver.New(currentVersionStr) + currentClusterVersion.Patch = 0 + currentClusterVersionStr := currentClusterVersion.String() + + lastClusterVersion := semver.New(lastVersionStr) + lastClusterVersion.Patch = 0 + lastClusterVersionStr := lastClusterVersion.String() + + e2e.BeforeTest(t) + + t.Logf("Create cluster with version %s", currentVersionStr) + snapshotCount := 10 + epc := newCluster(t, clusterSize, snapshotCount) + for i := 0; i < len(epc.Procs); i++ { + validateVersion(t, epc.Cfg, epc.Procs[i], version.Versions{ + Cluster: currentClusterVersionStr, + Server: version.Version, + }) + } + cc := epc.Etcdctl() + t.Logf("Cluster created") + if len(epc.Procs) > 1 { + t.Log("Waiting health interval to required to make membership changes") + time.Sleep(etcdserver.HealthInterval) + } + + t.Log("Adding member to test membership, but a learner avoid breaking quorum") + resp, err := cc.MemberAddAsLearner("fake1", []string{"http://127.0.0.1:1001"}) + require.NoError(t, err) + if triggerSnapshot { + t.Logf("Generating snapshot") + generateSnapshot(t, snapshotCount, cc) + verifySnapshot(t, epc) + } + t.Log("Removing learner to test membership") + _, err = cc.MemberRemove(resp.Member.ID) + require.NoError(t, err) + beforeMembers, beforeKV := getMembersAndKeys(t, cc) + + t.Logf("Starting downgrade process to %q", lastVersionStr) + for i := 0; i < len(epc.Procs); i++ { + t.Logf("Downgrading member %d by running %s binary", i, lastReleaseBinary) + stopEtcd(t, epc.Procs[i]) + startEtcd(t, epc.Procs[i], lastReleaseBinary) + } + + t.Log("All members downgraded, validating downgrade") + for i := 0; i < len(epc.Procs); i++ { + validateVersion(t, epc.Cfg, epc.Procs[i], version.Versions{ + Cluster: lastClusterVersionStr, + Server: lastVersionStr, + }) + } + + t.Log("Downgrade complete") + afterMembers, afterKV := getMembersAndKeys(t, cc) + assert.Equal(t, beforeKV.Kvs, afterKV.Kvs) + assert.Equal(t, beforeMembers.Members, afterMembers.Members) + + if len(epc.Procs) > 1 { + t.Log("Waiting health interval to required to make membership changes") + time.Sleep(etcdserver.HealthInterval) + } + t.Log("Adding learner to test membership, but avoid breaking quorum") + resp, err = cc.MemberAddAsLearner("fake2", []string{"http://127.0.0.1:1002"}) + require.NoError(t, err) + if triggerSnapshot { + t.Logf("Generating snapshot") + generateSnapshot(t, snapshotCount, cc) + verifySnapshot(t, epc) + } + t.Log("Removing learner to test membership") + _, err = cc.MemberRemove(resp.Member.ID) + require.NoError(t, err) + beforeMembers, beforeKV = getMembersAndKeys(t, cc) + + t.Logf("Starting upgrade process to %q", currentVersionStr) + for i := 0; i < len(epc.Procs); i++ { + t.Logf("Upgrading member %d", i) + stopEtcd(t, epc.Procs[i]) + startEtcd(t, epc.Procs[i], currentEtcdBinary) + // NOTE: The leader has monitor to the cluster version, which will + // update cluster version. We don't need to check the transient + // version just in case that it might be flaky. + } + + t.Log("All members upgraded, validating upgrade") + for i := 0; i < len(epc.Procs); i++ { + validateVersion(t, epc.Cfg, epc.Procs[i], version.Versions{ + Cluster: currentClusterVersionStr, + Server: version.Version, + }) + } + t.Log("Upgrade complete") + + afterMembers, afterKV = getMembersAndKeys(t, cc) + assert.Equal(t, beforeKV.Kvs, afterKV.Kvs) + assert.Equal(t, beforeMembers.Members, afterMembers.Members) +} + +func newCluster(t *testing.T, clusterSize int, snapshotCount int) *e2e.EtcdProcessCluster { + copiedCfg := e2e.NewConfigNoTLS() + copiedCfg.ClusterSize = clusterSize + copiedCfg.SnapshotCount = snapshotCount + copiedCfg.KeepDataDir = true + copiedCfg.BasePeerScheme = "unix" // to avoid port conflict + + epc, err := e2e.NewEtcdProcessCluster(t, copiedCfg) + if err != nil { + t.Fatalf("could not start etcd process cluster (%v)", err) + } + t.Cleanup(func() { + if errC := epc.Close(); errC != nil { + t.Fatalf("error closing etcd processes (%v)", errC) + } + }) + return epc +} + +func startEtcd(t *testing.T, ep e2e.EtcdProcess, execPath string) { + ep.Config().ExecPath = execPath + if execPath == e2e.BinPathLastRelease { + ep.Config().Args = addOrRemoveArg(ep.Config().Args, "--next-cluster-version-compatible", true) + } else { + ep.Config().Args = addOrRemoveArg(ep.Config().Args, "--next-cluster-version-compatible", false) + } + err := ep.Restart() + if err != nil { + t.Fatalf("could not start etcd process cluster (%v)", err) + } +} + +func addOrRemoveArg(args []string, arg string, isAdd bool) []string { + newArgs := []string{} + for _, s := range args { + if s != arg { + newArgs = append(newArgs, s) + } + } + if isAdd { + newArgs = append(newArgs, arg) + } + return newArgs +} + +func stopEtcd(t *testing.T, ep e2e.EtcdProcess) { + if err := ep.Stop(); err != nil { + t.Fatal(err) + } +} + +func validateVersion(t *testing.T, cfg *e2e.EtcdProcessClusterConfig, member e2e.EtcdProcess, expect version.Versions) { + testutils.ExecuteWithTimeout(t, 30*time.Second, func() { + for { + result, err := getMemberVersionByCurl(cfg, member) + if err != nil { + cfg.Logger.Warn("failed to get member version and retrying", zap.Error(err), zap.String("member", member.Config().Name)) + time.Sleep(time.Second) + continue + } + cfg.Logger.Info("Comparing versions", zap.String("member", member.Config().Name), zap.Any("got", result), zap.Any("want", expect)) + if err := compareMemberVersion(expect, result); err != nil { + cfg.Logger.Warn("Versions didn't match retrying", zap.Error(err), zap.String("member", member.Config().Name)) + time.Sleep(time.Second) + continue + } + cfg.Logger.Info("Versions match", zap.String("member", member.Config().Name)) + break + } + }) +} + +func compareMemberVersion(expect version.Versions, target version.Versions) error { + if expect.Server != "" && expect.Server != target.Server { + return fmt.Errorf("expect etcdserver version %v, but got %v", expect.Server, target.Server) + } + + if expect.Cluster != "" && expect.Cluster != target.Cluster { + return fmt.Errorf("expect etcdcluster version %v, but got %v", expect.Cluster, target.Cluster) + } + + return nil +} + +func getMemberVersionByCurl(cfg *e2e.EtcdProcessClusterConfig, member e2e.EtcdProcess) (version.Versions, error) { + args := e2e.CURLPrefixArgsCluster(cfg, member, "GET", e2e.CURLReq{Endpoint: "/version"}) + lines, err := e2e.RunUtilCompletion(args, nil) + if err != nil { + return version.Versions{}, err + } + + data := strings.Join(lines, "\n") + result := version.Versions{} + if err := json.Unmarshal([]byte(data), &result); err != nil { + return version.Versions{}, fmt.Errorf("failed to unmarshal (%v): %w", data, err) + } + return result, nil +} + +func generateSnapshot(t *testing.T, snapshotCount int, cc *e2e.Etcdctl) { + t.Logf("Adding keys") + for i := 0; i < snapshotCount*3; i++ { + err := cc.Put(fmt.Sprintf("%d", i), "1") + assert.NoError(t, err) + } +} + +func verifySnapshot(t *testing.T, epc *e2e.EtcdProcessCluster) { + for i := range epc.Procs { + t.Logf("Verifying snapshot for member %d", i) + ss := snap.New(epc.Cfg.Logger, datadir.ToSnapDir(epc.Procs[i].Config().DataDirPath)) + _, err := ss.Load() + assert.NoError(t, err) + } + t.Logf("All members have a valid snapshot") +} + +func getMembersAndKeys(t *testing.T, cc *e2e.Etcdctl) (*clientv3.MemberListResponse, *clientv3.GetResponse) { + kvs, err := cc.GetWithPrefix("") + assert.NoError(t, err) + + members, err := cc.MemberList() + assert.NoError(t, err) + + return members, kvs +} diff --git a/tests/e2e/ctl_v3_grpc_test.go b/tests/e2e/ctl_v3_grpc_test.go index 7cdf27d1ae5..9c7bb068fd8 100644 --- a/tests/e2e/ctl_v3_grpc_test.go +++ b/tests/e2e/ctl_v3_grpc_test.go @@ -26,8 +26,8 @@ import ( "github.com/stretchr/testify/assert" - "go.etcd.io/etcd/client/pkg/v3/testutil" "go.etcd.io/etcd/tests/v3/framework/e2e" + "go.etcd.io/etcd/tests/v3/testutils" ) func TestAuthority(t *testing.T) { @@ -140,7 +140,7 @@ func TestAuthority(t *testing.T) { } } - executeWithTimeout(t, 5*time.Second, func() { + testutils.ExecuteWithTimeout(t, 5*time.Second, func() { assertAuthority(t, tc.expectAuthorityPattern, epc) }) }) @@ -179,20 +179,6 @@ func assertAuthority(t *testing.T, expectAuthorityPattern string, clus *e2e.Etcd } } -func executeWithTimeout(t *testing.T, timeout time.Duration, f func()) { - donec := make(chan struct{}) - go func() { - defer close(donec) - f() - }() - - select { - case <-time.After(timeout): - testutil.FatalStack(t, fmt.Sprintf("test timed out after %v", timeout)) - case <-donec: - } -} - type etcdctlV3 struct { cfg *e2e.EtcdProcessClusterConfig endpoints []string diff --git a/tests/e2e/v3_curl_test.go b/tests/e2e/v3_curl_test.go index 581d56d89c7..032f6377c38 100644 --- a/tests/e2e/v3_curl_test.go +++ b/tests/e2e/v3_curl_test.go @@ -18,6 +18,7 @@ import ( "encoding/base64" "encoding/json" "fmt" + "math/rand" "path" "strconv" "testing" @@ -244,7 +245,7 @@ func testV3CurlAuth(cx ctlCtx) { lineFunc = func(txt string) bool { return true } ) - cmdArgs = e2e.CURLPrefixArgsCluster(cx.epc, "POST", e2e.CURLReq{Endpoint: path.Join(p, "/auth/authenticate"), Value: string(authreq)}) + cmdArgs = e2e.CURLPrefixArgsCluster(cx.epc.Cfg, cx.epc.Procs[rand.Intn(cx.epc.Cfg.ClusterSize)], "POST", e2e.CURLReq{Endpoint: path.Join(p, "/auth/authenticate"), Value: string(authreq)}) proc, err := e2e.SpawnCmd(cmdArgs, cx.envMap) testutil.AssertNil(cx.t, err) defer proc.Close() @@ -283,7 +284,7 @@ func testV3CurlCampaign(cx ctlCtx) { if err != nil { cx.t.Fatal(err) } - cargs := e2e.CURLPrefixArgsCluster(cx.epc, "POST", e2e.CURLReq{ + cargs := e2e.CURLPrefixArgsCluster(cx.epc.Cfg, cx.epc.Procs[rand.Intn(cx.epc.Cfg.ClusterSize)], "POST", e2e.CURLReq{ Endpoint: path.Join(cx.apiPrefix, "/election/campaign"), Value: string(cdata), }) diff --git a/tests/framework/e2e/curl.go b/tests/framework/e2e/curl.go index 9df19620a07..6ad5a01a63e 100644 --- a/tests/framework/e2e/curl.go +++ b/tests/framework/e2e/curl.go @@ -43,13 +43,12 @@ type CURLReq struct { // CURLPrefixArgsCluster builds the beginning of a curl command for a given key // addressed to a random URL in the given cluster. -func CURLPrefixArgsCluster(clus *EtcdProcessCluster, method string, req CURLReq) []string { - member := clus.Procs[rand.Intn(clus.Cfg.ClusterSize)] +func CURLPrefixArgsCluster(cfg *EtcdProcessClusterConfig, member EtcdProcess, method string, req CURLReq) []string { clientURL := member.Config().Acurl if req.MetricsURLScheme != "" { clientURL = member.EndpointsMetrics()[0] } - return CURLPrefixArgs(clientURL, clus.Cfg.ClientTLS, !clus.Cfg.NoCN, method, req) + return CURLPrefixArgs(clientURL, cfg.ClientTLS, !cfg.NoCN, method, req) } // CURLPrefixArgs builds the beginning of a curl command for a given key @@ -111,13 +110,13 @@ func CURLPrefixArgs(clientURL string, connType ClientConnType, CN bool, method s } func CURLPost(clus *EtcdProcessCluster, req CURLReq) error { - return SpawnWithExpect(CURLPrefixArgsCluster(clus, "POST", req), req.Expected) + return SpawnWithExpect(CURLPrefixArgsCluster(clus.Cfg, clus.Procs[rand.Intn(clus.Cfg.ClusterSize)], "POST", req), req.Expected) } func CURLPut(clus *EtcdProcessCluster, req CURLReq) error { - return SpawnWithExpect(CURLPrefixArgsCluster(clus, "PUT", req), req.Expected) + return SpawnWithExpect(CURLPrefixArgsCluster(clus.Cfg, clus.Procs[rand.Intn(clus.Cfg.ClusterSize)], "PUT", req), req.Expected) } func CURLGet(clus *EtcdProcessCluster, req CURLReq) error { - return SpawnWithExpect(CURLPrefixArgsCluster(clus, "GET", req), req.Expected) + return SpawnWithExpect(CURLPrefixArgsCluster(clus.Cfg, clus.Procs[rand.Intn(clus.Cfg.ClusterSize)], "GET", req), req.Expected) } diff --git a/tests/framework/e2e/etcd_process.go b/tests/framework/e2e/etcd_process.go index c2084428656..bb464e4db8f 100644 --- a/tests/framework/e2e/etcd_process.go +++ b/tests/framework/e2e/etcd_process.go @@ -27,6 +27,7 @@ import ( "testing" "time" + "github.com/coreos/go-semver/semver" "go.etcd.io/etcd/client/pkg/v3/fileutil" "go.etcd.io/etcd/pkg/v3/expect" "go.etcd.io/etcd/pkg/v3/proxy" @@ -405,3 +406,27 @@ func parseFailpointsBody(body io.Reader) (map[string]string, error) { } return failpoints, nil } + +func GetVersionFromBinary(binaryPath string) (*semver.Version, error) { + lines, err := RunUtilCompletion([]string{binaryPath, "--version"}, nil) + if err != nil { + return nil, fmt.Errorf("could not find binary version from %s, err: %w", binaryPath, err) + } + + for _, line := range lines { + if strings.HasPrefix(line, "etcd Version:") { + versionString := strings.TrimSpace(strings.SplitAfter(line, ":")[1]) + version, err := semver.NewVersion(versionString) + if err != nil { + return nil, err + } + return &semver.Version{ + Major: version.Major, + Minor: version.Minor, + Patch: version.Patch, + }, nil + } + } + + return nil, fmt.Errorf("could not find version in binary output of %s, lines outputted were %v", binaryPath, lines) +} diff --git a/tests/framework/e2e/etcdctl.go b/tests/framework/e2e/etcdctl.go index 7e7361b17cd..0d21ae0f5b4 100644 --- a/tests/framework/e2e/etcdctl.go +++ b/tests/framework/e2e/etcdctl.go @@ -61,6 +61,12 @@ func (ctl *Etcdctl) Get(key string) (*clientv3.GetResponse, error) { return &resp, err } +func (ctl *Etcdctl) GetWithPrefix(key string) (*clientv3.GetResponse, error) { + var resp clientv3.GetResponse + err := ctl.spawnJsonCmd(&resp, "get", key, "--prefix") + return &resp, err +} + func (ctl *Etcdctl) Put(key, value string) error { if ctl.v2 { panic("Unsupported method for v2") @@ -149,6 +155,15 @@ func (ctl *Etcdctl) MemberAdd(name string, peerURLs []string) (*clientv3.MemberA return &resp, err } +func (ctl *Etcdctl) MemberAddAsLearner(name string, peerURLs []string) (*clientv3.MemberAddResponse, error) { + if ctl.v2 { + panic("Unsupported method for v2") + } + var resp clientv3.MemberAddResponse + err := ctl.spawnJsonCmd(&resp, "member", "add", name, "--learner", "--peer-urls", strings.Join(peerURLs, ",")) + return &resp, err +} + func (ctl *Etcdctl) MemberRemove(id uint64) (*clientv3.MemberRemoveResponse, error) { if ctl.v2 { panic("Unsupported method for v2") diff --git a/tests/go.mod b/tests/go.mod index 19f09c2c6b5..453ec8450aa 100644 --- a/tests/go.mod +++ b/tests/go.mod @@ -15,6 +15,7 @@ replace ( ) require ( + github.com/coreos/go-semver v0.3.0 github.com/dustin/go-humanize v1.0.0 github.com/etcd-io/gofail v0.0.0-20190801230047-ad7f989257ca github.com/gogo/protobuf v1.3.2 @@ -54,7 +55,6 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/coreos/go-semver v0.3.0 // indirect github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/creack/pty v1.1.11 // indirect github.com/davecgh/go-spew v1.1.1 // indirect diff --git a/tests/testutils/execute.go b/tests/testutils/execute.go new file mode 100644 index 00000000000..a80bf7a0bdf --- /dev/null +++ b/tests/testutils/execute.go @@ -0,0 +1,37 @@ +// Copyright 2024 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutils + +import ( + "fmt" + "testing" + "time" + + "go.etcd.io/etcd/client/pkg/v3/testutil" +) + +func ExecuteWithTimeout(t *testing.T, timeout time.Duration, f func()) { + donec := make(chan struct{}) + go func() { + defer close(donec) + f() + }() + + select { + case <-time.After(timeout): + testutil.FatalStack(t, fmt.Sprintf("test timed out after %v", timeout)) + case <-donec: + } +}