From b2376e68461c375afe2ac75feab54b16842a0c84 Mon Sep 17 00:00:00 2001 From: CWen Date: Wed, 20 Mar 2019 11:05:34 +0800 Subject: [PATCH] stability: use fault-trigger at e2e tests and add some log (#330) --- tests/actions.go | 37 +---- tests/cmd/e2e/config.yaml | 23 +++ tests/cmd/e2e/main.go | 22 ++- tests/config.go | 65 ++++++++ tests/fault.go | 174 +++++++++++++++++++++ tests/log_dump.go | 2 +- tests/manifests/e2e-configmap.yaml | 32 ++++ tests/manifests/e2e.yaml | 9 ++ tests/pkg/fault-trigger/client/client.go | 12 +- tests/pkg/fault-trigger/manager/etcd.go | 6 + tests/pkg/fault-trigger/manager/kubelet.go | 6 + tests/pkg/fault-trigger/manager/vm.go | 6 + 12 files changed, 355 insertions(+), 39 deletions(-) create mode 100644 tests/cmd/e2e/config.yaml create mode 100644 tests/config.go create mode 100644 tests/fault.go create mode 100644 tests/manifests/e2e-configmap.yaml diff --git a/tests/actions.go b/tests/actions.go index ee87c0cff6..308bd6934e 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -54,12 +54,12 @@ const ( defaultRawSize = 100 ) -func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, logDir string) OperatorActions { +func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, cfg *Config) OperatorActions { return &operatorActions{ cli: cli, kubeCli: kubeCli, pdControl: controller.NewDefaultPDControl(), - logDir: logDir, + cfg: cfg, } } @@ -96,32 +96,11 @@ type OperatorActions interface { getBackupDir(info *TidbClusterInfo) ([]string, error) } -type FaultTriggerActions interface { - StopNode(nodeName string) error - StartNode(nodeName string) error - StopEtcd() error - StartEtcd() error - StopKubeAPIServer() error - StartKubeAPIServer() error - StopKubeControllerManager() error - StartKubeControllerManager() error - StopKubeScheduler() error - StartKubeScheduler() error - StopKubelet(nodeName string) error - StartKubelet(nodeName string) error - StopKubeProxy(nodeName string) error - StartKubeProxy(nodeName string) error - DiskCorruption(nodeName string) error - NetworkPartition(fromNode, toNode string) error - NetworkDelay(fromNode, toNode string) error - DockerCrash(nodeName string) error -} - type operatorActions struct { cli versioned.Interface kubeCli kubernetes.Interface pdControl controller.PDControlInterface - logDir string + cfg *Config } var _ = OperatorActions(&operatorActions{}) @@ -915,27 +894,27 @@ func (oa *operatorActions) monitorNormal(clusterInfo *TidbClusterInfo) (bool, er return false, nil } if monitorDeployment.Status.ReadyReplicas < 1 { - glog.Info("monitor ready replicas %d < 1", monitorDeployment.Status.ReadyReplicas) + glog.Infof("monitor ready replicas %d < 1", monitorDeployment.Status.ReadyReplicas) return false, nil } configuratorJobName := fmt.Sprintf("%s-monitor-configurator", tcName) monitorJob, err := oa.kubeCli.BatchV1().Jobs(ns).Get(configuratorJobName, metav1.GetOptions{}) if err != nil { - glog.Info("get monitor configurator job: [%s/%s] failed", ns, configuratorJobName) + glog.Infof("get monitor configurator job: [%s/%s] failed", ns, configuratorJobName) return false, nil } if monitorJob.Status.Succeeded == 0 { - glog.Info("the monitor configurator job: [%s/%s] had not success", ns, configuratorJobName) + glog.Infof("the monitor configurator job: [%s/%s] had not success", ns, configuratorJobName) return false, nil } if err := oa.checkPrometheus(clusterInfo); err != nil { - glog.Info("check [%s/%s]'s prometheus data failed: %v", ns, monitorDeploymentName, err) + glog.Infof("check [%s/%s]'s prometheus data failed: %v", ns, monitorDeploymentName, err) return false, nil } if err := oa.checkGrafanaData(clusterInfo); err != nil { - glog.Info("check [%s/%s]'s grafana data failed: %v", ns, monitorDeploymentName, err) + glog.Infof("check [%s/%s]'s grafana data failed: %v", ns, monitorDeploymentName, err) return false, nil } return true, nil diff --git a/tests/cmd/e2e/config.yaml b/tests/cmd/e2e/config.yaml new file mode 100644 index 0000000000..2f35e38f91 --- /dev/null +++ b/tests/cmd/e2e/config.yaml @@ -0,0 +1,23 @@ +nodes: + - physical_node: 172.16.4.39 + nodes: + - 172.16.4.171 + - 172.16.4.172 + - 172.16.4.173 + - physical_node: 172.16.4.40 + nodes: + - 172.16.4.174 + - 172.16.4.175 + - 172.16.4.176 +etcds: + - physical_node: 172.16.4.39 + nodes: + - 172.16.4.171 + - 172.16.4.172 + - 172.16.4.173 +apiservers: + - physical_node: 172.16.4.39 + nodes: + - 172.16.4.171 + - 172.16.4.172 + - 172.16.4.173 diff --git a/tests/cmd/e2e/main.go b/tests/cmd/e2e/main.go index 61275986e2..90f1740891 100644 --- a/tests/cmd/e2e/main.go +++ b/tests/cmd/e2e/main.go @@ -14,9 +14,9 @@ package main import ( - "flag" "net/http" _ "net/http/pprof" + "time" "github.com/golang/glog" "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" @@ -36,10 +36,15 @@ func perror(err error) { } func main() { - flag.Parse() logs.InitLogs() defer logs.FlushLogs() + conf := tests.NewConfig() + err := conf.Parse() + if err != nil { + glog.Fatalf("failed to parse config: %v", err) + } + go func() { glog.Info(http.ListenAndServe("localhost:6060", nil)) }() @@ -57,7 +62,7 @@ func main() { glog.Fatalf("failed to get kubernetes Clientset: %v", err) } - oa := tests.NewOperatorActions(cli, kubeCli, "/logDir") + oa := tests.NewOperatorActions(cli, kubeCli, conf) operatorInfo := &tests.OperatorInfo{ Namespace: "pingcap", @@ -218,4 +223,15 @@ func main() { oa.DumpAllLogs(operatorInfo, []*tests.TidbClusterInfo{clusterInfo, restoreClusterInfo}) glog.Fatal(err) } + + fa := tests.NewFaultTriggerAction(cli, kubeCli, conf) + if err := fa.StopETCD("172.16.4.171"); err != nil { + glog.Fatal(err) + } + + time.Sleep(1 * time.Minute) + + if err := fa.StartETCD("172.16.4.171"); err != nil { + glog.Fatal(err) + } } diff --git a/tests/config.go b/tests/config.go new file mode 100644 index 0000000000..031bf4875e --- /dev/null +++ b/tests/config.go @@ -0,0 +1,65 @@ +package tests + +import ( + "flag" + "io/ioutil" + + yaml "gopkg.in/yaml.v2" +) + +// Config defines the config of operator tests +type Config struct { + configFile string + + LogDir string `yaml:"log_dir" json:"log_dir"` + FaultTriggerPort int `yaml:"fault_trigger_port" json:"fault_trigger_port"` + Nodes []Nodes `yaml:"nodes" json:"nodes"` + ETCDs []Nodes `yaml:"etcds" json:"etcds"` + APIServers []Nodes `yaml:"apiservers" json:"apiservers"` +} + +// Nodes defines a series of nodes that belong to the same physical node. +type Nodes struct { + PhysicalNode string `yaml:"physical_node" json:"physical_node"` + Nodes []string `yaml:"nodes" json:"nodes"` +} + +// NewConfig creates a new config. +func NewConfig() *Config { + cfg := &Config{} + flag.StringVar(&cfg.configFile, "config", "/etc/e2e/config.yaml", "Config file") + flag.StringVar(&cfg.LogDir, "log-dir", "/logDir", "log directory") + flag.IntVar(&cfg.FaultTriggerPort, "fault-trigger-port", 23332, "the http port of fault trigger service") + + return cfg +} + +// Parse parses flag definitions from the argument list. +func (c *Config) Parse() error { + // Parse first to get config file + flag.Parse() + + if c.configFile != "" { + if err := c.configFromFile(c.configFile); err != nil { + return err + } + } + + // Parse again to replace with command line options. + flag.Parse() + + return nil +} + +func (c *Config) configFromFile(path string) error { + data, err := ioutil.ReadFile(path) + if err != nil { + return err + } + + if err = yaml.Unmarshal(data, c); err != nil { + return err + } + + return nil +} diff --git a/tests/fault.go b/tests/fault.go new file mode 100644 index 0000000000..b77e7527b9 --- /dev/null +++ b/tests/fault.go @@ -0,0 +1,174 @@ +package tests + +import ( + "fmt" + + "github.com/golang/glog" + "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" + "github.com/pingcap/tidb-operator/pkg/controller" + "github.com/pingcap/tidb-operator/tests/pkg/fault-trigger/client" + "github.com/pingcap/tidb-operator/tests/pkg/fault-trigger/manager" + "k8s.io/client-go/kubernetes" +) + +type FaultTriggerActions interface { + StopNode(physicalNode string, node string) error + StartNode(physicalNode string, node string) error + StopETCD(nodes ...string) error + StartETCD(nodes ...string) error + StopKubelet(node string) error + StartKubelet(node string) error + // TODO: support more faults + // StopKubeAPIServer() error + // StartKubeAPIServer() error + // StopKubeControllerManager() error + // StartKubeControllerManager() error + // StopKubeScheduler() error + // StartKubeScheduler() error + // StopKubeProxy(node string) error + // StartKubeProxy(node string) error + // DiskCorruption(node string) error + // NetworkPartition(fromNode, toNode string) error + // NetworkDelay(fromNode, toNode string) error + // DockerCrash(nodeName string) error +} + +func NewFaultTriggerAction(cli versioned.Interface, kubeCli kubernetes.Interface, cfg *Config) FaultTriggerActions { + return &faultTriggerActions{ + cli: cli, + kubeCli: kubeCli, + pdControl: controller.NewDefaultPDControl(), + cfg: cfg, + } +} + +type faultTriggerActions struct { + cli versioned.Interface + kubeCli kubernetes.Interface + pdControl controller.PDControlInterface + cfg *Config +} + +func (fa *faultTriggerActions) StopNode(physicalNode string, node string) error { + faultCli := client.NewClient(client.Config{ + Addr: fa.genFaultTriggerAddr(physicalNode), + }) + + err := faultCli.StopVM(&manager.VM{ + IP: node, + }) + + if err != nil { + glog.Errorf("failed to stop node %s on physical node: %s: %v", node, physicalNode, err) + return err + } + + glog.Infof("node %s on physical node %s is stopped", node, physicalNode) + + return nil +} + +func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error { + faultCli := client.NewClient(client.Config{ + Addr: fa.genFaultTriggerAddr(physicalNode), + }) + + err := faultCli.StartVM(&manager.VM{ + IP: node, + }) + + if err != nil { + glog.Errorf("failed to start node %s on physical node %s: %v", node, physicalNode, err) + return err + } + + glog.Infof("node %s on physical node %s is started", physicalNode, node) + + return nil +} + +// StopETCD stops the etcd service. +// If the `nodes` is empty, StopEtcd will stop all etcd service. +func (fa *faultTriggerActions) StopETCD(nodes ...string) error { + if len(nodes) == 0 { + for _, ns := range fa.cfg.ETCDs { + nodes = append(nodes, ns.Nodes...) + } + } + + for _, node := range nodes { + faultCli := client.NewClient(client.Config{ + Addr: fa.genFaultTriggerAddr(node), + }) + + if err := faultCli.StopETCD(); err != nil { + glog.Errorf("failed to stop etcd %s: %v", node, err) + return err + } + + glog.Infof("etcd %s is stopped", node) + } + + return nil +} + +// StartETCD starts the etcd service. +// If the `nodes` is empty, StartETCD will start all etcd service. +func (fa *faultTriggerActions) StartETCD(nodes ...string) error { + if len(nodes) == 0 { + for _, ns := range fa.cfg.ETCDs { + nodes = append(nodes, ns.Nodes...) + } + } + + for _, node := range nodes { + faultCli := client.NewClient(client.Config{ + Addr: fa.genFaultTriggerAddr(node), + }) + + if err := faultCli.StartETCD(); err != nil { + glog.Errorf("failed to start etcd %s: %v", node, err) + return err + } + + glog.Infof("etcd %s is started", node) + } + + return nil +} + +// StopKubelet stops the kubelet service. +func (fa *faultTriggerActions) StopKubelet(node string) error { + faultCli := client.NewClient(client.Config{ + Addr: fa.genFaultTriggerAddr(node), + }) + + if err := faultCli.StopKubelet(); err != nil { + glog.Errorf("failed to stop kubelet %s: %v", node, err) + return err + } + + glog.Infof("kubelet %s is stopped", node) + + return nil +} + +// StartKubelet starts the kubelet service. +func (fa *faultTriggerActions) StartKubelet(node string) error { + faultCli := client.NewClient(client.Config{ + Addr: node, + }) + + if err := faultCli.StartKubelet(); err != nil { + glog.Errorf("failed to start kubelet %s: %v", node, err) + return err + } + + glog.Infof("kubelet %s is started", node) + + return nil +} + +func (fa *faultTriggerActions) genFaultTriggerAddr(node string) string { + return fmt.Sprintf("%s:%d", node, fa.cfg.FaultTriggerPort) +} diff --git a/tests/log_dump.go b/tests/log_dump.go index 97de55c0ad..f5dc11af02 100644 --- a/tests/log_dump.go +++ b/tests/log_dump.go @@ -12,7 +12,7 @@ import ( ) func (oa *operatorActions) DumpAllLogs(operatorInfo *OperatorInfo, testClusters []*TidbClusterInfo) error { - logPath := fmt.Sprintf("/%s/%s", oa.logDir, "operator-stability") + logPath := fmt.Sprintf("/%s/%s", oa.cfg.LogDir, "operator-stability") if _, err := os.Stat(logPath); os.IsNotExist(err) { err = os.MkdirAll(logPath, os.ModePerm) if err != nil { diff --git a/tests/manifests/e2e-configmap.yaml b/tests/manifests/e2e-configmap.yaml new file mode 100644 index 0000000000..0af85753bb --- /dev/null +++ b/tests/manifests/e2e-configmap.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + namespace: tidb-operator-e2e + name: tidb-operator-e2e-config +data: + e2e-config: |- + nodes: + - physical_node: 172.16.4.39 + nodes: + - 172.16.4.171 + - 172.16.4.172 + - 172.16.4.173 + - physical_node: 172.16.4.40 + nodes: + - 172.16.4.174 + - 172.16.4.175 + - 172.16.4.176 + etcds: + - physical_node: 172.16.4.39 + nodes: + - 172.16.4.171 + - 172.16.4.172 + - 172.16.4.173 + apiservers: + - physical_node: 172.16.4.39 + nodes: + - 172.16.4.171 + - 172.16.4.172 + - 172.16.4.173 + + diff --git a/tests/manifests/e2e.yaml b/tests/manifests/e2e.yaml index 77e44a4559..913dceb17b 100644 --- a/tests/manifests/e2e.yaml +++ b/tests/manifests/e2e.yaml @@ -33,9 +33,18 @@ spec: volumeMounts: - mountPath: /logDir name: logdir + - name: config + readOnly: true + mountPath: /etc/e2e volumes: - name: logdir hostPath: path: /var/log type: Directory + - name: config + configMap: + name: tidb-operator-e2e-config + items: + - key: e2e-config + path: config.yaml restartPolicy: Never diff --git a/tests/pkg/fault-trigger/client/client.go b/tests/pkg/fault-trigger/client/client.go index 17a812cbe6..3f7c2348c6 100644 --- a/tests/pkg/fault-trigger/client/client.go +++ b/tests/pkg/fault-trigger/client/client.go @@ -131,7 +131,7 @@ func (c *client) StartVM(vm *manager.VM) error { vmName = vm.IP } - url := util.GenURL(fmt.Sprintf("%s/%s/vm/%s/start", c.cfg.Addr, api.APIPrefix, vmName)) + url := util.GenURL(fmt.Sprintf("%s%s/vm/%s/start", c.cfg.Addr, api.APIPrefix, vmName)) if _, err := c.get(url); err != nil { return err } @@ -149,7 +149,7 @@ func (c *client) StopVM(vm *manager.VM) error { vmName = vm.IP } - url := util.GenURL(fmt.Sprintf("%s/%s/vm/%s/stop", c.cfg.Addr, api.APIPrefix, vmName)) + url := util.GenURL(fmt.Sprintf("%s%s/vm/%s/stop", c.cfg.Addr, api.APIPrefix, vmName)) if _, err := c.get(url); err != nil { return err } @@ -158,7 +158,7 @@ func (c *client) StopVM(vm *manager.VM) error { } func (c *client) StartETCD() error { - url := util.GenURL(fmt.Sprintf("%s/%s/etcd/start", c.cfg.Addr, api.APIPrefix)) + url := util.GenURL(fmt.Sprintf("%s%s/etcd/start", c.cfg.Addr, api.APIPrefix)) if _, err := c.get(url); err != nil { return err } @@ -167,7 +167,7 @@ func (c *client) StartETCD() error { } func (c *client) StopETCD() error { - url := util.GenURL(fmt.Sprintf("%s/%s/etcd/stop", c.cfg.Addr, api.APIPrefix)) + url := util.GenURL(fmt.Sprintf("%s%s/etcd/stop", c.cfg.Addr, api.APIPrefix)) if _, err := c.get(url); err != nil { return err } @@ -176,7 +176,7 @@ func (c *client) StopETCD() error { } func (c *client) StartKubelet() error { - url := util.GenURL(fmt.Sprintf("%s/%s/kubelet/start", c.cfg.Addr, api.APIPrefix)) + url := util.GenURL(fmt.Sprintf("%s%s/kubelet/start", c.cfg.Addr, api.APIPrefix)) if _, err := c.get(url); err != nil { return err } @@ -185,7 +185,7 @@ func (c *client) StartKubelet() error { } func (c *client) StopKubelet() error { - url := util.GenURL(fmt.Sprintf("%s/%s/kubelet/stop", c.cfg.Addr, api.APIPrefix)) + url := util.GenURL(fmt.Sprintf("%s%s/kubelet/stop", c.cfg.Addr, api.APIPrefix)) if _, err := c.get(url); err != nil { return err } diff --git a/tests/pkg/fault-trigger/manager/etcd.go b/tests/pkg/fault-trigger/manager/etcd.go index cd16a527ae..ef4d644736 100644 --- a/tests/pkg/fault-trigger/manager/etcd.go +++ b/tests/pkg/fault-trigger/manager/etcd.go @@ -28,6 +28,9 @@ func (m *Manager) StartETCD() error { glog.Errorf("exec: [%s] failed, output: %s, error: %v", shell, string(output), err) return err } + + glog.Info("etcd service is started") + return nil } @@ -40,5 +43,8 @@ func (m *Manager) StopETCD() error { glog.Errorf("exec: [%s] failed, output: %s, error: %v", shell, string(output), err) return err } + + glog.Info("etcd service is stopped") + return nil } diff --git a/tests/pkg/fault-trigger/manager/kubelet.go b/tests/pkg/fault-trigger/manager/kubelet.go index 4dad0c0c60..04b6b91f89 100644 --- a/tests/pkg/fault-trigger/manager/kubelet.go +++ b/tests/pkg/fault-trigger/manager/kubelet.go @@ -28,6 +28,9 @@ func (m *Manager) StartKubelet() error { glog.Errorf("exec: [%s] failed, output: %s, error: %v", shell, string(output), err) return err } + + glog.Info("kubelet is started") + return nil } @@ -40,5 +43,8 @@ func (m *Manager) StopKubelet() error { glog.Errorf("exec: [%s] failed, output: %s, error: %v", shell, string(output), err) return err } + + glog.Info("kubelet is stopped") + return nil } diff --git a/tests/pkg/fault-trigger/manager/vm.go b/tests/pkg/fault-trigger/manager/vm.go index e83df64b00..6a2e5eb145 100644 --- a/tests/pkg/fault-trigger/manager/vm.go +++ b/tests/pkg/fault-trigger/manager/vm.go @@ -53,6 +53,9 @@ func (m *Manager) StopVM(v *VM) error { glog.Errorf("exec: [%s] failed, output: %s, error: %v", shell, string(output), err) return err } + + glog.Infof("virtual machine %s is stopped", v.Name) + return nil } @@ -65,6 +68,9 @@ func (m *Manager) StartVM(v *VM) error { glog.Errorf("exec: [%s] failed, output: %s, error: %v", shell, string(output), err) return err } + + glog.Infof("virtual machine %s is started", v.Name) + return nil }