Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[concept] add livez/readyz for etcd #16008

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions server/embed/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,8 @@ func (e *Etcd) serveClients() (err error) {
etcdhttp.HandleVersion(mux, e.Server)
etcdhttp.HandleMetrics(mux)
etcdhttp.HandleHealth(e.cfg.logger, mux, e.Server)
etcdhttp.HandleLivez(e.cfg.logger, mux, e.Server)
etcdhttp.HandleReadyz(e.cfg.logger, mux, e.Server)

var gopts []grpc.ServerOption
if e.cfg.GRPCKeepAliveMinTime > time.Duration(0) {
Expand Down Expand Up @@ -831,6 +833,8 @@ func (e *Etcd) serveMetrics() (err error) {
metricsMux := http.NewServeMux()
etcdhttp.HandleMetrics(metricsMux)
etcdhttp.HandleHealth(e.cfg.logger, metricsMux, e.Server)
etcdhttp.HandleLivez(e.cfg.logger, metricsMux, e.Server)
etcdhttp.HandleReadyz(e.cfg.logger, metricsMux, e.Server)

for _, murl := range e.cfg.ListenMetricsUrls {
tlsInfo := &e.cfg.ClientTLSInfo
Expand Down
48 changes: 38 additions & 10 deletions server/etcdserver/api/etcdhttp/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,19 @@ import (
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"

"go.etcd.io/raft/v3"

"go.etcd.io/etcd/api/v3/etcdserverpb"
pb "go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/client/pkg/v3/types"
"go.etcd.io/etcd/server/v3/auth"
"go.etcd.io/etcd/server/v3/config"
"go.etcd.io/raft/v3"
)

const (
PathHealth = "/health"
PathLivez = "/livez"
PathReadyz = "/readyz"
PathProxyHealth = "/proxy/health"
)

Expand All @@ -46,33 +49,58 @@ type ServerHealth interface {
// HandleHealth registers metrics and health handlers. it checks health by using v3 range request
// and its corresponding timeout.
func HandleHealth(lg *zap.Logger, mux *http.ServeMux, srv ServerHealth) {
mux.Handle(PathHealth, NewHealthHandler(lg, func(excludedAlarms AlarmSet, serializable bool) Health {
if h := checkAlarms(lg, srv, excludedAlarms); h.Health != "true" {
mux.Handle(PathHealth, NewHealthHandler(lg, func(excludedAlarms AlarmSet, serializable bool, endpoint string) Health {
if h := checkAlarms(lg, srv, excludedAlarms, endpoint); h.Health != "true" {
return h
}
if h := checkLeader(lg, srv, serializable); h.Health != "true" {
return h
}
return checkAPI(lg, srv, serializable)
}))
}, PathHealth))
}

// HandleLivez registers the liveness probe handler on mux at PathLivez.
//
// The probe runs checkAlarms against srv and, if that reports healthy, runs
// checkAPI with serializable forced to true. The "serializable" value parsed
// from the request is therefore ignored for this endpoint. The NOSPACE alarm
// is always passed to NewHealthHandler as an additional exclusion, on top of
// whatever alarms the request itself excludes.
func HandleLivez(lg *zap.Logger, mux *http.ServeMux, srv ServerHealth) {
	// The second argument (the request's serializable flag) is intentionally
	// unused; liveness always uses a serializable check.
	check := func(excludedAlarms AlarmSet, _ bool, endpoint string) Health {
		if h := checkAlarms(lg, srv, excludedAlarms, endpoint); h.Health != "true" {
			return h
		}
		return checkAPI(lg, srv, true)
	}
	mux.Handle(PathLivez, NewHealthHandler(lg, check, PathLivez, etcdserverpb.AlarmType_NOSPACE.String()))
}

// HandleReadyz registers the readiness probe handler on mux at PathReadyz.
//
// The probe runs checkAlarms against srv and, if that reports healthy, runs
// checkAPI with serializable forced to false. The "serializable" value parsed
// from the request is ignored for this endpoint.
func HandleReadyz(lg *zap.Logger, mux *http.ServeMux, srv ServerHealth) {
	// The request's serializable flag (second argument) is not consulted.
	probe := func(excludedAlarms AlarmSet, _ bool, endpoint string) Health {
		alarmState := checkAlarms(lg, srv, excludedAlarms, endpoint)
		if alarmState.Health == "true" {
			return checkAPI(lg, srv, false)
		}
		return alarmState
	}
	mux.Handle(PathReadyz, NewHealthHandler(lg, probe, PathReadyz))
}

// NewHealthHandler returns an HTTP handler that serves health-check requests for the given endpoint (e.g. '/health').
func NewHealthHandler(lg *zap.Logger, hfunc func(excludedAlarms AlarmSet, Serializable bool) Health) http.HandlerFunc {
func NewHealthHandler(lg *zap.Logger, hfunc func(excludedAlarms AlarmSet, Serializable bool, endpoint string) Health, endpoint string, alwaysExclude ...string) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
w.Header().Set("Allow", http.MethodGet)
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
lg.Warn("/health error", zap.Int("status-code", http.StatusMethodNotAllowed))
lg.Warn(fmt.Sprintf("%s error", endpoint), zap.Int("status-code", http.StatusMethodNotAllowed))
return
}
excludedAlarms := getExcludedAlarms(r)
for _, additionalExcludes := range alwaysExclude {
excludedAlarms[additionalExcludes] = struct{}{}
}
// Passing the query parameter "serializable=true" ensures that the
// health of the local etcd is checked vs the health of the cluster.
// This is useful for probes attempting to validate the liveness of
// the etcd process vs readiness of the cluster to serve requests.
serializableFlag := getSerializableFlag(r)
h := hfunc(excludedAlarms, serializableFlag)
h := hfunc(excludedAlarms, serializableFlag, endpoint)
defer func() {
if h.Health == "true" {
healthSuccess.Inc()
Expand All @@ -83,12 +111,12 @@ func NewHealthHandler(lg *zap.Logger, hfunc func(excludedAlarms AlarmSet, Serial
d, _ := json.Marshal(h)
if h.Health != "true" {
http.Error(w, string(d), http.StatusServiceUnavailable)
lg.Warn("/health error", zap.String("output", string(d)), zap.Int("status-code", http.StatusServiceUnavailable))
lg.Warn(fmt.Sprintf("%s error", endpoint), zap.String("output", string(d)), zap.Int("status-code", http.StatusServiceUnavailable))
return
}
w.WriteHeader(http.StatusOK)
w.Write(d)
lg.Debug("/health OK", zap.Int("status-code", http.StatusOK))
lg.Debug(fmt.Sprintf("%s ok", endpoint), zap.Int("status-code", http.StatusOK))
}
}

Expand Down Expand Up @@ -141,7 +169,7 @@ func getSerializableFlag(r *http.Request) bool {

// TODO: etcdserver.ErrNoLeader in health API

func checkAlarms(lg *zap.Logger, srv ServerHealth, excludedAlarms AlarmSet) Health {
func checkAlarms(lg *zap.Logger, srv ServerHealth, excludedAlarms AlarmSet, healthType string) Health {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The healthType string isn't used at all, can we remove it?

Suggested change
func checkAlarms(lg *zap.Logger, srv ServerHealth, excludedAlarms AlarmSet, healthType string) Health {
func checkAlarms(lg *zap.Logger, srv ServerHealth, excludedAlarms AlarmSet) Health {

h := Health{Health: "true"}
as := srv.Alarms()
if len(as) > 0 {
Expand Down
Loading