Refactor logging to give more visibility to check and heal services #533

Merged: 3 commits, Dec 1, 2022
Changes from all commits
42 changes: 42 additions & 0 deletions mocks/operator/redisfailover/service/RedisFailoverCheck.go

Some generated files are not rendered by default.

59 changes: 28 additions & 31 deletions operator/redisfailover/checker.go
@@ -102,17 +102,15 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) error {
 	// Sentinel has not death nodes
 	// Sentinel knows the correct slave number
 
-	err := r.rfChecker.CheckRedisNumber(rf)
-	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err)
-	if err != nil {
-		r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset")
+	if !r.rfChecker.IsRedisRunning(rf) {
+		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
+		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile")
 		return nil
 	}
 
-	err = r.rfChecker.CheckSentinelNumber(rf)
-	setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err)
-	if err != nil {
-		r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment")
+	if !r.rfChecker.IsSentinelRunning(rf) {
+		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
+		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile")
 		return nil
 	}
 
@@ -122,7 +120,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) error {
 	}
 	switch nMasters {
 	case 0:
-		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("No masters detected"))
+		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected"))
 		redisesIP, err := r.rfChecker.GetRedisesIPs(rf)
 		if err != nil {
 			return err
@@ -138,34 +136,34 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) error {
 			return err2
 		}
 		if minTime > timeToPrepare {
-			r.logger.Debugf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds())
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds())
 			// We can consider there's an error
 			if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil {
 				return err2
 			}
 		} else {
 			// We'll wait until failover is done
-			r.logger.Debug("No master found, wait until failover")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("No master found, wait until failover")
 			return nil
 		}
 	case 1:
 		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil)
 	default:
-		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("Multiple masters detected"))
-		return errors.New("More than one master, fix manually")
+		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("multiple masters detected"))
+		return errors.New("more than one master, fix manually")
 	}
 
 	master, err := r.rfChecker.GetMasterIP(rf)
 	if err != nil {
 		return err
 	}
 
-	err2 := r.rfChecker.CheckAllSlavesFromMaster(master, rf)
+	err = r.rfChecker.CheckAllSlavesFromMaster(master, rf)
 	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE, err)
-	if err2 != nil {
-		r.logger.Debug("Not all slaves have the same master")
-		if err3 := r.rfHealer.SetMasterOnAll(master, rf); err3 != nil {
-			return err3
+	if err != nil {
+		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Slave not associated to master: %s", err.Error())
+		if err = r.rfHealer.SetMasterOnAll(master, rf); err != nil {
+			return err
 		}
 	}
 
@@ -190,7 +188,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) error {
 		err = r.rfChecker.CheckSentinelMonitor(sip, master, port)
 		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err)
 		if err != nil {
-			r.logger.Debug("Sentinel is not monitoring the correct master")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error())
 			if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil {
 				return err
 			}
@@ -200,14 +198,14 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) error {
 	}
 }
 
 func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error {
-	err := r.rfChecker.CheckRedisNumber(rf)
-	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err)
-	if err != nil {
-		r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset")
+	if !r.rfChecker.IsRedisRunning(rf) {
+		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
+		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile")
 		return nil
 	}
 
-	err = r.UpdateRedisesPods(rf)
+	err := r.UpdateRedisesPods(rf)
 	if err != nil {
 		return err
 	}
@@ -223,10 +221,9 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error {
 	}
 
 	if rf.SentinelsAllowed() {
-		err = r.rfChecker.CheckSentinelNumber(rf)
-		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err)
-		if err != nil {
-			r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment")
+		if !r.rfChecker.IsSentinelRunning(rf) {
+			setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile")
 			return nil
 		}
 
@@ -238,7 +235,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error {
 		err = r.rfChecker.CheckSentinelMonitor(sip, bootstrapSettings.Host, bootstrapSettings.Port)
 		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err)
 		if err != nil {
-			r.logger.Debug("Sentinel is not monitoring the correct master")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error())
 			if err := r.rfHealer.NewSentinelMonitorWithPort(sip, bootstrapSettings.Host, bootstrapSettings.Port, rf); err != nil {
 				return err
 			}
@@ -267,7 +264,7 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFailover) error {
 		err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf)
 		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, sip, err)
 		if err != nil {
-			r.logger.Debug("Sentinel has more sentinel in memory than spected")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of sentinels in memory. resetting", sip)
 			if err := r.rfHealer.RestoreSentinel(sip); err != nil {
 				return err
 			}
@@ -278,7 +275,7 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFailover) error {
 		err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf)
 		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, sip, err)
 		if err != nil {
-			r.logger.Debug("Sentinel has more slaves in memory than spected")
+			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of expected slaves in memory. resetting", sip)
			if err := r.rfHealer.RestoreSentinel(sip); err != nil {
 				return err
 			}
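
The hunks above are the core of the change: every log line in the check-and-heal loop now carries the owning resource's name and namespace as structured fields, and messages that announce a healing action are promoted from Debug to Warning. The operator logs through its own logger interface, but assuming a logrus-style backend underneath, a minimal standalone sketch of the before/after behavior looks like this (the resource name and namespace are placeholder values, not taken from the PR):

package main

import (
	log "github.com/sirupsen/logrus"
)

func main() {
	log.SetLevel(log.DebugLevel)

	// Before: a bare debug line. With several RedisFailovers managed by one
	// operator process, there is no way to tell which resource emitted it.
	log.Debug("Sentinel is not monitoring the correct master")

	// After: structured fields identify the resource, and the level reflects
	// that the operator is about to take a healing action. Typical output:
	//   level=warning msg="Fixing sentinel not monitoring expected master: ..."
	//     namespace=default redisfailover=rf-sample
	log.WithField("redisfailover", "rf-sample").
		WithField("namespace", "default").
		Warning("Fixing sentinel not monitoring expected master: dial timeout")
}

The two fields are what make a given warning attributable to a specific cluster once the logs of a shared operator are aggregated.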
6 changes: 3 additions & 3 deletions operator/redisfailover/checker_test.go
@@ -265,14 +265,14 @@ func TestCheckAndHeal(t *testing.T) {
 			mrfh := &mRFService.RedisFailoverHeal{}
 
 			if test.redisCheckNumberOK {
-				mrfc.On("CheckRedisNumber", rf).Once().Return(nil)
+				mrfc.On("IsRedisRunning", rf).Once().Return(true)
 			} else {
 				continueTests = false
-				mrfc.On("CheckRedisNumber", rf).Once().Return(errors.New(""))
+				mrfc.On("IsRedisRunning", rf).Once().Return(false)
 			}
 
 			if allowSentinels {
-				mrfc.On("CheckSentinelNumber", rf).Once().Return(nil)
+				mrfc.On("IsSentinelRunning", rf).Once().Return(true)
 			}
 
 			if bootstrappingTests && continueTests {
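
Because the new checker methods return booleans instead of errors, the test arms its mocks with true/false rather than nil/error. The same testify pattern, sketched self-contained with a hypothetical FakeChecker (the real mock for RedisFailoverCheck is generated under mocks/ and is not reproduced here):

package main

import (
	"fmt"

	"github.com/stretchr/testify/mock"
)

// FakeChecker is an illustrative hand-written mock; only the mock.Mock
// embedding and the Called/Return plumbing matter for the pattern.
type FakeChecker struct {
	mock.Mock
}

func (m *FakeChecker) IsRedisRunning(name string) bool {
	args := m.Called(name)
	return args.Bool(0)
}

func main() {
	c := &FakeChecker{}
	// Arm the probe with a boolean outcome instead of an error value.
	c.On("IsRedisRunning", "rf1").Once().Return(false)
	fmt.Println(c.IsRedisRunning("rf1")) // false
}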
35 changes: 33 additions & 2 deletions operator/redisfailover/service/check.go
@@ -34,6 +34,9 @@ type RedisFailoverCheck interface {
 	GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error)
 	GetRedisRevisionHash(podName string, rFailover *redisfailoverv1.RedisFailover) (string, error)
 	CheckRedisSlavesReady(slaveIP string, rFailover *redisfailoverv1.RedisFailover) (bool, error)
+	IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool
+	IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool
+	IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool
 }
 
 // RedisFailoverChecker is our implementation of RedisFailoverCheck interface
@@ -168,7 +171,7 @@ func (r *RedisFailoverChecker) CheckSentinelMonitor(sentinel string, monitor ...string) error {
 		return err
 	}
 	if actualMonitorIP != monitorIP || (monitorPort != "" && monitorPort != actualMonitorPort) {
-		return errors.New("the monitor on the sentinel config does not match with the expected one")
+		return fmt.Errorf("sentinel monitoring %s:%s instead %s:%s", actualMonitorIP, actualMonitorPort, monitorIP, monitorPort)
 	}
 	return nil
 }
@@ -209,11 +212,13 @@ func (r *RedisFailoverChecker) GetNumberMasters(rf *redisfailoverv1.RedisFailover) (int, error) {
 	nMasters := 0
 	rips, err := r.GetRedisesIPs(rf)
 	if err != nil {
+		r.logger.Errorf(err.Error())
 		return nMasters, err
 	}
 
 	password, err := k8s.GetRedisPassword(r.k8sService, rf)
 	if err != nil {
+		r.logger.Errorf("Error getting password: %s", err.Error())
 		return nMasters, err
 	}
 
@@ -274,7 +279,7 @@ func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) {
 		}
 		start := redisNode.Status.StartTime.Round(time.Second)
 		alive := time.Since(start)
-		r.logger.Debugf("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds())
+		r.logger.Infof("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds())
 		if alive < minTime {
 			minTime = alive
 		}
@@ -383,6 +388,32 @@ func (r *RedisFailoverChecker) CheckRedisSlavesReady(ip string, rFailover *redisfailoverv1.RedisFailover) (bool, error) {
 	return r.redisClient.SlaveIsReady(ip, port, password)
 }
 
+// IsRedisRunning returns true if all the pods are Running
+func (r *RedisFailoverChecker) IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool {
+	dp, err := r.k8sService.GetStatefulSetPods(rFailover.Namespace, GetRedisName(rFailover))
+	return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp)
+}
+
+// IsSentinelRunning returns true if all the pods are Running
+func (r *RedisFailoverChecker) IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool {
+	dp, err := r.k8sService.GetDeploymentPods(rFailover.Namespace, GetSentinelName(rFailover))
+	return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp)
+}
+
+// IsClusterRunning returns true if all the pods in the given redisfailover are Running
+func (r *RedisFailoverChecker) IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool {
+	return r.IsSentinelRunning(rFailover) && r.IsRedisRunning(rFailover)
+}
+
 func getRedisPort(p int32) string {
 	return strconv.Itoa(int(p))
 }
+
+func AreAllRunning(pods *corev1.PodList) bool {
+	for _, pod := range pods.Items {
+		if pod.Status.Phase != corev1.PodRunning || pod.DeletionTimestamp != nil {
+			return false
+		}
+	}
+	return true
+}
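
The new helpers consider a workload healthy only when at least the spec'd number of pods exists (the len(dp.Items) > int(Replicas-1) guard) and every pod is in phase Running with no pending deletion. A minimal standalone sketch of that per-pod predicate, assuming only the k8s.io/api/core/v1 types (the fixture values are illustrative):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// areAllRunning mirrors the AreAllRunning helper added above: every pod must
// be in phase Running and must not be marked for deletion.
func areAllRunning(pods *corev1.PodList) bool {
	for _, pod := range pods.Items {
		if pod.Status.Phase != corev1.PodRunning || pod.DeletionTimestamp != nil {
			return false
		}
	}
	return true
}

func main() {
	pods := &corev1.PodList{Items: []corev1.Pod{
		{Status: corev1.PodStatus{Phase: corev1.PodRunning}},
		{Status: corev1.PodStatus{Phase: corev1.PodPending}}, // still starting
	}}
	fmt.Println(areAllRunning(pods)) // false: one pod is not Running yet
}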
87 changes: 87 additions & 0 deletions operator/redisfailover/service/check_test.go
@@ -869,3 +869,90 @@ func TestGetRedisRevisionHash(t *testing.T) {
 	}
 
 }
+
+func TestClusterRunning(t *testing.T) {
+	assert := assert.New(t)
+
+	rf := generateRF()
+
+	allRunning := &corev1.PodList{
+		Items: []corev1.Pod{
+			{
+				Status: corev1.PodStatus{
+					PodIP: "0.0.0.0",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+		},
+	}
+
+	notAllRunning := &corev1.PodList{
+		Items: []corev1.Pod{
+			{
+				Status: corev1.PodStatus{
+					PodIP: "0.0.0.0",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodPending,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+		},
+	}
+
+	notAllReplicas := &corev1.PodList{
+		Items: []corev1.Pod{
+			{
+				Status: corev1.PodStatus{
+					PodIP: "0.0.0.0",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+		},
+	}
+
+	ms := &mK8SService.Services{}
+	ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(allRunning, nil)
+	ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(allRunning, nil)
+	mr := &mRedisService.Client{}
+
+	checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy)
+
+	assert.True(checker.IsClusterRunning(rf))
+
+	ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(allRunning, nil)
+	ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(notAllReplicas, nil)
+	assert.False(checker.IsClusterRunning(rf))
+
+	ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(notAllRunning, nil)
+	ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(allRunning, nil)
+	assert.False(checker.IsClusterRunning(rf))
+
+}
}
Loading