Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NO-JIRA: deferred updates: fix in-place update handling on reboot #1162

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions pkg/tuned/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,8 @@ func (c *Controller) sync(key wqKeyKube) error {
err = util.SetLogLevel(profile.Spec.Config.Verbosity)
if err != nil {
klog.Errorf("failed to set log level %d: %v", profile.Spec.Config.Verbosity, err)
} else {
klog.Infof("set log level %d", profile.Spec.Config.Verbosity)
}
change.reapplySysctl = true
if profile.Spec.Config.TuneDConfig.ReapplySysctl != nil {
Expand Down Expand Up @@ -1029,15 +1031,21 @@ func (c *Controller) changeSyncerTuneD(change Change) (synced bool, err error) {
// Cache the value written to tunedRecommendFile.
c.daemon.recommendedProfile = change.recommendedProfile
klog.V(1).Infof("recommended TuneD profile updated from %q to %q [inplaceUpdate=%v nodeRestart=%v]", prevRecommended, change.recommendedProfile, inplaceUpdate, change.nodeRestart)
changeRecommend = true

if change.deferredMode == util.DeferUpdate && !inplaceUpdate && c.daemon.recoveredRecommendedProfile == change.recommendedProfile {
klog.V(1).Infof("recommended TuneD profile changed; skip TuneD reload [deferred=%v recoveredRecommended=%v]", change.deferredMode, c.daemon.recoveredRecommendedProfile)
// If we get this far, it's either because we detected a node restart or the recommended profile changed.
// If it's a node restart, we just process the change. If it's the latter -- TuneD profile switch, i.e. not an in-place update -- we need further
// logic to distinguish between the "update" and "always" DeferMode.
// For the "update" DeferMode, we only defer profile edits. If we are re-processing a profile which we already applied, then we must
// not reload tuned, otherwise we're missing the point of deferred updates.
// See the test "Profile deferred when applied should trigger changes when applied first, then deferred when edited, if tuned restart should be kept deferred"
// See the commit 3655f22656d4a3aa9f471099305dcd78a9c80320
if !inplaceUpdate && change.deferredMode == util.DeferUpdate && c.daemon.recoveredRecommendedProfile == change.recommendedProfile {
klog.V(1).Infof("reprocessing profile already in effect; this seems a daemon reload. Skip TuneD reload [deferred=%v recoveredRecommended=%v]", change.deferredMode, c.daemon.recoveredRecommendedProfile)
// Reset because we need only once the first time we process the TuneD k8s object. Let's avoid stale data.
c.daemon.recoveredRecommendedProfile = ""
} else {
klog.V(1).Infof("recommended TuneD profile changed; trigger TuneD reload [deferred=%v]", change.deferredMode)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The log still says tuned should be reloaded. I will confuse people working on this in the future.. I think.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the flow will ultimately trigger a tuned reload: see lines around 1065.
But still: how can we improve the logging?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, right! So this is a fix that updates the fingerprint in addition to setting the reload flag. The reload part is not changing. Ok then.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we do want to not set the reload flag if very specific conditions are met, like the one captured in the test mentioned in the commit message, which is constantly failing without this fix

reload = true
changeRecommend = true
}
} else if util.IsImmediateUpdate(change.deferredMode) && (c.daemon.status&scDeferred != 0) {
klog.V(1).Infof("detected deferred update changed to immediate after object update")
Expand Down Expand Up @@ -1727,7 +1735,7 @@ func RunInCluster(stopCh <-chan struct{}, version string) error {
c.daemon.profileFingerprintUnpacked = profileFP
// We should *never* recover `c.daemon.recommendedProfile`.
// NTO operand supervises the TuneD daemon, so we must trigger
// a TuneD restart. Currently, the main trigger for the tuned
// a TuneD restart. Currently, the main reason to trigger a tuned
// reload to be triggered is when the current recommended profile
// is different from the change.recommended profile (see changeSyncerTuneD).
// Hence, we MUST NOT recover the recommended profile name
Expand All @@ -1741,10 +1749,10 @@ func RunInCluster(stopCh <-chan struct{}, version string) error {
deferredFP, isNodeReboot, err := c.recoverAndClearDeferredUpdate()
if err != nil {
klog.ErrorS(err, "unable to recover the pending update")
} else if deferredFP == "" {
klog.Infof("starting: no pending deferred update")
} else if !isNodeReboot {
klog.Infof("starting: does not seem a node reboot, but a daemon restart. Ignoring pending deferred updates (if any)")
} else if deferredFP == "" {
klog.Infof("starting: node reboot, but no pending deferred update")
} else {
klog.Infof("starting: recovered and cleared pending deferred update %q for %s (fingerprint=%q)", recommended, restartReason(isNodeReboot), deferredFP)
c.pendingChange = &Change{
Expand Down
51 changes: 35 additions & 16 deletions test/e2e/deferred/basic.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
util.ExecAndLogCommand("oc", "delete", "-n", ntoconfig.WatchNamespace(), "-f", createdTuned)
}
util.ExecAndLogCommand("oc", "label", "node", targetNode.Name, tunedMatchLabelLater+"-")

checkWorkerNodeIsDefaultState(context.Background(), targetNode)
})

ginkgo.It("should not trigger any actual change", func(ctx context.Context) {
Expand All @@ -89,15 +91,15 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
tunedMutated := setDeferred(tuned.DeepCopy(), ntoutil.DeferAlways)
ginkgo.By(fmt.Sprintf("creating tuned object %s deferred=%v", tunedMutated.Name, ntoutil.GetDeferredUpdateAnnotation(tunedMutated.Annotations)))

_, err = cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
tunedCreated, err := cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred())

createdTuneds = prepend(createdTuneds, tunedPath) // we need the path, not the name
ginkgo.By(fmt.Sprintf("create tuneds: %v", createdTuneds))

gomega.Expect(tuned.Spec.Recommend).ToNot(gomega.BeEmpty(), "tuned %q has empty recommendations", tuned.Name)
gomega.Expect(tuned.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tuned.Name)
expectedProfile := *tuned.Spec.Recommend[0].Profile
gomega.Expect(tunedCreated.Spec.Recommend).ToNot(gomega.BeEmpty(), "tuned %q has empty recommendations", tunedCreated.Name)
gomega.Expect(tunedCreated.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tunedCreated.Name)
expectedProfile := *tunedCreated.Spec.Recommend[0].Profile
ginkgo.By(fmt.Sprintf("expecting Tuned Profile %q to be picked up", expectedProfile))

gomega.Eventually(func() error {
Expand Down Expand Up @@ -178,15 +180,15 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
tunedMutated := setDeferred(tuned.DeepCopy(), ntoutil.DeferAlways)
ginkgo.By(fmt.Sprintf("creating tuned object %s deferred=%v", tunedMutated.Name, ntoutil.GetDeferredUpdateAnnotation(tunedMutated.Annotations)))

_, err = cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
tunedCreated, err := cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred())

createdTuneds = prepend(createdTuneds, tunedPath) // we need the path, not the name
ginkgo.By(fmt.Sprintf("create tuneds: %v", createdTuneds))

gomega.Expect(tuned.Spec.Recommend).ToNot(gomega.BeEmpty(), "tuned %q has empty recommendations", tuned.Name)
gomega.Expect(tuned.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tuned.Name)
expectedProfile := *tuned.Spec.Recommend[0].Profile
gomega.Expect(tunedCreated.Spec.Recommend).ToNot(gomega.BeEmpty(), "tuned %q has empty recommendations", tunedCreated.Name)
gomega.Expect(tunedCreated.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tunedCreated.Name)
expectedProfile := *tunedCreated.Spec.Recommend[0].Profile
ginkgo.By(fmt.Sprintf("expecting Tuned Profile %q to be picked up", expectedProfile))

gomega.Expect(verifData.TargetTunedPod.Status.Phase).To(gomega.Equal(corev1.PodRunning), "TargetTunedPod %s/%s uid %s phase %s", verifData.TargetTunedPod.Namespace, verifData.TargetTunedPod.Name, verifData.TargetTunedPod.UID, verifData.TargetTunedPod.Status.Phase)
Expand Down Expand Up @@ -463,7 +465,12 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())
})

ginkgo.It("should apply defer-on-update immediately after node is labelled", func(ctx context.Context) {
ginkgo.It("should apply defer-on-update immediately after node is labelled", ginkgo.Label("node-label"), func(ctx context.Context) {
nodeLabel := tunedMatchLabelLater
_, ok := targetNode.Labels[nodeLabel]
ginkgo.By(fmt.Sprintf("checking the target node labels: on %q: %q present = %v", targetNode.Name, nodeLabel, ok))
util.Logf("labels on %q: %v", targetNode.Name, targetNode.Labels)

tunedPath := filepath.Join(dirPath, tunedSHMMNI)
ginkgo.By(fmt.Sprintf("loading tuned data from %s (basepath=%s)", tunedPath, dirPath))

Expand All @@ -481,27 +488,39 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
// Change the recommendation to use non-role label to test a scenario where the Tuned is created first (and processed by NTO)
// and only later a node is changed to consume it. This results in a flow where no change is detected on the Tuned level (it is already)
// on the disk, but an update must be triggered.
nodeLabel := tunedMatchLabelLater
tunedMutated.Spec.Recommend[0].Match[0].Label = &nodeLabel

ginkgo.By(fmt.Sprintf("creating tuned object %s deferred=%v", tunedMutated.Name, ntoutil.GetDeferredUpdateAnnotation(tunedMutated.Annotations)))
_, err = cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
tunedCreated, err := cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred())

createdTuneds = prepend(createdTuneds, tunedPath) // we need the path, not the name
ginkgo.By(fmt.Sprintf("create tuneds: %v", createdTuneds))

gomega.Expect(tuned.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tuned.Name)
gomega.Expect(tunedCreated.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tunedCreated.Name)
expectedProfile := *tuned.Spec.Recommend[0].Profile

checkCurrentProfileIsDefaultForWorker(ctx, targetNode.Name, verifData.TargetTunedPod)

ginkgo.By(fmt.Sprintf("labelling node %q with %q", targetNode.Name, nodeLabel))
gomega.Eventually(func() error {
nd, err := cs.CoreV1Interface.Nodes().Get(ctx, targetNode.Name, metav1.GetOptions{})
if err != nil {
return err
}
nd = nd.DeepCopy()
gomega.Expect(nd.Labels).ToNot(gomega.HaveKey(nodeLabel), "node %q has label %q too early", nd.Name, nodeLabel)

gomega.Expect(targetNode.Labels).ToNot(gomega.HaveKey(nodeLabel), "node %q has label %q too early", targetNode.Name, nodeLabel)
targetNode.Labels[nodeLabel] = ""
_, err = cs.CoreV1Interface.Nodes().Update(ctx, targetNode, metav1.UpdateOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred())
nd.Labels[nodeLabel] = ""
_, err = cs.CoreV1Interface.Nodes().Update(ctx, nd, metav1.UpdateOptions{})
return err
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())

ginkgo.By(fmt.Sprintf("expecting Tuned Profile %q to be picked up", expectedProfile))
// feedback for debugging
tmpNd, err := cs.CoreV1Interface.Nodes().Get(ctx, targetNode.Name, metav1.GetOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot get feedback about %q", targetNode.Name)
util.Logf("labels on %q after update: %v", tmpNd.Name, tmpNd.Labels)

gomega.Eventually(func() error {
curProf, err := cs.Profiles(ntoconfig.WatchNamespace()).Get(ctx, targetNode.Name, metav1.GetOptions{})
Expand Down
2 changes: 2 additions & 0 deletions test/e2e/deferred/non_regression.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ var _ = ginkgo.Describe("Profile non-deferred", ginkgo.Label("deferred", "non-re
ginkgo.By(fmt.Sprintf("cluster changes rollback: %q", createdTuned))
util.ExecAndLogCommand("oc", "delete", "-n", ntoconfig.WatchNamespace(), "-f", createdTuned)
}

checkWorkerNodeIsDefaultState(context.Background(), targetNode)
})

ginkgo.It("should trigger changes", func(ctx context.Context) {
Expand Down
53 changes: 53 additions & 0 deletions test/e2e/deferred/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,56 @@ func checkNonTargetWorkerNodesAreUnaffected(ctx context.Context, workerNodes []c
}).WithPolling(10. * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())
}
}

func checkCurrentProfileIsDefaultForWorker(ctx context.Context, targetNodeName string, targetTunedPod *corev1.Pod) {
ginkgo.GinkgoHelper()

gomega.Eventually(func() error {
curProf, err := cs.Profiles(ntoconfig.WatchNamespace()).Get(ctx, targetNodeName, metav1.GetOptions{})
if err != nil {
return err
}
ginkgo.By(fmt.Sprintf("checking profile for target node %q matches expectations about %q", curProf.Name, defaultWorkerProfile))

if len(curProf.Status.Conditions) == 0 {
return fmt.Errorf("missing status conditions")
}
cond := findCondition(curProf.Status.Conditions, tunedv1.TunedProfileApplied)
if cond == nil {
return fmt.Errorf("missing status applied condition")
}
err = checkAppliedConditionOK(cond)
if err != nil {
util.Logf("profile for target node %q does not match expectations about %q: %v", curProf.Name, defaultWorkerProfile, err)
}
recommended, err := getRecommendedProfile(targetTunedPod)
if err != nil {
return err
}
if recommended != defaultWorkerProfile {
return fmt.Errorf("recommended profile is %q expected %q", recommended, defaultWorkerProfile)
}
return err
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())
}

func checkWorkerNodeIsDefaultState(ctx context.Context, targetNode *corev1.Node) {
ginkgo.GinkgoHelper()

var targetTunedPod *corev1.Pod
gomega.Eventually(func() error {
tunedPod, err := util.GetTunedForNode(cs, targetNode)
if err != nil {
return err
}
if tunedPod.Status.Phase != corev1.PodRunning {
return fmt.Errorf("pod %s/%s phase %v not running", tunedPod.Namespace, tunedPod.Name, tunedPod.Status.Phase)
}
targetTunedPod = tunedPod
return nil
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())

checkCurrentProfileIsDefaultForWorker(context.Background(), targetNode.Name, targetTunedPod)

ginkgo.By(fmt.Sprintf("node %q seems to match default worker state", targetNode.Name))
}
38 changes: 7 additions & 31 deletions test/e2e/deferred/restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "slow", "di
if len(createdTuneds) == 0 {
return
}
checkNonTargetWorkerNodesAreUnaffected(context.TODO(), workerNodes, targetNode.Name)

ctx := context.Background()
checkNonTargetWorkerNodesAreUnaffected(ctx, workerNodes, targetNode.Name)
checkWorkerNodeIsDefaultState(ctx, targetNode)
})

ginkgo.Context("the tuned daemon", ginkgo.Label("slow", "pod"), func() {
Expand All @@ -82,6 +85,8 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "slow", "di
verifData := util.MustExtractVerificationOutputAndCommand(cs, targetNode, tunedImmediate)
gomega.Expect(verifData.OutputCurrent).ToNot(gomega.Equal(verifData.OutputExpected), "current output %q already matches expected %q", verifData.OutputCurrent, verifData.OutputExpected)

checkCurrentProfileIsDefaultForWorker(ctx, targetNode.Name, verifData.TargetTunedPod)

tunedMutated := setDeferred(tunedImmediate.DeepCopy(), ntoutil.DeferAlways)
ginkgo.By(fmt.Sprintf("creating tuned object %s deferred=%v", tunedMutated.Name, ntoutil.GetDeferredUpdateAnnotation(tunedMutated.Annotations)))

Expand Down Expand Up @@ -396,37 +401,8 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "slow", "di

ginkgo.By(fmt.Sprintf("checking target node after rollback: %q", targetNode.Name))
_, createdTuneds, _ = popleft(createdTuneds)
gomega.Eventually(func() error {
curProf, err := cs.Profiles(ntoconfig.WatchNamespace()).Get(ctx, targetNode.Name, metav1.GetOptions{})
if err != nil {
return err
}
ginkgo.By(fmt.Sprintf("checking profile for target node %q matches expectations about %q", curProf.Name, expectedProfile))
gomega.Expect(curProf.Name).To(gomega.Equal(expectedProfile), "checking profile for worker node %q matches expectations %q", curProf.Name, expectedProfile)

if len(curProf.Status.Conditions) == 0 {
return fmt.Errorf("missing status conditions")
}
cond := findCondition(curProf.Status.Conditions, tunedv1.TunedProfileApplied)
if cond == nil {
return fmt.Errorf("missing status applied condition")
}
err = checkAppliedConditionOK(cond)
if err != nil {
util.Logf("profile for target node %q does not match expectations about %q: %v", curProf.Name, expectedProfile, err)
}

ginkgo.By(fmt.Sprintf("checking real node conditions are restored pristine: %v -> %s", verifData.CommandArgs, verifData.OutputCurrent))
out, err := util.ExecCmdInPod(targetTunedPod, verifData.CommandArgs...)
if err != nil {
return err
}
out = strings.TrimSpace(out)
if out != verifData.OutputCurrent {
return fmt.Errorf("got: %s; expected: %s", out, verifData.OutputCurrent)
}
return nil
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())
checkWorkerNodeIsDefaultState(ctx, targetNode)

checkNonTargetWorkerNodesAreUnaffected(ctx, workerNodes, targetNode.Name)
})
Expand Down
2 changes: 2 additions & 0 deletions test/e2e/deferred/updates.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "inplace-up
ginkgo.By(fmt.Sprintf("cluster changes rollback: %q", createdTuned))
util.ExecAndLogCommand("oc", "delete", "-n", ntoconfig.WatchNamespace(), "-f", createdTuned)
}

checkWorkerNodeIsDefaultState(context.Background(), targetNode)
})

ginkgo.It("should trigger changes when applied fist, then deferred when edited", func(ctx context.Context) {
Expand Down
2 changes: 2 additions & 0 deletions test/e2e/testing_manifests/deferred/tuned-basic-00.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 20
profile: test-shmmni
operand:
verbosity: 4
2 changes: 2 additions & 0 deletions test/e2e/testing_manifests/deferred/tuned-basic-10.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 15
profile: test-cpu-energy
operand:
verbosity: 4
2 changes: 2 additions & 0 deletions test/e2e/testing_manifests/deferred/tuned-basic-20.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 15
profile: test-vm-latency
operand:
verbosity: 4
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 15
profile: test-vmlat-inplace
operand:
verbosity: 4
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 15
profile: test-vmlat-inplace
operand:
verbosity: 4