From ceee8e4659c8cfe11842a75563b2f39d5f899ce0 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 11 Jul 2024 21:48:29 +0300 Subject: [PATCH 1/5] implement RemovePfAppliedStatus function in store Signed-off-by: Sebastian Sch --- pkg/helper/mock/mock_helper.go | 14 ++++++++++++++ pkg/host/store/mock/mock_store.go | 14 ++++++++++++++ pkg/host/store/store.go | 12 ++++++++++++ 3 files changed, 40 insertions(+) diff --git a/pkg/helper/mock/mock_helper.go b/pkg/helper/mock/mock_helper.go index 3c5712861..e3509ccb0 100644 --- a/pkg/helper/mock/mock_helper.go +++ b/pkg/helper/mock/mock_helper.go @@ -929,6 +929,20 @@ func (mr *MockHostHelpersInterfaceMockRecorder) RemovePersistPFNameUdevRule(pfPc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePersistPFNameUdevRule", reflect.TypeOf((*MockHostHelpersInterface)(nil).RemovePersistPFNameUdevRule), pfPciAddress) } +// RemovePfAppliedStatus mocks base method. +func (m *MockHostHelpersInterface) RemovePfAppliedStatus(pciAddress string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemovePfAppliedStatus", pciAddress) + ret0, _ := ret[0].(error) + return ret0 +} + +// RemovePfAppliedStatus indicates an expected call of RemovePfAppliedStatus. +func (mr *MockHostHelpersInterfaceMockRecorder) RemovePfAppliedStatus(pciAddress interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePfAppliedStatus", reflect.TypeOf((*MockHostHelpersInterface)(nil).RemovePfAppliedStatus), pciAddress) +} + // RemoveVfRepresentorUdevRule mocks base method. func (m *MockHostHelpersInterface) RemoveVfRepresentorUdevRule(pfPciAddress string) error { m.ctrl.T.Helper() diff --git a/pkg/host/store/mock/mock_store.go b/pkg/host/store/mock/mock_store.go index 2e0071dfd..d405543a2 100644 --- a/pkg/host/store/mock/mock_store.go +++ b/pkg/host/store/mock/mock_store.go @@ -79,6 +79,20 @@ func (mr *MockManagerInterfaceMockRecorder) LoadPfsStatus(pciAddress interface{} return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LoadPfsStatus", reflect.TypeOf((*MockManagerInterface)(nil).LoadPfsStatus), pciAddress) } +// RemovePfAppliedStatus mocks base method. +func (m *MockManagerInterface) RemovePfAppliedStatus(pciAddress string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemovePfAppliedStatus", pciAddress) + ret0, _ := ret[0].(error) + return ret0 +} + +// RemovePfAppliedStatus indicates an expected call of RemovePfAppliedStatus. +func (mr *MockManagerInterfaceMockRecorder) RemovePfAppliedStatus(pciAddress interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePfAppliedStatus", reflect.TypeOf((*MockManagerInterface)(nil).RemovePfAppliedStatus), pciAddress) +} + // SaveLastPfAppliedStatus mocks base method. func (m *MockManagerInterface) SaveLastPfAppliedStatus(PfInfo *v1.Interface) error { m.ctrl.T.Helper() diff --git a/pkg/host/store/store.go b/pkg/host/store/store.go index 77e4bbd32..67c0b17e3 100644 --- a/pkg/host/store/store.go +++ b/pkg/host/store/store.go @@ -20,6 +20,7 @@ import ( type ManagerInterface interface { ClearPCIAddressFolder() error SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) error + RemovePfAppliedStatus(pciAddress string) error LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) GetCheckPointNodeState() (*sriovnetworkv1.SriovNetworkNodeState, error) @@ -111,6 +112,17 @@ func (s *manager) SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) erro return err } +func (s *manager) RemovePfAppliedStatus(pciAddress string) error { + hostExtension := utils.GetHostExtension() + pathFile := filepath.Join(hostExtension, consts.PfAppliedConfig, pciAddress) + err := os.RemoveAll(pathFile) + if err != nil { + log.Log.Error(err, "failed to remove PF status", "pathFile", pathFile) + return err + } + return nil +} + // LoadPfsStatus convert the /etc/sriov-operator/pci/ json to pfstatus // returns false if the file doesn't exist. func (s *manager) LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) { From e74bcf95db769e66110be2706d0e94773fce56ce Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 11 Jul 2024 21:48:55 +0300 Subject: [PATCH 2/5] Remove store file on reset Signed-off-by: Sebastian Sch --- pkg/host/internal/sriov/sriov.go | 13 +++++++++++++ pkg/host/internal/sriov/sriov_test.go | 2 ++ 2 files changed, 15 insertions(+) diff --git a/pkg/host/internal/sriov/sriov.go b/pkg/host/internal/sriov/sriov.go index 2fc8438fe..f7d1e0970 100644 --- a/pkg/host/internal/sriov/sriov.go +++ b/pkg/host/internal/sriov/sriov.go @@ -798,6 +798,13 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt, log.Log.V(2).Info("checkForConfigAndReset(): PF name with pci address was externally created skipping the device reset", "pf-name", ifaceStatus.Name, "address", ifaceStatus.PciAddress) + + // remove pf status from host + err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress) + if err != nil { + return err + } + return nil } err = s.removeUdevRules(ifaceStatus.PciAddress) @@ -809,6 +816,12 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt, return err } + // remove pf status from host + err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress) + if err != nil { + return err + } + return nil } diff --git a/pkg/host/internal/sriov/sriov_test.go b/pkg/host/internal/sriov/sriov_test.go index 9747b1098..aab7709a9 100644 --- a/pkg/host/internal/sriov/sriov_test.go +++ b/pkg/host/internal/sriov/sriov_test.go @@ -347,6 +347,7 @@ var _ = Describe("SRIOV", func() { PciAddress: "0000:d8:00.0", NumVfs: 2, }, true, nil) + storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil) netlinkLibMock.EXPECT().DevLinkGetDeviceByName("pci", "0000:d8:00.0").Return( &netlink.DevlinkDevice{Attrs: netlink.DevlinkDevAttrs{Eswitch: netlink.DevlinkDevEswitchAttr{Mode: "legacy"}}}, nil) @@ -374,6 +375,7 @@ var _ = Describe("SRIOV", func() { NumVfs: 2, ExternallyManaged: true, }, true, nil) + storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil) Expect(s.ConfigSriovInterfaces(storeManagerMode, []sriovnetworkv1.Interface{}, []sriovnetworkv1.InterfaceExt{ From 06b0ce5b160428bfdd2d12f862b2ba27507a53dd Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 11 Jul 2024 21:49:33 +0300 Subject: [PATCH 3/5] Skip firmware reset for devices we don't control Signed-off-by: Sebastian Sch --- pkg/plugins/mellanox/mellanox_plugin.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 87363464b..353080e42 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -143,6 +143,11 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS processedNics[pciPrefix] = true pciAddress := pciPrefix + "0" + // Skip devices not configured by the operator + if p.nicNotConfiguredByOperator(portsMap) { + continue + } + // Skip externally managed NICs if p.nicHasExternallyManagedPFs(portsMap) { continue @@ -206,3 +211,19 @@ func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriov } return false } + +// nicNotConfiguredByOperator returns true if one of the ports(interface) of the NIC is not configured by operator +func (p *MellanoxPlugin) nicNotConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool { + for _, iface := range nicPortsMap { + _, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress) + if err != nil { + log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress) + continue + } + if exist { + log.Log.V(2).Info("PF configured by the operator", "interface", iface) + return true + } + } + return false +} From 0e2542e4e69d29efed54d9cd4709fd0608a9da1f Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 11 Jul 2024 21:50:04 +0300 Subject: [PATCH 4/5] implement functional test for both regular and externally manage Signed-off-by: Sebastian Sch --- test/conformance/tests/test_sriov_operator.go | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go index 8bb599fb9..1713f9d6e 100644 --- a/test/conformance/tests/test_sriov_operator.go +++ b/test/conformance/tests/test_sriov_operator.go @@ -1199,7 +1199,7 @@ var _ = Describe("[sriov] operator", func() { It("Should be possible to create a vfio-pci resource and allocate to a pod", func() { By("creating a vfio-pci node policy") resourceName := "testvfio" - _, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci") + vfioPolicy, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci") Expect(err).ToNot(HaveOccurred()) By("waiting for the node state to be updated") @@ -1226,6 +1226,11 @@ var _ = Describe("[sriov] operator", func() { return allocatable }, 10*time.Minute, time.Second).Should(Equal(int64(5))) + By("validate the pf info exist on host") + output, _, err := runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("1")) + By("Creating sriov network to use the vfio device") sriovNetwork := &sriovv1.SriovNetwork{ ObjectMeta: metav1.ObjectMeta{ @@ -1255,6 +1260,25 @@ var _ = Describe("[sriov] operator", func() { networkStatusJSON, exist := firstPod.Annotations["k8s.v1.cni.cncf.io/network-status"] Expect(exist).To(BeTrue()) Expect(networkStatusJSON).To(ContainSubstring(fmt.Sprintf("\"mtu\": %d", vfioNic.Mtu))) + + By("deleting the policy") + err = clients.Delete(context.Background(), vfioPolicy, &runtimeclient.DeleteOptions{}) + Expect(err).ToNot(HaveOccurred()) + WaitForSRIOVStable() + + Eventually(func() int64 { + testedNode, err := clients.CoreV1Interface.Nodes().Get(context.Background(), vfioNode, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + resNum := testedNode.Status.Allocatable[corev1.ResourceName("openshift.io/"+resourceName)] + allocatable, _ := resNum.AsInt64() + return allocatable + }, 2*time.Minute, time.Second).Should(Equal(int64(0))) + + By("validate the pf info doesn't exist on the host anymore") + output, _, err = runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("0")) + }) }) @@ -2173,15 +2197,19 @@ var _ = Describe("[sriov] operator", func() { Context("ExternallyManaged Validation", func() { numVfs := 5 var node string - var nic *sriovv1.InterfaceExt + var nic sriovv1.InterfaceExt externallyManage := func(policy *sriovv1.SriovNetworkNodePolicy) { policy.Spec.ExternallyManaged = true } execute.BeforeAll(func() { - var err error - node, nic, err = sriovInfos.FindOneSriovNodeAndDevice() - Expect(err).ToNot(HaveOccurred()) + node, nic = sriovInfos.FindOneVfioSriovDevice() + }) + + BeforeEach(func() { + if node == "" { + Skip("not suitable device found for the test") + } By("Using device " + nic.Name + " on node " + node) }) @@ -2242,6 +2270,11 @@ var _ = Describe("[sriov] operator", func() { return allocatable }, 3*time.Minute, time.Second).Should(Equal(int64(numVfs))) + By("validate the pf info exist on host") + output, _, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("1")) + By("deleting the policy") err = clients.Delete(context.Background(), sriovPolicy, &runtimeclient.DeleteOptions{}) Expect(err).ToNot(HaveOccurred()) @@ -2256,11 +2289,16 @@ var _ = Describe("[sriov] operator", func() { }, 2*time.Minute, time.Second).Should(Equal(int64(0))) By("checking the virtual functions are still on the host") - output, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) + output, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) Expect(err).ToNot(HaveOccurred()) Expect(errOutput).To(Equal("")) Expect(output).To(ContainSubstring("5")) + By("validate the pf info doesn't exist on the host anymore") + output, _, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("0")) + By("cleaning the manual sriov created") _, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo 0 > /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) Expect(err).ToNot(HaveOccurred()) From d2133d84ddf0ef7870e22b61f520ad72b643a086 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Mon, 22 Jul 2024 14:29:53 +0300 Subject: [PATCH 5/5] Return error on mlx plugin on firmware reset check This commit makes the functions to return error if we are not able to read from the configuration cache on the node. Also this commit fix a bug when checking if a MLX card is controlled by the operator Signed-off-by: Sebastian Sch --- pkg/plugins/mellanox/mellanox_plugin.go | 48 ++++++++++++++++++------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 353080e42..e5e9a9913 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -6,6 +6,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins" mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox" @@ -144,12 +145,24 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS pciAddress := pciPrefix + "0" // Skip devices not configured by the operator - if p.nicNotConfiguredByOperator(portsMap) { + isConfigured, err := p.nicConfiguredByOperator(portsMap) + if err != nil { + return false, false, err + } + if !isConfigured { + log.Log.V(2).Info("None of the ports are configured by the operator skipping firmware reset", + "portMap", portsMap) continue } // Skip externally managed NICs - if p.nicHasExternallyManagedPFs(portsMap) { + hasExternally, err := p.nicHasExternallyManagedPFs(portsMap) + if err != nil { + return false, false, err + } + if hasExternally { + log.Log.V(2).Info("One of the ports is configured as externally managed skipping firmware reset", + "portMap", portsMap) continue } @@ -194,36 +207,45 @@ func (p *MellanoxPlugin) Apply() error { // nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed // in StoreManagerInterface. -func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool { +func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) (bool, error) { for _, iface := range nicPortsMap { pfStatus, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress) if err != nil { - log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress) - continue + // nolint:goconst + log.Log.Error(err, "failed to load PF status from disk. "+ + "This should not happen, to overcome config daemon stuck, "+ + "please remove the PCI file on the host under the operator configuration path", + "path", consts.PfAppliedConfig, "pciAddress", iface.PciAddress) + return false, err } if !exist { continue } if pfStatus.ExternallyManaged { log.Log.V(2).Info("PF is extenally managed, skip FW TotalVfs reset") - return true + return true, nil } } - return false + return false, nil } -// nicNotConfiguredByOperator returns true if one of the ports(interface) of the NIC is not configured by operator -func (p *MellanoxPlugin) nicNotConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool { +// nicConfiguredByOperator returns true if one of the ports(interface) of the NIC is configured by operator +func (p *MellanoxPlugin) nicConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) (bool, error) { for _, iface := range nicPortsMap { _, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress) if err != nil { - log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress) - continue + // nolint:goconst + log.Log.Error(err, "failed to load PF status from disk. "+ + "This should not happen, to overcome config daemon stuck, "+ + "please remove the PCI file on the host under the operator configuration path", + "path", consts.PfAppliedConfig, "pciAddress", iface.PciAddress) + return false, err } if exist { log.Log.V(2).Info("PF configured by the operator", "interface", iface) - return true + return true, nil } } - return false + + return false, nil }