diff --git a/pkg/helper/mock/mock_helper.go b/pkg/helper/mock/mock_helper.go index 3c5712861..e3509ccb0 100644 --- a/pkg/helper/mock/mock_helper.go +++ b/pkg/helper/mock/mock_helper.go @@ -929,6 +929,20 @@ func (mr *MockHostHelpersInterfaceMockRecorder) RemovePersistPFNameUdevRule(pfPc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePersistPFNameUdevRule", reflect.TypeOf((*MockHostHelpersInterface)(nil).RemovePersistPFNameUdevRule), pfPciAddress) } +// RemovePfAppliedStatus mocks base method. +func (m *MockHostHelpersInterface) RemovePfAppliedStatus(pciAddress string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemovePfAppliedStatus", pciAddress) + ret0, _ := ret[0].(error) + return ret0 +} + +// RemovePfAppliedStatus indicates an expected call of RemovePfAppliedStatus. +func (mr *MockHostHelpersInterfaceMockRecorder) RemovePfAppliedStatus(pciAddress interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePfAppliedStatus", reflect.TypeOf((*MockHostHelpersInterface)(nil).RemovePfAppliedStatus), pciAddress) +} + // RemoveVfRepresentorUdevRule mocks base method. func (m *MockHostHelpersInterface) RemoveVfRepresentorUdevRule(pfPciAddress string) error { m.ctrl.T.Helper() diff --git a/pkg/host/internal/sriov/sriov.go b/pkg/host/internal/sriov/sriov.go index 2fc8438fe..f7d1e0970 100644 --- a/pkg/host/internal/sriov/sriov.go +++ b/pkg/host/internal/sriov/sriov.go @@ -798,6 +798,13 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt, log.Log.V(2).Info("checkForConfigAndReset(): PF name with pci address was externally created skipping the device reset", "pf-name", ifaceStatus.Name, "address", ifaceStatus.PciAddress) + + // remove pf status from host + err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress) + if err != nil { + return err + } + return nil } err = s.removeUdevRules(ifaceStatus.PciAddress) @@ -809,6 +816,12 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt, return err } + // remove pf status from host + err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress) + if err != nil { + return err + } + return nil } diff --git a/pkg/host/internal/sriov/sriov_test.go b/pkg/host/internal/sriov/sriov_test.go index 9747b1098..aab7709a9 100644 --- a/pkg/host/internal/sriov/sriov_test.go +++ b/pkg/host/internal/sriov/sriov_test.go @@ -347,6 +347,7 @@ var _ = Describe("SRIOV", func() { PciAddress: "0000:d8:00.0", NumVfs: 2, }, true, nil) + storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil) netlinkLibMock.EXPECT().DevLinkGetDeviceByName("pci", "0000:d8:00.0").Return( &netlink.DevlinkDevice{Attrs: netlink.DevlinkDevAttrs{Eswitch: netlink.DevlinkDevEswitchAttr{Mode: "legacy"}}}, nil) @@ -374,6 +375,7 @@ var _ = Describe("SRIOV", func() { NumVfs: 2, ExternallyManaged: true, }, true, nil) + storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil) Expect(s.ConfigSriovInterfaces(storeManagerMode, []sriovnetworkv1.Interface{}, []sriovnetworkv1.InterfaceExt{ diff --git a/pkg/host/store/mock/mock_store.go b/pkg/host/store/mock/mock_store.go index 2e0071dfd..d405543a2 100644 --- a/pkg/host/store/mock/mock_store.go +++ b/pkg/host/store/mock/mock_store.go @@ -79,6 +79,20 @@ func (mr *MockManagerInterfaceMockRecorder) LoadPfsStatus(pciAddress interface{} return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LoadPfsStatus", reflect.TypeOf((*MockManagerInterface)(nil).LoadPfsStatus), pciAddress) } +// RemovePfAppliedStatus mocks base method. +func (m *MockManagerInterface) RemovePfAppliedStatus(pciAddress string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemovePfAppliedStatus", pciAddress) + ret0, _ := ret[0].(error) + return ret0 +} + +// RemovePfAppliedStatus indicates an expected call of RemovePfAppliedStatus. +func (mr *MockManagerInterfaceMockRecorder) RemovePfAppliedStatus(pciAddress interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePfAppliedStatus", reflect.TypeOf((*MockManagerInterface)(nil).RemovePfAppliedStatus), pciAddress) +} + // SaveLastPfAppliedStatus mocks base method. func (m *MockManagerInterface) SaveLastPfAppliedStatus(PfInfo *v1.Interface) error { m.ctrl.T.Helper() diff --git a/pkg/host/store/store.go b/pkg/host/store/store.go index 77e4bbd32..67c0b17e3 100644 --- a/pkg/host/store/store.go +++ b/pkg/host/store/store.go @@ -20,6 +20,7 @@ import ( type ManagerInterface interface { ClearPCIAddressFolder() error SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) error + RemovePfAppliedStatus(pciAddress string) error LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) GetCheckPointNodeState() (*sriovnetworkv1.SriovNetworkNodeState, error) @@ -111,6 +112,17 @@ func (s *manager) SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) erro return err } +func (s *manager) RemovePfAppliedStatus(pciAddress string) error { + hostExtension := utils.GetHostExtension() + pathFile := filepath.Join(hostExtension, consts.PfAppliedConfig, pciAddress) + err := os.RemoveAll(pathFile) + if err != nil { + log.Log.Error(err, "failed to remove PF status", "pathFile", pathFile) + return err + } + return nil +} + // LoadPfsStatus convert the /etc/sriov-operator/pci/ json to pfstatus // returns false if the file doesn't exist. func (s *manager) LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) { diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 87363464b..e5e9a9913 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -6,6 +6,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins" mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox" @@ -143,8 +144,25 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS processedNics[pciPrefix] = true pciAddress := pciPrefix + "0" + // Skip devices not configured by the operator + isConfigured, err := p.nicConfiguredByOperator(portsMap) + if err != nil { + return false, false, err + } + if !isConfigured { + log.Log.V(2).Info("None of the ports are configured by the operator skipping firmware reset", + "portMap", portsMap) + continue + } + // Skip externally managed NICs - if p.nicHasExternallyManagedPFs(portsMap) { + hasExternally, err := p.nicHasExternallyManagedPFs(portsMap) + if err != nil { + return false, false, err + } + if hasExternally { + log.Log.V(2).Info("One of the ports is configured as externally managed skipping firmware reset", + "portMap", portsMap) continue } @@ -189,20 +207,45 @@ func (p *MellanoxPlugin) Apply() error { // nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed // in StoreManagerInterface. -func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool { +func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) (bool, error) { for _, iface := range nicPortsMap { pfStatus, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress) if err != nil { - log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress) - continue + // nolint:goconst + log.Log.Error(err, "failed to load PF status from disk. "+ + "This should not happen, to overcome config daemon stuck, "+ + "please remove the PCI file on the host under the operator configuration path", + "path", consts.PfAppliedConfig, "pciAddress", iface.PciAddress) + return false, err } if !exist { continue } if pfStatus.ExternallyManaged { log.Log.V(2).Info("PF is extenally managed, skip FW TotalVfs reset") - return true + return true, nil } } - return false + return false, nil +} + +// nicConfiguredByOperator returns true if one of the ports(interface) of the NIC is configured by operator +func (p *MellanoxPlugin) nicConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) (bool, error) { + for _, iface := range nicPortsMap { + _, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress) + if err != nil { + // nolint:goconst + log.Log.Error(err, "failed to load PF status from disk. "+ + "This should not happen, to overcome config daemon stuck, "+ + "please remove the PCI file on the host under the operator configuration path", + "path", consts.PfAppliedConfig, "pciAddress", iface.PciAddress) + return false, err + } + if exist { + log.Log.V(2).Info("PF configured by the operator", "interface", iface) + return true, nil + } + } + + return false, nil } diff --git a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go index 8bb599fb9..1713f9d6e 100644 --- a/test/conformance/tests/test_sriov_operator.go +++ b/test/conformance/tests/test_sriov_operator.go @@ -1199,7 +1199,7 @@ var _ = Describe("[sriov] operator", func() { It("Should be possible to create a vfio-pci resource and allocate to a pod", func() { By("creating a vfio-pci node policy") resourceName := "testvfio" - _, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci") + vfioPolicy, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci") Expect(err).ToNot(HaveOccurred()) By("waiting for the node state to be updated") @@ -1226,6 +1226,11 @@ var _ = Describe("[sriov] operator", func() { return allocatable }, 10*time.Minute, time.Second).Should(Equal(int64(5))) + By("validate the pf info exist on host") + output, _, err := runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("1")) + By("Creating sriov network to use the vfio device") sriovNetwork := &sriovv1.SriovNetwork{ ObjectMeta: metav1.ObjectMeta{ @@ -1255,6 +1260,25 @@ var _ = Describe("[sriov] operator", func() { networkStatusJSON, exist := firstPod.Annotations["k8s.v1.cni.cncf.io/network-status"] Expect(exist).To(BeTrue()) Expect(networkStatusJSON).To(ContainSubstring(fmt.Sprintf("\"mtu\": %d", vfioNic.Mtu))) + + By("deleting the policy") + err = clients.Delete(context.Background(), vfioPolicy, &runtimeclient.DeleteOptions{}) + Expect(err).ToNot(HaveOccurred()) + WaitForSRIOVStable() + + Eventually(func() int64 { + testedNode, err := clients.CoreV1Interface.Nodes().Get(context.Background(), vfioNode, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + resNum := testedNode.Status.Allocatable[corev1.ResourceName("openshift.io/"+resourceName)] + allocatable, _ := resNum.AsInt64() + return allocatable + }, 2*time.Minute, time.Second).Should(Equal(int64(0))) + + By("validate the pf info doesn't exist on the host anymore") + output, _, err = runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("0")) + }) }) @@ -2173,15 +2197,19 @@ var _ = Describe("[sriov] operator", func() { Context("ExternallyManaged Validation", func() { numVfs := 5 var node string - var nic *sriovv1.InterfaceExt + var nic sriovv1.InterfaceExt externallyManage := func(policy *sriovv1.SriovNetworkNodePolicy) { policy.Spec.ExternallyManaged = true } execute.BeforeAll(func() { - var err error - node, nic, err = sriovInfos.FindOneSriovNodeAndDevice() - Expect(err).ToNot(HaveOccurred()) + node, nic = sriovInfos.FindOneVfioSriovDevice() + }) + + BeforeEach(func() { + if node == "" { + Skip("not suitable device found for the test") + } By("Using device " + nic.Name + " on node " + node) }) @@ -2242,6 +2270,11 @@ var _ = Describe("[sriov] operator", func() { return allocatable }, 3*time.Minute, time.Second).Should(Equal(int64(numVfs))) + By("validate the pf info exist on host") + output, _, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("1")) + By("deleting the policy") err = clients.Delete(context.Background(), sriovPolicy, &runtimeclient.DeleteOptions{}) Expect(err).ToNot(HaveOccurred()) @@ -2256,11 +2289,16 @@ var _ = Describe("[sriov] operator", func() { }, 2*time.Minute, time.Second).Should(Equal(int64(0))) By("checking the virtual functions are still on the host") - output, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) + output, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) Expect(err).ToNot(HaveOccurred()) Expect(errOutput).To(Equal("")) Expect(output).To(ContainSubstring("5")) + By("validate the pf info doesn't exist on the host anymore") + output, _, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("0")) + By("cleaning the manual sriov created") _, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo 0 > /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) Expect(err).ToNot(HaveOccurred())