Skip to content

Commit

Permalink
Merge pull request #980 from zeeke/OCPBUGS-36683-416
Browse files Browse the repository at this point in the history
OCPBUGS-38010: [release-4.16]: Skip firmware reset on devices the operator doesn't control
  • Loading branch information
openshift-merge-bot[bot] authored Aug 12, 2024
2 parents b04c39c + d2133d8 commit 48d1049
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 12 deletions.
14 changes: 14 additions & 0 deletions pkg/helper/mock/mock_helper.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pkg/host/internal/sriov/sriov.go
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,13 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt,
log.Log.V(2).Info("checkForConfigAndReset(): PF name with pci address was externally created skipping the device reset",
"pf-name", ifaceStatus.Name,
"address", ifaceStatus.PciAddress)

// remove pf status from host
err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress)
if err != nil {
return err
}

return nil
}
err = s.removeUdevRules(ifaceStatus.PciAddress)
Expand All @@ -809,6 +816,12 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt,
return err
}

// remove pf status from host
err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress)
if err != nil {
return err
}

return nil
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/host/internal/sriov/sriov_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ var _ = Describe("SRIOV", func() {
PciAddress: "0000:d8:00.0",
NumVfs: 2,
}, true, nil)
storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil)
netlinkLibMock.EXPECT().DevLinkGetDeviceByName("pci", "0000:d8:00.0").Return(
&netlink.DevlinkDevice{Attrs: netlink.DevlinkDevAttrs{Eswitch: netlink.DevlinkDevEswitchAttr{Mode: "legacy"}}},
nil)
Expand Down Expand Up @@ -374,6 +375,7 @@ var _ = Describe("SRIOV", func() {
NumVfs: 2,
ExternallyManaged: true,
}, true, nil)
storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil)
Expect(s.ConfigSriovInterfaces(storeManagerMode,
[]sriovnetworkv1.Interface{},
[]sriovnetworkv1.InterfaceExt{
Expand Down
14 changes: 14 additions & 0 deletions pkg/host/store/mock/mock_store.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions pkg/host/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
type ManagerInterface interface {
ClearPCIAddressFolder() error
SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) error
RemovePfAppliedStatus(pciAddress string) error
LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error)

GetCheckPointNodeState() (*sriovnetworkv1.SriovNetworkNodeState, error)
Expand Down Expand Up @@ -111,6 +112,17 @@ func (s *manager) SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) erro
return err
}

func (s *manager) RemovePfAppliedStatus(pciAddress string) error {
hostExtension := utils.GetHostExtension()
pathFile := filepath.Join(hostExtension, consts.PfAppliedConfig, pciAddress)
err := os.RemoveAll(pathFile)
if err != nil {
log.Log.Error(err, "failed to remove PF status", "pathFile", pathFile)
return err
}
return nil
}

// LoadPfsStatus convert the /etc/sriov-operator/pci/<pci-address> json to pfstatus
// returns false if the file doesn't exist.
func (s *manager) LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) {
Expand Down
55 changes: 49 additions & 6 deletions pkg/plugins/mellanox/mellanox_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox"
Expand Down Expand Up @@ -143,8 +144,25 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS
processedNics[pciPrefix] = true
pciAddress := pciPrefix + "0"

// Skip devices not configured by the operator
isConfigured, err := p.nicConfiguredByOperator(portsMap)
if err != nil {
return false, false, err
}
if !isConfigured {
log.Log.V(2).Info("None of the ports are configured by the operator skipping firmware reset",
"portMap", portsMap)
continue
}

// Skip externally managed NICs
if p.nicHasExternallyManagedPFs(portsMap) {
hasExternally, err := p.nicHasExternallyManagedPFs(portsMap)
if err != nil {
return false, false, err
}
if hasExternally {
log.Log.V(2).Info("One of the ports is configured as externally managed skipping firmware reset",
"portMap", portsMap)
continue
}

Expand Down Expand Up @@ -189,20 +207,45 @@ func (p *MellanoxPlugin) Apply() error {

// nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed
// in StoreManagerInterface.
func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool {
func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) (bool, error) {
for _, iface := range nicPortsMap {
pfStatus, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress)
if err != nil {
log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress)
continue
// nolint:goconst
log.Log.Error(err, "failed to load PF status from disk. "+
"This should not happen, to overcome config daemon stuck, "+
"please remove the PCI file on the host under the operator configuration path",
"path", consts.PfAppliedConfig, "pciAddress", iface.PciAddress)
return false, err
}
if !exist {
continue
}
if pfStatus.ExternallyManaged {
log.Log.V(2).Info("PF is extenally managed, skip FW TotalVfs reset")
return true
return true, nil
}
}
return false
return false, nil
}

// nicConfiguredByOperator returns true if one of the ports(interface) of the NIC is configured by operator
func (p *MellanoxPlugin) nicConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) (bool, error) {
for _, iface := range nicPortsMap {
_, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress)
if err != nil {
// nolint:goconst
log.Log.Error(err, "failed to load PF status from disk. "+
"This should not happen, to overcome config daemon stuck, "+
"please remove the PCI file on the host under the operator configuration path",
"path", consts.PfAppliedConfig, "pciAddress", iface.PciAddress)
return false, err
}
if exist {
log.Log.V(2).Info("PF configured by the operator", "interface", iface)
return true, nil
}
}

return false, nil
}
50 changes: 44 additions & 6 deletions test/conformance/tests/test_sriov_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1199,7 +1199,7 @@ var _ = Describe("[sriov] operator", func() {
It("Should be possible to create a vfio-pci resource and allocate to a pod", func() {
By("creating a vfio-pci node policy")
resourceName := "testvfio"
_, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci")
vfioPolicy, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci")
Expect(err).ToNot(HaveOccurred())

By("waiting for the node state to be updated")
Expand All @@ -1226,6 +1226,11 @@ var _ = Describe("[sriov] operator", func() {
return allocatable
}, 10*time.Minute, time.Second).Should(Equal(int64(5)))

By("validate the pf info exist on host")
output, _, err := runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("1"))

By("Creating sriov network to use the vfio device")
sriovNetwork := &sriovv1.SriovNetwork{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -1255,6 +1260,25 @@ var _ = Describe("[sriov] operator", func() {
networkStatusJSON, exist := firstPod.Annotations["k8s.v1.cni.cncf.io/network-status"]
Expect(exist).To(BeTrue())
Expect(networkStatusJSON).To(ContainSubstring(fmt.Sprintf("\"mtu\": %d", vfioNic.Mtu)))

By("deleting the policy")
err = clients.Delete(context.Background(), vfioPolicy, &runtimeclient.DeleteOptions{})
Expect(err).ToNot(HaveOccurred())
WaitForSRIOVStable()

Eventually(func() int64 {
testedNode, err := clients.CoreV1Interface.Nodes().Get(context.Background(), vfioNode, metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())
resNum := testedNode.Status.Allocatable[corev1.ResourceName("openshift.io/"+resourceName)]
allocatable, _ := resNum.AsInt64()
return allocatable
}, 2*time.Minute, time.Second).Should(Equal(int64(0)))

By("validate the pf info doesn't exist on the host anymore")
output, _, err = runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("0"))

})
})

Expand Down Expand Up @@ -2173,15 +2197,19 @@ var _ = Describe("[sriov] operator", func() {
Context("ExternallyManaged Validation", func() {
numVfs := 5
var node string
var nic *sriovv1.InterfaceExt
var nic sriovv1.InterfaceExt
externallyManage := func(policy *sriovv1.SriovNetworkNodePolicy) {
policy.Spec.ExternallyManaged = true
}

execute.BeforeAll(func() {
var err error
node, nic, err = sriovInfos.FindOneSriovNodeAndDevice()
Expect(err).ToNot(HaveOccurred())
node, nic = sriovInfos.FindOneVfioSriovDevice()
})

BeforeEach(func() {
if node == "" {
Skip("not suitable device found for the test")
}

By("Using device " + nic.Name + " on node " + node)
})
Expand Down Expand Up @@ -2242,6 +2270,11 @@ var _ = Describe("[sriov] operator", func() {
return allocatable
}, 3*time.Minute, time.Second).Should(Equal(int64(numVfs)))

By("validate the pf info exist on host")
output, _, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("1"))

By("deleting the policy")
err = clients.Delete(context.Background(), sriovPolicy, &runtimeclient.DeleteOptions{})
Expect(err).ToNot(HaveOccurred())
Expand All @@ -2256,11 +2289,16 @@ var _ = Describe("[sriov] operator", func() {
}, 2*time.Minute, time.Second).Should(Equal(int64(0)))

By("checking the virtual functions are still on the host")
output, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
output, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
Expect(err).ToNot(HaveOccurred())
Expect(errOutput).To(Equal(""))
Expect(output).To(ContainSubstring("5"))

By("validate the pf info doesn't exist on the host anymore")
output, _, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("0"))

By("cleaning the manual sriov created")
_, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo 0 > /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
Expect(err).ToNot(HaveOccurred())
Expand Down

0 comments on commit 48d1049

Please sign in to comment.