Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip firmware reset on devices the operator doesn't control #734

Merged
merged 4 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions pkg/helper/mock/mock_helper.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pkg/host/internal/sriov/sriov.go
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,13 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt,
log.Log.V(2).Info("checkForConfigAndReset(): PF name with pci address was externally created skipping the device reset",
"pf-name", ifaceStatus.Name,
"address", ifaceStatus.PciAddress)

// remove pf status from host
err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress)
if err != nil {
return err
}

return nil
}
err = s.removeUdevRules(ifaceStatus.PciAddress)
Expand All @@ -828,6 +835,12 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt,
return err
}

// remove pf status from host
err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress)
if err != nil {
return err
}

return nil
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/host/internal/sriov/sriov_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@ var _ = Describe("SRIOV", func() {
PciAddress: "0000:d8:00.0",
NumVfs: 2,
}, true, nil)
storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil)
netlinkLibMock.EXPECT().DevLinkGetDeviceByName("pci", "0000:d8:00.0").Return(
&netlink.DevlinkDevice{Attrs: netlink.DevlinkDevAttrs{Eswitch: netlink.DevlinkDevEswitchAttr{Mode: "legacy"}}},
nil)
Expand Down Expand Up @@ -479,6 +480,7 @@ var _ = Describe("SRIOV", func() {
NumVfs: 2,
ExternallyManaged: true,
}, true, nil)
storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil)
Expect(s.ConfigSriovInterfaces(storeManagerMode,
[]sriovnetworkv1.Interface{},
[]sriovnetworkv1.InterfaceExt{
Expand Down
14 changes: 14 additions & 0 deletions pkg/host/store/mock/mock_store.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions pkg/host/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
type ManagerInterface interface {
ClearPCIAddressFolder() error
SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) error
RemovePfAppliedStatus(pciAddress string) error
LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error)

GetCheckPointNodeState() (*sriovnetworkv1.SriovNetworkNodeState, error)
Expand Down Expand Up @@ -111,6 +112,17 @@ func (s *manager) SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) erro
return err
}

func (s *manager) RemovePfAppliedStatus(pciAddress string) error {
hostExtension := utils.GetHostExtension()
pathFile := filepath.Join(hostExtension, consts.PfAppliedConfig, pciAddress)
err := os.RemoveAll(pathFile)
if err != nil {
log.Log.Error(err, "failed to remove PF status", "pathFile", pathFile)
return err
}
return nil
}

// LoadPfsStatus convert the /etc/sriov-operator/pci/<pci-address> json to pfstatus
// returns false if the file doesn't exist.
func (s *manager) LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) {
Expand Down
21 changes: 21 additions & 0 deletions pkg/plugins/mellanox/mellanox_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS
processedNics[pciPrefix] = true
pciAddress := pciPrefix + "0"

// Skip devices not configured by the operator
if p.nicNotConfiguredByOperator(portsMap) {
continue
}

// Skip externally managed NICs
if p.nicHasExternallyManagedPFs(portsMap) {
continue
Expand Down Expand Up @@ -206,3 +211,19 @@ func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriov
}
return false
}

// nicNotConfiguredByOperator returns true if one of the ports(interface) of the NIC is not configured by operator
func (p *MellanoxPlugin) nicNotConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool {
for _, iface := range nicPortsMap {
_, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress)
if err != nil {
log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress)
continue
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we fail here ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fair enough :)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps we shoud consider failing in both places, LMK what you think i can submit a followup PR.

LoadPfStatus will fail if file exists but it failed reading from it or its corrupt and we werent able to unmarshall

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds good I agree that should failed in both functions :)

when you have time please open the PR I can review it

}
if exist {
log.Log.V(2).Info("PF configured by the operator", "interface", iface)
return true
}
}
return false
}
50 changes: 44 additions & 6 deletions test/conformance/tests/test_sriov_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1222,7 +1222,7 @@ var _ = Describe("[sriov] operator", func() {
It("Should be possible to create a vfio-pci resource and allocate to a pod", func() {
By("creating a vfio-pci node policy")
resourceName := "testvfio"
_, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci")
vfioPolicy, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci")
Expect(err).ToNot(HaveOccurred())

By("waiting for the node state to be updated")
Expand All @@ -1249,6 +1249,11 @@ var _ = Describe("[sriov] operator", func() {
return allocatable
}, 10*time.Minute, time.Second).Should(Equal(int64(5)))

By("validate the pf info exist on host")
output, _, err := runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("1"))

By("Creating sriov network to use the vfio device")
sriovNetwork := &sriovv1.SriovNetwork{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -1278,6 +1283,25 @@ var _ = Describe("[sriov] operator", func() {
networkStatusJSON, exist := firstPod.Annotations["k8s.v1.cni.cncf.io/network-status"]
Expect(exist).To(BeTrue())
Expect(networkStatusJSON).To(ContainSubstring(fmt.Sprintf("\"mtu\": %d", vfioNic.Mtu)))

By("deleting the policy")
err = clients.Delete(context.Background(), vfioPolicy, &runtimeclient.DeleteOptions{})
Expect(err).ToNot(HaveOccurred())
WaitForSRIOVStable()

Eventually(func() int64 {
testedNode, err := clients.CoreV1Interface.Nodes().Get(context.Background(), vfioNode, metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())
resNum := testedNode.Status.Allocatable[corev1.ResourceName("openshift.io/"+resourceName)]
allocatable, _ := resNum.AsInt64()
return allocatable
}, 2*time.Minute, time.Second).Should(Equal(int64(0)))

By("validate the pf info doesn't exist on the host anymore")
output, _, err = runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("0"))

})
})

Expand Down Expand Up @@ -2196,15 +2220,19 @@ var _ = Describe("[sriov] operator", func() {
Context("ExternallyManaged Validation", func() {
numVfs := 5
var node string
var nic *sriovv1.InterfaceExt
var nic sriovv1.InterfaceExt
externallyManage := func(policy *sriovv1.SriovNetworkNodePolicy) {
policy.Spec.ExternallyManaged = true
}

execute.BeforeAll(func() {
var err error
node, nic, err = sriovInfos.FindOneSriovNodeAndDevice()
Expect(err).ToNot(HaveOccurred())
node, nic = sriovInfos.FindOneVfioSriovDevice()
})

BeforeEach(func() {
if node == "" {
Skip("not suitable device found for the test")
}

By("Using device " + nic.Name + " on node " + node)
})
Expand Down Expand Up @@ -2265,6 +2293,11 @@ var _ = Describe("[sriov] operator", func() {
return allocatable
}, 3*time.Minute, time.Second).Should(Equal(int64(numVfs)))

By("validate the pf info exist on host")
output, _, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("1"))

By("deleting the policy")
err = clients.Delete(context.Background(), sriovPolicy, &runtimeclient.DeleteOptions{})
Expect(err).ToNot(HaveOccurred())
Expand All @@ -2279,11 +2312,16 @@ var _ = Describe("[sriov] operator", func() {
}, 2*time.Minute, time.Second).Should(Equal(int64(0)))

By("checking the virtual functions are still on the host")
output, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
output, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
Expect(err).ToNot(HaveOccurred())
Expect(errOutput).To(Equal(""))
Expect(output).To(ContainSubstring("5"))

By("validate the pf info doesn't exist on the host anymore")
output, _, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("0"))

By("cleaning the manual sriov created")
_, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo 0 > /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
Expect(err).ToNot(HaveOccurred())
Expand Down
Loading