From dc5c9d20d89f49a303817b324d6658cb0ba98ce8 Mon Sep 17 00:00:00 2001 From: adrianc Date: Wed, 27 Dec 2023 14:26:06 +0200 Subject: [PATCH] Do not reset Firmware config in Mellanox plugin When interface(pf) is externally managed, we need to avoid resetting firmware configurations. - use store manager to get latest pf config from host so we can reliably determine if interface is externally managed if it does not appear in spec - when interface appears in spec, block both sriov and link type firmware configurations if interface is externally managed - skip pfs that are externally managed Signed-off-by: adrianc --- pkg/daemon/plugin.go | 4 +- pkg/plugins/mellanox/mellanox_plugin.go | 76 +++++++++++++++++-------- 2 files changed, 55 insertions(+), 25 deletions(-) diff --git a/pkg/daemon/plugin.go b/pkg/daemon/plugin.go index 38e1d9d73..8cb344347 100644 --- a/pkg/daemon/plugin.go +++ b/pkg/daemon/plugin.go @@ -82,7 +82,9 @@ func registerVendorPlugins(ns *sriovnetworkv1.SriovNetworkNodeState, helpers hel log.Log.Error(err, "registerVendorPlugins(): failed to load plugin", "plugin-name", plug.Name()) return vendorPlugins, fmt.Errorf("registerVendorPlugins(): failed to load the %s plugin error: %v", plug.Name(), err) } - vendorPlugins[plug.Name()] = plug + if _, ok := vendorPlugins[plug.Name()]; !ok { + vendorPlugins[plug.Name()] = plug + } } } diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index d36996ca6..a95d24b9d 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -52,22 +52,21 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS needReboot = false err = nil attributesToChange = map[string]mlx.MlxNic{} + mellanoxNicsStatus = map[string]map[string]sriovnetworkv1.InterfaceExt{} mellanoxNicsSpec = map[string]sriovnetworkv1.Interface{} processedNics := map[string]bool{} - // Read mellanox NIC status once - if len(mellanoxNicsStatus) == 0 { - for _, iface := range new.Status.Interfaces { - if iface.Vendor != mlx.MellanoxVendorID { - continue - } + // fill mellanoxNicsStatus + for _, iface := range new.Status.Interfaces { + if iface.Vendor != mlx.MellanoxVendorID { + continue + } - pciPrefix := mlx.GetPciAddressPrefix(iface.PciAddress) - if ifaces, ok := mellanoxNicsStatus[pciPrefix]; ok { - ifaces[iface.PciAddress] = iface - } else { - mellanoxNicsStatus[pciPrefix] = map[string]sriovnetworkv1.InterfaceExt{iface.PciAddress: iface} - } + pciPrefix := mlx.GetPciAddressPrefix(iface.PciAddress) + if ifaces, ok := mellanoxNicsStatus[pciPrefix]; ok { + ifaces[iface.PciAddress] = iface + } else { + mellanoxNicsStatus[pciPrefix] = map[string]sriovnetworkv1.InterfaceExt{iface.PciAddress: iface} } } @@ -106,25 +105,29 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS attrs := &mlx.MlxNic{TotalVfs: -1} var changeWithoutReboot bool - var totalVfs int - totalVfs, needReboot, changeWithoutReboot = mlx.HandleTotalVfs(fwCurrent, fwNext, attrs, ifaceSpec, isDualPort, mellanoxNicsSpec) + totalVfs, totalVfsNeedReboot, totalVfsChangeWithoutReboot := mlx.HandleTotalVfs(fwCurrent, fwNext, attrs, ifaceSpec, isDualPort, mellanoxNicsSpec) sriovEnNeedReboot, sriovEnChangeWithoutReboot := mlx.HandleEnableSriov(totalVfs, fwCurrent, fwNext, attrs) - needReboot = needReboot || sriovEnNeedReboot - changeWithoutReboot = changeWithoutReboot || sriovEnChangeWithoutReboot - - // failing as we can't the fwTotalVf is lower than the request one on a nic with externallyManage configured - if ifaceSpec.ExternallyManaged && needReboot { - return true, true, fmt.Errorf( - "interface %s required a change in the TotalVfs but the policy is externally managed failing: firmware TotalVf %d requested TotalVf %d", - ifaceSpec.PciAddress, fwCurrent.TotalVfs, totalVfs) - } + needReboot = totalVfsNeedReboot || sriovEnNeedReboot + changeWithoutReboot = totalVfsChangeWithoutReboot || sriovEnChangeWithoutReboot needLinkChange, err := mlx.HandleLinkType(pciPrefix, fwCurrent, attrs, mellanoxNicsSpec, mellanoxNicsStatus) if err != nil { return false, false, err } - needReboot = needReboot || needLinkChange + + // no FW changes allowed when NIC is externally managed + if ifaceSpec.ExternallyManaged { + if totalVfsNeedReboot || totalVfsChangeWithoutReboot { + return false, false, fmt.Errorf( + "interface %s required a change in the TotalVfs but the policy is externally managed failing: firmware TotalVf %d requested TotalVf %d", + ifaceSpec.PciAddress, fwCurrent.TotalVfs, totalVfs) + } + if needLinkChange { + return false, false, fmt.Errorf("change required for link type but the policy is externally managed, failing") + } + } + if needReboot || changeWithoutReboot { attributesToChange[ifaceSpec.PciAddress] = *attrs } @@ -140,6 +143,11 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS processedNics[pciPrefix] = true pciAddress := pciPrefix + "0" + // Skip externally managed NICs + if p.nicHasExternallyManagedPFs(portsMap) { + continue + } + // Skip unsupported devices if id := sriovnetworkv1.GetVfDeviceID(portsMap[pciAddress].DeviceID); id == "" { continue @@ -172,3 +180,23 @@ func (p *MellanoxPlugin) Apply() error { log.Log.Info("mellanox-plugin Apply()") return p.helpers.MlxConfigFW(attributesToChange) } + +// nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed +// in StoreManagerInterface. +func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool { + for _, iface := range nicPortsMap { + pfStatus, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress) + if err != nil { + log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress) + continue + } + if !exist { + continue + } + if pfStatus.ExternallyManaged { + log.Log.V(2).Info("PF is extenally managed, skip FW TotalVfs reset") + return true + } + } + return false +}