From 35ce63ec0a5b8f3a54b860198911130ec48d10a7 Mon Sep 17 00:00:00 2001 From: Andrea Panattoni Date: Thu, 12 Sep 2024 12:20:45 +0200 Subject: [PATCH] metrics: Add `node` label to `sriov_*` metrics It might happen that two SR-IOV pods, deployed on different node, are using devices with the same PCI address. In such cases, the query suggested [1] by the sriov-network-metrics-exporter produces the error: ``` Error loading values found duplicate series for the match group {pciAddr="0000:3b:02.4"} on the right hand-side of the operation: [ { __name__="sriov_kubepoddevice", container="test", dev_type="openshift.io/intelnetdevice", endpoint="sriov-network-metrics", instance="10.1.98.60:9110", job="sriov-network-metrics-exporter-service", namespace="cnf-4916", pciAddr="0000:3b:02.4", pod="pod-cnfdr22.telco5g.eng.rdu2.redhat.com", prometheus="openshift-monitoring/k8s", service="sriov-network-metrics-exporter-service" }, { __name__="sriov_kubepoddevice", container="test", dev_type="openshift.io/intelnetdevice", endpoint="sriov-network-metrics", instance="10.1.98.230:9110", job="sriov-network-metrics-exporter-service", namespace="cnf-4916", pciAddr="0000:3b:02.4", pod="pod-dhcp-98-230.telco5g.eng.rdu2.redhat.com", prometheus="openshift-monitoring/k8s", service="sriov-network-metrics-exporter-service" } ];many-to-many matching not allowed: matching labels must be unique on one side ``` Configure the ServiceMonitor resource to add a `node` label to all metrics. The right query to get metrics, as updated in the PrometheusRule, will be: ``` sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice ``` Also remove `pod`, `namespace` and `container` label from the `sriov_vf_*` metrics, as they were wrongly set to `sriov-network-metrics-exporter-zj2n9`, `openshift-sriov-network-operator`, `kube-rbac-proxy` [1] https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/blob/0f6a784f377ede87b95f31e569116ceb9775b5b9/README.md?plain=1#L38 Signed-off-by: Andrea Panattoni --- .../metrics-prometheus-rule.yaml | 16 ++-- .../metrics-exporter/metrics-prometheus.yaml | 11 +++ .../tests/test_exporter_metrics.go | 95 ++++++++++++++----- test/util/k8sreporter/reporter.go | 20 ++++ 4 files changed, 111 insertions(+), 31 deletions(-) diff --git a/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml b/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml index efd760113..a385fa677 100644 --- a/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml +++ b/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml @@ -11,28 +11,28 @@ spec: interval: 30s rules: - expr: | - sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice record: network:sriov_vf_tx_packets - expr: | - sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + sriov_vf_rx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice record: network:sriov_vf_rx_packets - expr: | - sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + sriov_vf_tx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice record: network:sriov_vf_tx_bytes - expr: | - sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + sriov_vf_rx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice record: network:sriov_vf_rx_bytes - expr: | - sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + sriov_vf_tx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice record: network:sriov_vf_tx_dropped - expr: | - sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + sriov_vf_rx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice record: network:sriov_vf_rx_dropped - expr: | - sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + sriov_vf_rx_broadcast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice record: network:sriov_vf_rx_broadcast - expr: | - sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + sriov_vf_rx_multicast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice record: network:sriov_vf_rx_multicast {{ end }} diff --git a/bindata/manifests/metrics-exporter/metrics-prometheus.yaml b/bindata/manifests/metrics-exporter/metrics-prometheus.yaml index 45ae7adbf..d1772a554 100644 --- a/bindata/manifests/metrics-exporter/metrics-prometheus.yaml +++ b/bindata/manifests/metrics-exporter/metrics-prometheus.yaml @@ -12,6 +12,17 @@ spec: bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token" scheme: "https" honorLabels: true + relabelings: + - action: replace + sourceLabels: + - __meta_kubernetes_endpoint_node_name + targetLabel: node + - action: labeldrop + regex: pod + - action: labeldrop + regex: container + - action: labeldrop + regex: namespace tlsConfig: serverName: sriov-network-metrics-exporter-service.{{.Namespace}}.svc caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt diff --git a/test/conformance/tests/test_exporter_metrics.go b/test/conformance/tests/test_exporter_metrics.go index 804432f04..f02317fec 100644 --- a/test/conformance/tests/test_exporter_metrics.go +++ b/test/conformance/tests/test_exporter_metrics.go @@ -19,21 +19,18 @@ import ( "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) -var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { +var _ = Describe("[sriov] Metrics Exporter", Ordered, ContinueOnFailure, func() { var node string var nic *sriovv1.InterfaceExt BeforeAll(func() { - if cluster.VirtualCluster() { - Skip("IGB driver does not support VF statistics") - } - err := namespaces.Create(namespaces.Test, clients) Expect(err).ToNot(HaveOccurred()) @@ -73,6 +70,9 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { }) It("collects metrics regarding receiving traffic via VF", func() { + if cluster.VirtualCluster() { + Skip("IGB driver does not support VF statistics") + } pod := createTestPod(node, []string{"test-me-network"}) DeferCleanup(namespaces.CleanPods, namespaces.Test, clients) @@ -98,27 +98,76 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets)) }) - It("PrometheusRule should provide namespaced metrics", func() { - pod := createTestPod(node, []string{"test-me-network"}) - DeferCleanup(namespaces.CleanPods, namespaces.Test, clients) + Context("When Prometheus operator is available", func() { + BeforeEach(func() { + _, err := clients.ServiceMonitors(operatorNamespace).List(context.Background(), metav1.ListOptions{}) + if k8serrors.IsNotFound(err) { + Skip("Prometheus operator not available in the cluster") + } + }) - namespacedMetricNames := []string{ - "network:sriov_vf_rx_bytes", - "network:sriov_vf_tx_bytes", - "network:sriov_vf_rx_packets", - "network:sriov_vf_tx_packets", - "network:sriov_vf_rx_dropped", - "network:sriov_vf_tx_dropped", - "network:sriov_vf_rx_broadcast", - "network:sriov_vf_rx_multicast", - } + It("PrometheusRule should provide namespaced metrics", func() { + pod := createTestPod(node, []string{"test-me-network"}) + DeferCleanup(namespaces.CleanPods, namespaces.Test, clients) + + namespacedMetricNames := []string{ + "network:sriov_vf_rx_bytes", + "network:sriov_vf_tx_bytes", + "network:sriov_vf_rx_packets", + "network:sriov_vf_tx_packets", + "network:sriov_vf_rx_dropped", + "network:sriov_vf_tx_dropped", + "network:sriov_vf_rx_broadcast", + "network:sriov_vf_rx_multicast", + } - Eventually(func(g Gomega) { - for _, metricName := range namespacedMetricNames { - values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name)) - g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName) + Eventually(func(g Gomega) { + for _, metricName := range namespacedMetricNames { + values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name)) + g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName) + } + }, "90s", "1s").Should(Succeed()) + }) + + It("Metrics should have the correct labels", func() { + pod := createTestPod(node, []string{"test-me-network"}) + DeferCleanup(namespaces.CleanPods, namespaces.Test, clients) + + metricsName := []string{ + "sriov_vf_rx_bytes", + "sriov_vf_tx_bytes", + "sriov_vf_rx_packets", + "sriov_vf_tx_packets", + "sriov_vf_rx_dropped", + "sriov_vf_tx_dropped", + "sriov_vf_rx_broadcast", + "sriov_vf_rx_multicast", } - }, "40s", "1s").Should(Succeed()) + + Eventually(func(g Gomega) { + for _, metricName := range metricsName { + samples := runPromQLQuery(metricName) + g.Expect(samples).ToNot(BeEmpty(), "no value for metric %s", metricName) + g.Expect(samples[0].Metric).To(And( + HaveKey(model.LabelName("pciAddress")), + HaveKey(model.LabelName("node")), + HaveKey(model.LabelName("pf")), + HaveKey(model.LabelName("vf")), + )) + } + }, "90s", "1s").Should(Succeed()) + + // sriov_kubepoddevice has a different sets of label than statistics metrics + samples := runPromQLQuery(fmt.Sprintf(`sriov_kubepoddevice{namespace="%s",pod="%s"}`, pod.Namespace, pod.Name)) + Expect(samples).ToNot(BeEmpty(), "no value for metric sriov_kubepoddevice") + Expect(samples[0].Metric).To(And( + HaveKey(model.LabelName("pciAddr")), + HaveKeyWithValue(model.LabelName("node"), model.LabelValue(pod.Spec.NodeName)), + HaveKeyWithValue(model.LabelName("dev_type"), model.LabelValue("openshift.io/test-me-network")), + HaveKeyWithValue(model.LabelName("namespace"), model.LabelValue(pod.Namespace)), + HaveKeyWithValue(model.LabelName("pod"), model.LabelValue(pod.Name)), + )) + }) }) }) diff --git a/test/util/k8sreporter/reporter.go b/test/util/k8sreporter/reporter.go index 5a3405a91..13baac0aa 100644 --- a/test/util/k8sreporter/reporter.go +++ b/test/util/k8sreporter/reporter.go @@ -10,6 +10,9 @@ import ( sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + rbacv1 "k8s.io/api/rbac/v1" ) func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) { @@ -18,6 +21,17 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) { if err != nil { return err } + + err = monitoringv1.AddToScheme(s) + if err != nil { + return err + } + + err = rbacv1.AddToScheme(s) + if err != nil { + return err + } + return nil } @@ -38,6 +52,8 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) { return true case multusNamespace != "" && ns == multusNamespace: return true + case ns == "openshift-monitoring": + return true } return false } @@ -47,6 +63,10 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) { {Cr: &sriovv1.SriovNetworkNodePolicyList{}}, {Cr: &sriovv1.SriovNetworkList{}}, {Cr: &sriovv1.SriovOperatorConfigList{}}, + {Cr: &monitoringv1.ServiceMonitorList{}, Namespace: &operatorNamespace}, + {Cr: &monitoringv1.PrometheusRuleList{}, Namespace: &operatorNamespace}, + {Cr: &rbacv1.RoleList{}, Namespace: &operatorNamespace}, + {Cr: &rbacv1.RoleBindingList{}, Namespace: &operatorNamespace}, } err := os.Mkdir(reportPath, 0755)