diff --git a/snafu/fio_wrapper/trigger_fio.py b/snafu/fio_wrapper/trigger_fio.py index 64274ee3..f153577e 100644 --- a/snafu/fio_wrapper/trigger_fio.py +++ b/snafu/fio_wrapper/trigger_fio.py @@ -261,7 +261,11 @@ def emit_actions(self): fio_output_file = os.path.join(job_dir, "fio-result.json") fio_job_file = os.path.join(job_dir, "fiojob") self._build_fio_job(job, job_dir, fio_job_file) + + # capture sample start time as epoch seconds (timezone independent), used for prom data collection + sample_starttime = str(int(datetime.now().timestamp())) stdout, stderr, rc = self._run_fiod(fio_job_file, job_dir, fio_output_file) + if rc != 0: logger.error("Fio failed to execute") with open(fio_output_file, "r") as output: @@ -276,6 +280,8 @@ def emit_actions(self): "are in the dir {}\n".format( self.sample, job, job_dir)) + # capture sample end time as epoch seconds (timezone independent), used for prom data collection + sample_endtime = str(int(datetime.now().timestamp())) with open(fio_output_file) as f: data = json.load(f) fio_endtime = int(data['timestamp']) # in epoch seconds @@ -328,7 +334,18 @@ def emit_actions(self): self._process_histogram(job, job_dir, processed_histogram_prefix, histogram_output_file) histogram_documents = self._histogram_payload(histogram_output_file, earliest_starttime, job) # if indexing is turned on yield back normalized data - index = "hist-log" for document in histogram_documents: yield document, index + # trigger collection of prom data + sample_info_dict = {"uuid": self.uuid, + "user": self.user, + "cluster_name": self.cluster_name, + "starttime": sample_starttime, + "endtime": sample_endtime, + "sample": self.sample, + "tool": "fio", + "test_config": self.fio_jobs_dict + } + + yield sample_info_dict, "get_prometheus_trigger" diff --git a/snafu/run_snafu.py b/snafu/run_snafu.py index 2fdab46e..a5f3e3e5 100755 --- a/snafu/run_snafu.py +++ b/snafu/run_snafu.py @@ -87,8 +87,9 @@ def main(): ssl_context=ssl_ctx, use_ssl=True) else: es = elasticsearch.Elasticsearch([_es_connection_string], send_get_body_as='POST') - 
logger.info("Connected to the elasticsearch cluster with info as follows:{0}".format( - str(es.info()))) + logger.info("Connected to the elasticsearch cluster with info as follows:") + logger.info(json.dumps(es.info(), indent=4)) + except Exception as e: logger.warn("Elasticsearch connection caused an exception : %s" % e) index_args.index_results = False @@ -130,7 +131,6 @@ def main(): total_capacity_bytes = index_args.document_size_capacity_bytes logger.info("Duration of execution - %s, with total size of %s bytes" % (tdelta, total_capacity_bytes)) - def process_generator(index_args, parser): benchmark_wrapper_object_generator = generate_wrapper_object(index_args, parser) @@ -146,11 +146,13 @@ def process_generator(index_args, parser): "uuid": "user": "clustername": + "sample": "starttime": datetime.utcnow().strftime('%s') "endtime": test_config: {...} } """ + index_prom_data(index_args, action) else: es_valid_document = get_valid_es_document(action, @@ -158,7 +160,6 @@ def process_generator(index_args, parser): index_args) yield es_valid_document - def generate_wrapper_object(index_args, parser): benchmark_wrapper_object = wrapper_factory(index_args.tool, parser) @@ -181,35 +182,69 @@ def get_valid_es_document(action, index, index_args): return es_valid_document -def index_prom_data(prometheus_doc, index_args, action): +def index_prom_data(index_args, action): # definition of prometheus data getter, will yield back prom doc def get_prometheus_generator(index_args, action): prometheus_doc_generator = get_prometheus_data(action) for prometheus_doc in prometheus_doc_generator.get_all_metrics(): + es_valid_document = get_valid_es_document(prometheus_doc, "prometheus_data", index_args) + yield es_valid_document + es = {} if "prom_es" in os.environ: - es = {} if os.environ["prom_es"] != "": es['server'] = os.environ["prom_es"] logger.info("Using Prometheus elasticsearch server with host: %s" % es['server']) if os.environ["prom_port"] != "": es['port'] = 
os.environ["prom_port"] logger.info("Using Prometheus elasticsearch server with port: %s" % es['port']) + es_verify_cert = os.getenv("es_verify_cert", "true") + if len(es.keys()) == 2: + if os.environ["es_index"] != "": + index_args.prefix = os.environ["es_index"] + logger.info("Using index prefix for ES:" + index_args.prefix) + index_args.index_results = True + try: + _es_connection_string = str(es['server']) + ':' + str(es['port']) + if es_verify_cert == "false": + logger.info("Turning off TLS certificate verification") + import urllib3 + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + ssl_ctx = ssl.create_default_context() + ssl_ctx.check_hostname = False + ssl_ctx.verify_mode = ssl.CERT_NONE + es = elasticsearch.Elasticsearch([_es_connection_string], send_get_body_as='POST', + ssl_context=ssl_ctx, use_ssl=True) + else: + es = elasticsearch.Elasticsearch([_es_connection_string], send_get_body_as='POST') + logger.info("Connected to the elasticsearch cluster with info as follows:") + logger.info(json.dumps(es.info(), indent=4)) - if index_args.index_results: + except Exception as e: + logger.warn("Elasticsearch connection caused an exception : %s" % e) + index_args.index_results = False + + # check that we want to index and that the prom_es exist. + if index_args.index_results and "prom_es" in os.environ: + logger.info("initializing prometheus indexing") parallel_setting = strtobool(os.environ.get('parallel', "false")) res_beg, res_end, res_suc, res_dup, res_fail, res_retry = streaming_bulk(es, get_prometheus_generator( - prometheus_doc, index_args, action), parallel_setting) + logger.info( + "Prometheus indexed results - %s success, %s duplicates, %s failures, with %s retries." 
% ( + res_suc, + res_dup, + res_fail, + res_retry)) start_t = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime(res_beg)) end_t = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime(res_end)) # set up a standard format for time diff --git a/snafu/utils/get_prometheus_data.py b/snafu/utils/get_prometheus_data.py index 198a209c..b30324b9 100644 --- a/snafu/utils/get_prometheus_data.py +++ b/snafu/utils/get_prometheus_data.py @@ -2,9 +2,8 @@ import json import logging import urllib3 -from datetime import datetime, timedelta +from datetime import datetime import time -import sys from prometheus_api_client import PrometheusConnect urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -13,23 +12,27 @@ class get_prometheus_data(): def __init__(self, action): + self.sample_info_dict = action self.uuid = action["uuid"] self.user = action["user"] self.cluster_name = action["cluster_name"] self.test_config = action["test_config"] # change datetime in seconds string to datetime object - starttime = datetime.fromtimestamp(int(action["starttime"])) - self.start = starttime.datetime() + starttime = datetime.fromtimestamp(int(self.sample_info_dict["starttime"])) + self.start = starttime # change datetime in seconds string to datetime object - endtime = datetime.fromtimestamp(int(action["endtime"])) - # add 120s buffer to end time - endtime = endtime + timedelta(seconds=120) - self.end = endtime.datetime + endtime = datetime.fromtimestamp(int(self.sample_info_dict["endtime"])) + self.end = endtime # step value to be used in prometheus query - self.T_Delta = 30 + # default is 30 seconds(openshift default scraping interval) + # but can be overridden with env + if "prom_step" in os.environ: + self.T_Delta = os.environ["prom_step"] + else: + self.T_Delta = 30 self.get_data = False if "prom_token" in os.environ and "prom_url" in os.environ: @@ -49,41 +52,49 @@ def get_all_metrics(self): if self.get_data: start_time = time.time() - filename = os.path.join(sys.path[0], 
'utils/prometheus_labels/included_labels.json') + # resolve the directory holding the tool include files (relative to this module) + dirname = os.path.dirname(__file__) + include_file_dir = os.path.join(dirname, 'prometheus_labels/') + tool_include_file = include_file_dir + self.sample_info_dict["tool"] + "_included_labels.json" + + # check if tools include file is there + # if not use the default include file + if os.path.isfile(tool_include_file): + filename = tool_include_file + else: + filename = os.path.join(include_file_dir, 'included_labels.json') + + # open tools include file and loop through all with open(filename, 'r') as f: datastore = json.load(f) - # for label in self.get_label_list(): - for label in datastore["data"]: + for metric_name in datastore["data"]: + + query_item = datastore["data"][metric_name] + query = query_item["query"] + label = query_item["label"] - # query_start_time = time.time() - query = "irate(%s[1m])" % label - """ - If there are additional queries need we should create a list or dict that can be iterated on - """ step = str(self.T_Delta) + "s" try: - # response = self.api_call(query) + # Execute custom query to pull the desired labels between X and Y time. 
response = self.pc.custom_query_range(query, self.start, self.end, step, None) + except Exception as e: + logger.info(query) logger.warn("failure to get metric results %s" % e) - results = response['result'] - - # results is a list of all hits - """ - TODO: update with proper parsing of response document - """ - - for result in results: + for result in response: # clean up name key from __name__ to name result["metric"]["name"] = "" - result["metric"]["name"] = result["metric"]["__name__"] - del result["metric"]["__name__"] + if "__name__" in result["metric"]: + result["metric"]["name"] = result["metric"]["__name__"] + del result["metric"]["__name__"] + else: + result["metric"]["name"] = label # each result has a list, we must flatten it out in order to send to ES for value in result["values"]: # fist index is time stamp @@ -94,18 +105,14 @@ def get_all_metrics(self): else: metric_value = float(value[1]) - flat_doc = {"uuid": self.uuid, - "user": self.user, - "cluster_name": self.cluster_name, - "metric": result["metric"], - "Date": timestamp, - "value": metric_value, - "test_config": self.test_config - } + flat_doc = { + "metric": result["metric"], + "Date": timestamp, + "value": metric_value, + "metric_name": metric_name + } + flat_doc.update(self.sample_info_dict) yield flat_doc - else: - pass - # logger.debug("Not exporting data for %s" % label) logger.debug("Total Time --- %s seconds ---" % (time.time() - start_time)) diff --git a/snafu/utils/prometheus_labels/fio_included_labels.json b/snafu/utils/prometheus_labels/fio_included_labels.json new file mode 100644 index 00000000..392c4018 --- /dev/null +++ b/snafu/utils/prometheus_labels/fio_included_labels.json @@ -0,0 +1,432 @@ +{ + "data": { + "Average_Disk_IOPS_Read": { + "label": "node_disk_reads_completed_total", + "query": "(irate(node_disk_reads_completed_total{device!~\"dm.*\",device!~\"rb.*\",device!~\"nbd.*\"}[1m]))" + }, + "Average_Disk_IOPS_Write": { + "label": "node_disk_writes_completed_total", + 
"query": "(irate(node_disk_writes_completed_total{device!~\"dm.*\",device!~\"rb.*\",device!~\"nbd.*\"}[1m]))" + }, + "Average_Disk_Throughput_Read": { + "label": "node_disk_read_bytes_total", + "query": "(irate(node_disk_read_bytes_total{device!~\"dm.*\",device!~\"rb.*\",device!~\"nbd.*\"}[1m]))" + }, + "Average_Disk_Throughput_Write": { + "label": "node_disk_written_bytes_total", + "query": "(irate(node_disk_written_bytes_total{device!~\"dm.*\",device!~\"rb.*\",device!~\"nbd.*\"}[1m]))" + }, + "Average_Network_Throughput_Rx": { + "label": "node_network_receive_bytes_total", + "query": "(irate(node_network_receive_bytes_total{device!~\"tun.*\",device!~\"vxlan.*\",device!~\"ovs.*\",device!~\"br.*\",device!~\"veth.*\"}[1m]) * 8)" + }, + "Average_Network_Throughput_Tx": { + "label": "node_network_transmit_bytes_total", + "query": "(irate(node_network_transmit_bytes_total{device!~\"tun.*\",device!~\"vxlan.*\",device!~\"ovs.*\",device!~\"br.*\",device!~\"veth.*\"}[1m]) * 8)" + }, + "Average_CPU_Usage_per_Instance": { + "label": "node_cpu_seconds_total", + "query": "(irate(node_cpu_seconds_total[1m]))" + }, + "Average_Memory_Usage_Active": { + "label": "node_memory_MemTotal_bytes", + "query": "avg(node_memory_Active_bytes) by (instance)" + }, + "Average_Memory_Usage_Cached_Buffers": { + "label": "node_memory_Cached_bytes,node_memory_Buffers_bytes", + "query": "avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance)" + }, + "Average_Memory_Usage_Available": { + "label": "node_memory_MemAvailable_bytes,node_memory_Cached_bytes,node_memory_Buffers_bytes", + "query": "avg(node_memory_MemAvailable_bytes) by (instance) - (avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance))" + }, + "Ceph_Per_OSD_Throughput_Read": { + "label": "ceph_osd_op_r_out_bytes", + "query": "irate(ceph_osd_op_r_out_bytes[1m]) " + }, + "Ceph_Per_OSD_Throughput_Write": { + "label": "ceph_osd_op_w_in_bytes", + "query": 
"irate(ceph_osd_op_w_in_bytes[1m]) " + }, + "Ceph_Per_OSD_Throughput_Read-Modify-Write_in": { + "label": "ceph_osd_op_rw_in_bytes", + "query": "irate(ceph_osd_op_rw_in_bytes[1m])" + }, + "Ceph_Per_OSD_Throughput_Read-Modify-Write_out": { + "label": "ceph_osd_op_rw_out_bytes", + "query": "irate(ceph_osd_op_rw_out_bytes[1m])" + }, + "Ceph_Avg_OSD_Op_Latency_read": { + "label": "ceph_osd_op_r_latency_sum,ceph_osd_op_r_latency_count", + "query": "avg(rate(ceph_osd_op_r_latency_sum[5m]) / rate(ceph_osd_op_r_latency_count[5m]) >= 0)" + }, + "Ceph_Avg_OSD_Op_Latency_write": { + "label": "ceph_osd_op_w_latency_sum,ceph_osd_op_w_latency_count", + "query": "avg(rate(ceph_osd_op_w_latency_sum[5m]) / rate(ceph_osd_op_w_latency_count[5m]) >= 0)" + }, + "Memory_Usage_RSS": { + "label": "container_memory_rss", + "query": "sum(container_memory_rss) by (service, node)" + }, + "Ceph-OSD_Containers_CPU_usage": { + "label": "container_cpu_usage_seconds_total", + "query": "(irate(container_cpu_usage_seconds_total[1m]))" + }, + "Number_of_PVCs": { + "label": "kube_persistentvolumeclaim_info", + "query": "avg(count(kube_persistentvolumeclaim_info))" + }, + "Load_1m": { + "label": "node_load1", + "query": "avg(node_load1) by (instance)" + }, + "Load_5m": { + "label": "node_load5", + "query": "avg(node_load5) by (instance)" + }, + "load_15m": { + "label": "node_load15", + "query": "avg(node_load15) by (instance)" + }, + "ceph_health": { + "label": "ceph_health_status", + "query": "sum(ceph_health_status)" + }, + "ceph_in": { + "label": "ceph_osd_in", + "query": "sum(ceph_osd_in)" + }, + "ceph_out": { + "label": "ceph_osd_out", + "query": "count(ceph_osd_up) - count(ceph_osd_in)" + }, + "ceph_up": { + "label": "ceph_osd_up", + "query": "sum(ceph_osd_up)" + }, + "ceph_down": { + "label": "ceph_osd_down", + "query": "count(ceph_osd_up == 0) OR vector(0)" + }, + "ceph_average_PGs Per OSD": { + "label": "ceph_osd_numpg", + "query": "avg(ceph_osd_numpg) by (ceph_daemon)" + }, + 
"ceph_monitor_in_quorum": { + "label": "ceph_mon_quorum_status", + "query": "sum(ceph_mon_quorum_status)" + }, + "ceph_avg_available_capacity": { + "label": "ceph_cluster_total_bytes,ceph_cluster_total_used_bytes", + "query": "avg(ceph_cluster_total_used_bytes) / avg(ceph_cluster_total_bytes)" + }, + "ceph_sum_available_capacity": { + "label": "ceph_cluster_total_bytes,ceph_cluster_total_used_bytes", + "query": "sum(ceph_cluster_total_bytes - ceph_cluster_total_used_bytes)" + }, + "ceph_sum_total_capacity": { + "label": "ceph_cluster_total_bytes,ceph_cluster_total_used_bytes", + "query": "sum(ceph_cluster_total_bytes)" + }, + "ceph_sum_used_capacity": { + "label": "ceph_cluster_total_bytes,ceph_cluster_total_used_bytes", + "query": "sum(ceph_cluster_total_used_bytes)" + }, + "ceph_IOPS_write": { + "label": "ceph_osd_op_w", + "query": "sum(rate(ceph_osd_op_w[1m]))" + }, + "ceph_IOPS_read": { + "label": "ceph_osd_op_r", + "query": "sum(rate(ceph_osd_op_r[1m]))" + }, + "ceph_througput_write": { + "label": "ceph_osd_op_w_in_bytes", + "query": "sum(irate(ceph_osd_op_w_in_bytes[1m]))" + }, + "ceph_throughput_read": { + "label": "ceph_osd_op_r_in_bytes", + "query": "sum(irate(ceph_osd_op_r_out_bytes[1m]))" + }, + "ceph_Average_OSD_Apply_Latency": { + "label": "ceph_osd_apply_latency_ms", + "query": "avg(ceph_osd_apply_latency_ms)" + }, + "ceph_Average_OSD_Commit_Latency": { + "label": "ceph_osd_commit_latency_ms", + "query": "avg(ceph_osd_commit_latency_ms)" + }, + "ceph_Objects_in_the_Cluster": { + "label": "ceph_pool_objects", + "query": "sum(ceph_pool_objects)" + }, + "ceph_Per OSD Capacity_stat": { + "label": "Per OSD Capacity", + "query": "sum(ceph_osd_stat_bytes) by (ceph_daemon)" + }, + "ceph_Per OSD Capacity_used": { + "label": "ceph_osd_stat_bytes_used", + "query": "sum(ceph_osd_stat_bytes_used) by (ceph_daemon)" + }, + "ceph_Per OSD Capacity_percentage": { + "label": "ceph_osd_stat_bytes_used,ceph_osd_stat_bytes", + "query": "(sum (ceph_osd_stat_bytes_used) by 
(ceph_daemon) / sum(ceph_osd_stat_bytes) by (ceph_daemon)) * 100" + }, + "ceph_PGs_active": { + "label": "ceph_pg_active", + "query": "sum(ceph_pg_active)" + }, + "ceph_PGs_clean": { + "label": "ceph_pg_clean", + "query": "sum(ceph_pg_clean)" + }, + "ceph_PGs_peering": { + "label": "ceph_pg_peering", + "query": "sum(ceph_pg_peering)" + }, + "ceph_PGs_degraded": { + "label": "ceph_pg_degraded", + "query": "sum(ceph_pg_degraded)" + }, + "ceph_PGs_stale": { + "label": "ceph_pg_stale", + "query": "sum(ceph_pg_stale)" + }, + "ceph_PGs_unclean_pgs": { + "label": "ceph_unclean_pgs", + "query": "sum(ceph_unclean_pgs)" + }, + "ceph_PGs_undersized": { + "label": "ceph_pg_undersized", + "query": "sum(ceph_pg_undersized)" + }, + "ceph_PGs_incomplete": { + "label": "ceph_pg_incomplete", + "query": "sum(ceph_pg_incomplete)" + }, + "ceph_PGs_forced_backfill": { + "label": "ceph_pg_forced_backfill", + "query": "sum(ceph_pg_forced_backfill)" + }, + "ceph_PGs_inconsistent": { + "label": "ceph_pg_inconsistent", + "query": "sum(ceph_pg_inconsistent)" + }, + "ceph_PGs_forced_recovery": { + "label": "ceph_pg_forced_recovery", + "query": "sum(ceph_pg_forced_recovery)" + }, + "ceph_PGs_creating": { + "label": "ceph_pg_creating", + "query": "sum(ceph_pg_creating)" + }, + "ceph_PGs_wait_backfill": { + "label": "ceph_pg_wait_backfill", + "query": "sum(ceph_pg_wait_backfill)" + }, + "ceph_PGs_deep": { + "label": "ceph_pg_deep", + "query": "sum(ceph_pg_deep)" + }, + "ceph_PGs_scrubbing": { + "label": "ceph_pg_scrubbing", + "query": "sum(ceph_pg_scrubbing)" + }, + "ceph_PGs_recovering": { + "label": "ceph_pg_recovering", + "query": "sum(ceph_pg_recovering)" + }, + "ceph_PGs_repair": { + "label": "ceph_pg_repair", + "query": "sum(ceph_pg_repair)" + }, + "ceph_PGs_down": { + "label": "ceph_pg_down", + "query": "sum(ceph_pg_down)" + }, + "ceph_PGs_peered": { + "label": "ceph_pg_peered", + "query": "sum(ceph_pg_peered)" + }, + "ceph_PGs_backfill": { + "label": "ceph_pg_backfill", + "query": 
"sum(ceph_pg_backfill)" + }, + "ceph_PGs_remapped": { + "label": "ceph_pg_remapped", + "query": "sum(ceph_pg_remapped)" + }, + "ceph_PGs_backfill_toofull": { + "label": "ceph_pg_backfill_toofull", + "query": "sum(ceph_pg_backfill_toofull)" + }, + "ceph_Average_OSD_Resource_Quota_TABLE_CPU": { + "label": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate", + "query": "avg(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"openshift-storage\"}) by (container)" + }, + "ceph_Average_OSD_Resource_Quota_TABLE_CPU_limit_percentage": { + "label": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,kube_pod_container_resource_limits_cpu_cores", + "query": "avg(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"openshift-storage\"}) by (container) / avg(kube_pod_container_resource_limits_cpu_cores{namespace=\"openshift-storage\"}) by (container)" + }, + "ceph_Average_OSD_Resource_Quota_TABLE_MEM": { + "label": "container_memory_working_set_bytes", + "query": "avg(container_memory_working_set_bytes{namespace=\"openshift-storage\"}) by (container)" + }, + "ceph_Average_OSD_Resource_Quota_TABLE_MEM_limit_percentage": { + "label": "container_memory_working_set_bytes,kube_pod_container_resource_limits_memory_bytes", + "query": "avg(container_memory_working_set_bytes{namespace=\"openshift-storage\"}) by (container) / avg(kube_pod_container_resource_limits_memory_bytes{namespace=\"openshift-storage\"}) by (container)" + }, + "ceph_OSD_Resource_Quota_mem_used": { + "label": "container_memory_working_set_bytes", + "query": "sum(container_memory_working_set_bytes{namespace=\"openshift-storage\"}) by (container, pod)" + }, + "ceph_OSD_Resource_Quota_mem_request": { + "label": "kube_pod_container_resource_requests_memory_bytes", + "query": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"openshift-storage\"}) by (container, pod)" + }, + 
"ceph_OSD_Resource_Quota_mem_limit": { + "label": "kube_pod_container_resource_limits_memory_bytes", + "query": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"openshift-storage\"}) by (container, pod)" + }, + "ceph_OSD_Resource_Quota_mem_RSS_used": { + "label": "container_memory_rss", + "query": "sum(container_memory_rss{namespace=\"openshift-storage\"}) by (container, pod)" + }, + "ceph_OSD_Resource_Quota_mem_cache_used": { + "label": "container_memory_cache", + "query": "sum(container_memory_cache{namespace=\"openshift-storage\"}) by (container, pod)" + }, + "ceph_OSD_Resource_Quota_mem_swap_used": { + "label": "container_memory_swap", + "query": "sum(container_memory_swap{namespace=\"openshift-storage\"}) by (container, pod)" + }, + "ceph_OSD_Resource_Quota_CPU_usage": { + "label": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate", + "query": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"openshift-storage\"}) by (container, pod)" + }, + "ceph_OSD_Resource_Quota_CPU_request": { + "label": "kube_pod_container_resource_requests_cpu_cores", + "query": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"openshift-storage\"}) by (container, pod)" + }, + "ceph_OSD_Resource_Quota_CPU_limit": { + "label": "kube_pod_container_resource_limits_cpu_cores", + "query": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"openshift-storage\"}) by (container, pod)" + }, + "ceph_CPU_Quota_TABLE_usage": { + "label": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate", + "query": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"openshift-storage\"}) by (pod)" + }, + "ceph_CPU_Quota_TABLE_request": { + "label": "kube_pod_container_resource_requests_cpu_cores", + "query": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"openshift-storage\"}) by (pod)" + }, + 
"ceph_CPU_Quota_TABLE_percentage_request_used": { + "label": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,kube_pod_container_resource_requests_cpu_cores", + "query": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"openshift-storage\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"openshift-storage\"}) by (pod)" + }, + "ceph_CPU_Quota_TABLE_limit": { + "label": "kube_pod_container_resource_limits_cpu_cores", + "query": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"openshift-storage\"}) by (pod)" + }, + "ceph_CPU_Quota_TABLE_percentage_limit_used": { + "label": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,kube_pod_container_resource_limits_cpu_cores", + "query": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"openshift-storage\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"openshift-storage\"}) by (pod)" + }, + "ceph_Memory_Quota_TABLE_mem_usage": { + "label": "container_memory_working_set_bytes", + "query": "sum(container_memory_working_set_bytes{namespace=\"openshift-storage\",id=~\"slice$\"}) by (pod)" + }, + "ceph_Memory_Quota_TABLE_mem_request": { + "label": "kube_pod_container_resource_requests_memory_bytes", + "query": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"openshift-storage\"}) by (pod)" + }, + "ceph_Memory_Quota_TABLE_mem_request_used": { + "label": "container_memory_working_set_bytes,kube_pod_container_resource_requests_memory_bytes", + "query": "sum(container_memory_working_set_bytes{namespace=\"openshift-storage\",id=~\"slice$\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"openshift-storage\"}) by (pod)" + }, + "ceph_Memory_Quota_TABLE_mem_limit": { + "label": "kube_pod_container_resource_limits_memory_bytes", + "query": 
"sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"openshift-storage\"}) by (pod) " + }, + "ceph_Memory_Quota_TABLE_percentage_mem_limit_used": { + "label": "container_memory_working_set_bytes,kube_pod_container_resource_limits_memory_bytes", + "query": "sum(container_memory_working_set_bytes{namespace=\"openshift-storage\",id=~\"slice$\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"openshift-storage\"}) by (pod)" + }, + "ceph_Memory_Quota_TABLE_mem_usage_rss": { + "label": "container_memory_rss", + "query": "sum(container_memory_rss{namespace=\"openshift-storage\",id=~\"slice$\"}) by (pod)" + }, + "ceph_Memory_Quota_TABLE_mem_usage_cache": { + "label": "container_memory_cache", + "query": "sum(container_memory_cache{namespace=\"openshift-storage\",id=~\"slice$\"}) by (pod)" + }, + "ceph_Memory_Quota_TABLE_mem_usage_swap": { + "label": "container_memory_swap", + "query": "sum(container_memory_swap{namespace=\"openshift-storage\",id=~\"slice$\"}) by (pod)" + }, + "ceph_OSD_Operations_Latency": { + "label": "ceph_osd_op_latency_sum,ceph_osd_op_latency_count", + "query": "avg(rate(ceph_osd_op_latency_sum{namespace=\"openshift-storage\"}[1m]) / rate(ceph_osd_op_latency_count{namespace=\"openshift-storage\"}[1m]) >= 0) by (ceph_daemon)" + }, + "ceph_Average OSD Read Operation latency (ms)": { + "label": "ceph_osd_op_r_latency_sum,ceph_osd_op_r_latency_count", + "query": "avg(rate(ceph_osd_op_r_latency_sum{namespace=\"openshift-storage\"}[1m]) / rate(ceph_osd_op_r_latency_count{namespace=\"openshift-storage\"}[1m]) >= 0) by (ceph_daemon)" + }, + "ceph_OSD Read Op Latency Distribution (ms)": { + "label": "ceph_osd_op_r_latency_sum", + "query": "rate(ceph_osd_op_r_latency_sum{namespace=\"openshift-storage\"}[1m]) / rate(ceph_osd_op_r_latency_count{namespace=\"openshift-storage\"}[1m]) >= 0" + }, + "ceph_Average OSD Write Operation latency (ms)": { + "label": "ceph_osd_op_w_latency_sum,ceph_osd_op_w_latency_count", + "query": 
"avg(rate(ceph_osd_op_w_latency_sum{namespace=\"openshift-storage\"}[1m]) / rate(ceph_osd_op_w_latency_count{namespace=\"openshift-storage\"}[1m]) >= 0) by (ceph_daemon)" + }, + "ceph_OSD Write Op Latency Distribution (ms)": { + "label": "ceph_osd_op_w_latency_sum,ceph_osd_op_w_latency_count", + "query": "rate(ceph_osd_op_w_latency_sum{namespace=\"openshift-storage\"}[1m]) / rate(ceph_osd_op_w_latency_count{namespace=\"openshift-storage\"}[1m]) >= 0" + }, + "ceph_OSD Red-Modify-Write Operation latency (ms)": { + "label": "ceph_osd_op_rw_latency_sum", + "query": "sum(irate(ceph_osd_op_rw_latency_sum{namespace=\"openshift-storage\"}[1m])) by (ceph_daemon)" + }, + "ceph_OSD Read Operations Out (bytes)": { + "label": "ceph_osd_op_r_out_bytes", + "query": "sum(irate(ceph_osd_op_r_out_bytes{namespace=\"openshift-storage\"}[1m])) by (ceph_daemon)" + }, + "ceph_OSD Write Operations In (bytes)": { + "label": "ceph_osd_op_w_in_bytes", + "query": "sum(irate(ceph_osd_op_w_in_bytes{namespace=\"openshift-storage\"}[1m])) by (ceph_daemon)" + }, + "ceph_Bluefs Slow Used Bytes": { + "label": "ceph_bluefs_slow_used_bytes", + "query": "avg(irate(ceph_bluefs_slow_used_bytes{namespace=\"openshift-storage\"}[10s])) by (ceph_daemon)" + }, + "ceph_Bluefs Read Bytes": { + "label": "ceph_bluefs_read_bytes", + "query": "avg(ceph_bluefs_read_bytes{namespace=\"openshift-storage\"}) by (ceph_daemon)" + }, + "ceph_Bluefs Read Prefetch Bytes": { + "label": "ceph_bluefs_read_prefetch_bytes", + "query": "avg(ceph_bluefs_read_prefetch_bytes{namespace=\"openshift-storage\"}) by (ceph_daemon)" + }, + "ceph_Bluefs DB Used Bytes": { + "label": "ceph_bluefs_db_used_bytes", + "query": "avg(ceph_bluefs_db_used_bytes{namespace=\"openshift-storage\"}) by (ceph_daemon)" + }, + "ceph_Recovery Operations": { + "label": "ceph_osd_recovery_ops", + "query": "sum(irate(ceph_osd_recovery_ops{namespace=\"openshift-storage\"}[1m]))" + }, + "ceph_bluestore_compressed": { + "label": 
"ceph_bluestore_bluestore_compressed", + "query": "irate(ceph_bluestore_bluestore_compressed[1m])" + }, + "ceph_bluestore_compressed_allocated": { + "label": "ceph_bluestore_bluestore_compressed_allocated", + "query": "irate(ceph_bluestore_bluestore_compressed_allocated[1m])" + }, + "ceph_bluestore_compressed_original": { + "label": "ceph_bluestore_bluestore_compressed_original", + "query": "irate(ceph_bluestore_bluestore_compressed_original[1m])" + } + } +} diff --git a/snafu/utils/prometheus_labels/included_labels.json b/snafu/utils/prometheus_labels/included_labels.json index 36ac5502..21d3a275 100644 --- a/snafu/utils/prometheus_labels/included_labels.json +++ b/snafu/utils/prometheus_labels/included_labels.json @@ -1,1124 +1,69 @@ { - "data": [ - ":kube_pod_info_node_count:", - ":node_memory_MemFreeCachedBuffers_bytes:sum", - "ceph_bluefs_bytes_written_slow", - "ceph_bluefs_bytes_written_sst", - "ceph_bluefs_bytes_written_wal", - "ceph_bluefs_db_total_bytes", - "ceph_bluefs_db_used_bytes", - "ceph_bluefs_log_bytes", - "ceph_bluefs_logged_bytes", - "ceph_bluefs_num_files", - "ceph_bluefs_read_bytes", - "ceph_bluefs_read_prefetch_bytes", - "ceph_bluefs_read_random_buffer_bytes", - "ceph_bluefs_read_random_bytes", - "ceph_bluefs_read_random_disk_bytes", - "ceph_bluefs_slow_total_bytes", - "ceph_bluefs_slow_used_bytes", - "ceph_bluefs_wal_total_bytes", - "ceph_bluefs_wal_used_bytes", - "ceph_bluestore_commit_lat_count", - "ceph_bluestore_commit_lat_sum", - "ceph_bluestore_kv_final_lat_count", - "ceph_bluestore_kv_final_lat_sum", - "ceph_bluestore_kv_flush_lat_count", - "ceph_bluestore_kv_flush_lat_sum", - "ceph_bluestore_kv_sync_lat_count", - "ceph_bluestore_kv_sync_lat_sum", - "ceph_bluestore_read_lat_count", - "ceph_bluestore_read_lat_sum", - "ceph_bluestore_state_aio_wait_lat_count", - "ceph_bluestore_state_aio_wait_lat_sum", - "ceph_bluestore_submit_lat_count", - "ceph_bluestore_submit_lat_sum", - "ceph_bluestore_throttle_lat_count", - 
"ceph_bluestore_throttle_lat_sum", - "ceph_cluster_total_bytes", - "ceph_cluster_total_used_bytes", - "ceph_cluster_total_used_raw_bytes", - "ceph_disk_occupation", - "ceph_health_status", - "ceph_mgr_metadata", - "ceph_mgr_module_can_run", - "ceph_mgr_module_status", - "ceph_mgr_status", - "ceph_mon_election_call", - "ceph_mon_election_lose", - "ceph_mon_election_win", - "ceph_mon_metadata", - "ceph_mon_num_elections", - "ceph_mon_num_sessions", - "ceph_mon_quorum_status", - "ceph_mon_session_add", - "ceph_mon_session_rm", - "ceph_mon_session_trim", - "ceph_num_objects_degraded", - "ceph_num_objects_misplaced", - "ceph_num_objects_unfound", - "ceph_objecter_op_active", - "ceph_objecter_op_r", - "ceph_objecter_op_rmw", - "ceph_objecter_op_w", - "ceph_osd_apply_latency_ms", - "ceph_osd_commit_latency_ms", - "ceph_osd_flag_nobackfill", - "ceph_osd_flag_nodeep_scrub", - "ceph_osd_flag_nodown", - "ceph_osd_flag_noin", - "ceph_osd_flag_noout", - "ceph_osd_flag_norebalance", - "ceph_osd_flag_norecover", - "ceph_osd_flag_noscrub", - "ceph_osd_flag_noup", - "ceph_osd_in", - "ceph_osd_metadata", - "ceph_osd_numpg", - "ceph_osd_numpg_removing", - "ceph_osd_op", - "ceph_osd_op_in_bytes", - "ceph_osd_op_latency_count", - "ceph_osd_op_latency_sum", - "ceph_osd_op_out_bytes", - "ceph_osd_op_prepare_latency_count", - "ceph_osd_op_prepare_latency_sum", - "ceph_osd_op_process_latency_count", - "ceph_osd_op_process_latency_sum", - "ceph_osd_op_r", - "ceph_osd_op_r_latency_count", - "ceph_osd_op_r_latency_sum", - "ceph_osd_op_r_out_bytes", - "ceph_osd_op_r_prepare_latency_count", - "ceph_osd_op_r_prepare_latency_sum", - "ceph_osd_op_r_process_latency_count", - "ceph_osd_op_r_process_latency_sum", - "ceph_osd_op_rw", - "ceph_osd_op_rw_in_bytes", - "ceph_osd_op_rw_latency_count", - "ceph_osd_op_rw_latency_sum", - "ceph_osd_op_rw_out_bytes", - "ceph_osd_op_rw_prepare_latency_count", - "ceph_osd_op_rw_prepare_latency_sum", - "ceph_osd_op_rw_process_latency_count", - 
"ceph_osd_op_rw_process_latency_sum", - "ceph_osd_op_w", - "ceph_osd_op_w_in_bytes", - "ceph_osd_op_w_latency_count", - "ceph_osd_op_w_latency_sum", - "ceph_osd_op_w_prepare_latency_count", - "ceph_osd_op_w_prepare_latency_sum", - "ceph_osd_op_w_process_latency_count", - "ceph_osd_op_w_process_latency_sum", - "ceph_osd_op_wip", - "ceph_osd_recovery_bytes", - "ceph_osd_recovery_ops", - "ceph_osd_stat_bytes", - "ceph_osd_stat_bytes_used", - "ceph_osd_up", - "ceph_osd_weight", - "ceph_paxos_accept_timeout", - "ceph_paxos_begin", - "ceph_paxos_begin_bytes_count", - "ceph_paxos_begin_bytes_sum", - "ceph_paxos_begin_keys_count", - "ceph_paxos_begin_keys_sum", - "ceph_paxos_begin_latency_count", - "ceph_paxos_begin_latency_sum", - "ceph_paxos_collect", - "ceph_paxos_collect_bytes_count", - "ceph_paxos_collect_bytes_sum", - "ceph_paxos_collect_keys_count", - "ceph_paxos_collect_keys_sum", - "ceph_paxos_collect_latency_count", - "ceph_paxos_collect_latency_sum", - "ceph_paxos_collect_timeout", - "ceph_paxos_collect_uncommitted", - "ceph_paxos_commit", - "ceph_paxos_commit_bytes_count", - "ceph_paxos_commit_bytes_sum", - "ceph_paxos_commit_keys_count", - "ceph_paxos_commit_keys_sum", - "ceph_paxos_commit_latency_count", - "ceph_paxos_commit_latency_sum", - "ceph_paxos_lease_ack_timeout", - "ceph_paxos_lease_timeout", - "ceph_paxos_new_pn", - "ceph_paxos_new_pn_latency_count", - "ceph_paxos_new_pn_latency_sum", - "ceph_paxos_refresh", - "ceph_paxos_refresh_latency_count", - "ceph_paxos_refresh_latency_sum", - "ceph_paxos_restart", - "ceph_paxos_share_state", - "ceph_paxos_share_state_bytes_count", - "ceph_paxos_share_state_bytes_sum", - "ceph_paxos_share_state_keys_count", - "ceph_paxos_share_state_keys_sum", - "ceph_paxos_start_leader", - "ceph_paxos_start_peon", - "ceph_paxos_store_state", - "ceph_paxos_store_state_bytes_count", - "ceph_paxos_store_state_bytes_sum", - "ceph_paxos_store_state_keys_count", - "ceph_paxos_store_state_keys_sum", - 
"ceph_paxos_store_state_latency_count", - "ceph_paxos_store_state_latency_sum", - "ceph_pg_activating", - "ceph_pg_active", - "ceph_pg_backfill_toofull", - "ceph_pg_backfill_unfound", - "ceph_pg_backfill_wait", - "ceph_pg_backfilling", - "ceph_pg_clean", - "ceph_pg_creating", - "ceph_pg_deep", - "ceph_pg_degraded", - "ceph_pg_down", - "ceph_pg_forced_backfill", - "ceph_pg_forced_recovery", - "ceph_pg_incomplete", - "ceph_pg_inconsistent", - "ceph_pg_peered", - "ceph_pg_peering", - "ceph_pg_recovering", - "ceph_pg_recovery_toofull", - "ceph_pg_recovery_unfound", - "ceph_pg_recovery_wait", - "ceph_pg_remapped", - "ceph_pg_repair", - "ceph_pg_scrubbing", - "ceph_pg_snaptrim", - "ceph_pg_snaptrim_error", - "ceph_pg_snaptrim_wait", - "ceph_pg_stale", - "ceph_pg_total", - "ceph_pg_undersized", - "ceph_pg_unknown", - "ceph_pool_dirty", - "ceph_pool_max_avail", - "ceph_pool_metadata", - "ceph_pool_num_bytes_recovered", - "ceph_pool_num_objects_recovered", - "ceph_pool_objects", - "ceph_pool_quota_bytes", - "ceph_pool_quota_objects", - "ceph_pool_rd", - "ceph_pool_rd_bytes", - "ceph_pool_recovering_bytes_per_sec", - "ceph_pool_recovering_keys_per_sec", - "ceph_pool_recovering_objects_per_sec", - "ceph_pool_stored", - "ceph_pool_stored_raw", - "ceph_pool_wr", - "ceph_pool_wr_bytes", - "ceph_rocksdb_compact", - "ceph_rocksdb_compact_queue_len", - "ceph_rocksdb_compact_queue_merge", - "ceph_rocksdb_compact_range", - "ceph_rocksdb_get", - "ceph_rocksdb_get_latency_count", - "ceph_rocksdb_get_latency_sum", - "ceph_rocksdb_rocksdb_write_delay_time_count", - "ceph_rocksdb_rocksdb_write_delay_time_sum", - "ceph_rocksdb_rocksdb_write_memtable_time_count", - "ceph_rocksdb_rocksdb_write_memtable_time_sum", - "ceph_rocksdb_rocksdb_write_pre_and_post_time_count", - "ceph_rocksdb_rocksdb_write_pre_and_post_time_sum", - "ceph_rocksdb_rocksdb_write_wal_time_count", - "ceph_rocksdb_rocksdb_write_wal_time_sum", - "ceph_rocksdb_submit_latency_count", - "ceph_rocksdb_submit_latency_sum", - 
"ceph_rocksdb_submit_sync_latency_count", - "ceph_rocksdb_submit_sync_latency_sum", - "ceph_rocksdb_submit_transaction", - "ceph_rocksdb_submit_transaction_sync", - "ceph_scrape_duration_secs", - "cluster:capacity_cpu_cores:sum", - "cluster:capacity_memory_bytes:sum", - "cluster:ceph_node_down:join_kube", - "cluster:container_cpu_usage:ratio", - "cluster:container_spec_cpu_shares:ratio", - "cluster:cpu_usage_cores:sum", - "cluster:master_nodes", - "cluster:memory_usage:ratio", - "cluster:node_cpu:sum_rate5m", - "cluster:node_instance_type_count:sum", - "cluster_autoscaler_controller_adds", - "cluster_autoscaler_controller_depth", - "cluster_autoscaler_controller_longest_running_processor_microseconds", - "cluster_autoscaler_controller_queue_latency", - "cluster_autoscaler_controller_queue_latency_count", - "cluster_autoscaler_controller_queue_latency_sum", - "cluster_autoscaler_controller_retries", - "cluster_autoscaler_controller_unfinished_work_seconds", - "cluster_autoscaler_controller_work_duration", - "cluster_autoscaler_controller_work_duration_count", - "cluster_autoscaler_controller_work_duration_sum", - "cluster_feature_set", - "cluster_infrastructure_provider", - "cluster_installer", - "cluster_master_schedulable", - "cluster_monitoring_operator_reconcile_attempts_total", - "cluster_monitoring_operator_reconcile_errors_total", - "cluster_operator_condition_transitions", - "cluster_operator_conditions", - "cluster_operator_payload_errors", - "cluster_operator_up", - "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", - "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", - "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", - "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", - "cluster_version", - "cluster_version_available_updates", - "cluster_version_payload", - "code:apiserver_request_count:rate:sum", - 
"code:cluster:ingress_http_request_count:rate5m:sum", - "code:registry_api_request_count:rate:sum", - "console_url", - "container_cpu_cfs_periods_total", - "container_cpu_cfs_throttled_periods_total", - "container_cpu_cfs_throttled_seconds_total", - "container_cpu_system_seconds_total", - "container_cpu_usage_seconds_total", - "container_cpu_user_seconds_total", - "container_fs_inodes_free", - "container_fs_inodes_total", - "container_fs_io_current", - "container_fs_io_time_seconds_total", - "container_fs_io_time_weighted_seconds_total", - "container_fs_limit_bytes", - "container_fs_read_seconds_total", - "container_fs_reads_bytes_total", - "container_fs_reads_merged_total", - "container_fs_reads_total", - "container_fs_sector_reads_total", - "container_fs_sector_writes_total", - "container_fs_usage_bytes", - "container_fs_write_seconds_total", - "container_fs_writes_bytes_total", - "container_fs_writes_merged_total", - "container_fs_writes_total", - "container_last_seen", - "container_memory_cache", - "container_memory_failcnt", - "container_memory_failures_total", - "container_memory_mapped_file", - "container_memory_max_usage_bytes", - "container_memory_rss", - "container_memory_swap", - "container_memory_usage_bytes", - "container_memory_working_set_bytes", - "container_network_receive_bytes_total", - "container_network_receive_errors_total", - "container_network_receive_packets_dropped_total", - "container_network_receive_packets_total", - "container_network_transmit_bytes_total", - "container_network_transmit_errors_total", - "container_network_transmit_packets_dropped_total", - "container_network_transmit_packets_total", - "container_runtime_crio_operations", - "container_runtime_crio_operations_errors", - "container_runtime_crio_operations_latency_microseconds", - "container_runtime_crio_operations_latency_microseconds_count", - "container_runtime_crio_operations_latency_microseconds_sum", - "container_scrape_error", - "container_spec_cpu_period", - 
"container_spec_cpu_quota", - "container_spec_cpu_shares", - "container_spec_memory_limit_bytes", - "container_spec_memory_reservation_limit_bytes", - "container_spec_memory_swap_limit_bytes", - "container_start_time_seconds", - "controller_clusterquotamappingcontroller_clusterquotas_adds", - "controller_clusterquotamappingcontroller_clusterquotas_depth", - "controller_clusterquotamappingcontroller_clusterquotas_longest_running_processor_microseconds", - "controller_clusterquotamappingcontroller_clusterquotas_queue_latency", - "controller_clusterquotamappingcontroller_clusterquotas_queue_latency_count", - "controller_clusterquotamappingcontroller_clusterquotas_queue_latency_sum", - "controller_clusterquotamappingcontroller_clusterquotas_retries", - "controller_clusterquotamappingcontroller_clusterquotas_unfinished_work_seconds", - "controller_clusterquotamappingcontroller_clusterquotas_work_duration", - "controller_clusterquotamappingcontroller_clusterquotas_work_duration_count", - "controller_clusterquotamappingcontroller_clusterquotas_work_duration_sum", - "controller_clusterquotamappingcontroller_namespaces_adds", - "controller_clusterquotamappingcontroller_namespaces_depth", - "controller_clusterquotamappingcontroller_namespaces_longest_running_processor_microseconds", - "controller_clusterquotamappingcontroller_namespaces_queue_latency", - "controller_clusterquotamappingcontroller_namespaces_queue_latency_count", - "controller_clusterquotamappingcontroller_namespaces_queue_latency_sum", - "controller_clusterquotamappingcontroller_namespaces_retries", - "controller_clusterquotamappingcontroller_namespaces_unfinished_work_seconds", - "controller_clusterquotamappingcontroller_namespaces_work_duration", - "controller_clusterquotamappingcontroller_namespaces_work_duration_count", - "controller_clusterquotamappingcontroller_namespaces_work_duration_sum", - "csv_count", - "csv_upgrade_count", - "deployment_adds", - "deployment_controller_rate_limiter_use", - 
"deployment_depth", - "deployment_longest_running_processor_microseconds", - "deployment_queue_latency", - "deployment_queue_latency_count", - "deployment_queue_latency_sum", - "deployment_retries", - "deployment_unfinished_work_seconds", - "deployment_work_duration", - "deployment_work_duration_count", - "deployment_work_duration_sum", - "etcd_debugging_mvcc_db_compaction_keys_total", - "etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_bucket", - "etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_count", - "etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_sum", - "etcd_debugging_mvcc_db_compaction_total_duration_milliseconds_bucket", - "etcd_debugging_mvcc_db_compaction_total_duration_milliseconds_count", - "etcd_debugging_mvcc_db_compaction_total_duration_milliseconds_sum", - "etcd_debugging_mvcc_db_total_size_in_bytes", - "etcd_debugging_mvcc_delete_total", - "etcd_debugging_mvcc_events_total", - "etcd_debugging_mvcc_index_compaction_pause_duration_milliseconds_bucket", - "etcd_debugging_mvcc_index_compaction_pause_duration_milliseconds_count", - "etcd_debugging_mvcc_index_compaction_pause_duration_milliseconds_sum", - "etcd_debugging_mvcc_keys_total", - "etcd_debugging_mvcc_pending_events_total", - "etcd_debugging_mvcc_put_total", - "etcd_debugging_mvcc_range_total", - "etcd_debugging_mvcc_slow_watcher_total", - "etcd_debugging_mvcc_txn_total", - "etcd_debugging_mvcc_watch_stream_total", - "etcd_debugging_mvcc_watcher_total", - "etcd_debugging_server_lease_expired_total", - "etcd_debugging_snap_save_marshalling_duration_seconds_bucket", - "etcd_debugging_snap_save_marshalling_duration_seconds_count", - "etcd_debugging_snap_save_marshalling_duration_seconds_sum", - "etcd_debugging_snap_save_total_duration_seconds_bucket", - "etcd_debugging_snap_save_total_duration_seconds_count", - "etcd_debugging_snap_save_total_duration_seconds_sum", - "etcd_debugging_store_expires_total", - "etcd_debugging_store_reads_total", - 
"etcd_debugging_store_watch_requests_total", - "etcd_debugging_store_watchers", - "etcd_debugging_store_writes_total", - "etcd_disk_backend_commit_duration_seconds_bucket", - "etcd_disk_backend_commit_duration_seconds_count", - "etcd_disk_backend_commit_duration_seconds_sum", - "etcd_disk_backend_defrag_duration_seconds_bucket", - "etcd_disk_backend_defrag_duration_seconds_count", - "etcd_disk_backend_defrag_duration_seconds_sum", - "etcd_disk_backend_snapshot_duration_seconds_bucket", - "etcd_disk_backend_snapshot_duration_seconds_count", - "etcd_disk_backend_snapshot_duration_seconds_sum", - "etcd_disk_wal_fsync_duration_seconds_bucket", - "etcd_disk_wal_fsync_duration_seconds_count", - "etcd_disk_wal_fsync_duration_seconds_sum", - "etcd_grpc_proxy_cache_hits_total", - "etcd_grpc_proxy_cache_keys_total", - "etcd_grpc_proxy_cache_misses_total", - "etcd_grpc_proxy_events_coalescing_total", - "etcd_grpc_proxy_watchers_coalescing_total", - "etcd_helper_cache_entry_count", - "etcd_helper_cache_entry_total", - "etcd_helper_cache_hit_count", - "etcd_helper_cache_hit_total", - "etcd_helper_cache_miss_count", - "etcd_helper_cache_miss_total", - "etcd_mvcc_db_total_size_in_bytes", - "etcd_mvcc_db_total_size_in_use_in_bytes", - "etcd_mvcc_hash_duration_seconds_bucket", - "etcd_mvcc_hash_duration_seconds_count", - "etcd_mvcc_hash_duration_seconds_sum", - "etcd_mvcc_hash_rev_duration_seconds_bucket", - "etcd_mvcc_hash_rev_duration_seconds_count", - "etcd_mvcc_hash_rev_duration_seconds_sum", - "etcd_network_client_grpc_received_bytes_total", - "etcd_network_client_grpc_sent_bytes_total", - "etcd_network_peer_received_bytes_total", - "etcd_network_peer_round_trip_time_seconds_bucket", - "etcd_network_peer_round_trip_time_seconds_count", - "etcd_network_peer_round_trip_time_seconds_sum", - "etcd_network_peer_sent_bytes_total", - "etcd_network_peer_sent_failures_total", - "etcd_object_counts", - "etcd_request_cache_add_duration_seconds_bucket", - 
"etcd_request_cache_add_duration_seconds_count", - "etcd_request_cache_add_duration_seconds_sum", - "etcd_request_cache_add_latencies_summary", - "etcd_request_cache_add_latencies_summary_count", - "etcd_request_cache_add_latencies_summary_sum", - "etcd_request_cache_get_duration_seconds_bucket", - "etcd_request_cache_get_duration_seconds_count", - "etcd_request_cache_get_duration_seconds_sum", - "etcd_request_cache_get_latencies_summary", - "etcd_request_cache_get_latencies_summary_count", - "etcd_request_cache_get_latencies_summary_sum", - "etcd_server_go_version", - "etcd_server_has_leader", - "etcd_server_health_failures", - "etcd_server_health_success", - "etcd_server_heartbeat_send_failures_total", - "etcd_server_id", - "etcd_server_is_leader", - "etcd_server_leader_changes_seen_total", - "etcd_server_proposals_applied_total", - "etcd_server_proposals_committed_total", - "etcd_server_proposals_failed_total", - "etcd_server_proposals_pending", - "etcd_server_quota_backend_bytes", - "etcd_server_read_indexes_failed_total", - "etcd_server_slow_apply_total", - "etcd_server_slow_read_indexes_total", - "etcd_server_version", - "etcd_snap_db_fsync_duration_seconds_bucket", - "etcd_snap_db_fsync_duration_seconds_count", - "etcd_snap_db_fsync_duration_seconds_sum", - "etcd_snap_db_save_total_duration_seconds_bucket", - "etcd_snap_db_save_total_duration_seconds_count", - "etcd_snap_db_save_total_duration_seconds_sum", - "federate_errors", - "federate_filtered_samples", - "federate_samples", - "frontend:cluster:ingress_frontend_bytes_in:rate5m:sum", - "frontend:cluster:ingress_frontend_bytes_out:rate5m:sum", - "frontend:cluster:ingress_frontend_connections:sum", - "gc_controller_rate_limiter_use", - "get_token_count", - "get_token_fail_count", - "go_gc_duration_seconds", - "go_gc_duration_seconds_count", - "go_gc_duration_seconds_sum", - "go_goroutines", - "go_info", - "go_memstats_alloc_bytes", - "go_memstats_alloc_bytes_total", - "go_memstats_buck_hash_sys_bytes", - 
"go_memstats_frees_total", - "go_memstats_gc_cpu_fraction", - "go_memstats_gc_sys_bytes", - "go_memstats_heap_alloc_bytes", - "go_memstats_heap_idle_bytes", - "go_memstats_heap_inuse_bytes", - "go_memstats_heap_objects", - "go_memstats_heap_released_bytes", - "go_memstats_heap_sys_bytes", - "go_memstats_last_gc_time_seconds", - "go_memstats_lookups_total", - "go_memstats_mallocs_total", - "go_memstats_mcache_inuse_bytes", - "go_memstats_mcache_sys_bytes", - "go_memstats_mspan_inuse_bytes", - "go_memstats_mspan_sys_bytes", - "go_memstats_next_gc_bytes", - "go_memstats_other_sys_bytes", - "go_memstats_stack_inuse_bytes", - "go_memstats_stack_sys_bytes", - "go_memstats_sys_bytes", - "go_threads", - "http_request_duration_microseconds", - "http_request_duration_microseconds_count", - "http_request_duration_microseconds_sum", - "http_request_duration_milliseconds", - "http_request_duration_milliseconds_count", - "http_request_duration_milliseconds_sum", - "http_request_size_bytes", - "http_request_size_bytes_count", - "http_request_size_bytes_sum", - "http_request_total", - "http_requests_total", - "http_response_size_bytes", - "http_response_size_bytes_count", - "http_response_size_bytes_sum", - "instance:etcd_object_counts:sum", - "instance:node_cpu:rate:sum", - "instance:node_cpu:ratio", - "instance:node_cpu_utilisation:rate1m", - "instance:node_filesystem_usage:sum", - "instance:node_load1_per_cpu:ratio", - "instance:node_memory_swap_io_pages:rate1m", - "instance:node_memory_utilisation:ratio", - "instance:node_network_receive_bytes:rate:sum", - "instance:node_network_receive_bytes_excluding_lo:rate1m", - "instance:node_network_receive_drop_excluding_lo:rate1m", - "instance:node_network_transmit_bytes:rate:sum", - "instance:node_network_transmit_bytes_excluding_lo:rate1m", - "instance:node_network_transmit_drop_excluding_lo:rate1m", - "instance:node_num_cpu:sum", - "instance_device:node_disk_io_time_seconds:rate1m", - 
"instance_device:node_disk_io_time_weighted_seconds:rate1m", - "kube_certificatesigningrequest_cert_length", - "kube_certificatesigningrequest_condition", - "kube_certificatesigningrequest_created", - "kube_certificatesigningrequest_labels", - "kube_configmap_created", - "kube_configmap_info", - "kube_configmap_metadata_resource_version", - "kube_daemonset_created", - "kube_daemonset_labels", - "kube_daemonset_metadata_generation", - "kube_daemonset_status_current_number_scheduled", - "kube_daemonset_status_desired_number_scheduled", - "kube_daemonset_status_number_available", - "kube_daemonset_status_number_misscheduled", - "kube_daemonset_status_number_ready", - "kube_daemonset_status_number_unavailable", - "kube_daemonset_updated_number_scheduled", - "kube_deployment_created", - "kube_deployment_labels", - "kube_deployment_metadata_generation", - "kube_deployment_spec_paused", - "kube_deployment_spec_replicas", - "kube_deployment_spec_strategy_rollingupdate_max_surge", - "kube_deployment_spec_strategy_rollingupdate_max_unavailable", - "kube_deployment_status_observed_generation", - "kube_deployment_status_replicas", - "kube_deployment_status_replicas_available", - "kube_deployment_status_replicas_unavailable", - "kube_deployment_status_replicas_updated", - "kube_endpoint_address_available", - "kube_endpoint_address_not_ready", - "kube_endpoint_created", - "kube_endpoint_info", - "kube_endpoint_labels", - "kube_job_complete", - "kube_job_created", - "kube_job_info", - "kube_job_labels", - "kube_job_owner", - "kube_job_spec_completions", - "kube_job_spec_parallelism", - "kube_job_status_active", - "kube_job_status_completion_time", - "kube_job_status_failed", - "kube_job_status_start_time", - "kube_job_status_succeeded", - "kube_namespace_created", - "kube_namespace_labels", - "kube_namespace_status_phase", - "kube_node_created", - "kube_node_info", - "kube_node_labels", - "kube_node_role", - "kube_node_spec_unschedulable", - "kube_node_status_allocatable", - 
"kube_node_status_allocatable_cpu_cores", - "kube_node_status_allocatable_memory_bytes", - "kube_node_status_allocatable_pods", - "kube_node_status_capacity", - "kube_node_status_capacity_cpu_cores", - "kube_node_status_capacity_memory_bytes", - "kube_node_status_capacity_pods", - "kube_node_status_condition", - "kube_persistentvolume_capacity_bytes", - "kube_persistentvolume_info", - "kube_persistentvolume_labels", - "kube_persistentvolume_status_phase", - "kube_persistentvolumeclaim_access_mode", - "kube_persistentvolumeclaim_info", - "kube_persistentvolumeclaim_labels", - "kube_persistentvolumeclaim_resource_requests_storage_bytes", - "kube_persistentvolumeclaim_status_phase", - "kube_pod_completion_time", - "kube_pod_container_info", - "kube_pod_container_resource_limits", - "kube_pod_container_resource_limits_cpu_cores", - "kube_pod_container_resource_limits_memory_bytes", - "kube_pod_container_resource_requests", - "kube_pod_container_resource_requests_cpu_cores", - "kube_pod_container_resource_requests_memory_bytes", - "kube_pod_container_status_last_terminated_reason", - "kube_pod_container_status_ready", - "kube_pod_container_status_restarts_total", - "kube_pod_container_status_running", - "kube_pod_container_status_terminated", - "kube_pod_container_status_terminated_reason", - "kube_pod_container_status_waiting", - "kube_pod_container_status_waiting_reason", - "kube_pod_created", - "kube_pod_info", - "kube_pod_init_container_info", - "kube_pod_init_container_status_last_terminated_reason", - "kube_pod_init_container_status_ready", - "kube_pod_init_container_status_restarts_total", - "kube_pod_init_container_status_running", - "kube_pod_init_container_status_terminated", - "kube_pod_init_container_status_terminated_reason", - "kube_pod_init_container_status_waiting", - "kube_pod_init_container_status_waiting_reason", - "kube_pod_labels", - "kube_pod_owner", - "kube_pod_restart_policy", - "kube_pod_spec_volumes_persistentvolumeclaims_info", - 
"kube_pod_spec_volumes_persistentvolumeclaims_readonly", - "kube_pod_start_time", - "kube_pod_status_phase", - "kube_pod_status_ready", - "kube_pod_status_ready:etcd:sum", - "kube_pod_status_ready:image_registry:sum", - "kube_pod_status_scheduled", - "kube_pod_status_scheduled_time", - "kube_poddisruptionbudget_created", - "kube_poddisruptionbudget_status_current_healthy", - "kube_poddisruptionbudget_status_desired_healthy", - "kube_poddisruptionbudget_status_expected_pods", - "kube_poddisruptionbudget_status_observed_generation", - "kube_poddisruptionbudget_status_pod_disruptions_allowed", - "kube_replicaset_created", - "kube_replicaset_labels", - "kube_replicaset_metadata_generation", - "kube_replicaset_owner", - "kube_replicaset_spec_replicas", - "kube_replicaset_status_fully_labeled_replicas", - "kube_replicaset_status_observed_generation", - "kube_replicaset_status_ready_replicas", - "kube_replicaset_status_replicas", - "kube_secret_created", - "kube_secret_info", - "kube_secret_labels", - "kube_secret_metadata_resource_version", - "kube_secret_type", - "kube_service_created", - "kube_service_info", - "kube_service_labels", - "kube_service_spec_type", - "kube_state_metrics_list_total", - "kube_state_metrics_watch_total", - "kube_statefulset_created", - "kube_statefulset_labels", - "kube_statefulset_metadata_generation", - "kube_statefulset_replicas", - "kube_statefulset_status_current_revision", - "kube_statefulset_status_observed_generation", - "kube_statefulset_status_replicas", - "kube_statefulset_status_replicas_current", - "kube_statefulset_status_replicas_ready", - "kube_statefulset_status_replicas_updated", - "kube_statefulset_status_update_revision", - "kube_storageclass_created", - "kube_storageclass_info", - "kube_storageclass_labels", - "kubelet_certificate_manager_client_expiration_seconds", - "kubelet_certificate_manager_server_expiration_seconds", - "kubelet_cgroup_manager_duration_seconds_bucket", - 
"kubelet_cgroup_manager_duration_seconds_count", - "kubelet_cgroup_manager_duration_seconds_sum", - "kubelet_cgroup_manager_latency_microseconds", - "kubelet_cgroup_manager_latency_microseconds_count", - "kubelet_cgroup_manager_latency_microseconds_sum", - "kubelet_container_log_filesystem_used_bytes", - "kubelet_containers_per_pod_count_bucket", - "kubelet_containers_per_pod_count_count", - "kubelet_containers_per_pod_count_sum", - "kubelet_node_config_error", - "kubelet_node_name", - "kubelet_pleg_relist_duration_seconds_bucket", - "kubelet_pleg_relist_duration_seconds_count", - "kubelet_pleg_relist_duration_seconds_sum", - "kubelet_pleg_relist_interval_microseconds", - "kubelet_pleg_relist_interval_microseconds_count", - "kubelet_pleg_relist_interval_microseconds_sum", - "kubelet_pleg_relist_interval_seconds_bucket", - "kubelet_pleg_relist_interval_seconds_count", - "kubelet_pleg_relist_interval_seconds_sum", - "kubelet_pleg_relist_latency_microseconds", - "kubelet_pleg_relist_latency_microseconds_count", - "kubelet_pleg_relist_latency_microseconds_sum", - "kubelet_pod_start_duration_seconds_bucket", - "kubelet_pod_start_duration_seconds_count", - "kubelet_pod_start_duration_seconds_sum", - "kubelet_pod_start_latency_microseconds", - "kubelet_pod_start_latency_microseconds_count", - "kubelet_pod_start_latency_microseconds_sum", - "kubelet_pod_worker_duration_seconds_bucket", - "kubelet_pod_worker_duration_seconds_count", - "kubelet_pod_worker_duration_seconds_sum", - "kubelet_pod_worker_latency_microseconds", - "kubelet_pod_worker_latency_microseconds_count", - "kubelet_pod_worker_latency_microseconds_sum", - "kubelet_pod_worker_start_duration_seconds_bucket", - "kubelet_pod_worker_start_duration_seconds_count", - "kubelet_pod_worker_start_duration_seconds_sum", - "kubelet_pod_worker_start_latency_microseconds", - "kubelet_pod_worker_start_latency_microseconds_count", - "kubelet_pod_worker_start_latency_microseconds_sum", - "kubelet_running_container_count", - 
"kubelet_running_pod_count", - "kubelet_runtime_operations", - "kubelet_runtime_operations_duration_seconds_bucket", - "kubelet_runtime_operations_duration_seconds_count", - "kubelet_runtime_operations_duration_seconds_sum", - "kubelet_runtime_operations_errors", - "kubelet_runtime_operations_errors_total", - "kubelet_runtime_operations_latency_microseconds", - "kubelet_runtime_operations_latency_microseconds_count", - "kubelet_runtime_operations_latency_microseconds_sum", - "kubelet_runtime_operations_total", - "kubelet_volume_stats_available_bytes", - "kubelet_volume_stats_capacity_bytes", - "kubelet_volume_stats_inodes", - "kubelet_volume_stats_inodes_free", - "kubelet_volume_stats_inodes_used", - "kubelet_volume_stats_used_bytes", - "kubernetes_build_info", - "machine_cpu_cores", - "machine_memory_bytes", - "metricsclient_request_retrieve", - "metricsclient_request_send", - "mixin_pod_workload", - "namespace:container_cpu_usage:sum", - "namespace:container_cpu_usage_seconds_total:sum_rate", - "namespace:container_memory_usage_bytes:sum", - "namespace:container_spec_cpu_shares:sum", - "namespace:kube_pod_container_resource_requests_cpu_cores:sum", - "namespace:kube_pod_container_resource_requests_memory_bytes:sum", - "namespace_adds", - "namespace_controller_rate_limiter_use", - "namespace_depth", - "namespace_longest_running_processor_microseconds", - "namespace_pod_container:container_cpu_usage_seconds_total:sum_rate", - "namespace_queue_latency", - "namespace_queue_latency_count", - "namespace_queue_latency_sum", - "namespace_retries", - "namespace_unfinished_work_seconds", - "namespace_work_duration", - "namespace_work_duration_count", - "namespace_work_duration_sum", - "node:node_num_cpu:sum", - "node_arp_entries", - "node_boot_time_seconds", - "node_collector_evictions_number", - "node_collector_unhealthy_nodes_in_zone", - "node_collector_zone_health", - "node_collector_zone_size", - "node_context_switches_total", - "node_cpu_core_throttles_total", - 
"node_cpu_frequency_max_hertz", - "node_cpu_frequency_min_hertz", - "node_cpu_guest_seconds_total", - "node_cpu_package_throttles_total", - "node_cpu_scaling_frequency_hertz", - "node_cpu_scaling_frequency_max_hrts", - "node_cpu_scaling_frequency_min_hrts", - "node_cpu_seconds_total", - "node_disk_discard_time_seconds_total", - "node_disk_discarded_sectors_total", - "node_disk_discards_completed_total", - "node_disk_discards_merged_total", - "node_disk_io_now", - "node_disk_io_time_seconds_total", - "node_disk_io_time_weighted_seconds_total", - "node_disk_read_bytes_total", - "node_disk_read_time_seconds_total", - "node_disk_reads_completed_total", - "node_disk_reads_merged_total", - "node_disk_write_time_seconds_total", - "node_disk_writes_completed_total", - "node_disk_writes_merged_total", - "node_disk_written_bytes_total", - "node_edac_correctable_errors_total", - "node_edac_csrow_correctable_errors_total", - "node_edac_csrow_uncorrectable_errors_total", - "node_edac_uncorrectable_errors_total", - "node_entropy_available_bits", - "node_exporter_build_info", - "node_filefd_allocated", - "node_filefd_maximum", - "node_filesystem_avail_bytes", - "node_filesystem_device_error", - "node_filesystem_files", - "node_filesystem_files_free", - "node_filesystem_free_bytes", - "node_filesystem_readonly", - "node_filesystem_size_bytes", - "node_forks_total", - "node_hwmon_chip_names", - "node_hwmon_sensor_label", - "node_hwmon_temp_celsius", - "node_hwmon_temp_crit_alarm_celsius", - "node_hwmon_temp_crit_celsius", - "node_hwmon_temp_max_celsius", - "node_intr_total", - "node_lifecycle_controller_adds", - "node_lifecycle_controller_depth", - "node_lifecycle_controller_longest_running_processor_microseconds", - "node_lifecycle_controller_queue_latency", - "node_lifecycle_controller_queue_latency_count", - "node_lifecycle_controller_queue_latency_sum", - "node_lifecycle_controller_rate_limiter_use", - "node_lifecycle_controller_unfinished_work_seconds", - 
"node_lifecycle_controller_work_duration", - "node_lifecycle_controller_work_duration_count", - "node_lifecycle_controller_work_duration_sum", - "node_load1", - "node_load15", - "node_load5", - "node_memory_Active_anon_bytes", - "node_memory_Active_bytes", - "node_memory_Active_file_bytes", - "node_memory_AnonHugePages_bytes", - "node_memory_AnonPages_bytes", - "node_memory_Bounce_bytes", - "node_memory_Buffers_bytes", - "node_memory_Cached_bytes", - "node_memory_CommitLimit_bytes", - "node_memory_Committed_AS_bytes", - "node_memory_DirectMap1G_bytes", - "node_memory_DirectMap2M_bytes", - "node_memory_DirectMap4k_bytes", - "node_memory_Dirty_bytes", - "node_memory_HardwareCorrupted_bytes", - "node_memory_HugePages_Free", - "node_memory_HugePages_Rsvd", - "node_memory_HugePages_Surp", - "node_memory_HugePages_Total", - "node_memory_Hugepagesize_bytes", - "node_memory_Hugetlb_bytes", - "node_memory_Inactive_anon_bytes", - "node_memory_Inactive_bytes", - "node_memory_Inactive_file_bytes", - "node_memory_KernelStack_bytes", - "node_memory_Mapped_bytes", - "node_memory_MemAvailable_bytes", - "node_memory_MemFree_bytes", - "node_memory_MemTotal_bytes", - "node_memory_Mlocked_bytes", - "node_memory_NFS_Unstable_bytes", - "node_memory_PageTables_bytes", - "node_memory_SReclaimable_bytes", - "node_memory_SUnreclaim_bytes", - "node_memory_ShmemHugePages_bytes", - "node_memory_ShmemPmdMapped_bytes", - "node_memory_Shmem_bytes", - "node_memory_Slab_bytes", - "node_memory_SwapCached_bytes", - "node_memory_SwapFree_bytes", - "node_memory_SwapTotal_bytes", - "node_memory_Unevictable_bytes", - "node_memory_VmallocChunk_bytes", - "node_memory_VmallocTotal_bytes", - "node_memory_VmallocUsed_bytes", - "node_memory_WritebackTmp_bytes", - "node_memory_Writeback_bytes", - "node_namespace_pod:kube_pod_info:", - "node_netstat_Icmp6_InErrors", - "node_netstat_Icmp6_InMsgs", - "node_netstat_Icmp6_OutMsgs", - "node_netstat_Icmp_InErrors", - "node_netstat_Icmp_InMsgs", - 
"node_netstat_Icmp_OutMsgs", - "node_netstat_Ip6_InOctets", - "node_netstat_Ip6_OutOctets", - "node_netstat_IpExt_InOctets", - "node_netstat_IpExt_OutOctets", - "node_netstat_Ip_Forwarding", - "node_netstat_TcpExt_ListenDrops", - "node_netstat_TcpExt_ListenOverflows", - "node_netstat_TcpExt_SyncookiesFailed", - "node_netstat_TcpExt_SyncookiesRecv", - "node_netstat_TcpExt_SyncookiesSent", - "node_netstat_TcpExt_TCPSynRetrans", - "node_netstat_Tcp_ActiveOpens", - "node_netstat_Tcp_CurrEstab", - "node_netstat_Tcp_InErrs", - "node_netstat_Tcp_InSegs", - "node_netstat_Tcp_OutSegs", - "node_netstat_Tcp_PassiveOpens", - "node_netstat_Tcp_RetransSegs", - "node_netstat_Udp6_InDatagrams", - "node_netstat_Udp6_InErrors", - "node_netstat_Udp6_NoPorts", - "node_netstat_Udp6_OutDatagrams", - "node_netstat_UdpLite6_InErrors", - "node_netstat_UdpLite_InErrors", - "node_netstat_Udp_InDatagrams", - "node_netstat_Udp_InErrors", - "node_netstat_Udp_NoPorts", - "node_netstat_Udp_OutDatagrams", - "node_network_address_assign_type", - "node_network_carrier", - "node_network_carrier_changes_total", - "node_network_carrier_down_changes_total", - "node_network_carrier_up_changes_total", - "node_network_device_id", - "node_network_dormant", - "node_network_flags", - "node_network_iface_id", - "node_network_iface_link", - "node_network_iface_link_mode", - "node_network_info", - "node_network_mtu_bytes", - "node_network_name_assign_type", - "node_network_net_dev_group", - "node_network_protocol_type", - "node_network_receive_bytes_total", - "node_network_receive_compressed_total", - "node_network_receive_drop_total", - "node_network_receive_errs_total", - "node_network_receive_fifo_total", - "node_network_receive_frame_total", - "node_network_receive_multicast_total", - "node_network_receive_packets_total", - "node_network_speed_bytes", - "node_network_transmit_bytes_total", - "node_network_transmit_carrier_total", - "node_network_transmit_colls_total", - 
"node_network_transmit_compressed_total", - "node_network_transmit_drop_total", - "node_network_transmit_errs_total", - "node_network_transmit_fifo_total", - "node_network_transmit_packets_total", - "node_network_transmit_queue_length", - "node_network_up", - "node_nf_conntrack_entries", - "node_nf_conntrack_entries_limit", - "node_procs_blocked", - "node_procs_running", - "node_role_os_version_machine:cpu_capacity_cores:sum", - "node_scrape_collector_duration_seconds", - "node_scrape_collector_success", - "node_sockstat_FRAG_inuse", - "node_sockstat_FRAG_memory", - "node_sockstat_RAW_inuse", - "node_sockstat_TCP_alloc", - "node_sockstat_TCP_inuse", - "node_sockstat_TCP_mem", - "node_sockstat_TCP_mem_bytes", - "node_sockstat_TCP_orphan", - "node_sockstat_TCP_tw", - "node_sockstat_UDPLITE_inuse", - "node_sockstat_UDP_inuse", - "node_sockstat_UDP_mem", - "node_sockstat_UDP_mem_bytes", - "node_sockstat_sockets_used", - "node_textfile_scrape_error", - "node_time_seconds", - "node_timex_estimated_error_seconds", - "node_timex_frequency_adjustment_ratio", - "node_timex_loop_time_constant", - "node_timex_maxerror_seconds", - "node_timex_offset_seconds", - "node_timex_pps_calibration_total", - "node_timex_pps_error_total", - "node_timex_pps_frequency_hertz", - "node_timex_pps_jitter_seconds", - "node_timex_pps_jitter_total", - "node_timex_pps_shift_seconds", - "node_timex_pps_stability_exceeded_total", - "node_timex_pps_stability_hertz", - "node_timex_status", - "node_timex_sync_status", - "node_timex_tai_offset_seconds", - "node_timex_tick_seconds", - "node_uname_info", - "node_vmstat_oom_kill", - "node_vmstat_pgfault", - "node_vmstat_pgmajfault", - "node_vmstat_pgpgin", - "node_vmstat_pgpgout", - "node_vmstat_pswpin", - "node_vmstat_pswpout", - "node_xfs_allocation_btree_compares_total", - "node_xfs_allocation_btree_lookups_total", - "node_xfs_allocation_btree_records_deleted_total", - "node_xfs_allocation_btree_records_inserted_total", - 
"node_xfs_block_map_btree_compares_total", - "node_xfs_block_map_btree_lookups_total", - "node_xfs_block_map_btree_records_deleted_total", - "node_xfs_block_map_btree_records_inserted_total", - "node_xfs_block_mapping_extent_list_compares_total", - "node_xfs_block_mapping_extent_list_deletions_total", - "node_xfs_block_mapping_extent_list_insertions_total", - "node_xfs_block_mapping_extent_list_lookups_total", - "node_xfs_block_mapping_reads_total", - "node_xfs_block_mapping_unmaps_total", - "node_xfs_block_mapping_writes_total", - "node_xfs_extent_allocation_blocks_allocated_total", - "node_xfs_extent_allocation_blocks_freed_total", - "node_xfs_extent_allocation_extents_allocated_total", - "node_xfs_extent_allocation_extents_freed_total", - "openshift:cpu_usage_cores:sum", - "openshift_apps_deploymentconfigs_complete_rollouts_total", - "openshift_auth_form_password_count", - "openshift_auth_form_password_count_result", - "openshift_auth_password_total", - "openshift_build_info", - "openshift_cluster_authentication_operator_build_info", - "openshift_cluster_kube_apiserver_operator_build_info", - "openshift_cluster_kube_controller_manager_operator_build_info", - "openshift_cluster_kube_scheduler_operator_build_info", - "openshift_cluster_openshift_apiserver_operator_build_info", - "openshift_console_operator_build_info", - "openshift_imagestreamcontroller_error_count", - "openshift_imagestreamcontroller_success_count", - "openshift_route_created", - "openshift_route_info", - "openshift_route_labels", - "openshift_route_status", - "openshift_sdn_arp_cache_entries", - "openshift_sdn_ovs_flows", - "openshift_sdn_pod_ips", - "openshift_sdn_pod_operations_latency", - "openshift_sdn_pod_operations_latency_count", - "openshift_sdn_pod_operations_latency_sum", - "openshift_sdn_vnid_not_found_errors", - "openshift_service_serving_cert_signer_build_info", - "openshift_template_instance_active_age_seconds_bucket", - "openshift_template_instance_active_age_seconds_count", - 
"openshift_template_instance_active_age_seconds_sum", - "persistentvolume_protection_controller_rate_limiter_use", - "persistentvolumeclaim_protection_controller_rate_limiter_use", - "pod_name:container_cpu_usage:sum", - "pod_name:container_fs_usage_bytes:sum", - "pod_name:container_memory_usage_bytes:sum", - "pod_name:container_spec_cpu_shares:sum", - "process_cpu_seconds_total", - "process_max_fds", - "process_open_fds", - "process_resident_memory_bytes", - "process_start_time_seconds", - "process_virtual_memory_bytes", - "process_virtual_memory_max_bytes", - "promhttp_metric_handler_requests_total", - "pv_collector_bound_pv_count", - "pv_collector_bound_pvc_count", - "pv_collector_unbound_pv_count", - "pv_collector_unbound_pvc_count", - "pvcs_adds", - "pvcs_depth", - "pvcs_longest_running_processor_microseconds", - "pvcs_queue_latency", - "pvcs_queue_latency_count", - "pvcs_queue_latency_sum", - "pvcs_retries", - "pvcs_unfinished_work_seconds", - "pvcs_work_duration", - "pvcs_work_duration_count", - "pvcs_work_duration_sum", - "pvprotection_adds", - "pvprotection_depth", - "pvprotection_longest_running_processor_microseconds", - "pvprotection_queue_latency", - "pvprotection_queue_latency_count", - "pvprotection_queue_latency_sum", - "pvprotection_retries", - "pvprotection_unfinished_work_seconds", - "pvprotection_work_duration", - "pvprotection_work_duration_count", - "service_adds", - "service_controller_rate_limiter_use", - "service_depth", - "service_longest_running_processor_microseconds", - "service_queue_latency", - "service_queue_latency_count", - "service_queue_latency_sum", - "service_retries", - "service_unfinished_work_seconds", - "service_work_duration", - "service_work_duration_count", - "service_work_duration_sum", - "subscription_count", - "subscription_sync_total", - "up", - "workload:cpu_usage_cores:sum", - "workload:memory_usage_bytes:sum" - ] + "data": { + "Average_Disk_IOPS_Read": { + "label": "node_disk_reads_completed_total", + "query": 
"(irate(node_disk_reads_completed_total{device!~\"dm.*\",device!~\"rb.*\",device!~\"nbd.*\"}[1m]))" + }, + "Average_Disk_IOPS_Write": { + "label": "node_disk_writes_completed_total", + "query": "(irate(node_disk_writes_completed_total{device!~\"dm.*\",device!~\"rb.*\",device!~\"nbd.*\"}[1m]))" + }, + "Average_Disk_Throughput_Read": { + "label": "node_disk_read_bytes_total", + "query": "(irate(node_disk_read_bytes_total{device!~\"dm.*\",device!~\"rb.*\",device!~\"nbd.*\"}[1m]))" + }, + "Average_Disk_Throughput_Write": { + "label": "node_disk_written_bytes_total", + "query": "(irate(node_disk_written_bytes_total{device!~\"dm.*\",device!~\"rb.*\",device!~\"nbd.*\"}[1m]))" + }, + "Average_Network_Throughput_Rx": { + "label": "node_network_receive_bytes_total", + "query": "(irate(node_network_receive_bytes_total{device!~\"tun.*\",device!~\"vxlan.*\",device!~\"ovs.*\",device!~\"br.*\",device!~\"veth.*\"}[1m]) * 8)" + }, + "Average_Network_Throughput_Tx": { + "label": "node_network_transmit_bytes_total", + "query": "(irate(node_network_transmit_bytes_total{device!~\"tun.*\",device!~\"vxlan.*\",device!~\"ovs.*\",device!~\"br.*\",device!~\"veth.*\"}[1m]) * 8)" + }, + "Average_CPU_Usage_per_Instance": { + "label": "node_cpu_seconds_total", + "query": "(irate(node_cpu_seconds_total[1m]))" + }, + "Average_Memory_Usage_Active": { + "label": "node_memory_MemTotal_bytes", + "query": "avg(node_memory_Active_bytes) by (instance)" + }, + "Average_Memory_Usage_Cached_Buffers": { + "label": "node_memory_Cached_bytes,node_memory_Buffers_bytes", + "query": "avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance)" + }, + "Average_Memory_Usage_Available": { + "label": "node_memory_MemAvailable_bytes,node_memory_Cached_bytes,node_memory_Buffers_bytes", + "query": "avg(node_memory_MemAvailable_bytes) by (instance) - (avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance))" + }, + "Memory_Usage_RSS": { + "label": 
"container_memory_rss", + "query": "sum(container_memory_rss) by (service, node)" + }, + "Containers_CPU_usage": { + "label": "container_cpu_usage_seconds_total", + "query": "(irate(container_cpu_usage_seconds_total[1m]))" + }, + "Number_of_PVCs": { + "label": "kube_persistentvolumeclaim_info", + "query": "avg(count(kube_persistentvolumeclaim_info))" + }, + "Load_1m": { + "label": "node_load1", + "query": "avg(node_load1) by (instance)" + }, + "Load_5m": { + "label": "node_load5", + "query": "avg(node_load5) by (instance)" + }, + "load_15m": { + "label": "node_load15", + "query": "avg(node_load15) by (instance)" + } + + } } \ No newline at end of file diff --git a/snafu/utils/py_es_bulk.py b/snafu/utils/py_es_bulk.py index 1b22e58b..432ebff6 100644 --- a/snafu/utils/py_es_bulk.py +++ b/snafu/utils/py_es_bulk.py @@ -147,6 +147,7 @@ def actions_tracking_closure(cl_actions): generator = actions_tracking_closure(actions) if parallel: + logger.info("Using parallel bulk indexer") streaming_bulk_generator = helpers.parallel_bulk(es, generator, chunk_size=10000000, @@ -157,6 +158,7 @@ def actions_tracking_closure(cl_actions): raise_on_exception=False, request_timeout=_request_timeout) else: + logger.info("Using streaming bulk indexer") streaming_bulk_generator = helpers.streaming_bulk(es, generator, raise_on_error=False,