Add collection of buffer cache usage metrics
The pg_buffercache extension provides a pg_buffercache view that gives
a detailed report on shared buffer usage: which relation is using each
buffer, whether it's dirty, and how many backends are pinning it.

This patch adds the ability to collect data from pg_buffercache to
provide visibility into shared buffer usage.
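
For context, here is a minimal sketch (not part of the patch) of the kind of per-relation report the view can produce, assuming the extension is installed and the querying role is allowed to read it:

-- Sketch only: per-relation shared buffer usage from pg_buffercache.
SELECT c.relname,
       count(*)                AS buffers,
       sum(b.isdirty::int)     AS dirty_buffers,
       sum(b.pinning_backends) AS pinning_backends
FROM pg_buffercache b
JOIN pg_class c ON b.relfilenode = pg_relation_filenode(c.oid)
GROUP BY c.relname
ORDER BY buffers DESC
LIMIT 10;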
bonnefoa committed Jul 3, 2024
1 parent 23f0c89 commit 9035410
Showing 13 changed files with 110 additions and 3 deletions.
7 changes: 7 additions & 0 deletions postgres/assets/configuration/spec.yaml
@@ -256,6 +256,13 @@ files:
example:
- application_name
default: []
- name: collect_buffercache_metrics
description: |
If set to true, collects metrics regarding buffer cache usage from pg_buffercache.
The pg_buffercache extension must be installed.
value:
type: boolean
example: false
- name: collect_database_size_metrics
description: Collect database size metrics.
value:
1 change: 1 addition & 0 deletions postgres/changelog.d/17960.added
@@ -0,0 +1 @@
Add collection of buffer cache usage metrics
1 change: 1 addition & 0 deletions postgres/datadog_checks/postgres/config.py
@@ -81,6 +81,7 @@ def __init__(self, instance, init_config):
self.ssl_key = instance.get('ssl_key', None)
self.ssl_password = instance.get('ssl_password', None)
self.table_count_limit = instance.get('table_count_limit', TABLE_COUNT_LIMIT)
self.collect_buffercache_metrics = is_affirmative(instance.get('collect_buffercache_metrics', False))
self.collect_function_metrics = is_affirmative(instance.get('collect_function_metrics', False))
# Default value for `count_metrics` is True for backward compatibility
self.collect_count_metrics = is_affirmative(instance.get('collect_count_metrics', True))
4 changes: 4 additions & 0 deletions postgres/datadog_checks/postgres/config_models/defaults.py
@@ -28,6 +28,10 @@ def instance_collect_bloat_metrics():
return False


def instance_collect_buffercache_metrics():
return False


def instance_collect_checksum_metrics():
return False

1 change: 1 addition & 0 deletions postgres/datadog_checks/postgres/config_models/instance.py
@@ -200,6 +200,7 @@ class InstanceConfig(BaseModel):
azure: Optional[Azure] = None
collect_activity_metrics: Optional[bool] = None
collect_bloat_metrics: Optional[bool] = None
collect_buffercache_metrics: Optional[bool] = None
collect_checksum_metrics: Optional[bool] = None
collect_count_metrics: Optional[bool] = None
collect_database_size_metrics: Optional[bool] = None
6 changes: 6 additions & 0 deletions postgres/datadog_checks/postgres/data/conf.yaml.example
@@ -197,6 +197,12 @@ instances:
# activity_metrics_excluded_aggregations:
# - application_name

## @param collect_buffercache_metrics - boolean - optional - default: false
## If set to true, collects metrics regarding buffer cache usage from pg_buffercache.
## The pg_buffercache extension must be installed.
#
# collect_buffercache_metrics: false

## @param collect_database_size_metrics - boolean - optional - default: true
## Collect database size metrics.
#
3 changes: 3 additions & 0 deletions postgres/datadog_checks/postgres/postgres.py
@@ -41,6 +41,7 @@
ANALYZE_PROGRESS_METRICS,
AWS_RDS_HOSTNAME_SUFFIX,
AZURE_DEPLOYMENT_TYPE_TO_RESOURCE_TYPE,
BUFFERCACHE_METRICS,
CLUSTER_VACUUM_PROGRESS_METRICS,
CONNECTION_METRICS,
COUNT_METRICS,
@@ -301,6 +302,8 @@ def dynamic_queries(self):
if self._config.collect_wal_metrics is not False:
# collect wal metrics for pg >= 10 only if the user has not explicitly disabled it
queries.append(WAL_FILE_METRICS)
if self._config.collect_buffercache_metrics:
queries.append(BUFFERCACHE_METRICS)
queries.append(QUERY_PG_REPLICATION_SLOTS)
queries.append(VACUUM_PROGRESS_METRICS)
queries.append(STAT_SUBSCRIPTION_METRICS)
2 changes: 1 addition & 1 deletion postgres/datadog_checks/postgres/relationsmanager.py
@@ -150,7 +150,7 @@
}

# We used to rely on pg_stat_user_tables to get tuples and scan metrics
-# However, using this view is inefficient as it groups by aggregation on oid and schema
+# However, using this view is inefficient as it group by aggregation on oid and schema
# behind the hood, leading to possible temporary bytes being written to handle the sort.
# To avoid this, we need to directly call the pg_stat_* functions
QUERY_PG_CLASS = {
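
As an illustration of the direct-function approach that comment describes (a sketch only, not code from this patch; it assumes per-table counters are wanted for ordinary tables):

-- Sketch only: read scan/tuple counters through pg_stat_* functions,
-- avoiding the hidden aggregation done by the pg_stat_user_tables view.
SELECT c.relname,
       pg_stat_get_numscans(c.oid)    AS seq_scan,
       pg_stat_get_live_tuples(c.oid) AS n_live_tup
FROM pg_class c
WHERE c.relkind = 'r';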
46 changes: 46 additions & 0 deletions postgres/datadog_checks/postgres/util.py
@@ -695,6 +695,52 @@ def get_list_chunks(lst, n):
"max(age(backend_xmin))",
]

# pg_buffercache is implemented with a function scan, so the planner doesn't
# have a reliable estimate of the number of rows returned by pg_buffercache:
# it falls back on the function's pg_proc.prorows, which defaults to 1000.
# On top of that, the function is volatile, preventing possible inlining and
# optimisation.
# It is very likely that we have far more buffers than relations: 16GB of shared_buffers
# yields 2097152 buffers from pg_buffercache, while pg_class will mostly hold
# thousands of rows. Therefore, we write the query as a CTE aggregating on reldatabase
# and relfilenode. Given that the function is volatile, this forces the CTE to be
# materialized, and its output cardinality is at most that of pg_class.
# This is more efficient than the CTE-less version, which relies on a merge join and thus
# sorts the output of pg_buffercache.
BUFFERCACHE_METRICS = {
'name': 'buffercache_metrics',
'query': """
WITH buffer_by_relfilenode AS (
SELECT reldatabase, relfilenode,
NULLIF(COUNT(CASE WHEN relfilenode IS NOT NULL THEN 1 END), 0) as used,
COUNT(CASE WHEN relfilenode IS NULL THEN 1 END) as unused,
SUM(usagecount) as sum_usagecount,
NULLIF(SUM(isdirty::int), 0) as sum_dirty,
NULLIF(SUM(pinning_backends), 0) as sum_pinning
FROM pg_buffercache
GROUP BY reldatabase, relfilenode
)
SELECT COALESCE(d.datname, 'shared'), n.nspname, c.relname,
used, unused, sum_usagecount, sum_dirty, sum_pinning
FROM buffer_by_relfilenode b
LEFT JOIN pg_database d ON b.reldatabase = d.oid
LEFT JOIN pg_class c ON b.relfilenode = pg_relation_filenode(c.oid)
LEFT JOIN pg_namespace n ON n.oid = c.relnamespace;
""",
'columns': [
{'name': 'db', 'type': 'tag'},
{'name': 'schema', 'type': 'tag_not_null'},
{'name': 'relation', 'type': 'tag_not_null'},
{'name': 'used_buffers', 'type': 'gauge'},
{'name': 'unused_buffers', 'type': 'gauge'},
{'name': 'usage_count', 'type': 'gauge'},
{'name': 'dirty_buffers', 'type': 'gauge'},
{'name': 'pinning_backends', 'type': 'gauge'},
],
'metric_prefix': 'postgresql.buffercache',
}

# The metrics we retrieve from pg_stat_activity when the postgres version >= 9.6
ACTIVITY_METRICS_9_6 = [
"SUM(CASE WHEN xact_start IS NOT NULL THEN 1 ELSE 0 END)",
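
For contrast with the comment above, a hedged sketch of the CTE-less formulation it warns against (illustrative only, not part of the patch): the joins run over every buffer row before the GROUP BY, inviting a merge join that sorts the whole of pg_buffercache's output.

-- Illustrative CTE-less variant: aggregation happens after the joins,
-- so e.g. 2097152 buffer rows (16GB of shared_buffers) feed the sort.
SELECT COALESCE(d.datname, 'shared') AS db, n.nspname, c.relname,
       COUNT(*) AS used, SUM(b.isdirty::int) AS sum_dirty
FROM pg_buffercache b
LEFT JOIN pg_database d ON b.reldatabase = d.oid
LEFT JOIN pg_class c ON b.relfilenode = pg_relation_filenode(c.oid)
LEFT JOIN pg_namespace n ON n.oid = c.relnamespace
GROUP BY 1, 2, 3;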
5 changes: 5 additions & 0 deletions postgres/metadata.csv
@@ -25,6 +25,11 @@ postgresql.bgwriter.maxwritten_clean,count,,,,The number of times the background
postgresql.bgwriter.sync_time,count,,millisecond,,The total amount of checkpoint processing time spent synchronizing files to disk.,0,postgres,bgw sync time,
postgresql.bgwriter.write_time,count,,millisecond,,The total amount of checkpoint processing time spent writing files to disk.,0,postgres,bgw wrt time,
postgresql.buffer_hit,gauge,,hit,second,"The number of times disk blocks were found in the buffer cache, preventing the need to read from the database. This metric is tagged with db.",1,postgres,buff hit,
postgresql.buffercache.dirty_buffers,gauge,,buffer,,"Number of dirty shared buffers. pg_buffercache extension needs to be installed. This metric is tagged by db, schema and relation.",0,postgres,buffercache dirty buffers,
postgresql.buffercache.pinning_backends,gauge,,,,"Number of backends pinning shared buffers. pg_buffercache extension needs to be installed. This metric is tagged by db, schema and relation.",0,postgres,buffercache pinning backends,
postgresql.buffercache.unused_buffers,gauge,,buffer,,"Number of unused shared buffers. pg_buffercache extension needs to be installed.",0,postgres,buffercache unused buffers,
postgresql.buffercache.usage_count,gauge,,,,"Sum of shared buffers' usage_count. pg_buffercache extension needs to be installed. This metric is tagged by db, schema and relation.",0,postgres,buffercache usage count,
postgresql.buffercache.used_buffers,gauge,,buffer,,"Number of used shared buffers. pg_buffercache extension needs to be installed. This metric is tagged by db, schema and relation.",0,postgres,buffercache buffers,
postgresql.checksums.checksum_failures,count,,,,"The number of checksum failures in this database. This metric is tagged with db.",0,postgres,checksums,
postgresql.checksums.enabled,count,,,,"Whether database checksums are enabled. Value is always 1 and tagged with enabled:true or enabled:false. This metric is tagged with db.",0,postgres,checksums.enabled,
postgresql.cluster_vacuum.heap_blks_scanned,gauge,,block,,"Number of heap blocks scanned. This counter only advances when the phase is seq scanning heap. Only available with PostgreSQL 12 and newer. This metric is tagged with db, table, command, phase, index.",0,postgres,postgres cluster blk_scanned,
5 changes: 4 additions & 1 deletion postgres/tests/common.py
@@ -119,10 +119,13 @@

def _iterate_metric_name(query):
if 'columns' in query:
metric_prefix = ''
if 'metric_prefix' in query:
metric_prefix = f'{query["metric_prefix"]}.'
for column in query['columns']:
if column['type'].startswith('tag'):
continue
yield column['name']
yield f'{metric_prefix}{column["name"]}'
else:
for metric in query['metrics'].values():
yield metric[0]
1 change: 1 addition & 0 deletions postgres/tests/compose/resources/02_setup.sh
@@ -12,6 +12,7 @@ EOSQL
fi

psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" "datadog_test" <<-'EOSQL'
CREATE EXTENSION pg_buffercache SCHEMA public;
CREATE EXTENSION pg_stat_statements SCHEMA public;
GRANT SELECT ON pg_stat_statements TO datadog;
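
One operational note not covered by the patch: by default pg_buffercache is readable only by superusers and members of pg_monitor, so a non-superuser monitoring role such as the datadog user above would typically need a grant along these lines:

-- Assumption: the datadog role from the setup script is not a superuser;
-- pg_monitor membership is one documented way to open up pg_buffercache.
GRANT pg_monitor TO datadog;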
31 changes: 30 additions & 1 deletion postgres/tests/test_pg_integration.py
@@ -14,7 +14,7 @@
from datadog_checks.base.stubs import datadog_agent
from datadog_checks.postgres import PostgreSql
from datadog_checks.postgres.__about__ import __version__
from datadog_checks.postgres.util import DatabaseHealthCheckError, PartialFormatter, fmt
from datadog_checks.postgres.util import BUFFERCACHE_METRICS, DatabaseHealthCheckError, PartialFormatter, fmt

from .common import (
COMMON_METRICS,
@@ -25,6 +25,7 @@
POSTGRES_VERSION,
USER_ADMIN,
_get_expected_tags,
_iterate_metric_name,
assert_metric_at_least,
check_activity_metrics,
check_bgw_metrics,
@@ -339,6 +340,34 @@ def test_connections_metrics(aggregator, integration_check, pg_instance):
aggregator.assert_metric('postgresql.connections', count=1, tags=expected_tags)


@requires_over_10
def test_buffercache_metrics(aggregator, integration_check, pg_instance):
pg_instance['collect_buffercache_metrics'] = True
check = integration_check(pg_instance)

with _get_superconn(pg_instance) as conn:
with conn.cursor() as cur:
# Generate some usage on persons relation
cur.execute('SELECT * FROM persons;')

check.check(pg_instance)
base_tags = _get_expected_tags(check, pg_instance)

# Check specific persons relation
persons_tags = base_tags + ['relation:persons', 'db:datadog_test', 'schema:public']
metrics_not_emitted_if_zero = ['postgresql.buffercache.pinning_backends', 'postgresql.buffercache.dirty_buffers']
for metric in _iterate_metric_name(BUFFERCACHE_METRICS):
if metric in metrics_not_emitted_if_zero:
aggregator.assert_metric(metric, count=0, tags=persons_tags)
else:
aggregator.assert_metric(metric, count=1, tags=persons_tags)

# Check metric reported for unused buffers
unused_buffers_tags = base_tags + ['db:shared']
unused_metric = 'postgresql.buffercache.unused_buffers'
aggregator.assert_metric(unused_metric, count=1, tags=unused_buffers_tags)


def test_locks_metrics_no_relations(aggregator, integration_check, pg_instance):
"""
Since 4.0.0, to prevent tag explosion, lock metrics are not collected anymore unless relations are specified
