Skip to content

Commit

Permalink
Add collection of metrics for buffer cache usage
Browse files Browse the repository at this point in the history
The pg_buffercache extension provides a pg_buffercache view that gives
a detailed report on the usage of each shared buffer: which relation is
using it, whether it's dirty, and how many backends are pinning it.

This patch adds the capacity to collect data from pg_buffercache to
provide visibility on shared buffer usage.
  • Loading branch information
bonnefoa committed Jul 1, 2024
1 parent 76c0471 commit f6ca9fb
Show file tree
Hide file tree
Showing 11 changed files with 100 additions and 1 deletion.
6 changes: 6 additions & 0 deletions postgres/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,12 @@ files:
example:
- application_name
default: []
- name: collect_buffercache_metrics
description: |
If set to true, collects metrics regarding buffer cache usage from pg_buffercache.
value:
type: boolean
example: false
- name: collect_database_size_metrics
description: Collect database size metrics.
value:
Expand Down
1 change: 1 addition & 0 deletions postgres/changelog.d/17960.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add collection of metrics for buffer cache usage
1 change: 1 addition & 0 deletions postgres/datadog_checks/postgres/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def __init__(self, instance, init_config):
self.ssl_key = instance.get('ssl_key', None)
self.ssl_password = instance.get('ssl_password', None)
self.table_count_limit = instance.get('table_count_limit', TABLE_COUNT_LIMIT)
self.collect_buffercache_metrics = is_affirmative(instance.get('collect_buffercache_metrics', False))
self.collect_function_metrics = is_affirmative(instance.get('collect_function_metrics', False))
# Default value for `count_metrics` is True for backward compatibility
self.collect_count_metrics = is_affirmative(instance.get('collect_count_metrics', True))
Expand Down
4 changes: 4 additions & 0 deletions postgres/datadog_checks/postgres/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ def instance_collect_count_metrics():
return True


def instance_collect_buffercache_metrics():
    """Default for `collect_buffercache_metrics`: disabled unless explicitly opted in."""
    return False


def instance_collect_database_size_metrics():
return True

Expand Down
1 change: 1 addition & 0 deletions postgres/datadog_checks/postgres/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ class InstanceConfig(BaseModel):
azure: Optional[Azure] = None
collect_activity_metrics: Optional[bool] = None
collect_bloat_metrics: Optional[bool] = None
collect_buffercache_metrics: Optional[bool]
collect_checksum_metrics: Optional[bool] = None
collect_count_metrics: Optional[bool] = None
collect_database_size_metrics: Optional[bool] = None
Expand Down
5 changes: 5 additions & 0 deletions postgres/datadog_checks/postgres/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,11 @@ instances:
# activity_metrics_excluded_aggregations:
# - application_name

## @param collect_buffercache_metrics - boolean - optional - default: false
## If set to true, collects metrics regarding buffer cache usage from pg_buffercache.
#
# collect_buffercache_metrics: false

## @param collect_database_size_metrics - boolean - optional - default: true
## Collect database size metrics.
#
Expand Down
3 changes: 3 additions & 0 deletions postgres/datadog_checks/postgres/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
ANALYZE_PROGRESS_METRICS,
AWS_RDS_HOSTNAME_SUFFIX,
AZURE_DEPLOYMENT_TYPE_TO_RESOURCE_TYPE,
BUFFERCACHE_METRICS,
CLUSTER_VACUUM_PROGRESS_METRICS,
CONNECTION_METRICS,
COUNT_METRICS,
Expand Down Expand Up @@ -301,6 +302,8 @@ def dynamic_queries(self):
if self._config.collect_wal_metrics is not False:
# collect wal metrics for pg >= 10 only if the user has not explicitly disabled it
queries.append(WAL_FILE_METRICS)
if self._config.collect_buffercache_metrics:
queries.append(BUFFERCACHE_METRICS)
queries.append(QUERY_PG_REPLICATION_SLOTS)
queries.append(VACUUM_PROGRESS_METRICS)
queries.append(STAT_SUBSCRIPTION_METRICS)
Expand Down
44 changes: 44 additions & 0 deletions postgres/datadog_checks/postgres/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,50 @@ def get_list_chunks(lst, n):
"max(age(backend_xmin))",
]

# pg_buffercache is implemented with a function scan. Thus, the planner doesn't
# have much reliable estimation on the number of rows returned by pg_buffercache.
# The function's pgproc.prorows is used and 1000 is used as a default value.
# On top of that, the function is volatile, preventing possible inlining and
# optimisation.
# It is very likely that we have way more buffers than relations: 16GB of shared_buffers
# will have 2097152 buffers returned by pg_buffercache while pg_class will mostly be
# around thousands of rows. Therefore, we write the query as a CTE aggregating on reldatabase
# and relfilenode. Given that the function is volatile, this will force the CTE to be
# materialized and we should have less or the same cardinality as output as pg_class's
# rows.
# This is more efficient than the cte-less version which will rely on a merge join and thus
# sort the output of pg_buffercache.
BUFFERCACHE_METRICS = {
'name': 'buffercache_metrics',
'query': """
WITH buffer_by_relfilenode AS (
SELECT reldatabase, relfilenode,
COUNT(*) as count,
SUM(usagecount) as sum_usagecount,
NULLIF(SUM(isdirty::int), 0) as sum_dirty,
NULLIF(SUM(pinning_backends), 0) as sum_pinning
FROM pg_buffercache
GROUP BY reldatabase, relfilenode
)
SELECT COALESCE(d.datname, 'shared'), n.nspname, COALESCE(c.relname, 'free'),
count, sum_usagecount, sum_dirty, sum_pinning
FROM buffer_by_relfilenode b
LEFT JOIN pg_database d ON b.reldatabase = d.oid
LEFT JOIN pg_class c ON b.relfilenode = pg_relation_filenode(c.oid)
LEFT JOIN pg_namespace n ON n.oid = c.relnamespace;
""",
'columns': [
{'name': 'db', 'type': 'tag'},
{'name': 'schema', 'type': 'tag'},
{'name': 'relation', 'type': 'tag'},
{'name': 'buffers', 'type': 'gauge'},
{'name': 'usage_count', 'type': 'gauge'},
{'name': 'dirty_buffers', 'type': 'gauge'},
{'name': 'pinning_backends', 'type': 'gauge'},
],
'metric_prefix': 'postgresql.buffercache',
}

# The metrics we retrieve from pg_stat_activity when the postgres version >= 9.6
ACTIVITY_METRICS_9_6 = [
"SUM(CASE WHEN xact_start IS NOT NULL THEN 1 ELSE 0 END)",
Expand Down
4 changes: 4 additions & 0 deletions postgres/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ postgresql.bgwriter.maxwritten_clean,count,,,,The number of times the background
postgresql.bgwriter.sync_time,count,,millisecond,,The total amount of checkpoint processing time spent synchronizing files to disk.,0,postgres,bgw sync time,
postgresql.bgwriter.write_time,count,,millisecond,,The total amount of checkpoint processing time spent writing files to disk.,0,postgres,bgw wrt time,
postgresql.buffer_hit,gauge,,hit,second,"The number of times disk blocks were found in the buffer cache, preventing the need to read from the database. This metric is tagged with db.",1,postgres,buff hit,
postgresql.buffercache.buffers,gauge,,buffer,,"The number of buffers from the shared_buffers. This metric is tagged with db, relation.",0,postgres,buffercache buffers,
postgresql.buffercache.dirty_buffers,gauge,,buffer,,"The number of dirty buffers from the shared_buffers. This metric is tagged with db, relation.",0,postgres,buffercache dirty buffers,
postgresql.buffercache.pinning_backends,gauge,,,,"The sum of buffers' pinning_backends aggregated by relation. This metric is tagged with db, relation.",0,postgres,buffercache pinning backends,
postgresql.buffercache.usage_count,gauge,,,,"The sum of buffers' usage_count aggregated by relation. This metric is tagged with db, relation.",0,postgres,buffercache usage count,
postgresql.checksums.checksum_failures,count,,,,"The number of checksum failures in this database. This metric is tagged with db.",0,postgres,checksums,
postgresql.checksums.enabled,count,,,,"Whether database checksums are enabled. Value is always 1 and tagged with enabled:true or enabled:false. This metric is tagged with db.",0,postgres,checksums.enabled,
postgresql.cluster_vacuum.heap_blks_scanned,gauge,,block,,"Number of heap blocks scanned. This counter only advances when the phase is seq scanning heap. Only available with PostgreSQL 12 and newer. This metric is tagged with db, table, command, phase, index.",0,postgres,postgres cluster blk_scanned,
Expand Down
1 change: 1 addition & 0 deletions postgres/tests/compose/resources/02_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ EOSQL
fi

psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" "datadog_test" <<-'EOSQL'
CREATE EXTENSION pg_buffercache SCHEMA public;
CREATE EXTENSION pg_stat_statements SCHEMA public;
GRANT SELECT ON pg_stat_statements TO datadog;
Expand Down
31 changes: 30 additions & 1 deletion postgres/tests/test_pg_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from datadog_checks.base.stubs import datadog_agent
from datadog_checks.postgres import PostgreSql
from datadog_checks.postgres.__about__ import __version__
from datadog_checks.postgres.util import DatabaseHealthCheckError, PartialFormatter, fmt
from datadog_checks.postgres.util import BUFFERCACHE_METRICS, DatabaseHealthCheckError, PartialFormatter, fmt

from .common import (
COMMON_METRICS,
Expand All @@ -25,6 +25,7 @@
POSTGRES_VERSION,
USER_ADMIN,
_get_expected_tags,
_iterate_metric_name,
assert_metric_at_least,
check_activity_metrics,
check_bgw_metrics,
Expand Down Expand Up @@ -339,6 +340,34 @@ def test_connections_metrics(aggregator, integration_check, pg_instance):
aggregator.assert_metric('postgresql.connections', count=1, tags=expected_tags)


@requires_over_10
def test_buffercache_metrics(aggregator, integration_check, pg_instance):
    """When collect_buffercache_metrics is enabled, buffercache metrics are
    emitted once per (db, relation) pair, and usage/pinning metrics are
    skipped for free buffers."""
    pg_instance['collect_buffercache_metrics'] = True
    check = integration_check(pg_instance)
    check.check(pg_instance)

    base_tags = _get_expected_tags(check, pg_instance)
    relations_by_db = {
        'shared': ['pg_tablespace', 'pg_database_datname_index'],
        'datadog_test': ['pg_operator_oprname_l_r_n_index', 'pg_type'],
        'postgres': ['pg_operator', 'pg_class'],
    }

    # Every buffercache metric is reported exactly once per sampled relation.
    for database, relations in relations_by_db.items():
        for relation in relations:
            tags = base_tags + [f'relation:{relation}', f'db:{database}']
            for metric in _iterate_metric_name(BUFFERCACHE_METRICS):
                aggregator.assert_metric(metric, count=1, tags=tags)

    # Free buffers have no usage and can't be pinned, we shouldn't emit those metrics for free buffers
    skipped_for_free = ['postgresql.buffercache.usage_count', 'postgresql.buffercache.pinning_backends']
    free_tags = base_tags + ['db:shared', 'relation:free']
    for metric in _iterate_metric_name(BUFFERCACHE_METRICS):
        expected_count = 0 if metric in skipped_for_free else 1
        aggregator.assert_metric(metric, count=expected_count, tags=free_tags)


def test_locks_metrics_no_relations(aggregator, integration_check, pg_instance):
"""
Since 4.0.0, to prevent tag explosion, lock metrics are not collected anymore unless relations are specified
Expand Down

0 comments on commit f6ca9fb

Please sign in to comment.