diff --git a/README.md b/README.md
index 82f66b8..c374081 100644
--- a/README.md
+++ b/README.md
@@ -95,13 +95,9 @@ create user postgres_ai_mon with password '';
 grant connect on database to postgres_ai_mon;
 
 grant pg_monitor to postgres_ai_mon;
-grant select on pg_stat_statements to postgres_ai_mon;
-grant select on pg_stat_database to postgres_ai_mon;
-grant select on pg_stat_user_tables to postgres_ai_mon;
-grant select on pg_stat_user_indexes to postgres_ai_mon;
 grant select on pg_index to postgres_ai_mon;
 
--- Create a public view for pg_statistic access (required for bloat metrics on user schemas)
+-- Create a public view for pg_statistic access (optional, for bloat analysis)
 create view public.pg_statistic as
 select
     n.nspname as schemaname,
@@ -116,11 +112,29 @@ join pg_namespace n on n.oid = c.relnamespace
 join pg_attribute a on a.attrelid = s.starelid and a.attnum = s.staattnum
 where a.attnum > 0 and not a.attisdropped;
 
-grant select on public.pg_statistic to pg_monitor;
+grant select on public.pg_statistic to postgres_ai_mon;
 alter user postgres_ai_mon set search_path = "$user", public, pg_catalog;
 commit;
 ```
 
+### Optional permissions to analyze risks of certain performance cliffs
+
+For RDS Postgres and Aurora:
+
+```sql
+create extension if not exists rds_tools;
+grant execute on function rds_tools.pg_ls_multixactdir() to postgres_ai_mon;
+```
+
+For self-managed Postgres:
+
+```sql
+grant execute on function pg_stat_file(text) to postgres_ai_mon;
+grant execute on function pg_stat_file(text, boolean) to postgres_ai_mon;
+grant execute on function pg_ls_dir(text) to postgres_ai_mon;
+grant execute on function pg_ls_dir(text, boolean, boolean) to postgres_ai_mon;
+```
+
 **One command setup:**
 
 ```bash
diff --git a/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json b/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json
index 16eec4a..08efa86 100644
--- a/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json
+++ b/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json
@@ -18,7 +18,7 @@
   "editable": true,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 1,
-  "id": 2,
+  "id": 1,
   "links": [],
   "panels": [
     {
@@ -4588,6 +4588,145 @@
       ],
       "type": "timeseries"
     },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "axisSoftMin": 0,
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": true,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "bytes"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Safe threshold"
+            },
+            "properties": [
+              {
+                "id": "custom.lineStyle",
+                "value": {
+                  "dash": [
+                    20,
+                    10
+                  ],
+                  "fill": "dash"
+                }
+              },
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "dark-red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 12,
+        "w": 24,
+        "x": 0,
+        "y": 197
+      },
+      "id": 46,
+      "options": {
+        "legend": {
+          "calcs": [
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.0.2",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "P7A0D6631BB10B34F"
+          },
+          "editorMode": "code",
+          "expr": "pgwatch_multixact_size_members_bytes{cluster=\"$cluster_name\", node_name=\"$node_name\"}",
+          "legendFormat": "members",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "P7A0D6631BB10B34F"
+          },
+          "editorMode": "code",
+          "expr": "10737418240",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "Safe threshold",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Multixact members folder size",
+      "type": "timeseries"
+    },
     {
       "fieldConfig": {
         "defaults": {},
@@ -4597,7 +4736,7 @@
         "h": 3,
         "w": 24,
         "x": 0,
-        "y": 197
+        "y": 209
       },
       "id": 40,
       "options": {
@@ -4683,5 +4822,5 @@
   "timezone": "utc",
   "title": "01. Single node performance overview (high-level)",
   "uid": "f90500a0-a12e-4081-a2f0-07ed96f27915",
-  "version": 3
+  "version": 4
 }
\ No newline at end of file
diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml
index ed87202..f0631a6 100644
--- a/config/pgwatch-prometheus/metrics.yml
+++ b/config/pgwatch-prometheus/metrics.yml
@@ -2200,6 +2200,109 @@ metrics:
     gauges:
       - '*'
     statement_timeout_seconds: 15
+
+  multixact_size:
+    sqls:
+      11: |
+        -- Reports the total file size (bytes) under pg_multixact/members and pg_multixact/offsets.
+        -- status_code: 0 = sizes collected, 1 = probe ran but saw no pg_multixact directories/files,
+        -- 2 = no probe could run (no rds_tools helper and no usable local file-access functions).
+        with env as (
+          select
+            exists (
+              select
+              from pg_proc p
+              join pg_namespace n on n.oid = p.pronamespace
+              where p.proname = 'pg_ls_multixactdir' and n.nspname = 'rds_tools'
+            ) as has_rds_fn,
+            -- EXECUTE privilege on the exact signatures the local probe calls is what gates access;
+            -- to_regprocedure() yields null (coalesced to false) instead of raising if one is missing.
+            coalesce(has_function_privilege(to_regprocedure('pg_catalog.pg_ls_dir(text)')::oid, 'execute'), false) as can_ls_dir,
+            coalesce(has_function_privilege(to_regprocedure('pg_catalog.pg_stat_file(text,boolean)')::oid, 'execute'), false) as can_stat_file
+        ),
+        can_local as (
+          select (can_ls_dir and can_stat_file) as ok from env
+        ),
+        -- The RDS listing goes through query_to_xml() so that rds_tools.pg_ls_multixactdir() is only
+        -- parsed when this CTE actually runs; it is guarded by the has_rds_fn filter below.
+        -- query_to_xml() does not trap errors: a failure inside the probe still fails the metric.
+        rds_probe_xml as (
+          select query_to_xml($q$
+            with files as (
+              select name, size
+              from rds_tools.pg_ls_multixactdir()
+            ),
+            members as (
+              select sum(size)::bigint as sz from files where name like 'pg_multixact/members%'
+            ),
+            offsets as (
+              select sum(size)::bigint as sz from files where name like 'pg_multixact/offsets%'
+            ),
+            has_rows as (
+              select exists(select 1 from files where name like 'pg_multixact/%') as any_rows
+            )
+            select
+              case when (select any_rows from has_rows) then coalesce((select sz from members), 0) end as members_bytes,
+              case when (select any_rows from has_rows) then coalesce((select sz from offsets), 0) end as offsets_bytes,
+              case when (select any_rows from has_rows) then 0 else 1 end as status_code
+          $q$, true, true, '') as x
+          where (select has_rds_fn from env)
+        ),
+        -- Same technique for stock Postgres: the probe text is only executed when there is no RDS
+        -- helper and can_local allows it. Errors raised by pg_stat_file()/pg_ls_dir() are not caught
+        -- here, which is why can_local checks EXECUTE privilege up front.
+        local_probe_xml as (
+          select query_to_xml($q$
+            with dirs as (
+              select
+                (pg_stat_file('pg_multixact/members', true)).isdir as has_members,
+                (pg_stat_file('pg_multixact/offsets', true)).isdir as has_offsets
+            ),
+            flags as (
+              select ((select has_members from dirs) or (select has_offsets from dirs)) as has_any
+            ),
+            members as (
+              select sum((pg_stat_file(format('pg_multixact/members/%s', d), true)).size)::bigint as sz
+              from pg_ls_dir('pg_multixact/members') as d(d)
+              where (select has_members from dirs)
+            ),
+            offsets as (
+              select sum((pg_stat_file(format('pg_multixact/offsets/%s', d), true)).size)::bigint as sz
+              from pg_ls_dir('pg_multixact/offsets') as d(d)
+              where (select has_offsets from dirs)
+            )
+            select
+              case when (select has_any from flags) then coalesce((select sz from members), 0) end as members_bytes,
+              case when (select has_any from flags) then coalesce((select sz from offsets), 0) end as offsets_bytes,
+              case when (select has_any from flags) then 0 else 1 end as status_code
+          $q$, true, true, '') as x
+          where not (select has_rds_fn from env) and (select ok from can_local)
+        ),
+        picked as (
+          select * from rds_probe_xml
+          union all
+          select * from local_probe_xml
+          limit 1
+        ),
+        parsed as (
+          select
+            (xpath('//members_bytes/text()', x))[1]::text::bigint as members_bytes,
+            (xpath('//offsets_bytes/text()', x))[1]::text::bigint as offsets_bytes,
+            (xpath('//status_code/text()', x))[1]::text::int as status_code
+          from picked
+        )
+        select * from parsed
+        union all
+        select
+          null::bigint as members_bytes,
+          null::bigint as offsets_bytes,
+          2::int as status_code
+        where not exists (select 1 from parsed);
+    gauges:
+      - members_bytes
+      - offsets_bytes
+      - status_code
+    statement_timeout_seconds: 15
 
 presets:
   full:
@@ -2244,6 +2347,7 @@ presets:
       stats_reset: 3600
       archive_lag: 15
       pg_vacuum_progress: 30
+      multixact_size: 300
   pg_index_pilot:
     metrics:
       pg_index_pilot: 30
diff --git a/config/target-db/init.sql b/config/target-db/init.sql
index 0ff6558..7f4fc90 100644
--- a/config/target-db/init.sql
+++ b/config/target-db/init.sql
@@ -1,49 +1,55 @@
 -- Initialize target database for monitoring
 -- Enable pg_stat_statements extension for query monitoring
-CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
+create extension if not exists pg_stat_statements;
 
 -- Create a sample table for demonstration
-CREATE TABLE IF NOT EXISTS sample_data (
-    id SERIAL PRIMARY KEY,
-    name VARCHAR(100),
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+create table if not exists sample_data (
+    id serial primary key,
+    name varchar(100),
+    created_at timestamp default current_timestamp
 );
 
 -- Insert some sample data
-INSERT INTO sample_data (name) VALUES
+insert into sample_data (name) values
     ('Sample Record 1'),
     ('Sample Record 2'),
     ('Sample Record 3');
 
 -- Create a user for PGWatch monitoring
-CREATE USER monitor WITH PASSWORD 'monitor_pass';
-GRANT CONNECT ON DATABASE target_database TO monitor;
-GRANT USAGE ON SCHEMA public TO monitor;
+create user monitor with password 'monitor_pass';
+grant connect on database target_database to monitor;
+grant usage on schema public to monitor;
 
 -- Create a public view for pg_statistic access
-CREATE OR REPLACE VIEW public.pg_statistic AS
-SELECT
+create or replace view public.pg_statistic as
+select
     n.nspname as schemaname,
     c.relname as tablename,
     a.attname,
     s.stanullfrac as null_frac,
     s.stawidth as avg_width,
     false as inherited
-FROM pg_statistic s
-JOIN pg_class c ON c.oid = s.starelid
-JOIN pg_namespace n ON n.oid = c.relnamespace
-JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum
-WHERE a.attnum > 0 AND NOT a.attisdropped;
+from pg_statistic s
+join pg_class c on c.oid = s.starelid
+join pg_namespace n on n.oid = c.relnamespace
+join pg_attribute a on a.attrelid = s.starelid and a.attnum = s.staattnum
+where a.attnum > 0 and not a.attisdropped;
 
 -- Grant specific access instead of all tables
-GRANT SELECT ON public.pg_statistic TO pg_monitor;
+grant select on public.pg_statistic to pg_monitor;
 
 -- Grant access to monitoring views
-GRANT SELECT ON pg_stat_statements TO monitor;
-GRANT SELECT ON pg_stat_database TO monitor;
-GRANT SELECT ON pg_stat_user_tables TO monitor;
+grant select on pg_stat_statements to monitor;
+grant select on pg_stat_database to monitor;
+grant select on pg_stat_user_tables to monitor;
 -- Grant pg_monitor role to monitor user for enhanced monitoring capabilities
-GRANT pg_monitor TO monitor;
-
+grant pg_monitor to monitor;
+
+-- Allow the multixact_size metric to stat and list files under pg_multixact
+grant execute on function pg_stat_file(text) to monitor;
+grant execute on function pg_stat_file(text, boolean) to monitor;
+grant execute on function pg_ls_dir(text) to monitor;
+grant execute on function pg_ls_dir(text, boolean, boolean) to monitor;
+
 -- Set search path for the monitor user
-ALTER USER monitor SET search_path = "$user", public, pg_catalog;
+alter user monitor set search_path = "$user", public, pg_catalog;