monitor postgresql replication lag through prometheus data
- Use a prometheus plugin for grafana (from prometheus)
- Create a wrapper to handle the query because, when the probe is configured the usual puppet way, the query parsing is not done correctly
- Raise the warning at 100GiB and the critical alert at 200GiB; this matches what we observed over the last month, but it will be adapted later if needed
Related to T3452
Test Plan
- pergamon:
diff origin/production/pergamon.softwareheritage.org current/pergamon.softwareheritage.org
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh] =>
parameters =>
"content": "\nobject CheckCommand \"check_belvedere_replication_lag.sh\" {\n...
"order": 15,
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_prometheus_metric.sh] =>
parameters =>
"content": "\nobject CheckCommand \"check_prometheus_metric.sh\" {\n import...
"order": 15,
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
parameters =>
"content": "\nobject Service \"Postgresql replication lag (belvedere -> some...
"order": 60,
"target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh] =>
parameters =>
"content": "\nobject CheckCommand \"check_belvedere_replication_lag.sh\" {\n...
"order": 15,
"tag": "_etc_icinga2_conf.d_swh-plugins.conf",
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_prometheus_metric.sh] =>
parameters =>
"content": "\nobject CheckCommand \"check_prometheus_metric.sh\" {\n import...
"order": 15,
"tag": "_etc_icinga2_conf.d_swh-plugins.conf",
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
parameters =>
"content": "\nobject Service \"Postgresql replication lag (belvedere -> some...
"order": 60,
"tag": "_etc_icinga2_conf.d_static-checks.conf",
"target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_belvedere_replication_lag-sh] =>
parameters =>
"command": "visudo -c || ",
"path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"refreshonly": true
*******************************************
+ Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_prometheus_metric-sh] =>
parameters =>
"command": "visudo -c || ",
"path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"refreshonly": true
*******************************************
+ File[/etc/sudoers.d/10_icinga-check_belvedere_replication_lag-sh] =>
parameters =>
"ensure": "absent",
"group": "root",
"mode": "0440",
"owner": "root"
*******************************************
+ File[/etc/sudoers.d/10_icinga-check_prometheus_metric-sh] =>
parameters =>
"ensure": "absent",
"group": "root",
"mode": "0440",
"owner": "root"
*******************************************
+ File[/usr/lib/nagios/plugins/swh/check_belvedere_replication_lag.sh] =>
parameters =>
"content": "#!/bin/bash\n\n#\n# File managed by puppet. All modifications wi...
"ensure": "present",
"group": "root",
"mode": "0755",
"owner": "root"
*******************************************
+ File[/usr/lib/nagios/plugins/swh/check_prometheus_metric.sh] =>
parameters =>
"content": "#!/bin/bash\n\n#\n# File managed by puppet. All modifications wi...
"ensure": "present",
"group": "root",
"mode": "0755",
"owner": "root"
*******************************************
+ Icinga2::Object::Checkcommand[check_belvedere_replication_lag.sh] =>
parameters =>
"arguments": {
"-H": "$check_prometheus_metric_url$",
"-w": "$check_prometheus_metric_warning$",
"-c": "$check_prometheus_metric_critical$",
"-n": "$check_prometheus_metric_name$"
},
"checkcommand_name": "check_belvedere_replication_lag.sh",
"command": [
"/usr/lib/nagios/plugins/swh/check_belvedere_replication_lag.sh"
],
"ensure": "present",
"import": [
"plugin-check-command"
],
"order": 15,
"target": "/etc/icinga2/conf.d/swh-plugins.conf",
"template": false,
"vars": {
"check_prometheus_metric_url": "pergamon.internal.softwareheritage.org:909...
}
*******************************************
+ Icinga2::Object::Checkcommand[check_prometheus_metric.sh] =>
parameters =>
"arguments": {
"-H": "$check_prometheus_metric_url$",
"-q": "$check_prometheus_metric_query$",
"-w": "$check_prometheus_metric_warning$",
"-c": "$check_prometheus_metric_critical$",
"-n": "$check_prometheus_metric_name$"
},
"checkcommand_name": "check_prometheus_metric.sh",
"command": [
"/usr/lib/nagios/plugins/swh/check_prometheus_metric.sh"
],
"ensure": "present",
"import": [
"plugin-check-command"
],
"order": 15,
"target": "/etc/icinga2/conf.d/swh-plugins.conf",
"template": false,
"vars": {
"check_prometheus_metric_url": "pergamon.internal.softwareheritage.org:909...
}
*******************************************
+ Icinga2::Object::Service[Postgresql replication lag (belvedere -> somerset)] =>
parameters =>
"apply": false,
"assign": [
],
"check_command": "check_belvedere_replication_lag.sh",
"ensure": "present",
"host_name": "belvedere.internal.softwareheritage.org",
"ignore": [
],
"import": [
],
"order": 60,
"prefix": false,
"service_name": "Postgresql replication lag (belvedere -> somerset)",
"target": "/etc/icinga2/conf.d/static-checks.conf",
"template": false,
"vars": {
"check_prometheus_metric_name": "pg replication_lag belvedere somerset",
"check_prometheus_metric_warning": "107374182400",
"check_prometheus_metric_critical": "214748364800"
}
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh] =>
parameters =>
"apply": false,
"assign": [
],
"attrs": {
"command": [
"/usr/lib/nagios/plugins/swh/check_belvedere_replication_lag.sh"
],
"arguments": {
"-H": "$check_prometheus_metric_url$",
"-w": "$check_prometheus_metric_warning$",
"-c": "$check_prometheus_metric_critical$",
"-n": "$check_prometheus_metric_name$"
},
"vars": {
"check_prometheus_metric_url": "pergamon.internal.softwareheritage.org:9...
}
},
"attrs_list": [
"command",
"env",
"timeout",
"arguments",
"vars",
"Acknowledgement",
"ApiBindHost",
"ApiBindPort",
"ApiEnvironment",
"ApplicationType",
"AttachDebugger",
"BuildCompilerName",
"BuildCompilerVersion",
"BuildHostName",
"Concurrency",
"Critical",
"Custom",
"Deprecated",
"Down",
"DowntimeEnd",
"DowntimeRemoved",
"DowntimeStart",
"Environment",
"FlappingEnd",
"FlappingStart",
"HostDown",
"HostUp",
"IncludeConfDir",
"Internal",
"Json",
"LocalStateDir",
"LogCritical",
"LogDebug",
"LogInformation",
"LogNotice",
"LogWarning",
"Math",
"MaxConcurrentChecks",
"ModAttrPath",
"NodeName",
"OK",
"ObjectsPath",
"PidPath",
"PkgDataDir",
"PlatformArchitecture",
"PlatformKernel",
"PlatformKernelVersion",
"PlatformName",
"PlatformVersion",
"PrefixDir",
"Problem",
"Recovery",
"RunAsGroup",
"RunAsUser",
"RunDir",
"ServiceCritical",
"ServiceOK",
"ServiceUnknown",
"ServiceWarning",
"StatePath",
"SysconfDir",
"System",
"Types",
"Unknown",
"Up",
"UseVfork",
"VarsPath",
"Warning",
"ZonesDir",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name"
],
"ensure": "present",
"ignore": [
],
"import": [
"plugin-check-command"
],
"object_name": "check_belvedere_replication_lag.sh",
"object_type": "CheckCommand",
"order": 15,
"prefix": false,
"target": "/etc/icinga2/conf.d/swh-plugins.conf",
"template": false
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_prometheus_metric.sh] =>
parameters =>
"apply": false,
"assign": [
],
"attrs": {
"command": [
"/usr/lib/nagios/plugins/swh/check_prometheus_metric.sh"
],
"arguments": {
"-H": "$check_prometheus_metric_url$",
"-q": "$check_prometheus_metric_query$",
"-w": "$check_prometheus_metric_warning$",
"-c": "$check_prometheus_metric_critical$",
"-n": "$check_prometheus_metric_name$"
},
"vars": {
"check_prometheus_metric_url": "pergamon.internal.softwareheritage.org:9...
}
},
"attrs_list": [
"command",
"env",
"timeout",
"arguments",
"vars",
"Acknowledgement",
"ApiBindHost",
"ApiBindPort",
"ApiEnvironment",
"ApplicationType",
"AttachDebugger",
"BuildCompilerName",
"BuildCompilerVersion",
"BuildHostName",
"Concurrency",
"Critical",
"Custom",
"Deprecated",
"Down",
"DowntimeEnd",
"DowntimeRemoved",
"DowntimeStart",
"Environment",
"FlappingEnd",
"FlappingStart",
"HostDown",
"HostUp",
"IncludeConfDir",
"Internal",
"Json",
"LocalStateDir",
"LogCritical",
"LogDebug",
"LogInformation",
"LogNotice",
"LogWarning",
"Math",
"MaxConcurrentChecks",
"ModAttrPath",
"NodeName",
"OK",
"ObjectsPath",
"PidPath",
"PkgDataDir",
"PlatformArchitecture",
"PlatformKernel",
"PlatformKernelVersion",
"PlatformName",
"PlatformVersion",
"PrefixDir",
"Problem",
"Recovery",
"RunAsGroup",
"RunAsUser",
"RunDir",
"ServiceCritical",
"ServiceOK",
"ServiceUnknown",
"ServiceWarning",
"StatePath",
"SysconfDir",
"System",
"Types",
"Unknown",
"Up",
"UseVfork",
"VarsPath",
"Warning",
"ZonesDir",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name"
],
"ensure": "present",
"ignore": [
],
"import": [
"plugin-check-command"
],
"object_name": "check_prometheus_metric.sh",
"object_type": "CheckCommand",
"order": 15,
"prefix": false,
"target": "/etc/icinga2/conf.d/swh-plugins.conf",
"template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
parameters =>
"apply": false,
"assign": [
],
"attrs": {
"host_name": "belvedere.internal.softwareheritage.org",
"check_command": "check_belvedere_replication_lag.sh",
"vars": {
"check_prometheus_metric_name": "pg replication_lag belvedere somerset",...
"check_prometheus_metric_warning": "107374182400",
"check_prometheus_metric_critical": "214748364800"
}
},
"attrs_list": [
"display_name",
"host_name",
"check_command",
"check_timeout",
"check_interval",
"check_period",
"retry_interval",
"max_check_attempts",
"groups",
"enable_notifications",
"enable_active_checks",
"enable_passive_checks",
"enable_event_handler",
"enable_flapping",
"enable_perfdata",
"event_command",
"flapping_threshold_low",
"flapping_threshold_high",
"volatile",
"zone",
"command_endpoint",
"notes",
"notes_url",
"action_url",
"icon_image",
"icon_image_alt",
"vars",
"Acknowledgement",
"ApiBindHost",
"ApiBindPort",
"ApiEnvironment",
"ApplicationType",
"AttachDebugger",
"BuildCompilerName",
"BuildCompilerVersion",
"BuildHostName",
"Concurrency",
"Critical",
"Custom",
"Deprecated",
"Down",
"DowntimeEnd",
"DowntimeRemoved",
"DowntimeStart",
"Environment",
"FlappingEnd",
"FlappingStart",
"HostDown",
"HostUp",
"IncludeConfDir",
"Internal",
"Json",
"LocalStateDir",
"LogCritical",
"LogDebug",
"LogInformation",
"LogNotice",
"LogWarning",
"Math",
"MaxConcurrentChecks",
"ModAttrPath",
"NodeName",
"OK",
"ObjectsPath",
"PidPath",
"PkgDataDir",
"PlatformArchitecture",
"PlatformKernel",
"PlatformKernelVersion",
"PlatformName",
"PlatformVersion",
"PrefixDir",
"Problem",
"Recovery",
"RunAsGroup",
"RunAsUser",
"RunDir",
"ServiceCritical",
"ServiceOK",
"ServiceUnknown",
"ServiceWarning",
"StatePath",
"SysconfDir",
"System",
"Types",
"Unknown",
"Up",
"UseVfork",
"VarsPath",
"Warning",
"ZonesDir",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name"
],
"ensure": "present",
"ignore": [
],
"import": [
],
"object_name": "Postgresql replication lag (belvedere -> somerset)",
"object_type": "Service",
"order": 60,
"prefix": false,
"target": "/etc/icinga2/conf.d/static-checks.conf",
"template": false
*******************************************
+ Sudo::Conf[icinga-check_belvedere_replication_lag.sh] =>
parameters =>
"ensure": "absent",
"priority": 10,
"sudo_syntax_path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin...
*******************************************
+ Sudo::Conf[icinga-check_prometheus_metric.sh] =>
parameters =>
"ensure": "absent",
"priority": 10,
"sudo_syntax_path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin...
*******************************************
*** End octocatalog-diff on pergamon.softwareheritage.org
Migrated from D6050 (view on Phabricator)