Runtime Information
Uptime |
2021-01-13 09:26:42.916164456 +0000 UTC |
branch |
stable |
buildDate |
20160202-22:31:06 |
buildUser |
@9f8f0f8d724a |
goVersion |
1.5.3 |
revision |
287d9b2 |
version |
0.16.2 |
Configuration
### prometheus.yml ###
# Defaults that individual scrape jobs may override.
global:
  scrape_interval: 15s  # By default, scrape targets every 15 seconds.
  evaluation_interval: 15s  # By default, evaluate rules every 15 seconds.
  # scrape_timeout is set to the global default (10s).

  # Attach these extra labels to all time-series collected by this Prometheus instance.
  external_labels:
    monitor: 'godocker-monitor'
# Rule files loaded by this Prometheus instance.
rule_files:
- '/etc/prometheus/alert.rules'
- '/etc/prometheus/node_alert.rules'
# Scrape configurations, one job per monitored service:
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
# GO-Docker
- job_name: 'godocker'
  # Override the global default and scrape targets from this job every 20 seconds.
  scrape_interval: 20s
  # NOTE(review): scrape_timeout (40s) is longer than scrape_interval (20s).
  # Later Prometheus releases reject a timeout greater than the interval;
  # confirm which of the two values is intended before upgrading.
  scrape_timeout: 40s
  # Disabled targets, kept for reference:
  # ['192.168.1.152:8080', '192.168.1.153:8080', '192.168.1.37:8080', '192.168.1.154:8080', '192.168.1.158:8080']
  target_groups:
  # - targets: ['192.168.1.37:8080', '192.168.1.154:8080', '192.168.1.158:8080']
  #   labels:
  #     group: 'godocker'
  - targets: ['192.168.1.3:6543']
    labels:
      group: 'godockerweb'
# Scrapes one target every 30s; its series carry the label group=webproxy.
- job_name: 'webproxy'
  scrape_interval: 30s
  target_groups:
  - targets: ['192.168.1.118:9117']
    labels:
      group: 'webproxy'
# Scrapes one target every 300s; its series carry the label group=slurm.
- job_name: 'slurm'
  scrape_interval: 300s
  target_groups:
  - targets: ['192.168.1.216:3000']
    labels:
      group: 'slurm'
# Scrapes port 9100 on os-virt and compute1..compute11 every 120s;
# all series carry the label group=openstack.
- job_name: 'openstack'
  scrape_interval: 120s
  target_groups:
  - targets:
    - 'os-virt:9100'
    - 'compute1:9100'
    - 'compute2:9100'
    - 'compute3:9100'
    - 'compute4:9100'
    - 'compute5:9100'
    - 'compute6:9100'
    - 'compute7:9100'
    - 'compute8:9100'
    - 'compute9:9100'
    - 'compute10:9100'
    - 'compute11:9100'
    labels:
      group: 'openstack'
# Scrapes os-virt:15692 every 120s; shares the group=openstack label
# with the 'openstack' job.
- job_name: 'openstack-rabbitmq'
  scrape_interval: 120s
  target_groups:
  - targets: ['os-virt:15692']
    labels:
      group: 'openstack'
# Disabled job, kept for reference:
# - job_name: 'openstack-vm'
#   scrape_interval: 120s
#   target_groups:
#   - targets: ['compute1:9177', 'compute2:9177', 'compute3:9177', 'compute4:9177', 'compute5:9177', 'compute6:9177', 'compute7:9177', 'compute8:9177', 'compute9:9177', 'compute10:9177', 'compute11:9177']
#     labels:
#       group: 'openstack-vm'
Rules
# Fires for any series whose `up` value has stayed at 0 for 5 minutes.
ALERT InstanceDown
  IF          up == 0
  FOR         5m
  WITH        {service="prometheus_exporter"}
  SUMMARY     "Instance {{ $labels.instance }} down"
  DESCRIPTION "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
  RUNBOOK     ""
# Fires when openstack_endpoints_errors has stayed above 0 for 15 minutes.
# The description text states the same 15-minute window as the FOR clause.
ALERT ApiDown
  IF          openstack_endpoints_errors > 0
  FOR         15m
  WITH        {component="api", service="openstack"}
  SUMMARY     "Openstack API {{ $labels.endpoint }} endpoint is down"
  DESCRIPTION "Openstack endpoint {{ $labels.endpoint }} has been down for more than 15 minutes"
  RUNBOOK     ""
# Fires when rabbitmq_queue_messages has stayed above 100 for 30 minutes.
# No extra labels are attached (there is no WITH clause).
ALERT RabbitQueueGoingCrazy
  IF          rabbitmq_queue_messages > 100
  FOR         30m
  SUMMARY     "Openstack RabbitMQ queue size is a bit large: {{ $value }}"
  DESCRIPTION "Openstack RabbitMQ queue size is high: {{ $value }} for more than 30 minutes, should check processes"
  RUNBOOK     ""
# Fires when neutron_net_usage has stayed above 400 for one day.
ALERT NeutronIpAvailable
  IF          neutron_net_usage > 400
  FOR         1d
  WITH        {component="neutron", service="openstack"}
  SUMMARY     "Openstack neutron available IP is low: {{ $value }}"
  DESCRIPTION "Openstack neutron available IP addresses are getting low, used IP: {{ $value }}"
  RUNBOOK     ""
# Fires when neutron_net_usage with status="DOWN" has stayed above 10 for one day.
ALERT NeutronIpInactive
  IF          neutron_net_usage{status="DOWN"} > 10
  FOR         1d
  WITH        {component="neutron", service="openstack"}
  SUMMARY     "Openstack neutron IP is inactive status: {{ $value }}"
  DESCRIPTION "Openstack neutron IP addresses in inactive status is getting high, inactive IP: {{ $value }}"
  RUNBOOK     ""
# Fires when the used fraction (capacity minus free, over capacity) of a
# cinder pool has stayed above 0.8 for 5 minutes.
ALERT VolumePoolUsage
  IF          (cinder_capacity - cinder_free) / cinder_capacity > 0.8
  FOR         5m
  SUMMARY     "Cinder pools Size Alert (Over 0.80 Usage)"
  DESCRIPTION "pool show the openstack storage pool (IE : NFS or EqualCloud)"
  RUNBOOK     ""
# Fires when the per-instance average of (1 - idle CPU irate over 5m)
# has stayed above 0.8 for 5 minutes.
ALERT CpuUsage
  IF          avg(1 - irate(node_cpu{mode="idle"}[5m])) BY (instance) > 0.8
  FOR         5m
  SUMMARY     "CPU usage"
  DESCRIPTION "CPU Usage rate is beyond 0.8"
  RUNBOOK     ""
# Fires when 1 - MemAvailable/MemTotal has stayed above 0.8 for 5 minutes.
ALERT MemoryUsage
  IF          1 - (node_memory_MemAvailable / node_memory_MemTotal) > 0.8
  FOR         5m
  SUMMARY     "Mem usage"
  DESCRIPTION "Mem Usage rate is beyond 0.8"
  RUNBOOK     ""
# Fires when the used fraction of the filesystem mounted at "/" has
# stayed above 0.85 for 5 minutes.
ALERT DiskUsage
  IF          (node_filesystem_size{mountpoint="/"} - node_filesystem_free) / node_filesystem_size > 0.85
  FOR         5m
  SUMMARY     "Disk usage"
  DESCRIPTION "Disk usage rate is beyond 0.85"
  RUNBOOK     ""
Targets
Startup Flags
alertmanager.http-deadline |
10s |
alertmanager.notification-queue-capacity |
100 |
alertmanager.url |
http://192.168.1.63:9093 |
config.file |
/etc/prometheus/prometheus.yml |
log.format |
|
log.level |
info |
query.max-concurrency |
20 |
query.staleness-delta |
5m0s |
query.timeout |
2m0s |
storage.local.checkpoint-dirty-series-limit |
5000 |
storage.local.checkpoint-interval |
5m0s |
storage.local.chunk-encoding-version |
1 |
storage.local.dirty |
false |
storage.local.index-cache-size.fingerprint-to-metric |
10485760 |
storage.local.index-cache-size.fingerprint-to-timerange |
5242880 |
storage.local.index-cache-size.label-name-to-label-values |
10485760 |
storage.local.index-cache-size.label-pair-to-fingerprints |
20971520 |
storage.local.max-chunks-to-persist |
1048576 |
storage.local.memory-chunks |
1048576 |
storage.local.path |
data |
storage.local.pedantic-checks |
false |
storage.local.retention |
360h0m0s |
storage.local.series-file-shrink-ratio |
0.1 |
storage.local.series-sync-strategy |
adaptive |
storage.remote.graphite-address |
|
storage.remote.graphite-prefix |
|
storage.remote.graphite-transport |
tcp |
storage.remote.influxdb-url |
|
storage.remote.influxdb.database |
prometheus |
storage.remote.influxdb.retention-policy |
default |
storage.remote.influxdb.username |
|
storage.remote.opentsdb-url |
|
storage.remote.timeout |
30s |
version |
false |
web.console.libraries |
console_libraries |
web.console.templates |
consoles |
web.enable-remote-shutdown |
false |
web.external-url |
http://93e1d99940e6:9090/ |
web.listen-address |
:9090 |
web.telemetry-path |
/metrics |
web.use-local-assets |
false |
web.user-assets |
|