Runtime Information

Start time 2021-01-13 09:26:42.916164456 +0000 UTC

Build Information

branch stable
buildDate 20160202-22:31:06
buildUser @9f8f0f8d724a
goVersion 1.5.3
revision 287d9b2
version 0.16.2

Configuration

### prometheus.yml ###

global:
  scrape_interval: 15s # By default, scrape targets every 15 seconds.
  evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
  # scrape_timeout is set to the global default (10s).

  # Attach these extra labels to all time-series collected by this Prometheus instance.
  external_labels:
    monitor: 'godocker-monitor'

rule_files:
  - '/etc/prometheus/alert.rules'
  - '/etc/prometheus/node_alert.rules'

# A scrape configuration containing exactly one endpoint to scrape:
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.

  # GO-Docker
  - job_name: 'godocker'
    # Override the global default and scrape targets from this job every 20 seconds.
    scrape_interval: 20s
    # scrape_timeout must not exceed scrape_interval.
    scrape_timeout: 20s

    # ['192.168.1.152:8080', '192.168.1.153:8080', '192.168.1.37:8080', '192.168.1.154:8080', '192.168.1.158:8080']
    target_groups:
#      - targets: ['192.168.1.37:8080', '192.168.1.154:8080', '192.168.1.158:8080']
#        labels:
#          group: 'godocker'
      - targets: ['192.168.1.3:6543']
        labels:
          group: 'godockerweb'

  - job_name: 'webproxy'
    scrape_interval: 30s
    target_groups:
      - targets: ['192.168.1.118:9117']
        labels:
          group: 'webproxy'

  - job_name: 'slurm'
    scrape_interval: 300s
    target_groups:
      - targets: ['192.168.1.216:3000']
        labels:
          group: 'slurm'

  - job_name: 'openstack'
    scrape_interval: 120s
    target_groups:
      - targets: ['os-virt:9100', 'compute1:9100', 'compute2:9100', 'compute3:9100', 'compute4:9100', 'compute5:9100', 'compute6:9100', 'compute7:9100', 'compute8:9100' , 'compute9:9100' , 'compute10:9100' , 'compute11:9100']
        labels:
          group: 'openstack'

  - job_name: 'openstack-rabbitmq'
    scrape_interval: 120s
    target_groups:
      - targets: ['os-virt:15692']
        labels:
          group: 'openstack'

  #- job_name: 'openstack-vm'
  #  scrape_interval: 120s
  #  target_groups:
  #    - targets: ['compute1:9177', 'compute2:9177', 'compute3:9177', 'compute4:9177', 'compute5:9177', 'compute6:9177', 'compute7:9177', 'compute8:9177', 'compute9:9177', 'compute10:9177', 'compute11:9177']
  #      labels:
  #        group: 'openstack-vm'
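
Note that `target_groups` is the static-target keyword of this 0.16-era release; Prometheus 1.0 renamed it to `static_configs`. As a rough sketch, the godocker job above would read as follows under a newer server (assuming nothing else changes):

  - job_name: 'godocker'
    scrape_interval: 20s
    # static_configs replaces target_groups in Prometheus >= 1.0
    static_configs:
      - targets: ['192.168.1.3:6543']
        labels:
          group: 'godockerweb'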

Rules

ALERT InstanceDown
  IF up == 0
  FOR 5m
  WITH {service="prometheus_exporter"}
  SUMMARY "Instance {{ $labels.instance }} down"
  DESCRIPTION "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
  RUNBOOK ""
ALERT ApiDown
  IF openstack_endpoints_errors > 0
  FOR 15m
  WITH {component="api", service="openstack"}
  SUMMARY "Openstack API {{ $labels.endpoint }} endpoint is down"
  DESCRIPTION "Openstack endpoint {{ $labels.endpoint }} has been down for more than 15 minutes."
  RUNBOOK ""
ALERT RabbitQueueGoingCrazy
  IF rabbitmq_queue_messages > 100
  FOR 30m
  SUMMARY "Openstack RabbitMQ queue size is large: {{ $value }}"
  DESCRIPTION "Openstack RabbitMQ queue size has been above 100 ({{ $value }}) for more than 30 minutes; check the consuming processes."
  RUNBOOK ""
ALERT NeutronIpAvailable
  IF neutron_net_usage > 400
  FOR 1d
  WITH {component="neutron", service="openstack"}
  SUMMARY "Openstack Neutron available IPs are running low ({{ $value }} in use)"
  DESCRIPTION "Openstack Neutron available IP addresses are getting low; IPs in use: {{ $value }}."
  RUNBOOK ""
ALERT NeutronIpInactive
  IF neutron_net_usage{status="DOWN"} > 10
  FOR 1d
  WITH {component="neutron", service="openstack"}
  SUMMARY "Openstack Neutron inactive IP count is high: {{ $value }}"
  DESCRIPTION "The number of Openstack Neutron IP addresses in inactive (DOWN) status is getting high: {{ $value }}."
  RUNBOOK ""
ALERT VolumePoolUsage
  IF (cinder_capacity - cinder_free) / cinder_capacity > 0.8
  FOR 5m
  SUMMARY "Cinder pool usage alert (over 80% used)"
  DESCRIPTION "Cinder pool usage is above 80%; the pool label identifies the Openstack storage pool (e.g. NFS or EqualCloud)."
  RUNBOOK ""
ALERT CpuUsage
  IF avg(1 - irate(node_cpu{mode="idle"}[5m])) BY (instance) > 0.8
  FOR 5m
  SUMMARY "High CPU usage"
  DESCRIPTION "CPU usage has been above 80% for more than 5 minutes."
  RUNBOOK ""
ALERT MemoryUsage
  IF 1 - (node_memory_MemAvailable / node_memory_MemTotal) > 0.8
  FOR 5m
  SUMMARY "High memory usage"
  DESCRIPTION "Memory usage has been above 80% for more than 5 minutes."
  RUNBOOK ""
ALERT DiskUsage
  IF (node_filesystem_size{mountpoint="/"} - node_filesystem_free) / node_filesystem_size > 0.85
  FOR 5m
  SUMMARY "High disk usage"
  DESCRIPTION "Root filesystem usage has been above 85% for more than 5 minutes."
  RUNBOOK ""
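
These rules use the pre-2.0 alert DSL that shipped with this release. For reference, Prometheus 2.0 replaced it with YAML rule files; the InstanceDown rule above would translate roughly as follows (the group name is an arbitrary choice):

groups:
  - name: instance_alerts        # arbitrary group name
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          service: prometheus_exporter
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."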

Targets

godocker
Endpoint                            State  Base Labels          Last Scrape          Error
http://192.168.1.3:6543/metrics     up     group="godockerweb"  47.401199ms ago

openstack
Endpoint                            State  Base Labels          Last Scrape          Error
http://os-virt:9100/metrics         up     group="openstack"    1m44.263969978s ago
http://compute1:9100/metrics        up     group="openstack"    1m33.358160575s ago
http://compute2:9100/metrics        up     group="openstack"    1m40.504249061s ago
http://compute3:9100/metrics        up     group="openstack"    1m16.030947963s ago
http://compute4:9100/metrics        down   group="openstack"    50.315190702s ago    Get http://compute4:9100/metrics: dial tcp 192.168.3.14:9100: i/o timeout
http://compute5:9100/metrics        up     group="openstack"    14.503949882s ago
http://compute6:9100/metrics        up     group="openstack"    1m26.429098576s ago
http://compute7:9100/metrics        up     group="openstack"    1m6.462036852s ago
http://compute8:9100/metrics        up     group="openstack"    1m13.973863092s ago
http://compute9:9100/metrics        up     group="openstack"    55.874184412s ago
http://compute10:9100/metrics       up     group="openstack"    1m18.176961433s ago
http://compute11:9100/metrics       down   group="openstack"    59.615449154s ago    Get http://compute11:9100/metrics: dial tcp 192.168.3.21:9100: i/o timeout

openstack-rabbitmq
Endpoint                            State  Base Labels          Last Scrape          Error
http://os-virt:15692/metrics        up     group="openstack"    1m59.28026638s ago

slurm
Endpoint                            State  Base Labels          Last Scrape          Error
http://192.168.1.216:3000/metrics   up     group="slurm"        1m44.750033731s ago

webproxy
Endpoint                            State  Base Labels          Last Scrape          Error
http://192.168.1.118:9117/metrics   up     group="webproxy"     1.53666373s ago

Startup Flags

alertmanager.http-deadline 10s
alertmanager.notification-queue-capacity 100
alertmanager.url http://192.168.1.63:9093
config.file /etc/prometheus/prometheus.yml
log.format
log.level info
query.max-concurrency 20
query.staleness-delta 5m0s
query.timeout 2m0s
storage.local.checkpoint-dirty-series-limit 5000
storage.local.checkpoint-interval 5m0s
storage.local.chunk-encoding-version 1
storage.local.dirty false
storage.local.index-cache-size.fingerprint-to-metric 10485760
storage.local.index-cache-size.fingerprint-to-timerange 5242880
storage.local.index-cache-size.label-name-to-label-values 10485760
storage.local.index-cache-size.label-pair-to-fingerprints 20971520
storage.local.max-chunks-to-persist 1048576
storage.local.memory-chunks 1048576
storage.local.path data
storage.local.pedantic-checks false
storage.local.retention 360h0m0s
storage.local.series-file-shrink-ratio 0.1
storage.local.series-sync-strategy adaptive
storage.remote.graphite-address
storage.remote.graphite-prefix
storage.remote.graphite-transport tcp
storage.remote.influxdb-url
storage.remote.influxdb.database prometheus
storage.remote.influxdb.retention-policy default
storage.remote.influxdb.username
storage.remote.opentsdb-url
storage.remote.timeout 30s
version false
web.console.libraries console_libraries
web.console.templates consoles
web.enable-remote-shutdown false
web.external-url http://93e1d99940e6:9090/
web.listen-address :9090
web.telemetry-path /metrics
web.use-local-assets false
web.user-assets
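
These are pre-2.0 single-dash flags (the storage.local.* engine was removed entirely in Prometheus 2.0), and the container-style hostnames in buildUser and web.external-url suggest the server runs in Docker. Below is a minimal docker-compose sketch that would reproduce the key flags above; the image tag, file layout, and mount paths are assumptions rather than values taken from this page:

services:
  prometheus:
    image: prom/prometheus:v0.16.2   # tag assumed; any pre-2.0 image accepts these flags
    command:
      - '-config.file=/etc/prometheus/prometheus.yml'
      - '-alertmanager.url=http://192.168.1.63:9093'
      - '-storage.local.retention=360h'
      - '-storage.local.memory-chunks=1048576'
      - '-web.listen-address=:9090'
    volumes:
      # assumed host-side layout for the config and the two rule files it references
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alert.rules:/etc/prometheus/alert.rules
      - ./node_alert.rules:/etc/prometheus/node_alert.rules
    ports:
      - '9090:9090'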