Skip to main content

Visão Geral

Prometheus é um sistema de monitoramento de código aberto e toolkit de alertas.

Instalação

# Download a pinned release (version factored out for easy upgrades)
PROM_VERSION=2.47.0
wget "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz"
tar xzf "prometheus-${PROM_VERSION}.linux-amd64.tar.gz"
cd "prometheus-${PROM_VERSION}.linux-amd64"

# Or via Docker: publish the web UI on 9090 and bind-mount the host config
docker run -d \
  --name prometheus \
  -p 9090:9090 \
  -v /etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  prom/prometheus

# Via systemd
cat > /etc/systemd/system/prometheus.service <<EOF
[Unit]
Description=Prometheus
After=network.target

[Service]
Type=simple
User=prometheus
# Without an explicit --storage.tsdb.path, Prometheus writes its TSDB to
# ./data relative to the working directory — which under systemd is / .
# Pin it to a dedicated directory owned by the prometheus user.
ExecStart=/opt/prometheus/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus
Restart=always

[Install]
WantedBy=multi-user.target
EOF

Configuração

# /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often recording/alerting rules are evaluated
  external_labels:          # attached to every series/alert leaving this server
    cluster: 'prod'
    env: 'production'

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  # Self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # `bearer_token_file` is deprecated; the current form is `authorization`.
    authorization:
      credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token

  - job_name: 'kubernetes-nodes'
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      # Copy node labels through as Prometheus labels.
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Only scrape pods annotated prometheus.io/scrape: "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: "true"   # quoted so YAML keeps a string, not a boolean
PromQL

# Basic selectors (label matchers)
up{job="prometheus"}
up{env="production"}

# rate(): per-second average rate over the window
rate(http_requests_total[5m])
rate(node_network_receive_bytes_total[5m])

# irate(): instantaneous rate from the last two samples in the window
irate(http_requests_total[5m])

# Aggregations
sum(rate(http_requests_total[5m])) by (job)
avg(rate(cpu_usage_seconds_total[5m])) by (instance)

# Functions
# rate() - per-second rate
# increase() - total increase over the window
# predict_linear() - linear prediction
# histogram_quantile() - quantiles from histogram buckets
# label_replace() - rewrite labels

Exemplos

# CPU usage per node (100 minus the idle-mode share)
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# Available memory as a percentage of total
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100

# Requests per second, per service
sum(rate(http_requests_total[5m])) by (service)

# 95th percentile latency
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

# Alert condition: more than 5 HTTP 5xx errors PER SECOND (rate() is
# per-second, averaged over the 1m window — not "per minute")
sum(rate(http_requests_total{status=~"5.."}[1m])) > 5

Node Exporter

# Install (exposes metrics on host port 9100)
docker run -d \
  --name node-exporter \
  -p 9100:9100 \
  prom/node-exporter

# Key metrics
# node_cpu_seconds_total            - CPU time spent, per mode
# node_memory_MemTotal_bytes        - total RAM
# node_memory_MemAvailable_bytes    - RAM available to applications
# node_filesystem_size_bytes        - filesystem size
# node_network_receive_bytes_total  - bytes received per interface
# node_network_transmit_bytes_total - bytes sent per interface
# node_disk_read_bytes_total        - bytes read per device
# node_disk_written_bytes_total     - bytes written per device

Alertmanager

# /etc/alertmanager/alertmanager.yml
global:
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'alertmanager@example.com'
  # NOTE: avoid committing plaintext secrets; prefer smtp_auth_password_file
  # or injecting the value from a secret store.
  smtp_auth_password: 'password'

route:
  group_by: ['alertname', 'cluster']
  group_wait: 30s        # wait for more alerts before the first notification
  group_interval: 5m     # min time between notifications for a group
  repeat_interval: 12h   # re-notify for still-firing alerts
  receiver: 'email'      # default receiver
  routes:
    # `match` is deprecated since Alertmanager 0.22; use `matchers`.
    - matchers:
        - severity = critical
      receiver: 'slack'
      continue: true   # keep evaluating sibling routes after this match

receivers:
  - name: 'email'
    email_configs:
      - to: 'admin@example.com'

  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/XXX'
        channel: '#alerts'
        send_resolved: true

inhibit_rules:
  # Suppress warning-level alerts while a matching critical alert fires.
  # `source_match`/`target_match` are deprecated; use the matchers form.
  - source_matchers:
      - severity = critical
    target_matchers:
      - severity = warning
    equal: ['alertname', 'cluster']

Alertas

# /etc/prometheus/rules/alerts.yml
groups:
  - name: node
    interval: 30s  # evaluation interval for this group (overrides global)
    rules:
      # Scrape target has been unreachable for 5 consecutive minutes.
      - alert: NodeDown
        expr: up{job="node"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Non-idle CPU share above 80% sustained for 10 minutes.
      - alert: HighCPU
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for more than 10 minutes."

      # Used-memory fraction (1 - available/total) above 85%.
      - alert: HighMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"

      # Less than 10% free space, ignoring tmpfs/fuse pseudo-filesystems.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"} / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}) * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"

Service Discovery

Kubernetes SD

scrape_configs:
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Only keep pods annotated prometheus.io/scrape: "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: "true"   # quoted so YAML keeps a string, not a boolean
      # Honor a custom metrics path from the prometheus.io/path annotation.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Rewrite the scrape address to use the prometheus.io/port annotation.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: "$1:$2"   # quoted: ambiguous plain scalar with ':'
        target_label: __address__
      # Copy pod labels through as Prometheus labels.
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)

File SD

# /etc/prometheus/targets.yml
# Static target list consumed via file_sd_configs; the `labels` mapping is
# attached to every target in the group.
- targets:
    - '192.168.1.10:9100'
    - '192.168.1.11:9100'
  labels:
    env: production
    team: infra

Recording Rules

groups:
  - name: recording_rules
    interval: 30s
    rules:
      # level:metric:operation naming — aggregated per job.
      - record: job:http_requests_total:rate5m
        expr: sum(rate(http_requests_total[5m])) by (job)

      # Non-idle CPU percentage per instance.
      - record: instance:cpu:rate5m
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

      # Quantiles must be computed over buckets aggregated by `le` plus the
      # labels the rule name claims (here `job`); without the sum by (job, le)
      # this would yield a per-series quantile, not a job-level one.
      - record: job:request_latency_seconds:p99
        expr: histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket[5m])))

Federation

scrape_configs:
  # Pull selected series from another Prometheus via its /federate endpoint.
  - job_name: 'federate'
    scrape_interval: 30s
    honor_labels: true   # keep labels exactly as exposed by the source server
    metrics_path: '/federate'
    params:
      'match[]':   # only series matching these selectors are federated
        - '{job="kubernetes-pods"}'
        - '{__name__=~"node_.*"}'
    static_configs:
      - targets:
          - 'central-prometheus:9090'

TSDB

# TSDB commands
promtool tsdb list /data/prometheus
promtool tsdb analyze /data/prometheus
promtool tsdb dump /data/prometheus
# Backfilling uses `create-blocks-from openmetrics` (there is no plain
# `create-blocks` subcommand): blocks are built from an OpenMetrics file
# into the given output directory.
promtool tsdb create-blocks-from openmetrics /path/to/data.om /tmp/backfill

Remote Write

remote_write:
  # Grafana Mimir ingests remote-write samples on /api/v1/push
  # (not /api/v1/write, which is not a Mimir endpoint).
  - url: http://mimir:8080/api/v1/push
    queue_config:
      capacity: 10000           # samples buffered per shard
      max_shards: 10
      min_shards: 1
      max_samples_per_send: 2000

remote_read:
  # NOTE(review): Mimir serves remote read under its Prometheus HTTP prefix
  # (commonly /prometheus/api/v1/read) — confirm against the deployment.
  - url: http://mimir:8080/api/v1/read
    read_recent: true   # also answer queries for recent data remotely