Alerts

/etc/prometheus/rules/ccs.yml > CCS alerts
credit_compliance_service_mismatch_count_gain (0 active)
alert: credit_compliance_service_mismatch_count_gain
expr: delta(ccs_mismatches[1h]) > 0
for: 1m
labels:
  severity: warning
  team: analytics
annotations:
  description: CCS batch runs have shown an increase in mismatch count
  environment: production
  host: creditservices2
  summary: CCS batch runs have shown an increase in mismatch count
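These entries are the rendered view of standard Prometheus rule files. For reference, a minimal sketch of how the rule above would sit on disk in /etc/prometheus/rules/ccs.yml, with the group name taken from the header:

groups:
  - name: CCS alerts
    rules:
      - alert: credit_compliance_service_mismatch_count_gain
        expr: delta(ccs_mismatches[1h]) > 0
        for: 1m
        labels:
          severity: warning
          team: analytics
        annotations:
          summary: CCS batch runs have shown an increase in mismatch count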
/etc/prometheus/rules/cds.yml > CDS alerts
credit_decision_service_error_count_change (0 active)
alert: credit_decision_service_error_count_change
expr: cds_error_count_total - (cds_error_count_total offset 2m) > 0
for: 2m
labels:
  severity: warning
  team: analytics
annotations:
  description: CDS error count is accumulating.
  environment: production
  host: creditservices2
  summary: CDS is experiencing some exceptions.
credit_decision_service_high_traffic (0 active)
alert: credit_decision_service_high_traffic
expr: avg(cds_function_benchmark_seconds_count{name="process_flow"}
  - (cds_function_benchmark_seconds_count{name="process_flow"} offset 5m)) > 1000
for: 1m
labels:
  severity: critical
annotations:
  description: The process_flow request count over the last 5 minutes has exceeded 1000,
    significantly higher than usual
  environment: production
  host: creditservices
  summary: CDS traffic is abnormally high
credit_decision_service_response_500 (0 active)
alert: credit_decision_service_response_500
expr: cds_status_code_count{code="500"} > (cds_status_code_count{code="500"} offset 1s)
for: 1s
labels:
  severity: warning
  team: analytics
annotations:
  description: The gunicorn workers are returning HTTP 500 status codes.
  environment: production
  host: creditservices2
  summary: CDS has some HTTP 500 error responses.
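The CDS rules above detect counter growth by comparing against an offset sample (metric - metric offset N > 0). That pattern goes negative and misses events whenever the counter resets, for example on a process restart; increase() is the reset-aware equivalent. A sketch of the error-count rule rewritten that way, assuming cds_error_count_total is a standard Prometheus counter:

alert: credit_decision_service_error_count_change
expr: increase(cds_error_count_total[2m]) > 0
for: 2m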
/etc/prometheus/rules/node.yml > Instances
Bashcontainerstats not responding (0 active)
alert: Bashcontainerstats not responding
expr: up{job="bashcontainerstats"} == 0
for: 2m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }} at {{ $labels.instance }} has been unreachable for
    more than 2 minutes.'
  environment: production
  summary: Prometheus target at {{ $labels.instance }} is unreachable
CPU Load (0 active)
alert: CPU Load
expr: avg_over_time(node_load1[5m]) > 60 or avg_over_time(node_load1[1m]) > 70
for: 1m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} has a high CPU load.'
  environment: production
  summary: Instance {{ $labels.instance }} has a high CPU load.
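Rule files like this one can be checked before reloading Prometheus. Note that promtool validates the YAML structure and PromQL syntax only; a misspelled metric name still parses as a legal selector, so metric names and thresholds have to be verified by hand or with unit tests (see the promtool test rules sketch at the end of this section). A typical invocation:

promtool check rules /etc/prometheus/rules/node.yml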
Disk Usage (0 active)
alert: Disk Usage
expr: (100 - 100 * (node_filesystem_avail_bytes{device!~"by-uuid",device!~"tmpfs",mountpoint="/"}
  / node_filesystem_size_bytes{device!~"by-uuid",device!~"tmpfs",mountpoint="/"})) > 90
for: 5m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} has disk usage higher than 90%.'
  environment: production
  summary: Instance {{ $labels.instance }} has disk usage greater than 90%.
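The expression reads as "used space on the root filesystem is above 90%". Algebraically it is equivalent to alerting when available space drops below 10%, which can be easier to scan; a rewrite with the same matchers:

node_filesystem_avail_bytes{device!~"by-uuid",device!~"tmpfs",mountpoint="/"}
  / node_filesystem_size_bytes{device!~"by-uuid",device!~"tmpfs",mountpoint="/"} * 100 < 10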
Postgres DB Connections (0 active)
alert: Postgres DB Connections
expr: avg_over_time(pg_stat_activity_count[1m]) > 280
  and avg_over_time(pg_stat_activity_count[15m]) > 275
for: 1m
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} has exceeded the DB connection thresholds (1m average
    above 280 and 15m average above 275).'
  environment: production
  summary: Instance {{ $labels.instance }} has exceeded the DB connection thresholds.
Prometheus target not responding (0 active)
alert: Prometheus target not responding
expr: up{job!="bashcontainerstats"} == 0
for: 2m
labels:
  severity: critical
annotations:
  description: '{{ $labels.job }} at {{ $labels.instance }} has been unreachable for
    more than 2 minutes.'
  environment: production
  summary: Prometheus target at {{ $labels.instance }} is unreachable
RAM Usage (0 active)
alert: RAM Usage
expr: (100 * node_memory_MemAvailable_bytes
  / (node_memory_MemTotal_bytes + node_memory_SwapTotal_bytes)) < 5
for: 5m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} has less than 5% available RAM.'
  environment: production
  summary: Instance {{ $labels.instance }} has less than 5% available RAM.
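The severity label carried by these rules (warning vs. critical) is the conventional hook for Alertmanager routing. A minimal routing sketch; the receiver names are hypothetical and would need matching entries under receivers:

route:
  receiver: slack-warnings        # hypothetical default receiver
  routes:
    - match:
        severity: critical
      receiver: pagerduty-oncall  # hypothetical paging receiver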
/etc/prometheus/rules/postgres.yml > postgres
Deadlock detected (0 active)
alert: Deadlock detected
expr: rate(pg_stat_database_deadlocks{datname="integra_production",instance="DB_PRIMARY_HOST_NAME:9187"}[1m]) > 0
for: 1m
labels:
  service: postgres
  severity: warning
annotations:
  description: |-
    PostgreSQL has deadlocks
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Deadlocks (instance {{ $labels.instance }})
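When the alert fires, {{ $value }} expands to the expression's sample value and {{ $labels }} renders as a Go map of the alert's labels. A hypothetical rendered description, assuming one deadlock in the 1m window (labels abridged):

PostgreSQL has deadlocks
  VALUE = 0.016666666666666666
  LABELS: map[datname:integra_production instance:DB_PRIMARY_HOST_NAME:9187 service:postgres severity:warning]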
High connection count on DB Primary (0 active)
alert: High connection count on DB Primary
expr: sum by(environment, instance) (pg_stat_activity_count)
  > on(instance) pg_settings_max_connections * 0.9
for: 5m
labels:
  service: postgres
  severity: critical
annotations:
  description: Postgres total connections have been above 90% of the configured max_connections
    for the past 5 minutes on dbprimary
  summary: dbprimary connection count is too high
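Because the comparison is against 90% of the live pg_settings_max_connections value rather than a hard-coded number, the threshold tracks configuration changes automatically: with max_connections = 300 (a hypothetical value), the alert fires once the summed connection count stays above 270 for 5 minutes.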
Queries are too slow (0 active)
alert: Queries are too slow
expr: avg by(datname) (rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) > 2 * 60
for: 2m
labels:
  service: postgres
  severity: warning
annotations:
  description: The average transaction duration per database has exceeded two minutes
    over the last two minutes
  summary: Postgres is executing slow queries
postgres_down (0 active)
alert: postgres_down
expr: pg_up == 0
for: 5m
labels:
  service: postgres
  severity: warning
annotations:
  description: |-
    Postgres instance is down
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Postgres instance {{ $labels.instance }} is offline
/etc/prometheus/rules/product.yml > product
Credit Reporting Service down (1 active)
alert: Credit Reporting Service down
expr: absent(ruby_rss{app_name="credit_reporting_service",type="unicorn_master"})
for: 2m
labels:
  service: credit_reporting_service
  severity: critical
annotations:
  description: Credit Reporting service hasn't been detected for 2 minutes
  environment: production
  summary: Credit Reporting service is not running
Labels: alertname="Credit Reporting Service down" app_name="credit_reporting_service" service="credit_reporting_service" severity="critical" type="unicorn_master"
State: firing
Active Since: 2024-09-15 03:35:20.632689777 +0000 UTC
Value: 1
Account Service down (0 active)
alert: Account Service down
expr: absent(ruby_rss{app_name="account_service",type="unicorn_master"})
for: 2m
labels:
  service: account_service
  severity: critical
annotations:
  description: Account service hasn't been detected for 2 minutes
  environment: production
  summary: Account service is not running
Admin App down (0 active)
alert: Admin App down
expr: absent(ruby_rss{app_name="admin_app",type="unicorn_master"})
for: 2m
labels:
  service: admin_app
  severity: critical
annotations:
  description: Admin App hasn't been detected for 2 minutes
  environment: production
  summary: Admin App is not running
Agent App down (0 active)
alert: Agent App down
expr: absent(ruby_rss{app_name="agent_app",type="unicorn_master"})
for: 2m
labels:
  service: agent_app
  severity: critical
annotations:
  description: Agent App hasn't been detected for 2 minutes
  environment: production
  summary: Agent App is not running
Application Service down (0 active)
alert: Application Service down
expr: absent(ruby_rss{app_name="application_service",type="unicorn_master"})
for: 2m
labels:
  service: application_service
  severity: critical
annotations:
  description: Application service hasn't been detected for 2 minutes
  environment: production
  summary: Application service is not running
Customer App down (0 active)
alert: Customer App down
expr: absent(ruby_rss{app_name="customer_app",type="unicorn_master"})
for: 2m
labels:
  service: customer_app
  severity: critical
annotations:
  description: Customer App hasn't been detected for 2 minutes
  environment: production
  summary: Customer App is not running
Customer Service down (0 active)
alert: Customer Service down
expr: absent(ruby_rss{app_name="customer_service",type="unicorn_master"})
for: 2m
labels:
  service: customer_service
  severity: critical
annotations:
  description: Customer service hasn't been detected for 2 minutes
  environment: production
  summary: Customer service is not running
Email Service down (0 active)
alert: Email Service down
expr: absent(ruby_rss{app_name="email_service",type="unicorn_master"})
for: 2m
labels:
  service: email_service
  severity: critical
annotations:
  description: Email service hasn't been detected for 2 minutes
  environment: production
  summary: Email service is not running
Financial Service down (0 active)
alert: Financial Service down
expr: absent(ruby_rss{app_name="financial_service",type="unicorn_master"})
for: 2m
labels:
  service: financial_service
  severity: critical
annotations:
  description: Financial service hasn't been detected for 2 minutes
  environment: production
  summary: Financial service is not running
Five9 Service down (0 active)
alert: Five9 Service down
expr: absent(ruby_rss{app_name="five9_service",type="unicorn_master"})
for: 2m
labels:
  service: five9_service
  severity: critical
annotations:
  description: Five9 service hasn't been detected for 2 minutes
  environment: production
  summary: Five9 service is not running
Leads Service down (0 active)
alert: Leads Service down
expr: absent(ruby_rss{app_name="leads_service",type="unicorn_master"})
for: 2m
labels:
  service: leads_service
  severity: critical
annotations:
  description: Leads service hasn't been detected for 2 minutes
  environment: production
  summary: Leads service is not running
Payment Gateway Service down (0 active)
alert: Payment Gateway Service down
expr: absent(ruby_rss{app_name="payment_gateway_service",type="unicorn_master"})
for: 2m
labels:
  service: payment_gateway_service
  severity: critical
annotations:
  description: Payment Gateway service hasn't been detected for 2 minutes
  environment: production
  summary: Payment Gateway service is not running
Scheduler service down (0 active)
alert: Scheduler service down
expr: absent(ruby_rss{app_name="scheduler_service",type="sidekiq"})
for: 1m
labels:
  service: scheduler_service
  severity: critical
annotations:
  description: The Scheduler service's Sidekiq process hasn't been detected for 1 minute
  environment: production
  summary: Scheduler service Sidekiq is not running
Slow vendor responses (0 active)
alert: Slow vendor responses
expr: avg_over_time(ruby_http_request_duration_seconds{app_name="vendor_data_service",controller="requests",quantile="0.99"}[5m])
  > 5
for: 1m
labels:
  service: vendor_data_service
  severity: info
annotations:
  description: The 99th percentile of vendor_data_service response times over the last
    5 minutes is above 5 seconds
  environment: production
  summary: Vendor data responses are slow
Underwriting Service down (0 active)
alert: Underwriting Service down
expr: absent(ruby_rss{app_name="underwriting_service",type="unicorn_master"})
for: 2m
labels:
  service: underwriting_service
  severity: critical
annotations:
  description: Underwriting service hasn't been detected for 2 minutes
  environment: production
  summary: Underwriting service is not running
Vendor Data Service down (0 active)
alert: Vendor Data Service down
expr: absent(ruby_rss{app_name="vendor_data_service",type="unicorn_master"})
for: 2m
labels:
  service: vendor_data_service
  severity: critical
annotations:
  description: Vendor Data service hasn't been detected for 2 minutes
  environment: production
  summary: Vendor Data service is not running
Vendor Proxy Service is down (0 active)
alert: Vendor Proxy Service is down
expr: absent(ruby_rss{app_name="vendor_proxy_service",type="unicorn_master"})
for: 2m
labels:
  service: vendor_proxy_service
  severity: critical
annotations:
  description: Vendor Proxy service hasn't been detected for 2 minutes
  environment: production
  summary: Vendor Proxy service is not running
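These per-service rules cannot be collapsed into one absent() with a regex over all app names: absent() only returns a result when no series at all match, so a combined rule would stay silent as long as any single service was still up. One rule per service is the standard pattern, and onboarding a new service means copying the block, as in this sketch for a hypothetical loan_service:

alert: Loan Service down
expr: absent(ruby_rss{app_name="loan_service",type="unicorn_master"})
for: 2m
labels:
  service: loan_service
  severity: critical
annotations:
  description: Loan service hasn't been detected for 2 minutes
  environment: production
  summary: Loan service is not running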
/etc/prometheus/rules/rails.yml > rails
Rails apps are returning 500s (0 active)
alert: Rails apps are returning 500s
expr: ruby_http_requests_total{status="500"}
  - (ruby_http_requests_total{status="500"} offset 1m) > 0
for: 1m
labels:
  service: rails
  severity: warning
annotations:
  description: '{{ $labels.app_name }} is returning HTTP 500 error responses!'
  environment: production
  summary: '{{ $labels.app_name }} is returning HTTP 500 error responses!'
Slow responses from Rails apps (0 active)
alert: Slow responses from Rails apps
expr: avg_over_time(ruby_http_request_duration_seconds{quantile="0.99"}[3m]) > 20
for: 1m
labels:
  service: rails
  severity: info
annotations:
  description: '{{ $labels.app_name }} has a 3-minute average 99th-percentile response
    time above 20 seconds'
  environment: production
  summary: '{{ $labels.app_name }} is responding slowly!'
Z-Score indicator high traffic on Customer App (0 active)
alert: Z-Score indicator high traffic on Customer App
expr: (sum by(app_name) (rate(job:customer_app_http_requests_total:z_score[5m]) > 0))
  / (count by(app_name) (rate(job:customer_app_http_requests_total:z_score[5m]) > 0)) > 2
for: 1m
labels:
  service: customer_app
  severity: warning
annotations:
  description: The z-score for customer_app requests is above +2.0
  environment: production
  summary: Traffic for customer_app is high, as indicated by its z-score
Z-Score indicator low traffic on Customer App (0 active)
alert: Z-Score indicator low traffic on Customer App
expr: (sum by(app_name) (rate(job:customer_app_http_requests_total:z_score[5m]) > 0))
  / (count by(app_name) (rate(job:customer_app_http_requests_total:z_score[5m]) > 0)) < -1
for: 1m
labels:
  service: customer_app
  severity: warning
annotations:
  description: The z-score for customer_app requests is below -1.0
  environment: production
  summary: Traffic for customer_app is low, as indicated by its z-score
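Both z-score rules lean on a recording rule, job:customer_app_http_requests_total:z_score, that is defined elsewhere and not shown here. A common formulation measures how many standard deviations the current request rate sits from its recent mean; the sketch below assumes an intermediate rate rule and a one-hour baseline window, both of which are guesses at the actual definition:

record: job:customer_app_http_requests_total:rate5m
expr: sum by(app_name) (rate(customer_app_http_requests_total[5m]))

record: job:customer_app_http_requests_total:z_score
expr: (job:customer_app_http_requests_total:rate5m
  - avg_over_time(job:customer_app_http_requests_total:rate5m[1h]))
  / stddev_over_time(job:customer_app_http_requests_total:rate5m[1h])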
/etc/prometheus/rules/redis.yml > redis
redis_down (0 active)
alert: redis_down
expr: redis_up == 0
for: 5m
labels:
  service: redis
  severity: warning
annotations:
  description: |-
    Redis instance is down
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  environment: production
  summary: Redis down (instance {{ $labels.instance }})
redis_high_memory_load (0 active)
alert: redis_high_memory_load
expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
for: 10m
labels:
  service: redis
  severity: warning
annotations:
  description: |-
    Redis is running out of memory (> 90%)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  environment: production
  summary: Out of memory (instance {{ $labels.instance }})
/etc/prometheus/rules/security.yml > security
Potential Credential Stuffing attack (0 active)
alert: Potential Credential Stuffing attack
expr: rate(ruby_http_request_duration_seconds_count{action=~"create",app_name=~"customer_app",controller=~"sessions"}[10m])
  > 0.2
for: 1m
labels:
  service: rails
  severity: critical
annotations:
  description: The 10m average login request rate for {{ $labels.app_name }} is significantly
    higher than usual
  environment: production
  summary: Login traffic for {{ $labels.app_name }} is abnormally high
Unusually high traffic on Customer App (0 active)
alert: Unusually high traffic on Customer App
expr: rate(customer_app_http_response_count_total[5m])
  > 5
for: 1m
labels:
  service: rails
  severity: warning
annotations:
  description: The 5m average request rate for {{ $labels.app_name }} is above 5 requests
    per second, significantly higher than usual
  environment: production
  summary: Rails app traffic for {{ $labels.app_name }} is abnormally high
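Alerting rules like these can be unit-tested offline with promtool test rules. A sketch for the credential-stuffing rule above; the test file name is hypothetical, and the synthetic counter grows by 30 per minute so the 10m rate lands at 0.5 req/s, above the 0.2 threshold:

# stuffing_test.yml; run with: promtool test rules stuffing_test.yml
rule_files:
  - /etc/prometheus/rules/security.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'ruby_http_request_duration_seconds_count{action="create",app_name="customer_app",controller="sessions"}'
        values: '0+30x20'
    alert_rule_test:
      - eval_time: 15m
        alertname: Potential Credential Stuffing attack
        exp_alerts:
          - exp_labels:
              action: create
              app_name: customer_app
              controller: sessions
              service: rails
              severity: critical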