---
# PrometheusRule "thanos-querier" in the openshift-monitoring namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # Identity of the object.
  name: thanos-querier
  namespace: openshift-monitoring
  labels:
    app.kubernetes.io/component: query-layer
    app.kubernetes.io/instance: thanos-querier
    app.kubernetes.io/managed-by: cluster-monitoring-operator
    app.kubernetes.io/name: thanos-query
    app.kubernetes.io/part-of: openshift-monitoring
    # Quoted so the version can never be retyped by a YAML parser.
    app.kubernetes.io/version: "0.39.2"
  # NOTE(review): the fields below (timestamp, generation, resourceVersion,
  # uid, managedFields) are normally set by the API server, which suggests this
  # file is an export of a live object - confirm before applying it to a
  # cluster.
  creationTimestamp: "2026-02-17T12:49:45Z"
  generation: 1
  resourceVersion: "9878"
  uid: a2730f1e-827d-4501-99bb-f518481a6c9f
  managedFields:
  - apiVersion: monitoring.coreos.com/v1
    manager: operator
    operation: Update
    time: "2026-02-17T12:49:45Z"
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:labels:
          .: {}
          f:app.kubernetes.io/component: {}
          f:app.kubernetes.io/instance: {}
          f:app.kubernetes.io/managed-by: {}
          f:app.kubernetes.io/name: {}
          f:app.kubernetes.io/part-of: {}
          f:app.kubernetes.io/version: {}
      f:spec:
        .: {}
        f:groups:
          .: {}
          k:{"name":"thanos-query"}:
            .: {}
            f:name: {}
            f:rules: {}
spec:
  groups:
  # Rule group "thanos-query": alerting rules for the thanos-querier job.
  - name: thanos-query
    rules:
    # Fires when more than 5% of the thanos-querier "query" HTTP requests end
    # with a 5xx status code (5m rates) and the condition has held for 1h.
    - alert: ThanosQueryHttpRequestQueryErrorRateHigh
      expr: |
        (
          sum by (namespace, job) (rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query"}[5m]))
        /
          sum by (namespace, job) (rate(http_requests_total{job="thanos-querier", handler="query"}[5m]))
        ) * 100 > 5
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Thanos Query is failing to handle requests.
        # Folded scalar: the line break below becomes a single space.
        description: >-
          Thanos Query {{$labels.job}} in {{$labels.namespace}} is failing to
          handle {{$value | humanize}}% of "query" requests.
    # Fires when more than 5% of the thanos-querier "query_range" HTTP requests
    # end with a 5xx status code (5m rates) and the condition has held for 1h.
    - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
      expr: |
        (
          sum by (namespace, job) (rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query_range"}[5m]))
        /
          sum by (namespace, job) (rate(http_requests_total{job="thanos-querier", handler="query_range"}[5m]))
        ) * 100 > 5
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Thanos Query is failing to handle requests.
        # Folded scalar: the line break below becomes a single space.
        description: >-
          Thanos Query {{$labels.job}} in {{$labels.namespace}} is failing to
          handle {{$value | humanize}}% of "query_range" requests.
    # Fires when the rate of gRPC server calls handled with one of the listed
    # error codes exceeds 5% of the rate of gRPC server calls started (5m
    # rates) and the condition has held for 1h.
    - alert: ThanosQueryGrpcServerErrorRate
      annotations:
        description: Thanos Query {{$labels.job}} in {{$labels.namespace}} is failing
          to handle {{$value | humanize}}% of requests.
        summary: Thanos Query is failing to handle requests.
      # The percentage scaling and threshold sit outside the parenthesised
      # ratio, matching the sibling rules. PromQL evaluates "/" and "*" before
      # ">", so the result is the same as with the threshold inside.
      expr: |
        (
          sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job="thanos-querier"}[5m]))
        /
          sum by (namespace, job) (rate(grpc_server_started_total{job="thanos-querier"}[5m]))
        ) * 100 > 5
      for: 1h
      labels:
        severity: warning
    # Fires when the rate of gRPC client calls finishing with a code other than
    # "OK" exceeds 5% of the rate of gRPC client calls started (5m rates) and
    # the condition has held for 1h.
    - alert: ThanosQueryGrpcClientErrorRate
      expr: |
        (
          sum by (namespace, job) (rate(grpc_client_handled_total{grpc_code!="OK", job="thanos-querier"}[5m]))
        /
          sum by (namespace, job) (rate(grpc_client_started_total{job="thanos-querier"}[5m]))
        ) * 100 > 5
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Thanos Query is failing to send requests.
        # Folded scalar: the line break below becomes a single space.
        description: >-
          Thanos Query {{$labels.job}} in {{$labels.namespace}} is failing to
          send {{$value | humanize}}% of requests.
    # Fires when the rate of failed store-API DNS lookups exceeds 1% of the
    # rate of all store-API DNS lookups (5m rates) and the condition has held
    # for 1h.
    - alert: ThanosQueryHighDNSFailures
      annotations:
        # Wording fixed: singular subject ("has") and missing article
        # ("a high number").
        description: Thanos Query {{$labels.job}} in {{$labels.namespace}} has {{$value
          | humanize}}% of failing DNS queries for store endpoints.
        summary: Thanos Query is having a high number of DNS failures.
      expr: |
        (
          sum by (namespace, job) (rate(thanos_query_store_apis_dns_failures_total{job="thanos-querier"}[5m]))
        /
          sum by (namespace, job) (rate(thanos_query_store_apis_dns_lookups_total{job="thanos-querier"}[5m]))
        ) * 100 > 1
      for: 1h
      labels:
        severity: warning
    # Fires when the 5m maximum of the concurrent-query gate limit minus the 5m
    # average of in-flight gated queries stays below 1 for 1h.
    - alert: ThanosQueryOverload
      annotations:
        # The duration in the text matches the `for: 1h` clause below; keep the
        # two in sync when changing either one.
        description: Thanos Query {{$labels.job}} in {{$labels.namespace}} has been
          overloaded for more than 1 hour. This may be a symptom of excessive
          simultaneous complex requests, low performance of the Prometheus API, or
          failures within these components. Assess the health of the Thanos query
          instances, the connected Prometheus instances, look for potential senders
          of these requests and then contact support.
        summary: Thanos query reaches its maximum capacity serving concurrent requests.
      # NOTE(review): unlike the other rules in this group, this expression has
      # no job="thanos-querier" selector - confirm that is intended.
      expr: |
        (
          max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
        )
      for: 1h
      labels:
        severity: warning