From 2c0f0a68a83c46dcb4c4234d732b82d07b96e8dc Mon Sep 17 00:00:00 2001
From: "Langhammer, Jens"
Date: Fri, 8 Nov 2019 13:49:28 +0100
Subject: [PATCH] helm(major): add prometheus rules, add switch to enable/disable monitoring

---
Review notes (commentary only, not part of the commit message):

* helm/passbook/templates/prom-rules.yaml: the alert annotations are meant
  to be expanded by Prometheus, not by Helm. They are wrapped in Go raw
  strings ({{`...`}}) so that Helm emits {{$labels.connection}} and
  {{$value}} verbatim instead of trying to resolve $labels / $value as
  chart template variables while parsing the template.
* hack/prometheus/instance.yaml: ClusterRole and ClusterRoleBinding use
  rbac.authorization.k8s.io/v1 instead of the deprecated v1beta1 version.
* The whitespace of this patch was reconstructed. Indentation of added
  lines is self-consistent, but the indentation of context lines taken
  from pre-existing files (static-sm.yaml, web-service.yaml, web-sm.yaml,
  values.yaml) is a best guess; `git apply --ignore-whitespace` may be
  needed. Blob ids on the "index" lines are those of the original commit.

 hack/prometheus/grafana.helm.yaml        |  10 ++
 hack/prometheus/instance.yaml            |  63 ++++++++++++
 helm/passbook/templates/prom-rules.yaml  | 124 +++++++++++++++++++++++
 helm/passbook/templates/static-sm.yaml   |   2 +
 helm/passbook/templates/web-service.yaml |   4 +-
 helm/passbook/templates/web-sm.yaml      |   3 +
 helm/passbook/values.yaml                |  28 ++---
 7 files changed, 219 insertions(+), 15 deletions(-)
 create mode 100644 hack/prometheus/grafana.helm.yaml
 create mode 100644 hack/prometheus/instance.yaml
 create mode 100644 helm/passbook/templates/prom-rules.yaml

diff --git a/hack/prometheus/grafana.helm.yaml b/hack/prometheus/grafana.helm.yaml
new file mode 100644
index 000000000..5fdbe54c4
--- /dev/null
+++ b/hack/prometheus/grafana.helm.yaml
@@ -0,0 +1,10 @@
+ingress:
+  enabled: true
+  hosts:
+    - some.address.tld
+
+grafana.ini:
+  auth.anonymous:
+    enabled: true
+    org_name: Main Org.
+    org_role: Viewer
diff --git a/hack/prometheus/instance.yaml b/hack/prometheus/instance.yaml
new file mode 100644
index 000000000..24487c50c
--- /dev/null
+++ b/hack/prometheus/instance.yaml
@@ -0,0 +1,63 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus
+rules:
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - services
+      - endpoints
+      - pods
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources:
+      - configmaps
+    verbs: ["get"]
+  - nonResourceURLs: ["/metrics"]
+    verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+  - kind: ServiceAccount
+    name: prometheus
+    namespace: prod-passbook-ng
+---
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+  name: prometheus
+spec:
+  serviceAccountName: prometheus
+  serviceMonitorSelector:
+    matchLabels:
+      app.kubernetes.io/name: passbook
+  enableAdminAPI: false
+  ruleSelector:
+    matchLabels:
+      app.kubernetes.io/name: passbook
+  storage:
+    volumeClaimTemplate:
+      metadata:
+        labels:
+          prometheus: k8s
+        name: prometheus-storage
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 15Gi
diff --git a/helm/passbook/templates/prom-rules.yaml b/helm/passbook/templates/prom-rules.yaml
new file mode 100644
index 000000000..c8743d96b
--- /dev/null
+++ b/helm/passbook/templates/prom-rules.yaml
@@ -0,0 +1,124 @@
+{{- if .Values.monitoring.enabled -}}
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "passbook.fullname" . }}-static-rules
+  labels:
+    app.kubernetes.io/name: {{ include "passbook.name" . }}
+    helm.sh/chart: {{ include "passbook.chart" . }}
+    app.kubernetes.io/instance: {{ .Release.Name }}
+    app.kubernetes.io/managed-by: {{ .Release.Service }}
+spec:
+  groups:
+    - name: Aggregate request counters
+      rules:
+        - record: job:django_http_requests_before_middlewares_total:sum_rate30s
+          expr: sum(rate(django_http_requests_before_middlewares_total[30s])) by (job)
+        - record: job:django_http_requests_unknown_latency_total:sum_rate30s
+          expr: sum(rate(django_http_requests_unknown_latency_total[30s])) by (job)
+        - record: job:django_http_ajax_requests_total:sum_rate30s
+          expr: sum(rate(django_http_ajax_requests_total[30s])) by (job)
+        - record: job:django_http_responses_before_middlewares_total:sum_rate30s
+          expr: sum(rate(django_http_responses_before_middlewares_total[30s])) by (job)
+        - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
+          expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) by (job)
+        - record: job:django_http_requests_body_total_bytes:sum_rate30s
+          expr: sum(rate(django_http_requests_body_total_bytes[30s])) by (job)
+        - record: job:django_http_responses_streaming_total:sum_rate30s
+          expr: sum(rate(django_http_responses_streaming_total[30s])) by (job)
+        - record: job:django_http_responses_body_total_bytes:sum_rate30s
+          expr: sum(rate(django_http_responses_body_total_bytes[30s])) by (job)
+        - record: job:django_http_requests_total:sum_rate30s
+          expr: sum(rate(django_http_requests_total_by_method[30s])) by (job)
+        - record: job:django_http_requests_total_by_method:sum_rate30s
+          expr: sum(rate(django_http_requests_total_by_method[30s])) by (job,method)
+        - record: job:django_http_requests_total_by_transport:sum_rate30s
+          expr: sum(rate(django_http_requests_total_by_transport[30s])) by (job,transport)
+        - record: job:django_http_requests_total_by_view:sum_rate30s
+          expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) by (job,view)
+        - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
+          expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) by (job,view,transport,method)
+        - record: job:django_http_responses_total_by_templatename:sum_rate30s
+          expr: sum(rate(django_http_responses_total_by_templatename[30s])) by (job,templatename)
+        - record: job:django_http_responses_total_by_status:sum_rate30s
+          expr: sum(rate(django_http_responses_total_by_status[30s])) by (job,status)
+        - record: job:django_http_responses_total_by_status_name_method:sum_rate30s
+          expr: sum(rate(django_http_responses_total_by_status_name_method[30s])) by (job,status,name,method)
+        - record: job:django_http_responses_total_by_charset:sum_rate30s
+          expr: sum(rate(django_http_responses_total_by_charset[30s])) by (job,charset)
+        - record: job:django_http_exceptions_total_by_type:sum_rate30s
+          expr: sum(rate(django_http_exceptions_total_by_type[30s])) by (job,type)
+        - record: job:django_http_exceptions_total_by_view:sum_rate30s
+          expr: sum(rate(django_http_exceptions_total_by_view[30s])) by (job,view)
+    - name: Aggregate latency histograms
+      rules:
+        - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+          expr: histogram_quantile(0.50, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
+          labels:
+            quantile: "50"
+        - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+          expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
+          labels:
+            quantile: "95"
+        - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+          expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
+          labels:
+            quantile: "99"
+        - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+          expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
+          labels:
+            quantile: "99.9"
+        - record: job:django_http_requests_latency_seconds:quantile_rate30s
+          expr: histogram_quantile(0.50, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
+          labels:
+            quantile: "50"
+        - record: job:django_http_requests_latency_seconds:quantile_rate30s
+          expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
+          labels:
+            quantile: "95"
+        - record: job:django_http_requests_latency_seconds:quantile_rate30s
+          expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
+          labels:
+            quantile: "99"
+        - record: job:django_http_requests_latency_seconds:quantile_rate30s
+          expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
+          labels:
+            quantile: "99.9"
+    - name: Aggregate model operations
+      rules:
+        - record: job:django_model_inserts_total:sum_rate1m
+          expr: sum(rate(django_model_inserts_total[1m])) by (job, model)
+        - record: job:django_model_updates_total:sum_rate1m
+          expr: sum(rate(django_model_updates_total[1m])) by (job, model)
+        - record: job:django_model_deletes_total:sum_rate1m
+          expr: sum(rate(django_model_deletes_total[1m])) by (job, model)
+    - name: Aggregate database operations
+      rules:
+        - record: job:django_db_new_connections_total:sum_rate30s
+          expr: sum(rate(django_db_new_connections_total[30s])) by (alias, vendor)
+        - record: job:django_db_new_connection_errors_total:sum_rate30s
+          expr: sum(rate(django_db_new_connection_errors_total[30s])) by (alias, vendor)
+        - record: job:django_db_execute_total:sum_rate30s
+          expr: sum(rate(django_db_execute_total[30s])) by (alias, vendor)
+        - record: job:django_db_execute_many_total:sum_rate30s
+          expr: sum(rate(django_db_execute_many_total[30s])) by (alias, vendor)
+        - record: job:django_db_errors_total:sum_rate30s
+          expr: sum(rate(django_db_errors_total[30s])) by (alias, vendor, type)
+    - name: Aggregate migrations
+      rules:
+        - record: job:django_migrations_applied_total:max
+          expr: max(django_migrations_applied_total) by (job, connection)
+        - record: job:django_migrations_unapplied_total:max
+          expr: max(django_migrations_unapplied_total) by (job, connection)
+    - name: Alerts
+      rules:
+        - alert: UnappliedMigrations
+          expr: job:django_migrations_unapplied_total:max > 0
+          for: 1m
+          labels:
+            severity: testing
+          annotations:
+            summary: "Unapplied django migrations on {{`{{$labels.connection}}`}}"
+            description: "Django detected {{`{{$value}}`}} unapplied migrations on database {{`{{$labels.connection}}`}}"
+{{- end }}
diff --git a/helm/passbook/templates/static-sm.yaml b/helm/passbook/templates/static-sm.yaml
index 22f961b39..030dfe178 100644
--- a/helm/passbook/templates/static-sm.yaml
+++ b/helm/passbook/templates/static-sm.yaml
@@ -1,3 +1,4 @@
+{{- if .Values.monitoring.enabled -}}
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
@@ -13,3 +14,4 @@ spec:
   selector:
     matchLabels:
       k8s.passbook.io/component: static
+{{- end }}
diff --git a/helm/passbook/templates/web-service.yaml b/helm/passbook/templates/web-service.yaml
index 4f600974a..b25af13dd 100644
--- a/helm/passbook/templates/web-service.yaml
+++ b/helm/passbook/templates/web-service.yaml
@@ -9,9 +9,9 @@ metadata:
     helm.sh/chart: {{ include "passbook.chart" . }}
     k8s.passbook.io/component: web
 spec:
-  type: {{ .Values.service.type }}
+  type: ClusterIP
   ports:
-    - port: {{ .Values.service.port }}
+    - port: 80
       targetPort: http
       protocol: TCP
       name: http
diff --git a/helm/passbook/templates/web-sm.yaml b/helm/passbook/templates/web-sm.yaml
index cbdd24e48..39970422f 100644
--- a/helm/passbook/templates/web-sm.yaml
+++ b/helm/passbook/templates/web-sm.yaml
@@ -1,3 +1,4 @@
+{{- if .Values.monitoring.enabled -}}
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
@@ -17,6 +18,8 @@ spec:
         name: {{ include "passbook.fullname" . }}-secret-key
         key: monitoring_username
     port: http
+    interval: 10s
   selector:
     matchLabels:
       k8s.passbook.io/component: web
+{{- end }}
diff --git a/helm/passbook/values.yaml b/helm/passbook/values.yaml
index bb1996949..728161349 100644
--- a/helm/passbook/values.yaml
+++ b/helm/passbook/values.yaml
@@ -14,19 +14,10 @@ config:
   email:
     host: localhost
 
-postgresql:
-  postgresqlDatabase: passbook
-
-redis:
-  cluster:
-    enabled: false
-  master:
-    persistence:
-      enabled: false
-
-service:
-  type: ClusterIP
-  port: 80
+# This Helm chart ships with built-in Prometheus ServiceMonitors and Rules.
+# This requires the CoreOS Prometheus Operator.
+monitoring:
+  enabled: false
 
 ingress:
   enabled: false
@@ -40,3 +31,14 @@ ingress:
   #  - secretName: chart-example-tls
   #    hosts:
   #      - passbook.k8s.local
+
+# These settings configure the packaged PostgreSQL and Redis chart.
+postgresql:
+  postgresqlDatabase: passbook
+
+redis:
+  cluster:
+    enabled: false
+  master:
+    persistence:
+      enabled: false