diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 470dec3..53083aa 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -48,6 +48,7 @@ jobs: helm repo add weave-works https://helm.gitops.weave.works helm repo add kubecost https://kubecost.github.io/cost-analyzer helm repo add bitnami https://charts.bitnami.com/bitnami + helm repo add grafana https://grafana.github.io/helm-charts - name: Run chart-releaser uses: helm/chart-releaser-action@a917fd15b20e8b64b94d9158ad54cd6345335584 # v1.6.0 diff --git a/charts/moonswitch-agent/Chart.lock b/charts/moonswitch-agent/Chart.lock index 571525e..c94b4bd 100644 --- a/charts/moonswitch-agent/Chart.lock +++ b/charts/moonswitch-agent/Chart.lock @@ -17,5 +17,11 @@ dependencies: - name: nginx repository: https://charts.bitnami.com/bitnami version: 15.5.3 -digest: sha256:e7b2580c02c13649da674aa61079459759d662bf99540d3ad3bdb04cf1401e4f -generated: "2024-01-04T16:35:28.415829-06:00" +- name: loki + repository: https://grafana.github.io/helm-charts + version: 5.41.4 +- name: promtail + repository: https://grafana.github.io/helm-charts + version: 6.15.3 +digest: sha256:0eb0feedbfc3f5cc529676408ad8ec7b8266a491142f4483cf2e2ce9d66de752 +generated: "2024-01-06T06:23:51.791593-06:00" diff --git a/charts/moonswitch-agent/Chart.yaml b/charts/moonswitch-agent/Chart.yaml index 98162df..e0a391f 100644 --- a/charts/moonswitch-agent/Chart.yaml +++ b/charts/moonswitch-agent/Chart.yaml @@ -6,7 +6,7 @@ icon: https://static.moonswitch.com/logos/color/icon.svg sources: - https://github.com/moonswitch/charts -version: 0.7.0 +version: 0.8.0 dependencies: - name: teleport-kube-agent @@ -33,6 +33,14 @@ dependencies: repository: https://charts.bitnami.com/bitnami condition: nginx.enabled version: 15.5.3 + - name: loki + repository: https://grafana.github.io/helm-charts + condition: loki.enabled + version: 5.41.4 + - name: promtail + repository: https://grafana.github.io/helm-charts + condition: promtail.enabled + version: 6.15.3 maintainers: - name: Moonswitch Team diff --git a/charts/moonswitch-agent/values.yaml b/charts/moonswitch-agent/values.yaml index 05ccc95..de144c4 100644 --- a/charts/moonswitch-agent/values.yaml +++ b/charts/moonswitch-agent/values.yaml @@ -70,6 +70,17 @@ kube-prometheus-stack: org_name: Main Org. org_role: Admin hide_version: true + sidecar: + dashboards: + searchNamespace: moonswitch + additionalDataSources: + - name: Loki + type: loki + access: proxy + url: http://{{ printf "%s-loki" .Release.Name }}:3100 + jsonData: + maxLines: 1000 + timeout: 360 cleanPrometheusOperatorObjectNames: true @@ -233,3 +244,71 @@ nginx: staticSiteConfigmap: nginx-static-site service: type: ClusterIP + +loki: + enabled: true + loki: + auth_enabled: false + commonConfig: + replication_factor: 1 + structuredConfig: + query_range: + parallelise_shardable_queries: false + storage: + type: 'filesystem' + singleBinary: + replicas: 1 + test: + enabled: false + monitoring: + lokiCanary: + enabled: false + selfMonitoring: + enabled: false + grafanaAgent: + installOperator: false + serviceMonitor: + enabled: true + +promtail: + enabled: true + daemonset: + autoscaling: + enabled: true + tolerations: [] + config: + snippets: + pipelineStages: + - cri: {} + - drop: + source: "namespace" + expression: "(default)" # Use this to drop logs from client app namespaces e.g "(default|client-app1|cool-app-namespace)" + serviceMonitor: + enabled: true + prometheusRule: + enabled: true + rules: + - alert: PromtailRequestErrors + expr: 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 + for: 5m + labels: + severity: critical + annotations: + description: | + The {{ $labels.job }} {{ $labels.route }} is experiencing + {{ printf "%.2f" $value }} errors. + VALUE = {{ $value }} + LABELS = {{ $labels }} + summary: Promtail request errors (instance {{ $labels.instance }}) + - alert: PromtailRequestLatency + expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1 + for: 5m + labels: + severity: critical + annotations: + summary: Promtail request latency (instance {{ $labels.instance }}) + description: | + The {{ $labels.job }} {{ $labels.route }} is experiencing + {{ printf "%.2f" $value }}s 99th percentile latency. + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/ct.yaml b/ct.yaml index 713e202..1b1f899 100644 --- a/ct.yaml +++ b/ct.yaml @@ -10,6 +10,7 @@ chart-repos: - weave-works=https://helm.gitops.weave.works - kubecost=https://kubecost.github.io/cost-analyzer - bitnami=https://charts.bitnami.com/bitnami + - grafana=https://grafana.github.io/helm-charts helm-extra-args: --timeout 600s validate-maintainers: false namespace: moonswitch