move monitoring to services

This commit is contained in:
Márcio Fernandes
2026-06-07 21:43:49 +00:00
parent 8bf2f786d5
commit d8419c0e75
22 changed files with 1 additions and 0 deletions
+11
View File
@@ -0,0 +1,11 @@
---
# SOPS creation rules. Rules are evaluated top to bottom and the first
# path_regex that matches the file being encrypted is used.
creation_rules:
  # Files ending in ".private.dec.yaml": encrypt every key in the file.
  - path_regex: '\.private\.dec\.yaml$'
    encrypted_regex: '^(.*)$'
    age:
      - age1f9e4pvp5y8gzuk8mz2s5xm85dd7znxhk56tcpuxqwn78qfjwja0qekwlju
  # Any other ".yaml" file: encrypt only the data/stringData keys.
  # The dot is escaped and the pattern anchored so that only real *.yaml
  # paths match (the previous ".*.yaml" matched any path containing "yaml"
  # preceded by an arbitrary character).
  - path_regex: '.*\.yaml$'
    encrypted_regex: '^(data|stringData)$'
    age:
      - age1f9e4pvp5y8gzuk8mz2s5xm85dd7znxhk56tcpuxqwn78qfjwja0qekwlju
+22
View File
@@ -0,0 +1,22 @@
# monitoring
## prometheus-stack
- <https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack>
- <https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml>
## promtail
## Setup
Using flux for reconciliation.
``` bash
./ops-scripts/apply-flux.sh
```
**promtail Encrypt secrets:**
``` bash
sops -e deploy/promtail/helm-values.private.dec.yaml > deploy/promtail/helm-values.private.yaml
```
@@ -0,0 +1,2 @@
# Ignore everything in this directory ...
**
# ... except this .gitignore itself, so the (otherwise empty) directory stays tracked.
!.gitignore
@@ -0,0 +1,12 @@
---
# Bootstrap bundle: the two Flux sync objects plus the age key Secret they
# use for SOPS decryption. Everything lands in the "monitoring" namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring

resources:
  - promtail-app-sync.yaml
  - prometheus-app-sync.yaml

# Secret "flux-sops-age" with a single key "age.agekey" read from the
# local ./.env.d/age.agekey file.
secretGenerator:
  - name: flux-sops-age
    files:
      - 'age.agekey=./.env.d/age.agekey'

# Keep the generated Secret name stable (no content-hash suffix) so the
# sync objects can reference it by its plain name.
generatorOptions:
  disableNameSuffixHash: true
@@ -0,0 +1,16 @@
---
# Flux sync for the prometheus deployment directory of the "casa" repository.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: prometheus
spec:
  # Reconcile every minute.
  interval: 1m
  # Git source, declared in another namespace.
  sourceRef:
    kind: GitRepository
    name: casa
    namespace: casa-limbosolutions-com
  # Directory inside the repository that gets built and applied.
  path: services/monitoring/deploy/prometheus
  # Delete cluster objects that were removed from Git.
  prune: true
  # Decrypt SOPS-encrypted files with the key stored in "flux-sops-age".
  decryption:
    provider: sops
    secretRef:
      name: flux-sops-age
@@ -0,0 +1,16 @@
---
# Flux sync for the promtail deployment directory of the "casa" repository.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: promtail
spec:
  # Reconcile every minute.
  interval: 1m
  # Git source, declared in another namespace.
  sourceRef:
    kind: GitRepository
    name: casa
    namespace: casa-limbosolutions-com
  # Directory inside the repository that gets built and applied.
  path: services/monitoring/deploy/promtail
  # Delete cluster objects that were removed from Git.
  prune: true
  # Decrypt SOPS-encrypted files with the key stored in "flux-sops-age".
  decryption:
    provider: sops
    secretRef:
      name: flux-sops-age
@@ -0,0 +1,55 @@
---
# Base values: install only the Prometheus Operator and its CRDs.
# Every other component of the stack is switched off here.

defaultRules:
  create: false

alertmanager:
  enabled: false

grafana:
  enabled: false

kubeStateMetrics:
  enabled: false

nodeExporter:
  enabled: false

prometheus:
  enabled: false

coreDns:
  enabled: false

kubeControllerManager:
  enabled: false

kubeEtcd:
  enabled: false

kubeProxy:
  enabled: false

kubeScheduler:
  enabled: false

# The operator itself stays on, without TLS, admission webhooks or
# self-monitoring.
prometheusOperator:
  enabled: true
  createCustomResource: true
  tls:
    enabled: false
  admissionWebhooks:
    enabled: false
  cleanupCustomResource: false
  serviceMonitor:
    selfMonitor: false
  kubeletService:
    enabled: true
    # requires manual creation of service #prom-kubelet-service
  nodeSelector:
    role: worker-node

# global:
#   nodeSelector:
#     dedicated: worker-node
@@ -0,0 +1,36 @@
---
# Cluster-level metrics: kube-state-metrics, CoreDNS and the API server.
# Every monitor stamps the samples with cluster=casa.

kubeStateMetrics:
  enabled: true

# ok tested!
kube-state-metrics:
  podLabels:
    role: worker-node
  nodeSelector:
    role: worker-node
  prometheus:
    monitor:
      interval: '60s'
      relabelings:
        - targetLabel: cluster
          replacement: casa
      additionalLabels:
        # !important: selector used by agent
        app.kubernetes.io/name: prometheus-kube-state-metrics

# ok tested!
coreDns:
  enabled: true
  serviceMonitor:
    relabelings:
      - targetLabel: cluster
        replacement: casa
    additionalLabels:
      # !important: selector used by agent
      app.kubernetes.io/name: prometheus-stack-coredns

# ok tested!
kubeApiServer:
  enabled: true
  serviceMonitor:
    relabelings:
      - targetLabel: cluster
        replacement: casa
    additionalLabels:
      # !important: selector used by agent
      app.kubernetes.io/name: prometheus-stack-apiserver
@@ -0,0 +1,25 @@
---
# Run node exporter as a DaemonSet on all nodes.
nodeExporter:
  enabled: true

# Scrape job for node exporter.
prometheus-node-exporter:
  prometheus:
    monitor:
      enabled: true
      interval: '60s'
      relabelings:
        # Expose the node name as "nodename"; see
        # https://github.com/dotdc/grafana-dashboards-kubernetes
        - action: replace
          sourceLabels:
            - __meta_kubernetes_pod_node_name
          targetLabel: nodename
        # Identify the cluster the samples come from.
        - targetLabel: cluster
          replacement: casa
        # This appears to be a timestamp and cannot be used as a label.
        - action: labeldrop
          regex: __meta_kubernetes_endpoints_annotation_endpoints_kubernetes_io_last_change_trigger_time
@@ -0,0 +1,78 @@
---
# File used for testing new options and configurations.
# Should be the last values file to be loaded.
kubelet:
  enabled: true
  namespace: kube-system
  serviceMonitor:
    # WARN: Error on ingesting out-of-order samples.
    # https://github.com/prometheus-community/helm-charts/issues/5483
    interval: 60s
    enabled: true
    # Enable scraping /metrics from kubelet's service
    kubelet: true
    cAdvisor: true
    additionalLabels:
      # !important: selector used by agent
      app.kubernetes.io/name: prometheus-kubelet

    # NOTE(review): the two drop rules below match container_* metric
    # names; confirm they are meant for the probes endpoint and not for
    # cAdvisorMetricRelabelings.
    probesMetricRelabelings:
      - targetLabel: cluster
        replacement: casa
      - sourceLabels:
          - __name__
          - image
        separator: ;
        regex: container_([a-z_]+);
        replacement: $1
        action: drop
      - sourceLabels:
          - __name__
        separator: ;
        regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
        replacement: $1
        action: drop

    # Relabel configs for the cAdvisor endpoint.
    # ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig
    # metrics_path is required to match upstream rules and charts.
    cAdvisorRelabelings:
      - targetLabel: cluster
        replacement: casa
      - action: replace
        sourceLabels:
          - __metrics_path__
        targetLabel: metrics_path
      - sourceLabels:
          - __meta_kubernetes_pod_node_name
        separator: ;
        regex: ^(.*)$
        targetLabel: nodename
        replacement: $1
        action: replace

    # Relabel configs for the probes endpoint.
    # ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig
    probesRelabelings:
      - targetLabel: cluster
        replacement: casa
      - action: replace
        sourceLabels:
          - __metrics_path__
        targetLabel: metrics_path
      - sourceLabels:
          - __meta_kubernetes_pod_node_name
        separator: ;
        regex: ^(.*)$
        targetLabel: nodename
        replacement: $1
        action: replace

    # Relabel configs for the resource endpoint.
    resourceRelabelings:
      - targetLabel: cluster
        replacement: casa
      - action: replace
        sourceLabels:
          - __metrics_path__
        targetLabel: metrics_path
@@ -0,0 +1,28 @@
---
# Flux HelmRelease that installs the kube-prometheus-stack chart as Helm
# release "prometheus-stack".
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  # Named after the release. The previous name "monitoring" was shared with
  # the promtail HelmRelease, which is deployed to the same namespace, so
  # the two objects would overwrite each other.
  name: prometheus-stack
spec:
  releaseName: prometheus-stack
  interval: 40h
  chart:
    spec:
      # Chart name as published in the prometheus-community repository
      # (the same chart the README installs with `helm upgrade --install`).
      chart: kube-prometheus-stack
      version: 86.x.x
      sourceRef:
        kind: HelmRepository
        name: prometheus-stack
      interval: 40h
  # Values files are read from one generated Secret, one key per file,
  # and applied in the order listed.
  valuesFrom:
    - kind: Secret
      name: prometheus-stack-helm-values
      valuesKey: 01-only-crd-and-operator.yaml
    - kind: Secret
      name: prometheus-stack-helm-values
      valuesKey: 02-kube-metrics.yaml
    - kind: Secret
      name: prometheus-stack-helm-values
      valuesKey: 03-node-exporter.yaml
    - kind: Secret
      name: prometheus-stack-helm-values
      valuesKey: 04-kubelet.yaml
@@ -0,0 +1,7 @@
---
# Helm repository hosting the prometheus-community charts.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: prometheus-stack
spec:
  # Refresh the repository index every 40 hours.
  interval: 40h
  url: https://prometheus-community.github.io/helm-charts
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,14 @@
---
# Prometheus deployment: Helm repository + release, the Prometheus agent,
# and a Secret carrying the Helm values files.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring

resources:
  - helm/helm-repo.yaml
  - helm/helm-release.yaml
  - prometheus-agent.yaml

# One Secret key per values file; the keys are the names the HelmRelease
# uses in valuesFrom[].valuesKey.
secretGenerator:
  - name: prometheus-stack-helm-values
    files:
      - 01-only-crd-and-operator.yaml=helm/01-only-crd-and-operator.yaml
      - 02-kube-metrics.yaml=helm/02-kube-metrics.yaml
      - 03-node-exporter.yaml=helm/03-node-exporter.yaml
      # Fixed: the path previously read helm/helm/04-kubelet.yaml.
      - 04-kubelet.yaml=helm/04-kubelet.yaml

# The HelmRelease references the Secret by its plain name, so the generated
# name must not get a content-hash suffix (same setting as the promtail
# kustomization).
generatorOptions:
  disableNameSuffixHash: true
@@ -0,0 +1,71 @@
---
# Prometheus in agent mode: scrapes the selected ServiceMonitors and
# forwards the samples via remote write.
apiVersion: monitoring.coreos.com/v1alpha1
kind: PrometheusAgent
metadata:
  name: prometheus-agent
spec:
  # No PodMonitor selection is configured.
  podMonitorSelector: null
  podMonitorNamespaceSelector: null

  # Only namespaces labelled prometheus-monitoring=enabled are searched ...
  serviceMonitorNamespaceSelector:
    matchLabels:
      prometheus-monitoring: enabled
  # ... and only ServiceMonitors labelled release=prometheus-stack are used.
  serviceMonitorSelector:
    matchLabels:
      release: prometheus-stack

  replicas: 1
  scrapeInterval: 60s
  remoteWrite:
    - url: https://prometheus.monitoring.limbosolutions.com/api/v1/write

  serviceAccountName: prometheus-agent
  nodeSelector:
    role: worker-node

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 300Mi
---
# Read-only access to the objects and metric endpoints the agent scrapes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus-agent
rules:
  - apiGroups:
      - ''
    resources:
      - nodes
      - nodes/metrics
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - monitoring.coreos.com
    resources:
      - servicemonitors
      - podmonitors
    verbs:
      - get
      - list
      - watch
  - nonResourceURLs:
      - /metrics
      - /metrics/cadvisor
      - /metrics/probes
    verbs:
      - get
---
# Grants the ClusterRole above to the agent's ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus-agent-monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-agent
subjects:
  - kind: ServiceAccount
    name: prometheus-agent
    namespace: monitoring
---
# Identity the agent pods run as (spec.serviceAccountName above).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-agent
  namespace: monitoring
@@ -0,0 +1,22 @@
---
# Flux HelmRelease that installs the promtail chart as Helm release
# "promtail".
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  # Named after the release. The previous name "monitoring" was shared with
  # the prometheus HelmRelease, which is deployed to the same namespace, so
  # the two objects would overwrite each other.
  name: promtail
spec:
  releaseName: promtail
  interval: 40h
  chart:
    spec:
      chart: promtail
      version: 6.x.x
      sourceRef:
        kind: HelmRepository
        # Must be the name of the HelmRepository object, not its URL.
        name: grafana
      interval: 40h
  # Public values first, then the SOPS-encrypted private values, which
  # override them.
  valuesFrom:
    - kind: Secret
      name: promtail-helm-values
      valuesKey: values.yaml
    - kind: Secret
      name: promtail-helm-values
      valuesKey: values.private.yaml
@@ -0,0 +1,7 @@
---
# Helm repository hosting the Grafana charts.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: grafana
spec:
  # Refresh the repository index every 40 hours.
  interval: 40h
  url: https://grafana.github.io/helm-charts
@@ -0,0 +1,18 @@
config:
clients:
- url: ENC[AES256_GCM,data:AarLpmfJTu63kYzATeKf4m+60h93G5unSf2e8BplmCws7iVRzeFYGdvp14caaFZiZwWXe5rsdrMBQRc=,iv:Se74MvPyIP5xDcjrKBv3/X4G3G+Q9AYmdK/5t4yDuZc=,tag:A64ERrlrlCgf7PiQMT9WuQ==,type:str]
sops:
age:
- enc: |
-----BEGIN AGE ENCRYPTED FILE-----
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBYWjJxcitwdjR3QzQrSGVn
OVRFeTVrUGZMWlcycUREeWJWenpnVDVSbVJRClphakRuL2h2dEsvYWQ3VXJ4aHZL
YzQrU0FiRWttRUpmQkd5eVJFVVZBdVkKLS0tIDB3Y1FwQU5ndVlOQzNkZHA3V1Vl
bVpyTmhtUUhVTk9xZUFibHFyMVdqOEEKgoIrI9rJ1Q93AOZrP8r4rOggIGpSDv2H
uLp0yj1VqyyvtB/RHu4/Gyef2P5IwjTBnYYhZHbfX3AnYYWN58Riog==
-----END AGE ENCRYPTED FILE-----
recipient: age1f9e4pvp5y8gzuk8mz2s5xm85dd7znxhk56tcpuxqwn78qfjwja0qekwlju
encrypted_regex: ^(.*)$
lastmodified: "2026-06-07T21:04:02Z"
mac: ENC[AES256_GCM,data:Ieh41SbHtPqOIT2ynSEnz+qwaCsEDo9cZOk63AyuiMqsT0vR8TR94gimOKrMgQhjLpJPREYg0hXALgq7x6BxMfzts8n+eRuDsVxah8e17Ad4Gk9Vq9RtHYL06RO4EhevhzuzX32W8N1jt2wJTSDA4Ztjh1QIAAd7YyNnvYOATBo=,iv:eNzc4ObZ7lplIDPjF8Ub4Rfq3AiWLyOGwAdMLY7ojvo=,tag:87y5KNeAYASA/wDs4ETWmw==,type:str]
version: 3.13.1
@@ -0,0 +1,52 @@
---
# Public promtail values. Secrets live in the private values file.
config:
  clients:
    # Placeholder; the real URL comes from the private values file
    # (values.private.yaml). Example: https://lokiserver/loki/api/v1/push
    - url: '????'
  # by default all scrape configs have node_name
  snippets:
    extraRelabelConfigs:
      - target_label: host
        replacement: ${HOSTNAME}
      - target_label: cluster
        replacement: casa
    extraScrapeConfigs: |
      #scrape config for syslog
      - job_name: host-journald
        journal:
          json: true
          max_age: 24h
          path: /var/log/host/journal
          labels:
            job: journald
        relabel_configs:
          - source_labels: ['__journal__systemd_unit']
            target_label: 'journal_systemd_unit'
          - source_labels: ['__journal_syslog_identifier']
            target_label: 'journal_syslog_identifier'
          - source_labels: ['__journal__hostname']
            target_label: 'journal_hostname'
          - target_label: 'host'
            replacement: '${HOSTNAME}'
          - target_label: 'cluster'
            replacement: 'casa'

# Lets the ${HOSTNAME} references above be expanded from the environment.
extraArgs:
  - --config.expand-env=true

# Host /var/log mounted read-only at /var/log/host (the journal path above
# points inside this mount).
extraVolumes:
  - name: node-logs
    hostPath:
      path: /var/log
extraVolumeMounts:
  - name: node-logs
    mountPath: /var/log/host
    readOnly: true

resources:
  requests:
    cpu: 100m
    memory: 50Mi
  limits:
    cpu: 200m
    memory: 100Mi
@@ -0,0 +1,13 @@
---
# Promtail deployment: Helm repository + release and the Secret carrying
# the public and private Helm values.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring

resources:
  - helm-repo.yaml
  - helm-release.yaml

# Secret keys are the names the HelmRelease uses in valuesFrom[].valuesKey.
secretGenerator:
  - name: promtail-helm-values
    files:
      - values.yaml=helm-values.yaml
      - values.private.yaml=helm-values.private.yaml

# Keep the Secret name stable so the HelmRelease can reference it directly.
generatorOptions:
  disableNameSuffixHash: true
+4
View File
@@ -0,0 +1,4 @@
#!/bin/bash
# Builds the kustomization in deploy/flux (path relative to the current
# working directory) and applies the rendered manifests with kubectl.
#
# -e          abort on the first failing command
# -u          treat unset variables as errors
# pipefail    make the pipeline fail when `kubectl kustomize` fails, instead
#             of only looking at the exit status of `kubectl apply`
set -euo pipefail
kubectl kustomize deploy/flux | kubectl apply -f -
@@ -0,0 +1,43 @@
# Prometheus Setup
## helm chart
```bash
#add repo
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
```
**This Helm chart installs:**
- crd
- Operator
- Kubernetes ServiceMonitors
```bash
kubectl get namespace monitoring || kubectl create namespace monitoring
helm upgrade --install prometheus-stack prometheus-community/kube-prometheus-stack \
--version=86.0.1 \
--namespace monitoring \
--values=./deploy/helm/01-only-crd-and-operator.yaml \
--values=./deploy/helm/02-kube-metrics.yaml \
--values=./deploy/helm/03-node-exporter.yaml \
--values=./deploy/helm/04-kubelet.yaml \
--values=./deploy/helm/10-testing-values.yaml
```
## deploy prometheus agent
**requirements:**
Namespaces containing ServiceMonitors that the agent should scrape must carry the label `prometheus-monitoring=enabled`.
**Example:**
``` bash
kubectl label namespace monitoring prometheus-monitoring=enabled --overwrite
```
```bash
kubectl apply -f ./deploy/prometheus-agent.yaml
```