Once an application is in production, it needs monitoring to keep it running healthily and stably, plus the ability to fire alerts when something goes wrong. Installing and configuring a complete stack for metrics collection, storage, and alerting is a hassle, and that monitoring stack itself then has to be maintained, which adds a fair amount of extra work.
This post documents how to deploy Prometheus + Grafana on a Kubernetes cluster in essentially one step with helm, using the customized values.yaml shown below.
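As a minimal sketch of the preparation step, assuming Helm 2 with the stable repository configured (the chart name stable/prometheus-operator and the file name values.yaml are illustrative), the chart's default values can be dumped to a local file and then customized:

# refresh the local chart index
helm repo update
# dump the chart's default values as the starting point for customization
helm inspect values stable/prometheus-operator > values.yaml

The customized values used for this deployment follow.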
---
additionalPrometheusRules: []
alertmanager:
  alertmanagerSpec:
    additionalPeers: []
    affinity: {}
    configMaps: []
    containers: []
    externalUrl: null
    image:
      repository: quay.io/prometheus/alertmanager
      tag: v0.17.0
    listenLocal: false
    logLevel: info
    nodeSelector: {}
    paused: false
    podAntiAffinity: ""
    podAntiAffinityTopologyKey: kubernetes.io/hostname
    podMetadata: {}
    priorityClassName: ""
    replicas: 1
    resources: {}
    retention: 120h
    routePrefix: /
    secrets: []
    securityContext:
      fsGroup: 2000
      runAsNonRoot: true
      runAsUser: 1000
    storage: {}
    tolerations: []
    useExistingSecret: false
  config:
    global:
      resolve_timeout: 5m
    receivers:
    - name: alerta
      webhook_configs:
      - send_resolved: true
        url: https://alerta.xxxx.org/api/webhooks/prometheus?api-key=xxxxxxx
    - name: rocketchat
      webhook_configs:
      - send_resolved: false
        url: https://chat.xxxx.org/hooks/xxxx/xxxx
    route:
      group_by:
      - job
      group_interval: 5m
      group_wait: 30s
      receiver: alerta
      repeat_interval: 3h
      routes:
      - continue: true
        match:
          severity: critical
        receiver: alerta
      - continue: true
        match:
          severity: critical
        receiver: rocketchat
  enabled: true
  ingress:
    annotations: {}
    enabled: true
    hosts:
    - alert.xxxx.org
    labels: {}
    paths: []
    tls:
    - hosts:
      - alert.xxxx.org
      secretName: xxxx-tls
  podDisruptionBudget:
    enabled: false
    maxUnavailable: ""
    minAvailable: 1
  service:
    annotations: {}
    clusterIP: ""
    externalIPs: []
    labels: {}
    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    nodePort: 30903
    type: ClusterIP
  serviceAccount:
    create: true
    name: ""
  serviceMonitor:
    interval: ""
    metricRelabelings: []
    relabelings: []
    selfMonitor: true
  templateFiles: {}
commonLabels: {}
coreDns:
  enabled: true
  service:
    port: 9153
    selector:
      k8s-app: kube-dns
    targetPort: 9153
  serviceMonitor:
    interval: ""
    metricRelabelings: []
    relabelings: []
defaultRules:
  annotations: {}
  create: true
  labels:
    environment: aliyun
  rules:
    alertmanager: true
    etcd: true
    general: true
    k8s: true
    kubeApiserver: true
    kubePrometheusNodeAlerting: true
    kubePrometheusNodeRecording: true
    kubeScheduler: true
    kubernetesAbsent: true
    kubernetesApps: true
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: true
    node: true
    prometheus: true
    prometheusOperator: true
fullnameOverride: ""
global:
  imagePullSecrets: []
  rbac:
    create: true
    pspEnabled: true
grafana:
  additionalDataSources: []
  adminPassword: xxxx
  defaultDashboardsEnabled: true
  enabled: true
  extraConfigmapMounts: []
  grafana.ini:
    analytics:
      check_for_updates: true
    auth.ldap:
      allow_sign_up: true
      config_file: /etc/grafana/ldap.toml
      enabled: true
    grafana_net:
      url: https://grafana.net
    log:
      mode: console
    paths:
      data: /var/lib/grafana/data
      logs: /var/log/grafana
      plugins: /var/lib/grafana/plugins
      provisioning: /etc/grafana/provisioning
    smtp:
      enabled: true
      from_address: [email protected]
      from_name: xxxx
      host: xxxx:25
      skip_verify: true
  ingress:
    annotations: null
    enabled: true
    hosts:
    - grafana.xxxx.org
    labels: {}
    path: /
    tls:
    - hosts:
      - grafana.xxxx.org
      secretName: xxxx-tls
  ldap:
    existingSecret: grafana-ldap-toml
  persistence:
    accessModes:
    - ReadWriteOnce
    enabled: false
    size: 20Gi
  serviceMonitor:
    interval: ""
    metricRelabelings: []
    relabelings: []
    selfMonitor: true
  sidecar:
    dashboards:
      enabled: true
      label: grafana_dashboard
      searchNamespace: ALL
    datasources:
      defaultDatasourceEnabled: true
      enabled: true
      label: grafana_datasource
      searchNamespace: ALL
kube-state-metrics:
  podSecurityPolicy:
    enabled: true
  rbac:
    create: true
kubeApiServer:
  enabled: true
  relabelings: []
  serviceMonitor:
    interval: ""
    jobLabel: component
    metricRelabelings: []
    selector:
      matchLabels:
        component: apiserver
        provider: kubernetes
  tlsConfig:
    insecureSkipVerify: false
    serverName: kubernetes
kubeControllerManager:
  enabled: false
  endpoints: []
  service:
    port: 10252
    selector:
      component: kube-controller-manager
    targetPort: 10252
  serviceMonitor:
    https: false
    interval: ""
    metricRelabelings: []
    relabelings: []
kubeDns:
  enabled: false
  service:
    selector:
      k8s-app: kube-dns
  serviceMonitor:
    interval: ""
    metricRelabelings: []
    relabelings: []
kubeEtcd:
  enabled: false
  endpoints: []
  service:
    port: 2379
    selector:
      k8s-app: etcd
    targetPort: 2379
  serviceMonitor:
    caFile: /etc/prometheus/secrets/etcd-client-cert/etcd-ca
    certFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client
    insecureSkipVerify: false
    interval: ""
    keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key
    metricRelabelings: []
    relabelings: []
    scheme: https
    serverName: ""
kubeScheduler:
  enabled: false
  endpoints: []
  service:
    port: 10251
    selector:
      component: kube-scheduler
    targetPort: 10251
  serviceMonitor:
    https: false
    interval: ""
    metricRelabelings: []
    relabelings: []
kubeStateMetrics:
  enabled: true
  serviceMonitor:
    interval: ""
    metricRelabelings: []
    relabelings: []
kubelet:
  enabled: true
  namespace: kube-system
  serviceMonitor:
    cAdvisorMetricRelabelings: []
    cAdvisorRelabelings: []
    https: false
    interval: ""
nameOverride: ""
nodeExporter:
  enabled: true
  jobLabel: jobLabel
  serviceMonitor:
    interval: ""
    metricRelabelings: []
    relabelings: []
prometheus:
  additionalServiceMonitors: []
  enabled: true
  ingress:
    annotations: {}
    enabled: true
    hosts:
    - prometheus.xxxx.org
    labels: {}
    paths: []
    tls:
    - hosts:
      - prometheus.xxxx.org
      secretName: xxxx-tls
  podDisruptionBudget:
    enabled: false
    maxUnavailable: ""
    minAvailable: 1
  prometheusSpec:
    additionalAlertManagerConfigs: []
    additionalAlertRelabelConfigs: []
    additionalScrapeConfigs: []
    additionalScrapeConfigsExternal: false
    affinity: {}
    alertingEndpoints: []
    configMaps: []
    containers: []
    enableAdminAPI: false
    evaluationInterval: ""
    externalLabels:
      environment: aliyun
    externalUrl: ""
    image:
      repository: quay.io/prometheus/prometheus
      tag: v2.9.1
    listenLocal: false
    logFormat: logfmt
    logLevel: info
    nodeSelector: {}
    paused: false
    podAntiAffinity: ""
    podAntiAffinityTopologyKey: kubernetes.io/hostname
    podMetadata: {}
    priorityClassName: ""
    query: {}
    remoteRead: []
    remoteWrite: []
    replicas: 1
    resources: {}
    retention: 10d
    routePrefix: /
    ruleNamespaceSelector: {}
    ruleSelector: {}
    ruleSelectorNilUsesHelmValues: true
    scrapeInterval: ""
    secrets: []
    securityContext:
      fsGroup: 2000
      runAsNonRoot: true
      runAsUser: 1000
    serviceMonitorNamespaceSelector: {}
    serviceMonitorSelector: {}
    serviceMonitorSelectorNilUsesHelmValues: true
    storageSpec: {}
    thanos: {}
    tolerations: []
  rbac:
    roleNamespaces:
    - kube-system
  service:
    annotations: {}
    clusterIP: ""
    externalIPs: []
    labels: {}
    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    nodePort: 30090
    sessionAffinity: ""
    targetPort: 9090
    type: ClusterIP
  serviceAccount:
    create: true
    name: ""
  serviceMonitor:
    interval: ""
    metricRelabelings: []
    relabelings: []
    selfMonitor: true
prometheus-node-exporter:
  extraArgs:
  - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
  - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
  podLabels:
    jobLabel: node-exporter
prometheusOperator:
  affinity: {}
  cleanupCustomResource: false
  configmapReloadImage:
    repository: quay.io/coreos/configmap-reload
    tag: v0.0.1
  crdApiGroup: monitoring.coreos.com
  createCustomResource: true
  enabled: true
  hyperkubeImage:
    pullPolicy: IfNotPresent
    repository: k8s.gcr.io/hyperkube
    tag: v1.12.1
  image:
    pullPolicy: IfNotPresent
    repository: quay.io/coreos/prometheus-operator
    tag: v0.29.0
  kubeletService:
    enabled: true
    namespace: kube-system
  nodeSelector: {}
  podAnnotations: {}
  podLabels: {}
  prometheusConfigReloaderImage:
    repository: quay.io/coreos/prometheus-config-reloader
    tag: v0.29.0
  resources: {}
  securityContext:
    runAsNonRoot: true
    runAsUser: 65534
  service:
    additionalPorts: []
    annotations: {}
    clusterIP: ""
    externalIPs: []
    labels: {}
    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    nodePort: 30080
    type: ClusterIP
  serviceAccount:
    create: true
    name: ""
  serviceMonitor:
    interval: ""
    metricRelabelings: []
    relabelings: []
    selfMonitor: true
  tolerations: []
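With the values above saved as values.yaml, the whole stack installs as a single release. This is a sketch assuming Helm 2; the release name monitor and namespace monitoring are placeholders:

# install prometheus-operator, Prometheus, Alertmanager, Grafana,
# node-exporter and kube-state-metrics as one release
helm install --name monitor --namespace monitoring -f values.yaml stable/prometheus-operator
# check that all components come up
kubectl -n monitoring get pods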