...
Extract ZIP
Create Services
Code Block |
---|
echo Installation Node Exporter
rem Register windows_exporter as the auto-starting service "otwatch_client" via NSSM, then start it.
rem setlocal keeps the helper variables below from leaking past this section when run as a batch file.
setlocal
set "NSSM=D:\otx\sw\otwatch\nssm.exe"
set "SVC=otwatch_client"
set "APPDIR=D:\otx\sw\otwatch\prometheus_node_exporter"
%NSSM% install %SVC% "%APPDIR%\windows_exporter-0.15.0-amd64.exe"
%NSSM% set %SVC% AppDirectory "%APPDIR%"
%NSSM% set %SVC% AppParameters "--config.file=%APPDIR%\config.yml"
%NSSM% set %SVC% DisplayName %SVC%
%NSSM% set %SVC% Description otwatch client node exporter for prometheus
%NSSM% set %SVC% Start SERVICE_AUTO_START
net start %SVC%
pause
endlocal
echo Installation Prometheus
rem Register prometheus.exe as the auto-starting service "otwatch_prometheus" via NSSM, then start it.
D:\otx\sw\otwatch\nssm.exe install otwatch_prometheus "D:\otx\sw\otwatch\prometheus\prometheus.exe"
D:\otx\sw\otwatch\nssm.exe set otwatch_prometheus AppDirectory "D:\otx\sw\otwatch\prometheus"
rem The whole parameter list is passed as ONE double-quoted argument, so it must not contain
rem further double quotes: nested quotes end the outer quoting early and only work by accident.
rem None of the flag values contain spaces, so they need no quoting of their own.
rem NOTE(review): --web.enable-admin-api and --log.level=debug are kept as-is; confirm both are wanted permanently.
D:\otx\sw\otwatch\nssm.exe set otwatch_prometheus AppParameters "--config.file=D:\otx\sw\otwatch\prometheus\prometheus.yml --web.enable-admin-api --web.external-url=https://otwatch.emmi.ch/prometheus --web.route-prefix=/ --storage.tsdb.retention.time=365d --log.level=debug"
D:\otx\sw\otwatch\nssm.exe set otwatch_prometheus DisplayName otwatch_prometheus
D:\otx\sw\otwatch\nssm.exe set otwatch_prometheus Description otwatch prometheus database
D:\otx\sw\otwatch\nssm.exe set otwatch_prometheus Start SERVICE_AUTO_START
net start otwatch_prometheus
pause
echo Installation Prometheus Blackbox Exporter
rem Register blackbox_exporter.exe as the auto-starting service "otwatch_blackbox" via NSSM, then start it.
rem setlocal keeps the helper variables below from leaking past this section when run as a batch file.
setlocal
set "NSSM=D:\otx\sw\otwatch\nssm.exe"
set "SVC=otwatch_blackbox"
set "APPDIR=D:\otx\sw\otwatch\prometheus_blackbox_exporter"
%NSSM% install %SVC% "%APPDIR%\blackbox_exporter.exe"
%NSSM% set %SVC% AppDirectory "%APPDIR%"
%NSSM% set %SVC% AppParameters "--config.file=%APPDIR%\blackbox.yml"
%NSSM% set %SVC% DisplayName %SVC%
%NSSM% set %SVC% Description otwatch blackbox exporter for prometheus
%NSSM% set %SVC% Start SERVICE_AUTO_START
net start %SVC%
pause
endlocal
echo Installation Prometheus Alert Manager
rem Register alertmanager.exe as the auto-starting service "otwatch_alertmanager" via NSSM, then start it.
rem setlocal keeps the helper variables below from leaking past this section when run as a batch file.
setlocal
set "NSSM=D:\otx\sw\otwatch\nssm.exe"
set "SVC=otwatch_alertmanager"
set "APPDIR=D:\otx\sw\otwatch\prometheus_alertmanager"
%NSSM% install %SVC% "%APPDIR%\alertmanager.exe"
%NSSM% set %SVC% AppDirectory "%APPDIR%"
%NSSM% set %SVC% AppParameters "--config.file=%APPDIR%\alertmanager.yml"
%NSSM% set %SVC% DisplayName %SVC%
%NSSM% set %SVC% Description otwatch Alert Manager for prometheus
%NSSM% set %SVC% Start SERVICE_AUTO_START
net start %SVC%
pause
endlocal
echo Installation Prometheus Pushgateway
rem Register the Pushgateway as the auto-starting service "otwatch_pushgateway" via NSSM, then start it.
rem The service binary must be the Pushgateway executable from the AppDirectory set below;
rem alertmanager.exe is already run by the otwatch_alertmanager service.
rem Assumes the extracted binary keeps its default name pushgateway.exe - TODO confirm.
D:\otx\sw\otwatch\nssm.exe install otwatch_pushgateway "D:\otx\sw\otwatch\prometheus_pushgateway\pushgateway.exe"
D:\otx\sw\otwatch\nssm.exe set otwatch_pushgateway AppDirectory "D:\otx\sw\otwatch\prometheus_pushgateway"
D:\otx\sw\otwatch\nssm.exe set otwatch_pushgateway DisplayName otwatch_pushgateway
D:\otx\sw\otwatch\nssm.exe set otwatch_pushgateway Description otwatch Pushgateway for prometheus
D:\otx\sw\otwatch\nssm.exe set otwatch_pushgateway Start SERVICE_AUTO_START
net start otwatch_pushgateway
pause |
Prometheus
Example prometheus_rules.yml
Code Block | ||
---|---|---|
| ||
groups:
# Alerts on the health of the monitoring stack itself (Prometheus, Alertmanager, Pushgateway).
# Every rule in this group carries "team: hit-nomail"
# (presumably used for Alertmanager routing - confirm against alertmanager.yml).
- name: Prometheus self-monitorin Alert
  rules:
  - alert: PrometheusJobMissing
    expr: absent(up{job="prometheus"})
    for: 0m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus job missing (instance {{ $labels.instance }})"
      description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 1m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus target missing (instance {{ $labels.instance }})"
      description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusAllTargetsMissing
    # sum, not count: count(up) counts series regardless of their value and is
    # therefore never 0, while sum(up) is 0 exactly when every target of the job is down.
    expr: sum by (job) (up) == 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus all targets missing (instance {{ $labels.instance }})"
      description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
      description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 0m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusAlertmanagerConfigurationReloadFailure
    expr: alertmanager_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
      description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusAlertmanagerConfigNotSynced
    expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
    for: 0m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})"
      description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusAlertmanagerE2eDeadManSwitch
    # vector(1) always returns a sample, so this alert fires permanently by design.
    expr: vector(1)
    for: 0m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})"
      description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
      description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTemplateTextExpansionFailures
    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusRuleEvaluationSlow
    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
    for: 5m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})"
      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusNotificationsBacklog
    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
    for: 0m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
      description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusAlertmanagerNotificationFailing
    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
      description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTargetEmpty
    expr: prometheus_sd_discovered_targets == 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus target empty (instance {{ $labels.instance }})"
      description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTargetScrapingSlow
    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
    for: 5m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
      description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusLargeScrape
    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
    for: 5m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus large scrape (instance {{ $labels.instance }})"
      description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTargetScrapeDuplicate
    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
    for: 0m
    labels:
      severity: warning
      team: hit-nomail
    annotations:
      summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})"
      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTsdbCheckpointCreationFailures
    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTsdbCheckpointDeletionFailures
    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[5m]) > 5
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTsdbHeadTruncationsFailed
    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTsdbReloadFailures
    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusTsdbWalTruncationsFailed
    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
      team: hit-nomail
    annotations:
      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Host-level alerts built on windows_exporter metrics (collector health, services, CPU, memory, disk).
# Nesting restored: each rule is an item of "rules", and "labels"/"annotations" are
# mappings inside their rule; without the indentation the listing is not a valid rule file.
- name: Windows Server Alert
  rules:
  - alert: WindowsServerCollectorError
    expr: windows_exporter_collector_success == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Windows Server collector Error (instance {{ $labels.instance }})"
      description: "Collector {{ $labels.collector }} was not successful\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: WindowsServerServiceStatus
    expr: windows_service_status{status="ok"} != 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Windows Server service Status (instance {{ $labels.instance }})"
      description: "Windows Service state is not OK\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: WindowsServerCpuUsage
    # 100 minus the average idle percentage per instance = CPU busy percentage.
    expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Windows Server CPU Usage (instance {{ $labels.instance }})"
      description: "CPU Usage is more than 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: WindowsServerMemoryUsage
    expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Windows Server memory Usage (instance {{ $labels.instance }})"
      description: "Memory usage is more than 90%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: WindowsServerDiskSpaceUsage
    # The volume!="X:" matcher sits on the size metric only; because the division
    # matches series by label, volume X: drops out of the result as a whole.
    expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes {volume!="X:"} / 1024 / 1024)) > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Windows Server disk Space Usage (instance {{ $labels.instance }})"
      description: "Disk usage is more than 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Probe alerts built on blackbox_exporter metrics (probe success, latency, HTTP status, TLS expiry).
# Nesting restored: each rule is an item of "rules", and "labels"/"annotations" are
# mappings inside their rule; without the indentation the listing is not a valid rule file.
- name: Blackbox Alert
  rules:
  - alert: BlackboxProbeFailed
    expr: probe_success == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox probe failed (instance {{ $labels.instance }})"
      description: "Probe failed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: BlackboxSlowProbe
    expr: avg_over_time(probe_duration_seconds[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox slow probe (instance {{ $labels.instance }})"
      description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: BlackboxProbeHttpFailure
    expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
      description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # The next two rules intentionally share one alert name and differ only in
  # threshold and severity (30 days -> warning, 3 days -> critical). The conditions
  # overlap: a certificate with less than 3 days left matches both rules.
  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
      description: "SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
      description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: BlackboxSslCertificateExpired
    expr: probe_ssl_earliest_cert_expiry - time() <= 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox SSL certificate expired (instance {{ $labels.instance }})"
      description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: BlackboxProbeSlowHttp
    expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox probe slow HTTP (instance {{ $labels.instance }})"
      description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: BlackboxProbeSlowPing
    expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox probe slow ping (instance {{ $labels.instance }})"
      description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: opentext Archive Center |
snmp_exporter
Save the file to the snmp.yml config file:
...