July 2024 NixOS Prometheus Config
Monitored Host Snippets
# Exporters enabled on each monitored host: the apcupsd exporter and the node
# exporter.
services.prometheus.exporters = {
  apcupsd.enable = true;
  node = {
    enable = true;
    # Point the textfile collector at the directory where the
    # sanoid-monitoring service writes sanoid.prom.
    # The path is passed without embedded quotes: it contains no whitespace,
    # and a quote in the middle of an argument is not something the documented
    # systemd command-line syntax covers, so the quoted form risked the quote
    # characters becoming part of the directory name.
    extraFlags = [ "--collector.textfile.directory=/var/cache/node-exporter" ];
  };
};
# Timer that periodically starts the sanoid-monitoring service so the exported
# sanoid gauges stay fresh.
systemd.timers."sanoid-monitoring" = {
  wantedBy = [ "timers.target" ];
  timerConfig = {
    # Fire at every minute that is a multiple of five (hh:00, hh:05, ...).
    OnCalendar = "*:0/5";
    # Unit= expects a complete unit name including its type suffix; a bare
    # "sanoid-monitoring" is not a valid unit name.
    Unit = "sanoid-monitoring.service";
  };
};
# One-shot job that records the exit status of sanoid's three --monitor-*
# checks as gauges (sanoid_capacity, sanoid_health, sanoid_snapshots) in a
# .prom file under /var/cache/node-exporter.
systemd.services."sanoid-monitoring" = {
  serviceConfig.Type = "oneshot";
  script =
    let
      # Command line of the existing sanoid service; each check appends its own
      # --monitor-* flag to it.
      sanoidCmd = config.systemd.services.sanoid.serviceConfig.ExecStart;
      echo = "${pkgs.coreutils}/bin/echo";
      mv = "${pkgs.coreutils}/bin/mv";
      # The metrics are assembled in a scratch file first and then renamed over
      # the final file in a single mv.
      promFile = "/var/cache/node-exporter/sanoid.prom";
      scratch = "${promFile}.new";
    in
    # Each gauge starts at 0 and is overwritten with the check's exit status
    # only when the check exits non-zero. The first echo truncates the scratch
    # file; all later ones append.
    ''
      sanoid_capacity=0
      ${sanoidCmd} --monitor-capacity > /dev/null || sanoid_capacity=$?
      ${echo} "# TYPE sanoid_capacity gauge" > ${scratch}
      ${echo} "sanoid_capacity $sanoid_capacity" >> ${scratch}
      sanoid_health=0
      ${sanoidCmd} --monitor-health > /dev/null || sanoid_health=$?
      ${echo} "# TYPE sanoid_health gauge" >> ${scratch}
      ${echo} "sanoid_health $sanoid_health" >> ${scratch}
      sanoid_snapshots=0
      ${sanoidCmd} --monitor-snapshots > /dev/null || sanoid_snapshots=$?
      ${echo} "# TYPE sanoid_snapshots gauge" >> ${scratch}
      ${echo} "sanoid_snapshots $sanoid_snapshots" >> ${scratch}
      ${mv} ${scratch} ${promFile}
    '';
};
# Directory (mode 0750, owner root, group node-exporter) that receives the
# sanoid.prom file written by the sanoid-monitoring service.
systemd.tmpfiles.rules = [ "d /var/cache/node-exporter 0750 root node-exporter" ];
Prometheus Container
{
  # Declarative NixOS container that runs Prometheus together with
  # Alertmanager; alerts are delivered through a Telegram receiver.
  containers.prometheus =
    let
      # Location of the Telegram bot token file; the same path is used on the
      # host and inside the container.
      telegramTokenFile = "/var/lib/secrets/telegram_bot_token";

      # PromQL condition that holds while a filesystem's available space, as a
      # percentage of its size, is below `percent`.
      freeSpaceBelow = percent:
        "((node_filesystem_avail_bytes / (1024^3)) / (node_filesystem_size_bytes / (1024^3))) * 100 < ${toString percent}";
    in
    {
      autoStart = true;
      ephemeral = true;

      # Container networking (bridge name and address are omitted).
      privateNetwork = true;
      hostBridge = "OMITTED";
      localAddress = "OMITTED";

      bindMounts = {
        # Host directories mounted writable at the given container paths.
        "/var/lib/prometheus2" = {
          isReadOnly = false;
          hostPath = "/var/lib/prometheus";
        };
        "/var/lib/private/alertmanager" = {
          isReadOnly = false;
          hostPath = "/var/lib/alertmanager";
        };
        # Token file read by the Alertmanager Telegram receiver below.
        ${telegramTokenFile}.hostPath = telegramTokenFile;
      };

      config = { config, pkgs, lib, ... }: {
        system.stateVersion = "24.05";

        # ensure amtool is available for testing
        environment.systemPackages = [ pkgs.prometheus-alertmanager ];

        networking = {
          nameservers = [ "OMITTED" ];
          defaultGateway = "OMITTED";
          nftables.enable = true;
          # Accept TCP 9090 and 9093 from the (omitted) source address; 9093 is
          # the port on which Prometheus addresses Alertmanager further down.
          firewall.extraInputRules = "ip saddr OMITTED tcp dport { 9090, 9093 } accept";
        };

        services.prometheus = {
          enable = true;
          webExternalUrl = "https://OMITTED.ljlapierre.com/";

          # Scrape targets; presumably ports 9100 and 9162 are the node and
          # apcupsd exporters enabled on the monitored hosts. TODO confirm.
          scrapeConfigs = [
            {
              job_name = "OMITTED";
              static_configs = [
                { targets = [ "OMITTED:9100" "OMITTED:9162" ]; }
              ];
            }
          ];

          # Alerting rules, serialised to JSON and passed as one rules entry.
          # Rules carrying `severity = "warning"` are matched by the
          # Alertmanager sub-route with the longer repeat interval.
          rules = [
            (builtins.toJSON {
              groups = [
                {
                  name = "OMITTED";
                  rules = [
                    {
                      alert = "UPS Not Fully Charged";
                      expr = "apcupsd_battery_charge_percent < 100";
                    }
                    {
                      alert = "MDRAID Unhealthy";
                      expr = "node_md_disks{state='active'} != ignoring(state) node_md_disks_required";
                    }
                    {
                      alert = "zpool Unhealthy";
                      expr = "node_zfs_zpool_state{state='online'} != 1";
                    }
                    {
                      # sanoid_snapshots is the gauge written by the
                      # sanoid-monitoring service on the monitored hosts.
                      alert = "Sanoid Snapshots Unhealthy";
                      expr = "sanoid_snapshots != 0";
                      for = "2h";
                      labels = { severity = "warning"; };
                    }
                    {
                      alert = "Less than 25% Free Space Remaining";
                      expr = freeSpaceBelow 25;
                      labels = { severity = "warning"; };
                    }
                    {
                      alert = "Less than 15% Free Space Remaining";
                      expr = freeSpaceBelow 15;
                    }
                  ];
                }
              ];
            })
          ];

          # Send alerts to the Alertmanager instance configured just below.
          alertmanagers = [
            { static_configs = [ { targets = [ "localhost:9093" ]; } ]; }
          ];

          alertmanager = {
            enable = true;
            webExternalUrl = "https://OMITTED.ljlapierre.com/";
            configuration = {
              # Every alert goes to the "telegram" receiver. Notifications
              # repeat every 3 days, or every 14 days for severity=warning.
              route = {
                receiver = "telegram";
                repeat_interval = "3d";
                routes = [
                  {
                    matchers = [ "severity=warning" ];
                    repeat_interval = "14d";
                  }
                ];
              };
              receivers = [
                {
                  name = "telegram";
                  telegram_configs = [
                    {
                      bot_token_file = telegramTokenFile;
                      # NOTE(review): presumably a placeholder chat id; the
                      # leading zero is not preserved in an integer value.
                      chat_id = 0123456789;
                    }
                  ];
                }
              ];
            };
          };
        };
      };
    };
}