Skip to main content

July 2024 NixOS Prometheus Config

Monitored Host Snippets

# Prometheus exporters enabled on each monitored host.
services.prometheus.exporters = {
  apcupsd.enable = true;

  node.enable = true;
  # Point the textfile collector at the directory that the
  # "sanoid-monitoring" service writes sanoid.prom into.
  node.extraFlags = [
    "--collector.textfile.directory='/var/cache/node-exporter'"
  ];
};

# Timer that periodically starts the "sanoid-monitoring" service.
systemd.timers."sanoid-monitoring" = {
  wantedBy = [ "timers.target" ];
  timerConfig = {
    # Fire at minutes 0, 5, 10, ... of every hour.
    OnCalendar = "*:0/5";
    # Unit= must be a full unit name including its type suffix. Without
    # ".service" systemd logs "Unit type not valid, ignoring" and the timer
    # only works because it falls back to the same-named service.
    Unit = "sanoid-monitoring.service";
  };
};
# Oneshot service that runs sanoid's three monitor modes and records each exit
# status as a gauge in a .prom file inside the node exporter's textfile
# directory.
systemd.services."sanoid-monitoring" =
  let
    # Reuse the command line of the main sanoid unit so the monitor runs
    # are invoked exactly like the regular sanoid service.
    sanoid = config.systemd.services.sanoid.serviceConfig.ExecStart;
    echo = "${pkgs.coreutils}/bin/echo";
    mv = "${pkgs.coreutils}/bin/mv";
    promFile = "/var/cache/node-exporter/sanoid.prom";
    # Metrics are assembled in a staging file and renamed into place at the
    # end, so a half-written file is never left under the final name.
    staging = "${promFile}.new";
  in
  {
    serviceConfig.Type = "oneshot";

    script = ''
      sanoid_capacity=0
      ${sanoid} --monitor-capacity > /dev/null || sanoid_capacity=$?
      ${echo} "# TYPE sanoid_capacity gauge" > ${staging}
      ${echo} "sanoid_capacity $sanoid_capacity" >> ${staging}

      sanoid_health=0
      ${sanoid} --monitor-health > /dev/null || sanoid_health=$?
      ${echo} "# TYPE sanoid_health gauge" >> ${staging}
      ${echo} "sanoid_health $sanoid_health" >> ${staging}

      sanoid_snapshots=0
      ${sanoid} --monitor-snapshots > /dev/null || sanoid_snapshots=$?
      ${echo} "# TYPE sanoid_snapshots gauge" >> ${staging}
      ${echo} "sanoid_snapshots $sanoid_snapshots" >> ${staging}

      ${mv} ${staging} ${promFile}
    '';
  };

# Create the textfile-collector directory: a directory ("d") with mode 0750,
# owner root and group node-exporter.
systemd.tmpfiles.rules = [ "d /var/cache/node-exporter 0750 root node-exporter" ];

Prometheus Container

{
  # Declarative NixOS container running Prometheus and Alertmanager.
  containers.prometheus = {
    autoStart = true;
    # Ephemeral container: the paths that must persist are bind-mounted from
    # the host below.
    ephemeral = true;
    bindMounts = {
      "/var/lib/prometheus2" = {
        hostPath = "/var/lib/prometheus";
        isReadOnly = false;
      };
      "/var/lib/private/alertmanager" = {
        hostPath = "/var/lib/alertmanager";
        isReadOnly = false;
      };
      # Same path inside and outside the container; read by the Telegram
      # receiver via bot_token_file.
      "/var/lib/secrets/telegram_bot_token".hostPath = "/var/lib/secrets/telegram_bot_token";
    };
    hostBridge = "OMITTED";
    localAddress = "OMITTED";
    privateNetwork = true;
    config = { config, pkgs, lib, ... }: {
      system.stateVersion = "24.05";
      networking = {
        defaultGateway = "OMITTED";
        nameservers = [ "OMITTED" ];
        nftables.enable = true;
        # Allow the Prometheus (9090) and Alertmanager (9093) web ports from
        # the given source address only.
        firewall.extraInputRules = "ip saddr OMITTED tcp dport { 9090, 9093 } accept";
      };

      environment.systemPackages = [ pkgs.prometheus-alertmanager ];  # ensure amtool is available for testing

      services.prometheus = {
        enable = true;
        webExternalUrl = "https://OMITTED.ljlapierre.com/";

        scrapeConfigs = [
          {
            job_name = "OMITTED";
            static_configs = [{
              targets = [ "OMITTED:9100" "OMITTED:9162" ];
            }];
          }
        ];

        # Rule files are passed as strings; the attribute set is serialised
        # with builtins.toJSON.
        rules = [(builtins.toJSON {
          groups = [{
            name = "OMITTED";
            rules = [
              {
                alert = "UPS Not Fully Charged";
                expr = "apcupsd_battery_charge_percent < 100";
              }
              {
                alert = "MDRAID Unhealthy";
                expr = "node_md_disks{state='active'} != ignoring(state) node_md_disks_required";
              }
              {
                alert = "zpool Unhealthy";
                expr = "node_zfs_zpool_state{state='online'} != 1";
              }
              {
                # sanoid_snapshots is the exit status written by the
                # "sanoid-monitoring" service on the monitored hosts.
                alert = "Sanoid Snapshots Unhealthy";
                expr = "sanoid_snapshots != 0";
                for = "2h";
                labels.severity = "warning";
              }
              {
                # Percentage of free space. The former division of both sides
                # by 1024^3 cancelled out exactly and has been dropped.
                alert = "Less than 25% Free Space Remaining";
                expr = "node_filesystem_avail_bytes / node_filesystem_size_bytes * 100 < 25";
                labels.severity = "warning";
              }
              {
                alert = "Less than 15% Free Space Remaining";
                expr = "node_filesystem_avail_bytes / node_filesystem_size_bytes * 100 < 15";
              }
            ];
          }];
        })];

        alertmanagers = [ {
          static_configs = [ {
            targets = [ "localhost:9093" ];
          } ];
        } ];

        alertmanager = {
          enable = true;
          webExternalUrl = "https://OMITTED.ljlapierre.com/";

          # repeat_interval is implicitly capped by --data.retention (default
          # 120h): notifications are resent after whichever elapses first.
          # Raise retention above the longest repeat_interval used below
          # (14d = 336h) so the 14d interval is actually honoured. The flag
          # takes a Go duration, which has no "d" unit, hence hours.
          extraFlags = [ "--data.retention=360h" ];

          configuration = {
            route = {
              receiver = "telegram";
              # Default for alerts that do not match a child route.
              repeat_interval = "3d";
              routes = [
                {
                  # Alerts labelled severity=warning are repeated less often;
                  # the receiver is inherited from the parent route.
                  repeat_interval = "14d";
                  matchers = [
                    "severity=warning"
                  ];
                }
              ];
            };
            receivers = [ {
              name = "telegram";
              telegram_configs = [ {
                bot_token_file = "/var/lib/secrets/telegram_bot_token";
                # NOTE(review): looks like a redacted placeholder; confirm the
                # real chat ID before deploying.
                chat_id = 0123456789;
              } ];
            } ];
          };
        };
      };
    };
  };
}