dotfiles/nixos/systems/blowhole/monitoring.nix

649 lines
18 KiB
Nix
Raw Normal View History

# SPDX-FileCopyrightText: 2023 Richard Brežák <richard@brezak.sk>
#
# SPDX-License-Identifier: LGPL-3.0-or-later
# NixOS module for blowhole's monitoring stack: Terraform resources (via
# uterranix), Vault Agent templates, the host-side telegraf/envoy services and
# the `monitor` container running Grafana, InfluxDB 2 and telegraf.
{
  pkgs,
  roots,
  lib,
  inputs',
  config,
  secret,
  config',
  ...
}: let
  inherit (lib) singleton concatStringsSep;
  # `nixosTests` is an attribute of the package set, not of `lib`; taking it
  # from `lib` only evaluated because the binding is lazy and would throw as
  # soon as `passthru.tests` of the telegraf override was forced.
  inherit (pkgs) nixosTests;
in {
# Terraform configuration handed to uterranix: InfluxDB buckets and API tokens,
# plus the Vault KV v2 mount and secrets that store those tokens.
uterranix.config = {tflib, ...}: let
  inherit (tflib) tf;

  # Terraform reference to the id of the "redalder" organization data source.
  redalderOrgId = "\${data.influxdb-v2_organization.redalder.id}";

  # Retention shared by all buckets: days * h/d * m/h * s/m
  retentionSeconds = 30 * 24 * 60 * 60;

  # One InfluxDB bucket in the redalder organization.
  mkBucket = name: description: {
    inherit name description;
    org_id = redalderOrgId;
    retention_rules.every_seconds = retentionSeconds;
  };

  # A single permission entry granting `action` on the bucket declared as
  # `influxdb-v2_bucket.<bucketResource>`.
  mkBucketPermission = action: bucketResource: {
    inherit action;
    resource = {
      id = "\${influxdb-v2_bucket.${bucketResource}.id}";
      org_id = redalderOrgId;
      type = "buckets";
    };
  };

  # An active authorization (API token) with the given permission list.
  mkAuthorization = description: permissions: {
    inherit description permissions;
    org_id = redalderOrgId;
    status = "active";
  };

  # A KV v2 secret holding the token of `influxdb-v2_authorization.<authorization>`
  # under the key `influxdb_token`.
  mkTokenSecret = name: authorization: {
    inherit name;
    mount = "\${vault_mount.kv.path}";
    options.version = 2;
    data_json = builtins.toJSON {
      influxdb_token = "\${influxdb-v2_authorization.${authorization}.token}";
    };
  };
in {
  output = {
    "envoy_grafana".value = tf "vault_consul_secret_backend_role.envoy-grafana";
    "envoy_blowhole".value = tf "vault_consul_secret_backend_role.envoy-blowhole";
  };

  data."influxdb-v2_organization"."redalder".name = "redalder";

  resource = {
    "influxdb-v2_bucket" = {
      "metrics_bucket" = mkBucket "metrics" "Metrics bucket";
      "metrics_preprocessed_bucket" = mkBucket "metrics-preprocessed" "Preprocessed bucket";
      "logs_bucket" = mkBucket "logs" "Logs bucket";
    };

    "influxdb-v2_authorization" = {
      # telegraf only ever writes: logs and raw metrics.
      "telegraf_authorization" = mkAuthorization "Token for telegraf ingestion" [
        (mkBucketPermission "write" "logs_bucket")
        (mkBucketPermission "write" "metrics_bucket")
      ];
      # Grafana only ever reads, from all three buckets.
      "grafana_authorization" = mkAuthorization "Token for Grafana" [
        (mkBucketPermission "read" "logs_bucket")
        (mkBucketPermission "read" "metrics_preprocessed_bucket")
        (mkBucketPermission "read" "metrics_bucket")
      ];
    };

    "vault_mount"."kv" = {
      path = "kv";
      type = "kv";
      options.version = 2;
      description = "KV Version 2 secret engine mount";
    };

    "vault_kv_secret_v2" = {
      "telegraf_secret" = mkTokenSecret "homelab-1/blowhole/monitor/telegraf" "telegraf_authorization";
      "grafana_secret" = mkTokenSecret "homelab-1/blowhole/monitor/grafana" "grafana_authorization";
    };
  };
};
# Overlay that replaces `telegraf` with a build of upstream release 1.25.3.
#
# Everything is taken from the overlay's own `final` package set instead of the
# module-level `pkgs`/`nixosTests`, so the overlay stays self-contained when the
# same list is reused for the container (`nixpkgs.overlays = config.nixpkgs.overlays`).
nixpkgs.overlays = singleton (final: _prev: {
  telegraf = final.buildGoModule rec {
    pname = "telegraf";
    version = "1.25.3";

    # Only the telegraf binary itself is built; the Go test suite is skipped.
    excludedPackages = "test";
    doCheck = false;
    subPackages = singleton "cmd/telegraf";

    src = final.fetchFromGitHub {
      owner = "influxdata";
      repo = "telegraf";
      rev = "v${version}";
      sha256 = "sha256-FUZDS4As9qP2Dn0NSBM/e8udDLMk5OZol4CQSI39T4s=";
    };

    vendorHash = "sha256-uWoWvS9ZZzhpE+PiJv0fqblMLOAGIrhCdi0ugvF/lQI=";
    proxyVendor = true;

    ldflags = [
      "-w"
      "-s"
      "-X main.version=${version}"
    ];

    # `nixosTests` lives in the package set (it is not a `lib` attribute).
    passthru.tests = {inherit (final.nixosTests) telegraf;};

    meta = with lib; {
      description = "The plugin-driven server agent for collecting & reporting metrics";
      license = licenses.mit;
      homepage = "https://www.influxdata.com/time-series-platform/telegraf/";
      maintainers = with maintainers; [mic92 roblabla timstott];
    };
  };
});
# Vault Agent templates. Files rendered below /run/secrets/monitor are seen by
# the `monitor` container as /run/secrets (see its bind mounts); the
# envoy-blowhole token is rendered for the host itself.
services.hashicorp.vault-agent = let
  # Script that asks systemd inside the `monitor` container to
  # reload-or-restart `units`. Failures are deliberately ignored (`|| true`).
  reloadInMonitor = scriptName: units:
    pkgs.writeShellScript scriptName ''
      sudo systemd-run -P --machine monitor /run/current-system/sw/bin/bash -l -c \
      'systemctl try-reload-or-restart ${concatStringsSep " " units}' || true
    '';
in {
  settings.template = [
    # Consul token used by all three envoy sidecars inside the container.
    {
      source = pkgs.writeText "envoy-grafana.token.vtmpl" ''
        {{ with secret "consul/creds/envoy-grafana" }}{{ .Data.token }}{{ end }}
      '';
      destination = "/run/secrets/monitor/envoy-grafana.token";
      command = reloadInMonitor "envoy-grafana-reload.sh" [
        "hashicorp-envoy-grafana"
        "hashicorp-envoy-influx"
        "hashicorp-envoy-telegraf"
      ];
    }
    # Consul token for the host-side envoy (ingress gateway).
    {
      source = pkgs.writeText "envoy-blowhole.token.vtmpl" ''
        {{ with secret "consul/creds/envoy-blowhole" }}{{ .Data.token }}{{ end }}
      '';
      destination = "/run/secrets/envoy-blowhole.token";
      command = pkgs.writeShellScript "envoy-blowhole-reload.sh" ''
        sudo systemctl try-reload-or-restart hashicorp-envoy-telegraf || true
      '';
    }
    # Environment file carrying the InfluxDB token for the container's telegraf.
    {
      source = pkgs.writeText "telegraf.env.vtmpl" ''
        INFLUXDB_TOKEN={{ with secret "kv/data/homelab-1/blowhole/monitor/telegraf" }}{{ .Data.data.influxdb_token }}{{ end }}
      '';
      destination = "/run/secrets/monitor/telegraf.env";
      command = reloadInMonitor "monitor-telegraf-reload.sh" ["telegraf"];
    }
    # InfluxDB token read by Grafana's datasource provisioning.
    {
      source = pkgs.writeText "grafana-influx.token.vtmpl" ''
        {{ with secret "kv/data/homelab-1/blowhole/monitor/grafana" }}
        {{ .Data.data.influxdb_token }}
        {{ end }}
      '';
      destination = "/run/secrets/monitor/grafana-influx.token";
      perms = "0644";
      # This script restarts grafana, so it gets its own name instead of
      # reusing the telegraf script's name.
      command = reloadInMonitor "monitor-grafana-reload.sh" ["grafana"];
    }
    # Environment file for the influxdb2-provision unit; no reload hook.
    {
      source = pkgs.writeText "itp.env.vtmpl" ''
        {{ with secret "kv/data/homelab-1/blowhole/monitor/itp" }}
        INFLUX_HOST={{ .Data.data.host }}
        INFLUX_TOKEN={{ .Data.data.token }}
        {{ end }}
      '';
      destination = "/run/secrets/monitor/itp.env";
    }
  ];
};
# The host-side envoy must only start once Vault has been unsealed.
systemd.services."hashicorp-envoy-telegraf" = let
  vaultUnsealed = ["vault-unsealed.service"];
in {
  after = vaultUnsealed;
  requires = vaultUnsealed;
};
## Host-side Consul ingress gateway in front of the `telegraf` service.
## The `listeners` option offers neither an `address` nor a `name` field, so
## there is no way to make the listener bind to localhost only.
services.hashicorp-envoy.telegraf = let
  # blowhole's address from the secrets set; empty string when it is absent.
  blowholeIp = secret.network.ips.blowhole.ip or "";
in {
  type = "ingress";
  address = "${blowholeIp}:19000";
  adminBind = "127.0.0.1:19100";
  hotRestart = false;

  service = {
    kind = "ingress-gateway";
    name = "telegraf-blowhole";
    listeners = [
      {
        port = 8086;
        protocol = "tcp";
        services = [
          {name = "telegraf";}
        ];
      }
    ];
  };

  environment = {
    "CONSUL_HTTP_ADDR" = "http://${blowholeIp}:8500";
    "CONSUL_GRPC_ADDR" = "http://${blowholeIp}:8502";
    "CONSUL_HTTP_TOKEN_FILE" = "/run/secrets/envoy-blowhole.token";
  };

  consulPackage = inputs'.nixpkgs-hashicorp.legacyPackages.${pkgs.stdenv.system}.consul;
  extraConsulArgs = ["-ignore-envoy-compatibility"];
};
# Host telegraf waits for Vault to be unsealed before starting.
systemd.services."telegraf-magic" = let
  vaultUnsealed = ["vault-unsealed.service"];
in {
  after = vaultUnsealed;
  requires = vaultUnsealed;
};
# Host-side telegraf: collects host metrics and docker logs, tags every point
# with the bucket it belongs to, and ships it to port 8086 on blowhole's
# address, with one output per bucket selected through `tagpass`.
services.telegraf-magic = let
  # Tags shared by the host metric inputs; `bucket` drives output routing.
  hostMetricTags = {
    host = "blowhole";
    bucket = "metrics";
  };
  # Both outputs talk to the same endpoint.
  influxUrls = singleton "http://${secret.network.ips.blowhole.ip or ""}:8086";
in {
  enable = true;
  settings = {
    inputs.cpu = {
      percpu = true;
      totalcpu = true;
      tags = hostMetricTags;
    };
    inputs.mem.tags = hostMetricTags;
    inputs.nomad = {
      url = "http://${secret.network.ips.blowhole.ip or ""}:4646";
      tags = hostMetricTags;
    };
    # aggregators.minmax = {
    #   period = "30s";
    #   drop_original = true;
    #   namepass = [ "nomad" ];
    # };
    inputs.zfs.tags = hostMetricTags;
    # inputs.tail = [
    #   {
    #     files = ["/var/lib/nomad/alloc/*/alloc/logs/*.stdout.*"];
    #     data_format = "value";
    #     data_type = "string";
    #     name_override = "nomad_alloc_log";
    #     tags.bucket = "logs";
    #   }
    #   {
    #     files = ["/var/lib/nomad/alloc/*/alloc/logs/*.stderr.*"];
    #     data_format = "value";
    #     data_type = "string";
    #     name_override = "nomad_alloc_log";
    #     tags.bucket = "logs";
    #   }
    # ];
    inputs.docker_log = {
      tags.bucket = "logs";
      source_tag = true;
    };

    # Both parsers only touch `docker_log` points and pick their format from
    # the `grok_type` tag.
    processors.parser = [
      {
        parse_fields = ["message"];
        merge = "override";
        data_format = "grok";
        grok_patterns = ["%{COMBINED_LOG_FORMAT}"];
        tagpass = {
          "grok_type" = ["nginx" "apache"];
        };
        namepass = ["docker_log"];
      }
      {
        parse_fields = ["message"];
        merge = "override";
        data_format = "json_v2";
        json_v2 = [
          # the TOML generator won't create the structure required by telegraf without this
          {}
          {
            object = [
              {
                path = "@this";
                timestamp_key = "time";
                timestamp_format = "unix";
                # "level" used to be listed twice; one entry is enough.
                tags = [
                  "level"
                  "server_name"
                  "namespace"
                  "request"
                ];
                disable_prepend_keys = true;
              }
            ];
          }
        ];
        tagpass = {
          "grok_type" = ["synapse"];
        };
        namepass = ["docker_log"];
      }
    ];

    outputs.influxdb_v2 = [
      {
        urls = influxUrls;
        bucket = "metrics";
        tagpass.bucket = singleton "metrics";
      }
      {
        urls = influxUrls;
        bucket = "logs";
        tagpass.bucket = singleton "logs";
      }
    ];
  };
  # The unit additionally joins the `docker` group (used by `inputs.docker_log`
  # — presumably for access to the docker socket).
  systemd = {
    serviceConfig.SupplementaryGroups = ["docker"];
  };
};
# ZFS datasets on blowhole-zpool holding the monitoring stack's state; each one
# is bind-mounted into the `monitor` container further down.
fileSystems."/var/lib/grafana" = {
  fsType = "zfs";
  device = "blowhole-zpool/persist/grafana";
};
fileSystems."/var/lib/grafana-postgres" = {
  fsType = "zfs";
  device = "blowhole-zpool/persist/grafana-postgres";
};
fileSystems."/var/lib/grafana-influxdb2" = {
  fsType = "zfs";
  device = "blowhole-zpool/persist/grafana-influxdb2";
};
# The container unit waits for Vault to be unsealed and runs without an
# open-file limit.
systemd.services."container@monitor" = let
  vaultUnsealed = ["vault-unsealed.service"];
in {
  after = vaultUnsealed;
  requires = vaultUnsealed;
  serviceConfig = {
    LimitNOFILE = "infinity";
  };
};
# TODO: decouple the interface name from the container name, i.e. rewrite the container module... again
containers.monitor = {
# Ephemeral container started automatically, on a private network:
# host side 10.64.99.1, container side 10.64.99.2.
autoStart = true;
ephemeral = true;
privateNetwork = true;
hostAddress = "10.64.99.1";
localAddress = "10.64.99.2";
extraFlags = singleton "--capability=CAP_IPC_LOCK";
# Host paths exposed inside the container: the rendered secrets (read-only)
# and the three state directories (read-write).
bindMounts = let
  readWrite = hostPath: {
    inherit hostPath;
    isReadOnly = false;
  };
in {
  "/run/secrets" = {
    hostPath = "/run/secrets/monitor";
    isReadOnly = true;
  };
  "/var/lib/grafana" = readWrite "/var/lib/grafana";
  "/var/lib/postgresql" = readWrite "/var/lib/grafana-postgres";
  "/var/lib/influxdb2" = readWrite "/var/lib/grafana-influxdb2";
};
config = {
# The container reuses the host's overlay list (`config` here is the host
# module's argument) and pulls in the flake's own NixOS modules.
nixpkgs.overlays = config.nixpkgs.overlays;
imports = let
  modules = config'.flake.nixosModules;
in [
  modules.hashicorp
  modules.hashicorp-envoy
  modules.telegraf
  modules.grafana
  modules.influx-provisioning
];
# Envoy sidecar registering Grafana (port 3000) with Consul Connect.
services.hashicorp-envoy.grafana = let
  # blowhole's address from the secrets set; empty string when it is absent.
  blowholeIp = secret.network.ips.blowhole.ip or "";
in {
  address = "10.64.99.2:19000";
  adminBind = "127.0.0.1:19100";
  hotRestart = false;

  service = {
    id = "grafana";
    name = "grafana";
    address = "10.64.99.2";
    port = 3000;
    connect.sidecar_service = {};
  };

  environment = {
    "CONSUL_HTTP_ADDR" = "http://${blowholeIp}:8500";
    "CONSUL_GRPC_ADDR" = "http://${blowholeIp}:8502";
    "CONSUL_HTTP_TOKEN_FILE" = "/run/secrets/envoy-grafana.token";
  };

  consulPackage = inputs'.nixpkgs-hashicorp.legacyPackages.${pkgs.stdenv.system}.consul;
  extraConsulArgs = ["-ignore-envoy-compatibility"];
};
# PostgreSQL backing Grafana: one `grafana` database and a `grafana` role
# holding all privileges on it.
# NOTE(review): newer NixOS releases replace `ensurePermissions` with
# `ensureDBOwnership` — confirm before bumping nixpkgs.
services.postgresql = {
  enable = true;
  ensureDatabases = ["grafana"];
  ensureUsers = [
    {
      name = "grafana";
      ensurePermissions."DATABASE grafana" = "ALL PRIVILEGES";
    }
  ];
};
# Always bring Grafana back up, ten seconds after it exits.
systemd.services.grafana.serviceConfig = {
  Restart = "always";
  RestartSec = "10s";
};
# Grafana, backed by the local PostgreSQL socket, with a single provisioned
# InfluxDB (Flux) datasource pointing at the InfluxDB instance on 127.0.0.1:8086.
services.grafana-magic = {
  enable = true;
  settings = {
    security = {
      content_security_policy = true;
      disable_gravatar = true;
      # Space-separated list; only the local InfluxDB may be proxied to.
      data_source_proxy_whitelist = concatStringsSep " " [
        "127.0.0.1:8086"
      ];
    };
    server = {
      domain = "grafana.in.redalder.org";
    };
    # NOTE(review): upstream grafana.ini keeps `http_addr` under [server] and
    # has no [system] section — confirm how grafana-magic maps this key.
    system = {
      http_addr = "127.0.0.1";
    };
    database = {
      type = "postgres";
      host = "/var/run/postgresql";
      name = "grafana";
      user = "grafana";
    };
    paths.provisioning = {
      datasources.datasources = [
        {
          name = "InfluxDB";
          type = "influxdb";
          access = "proxy";
          orgId = 1;
          uid = "influxdb";
          url = "http://127.0.0.1:8086";
          jsonData = {
            version = "Flux";
            organization = "redalder";
            # Was "bucket", which is neither created by the Terraform config in
            # this file nor readable with the Grafana token (it may read logs,
            # metrics-preprocessed and metrics only).
            defaultBucket = "metrics";
          };
          secureJsonData = {
            # Rendered by Vault Agent on the host and bind-mounted read-only.
            token = "$__file{/run/secrets/grafana-influx.token}";
          };
        }
      ];
    };
  };
};
# Envoy sidecar registering InfluxDB (port 8086) with Consul Connect.
services.hashicorp-envoy.influx = let
  # blowhole's address from the secrets set; empty string when it is absent.
  blowholeIp = secret.network.ips.blowhole.ip or "";
in {
  address = "10.64.99.2:19001";
  adminBind = "127.0.0.1:19101";
  hotRestart = false;

  service = {
    id = "influx";
    name = "influx";
    address = "10.64.99.2";
    port = 8086;
    connect.sidecar_service = {};
  };

  environment = {
    "CONSUL_HTTP_ADDR" = "http://${blowholeIp}:8500";
    "CONSUL_GRPC_ADDR" = "http://${blowholeIp}:8502";
    "CONSUL_HTTP_TOKEN_FILE" = "/run/secrets/envoy-grafana.token";
  };

  consulPackage = inputs'.nixpkgs-hashicorp.legacyPackages.${pkgs.stdenv.system}.consul;
  extraConsulArgs = ["-ignore-envoy-compatibility"];
};
# INFLUX_HOST / INFLUX_TOKEN for the provisioning unit come from the env file
# that Vault Agent renders on the host as /run/secrets/monitor/itp.env.
systemd.services."influxdb2-provision".serviceConfig.EnvironmentFile = singleton "/run/secrets/itp.env";
# InfluxDB 2 bound to loopback only, with provisioning for the `redalder`
# organization tracked in a state file on the persistent volume.
services.influxdb2 = {
  enable = true;
  settings = {
    http-bind-address = "127.0.0.1:8086";
    hardening-enabled = true;
    reporting-disabled = true;
  };
  provision-magic = {
    organization = "redalder";
    stateFile = "/var/lib/influxdb2/itp.state";
    # tasks.test = {
    #   every = "30s";
    #   fluxFile = ./influx-tasks/system-memory.flux;
    # };
  };
};
# Envoy sidecar registering the container's telegraf listener (port 8087) with
# Consul Connect.
services.hashicorp-envoy.telegraf = let
  # blowhole's address from the secrets set; empty string when it is absent.
  blowholeIp = secret.network.ips.blowhole.ip or "";
in {
  service = {
    name = "telegraf";
    id = "telegraf";
    address = "10.64.99.2";
    port = 8087;
    connect.sidecar_service = {};
  };
  environment = {
    "CONSUL_HTTP_ADDR" = "http://${blowholeIp}:8500";
    "CONSUL_GRPC_ADDR" = "http://${blowholeIp}:8502";
    "CONSUL_HTTP_TOKEN_FILE" = "/run/secrets/envoy-grafana.token";
  };
  address = "10.64.99.2:19002";
  adminBind = "127.0.0.1:19102";
  hotRestart = false;
  # Use the same Consul build as every other envoy definition in this file;
  # this sidecar was the only one falling back to the module's default.
  consulPackage = inputs'.nixpkgs-hashicorp.legacyPackages.${pkgs.stdenv.system}.consul;
  extraConsulArgs = ["-ignore-envoy-compatibility"];
};
# Telegraf inside the container: accepts InfluxDB v2 writes on 127.0.0.1:8087
# (the port the `telegraf` envoy sidecar registers), adds its own systemd unit
# metrics, and forwards everything to the local InfluxDB, choosing the target
# bucket from the `bucket` tag.
services.telegraf-magic = {
  enable = true;
  settings = {
    inputs = {
      influxdb_v2_listener = {
        service_address = "127.0.0.1:8087";
        bucket_tag = "bucket";
        parser_type = "upstream";
      };
      systemd_units = {
        unittype = "service";
        tags.host = "blowhole#monitoring";
        tags.bucket = "metrics";
      };
    };
    outputs.influxdb_v2 = [
      {
        urls = singleton "http://127.0.0.1:8086";
        # Left for telegraf to substitute from its environment at runtime.
        token = "\${INFLUXDB_TOKEN}";
        organization = "redalder";
        bucket_tag = "bucket";
      }
    ];
  };
  # INFLUXDB_TOKEN comes from the env file rendered by Vault Agent on the host.
  systemd.serviceConfig.EnvironmentFile = "/run/secrets/telegraf.env";
};
};
};
}