fix: watchdog killing backend every 60s on .198 (47 restarts/day)
Root cause: sd_notify::notify(true, ...) cleared the NOTIFY_SOCKET env var, so subsequent watchdog pings never reached systemd. With WatchdogSec=60, systemd therefore killed the backend every 60s.

Fixes:
- Change the first param of sd_notify::notify to false (keep the socket)
- Increase WatchdogSec from 60 to 300 (5 min) for crash recovery
- Add TimeoutStartSec=300 for slow container startups
- Adjust the watchdog ping interval to 120s

This was causing 47 restarts/day on .198 and blocking REBOOT-03, FLEET-03, FLEET-04, VC-04.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -135,11 +135,12 @@ async fn main() -> Result<()> {
|
||||
info!("WebSocket: ws://{}/ws", addr);
|
||||
|
||||
// Notify systemd that we're ready (Type=notify)
|
||||
let _ = sd_notify::notify(true, &[sd_notify::NotifyState::Ready]);
|
||||
// Note: first param `false` keeps NOTIFY_SOCKET so watchdog pings work
|
||||
let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Ready]);
|
||||
|
||||
// Spawn systemd watchdog ping (WatchdogSec=60, ping every 30s)
|
||||
// Spawn systemd watchdog ping (WatchdogSec=300, ping every 120s)
|
||||
tokio::spawn(async {
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(120));
|
||||
loop {
|
||||
interval.tick().await;
|
||||
let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Watchdog]);
|
||||
|
||||
@@ -12,7 +12,8 @@ ExecStartPre=/bin/bash -c 'mkdir -p /etc/archipelago && echo "ARCHIPELAGO_HOST_I
|
||||
ExecStart=/usr/local/bin/archipelago
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
WatchdogSec=60
|
||||
WatchdogSec=300
|
||||
TimeoutStartSec=300
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
Reference in New Issue
Block a user