Initial commit: health monitoring scripts for Discord webhooks
This commit is contained in:
39
README.md
Normal file
39
README.md
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# homelab-health-monitor
|
||||||
|
|
||||||
|
Lightweight system health monitoring for homelab servers with Discord webhook notifications.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Hourly stats collection (CPU, RAM, load, disk)
|
||||||
|
- Daily Discord reports with min/avg/max values
|
||||||
|
- SSD SMART health checks
|
||||||
|
- Journal error summaries
|
||||||
|
- Color-coded status (OK/WARNING/CRITICAL)
|
||||||
|
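- No dependencies beyond bash, cron, curl, and standard Linux utilities (`top`, `free`, `df`, `awk`, `find`); `smartctl` and `journalctl` are used only when installed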
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
1. Copy scripts to `~/.local/bin/`
|
||||||
|
2. Create data directory: `mkdir -p ~/.local/share/health-monitor`
|
||||||
|
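3. Set the `DISCORD_WEBHOOK_URL` environment variable, or edit `health-report.sh` and replace the `YOUR_WEBHOOK_URL_HERE` placeholder in the `WEBHOOK_URL` line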
|
||||||
|
4. Make executable: `chmod +x ~/.local/bin/health-*.sh`
|
||||||
|
5. Add to crontab:
|
||||||
|
|
||||||
|
```cron
|
||||||
|
0 * * * * ~/.local/bin/health-collector.sh
|
||||||
|
0 9 * * * ~/.local/bin/health-report.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Send report now
|
||||||
|
./health-report.sh
|
||||||
|
|
||||||
|
# Only report if warnings/errors
|
||||||
|
./health-report.sh --errors-only
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT
|
||||||
34
health-collector.sh
Executable file
34
health-collector.sh
Executable file
@@ -0,0 +1,34 @@
|
|||||||
|
#!/bin/bash
# Collects system stats hourly - run via cron every hour.
#
# Appends one CSV row (timestamp,cpu,mem,load1,load5,load15,disk_pct) to
# ~/.local/share/health-monitor/stats-YYYY-MM-DD.csv and deletes stats files
# that `find -mtime +7` considers old.
#
# `set -e` is deliberately not used: every probe is best-effort, and a probe
# that fails leaves its CSV field empty instead of dropping the whole sample.

DATA_DIR="$HOME/.local/share/health-monitor"

#######################################
# Extract CPU busy percentage from `top -b` output.
# Stdin:   output of `top -b` (one or more iterations), C locale.
# Outputs: "100 - idle" with one decimal, taken from the LAST "Cpu(s)" line;
#          prints nothing when no such line is found.
# The idle value is located by its "id" label rather than by field number:
# when idle is 100.0, top prints "ni,100.0 id," with no space after the
# comma, which shifts whitespace-separated fields and breaks positional
# parsing. The pattern also accepts the older "99.5%id" layout.
#######################################
cpu_busy_from_top() {
  LC_ALL=C awk '
    /Cpu\(s\)/ && match($0, /[0-9]+([.][0-9]+)?[ %]*id/) {
      # Numeric conversion keeps only the leading number of "98.3 id".
      idle = substr($0, RSTART, RLENGTH) + 0
      found = 1
    }
    END { if (found) printf "%.1f", 100 - idle }
  '
}

#######################################
# Extract used-memory percentage from `free` output.
# Stdin:   output of `free` in the C locale (row label "Mem:").
# Outputs: used/total * 100 with one decimal; nothing if the row is missing
#          or total is zero.
#######################################
mem_used_pct_from_free() {
  LC_ALL=C awk '/Mem:/ && $2 > 0 { printf "%.1f", $3 / $2 * 100 }'
}

#######################################
# Extract the use percentage from `df -P` output.
# Stdin:   output of `df -P <path>` (header + one data row).
# Outputs: the fifth column of the data row with "%" removed.
#######################################
disk_used_pct_from_df() {
  awk 'NR == 2 { gsub(/%/, ""); print $5 }'
}

#######################################
# Take one sample and append it to today's CSV file.
# Globals: DATA_DIR (read)
# Returns: 1 if the data directory cannot be created; otherwise the exit
#          status of the final cleanup `find`.
#######################################
main() {
  local today ts data_file cpu mem disk
  local load1="" load5="" load15=""

  # A single date call keeps the file name and the row timestamp consistent.
  read -r today ts < <(date '+%Y-%m-%d %H:%M')
  data_file="$DATA_DIR/stats-$today.csv"

  if ! mkdir -p "$DATA_DIR"; then
    printf 'health-collector: cannot create %s\n' "$DATA_DIR" >&2
    return 1
  fi

  # Initialize CSV header if new file
  if [[ ! -f "$data_file" ]]; then
    echo "timestamp,cpu,mem,load1,load5,load15,disk_pct" > "$data_file"
  fi

  # CPU usage: two batch iterations 0.5 s apart. top's first iteration
  # reflects averages since boot, so only the last Cpu(s) line is used.
  # LC_ALL=C pins the label text and the decimal point (also for -d0.5).
  cpu=$(LC_ALL=C top -bn2 -d0.5 | cpu_busy_from_top)

  # Memory percentage
  mem=$(LC_ALL=C free | mem_used_pct_from_free)

  # Load averages
  if [[ -r /proc/loadavg ]]; then
    read -r load1 load5 load15 _ < /proc/loadavg
  fi

  # Disk usage; -P keeps each filesystem on one line even for long device names
  disk=$(df -P / | disk_used_pct_from_df)

  printf '%s,%s,%s,%s,%s,%s,%s\n' \
    "$ts" "$cpu" "$mem" "$load1" "$load5" "$load15" "$disk" >> "$data_file"

  # Cleanup old files (keep 7 days); errors are ignored because pruning is
  # housekeeping and must not make the sample itself look failed in logs.
  find "$DATA_DIR" -maxdepth 1 -type f -name 'stats-*.csv' -mtime +7 -delete 2>/dev/null
}

main "$@"
|
||||||
139
health-report.sh
Executable file
139
health-report.sh
Executable file
@@ -0,0 +1,139 @@
|
|||||||
|
#!/bin/bash
# System Health Report - Sends daily summary to Discord
# Usage: ./health-report.sh [--errors-only]
#
# Environment:
#   DISCORD_WEBHOOK_URL  Webhook to post to (overrides the placeholder below).
#   SMART_DEVICE         Device passed to `smartctl -H` (default: /dev/sda).
#
# Exit status: 0 when the report was sent, or skipped because --errors-only
#              was given and the status is OK; 1 when the webhook URL is not
#              configured or the POST did not return a 2xx status.
#
# `set -e` is deliberately not used: every probe is best-effort so that a
# missing tool degrades one field instead of suppressing the whole report.

WEBHOOK_URL="${DISCORD_WEBHOOK_URL:-YOUR_WEBHOOK_URL_HERE}"
DATA_DIR="$HOME/.local/share/health-monitor"
ERRORS_ONLY="${1:-}"
SMART_DEVICE="${SMART_DEVICE:-/dev/sda}"

# Colors for Discord embeds
COLOR_OK=3066993      # Green
COLOR_WARN=16776960   # Yellow
COLOR_ERROR=15158332  # Red

# Thresholds
DISK_WARN=80
DISK_CRIT=90
MEM_WARN=80
MEM_CRIT=90

#######################################
# Escape a string for use inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   text with backslash, double quote, newline, carriage return and
#            tab escaped; any other control character is removed.
#######################################
json_escape() {
  local s=$1
  s=${s//\\/\\\\}      # backslash first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  s=${s//[[:cntrl:]]/}
  printf '%s' "$s"
}

#######################################
# Summarize one numeric CSV column as "min / avg / max".
# Arguments: $1 - 1-based column number
#            $2 - printf format for each value (e.g. '%.0f')
# Stdin:     CSV rows without header
# Outputs:   the formatted summary, or "N/A" if no row has a value starting
#            with a digit in that column.
#######################################
stats_summary() {
  awk -F',' -v col="$1" -v fmt="$2" '
    NF >= col && $col ~ /^[0-9]/ {
      v = $col + 0                       # force numeric comparison
      if (n == 0 || v < min) min = v
      if (n == 0 || v > max) max = v
      sum += v
      n++
    }
    END {
      if (n > 0) {
        f = fmt " / " fmt " / " fmt
        printf f, min, sum / n, max
      } else {
        print "N/A"
      }
    }
  '
}

#######################################
# Print the integer part of the largest value in one CSV column.
# Arguments: $1 - 1-based column number
# Stdin:     CSV rows without header
# Outputs:   int(max), or 0 when no numeric value is present.
#######################################
column_max_int() {
  awk -F',' -v col="$1" '
    NF >= col && $col ~ /^[0-9]/ { if ($col + 0 > max) max = $col + 0 }
    END { print int(max) }
  '
}

#######################################
# Map the measurements to an overall status.
# Globals:   DISK_WARN, DISK_CRIT, MEM_WARN, MEM_CRIT (read)
# Arguments: $1 - disk usage percent, $2 - max memory percent,
#            $3 - journal error count, $4 - SMART status text
#            Non-numeric or empty numbers are treated as 0 so that a failed
#            probe cannot abort the comparison.
# Outputs:   OK, WARNING or CRITICAL
#######################################
classify_status() {
  local disk=$1 mem=$2 errs=$3 smart=$4
  [[ $disk =~ ^[0-9]+$ ]] || disk=0
  [[ $mem =~ ^[0-9]+$ ]] || mem=0
  [[ $errs =~ ^[0-9]+$ ]] || errs=0

  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  if [[ $smart == *FAILED* ]] \
    || (( 10#$disk >= DISK_CRIT || 10#$mem >= MEM_CRIT )); then
    echo "CRITICAL"
  elif (( 10#$disk >= DISK_WARN || 10#$mem >= MEM_WARN || 10#$errs > 0 )); then
    echo "WARNING"
  else
    echo "OK"
  fi
}

#######################################
# Gather data, build the embed and post it.
# Globals: WEBHOOK_URL, DATA_DIR, ERRORS_ONLY, SMART_DEVICE, COLOR_* (read)
# Returns: see "Exit status" in the file header.
#######################################
main() {
  local host uptime_text yesterday today f
  local stats="" samples cpu_stats mem_stats load_stats mem_max
  local disk_pct disk_detail disk_text
  local smart_status="Unknown" smart_out
  local journal="" errors="" error_count=0
  local status color error_field="" payload response
  local backtick='`' apostrophe="'"
  local host_json uptime_json disk_json

  # Fail fast instead of POSTing to the placeholder. The check is on the URL
  # scheme so that editing the placeholder in place cannot defeat it.
  if [[ $WEBHOOK_URL != http://* && $WEBHOOK_URL != https://* ]]; then
    echo "Webhook URL is not configured: set DISCORD_WEBHOOK_URL or edit WEBHOOK_URL" >&2
    return 1
  fi

  # Gather basic info. Defaults guarantee that no embed field value is empty.
  host=$(hostname 2>/dev/null)
  [[ -n $host ]] || host=${HOSTNAME:-unknown}
  uptime_text=$(uptime -p 2>/dev/null | sed 's/up //')
  [[ -n $uptime_text ]] || uptime_text="unknown"

  # Calculate stats from collected data (today + yesterday for context).
  # NOTE: both daily files are read in full, so the window can be longer
  # than the "(24h)" shown next to the sample count.
  yesterday=$(date -d "yesterday" +%Y-%m-%d 2>/dev/null || date -v-1d +%Y-%m-%d 2>/dev/null)
  today=$(date +%Y-%m-%d)

  # Combine recent data files (header line skipped)
  for f in "$DATA_DIR/stats-$yesterday.csv" "$DATA_DIR/stats-$today.csv"; do
    [[ -f $f ]] && stats+=$(tail -n +2 "$f")$'\n'
  done

  # Calculate min/avg/max if we have data
  samples=$(grep -c . <<<"$stats")
  if (( samples > 0 )); then
    cpu_stats=$(stats_summary 2 '%.0f' <<<"$stats")
    mem_stats=$(stats_summary 3 '%.0f' <<<"$stats")
    load_stats=$(stats_summary 4 '%.2f' <<<"$stats")
    # Max value for threshold checks
    mem_max=$(column_max_int 3 <<<"$stats")
  else
    cpu_stats="No data yet"
    mem_stats="No data yet"
    load_stats="No data yet"
    samples=0
    mem_max=0
  fi

  # Current disk usage; -P keeps the data row on one line
  disk_pct=$(df -P / | awk 'NR==2 {gsub(/%/,""); print $5}')
  disk_detail=$(df -Ph / | awk 'NR==2 {printf "%s / %s", $3, $2}')
  if [[ $disk_pct =~ ^[0-9]+$ ]]; then
    disk_text="${disk_pct}% (${disk_detail})"
  else
    disk_text="N/A"
    disk_pct=0
  fi

  # SSD SMART status
  if command -v smartctl &>/dev/null; then
    smart_out=$(smartctl -H "$SMART_DEVICE" 2>&1)
    if grep -q "PASSED" <<<"$smart_out"; then
      smart_status="PASSED ✓"
    elif grep -q "FAILED" <<<"$smart_out"; then
      smart_status="FAILED ✗"
    elif grep -qi "permission" <<<"$smart_out"; then
      smart_status="Needs root"
    fi
  fi

  # Recent errors from journal (last 24h). The count covers every line
  # returned; only the newest five are quoted in the report.
  if command -v journalctl &>/dev/null; then
    journal=$(journalctl -p err -S "24 hours ago" --no-pager -q 2>/dev/null)
  fi
  if [[ -n $journal ]]; then
    error_count=$(grep -c . <<<"$journal")
    errors=$(tail -n 5 <<<"$journal")
  fi

  # Determine overall status
  status=$(classify_status "$disk_pct" "$mem_max" "$error_count" "$smart_status")
  case $status in
    CRITICAL) color=$COLOR_ERROR ;;
    WARNING)  color=$COLOR_WARN ;;
    *)        color=$COLOR_OK ;;
  esac

  # Skip if errors-only mode and everything is OK
  if [[ $ERRORS_ONLY == "--errors-only" && $status == "OK" ]]; then
    return 0
  fi

  # Format errors for Discord: backticks would close the code fence, and the
  # text is truncated BEFORE escaping so an escape sequence is never cut.
  if (( error_count > 0 )) && [[ -n $errors ]]; then
    errors=${errors//$backtick/$apostrophe}
    errors=${errors:0:400}
    error_field=',{"name": "📋 Recent Errors", "value": "```'"$(json_escape "$errors")"'```", "inline": false}'
  fi

  host_json=$(json_escape "$host")
  uptime_json=$(json_escape "$uptime_text")
  disk_json=$(json_escape "$disk_text")

  # Build Discord embed JSON
  payload=$(cat <<EOF
{
  "embeds": [{
    "title": "🖥️ ${host_json} - Health Report",
    "color": ${color},
    "fields": [
      {"name": "Status", "value": "**${status}**", "inline": true},
      {"name": "Uptime", "value": "${uptime_json}", "inline": true},
      {"name": "Samples", "value": "${samples} (24h)", "inline": true},
      {"name": "💻 CPU % (min/avg/max)", "value": "${cpu_stats}", "inline": true},
      {"name": "🧠 RAM % (min/avg/max)", "value": "${mem_stats}", "inline": true},
      {"name": "📊 Load (min/avg/max)", "value": "${load_stats}", "inline": true},
      {"name": "💾 Disk /", "value": "${disk_json}", "inline": true},
      {"name": "🔧 SSD Health", "value": "${smart_status}", "inline": true},
      {"name": "⚠️ Errors (24h)", "value": "${error_count}", "inline": true}${error_field}
    ],
    "footer": {"text": "$(date '+%Y-%m-%d %H:%M:%S')"}
  }]
}
EOF
)

  # Send to Discord. --max-time keeps a cron run from hanging on a dead
  # connection; any 2xx counts as success.
  response=$(curl -s --max-time 30 -o /dev/null -w "%{http_code}" \
    -H "Content-Type: application/json" -d "$payload" "$WEBHOOK_URL")

  if [[ $response == 2[0-9][0-9] ]]; then
    echo "Health report sent successfully"
  else
    echo "Failed to send report (HTTP $response)" >&2
    head -50 <<<"$payload" >&2
    return 1
  fi
}

main "$@"
|
||||||
Reference in New Issue
Block a user