Skip to content

Commit 43e996b

Browse files
Add more Healthchecks to BigBearCasaOS Healthcheck (#18)
* 🔧 feat(healthcheck): Add process resource check

  Adds a new check to the healthcheck script that displays the top 5 CPU- and memory-consuming processes and checks for any zombie processes. This provides more detailed information about the system's resource utilization.

* ✨ feat: Implement comprehensive system health checks

  This commit introduces a comprehensive system health check module to the CasaOS healthcheck script. The new checks include:
  - Network interface status and error/drop monitoring
  - Network latency testing to common targets
  - Filesystem health checks, including inode usage and mount point write access
  - Time synchronization status, including NTP sync
  - Log rotation configuration and large log file detection

  These additional checks provide a more thorough assessment of the overall system health, helping to identify potential issues that could impact the CasaOS application. The goal is to proactively detect and report common system problems, allowing users to address them before they become critical.

* ✨ feat(casaos-healthcheck): Update healthcheck version to 3.4

  This change updates the version of the BigBearCasaOS Healthcheck from 3.3 to 3.4 to reflect the latest improvements and bug fixes in the healthcheck script.
1 parent 33f8153 commit 43e996b

File tree

1 file changed

+332
-1
lines changed

1 file changed

+332
-1
lines changed

casaos-healthcheck/run.sh

Lines changed: 332 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,325 @@ check_dmesg_errors() {
381381
fi
382382
}
383383
384+
# Show the heaviest processes and report zombie processes.
# Globals:   CHECK_MARK, CROSS_MARK (read)
# Outputs:   top-5 tables by CPU and by memory, then a zombie summary line
check_process_resources() {
  print_header "Process Resource Check"

  echo "Top 5 CPU consuming processes:"
  # Line 1 of `ps aux` is the column header, so rows 2-6 are the top five.
  ps aux --sort=-%cpu \
    | awk 'NR > 1 && NR <= 6 {printf "%-20s %5s%%\n", $11, $3}'

  echo -e "\nTop 5 Memory consuming processes:"
  ps aux --sort=-%mem \
    | awk 'NR > 1 && NR <= 6 {printf "%-20s %5s%%\n", $11, $4}'

  # Look at the process state column only. Grepping the whole `ps aux` line
  # for the word "Z" also matches command lines (including the grep itself),
  # which reports zombies that do not exist.
  local zombie_count
  zombie_count=$(ps -eo stat= | awk '$1 ~ /^Z/ {n++} END {print n + 0}')

  if [[ "$zombie_count" =~ ^[0-9]+$ ]] && ((zombie_count > 0)); then
    print_color "0;31" "${CROSS_MARK} Found $zombie_count zombie processes"
  else
    print_color "0;32" "${CHECK_MARK} No zombie processes found"
  fi
}
402+
403+
# Sum the numeric counters stored in the given files.
# Arguments: one or more file paths; missing/unreadable files count as 0
# Outputs:   the total on stdout
_hc_sum_counter_files() {
  local total=0 counter_file value
  for counter_file in "$@"; do
    if [[ -r "$counter_file" ]] && read -r value <"$counter_file" \
      && [[ "$value" =~ ^[0-9]+$ ]]; then
      total=$((total + value))
    fi
  done
  printf '%s\n' "$total"
}

# Report link state, speed and error/drop counters for every non-loopback
# network interface.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Outputs:   one status line per interface, plus a warning when the
#            interface has recorded errors or dropped packets
check_network_interfaces() {
  print_header "Network Interface Check"

  local interface state speed errors drops stats_dir
  while IFS= read -r interface; do
    # Skip loopback
    [[ -z "$interface" || "$interface" == "lo" ]] && continue

    # Check link status
    state=$(ip link show "$interface" \
      | awk '{for (i = 1; i < NF; i++) if ($i == "state") {print $(i + 1); exit}}')
    speed=$(ethtool "$interface" 2>/dev/null | awk '/Speed:/ {print $2}')

    # Read the kernel counters directly. The `ip -s link` table keeps labels
    # and values on separate lines, so matching on the label line yields the
    # word "bytes" rather than a number.
    stats_dir="/sys/class/net/${interface}/statistics"
    errors=$(_hc_sum_counter_files "${stats_dir}/rx_errors" "${stats_dir}/tx_errors")
    drops=$(_hc_sum_counter_files "${stats_dir}/rx_dropped" "${stats_dir}/tx_dropped")

    if [[ "$state" == "UP" ]]; then
      print_color "0;32" "${CHECK_MARK} Interface $interface is UP"
      [[ -n "$speed" ]] && echo "Speed: $speed"
    else
      print_color "0;31" "${CROSS_MARK} Interface $interface is DOWN"
    fi

    if ((errors > 0 || drops > 0)); then
      print_color "0;33" "${WARNING_MARK} $interface has $errors errors and $drops drops"
    fi
  # Names such as "veth0@if5" carry a "@parent" suffix that is not part of the
  # device name; strip it so the later lookups succeed.
  done < <(ip -o link show | awk -F': ' '{sub(/@.*/, "", $2); print $2}')
}
428+
429+
# Ping a fixed set of targets and classify the average round-trip time.
# Globals:   CHECK_MARK, CROSS_MARK (read)
# Outputs:   one line per target: good latency, high latency (> 100 ms),
#            or no response
check_network_latency() {
  print_header "Network Latency Check"
  local targets=("8.8.8.8" "1.1.1.1" "google.com")
  local target avg_ms

  for target in "${targets[@]}"; do
    # Only the "min/avg/max" summary line carries the average. With 100%
    # packet loss that line is absent, and the last line of output is the
    # packet statistics, which must not be read as a latency.
    avg_ms=$(ping -c 3 "$target" 2>/dev/null | awk '
      /min\/avg\/max/ {
        split($0, halves, "= ")
        split(halves[2], rtt, "/")
        print rtt[2]
        exit
      }')

    if [[ ! "$avg_ms" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
      print_color "0;31" "${CROSS_MARK} No response from $target"
    # Pass the value with -v instead of splicing it into the awk program.
    elif awk -v ms="$avg_ms" 'BEGIN {exit !(ms > 100)}'; then
      print_color "0;31" "${CROSS_MARK} High latency to $target: ${avg_ms}ms"
    else
      print_color "0;32" "${CHECK_MARK} Good latency to $target: ${avg_ms}ms"
    fi
  done
}
444+
445+
# Report inode usage per filesystem and verify that ext4/xfs/btrfs/zfs mount
# points accept writes.
# Globals:   CHECK_MARK, CROSS_MARK (read)
# Outputs:   one inode line per filesystem with a numeric usage figure and
#            one writability line per matching mount point
check_filesystem_health() {
  print_header "File System Health Check"

  # Check inode usage
  local fs inode_usage
  while read -r fs _ _ _ inode_usage _; do
    inode_usage=${inode_usage//%/}
    # Skip if inode usage is not a number (this also drops the header row
    # and filesystems that report "-").
    [[ "$inode_usage" =~ ^[0-9]+$ ]] || continue
    if ((inode_usage > 80)); then
      print_color "0;31" "${CROSS_MARK} High inode usage on $fs: $inode_usage%"
    else
      print_color "0;32" "${CHECK_MARK} Inode usage on $fs: $inode_usage%"
    fi
  done < <(df -i)

  # Check mount points
  local mount_point probe
  while IFS= read -r mount_point; do
    # Use a unique probe file: a fixed name could overwrite and then delete
    # a file of the same name that already exists on the mount.
    if probe=$(mktemp "${mount_point%/}/.healthcheck_write_test.XXXXXX" 2>/dev/null); then
      rm -f -- "$probe"
      print_color "0;32" "${CHECK_MARK} Mount point $mount_point is writable"
    else
      print_color "0;31" "${CROSS_MARK} Mount point $mount_point is not writable"
    fi
  # Match on the "type <fs>" field only, and take everything between " on "
  # and " type " so mount points containing spaces stay intact.
  done < <(mount | sed -n -E 's/^.* on (.*) type (ext4|xfs|btrfs|zfs) \(.*$/\1/p')
}
474+
475+
# Report whether the system clock is synchronized.
# Uses timedatectl when available, otherwise falls back to listing ntpq peers.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Outputs:   a sync verdict plus the raw timedatectl status, or the ntpq peer
#            table, or a warning when neither tool is present
check_time_sync() {
  print_header "Time Synchronization Check"

  if command -v timedatectl &>/dev/null; then
    local time_status
    time_status=$(timedatectl status --no-pager)

    # Older systemd labels the field "NTP synchronized"; newer releases print
    # "System clock synchronized". Accept either, otherwise modern systems
    # are always reported as unsynchronized.
    if grep -Eq '(NTP synchronized|System clock synchronized):[[:space:]]*yes' \
      <<<"$time_status"; then
      print_color "0;32" "${CHECK_MARK} NTP is synchronized"
    else
      print_color "0;31" "${CROSS_MARK} NTP is not synchronized"
    fi

    echo "System time status:"
    echo "$time_status"
  elif command -v ntpq &>/dev/null; then
    local ntp_peers
    ntp_peers=$(ntpq -p)
    echo "NTP peers status:"
    echo "$ntp_peers"
  else
    print_color "0;33" "${WARNING_MARK} No time synchronization service found"
  fi
}
499+
500+
# Flag oversized log files and check for a CasaOS logrotate configuration.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Outputs:   a line per *.log / *.gz file larger than 100 MiB under the
#            scanned directories, then the logrotate configuration verdict
check_log_rotation() {
  print_header "Log Rotation Check"

  local log_dirs=("/var/log" "/var/log/casaos")
  local max_log_size=$((100 * 1024 * 1024)) # 100MB
  local dir log size human_size

  for dir in "${log_dirs[@]}"; do
    [[ -d "$dir" ]] || continue
    echo "Checking logs in $dir:"
    while IFS= read -r -d '' log; do
      # GNU/BusyBox stat first, BSD stat as the fallback.
      size=$(stat -c%s -- "$log" 2>/dev/null || stat -f%z -- "$log" 2>/dev/null)
      # A file can vanish between find and stat; skip it rather than feed a
      # non-number to the comparison.
      [[ "$size" =~ ^[0-9]+$ ]] || continue
      if ((size > max_log_size)); then
        human_size=$(numfmt --to=iec-i --suffix=B "$size" 2>/dev/null) \
          || human_size="${size}B"
        print_color "0;31" "${CROSS_MARK} Large log file: $log (${human_size})"
      fi
    # The parentheses make -type f apply to both name patterns; without them
    # "-o" binds looser and *.gz directories or links would be listed too.
    done < <(find "$dir" -type f \( -name "*.log" -o -name "*.gz" \) -print0)
  done

  if [[ -f "/etc/logrotate.d/casaos" ]]; then
    print_color "0;32" "${CHECK_MARK} CasaOS log rotation configured"
  else
    print_color "0;33" "${WARNING_MARK} No CasaOS log rotation configuration found"
  fi
}
524+
525+
# Print the lower-cased value of the first uncommented occurrence of an
# sshd_config keyword (keywords are matched case-insensitively).
# Arguments: $1 - keyword, $2 - path to the sshd_config file
# Outputs:   the value on stdout, or nothing when the keyword is not set
# NOTE(review): Include files and Match blocks are not evaluated here.
_hc_sshd_setting() {
  awk -v key="$1" \
    'tolower($1) == tolower(key) {print tolower($2); exit}' "$2" 2>/dev/null
}

# Basic security audit: SSH hardening flags, failed logins, listening ports.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Outputs:   SSH verdicts (when sshd_config exists), a failed-login warning
#            (when auth.log has matches) and the list of listening sockets
check_security_audit() {
  print_header "Security Audit Check"

  # Check SSH configuration
  local sshd_config="/etc/ssh/sshd_config"
  if [[ -f "$sshd_config" ]]; then
    local root_login password_auth
    root_login=$(_hc_sshd_setting "PermitRootLogin" "$sshd_config")
    password_auth=$(_hc_sshd_setting "PasswordAuthentication" "$sshd_config")

    # Explicit if/else: with "A && B || C" the failure branch also runs
    # whenever B itself returns non-zero.
    if [[ "$root_login" == "no" ]]; then
      print_color "0;32" "${CHECK_MARK} Root SSH login disabled"
    else
      print_color "0;31" "${CROSS_MARK} Root SSH login enabled"
    fi

    if [[ "$password_auth" == "no" ]]; then
      print_color "0;32" "${CHECK_MARK} SSH password authentication disabled"
    else
      print_color "0;31" "${CROSS_MARK} SSH password authentication enabled"
    fi
  fi

  # Check failed login attempts
  local auth_log="/var/log/auth.log"
  if [[ -f "$auth_log" ]]; then
    local failed_attempts
    failed_attempts=$(grep -c "Failed password" "$auth_log" 2>/dev/null)
    if [[ "$failed_attempts" =~ ^[0-9]+$ ]] && ((failed_attempts > 0)); then
      print_color "0;33" "${WARNING_MARK} Found $failed_attempts failed login attempts"
    fi
  fi

  # Check listening ports. net-tools (netstat) is not always installed, so
  # fall back to ss from iproute2.
  echo "Open ports:"
  if command -v netstat &>/dev/null; then
    netstat -tuln | grep LISTEN
  elif command -v ss &>/dev/null; then
    ss -tuln | grep LISTEN
  else
    print_color "0;33" "${WARNING_MARK} Neither netstat nor ss is available"
  fi
}
554+
555+
# Report swap usage, swappiness and (when the kernel exposes it) the memory
# pressure-stall statistics.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Outputs:   swap verdict, swappiness value, raw /proc/pressure/memory
#            content and a pressure verdict based on the "some" avg10 figure
check_memory_pressure() {
  print_header "Memory Pressure Check"

  # Check swap usage and configuration
  local swap_total swap_used swappiness
  swap_total=$(free -m | awk '/^Swap:/ {print $2}')
  swap_used=$(free -m | awk '/^Swap:/ {print $3}')
  swappiness=$(cat /proc/sys/vm/swappiness)

  echo "Swap Configuration:"
  # A missing or non-numeric total is treated like "no swap" instead of
  # feeding an empty string to an integer test.
  if [[ ! "$swap_total" =~ ^[0-9]+$ ]] || ((swap_total == 0)); then
    print_color "0;33" "${WARNING_MARK} No swap space configured"
  else
    [[ "$swap_used" =~ ^[0-9]+$ ]] || swap_used=0
    local swap_percent=$((swap_used * 100 / swap_total))
    if ((swap_percent > 80)); then
      print_color "0;31" "${CROSS_MARK} High swap usage: ${swap_percent}%"
    else
      print_color "0;32" "${CHECK_MARK} Swap usage: ${swap_percent}%"
    fi
  fi
  echo "Swappiness value: $swappiness"

  # Check memory pressure stats if available
  if [[ -f "/proc/pressure/memory" ]]; then
    echo -e "\nMemory Pressure Statistics:"
    local pressure avg10
    pressure=$(cat /proc/pressure/memory)
    echo "$pressure"

    # The file has a "some" and a "full" line, each with its own avg10.
    # Read the "some" line only; gluing both numbers together turns e.g.
    # 12 and 5 into "125" and raises a false alarm.
    avg10=$(awk '$1 == "some" {
      for (i = 2; i <= NF; i++) {
        if ($i ~ /^avg10=/) { print substr($i, 7); exit }
      }
    }' <<<"$pressure")

    if [[ -n "$avg10" ]] && awk -v v="$avg10" 'BEGIN {exit !(v > 50)}'; then
      print_color "0;31" "${CROSS_MARK} High memory pressure detected"
    else
      print_color "0;32" "${CHECK_MARK} Normal memory pressure"
    fi
  fi
}
591+
592+
# Print status, health, restart count and resource usage for every Docker
# container, including stopped ones.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Outputs:   a small report per container, separated by "---"
# Returns:   early (status 0) with a warning when docker is not installed
check_docker_containers_health() {
  print_header "Docker Container Health Check"

  if ! command -v docker &>/dev/null; then
    print_color "0;33" "${WARNING_MARK} Docker not installed"
    return
  fi

  local container status health restarts cpu mem
  # Get all containers including stopped ones
  while IFS= read -r container; do
    [[ -n "$container" ]] || continue
    echo "Container: $container"

    # One inspect call returns state, health and restart count together.
    IFS='|' read -r status health restarts < <(
      docker inspect \
        --format='{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}no health check{{end}}|{{.RestartCount}}' \
        "$container"
    )

    # `docker stats --no-stream` has to sample the container, so ask for CPU
    # and memory in a single call instead of sampling twice.
    IFS='|' read -r cpu mem < <(
      docker stats --no-stream --format "{{.CPUPerc}}|{{.MemPerc}}" "$container"
    )

    case "$status" in
      "running")
        print_color "0;32" "${CHECK_MARK} Status: Running"
        ;;
      "exited")
        print_color "0;31" "${CROSS_MARK} Status: Stopped"
        ;;
      *)
        print_color "0;33" "${WARNING_MARK} Status: $status"
        ;;
    esac

    echo "Health: $health"
    echo "Restart Count: $restarts"
    echo "CPU Usage: $cpu"
    echo "Memory Usage: $mem"
    echo "---"
  done < <(docker ps -a --format "{{.Names}}")
}
634+
635+
# Succeed when a ulimit value is a number strictly below a threshold.
# Arguments: $1 - limit as printed by ulimit, $2 - threshold
# Returns:   0 when $1 is numeric and < $2; 1 otherwise, which includes the
#            literal "unlimited"
_hc_limit_below() {
  [[ "$1" =~ ^[0-9]+$ ]] && (($1 < $2))
}

# Report the open-file and process limits of the current shell and warn when
# they are low.
# Globals:   WARNING_MARK (read)
# Outputs:   both limits, plus a warning for each one below its threshold
check_system_limits() {
  print_header "System Resource Limits Check"

  local max_files max_processes
  max_files=$(ulimit -n)
  max_processes=$(ulimit -u)

  echo "File descriptor limit: $max_files"
  echo "Max user processes: $max_processes"

  # ulimit may print "unlimited", which is not valid in an integer test and
  # can never be "low".
  if _hc_limit_below "$max_files" 65535; then
    print_color "0;33" "${WARNING_MARK} Low file descriptor limit"
  fi

  if _hc_limit_below "$max_processes" 4096; then
    print_color "0;33" "${WARNING_MARK} Low process limit"
  fi
}
652+
653+
# Print a summary of critical issues and warnings.
# Globals:   ERROR_FOUND (read), CHECK_MARK (read)
# Outputs:   the list of critical issues, the list of warnings, or a single
#            "no significant issues" line when both lists are empty
# NOTE(review): ERROR_FOUND is set outside this function; it is assumed to
# hold the literal string "true" or "false" — confirm against the setter.
generate_health_report() {
  print_header "Health Check Summary Report"

  # Initialize arrays for different severity levels
  local -a critical_issues=()
  local -a warnings=()

  # Collect issues from previous checks. Compare as a string: running the
  # variable as a command treats an unset or empty value as success.
  if [[ "${ERROR_FOUND:-false}" == "true" ]]; then
    critical_issues+=("Service log errors detected")
  fi

  # Add disk space issues (-P keeps each filesystem on a single line)
  local disk_usage
  disk_usage=$(df -P / | awk 'NR == 2 {gsub(/%/, "", $5); print $5}')
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && ((disk_usage >= 80)); then
    critical_issues+=("High disk usage: ${disk_usage}%")
  fi

  # Add memory pressure issues; skipped when the total is missing or zero so
  # the percentage never divides by zero.
  local mem_percent
  mem_percent=$(free -m | awk '/^Mem:/ && $2 > 0 {printf "%d", $3 * 100 / $2}')
  if [[ "$mem_percent" =~ ^[0-9]+$ ]] && ((mem_percent > 80)); then
    critical_issues+=("High memory usage")
  fi

  # Add Docker status issues
  if ! docker info &>/dev/null; then
    critical_issues+=("Docker service is not running")
  fi

  # Display report
  local issue warning
  if ((${#critical_issues[@]} > 0)); then
    print_color "0;31" "Critical Issues Found:"
    for issue in "${critical_issues[@]}"; do
      echo "- $issue"
    done
  fi

  if ((${#warnings[@]} > 0)); then
    print_color "0;33" "Warnings:"
    for warning in "${warnings[@]}"; do
      echo "- $warning"
    done
  fi

  if ((${#critical_issues[@]} == 0 && ${#warnings[@]} == 0)); then
    print_color "0;32" "${CHECK_MARK} No significant issues found"
  fi
}
702+
384703
# Main script flow
385704
check_root_privileges
386705
@@ -405,7 +724,7 @@ elif [[ "$1" == "real_test" ]]; then
405724
else
406725
# Normal script execution
407726
# Display Welcome
408-
print_header "BigBearCasaOS Healthcheck V3.3"
727+
print_header "BigBearCasaOS Healthcheck V3.4"
409728
echo "Here are some links:"
410729
echo "https://community.bigbeartechworld.com"
411730
echo "https://github.com/BigBearTechWorld"
@@ -479,6 +798,18 @@ else
479798
check_system_temperature
480799
check_system_updates
481800
check_dmesg_errors
801+
check_process_resources
802+
check_network_interfaces
803+
check_network_latency
804+
check_filesystem_health
805+
check_time_sync
806+
check_log_rotation
807+
check_security_audit
808+
check_memory_pressure
809+
check_system_limits
810+
check_docker_containers_health
811+
812+
generate_health_report
482813
483814
print_header "Health Check Complete"
484815
fi

0 commit comments

Comments
 (0)