#!/usr/bin/ksh
#
# Tivoli plugin to keep an eye on client queues on the master
#
# Written by Thomas Sluyter (nagiosATkilalaDOTnl)
# By request of Unixerius, the Netherlands
# Last Modified: 12-01-2009
#
# Description:
#   Every time a BoKS client becomes unreachable the master server will retain
#   updates for this client in a queue. Over time this queue will continue to
#   grow, containing all manner of updates to /etc/passwd, /etc/shadow and so
#   forth. Without these updates the client will become out of date and
#   known-good passwords will stop working. You could lose access to the root
#   account if you don't keep a history of the previous passwords!
#   This simple Tivoli plugin will warn you of any client queues that exceed a
#   certain size or age, with both thresholds adjustable from the command line.
#
# Usage:
#   ./check_boks_queues [-m MESS] [-a AGE] [-d -o FILE] [-f FILE]
#
#   -m MESS   Threshold for amount of messages. Default is 40 messages.
#   -a AGE    Threshold for age of client queue. Default is 24 hours.
#   -f FILE   Log file listing the queues that are over threshold.
#             Default logs into $BOKS_var.
#   -d        Debug mode. Provides error logging.
#   -o FILE   Output file for debugging logs. Required when -d is passed.
#
#   The -a parameter requires BoKS 6.5.x. It DOES NOT work in 6.0.x and older
#   versions.
#
# Example:
#   ./check_boks_queues -m 50 -f /tmp/over50.txt
#   ./check_boks_queues -a 168 -f /tmp/oneweek.txt
#
# Limitations:
#   * This script must be run on a BoKS master server.
#   * This script must be run as root from the BoKS shell.
#   * This script was tested with BoKS 6.0.x and 6.5.x.
#   * Checking the age of a message queue is only supported in BoKS 6.5.x.
#   * If the script malfunctions, you may want to verify the platform specific
#     variables under "SETTING THINGS UP".
#
# Output:
#   This script is meant to be called as a Tivoli numeric script.
#   Hence both the output and the exit code are a single digit.
#   Please configure your numeric script calls accordingly:
#
#   0  OK        Everything OK.
#   1  WARNING   Problem with script parameters or input.
#   2  SEVERE    One or more clients have crossed the threshold.
#   3  CRITICAL  Not used.
#
# Other notes:
#   * Use the debugging option to find out the cause of your problems if this
#     script ever misbehaves.
#   * The "\n" / "\t" sequences in the echo calls rely on the ksh builtin echo
#     expanding backslash escapes.
#


### SETTING THINGS UP ###

PROGNAME=$(basename "$0")
PATH="/usr/bin:/usr/sbin:/bin:/sbin:$PATH"

MESS="40"        # Default if -m parameter is not passed.
AGE="24"         # Default if -a parameter is not passed.
DEBUG="0"        # Default if -d parameter is not passed.
DEBUGFILE=""     # Default if -o parameter is not passed.
LOGFILE="$BOKS_var/$PROGNAME.$(date +%Y%m%d)"   # Default if -f is not passed.

STATE_OK="0"     # Exit code when everything's OK
STATE_WA="1"     # Exit code when there might be a problem
STATE_SE="2"     # Exit code when there's a problem
STATE_CR="3"     # Exit code when there's a problem

# Setting up platform specific options.
# PING and PINGPARAM are defined per platform but are not used further down.
case $(uname) in
  AIX)
    BOKSSBIN="/opt/boksm/sbin"
    BOKSBIN="/opt/boksm/bin"
    BOKSLIB="/opt/boksm/lib"
    BOKSTMP="/var/opt/boksm/tmp"
    BOKSETC="/etc/opt/boksm"
    AWK="/usr/bin/awk"
    GREP="/usr/bin/grep"
    PING="/bin/ping -c 1"
    PINGPARAM=""
    ;;
  Linux)
    BOKSSBIN="/opt/boksm/sbin"
    BOKSBIN="/opt/boksm/bin"
    BOKSLIB="/opt/boksm/lib"
    BOKSTMP="/var/opt/boksm/tmp"
    BOKSETC="/etc/opt/boksm"
    AWK="/bin/awk"
    GREP="/bin/grep"
    PING="/bin/ping -c 1"
    PINGPARAM=""
    ;;
  SunOS)
    BOKSSBIN="/opt/boksm/sbin"
    BOKSBIN="/opt/boksm/bin"
    BOKSLIB="/opt/boksm/lib"
    BOKSETC="/etc/opt/boksm"
    BOKSTMP="/var/opt/boksm/tmp"
    AWK="/usr/xpg4/bin/awk"
    GREP="/usr/xpg4/bin/grep"
    PING="/usr/sbin/ping"
    PINGPARAM="1"
    ;;
  *)
    echo "\nERROR: OS is not supported."
    exit 1
    ;;
esac

# Quick check to see if you're running as root with the BoKS shell.
# "id | grep" is used rather than "id -u" so the platform's default id works.
ROOTID=$(id | $GREP "uid=0")
if [[ -z $BOKS_etc || -z $ROOTID ]]
then
  echo "ERROR: Script must be run as root, from the BoKS shell.\n"
  exit 1
fi

# Reduce the version to MAJOR and MINOR glued together (e.g. 6.5 -> 65) so it
# can be compared against 65 below. Only the first two components are kept:
# stripping every dot would turn 6.0.3 into 603, which wrongly passes ">= 65".
# NOTE(review): assumes boksversion prints a line of the form
# "BoKS <major>.<minor>[.<patch>] ..." -- confirm on a real master.
BOKSVERSION=$("$BOKSLIB/boksversion" \
  | $AWK '/^BoKS/ { split($2, v, "."); print v[1] v[2]; exit }')


### REQUISITE COMMAND LINE STUFF ###

# Print the usage text on stdout and exit with code 1.
print_usage() {
  echo "\nUsage:"
  echo "$PROGNAME [-m MESS] [-a AGE] [-d -o FILE] [-f FILE]"
  echo "\n-m MESS\t\tThreshold for amount of messages. Default is 40 messages."
  echo "-a AGE\t\tThreshold for age of client queue. Default is 24 hours."
  echo "-f FILE\t\tLog file that queues that are over threshold. Default logs into $BOKS_var."
  echo "-d\t\tDebug mode. Provides error logging. "
  echo "-o FILE\t\tOutput file for debugging logs. Required when -d is passed."
  echo "\nThe -a parameter requires BoKS 6.5.x. It DOES NOT work in 6.0.x and older versions."
  echo "\nExample: "
  echo "./$PROGNAME -m 50 -f /tmp/over50.txt"
  echo "./$PROGNAME -a 168 -f /tmp/oneweek.txt\n"
  exit 1
}

# Succeed only when $1 is a non-empty string made up entirely of digits.
# A case pattern is used instead of "echo | grep [A-z]": the unquoted bracket
# expressions could be glob-expanded against the current directory, and values
# such as "4.5" slipped through only to break the -gt comparisons later on.
IsNumber() {
  case "$1" in
    ''|*[!0-9]*) return 1 ;;
    *)           return 0 ;;
  esac
}

# Zero parameters are allowed. Default values will be used.
#[[ $# -eq 0 ]] && print_usage

while [[ $# -gt 0 ]]
do
  case "$1" in
    -m)
      if IsNumber "$2"
      then
        MESS="$2"; shift
      else
        echo "\nERROR: Message threshold incorrectly specified. Use -m.\n\n"; exit 1
      fi
      ;;
    -a)
      if IsNumber "$2"
      then
        if [[ $BOKSVERSION -ge 65 ]]
        then
          AGE="$2"; shift
        else
          echo "\nERROR: Age threshold not supported in BoKS <6.5.\n\n"; exit 1
        fi
      else
        echo "\nERROR: Age threshold (hours) incorrectly specified. Use -a.\n\n"; exit 1
      fi
      ;;
    -f)
      # The argument must be present and must not look like another option.
      if [[ -n $2 && $2 != -* ]]
      then
        if [[ -d $(dirname "$2") ]]
        then
          LOGFILE="$2"
          echo > "$LOGFILE"; chown root:root "$LOGFILE"; chmod 0644 "$LOGFILE"
          shift
        else
          echo "\nERROR: Output directory for log file does not exist. Use -f.\n\n"
          print_usage
        fi
      else
        echo "\nERROR: Log file not specified. Use -f.\n\n"
        print_usage
      fi
      ;;
    -d)
      DEBUG="1"
      ;;
    -o)
      if [[ -n $2 && $2 != -* ]]
      then
        if [[ -d $(dirname "$2") ]]
        then
          DEBUGFILE="$2"
          echo > "$DEBUGFILE"; chown root:root "$DEBUGFILE"; chmod 0600 "$DEBUGFILE"
          shift
        else
          echo "\nERROR: Output directory for debug log does not exist. Use -o.\n\n"
          print_usage
        fi
      else
        echo "\nERROR: Debug log output file not specified. Use -o.\n\n"
        print_usage
      fi
      ;;
    *)
      # Unknown options and stray arguments both end in the usage text.
      print_usage
      ;;
  esac
  shift
done

# -d is documented as requiring -o. Without this check every debug statement
# below would redirect to an empty file name and fail.
if [[ $DEBUG -gt 0 && -z $DEBUGFILE ]]
then
  echo "\nERROR: Debug log output file not specified. Use -o.\n\n"
  print_usage
fi

# Some final checks and initialisation
if [[ $DEBUG -gt 0 ]]
then
  {
    echo "=== SETUP ==="
    echo "OS name is $(uname)"
    echo "BOKSVERSION is $BOKSVERSION"
    echo "BOKSSBIN is $BOKSSBIN"
    echo "BOKSBIN is $BOKSBIN"
    echo "BOKSLIB is $BOKSLIB"
    echo "BOKSETC is $BOKSETC \n"
    echo "Parameters passed to script are:"
    echo "MESS = $MESS"
    echo "AGE = $AGE"
    echo "DEBUG = $DEBUG"
    echo "DEBUGFILE = $DEBUGFILE \n"
    echo "LOGFILE = $LOGFILE \n"
  } >> "$DEBUGFILE"
fi


### DEFINING SUBROUTINES ###

# Print the numeric state on stdout and exit with that same value, so that the
# output and the exit code are the same single digit.
#   $1 - state to report (one of the STATE_* values)
function ExitScript {
  EXIT="$1"
  echo "$EXIT"
  exit "$EXIT"
}

# Walk through the client queues reported by boksdiag and log every queue that
# is over the message threshold (and, on BoKS >= 6.5, over the age threshold).
# Globals read:    BOKSLIB BOKSSBIN BOKSVERSION MESS AGE DEBUG DEBUGFILE LOGFILE
# Globals written: COUNT (queues inspected), FAIL (queues over threshold),
#                  plus the scratch variables QUEUES HOST QUEUE OLDEST HOURS NAME
TestQueues() {
  [[ $DEBUG -gt 0 ]] && echo "\n=== STARTING TestQueues ===\n" >> "$DEBUGFILE"

  # Start every run with a fresh, root-owned log file.
  rm "$LOGFILE" 2>/dev/null; touch "$LOGFILE"; chown root:root "$LOGFILE"; chmod 0644 "$LOGFILE"

  # By reading the manual page for boksdiag and the boksdiag-internal help
  # function I learnt about the -name and -age flags that I hadn't known about
  # for four years.
  # BoKS > boksdiag fque -bridge -name -age
  # Host                      queued msgs   age(h:m:s)
  # linux2(192.168.0.5)       8             80:39:28
  # solaris2(192.168.0.3)     5             51:6:45
  # linux3(192.168.0.6)       2             0:27:6

  [[ $DEBUG -gt 0 ]] && "$BOKSLIB/boksdiag" fque -bridge -name -age >> "$DEBUGFILE" 2>&1

  QUEUES=$("$BOKSLIB/boksdiag" fque -bridge -name -age 2>/dev/null | $GREP -v '^Host')

  # The loop is fed from a here-document instead of a pipeline. Shells that
  # fork the last stage of a pipeline (pdksh, mksh, bash) would otherwise throw
  # away the COUNT and FAIL updates when the loop ends.
  while read -r HOST QUEUE OLDEST
  do
    # An empty boksdiag result still yields one blank here-document line.
    [[ -z $HOST ]] && continue

    if [[ $BOKSVERSION -ge 65 ]]
    then
      # The age column is h:m:s; only the hours are compared with AGE.
      HOURS=${OLDEST%%:*}
      if [[ ${QUEUE:-0} -gt $MESS || ${HOURS:-0} -gt $AGE ]]
      then
        echo "$HOST is over threshold, with $QUEUE messages (over $OLDEST hours old)." >> "$LOGFILE"
        FAIL=$((FAIL + 1))
        if [[ $DEBUG -gt 0 ]]
        then
          echo "\n$HOST is over threshold, with $QUEUE messages (over $OLDEST hours old)." >> "$DEBUGFILE"
          echo "FAIL has now increased to $FAIL." >> "$DEBUGFILE"
        fi
      fi
    else
      if [[ ${QUEUE:-0} -gt $MESS ]]
      then
        # Look the host up in hostadm's listing to get a name for the report.
        # -F matches $HOST literally, so dots in an address are not wildcards.
        # stdin is detached so hostadm cannot eat the loop's remaining input.
        NAME=$("$BOKSSBIN/hostadm" -l -S </dev/null | $GREP -F "$HOST" | $AWK '{print $1}')
        echo "$NAME($HOST) is over threshold, with $QUEUE messages. Age unknown." >> "$LOGFILE"
        FAIL=$((FAIL + 1))
        if [[ $DEBUG -gt 0 ]]
        then
          echo "\n$NAME($HOST) is over threshold, with $QUEUE messages. Age unknown." >> "$DEBUGFILE"
          echo "FAIL has now increased to $FAIL." >> "$DEBUGFILE"
        fi
      fi
    fi

    COUNT=$((COUNT + 1))
  done <<EOF
$QUEUES
EOF
}


### FINALLY, THE MAIN ROUTINE ###

[[ $DEBUG -gt 0 ]] && echo "\n=== STARTING MAIN PHASE ===\n $(date) \n" >> "$DEBUGFILE"

COUNT="0"; FAIL="0"
TestQueues

[[ $DEBUG -gt 0 ]] && echo "\nRan for $COUNT queues, $FAIL over threshold.\n" >> "$DEBUGFILE"
echo "\nRan for $COUNT queues, $FAIL over threshold.\n" >> "$LOGFILE"

if [[ $FAIL -gt 0 ]]
then
  ExitScript $STATE_SE
else
  ExitScript $STATE_OK
fi