Nagios script: check_suncluster

2006-06-01 00:00:00

This script was written at the time I was hired by KPN i-Diensten. It is reproduced/shared here with their permission.

A few of our projects and services are run on Solaris systems running Sun Cluster software. Since there were no Nagios scripts available to perform checks against Sun Cluster I made a basic script that checks the most important factors.

This script performs a different function, depending on the parameter with which it is called. This allows you to define multiple service checks in Nagios, without needing separate check scripts for each.

EDIT:

Oh! Just like my other recent Nagios scripts, check_suncluster comes with a debugging option. Set $DEBUG at the top of the file to anything larger than zero and the script will dump information at various stages of its execution. And like my other, recent scripts it also comes with its own test script.



#!/usr/bin/ksh
#
# Nagios check script for Sun Cluster.
# Written by Thomas Sluyter (nagiosATkilalaDOTnl)
# By request of KPN-IS, i-Provide SYS, the Netherlands
# Last Modified: 25-09-2006
#
# Usage: ./check_suncluster [-t, -q, -g, -G resource-group, -r, -R resource, -i]
#
# Description:
# This script is capable of performing a number of basic checks on a 
# system running Sun Cluster. Depending on the parameter you pass to 
# it, it will check:
# * Transport paths (-t).
# * Quorum (-q).
# * Resource groups (-g).
# * One selected resource group (-G).
# * Resources (-r).
# * One selected resource (-R).
# * IPMP groups (-i).
#
# Limitations:
# This script will only work with Korn shell, due to some funky while
# looping with pipe forking. Bash doesn't handle this very gracefully,
# due to its sub-shell variable scoping. Maybe I really should learn
# to program in Perl.   
#
# Output:
# * Transport paths return a WARN when one of the paths is down and a
#   CRIT when all paths are offline. 
# * Quorum returns a WARN when not all, but enough quorum devices are
#   available. It returns a CRIT when quorum cannot be reached.
# * Resource groups returns a CRIT when a group is offline on all nodes
#   and a WARN if a group is in an unstable state.
# * Resources returns a CRIT when a resource is offline on all nodes
#   and a WARN if a resource is in an unstable state.
# * IPMP groups returns a CRIT when a group is offline.
#
# Other notes:
# Aside from the debugging output that I've built into most of my recent
# scripts, this check script will also have a testing mode hacked on, as
# a bag on the side. This testing mode is only engaged when the test_check_suncluster
# script is being run and will intentionally "break" a few things, to 
# verify the failure options of this check script.
#

# Enabling the following dumps information into DEBUGFILE at various
# stages during the execution of this script.
DEBUG=0
DEBUGFILE="/tmp/foobar"

# Testing mode is requested through a marker file: it is only switched on
# when /tmp/neko-wa-baka exists and contains exactly "Nyo!". No file, an
# empty file or any other contents leave testing mode off.
TESTING="0"
if [ -f /tmp/neko-wa-baka ]
then
	# The contents are quoted on purpose: an empty or multi-word file would
	# otherwise turn this test into a syntax error reported on stderr.
	if [ "`cat /tmp/neko-wa-baka`" = "Nyo!" ]
	then
	   TESTING="1"
	fi
fi


### REQUISITE NAGIOS USER INTERFACE STUFF ###

# You may have to change this, depending on where you installed your
# Nagios plugins
PATH="/usr/bin:/usr/sbin:/bin:/sbin:/usr/cluster/bin"
LIBEXEC="/usr/local/nagios/libexec"
PROGNAME="check_suncluster"
# The STATE_* exit codes used below are not defined in this file; they are
# expected to come from utils.sh.
. "$LIBEXEC/utils.sh"

# Start every debug run with a fresh log. -f keeps rm quiet when there is
# no log yet (for instance on the very first debug run).
[ $DEBUG -gt 0 ] && rm -f "$DEBUGFILE"

print_usage() {
        # Two-line synopsis: the check flags first, then the help flag.
        cat <<EOF
Usage: $PROGNAME [-t, -q, -g, -G resource-group, -r, -R resource, -i]
Usage: $PROGNAME --help
EOF
}

print_help() {
        # Blank line and the usage synopsis first, then the static help text.
        echo ""
        print_usage
        # Quoted delimiter: the text below is emitted literally.
        cat <<'EOF'

Sun Cluster check plugin for Nagios

-t: check transport paths
-q: check quorum
-g: check resource groups
-G: check one individual resource group
-r: check all resources
-R: check one individual resources
-i: check IPMP groups

This plugin not developped by the Nagios Plugin group.
Please do not e-mail them for support on this plugin, since
they won't know what you're talking about :P

For contact info, read the plugin itself...
EOF
}


### SUB-ROUTINE DEFINITIONS ### 

# Check the cluster transport paths reported by "scstat -W".
# Reads:   DEBUG, DEBUGFILE, TESTING, STATE_CRITICAL, STATE_WARNING.
# Sets:    TPLINES, TOTAL (number of paths), COUNT (number of online paths).
# Exits:   STATE_CRITICAL when no path is online, STATE_WARNING when some
#          but not all paths are online; returns normally otherwise.
function check_transport_paths
{
[ $DEBUG -gt 0 ] && echo "Starting check_transport_path subroutine." >> $DEBUGFILE

	# Take one snapshot so TOTAL and COUNT describe the same moment.
	TPLINES=`scstat -W | grep "Transport path:"`

	# Counting is done by grep/awk rather than in a "while read" loop fed by
	# a pipe: that way the result does not depend on the shell running the
	# loop in the current process, and no loop variable can overwrite $PATH.
	TOTAL=`echo "$TPLINES" | grep -c "Transport path:"`
	COUNT=`echo "$TPLINES" | awk '$6 == "online" { n++ } END { print n+0 }'`

[ $DEBUG -gt 0 ] && echo "$TPLINES" | grep "Transport path:" | awk '{ print "Path: " $3 " has status " $6 }' >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "Count: $COUNT of $TOTAL transport paths online." >> $DEBUGFILE
	# Testing mode pretends every path is down.
[ $TESTING -gt 0 ] && COUNT="0"

	if [ $COUNT -lt 1 ]
	then
	   echo "NOK - No transport paths online."
	   exit $STATE_CRITICAL
	elif [ $COUNT -lt $TOTAL ]
	then
	   echo "NOK - One or more transport paths offline."
	   exit $STATE_WARNING
	fi
}

# Check the quorum state reported by "scstat -q".
# Reads:   DEBUG, DEBUGFILE, TESTING, STATE_UNKNOWN, STATE_CRITICAL,
#          STATE_WARNING.
# Sets:    QSTAT, NEED (votes needed), PRES (votes present), BADVOTE.
# Exits:   STATE_UNKNOWN when the vote counts cannot be read,
#          STATE_CRITICAL when fewer votes are present than needed,
#          STATE_WARNING when quorum is reached but a vote is not Online;
#          returns normally otherwise.
function check_quorum
{
[ $DEBUG -gt 0 ] && echo "Starting check_quorum subroutine." >> $DEBUGFILE
	# One snapshot, so the summary and the per-vote lines belong together.
	QSTAT=`scstat -q`
	NEED=`echo "$QSTAT" | grep "votes needed:" | awk '{print $4}'`
	PRES=`echo "$QSTAT" | grep "votes present:" | awk '{print $4}'`

[ $DEBUG -gt 0 ] && echo "Quorum needed: $NEED" >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "Quorum present: $PRES" >> $DEBUGFILE

	# Testing mode pretends no votes are present.
[ $TESTING -gt 0 ] && PRES="0"

	# Without two plain numbers there is nothing to compare. Bail out here:
	# an empty pair would make the numeric test below succeed by accident.
	for VOTES in "$NEED" "$PRES"
	do
		case "$VOTES" in
			''|*[!0-9]*)
				echo "UNKNOWN - Unable to read quorum votes from scstat."
				exit $STATE_UNKNOWN
				;;
		esac
	done

	if [ "$PRES" -lt "$NEED" ]
	then
[ $DEBUG -gt 0 ] && echo "Not enough quorum." >> $DEBUGFILE
		echo "NOK - Not enough quorum votes present."
		exit $STATE_CRITICAL
	fi

[ $DEBUG -gt 0 ] && echo "Enough quorum votes." >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "$QSTAT" | grep "votes:" | awk '{ print "Vote: " $3 " has status " $6 "." }' >> $DEBUGFILE

	# Name of the first vote whose status is not Online, if any. The exit
	# below runs in the function's own shell, never inside a pipeline.
	BADVOTE=`echo "$QSTAT" | grep "votes:" | awk '$6 != "Online" { print $3; exit }'`
	if [ -n "$BADVOTE" ]
	then
		echo "NOK - Quorum vote $BADVOTE not available."
		exit $STATE_WARNING
	fi
}

# Check every resource group listed by "scstat -g".
# Reads:   DEBUG, DEBUGFILE, TESTING, STATE_CRITICAL, STATE_WARNING.
# Sets:    GLINES, GROUP, ONLINE, WEIRD (plus the loop fields LABEL, NAME,
#          NODE, STATE).
# Exits:   STATE_CRITICAL for the first group that is Online on no node,
#          STATE_WARNING for the first group with an instance that is
#          neither Online nor Offline; returns normally otherwise.
function check_resource_groups
{
[ $DEBUG -gt 0 ] && echo "Starting check_resource_groups subroutine." >> $DEBUGFILE
	# One snapshot of the per-node "Group:" lines for all groups.
	GLINES=`scstat -g | awk '$1 == "Group:"'`

	for GROUP in `echo "$GLINES" | awk '{print $2}' | sort -u`
	do
		ONLINE=0
		WEIRD=0
		# The loop is fed by a here-document, not a pipe, so the counters
		# are still set after "done".
		while read -r LABEL NAME NODE STATE
		do
			# Compare the whole name: "web" must not pick up "web2".
			[ "$NAME" = "$GROUP" ] || continue
			case "$STATE" in
				*Online*)  let ONLINE=ONLINE+1 ;;
				*Offline*) ;;
				*)         let WEIRD=WEIRD+1 ;;
			esac
		done <<EOF
$GLINES
EOF
[ $DEBUG -gt 0 ] && echo "Resource Group $GROUP has $ONLINE instances online." >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "Resource Group $GROUP has $WEIRD instances in a weird state." >> $DEBUGFILE
		# Testing mode pretends the group is online nowhere.
[ $TESTING -gt 0 ] && ONLINE="0"
		if [ $ONLINE -lt 1 ]
		then
		   echo "NOK - Resource group $GROUP not online."
		   exit $STATE_CRITICAL
		fi
		# A single instance in an in-between state already makes the
		# group unstable.
		if [ $WEIRD -gt 0 ]
		then
		   echo "NOK - Resource group $GROUP is in an unstable state."
		   exit $STATE_WARNING
		fi
	done
}

# Check the single resource group named in $RGROUP against "scstat -g".
# Reads:   RGROUP, DEBUG, DEBUGFILE, TESTING, STATE_CRITICAL, STATE_WARNING.
# Sets:    GLINES, ONLINE, WEIRD (plus the loop fields LABEL, NAME, NODE,
#          STATE).
# Exits:   STATE_CRITICAL when the group is Online on no node (this includes
#          a group name scstat does not list), STATE_WARNING when an
#          instance is neither Online nor Offline; returns normally otherwise.
function check_resource_grp
{
[ $DEBUG -gt 0 ] && echo "Starting check_resource_grp subroutine." >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "Selected group: $RGROUP" >> $DEBUGFILE
	# Only the per-node "Group:" lines carry a group state.
	GLINES=`scstat -g | awk '$1 == "Group:"'`
	ONLINE=0
	WEIRD=0
	# The loop is fed by a here-document, not a pipe, so the counters are
	# still set after "done".
	while read -r LABEL NAME NODE STATE
	do
		# Compare the whole name, so that neither other groups nor
		# resources containing $RGROUP in their name are counted.
		[ "$NAME" = "$RGROUP" ] || continue
		case "$STATE" in
			*Online*)  let ONLINE=ONLINE+1 ;;
			*Offline*) ;;
			*)         let WEIRD=WEIRD+1 ;;
		esac
	done <<EOF
$GLINES
EOF
[ $DEBUG -gt 0 ] && echo "Resource Group $RGROUP has $ONLINE instances online." >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "Resource Group $RGROUP has $WEIRD instances in a weird state." >> $DEBUGFILE
	# Testing mode pretends the group is online nowhere.
[ $TESTING -gt 0 ] && ONLINE="0"
	if [ $ONLINE -lt 1 ]
	then
	   echo "NOK - Resource group $RGROUP not online."
	   exit $STATE_CRITICAL
	fi
	# A single instance in an in-between state already makes the group
	# unstable.
	if [ $WEIRD -gt 0 ]
        then
           echo "NOK - Resource group $RGROUP is in an unstable state."
           exit $STATE_WARNING
        fi
}

# Check every resource listed by "scstat -g".
# Reads:   DEBUG, DEBUGFILE, TESTING, STATE_CRITICAL, STATE_WARNING.
# Sets:    RLINES, RESOURCES, RESOURCE, ONLINE, WEIRD (plus the loop fields
#          LABEL, NAME, NODE, STATE, REST).
# Exits:   STATE_CRITICAL for the first resource that is Online on no node,
#          STATE_WARNING for the first resource with an instance whose state
#          is neither Online nor Offline; returns normally otherwise.
function check_resources
{
[ $DEBUG -gt 0 ] && echo "Starting check_resources subroutine." >> $DEBUGFILE
	# One snapshot of the per-node "Resource:" lines for all resources.
	RLINES=`scstat -g | awk '$1 == "Resource:"'`
	RESOURCES=`echo "$RLINES" | awk '{print $2}' | sort -u`
[ $DEBUG -gt 0 ] && echo "List of resources to check: $RESOURCES" >> $DEBUGFILE
	# Unquoted on purpose: the list is split into one word per resource.
	for RESOURCE in $RESOURCES
	do
		ONLINE=0
		WEIRD=0
		# Field 4 is the state; the status message that follows it lands
		# in REST and is ignored. The here-document keeps the counters in
		# this shell.
		while read -r LABEL NAME NODE STATE REST
		do
			# Compare the whole name: "nfs" must not pick up "nfs-lh".
			[ "$NAME" = "$RESOURCE" ] || continue
			case "$STATE" in
				*Online*)  let ONLINE=ONLINE+1 ;;
				*Offline*) ;;
				*)         let WEIRD=WEIRD+1 ;;
			esac
		done <<EOF
$RLINES
EOF
[ $DEBUG -gt 0 ] && echo "Resource $RESOURCE has $ONLINE instances online." >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "Resource $RESOURCE has $WEIRD instances in a weird state." >> $DEBUGFILE
		# Testing mode pretends the resource is online nowhere.
[ $TESTING -gt 0 ] && ONLINE="0"
		if [ $ONLINE -lt 1 ]
		then
		   echo "NOK - Resource $RESOURCE not online."
		   exit $STATE_CRITICAL
		fi
		# A single instance in an in-between state already makes the
		# resource unstable.
		if [ $WEIRD -gt 0 ]
		then
		   echo "NOK - Resource $RESOURCE is in an unstable state."
		   exit $STATE_WARNING
		fi
	done
}

# Check the single resource named in $RSRCE against "scstat -g".
# Reads:   RSRCE, DEBUG, DEBUGFILE, TESTING, STATE_CRITICAL, STATE_WARNING.
# Sets:    RLINES, ONLINE, WEIRD (plus the loop fields LABEL, NAME, NODE,
#          STATE, REST).
# Exits:   STATE_CRITICAL when the resource is Online on no node (this
#          includes a name scstat does not list), STATE_WARNING when an
#          instance is neither Online nor Offline; returns normally otherwise.
function check_rsrce
{
[ $DEBUG -gt 0 ] && echo "Starting check_rsrce subroutine." >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "Selected resource: $RSRCE" >> $DEBUGFILE
	RLINES=`scstat -g | awk '$1 == "Resource:"'`
	ONLINE=0
	WEIRD=0
	# Field 4 is the state; the status message that follows it lands in
	# REST and is ignored. The here-document keeps the counters in this
	# shell.
	while read -r LABEL NAME NODE STATE REST
	do
		# Compare the whole name: "nfs" must not pick up "nfs-lh".
		[ "$NAME" = "$RSRCE" ] || continue
		case "$STATE" in
			*Online*)  let ONLINE=ONLINE+1 ;;
			*Offline*) ;;
			*)         let WEIRD=WEIRD+1 ;;
		esac
	done <<EOF
$RLINES
EOF
	# All messages name $RSRCE, the resource this function was asked about.
[ $DEBUG -gt 0 ] && echo "Resource $RSRCE has $ONLINE instances online." >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "Resource $RSRCE has $WEIRD instances in a weird state." >> $DEBUGFILE
	# Testing mode pretends the resource is online nowhere.
[ $TESTING -gt 0 ] && ONLINE="0"
	if [ $ONLINE -lt 1 ]
	then
	   echo "NOK - Resource $RSRCE not online."
	   exit $STATE_CRITICAL
	fi
	# A single instance in an in-between state already makes the resource
	# unstable.
	if [ $WEIRD -gt 0 ]
        then
           echo "NOK - Resource $RSRCE is in an unstable state."
           exit $STATE_WARNING
        fi
}

# Check the IPMP groups reported by "scstat -i".
# Reads:   DEBUG, DEBUGFILE, TESTING, STATE_CRITICAL.
# Sets:    ILINES, BADGROUP.
# Exits:   STATE_CRITICAL for the first "IPMP Group:" line whose fifth field
#          is not "Online" (a line without a status counts as not online);
#          in testing mode the first listed line is always reported.
#          Returns normally otherwise.
# NOTE(review): field 3 is reported as the group name, as before. Verify
# against local "scstat -i" output whether that column holds the group or
# the node name.
function check_ipmp
{
[ $DEBUG -gt 0 ] && echo "Starting check_ipmp subroutine." >> $DEBUGFILE
	ILINES=`scstat -i | grep "IPMP Group:"`
[ $DEBUG -gt 0 ] && echo "$ILINES" | grep "IPMP Group:" | awk '{ print "IPMP Group: " $3 " has status " $5 }' >> $DEBUGFILE

	# First entry that is not Online, if any. Deciding on the exit here,
	# outside of any pipeline, ends the script no matter how the shell
	# runs pipeline stages.
	BADGROUP=`echo "$ILINES" | grep "IPMP Group:" | awk '$5 != "Online" { print $3; exit }'`

	# Testing mode reports the first listed entry as broken.
	if [ $TESTING -gt 0 ]
	then
		BADGROUP=`echo "$ILINES" | grep "IPMP Group:" | awk '{ print $3; exit }'`
	fi

	if [ -n "$BADGROUP" ]
	then
	   echo "NOK - IPMP group $BADGROUP not online."
	   exit $STATE_CRITICAL
	fi
}

### THE MAIN ROUTINE FINALLY STARTS ###

[ $DEBUG -gt 0 ] && echo "Starting main routine." >> $DEBUGFILE

# Exactly one check is selected per invocation; no option at all is a
# usage error.
if [ $# -lt 1 ]
then
	print_usage
	exit $STATE_UNKNOWN
fi

[ $DEBUG -gt 0 ] && echo "At least one argument." >> $DEBUGFILE
[ $DEBUG -gt 0 ] && echo "" >> $DEBUGFILE

# Dispatch on the first argument. Each check_* routine exits by itself with
# a non-OK state when it finds a problem and simply returns when all is well.
case "$1" in
	--help|-h) print_help; exit $STATE_OK;;
	-t) check_transport_paths;;
	-q) check_quorum;;
	-g) check_resource_groups;;
	-G)
		# -G is meaningless without the name of a resource group.
		if [ -z "$2" ]
		then
			print_usage
			exit $STATE_UNKNOWN
		fi
		RGROUP="$2"; check_resource_grp;;
	-r) check_resources;;
	-R)
		# -R is meaningless without the name of a resource.
		if [ -z "$2" ]
		then
			print_usage
			exit $STATE_UNKNOWN
		fi
		RSRCE="$2"; check_rsrce;;
	-i) check_ipmp;;
	*) print_usage; exit $STATE_UNKNOWN;;
esac

[ $DEBUG -gt 0 ] && echo "No problems. Exiting normally." >> $DEBUGFILE

# None of the subroutines forced an exit with a non-OK state before we got
# here, so report success.
echo "OK - Everything running like it should"
exit $STATE_OK

#!/usr/bin/bash

# Run the plugin once and report its exit code.
# Arguments: $1 - line to print before the run; all remaining arguments are
#            handed to the plugin unchanged.
# Globals:   CHECK_SUNCLUSTER (optional) - path of the plugin to exercise;
#            defaults to /usr/local/nagios/libexec/check_suncluster.
# Outputs:   the label, whatever the plugin prints, "Exit code is N." and a
#            blank line.
function _run_check()
{
	local label="$1"
	shift
	echo "$label"
	"${CHECK_SUNCLUSTER:-/usr/local/nagios/libexec/check_suncluster}" "$@"
	echo "Exit code is $?."
	echo ""
}

# Exercise every mode of check_suncluster once: no parameters, -t, -q, -g,
# -G for each resource group scstat lists, -r, -R for each resource scstat
# lists, and -i.
function testrun()
{
	local group resource

	_run_check "Running without parameters."
	_run_check "Testing transport paths." -t
	_run_check "Quorum votes." -q
	_run_check "Checking all resource groups." -g

	echo "Checking individual resource groups."
	# Unquoted on purpose: one word per group name.
	for group in $(scstat -g | grep "Group:" | awk '{print $2}' | sort -u)
	do
		_run_check "Running for group $group." -G "$group"
	done

	_run_check "Checking all resources." -r

	echo "Checking individual resources."
	# Unquoted on purpose: one word per resource name.
	for resource in $(scstat -g | grep "Resource:" | awk '{print $2}' | sort -u)
	do
		_run_check "Running for resource $resource." -R "$resource"
	done

	_run_check "Checking IPMP groups." -i
}

function breakstuff()
{
	# Announce the simulated-failure pass; nothing on the cluster is touched.
	cat <<'EOF'

Now it's time to start breaking things! Gruaargh!
Mind you, it's all fake and simulated. I am not changing -anything-
about the cluster itself.

EOF

	# Write the marker file with the contents "Nyo!".
	printf '%s\n' "Nyo!" > /tmp/neko-wa-baka
}

# Remove what earlier runs may have left in /tmp; any complaint from rm
# (such as a file that does not exist) is discarded.
echo "Starting clean"
rm /tmp/neko-wa-baka /tmp/foobar &>/dev/null
echo

# One pass as-is, then create the marker file and run the same pass again.
testrun
breakstuff
testrun

# Remove the marker file again.
echo "Starting clean at the end"
rm /tmp/neko-wa-baka &>/dev/null
echo

kilala.nl tags: , , ,

View or add comments (curr. 2)