Difference between revisions of "Monitoring openvz resources using nagios and snmp"
(→nagios plugin: use source tag) |
|||
| Line 220: | Line 220: | ||
</source> | </source> | ||
| + | === nagios plugin/failcnt in python === | ||
| + | <source lang=python> | ||
| + | #!/usr/bin/python | ||
| + | # Copyright (C) 2008 Christian Benke | ||
| + | # Distributed under the terms of the GNU General Public License v2 | ||
| + | # v0.1 2008-04-03 | ||
| + | # Christian Benke <benkokakao gmail com> | ||
| + | |||
| + | import string | ||
| + | import pickle | ||
| + | import sys | ||
| + | import getopt | ||
| + | import re | ||
| + | import smtplib | ||
| + | import socket | ||
| + | |||
| + | veid='' | ||
| + | current_data=dict() | ||
| + | opts=None | ||
| + | beancounter_data=open('/proc/user_beancounters','r') | ||
| + | picklefilepath='/tmp/beancounters_pickledump' | ||
| + | |||
| + | #-------- find the hostname for each veid ---: | ||
| + | |||
| + | def find_veid(veid): | ||
| + | veid_conf=open('/etc/vz/conf/' + str(veid) + '.conf','r') | ||
| + | for line in veid_conf: | ||
| + | if "HOSTNAME" in line: | ||
| + | quotes=re.compile("\"") | ||
| + | line=quotes.sub("",line) | ||
| + | linefeed=re.compile("\n") | ||
| + | line=linefeed.sub("",line) | ||
| + | fqdn=re.split('=',line) | ||
| + | hostname=re.split('\.',fqdn[1])[0] | ||
| + | return hostname | ||
| + | |||
| + | # ---------- send mail in case of a counter-change | ||
| + | def send_mail(count_change): | ||
| + | mailfrom = 'root@' + str(host) | ||
| + | mailto = 'tech@inqnet.at' | ||
| + | mailsubject = 'Beancounters changed in the last 5 minutes' | ||
| + | mailbody = 'The Beancounter-failcnt value of the following veid(s) and resource(s) has \nincreased in the last 5 minutes:\n\n' | ||
| + | server = smtplib.SMTP('localhost') | ||
| + | server.sendmail(mailfrom, [mailto], '''\ | ||
| + | From:''' + mailfrom + '''\ | ||
| + | \nTo:''' + mailto + '''\ | ||
| + | \nSubject:''' + mailsubject + '''\ | ||
| + | |||
| + | \n''' + mailbody + count_change) | ||
| + | server.quit() | ||
| + | |||
| + | #------------read raw and compare data from user_beancounters | ||
| + | |||
| + | def compare_data(beancounter_data,data_read,count): | ||
| + | barrier_break=str() | ||
| + | count_change=str() | ||
| + | for line in beancounter_data: | ||
| + | if 'Version' in line or 'uid' in line or 'dummy' in line: | ||
| + | continue | ||
| + | else: | ||
| + | fields=line.split( ) | ||
| + | if len(fields) == 7: | ||
| + | i=0 | ||
| + | veid=int(fields[0][:-1]) | ||
| + | fields.pop(0) #remove the first element | ||
| + | current_data[veid]=dict() | ||
| + | current_data[veid][fields[0]]=fields | ||
| + | else: | ||
| + | i=i+1 | ||
| + | current_data[veid][fields[0]]=fields | ||
| + | if data_read and count == True and data_read is not '0': #comparing counters of new data with previous run | ||
| + | if data_read[veid][fields[0]][5] < current_data[veid][fields[0]][5]: | ||
| + | if int(veid) != 0: | ||
| + | hostname=find_veid(veid) | ||
| + | else: | ||
| + | hostname='OpenVZ Hardware Node' | ||
| + | count_change=str(count_change) + str(hostname) + ': ' + str(fields[0]) + ' failcnt has changed from ' + data_read[veid][fields[0]][5] | ||
| + | + ' to ' + str(current_data[veid][fields[0]][5]) + '\n' | ||
| + | |||
| + | if count == False: #comparing current level with barrier/limit | ||
| + | if current_data[veid][fields[0]][0] == 'oomguarpages': #for oomguarpages and physpages only the limit-value is relevant | ||
| + | if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][4])*0.9: | ||
| + | barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' ' | ||
| + | elif current_data[veid][fields[0]][0] == 'physpages': | ||
| + | if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][4])*0.9: | ||
| + | barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' ' | ||
| + | else: | ||
| + | if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][3])*0.9: | ||
| + | barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' ' | ||
| + | if barrier_break and count == False: | ||
| + | print barrier_break | ||
| + | sys.exit(2) | ||
| + | elif count == False: | ||
| + | print 'All Beancounters OK' | ||
| + | sys.exit(0) | ||
| + | |||
| + | if count_change and count == True: | ||
| + | send_mail(count_change) | ||
| + | return current_data | ||
| + | elif count == True: | ||
| + | return current_data | ||
| + | |||
| + | |||
| + | # ----- pickle data - read or write | ||
| + | |||
| + | def pickle_data(current_data,action,count,picklefilepath): | ||
| + | try: | ||
| + | picklefile = None | ||
| + | if action == 'write': | ||
| + | if current_data: | ||
| + | picklefile=open(picklefilepath,'w') | ||
| + | pickle.dump(current_data, picklefile) | ||
| + | picklefile.close() | ||
| + | return | ||
| + | else: | ||
| + | print 'current_data is empty: ' + str(current_data) | ||
| + | elif action == 'read': | ||
| + | picklefile=open(picklefilepath,'r') | ||
| + | data_read=pickle.load(picklefile) | ||
| + | picklefile.close() | ||
| + | if data_read: | ||
| + | return data_read | ||
| + | else: | ||
| + | print 'DATA_READ IS NONE:' + str(data_read) | ||
| + | return data_read | ||
| + | except IOError: | ||
| + | current_data = compare_data(beancounter_data,'0',count) | ||
| + | picklefile=open(picklefilepath,'w') | ||
| + | pickle.dump(current_data,picklefile) | ||
| + | picklefile.close() | ||
| + | |||
| + | # ------- print script usage | ||
| + | |||
| + | def usage(prog="check_beancounters.py"): | ||
| + | print """ | ||
| + | check_beancounters.py : Check if resource-values break barriers or limits and failcounters increase | ||
| + | |||
| + | check_beancounters.py [-tfh] | ||
| + | |||
| + | -h print this message | ||
| + | |||
| + | -t Check if failcnt-values have increased since the last run | ||
| + | -f Check if current value of a resource is higher than barrier/limit | ||
| + | """ | ||
| + | |||
| + | |||
| + | opts=getopt.getopt(sys.argv[1:], 'thf') | ||
| + | if opts: | ||
| + | if opts[0]==[]: | ||
| + | usage(); sys.exit(0) | ||
| + | elif opts[0][0][0]=='-h': | ||
| + | usage(); sys.exit(0) | ||
| + | elif opts[0][0][0]=='-t': | ||
| + | count=True | ||
| + | elif opts[0][0][0]=='-f': | ||
| + | count=False | ||
| + | |||
| + | |||
| + | data_read=pickle_data(current_data,'read',count,picklefilepath) | ||
| + | current_data = compare_data(beancounter_data,data_read,count) | ||
| + | pickle_data(current_data,'write',count,picklefilepath) | ||
| + | </source> | ||
[[Category: HOWTO]] | [[Category: HOWTO]] | ||
Revision as of 17:16, 3 April 2008
Contents
snmpd configuration
Debian Etch example:
apt-get install snmpd
Edit /etc/default/snmpd: remove "-u snmp" and replace 127.0.0.1 with your IP address. Full /etc/default/snmpd example:
export MIBDIRS=/usr/share/snmp/mibs SNMPDRUN=yes SNMPDOPTS='-Lsd -Lf /dev/null -I -smux -p /var/run/snmpd.pid 207.46.250.119' TRAPDRUN=no TRAPDOPTS='-Lsd -p /var/run/snmptrapd.pid'
For Debian 4.x:
export MIBDIRS=/usr/share/snmp/mibs SNMPDRUN=yes SNMPDOPTS='-Lsd -Lf /dev/null -I -smux -p /var/run/snmpd.pid' TRAPDRUN=no TRAPDOPTS='-Lsd -p /var/run/snmptrapd.pid'
Create a user (my_username) and add the new MIB entries. The password needs a minimum of 8 characters; the username may contain only alphabetic characters:
/etc/init.d/snmpd stop echo rouser my_username priv > /etc/snmp/snmpd.conf echo "extend .1.3.6.1.4.1.2021.51 beancounters /bin/cat /proc/user_beancounters" >> /etc/snmp/snmpd.conf echo "extend .1.3.6.1.4.1.2021.52 vzquota /bin/cat /proc/vz/vzquota" >> /etc/snmp/snmpd.conf echo createUser my_username MD5 my_password DES >> /var/lib/snmp/snmpd.conf /etc/init.d/snmpd start
Testing snmp:
snmpwalk -v 3 -u my_username -l authPriv -a MD5 -A my_password -x DES -X my_password 207.46.250.119
Warning: the minimum pass phrase length is 8 characters.
nagios configuration
example nagios configuration
add to configuration:
define command {
command_name check_snmp_openvz_on_port
# command_line /usr/local/bin/check_snmp_openvz.sh $HOSTADDRESS$ PORT USER PASSWORD
command_line /usr/local/bin/check_snmp_openvz.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$
}
define host {
host_name openvz-server
alias Serwer Openvz
address 207.46.250.119
use generic-host
contact_groups admins
}
define service{
use generic-service
host_name openvz-server
service_description Virtual Machines Limits
check_command check_snmp_openvz_on_port!161!my_username!my_password
max_check_attempts 1
}
nagios plugin
It is a shell script:
# cat /usr/local/bin/check_snmp_openvz.sh
#!/bin/bash
# Nagios plugin: reads the OpenVZ beancounters and vzquota tables of a host
# over SNMPv3 and reports containers that hit their resource limits.
#
# Usage: check_snmp_openvz.sh HOST PORT USER PASSWORD
#
# Exit status is the larger of the two sub-check results:
#   0 - nothing to report (prints Ok.)
#   1 - a held value is above a non-zero barrier, a quota usage reached its
#       softlimit, or snmpwalk failed
#   2 - a failcnt value increased since the previous run
HOST=$1
PORT=$2
USER=$3
PASS=$4
# Exported because the embedded perl reads the path through its %ENV hash.
export FILE=/tmp/$HOST.beancounters
RET=0
# Walk the OID that snmpd.conf binds to /bin/cat /proc/user_beancounters
# (extend .1.3.6.1.4.1.2021.51).
DATA_TMP=`snmpwalk -v 3 -u $USER -l authPriv -a MD5 -A $PASS -x DES -X $PASS $HOST:$PORT .1.3.6.1.4.1.2021.51.4`
if [ "$?" != "0" ]; then
echo "Unknown snmp error"
exit 1
fi
# Keep only the text between the double quotes of every snmpwalk output line.
DATA=`echo "$DATA_TMP"| perl -ne '/"(.*)"/ ; print "$1\n" ;'`
# The comparison only runs when a snapshot from an earlier run exists.
if [ -f $FILE ]; then
echo "$DATA" | perl -n -e'
use Data::Dumper;
my $file=$ENV{"FILE"};
my $ret=0 ;
my $vid ;
my $resource ;
my $held ;
my $maxheld ;
my $barrier ;
my $limit ;
my $failcnt ;
my %beancounters ;
my %beancounters_old ;
while(<STDIN>){
my %vmachine;
if ( /\D*(\d+):.*/ ){ $vid=$1; $beancounters{$vid}=\%vmachine ; }
if ( /^[\W\d]+([a-z]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ) {
$resource=$1 ;
$held=$2 ;
$maxheld=$3 ;
$barrier=$4 ;
$limit=$5 ;
$failcnt=$6 ;
${beancounters{$vid}}{$resource}=[$held , $maxheld , $barrier , $limit ,$failcnt ];
if ( ($held > $barrier) && ($barrier != 0) ) {
print "WARNING: Limits on $vid: $resource held->$held , barrier->$barrier ( limit->$limit ) " ;
$ret=1;
}
}
}
# read and parse old data
open(MYINPUTFILE, "<$file");
while(<MYINPUTFILE>){
my %vmachine;
if ( /\D*(\d+):.*/ ){ $vid=$1; $beancounters_old{$vid}=\%vmachine ; }
if ( /^[\W\d]+([a-z]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ) {
$resource=$1 ;
$held=$2 ;
$maxheld=$3 ;
$barrier=$4 ;
$limit=$5 ;
$failcnt=$6 ;
${beancounters_old{$vid}}{$resource}=[$held , $maxheld , $barrier , $limit ,$failcnt ];
}
}
foreach my $vmachine_id (keys %beancounters) {
foreach my $resource (keys %{$beancounters{$vmachine_id}} ) {
if ( defined($beancounters{$vmachine_id}{$resource}[4]) && defined($beancounters_old{$vmachine_id}{$resource}[4]) ){
my $failcnt=$beancounters{$vmachine_id}{$resource}[4];
my $failcnt_old=$beancounters_old{$vmachine_id}{$resource}[4];
my $held=$beancounters{$vmachine_id}{$resource}[0];
my $maxheld=$beancounters{$vmachine_id}{$resource}[1];
my $barrier=$beancounters{$vmachine_id}{$resource}[2];
my $limit=$beancounters{$vmachine_id}{$resource}[3];
if ( $failcnt_old < $failcnt ){
print "CRITICAL: Incrased failcnt $vmachine_id: $resource from $failcnt_old to $failcnt (held->$held , maxheld->$maxheld , barrier->$barrier , limit->$limit ) " ;
$ret=2;
}
}
}
}
# if ($ret == 0 ) { print "Ok. \n" ; }
# print Dumper(%beancounters_old) ;
# print "\n";
exit($ret);
'
RET1=$?
# NOTE(review): RET1 is assigned only inside this if, so it stays empty on the
# first run for a host; the -gt test near the end then gets an empty operand.
# Confirm, and consider initialising RET1=0 next to RET.
fi
# Store the current table as the baseline for the next run.
echo "$DATA" > $FILE
#####################################################################################
######### quota check
#####################################################################################
# Walk the OID that snmpd.conf binds to /bin/cat /proc/vz/vzquota
# (extend .1.3.6.1.4.1.2021.52) and keep the quoted text of each line.
DATA=`snmpwalk -v 3 -u $USER -l authPriv -a MD5 -A $PASS -x DES -X $PASS $HOST:$PORT .1.3.6.1.4.1.2021.52.4 \
| perl -ne '/"(.*)"/ ; print "$1\n" ;'`
# NOTE(review): the status tested here belongs to the last stage of the
# pipeline (perl), not to snmpwalk - confirm whether this check can fire.
if [ "$?" != "0" ]; then
echo "Unknown snmp error"
exit 1
fi
# Flag every quota line whose usage has reached its softlimit.
echo "$DATA" | perl -n -e'
my $vid ;
my $ret=0 ;
while(<STDIN>){
my %vid;
if ( /\D*(\d+):.*/ ){ $vid=$1; }
if ( /\s*(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ){
$resource=$1 ;
$usage=$2 ;
$softlimit=$3 ;
$hardlimit=$4 ;
$time=$5 ;
$expire=$6 ;
if ( $usage >= $softlimit ){
print "WARNING: VZquota limit exceeded on $vid: $resource usage->$usage, softlimit->$softlimit, hardlimit->$hardlimit, time->$time, expire->$expire " ;
$ret=1;
}
}
}
exit($ret);
'
RET2=$?
#####################################################################################
########### return
#####################################################################################
# Report the more severe of the beancounter result and the quota result.
if [ $RET1 -gt $RET2 ]; then
RET=$RET1
else
RET=$RET2
fi
if [ $RET = 0 ]; then
echo Ok.
fi
exit $RET
nagios plugin/failcnt in python
#!/usr/bin/python
# Copyright (C) 2008 Christian Benke
# Distributed under the terms of the GNU General Public License v2
# v0.1 2008-04-03
# Christian Benke <benkokakao gmail com>
import string
import pickle
import sys
import getopt
import re
import smtplib
import socket
# Module-level placeholder for a container id (VEID).
veid=''
# Parsed beancounter values; handed to pickle_data() and rebound to the
# result of compare_data() at the bottom of the script.
current_data=dict()
# Rebound to the result of getopt.getopt() at the bottom of the script.
opts=None
# Resource-accounting table that compare_data() iterates line by line;
# note that it is opened as soon as the module is loaded.
beancounter_data=open('/proc/user_beancounters','r')
# File in which the snapshot of the previous run is pickled.
picklefilepath='/tmp/beancounters_pickledump'
#-------- find the hostname for each veid ---:
def find_veid(veid, conf_dir='/etc/vz/conf'):
    """Return the short hostname configured for container *veid*.

    Reads ``<conf_dir>/<veid>.conf`` and returns the part of the
    ``HOSTNAME=`` value before the first dot, with surrounding quotes
    removed (``HOSTNAME="web01.example.com"`` gives ``'web01'``).

    Parameters:
        veid: numeric container id; converted with ``str()`` to build the
            file name.
        conf_dir: directory holding the per-container ``.conf`` files.

    Returns:
        The short hostname, or ``str(veid)`` when the file has no usable
        HOSTNAME assignment, so that a report can still name the container.

    Raises:
        IOError: if the configuration file cannot be opened.
    """
    conf_path = conf_dir + '/' + str(veid) + '.conf'
    # The context manager closes the file on every exit path; the previous
    # implementation left the handle open.
    with open(conf_path, 'r') as veid_conf:
        for line in veid_conf:
            key, separator, value = line.strip().partition('=')
            # Require a real assignment: a plain substring test would also
            # match commented-out lines such as '#HOSTNAME="old.example"'.
            if not separator or key.strip() != 'HOSTNAME':
                continue
            fqdn = value.strip().strip('"\'')
            if fqdn:
                return fqdn.split('.')[0]
    return str(veid)
# ---------- send mail in case of a counter-change
def send_mail(count_change):
    """Mail a report about increased failcnt values via the local SMTP server.

    Parameters:
        count_change: preformatted report text, one line per container and
            resource whose failcnt increased; appended to the mail body.

    Raises:
        smtplib.SMTPException or socket.error: if the server on
            ``localhost`` cannot be reached or rejects the message.
    """
    # The sender used to be built from an undefined name `host`, so every
    # call failed with NameError. Use this machine's fully qualified name.
    mailfrom = 'root@' + socket.getfqdn()
    mailto = 'tech@inqnet.at'
    mailsubject = 'Beancounters changed in the last 5 minutes'
    mailbody = 'The Beancounter-failcnt value of the following veid(s) and resource(s) has \nincreased in the last 5 minutes:\n\n'
    # Header lines, an empty line, then the body.
    message = ('From:' + mailfrom
               + '\nTo:' + mailto
               + '\nSubject:' + mailsubject
               + '\n\n' + mailbody + count_change)
    server = smtplib.SMTP('localhost')
    try:
        server.sendmail(mailfrom, [mailto], message)
    finally:
        # Close the connection even when sendmail() raises.
        server.quit()
#------------read raw and compare data from user_beancounters
def compare_data(beancounter_data,data_read,count):
    """Parse the beancounter table and run one of the two checks.

    Parameters:
        beancounter_data: iterable of lines in the layout of
            /proc/user_beancounters (an open file or a list of strings).
            If it has a ``seek`` method it is rewound first, so the same
            file object can be parsed more than once.
        data_read: snapshot returned by an earlier call, as a dict of
            ``{veid: {resource: [resource, held, maxheld, barrier, limit,
            failcnt]}}`` with all values kept as strings. Anything that is
            not a dict (``None``, the ``'0'`` marker) means "no snapshot".
        count: true selects the failcnt check, false the barrier check.

    Returns:
        In failcnt mode, the freshly parsed snapshot in the same layout as
        *data_read*. A mail is sent through send_mail() when at least one
        failcnt is numerically larger than in *data_read*.

    Raises:
        SystemExit: always in barrier mode, after printing the result:
            status 2 with the list of ``veid: resource`` pairs whose held
            value is above 90% of the barrier (of the limit for
            oomguarpages and physpages), otherwise status 0.
    """
    # A file that was already iterated yields nothing on a second pass.
    rewind = getattr(beancounter_data, 'seek', None)
    if rewind is not None:
        try:
            rewind(0)
        except (IOError, OSError, ValueError):
            pass  # not seekable: read from the current position

    previous = data_read if isinstance(data_read, dict) else {}
    limit_only = ('oomguarpages', 'physpages')
    current = {}
    barrier_break = []
    count_change = []
    veid = None

    for line in beancounter_data:
        if 'Version' in line or 'uid' in line or 'dummy' in line:
            continue
        fields = line.split()
        if len(fields) == 7:
            # First row of a container: '<veid>:' precedes the resource.
            veid = int(fields[0][:-1])
            fields = fields[1:]
            current[veid] = {}
        elif len(fields) != 6 or veid is None:
            # Blank or malformed line, or data before any container row.
            continue
        resource = fields[0]
        current[veid][resource] = fields

        if count:
            # Containers or resources that appeared since the last run have
            # no old value to compare with.
            old = previous.get(veid, {}).get(resource)
            # Compare as integers: as strings, '9' < '10' is False.
            if old is not None and int(old[5]) < int(fields[5]):
                if veid != 0:
                    hostname = find_veid(veid)
                else:
                    hostname = 'OpenVZ Hardware Node'
                count_change.append(
                    str(hostname) + ': ' + resource
                    + ' failcnt has changed from ' + str(old[5])
                    + ' to ' + str(fields[5]) + '\n')
        else:
            # For oomguarpages and physpages only the limit is relevant.
            if resource in limit_only:
                ceiling = fields[4]
            else:
                ceiling = fields[3]
            if int(fields[1]) > int(ceiling) * 0.9:
                barrier_break.append(str(veid) + ': ' + resource + ' ')

    if not count:
        if barrier_break:
            print(''.join(barrier_break))
            sys.exit(2)
        print('All Beancounters OK')
        sys.exit(0)

    if count_change:
        send_mail(''.join(count_change))
    return current
# ----- pickle data - read or write
def pickle_data(current_data,action,count,picklefilepath):
    """Store or load the beancounter snapshot kept between two runs.

    Parameters:
        current_data: snapshot to store; only used when *action* is
            ``'write'``.
        action: ``'write'`` to store *current_data*, ``'read'`` to load the
            previous snapshot.
        count: check mode, passed on to compare_data() when a missing or
            unreadable snapshot has to be rebuilt.
        picklefilepath: path of the pickle file.

    Returns:
        For ``'read'``: the unpickled snapshot, or ``None`` when no usable
        snapshot existed and a new one was created from the current
        beancounters. For ``'write'``: always ``None``.

    Raises:
        IOError: if the snapshot file cannot be written.
    """
    if action == 'write':
        if not current_data:
            print('current_data is empty: ' + str(current_data))
            return None
        # Binary mode is what pickle expects; `with` closes the file even
        # if dump() fails.
        with open(picklefilepath, 'wb') as picklefile:
            pickle.dump(current_data, picklefile)
        return None

    if action == 'read':
        try:
            with open(picklefilepath, 'rb') as picklefile:
                # NOTE(security): unpickling can execute arbitrary code and
                # this file usually lives under a predictable name in /tmp;
                # consider a root-only directory or a data-only format.
                data_read = pickle.load(picklefile)
        except (IOError, EOFError, pickle.UnpicklingError):
            # No snapshot yet, or a truncated/corrupt one: start over from
            # the current beancounters. Only read failures end up here; a
            # failing write is no longer answered with a second comparison.
            fresh = compare_data(beancounter_data, '0', count)
            with open(picklefilepath, 'wb') as picklefile:
                pickle.dump(fresh, picklefile)
            return None
        if not data_read:
            print('DATA_READ IS NONE:' + str(data_read))
        return data_read

    return None
# ------- print script usage
def usage(prog="check_beancounters.py"):
    """Print the command-line help to standard output.

    Parameters:
        prog: program name shown in the help text. It used to be accepted
            but ignored; it now replaces the hard-coded script name.
    """
    print("""
%(prog)s : Check if resource-values break barriers or limits and failcounters increase

%(prog)s [-tfh]

  -h  print this message

  -t  Check if failcnt-values have increased since the last run
  -f  Check if current value of a resource is higher than barrier/limit
""" % {'prog': prog})
# ------- parse the command line and run the selected check
try:
    opts = getopt.getopt(sys.argv[1:], 'thf')
except getopt.GetoptError as err:
    # An unknown option used to end in a traceback. Report it, show the
    # help and leave with status 3 (UNKNOWN in the Nagios plugin convention).
    print(str(err))
    usage()
    sys.exit(3)

# As before, only the first option decides; no option at all means help.
if opts[0]:
    first_flag = opts[0][0][0]
else:
    first_flag = '-h'
if first_flag == '-h':
    usage()
    sys.exit(0)
# Only -t and -f are left: True = failcnt check, False = barrier check.
count = (first_flag == '-t')

try:
    data_read = pickle_data(current_data, 'read', count, picklefilepath)
    current_data = compare_data(beancounter_data, data_read, count)
    pickle_data(current_data, 'write', count, picklefilepath)
finally:
    # Also runs when compare_data() leaves through sys.exit().
    beancounter_data.close()