Difference between revisions of "Monitoring openvz resources using nagios and snmp"

From OpenVZ Virtuozzo Containers Wiki
Jump to: navigation, search
m
(nagios plugin/failcnt locally, without snmp)
Line 219: Line 219:
 
exit $RET
 
exit $RET
 
</source>
 
</source>
 
=== nagios plugin/failcnt locally, without snmp ===
 
 
To be added locally on the VZ HN to /etc/nagios/nrpe_local.conf<br>
 
Works as a Nagios plugin with option '-f', or reports an increase of a failcnt value by mail when run with option '-t' (e.g. as a cron job). We use it both ways, so that a peak which happens between two Nagios checks is still noticed:
 
 
<source lang=python>
 
#!/usr/bin/python
 
# Copyright (C) 2008 Christian Benke
 
# Distributed under the terms of the GNU General Public License v2
 
# v0.1 2008-04-03
 
# Christian Benke <benkokakao  gmail  com>
 
 
import string
 
import pickle
 
import sys
 
import getopt
 
import re
 
import smtplib
 
import socket
 
 
# Module-level state shared by the functions of this script.

# Placeholder only: compare_data() assigns its own local variable of the
# same name while parsing.
veid=''
 
# Counters parsed during the current run, filled in by compare_data() as
# current_data[veid][resource] -> list of that resource's columns.
current_data=dict()
 
# Rebound to the getopt.getopt() result at the bottom of the script.
opts=None
 
# Opened once, at import time; compare_data() iterates over it line by line.
beancounter_data=open('/proc/user_beancounters','r')
 
# File that pickle_data() uses to keep the counters between two runs.
picklefilepath='/tmp/beancounters_pickledump'
 
 
#-------- find the hostname for each veid ---:
 
 
def find_veid(veid, conf_dir='/etc/vz/conf'):
        """Return the short hostname configured for container *veid*.

        Reads ``<conf_dir>/<veid>.conf`` and looks for the ``HOSTNAME=``
        assignment.  The value is stripped of surrounding quotes and cut at
        the first dot, so ``HOSTNAME="web01.example.com"`` yields ``web01``.

        veid     -- container id (anything ``str()`` can render)
        conf_dir -- directory holding the per-container config files

        Returns None when the file has no HOSTNAME assignment.
        Raises IOError when the config file cannot be opened.
        """
        veid_conf = open(conf_dir + '/' + str(veid) + '.conf', 'r')
        try:
                for line in veid_conf:
                        parts = line.split('=', 1)
                        # Only a real "HOSTNAME=value" assignment counts; a
                        # commented-out line or another key that merely
                        # contains the word HOSTNAME must not match.
                        if len(parts) != 2 or parts[0].strip() != 'HOSTNAME':
                                continue
                        fqdn = parts[1].strip().strip('"\'')
                        return fqdn.split('.')[0]
        finally:
                # Always release the handle, also on the early return above.
                veid_conf.close()
        return None
 
 
# ---------- send mail in case of a counter-change
 
def send_mail(count_change):
        """Mail a report about increased failcnt values via the local MTA.

        count_change -- preformatted text, one line per changed counter; it
                        is appended to a fixed introduction.

        The message is handed to the SMTP server on ``localhost``.  Errors
        from smtplib/socket are not caught here and propagate to the caller.
        """
        # The sender address needs this machine's name.  The original code
        # read an undefined name ``host`` (NameError on every call); the
        # socket module imported by this script provides the name instead.
        mailfrom = 'root@' + socket.getfqdn()
        mailto = 'to@example.com'
        mailsubject = 'Beancounters changed in the last 5 minutes'
        mailbody = 'The Beancounter-failcnt value of the following veid(s) and resource(s) has \nincreased in the last 5 minutes:\n\n'

        # Header lines, one empty line, then the body.
        headers = [
                'From: ' + mailfrom,
                'To: ' + mailto,
                'Subject: ' + mailsubject,
        ]
        message = '\n'.join(headers) + '\n\n' + mailbody + count_change

        server = smtplib.SMTP('localhost')
        try:
                server.sendmail(mailfrom, [mailto], message)
        finally:
                # Close the SMTP session even when sending fails.
                server.quit()
 
 
#------------read raw and compare data from user_beancounters
 
 
def compare_data(beancounter_data,data_read,count):
        """Parse user_beancounters lines and run one of the two checks.

        beancounter_data -- iterable of lines in /proc/user_beancounters
                            format (the open file, or any list of strings)
        data_read        -- snapshot of the previous run as returned by this
                            function, or None / '0' when there is none
        count            -- True:  compare each failcnt with the previous
                                   snapshot and mail the increases
                            False: compare each held value with 90% of its
                                   barrier (limit for oomguarpages and
                                   physpages) and exit like a Nagios plugin

        Every parsed row is stored in the module-level ``current_data`` as
        ``current_data[veid][resource] = [resource, held, maxheld, barrier,
        limit, failcnt]`` (all strings).

        With count == True the function returns ``current_data``.
        With count == False it never returns: it prints the result and calls
        sys.exit(2) when a resource is above the threshold, else sys.exit(0).
        """
        check_failcnt = (count == True)
        check_levels = (count == False)
        # '0' is the marker pickle_data() passes when no snapshot exists yet.
        have_previous = bool(data_read) and data_read != '0'

        barrier_break = ''
        count_change = ''
        veid = None
        for line in beancounter_data:
                # Skip the version banner, the column header and the unused
                # "dummy" rows.
                if 'Version' in line or 'uid' in line or 'dummy' in line:
                        continue
                fields = line.split()
                if len(fields) == 7:
                        # First row of a container: "<veid>: resource ..."
                        veid = int(fields[0][:-1])
                        fields.pop(0)
                        current_data[veid] = dict()
                elif len(fields) != 6 or veid is None:
                        # Blank or malformed line, or a row that appears
                        # before any container header: nothing to record.
                        continue
                resource = fields[0]
                current_data[veid][resource] = fields

                if check_failcnt and have_previous:
                        # A container or resource that did not exist during
                        # the previous run has nothing to be compared with.
                        previous = data_read.get(veid, {}).get(resource)
                        # Compare as integers: as strings '9' < '10' is
                        # False, which would hide exactly that increase.
                        if previous is not None and int(previous[5]) < int(fields[5]):
                                if veid != 0:
                                        hostname = find_veid(veid)
                                else:
                                        hostname = 'OpenVZ Hardware Node'
                                count_change = (count_change + str(hostname) + ': ' + resource
                                                + ' failcnt has changed from ' + str(previous[5])
                                                + ' to ' + str(fields[5]) + '\n')

                if check_levels:
                        held = int(fields[1])
                        if resource in ('oomguarpages', 'physpages'):
                                # for these two only the limit is relevant
                                ceiling = int(fields[4])
                        else:
                                ceiling = int(fields[3])
                        if held > ceiling * 0.9:
                                barrier_break = barrier_break + str(veid) + ': ' + resource + ' '

        if check_levels:
                if barrier_break:
                        print(barrier_break)
                        sys.exit(2)
                print('All Beancounters OK')
                sys.exit(0)

        if check_failcnt:
                if count_change:
                        send_mail(count_change)
                return current_data
 
 
 
# ----- pickle data - read or write
 
 
def _write_pickle(data, path):
        """Pickle *data* into *path*, closing the file even if dumping fails."""
        picklefile = open(path, 'wb')
        try:
                pickle.dump(data, picklefile)
        finally:
                picklefile.close()


def pickle_data(current_data,action,count,picklefilepath):
        """Store or load the beancounter snapshot kept between two runs.

        current_data   -- snapshot to store (only used for action 'write')
        action         -- 'write' or 'read'; anything else does nothing
        count          -- handed to compare_data() when the state file has to
                          be created first
        picklefilepath -- location of the state file

        'write': stores a non-empty snapshot; for an empty one a message is
                 printed and the existing file is left alone.  Returns None.
        'read':  returns the stored snapshot.  When the file is missing,
                 truncated or not a valid pickle, a fresh snapshot is taken
                 with compare_data(beancounter_data, '0', count), written to
                 the file, and None is returned.
        """
        if action == 'write':
                if current_data:
                        _write_pickle(current_data, picklefilepath)
                else:
                        print('current_data is empty: ' + str(current_data))
                return None

        if action != 'read':
                return None

        # NOTE(review): unpickling runs whatever the file tells it to.  The
        # default state file sits in /tmp under a predictable name, so another
        # local user could plant it before the first run - consider a
        # root-only directory.
        try:
                # Pickle data is binary; text mode only works by accident.
                picklefile = open(picklefilepath, 'rb')
                try:
                        data_read = pickle.load(picklefile)
                finally:
                        picklefile.close()
        except (IOError, EOFError, pickle.UnpicklingError):
                # No usable snapshot yet: create one so the next run has
                # something to compare against.
                current_data = compare_data(beancounter_data, '0', count)
                _write_pickle(current_data, picklefilepath)
                return None

        if not data_read:
                print('DATA_READ IS NONE:' + str(data_read))
        return data_read
 
 
# ------- print script usage
 
 
def usage(prog="check_beancounters.py"):
    """Print the command-line help on stdout.

    prog -- program name shown in the help text; it was accepted but
            ignored before, now it is actually used.
    """
    print("""
%(prog)s : Check if resource-values break barriers or limits and failcounters increase

%(prog)s [-tfh]

-h                  print this message

-t                  Check if failcnt-values have increased since the last run
-f                  Check if current value of a resource is higher than barrier/limit
""" % {'prog': prog})
 
 
 
# ------- command line handling and main flow

try:
        opts=getopt.getopt(sys.argv[1:], 'thf')
except getopt.GetoptError:
        # Unknown option: show the help instead of a traceback.  3 is the
        # Nagios plugin return code for UNKNOWN.
        usage(); sys.exit(3)

# Only the first option given decides the mode, as before.
given=[name for name, value in opts[0]]
if not given or given[0]=='-h':
        usage(); sys.exit(0)
elif given[0]=='-t':
        count=True
else:
        # '-f' is the only option left
        count=False

# With '-f' compare_data() prints the result and exits; the final write is
# therefore only reached with '-t'.
data_read=pickle_data(current_data,'read',count,picklefilepath)
current_data = compare_data(beancounter_data,data_read,count)
pickle_data(current_data,'write',count,picklefilepath)
 
</source>
 
 
 
 
[[Category: HOWTO]]
 

Revision as of 08:48, 4 April 2008

snmpd configuration

Debian Etch example:

apt-get install snmpd

Edit /etc/default/snmpd: remove "-u snmp" and replace 127.0.0.1 with your IP address. Full /etc/default/snmpd example:

export MIBDIRS=/usr/share/snmp/mibs
SNMPDRUN=yes
SNMPDOPTS='-Lsd -Lf /dev/null  -I -smux -p /var/run/snmpd.pid 207.46.250.119'
TRAPDRUN=no
TRAPDOPTS='-Lsd -p /var/run/snmptrapd.pid'

For Debian 4.x:

export MIBDIRS=/usr/share/snmp/mibs
SNMPDRUN=yes
SNMPDOPTS='-Lsd -Lf /dev/null  -I -smux -p /var/run/snmpd.pid'
TRAPDRUN=no
TRAPDOPTS='-Lsd -p /var/run/snmptrapd.pid'

Create a user (my_username) and add the new MIB entries. The password needs a minimum of 8 characters; the username may only contain letters:

/etc/init.d/snmpd stop
echo rouser my_username priv > /etc/snmp/snmpd.conf
echo "extend  .1.3.6.1.4.1.2021.51  beancounters  /bin/cat /proc/user_beancounters" >> /etc/snmp/snmpd.conf
echo "extend  .1.3.6.1.4.1.2021.52  vzquota  /bin/cat /proc/vz/vzquota" >> /etc/snmp/snmpd.conf
echo  createUser my_username MD5 my_password DES >> /var/lib/snmp/snmpd.conf
/etc/init.d/snmpd start 

Testing snmp:

snmpwalk   -v 3  -u my_username -l authPriv   -a MD5 -A my_password -x DES -X my_password  207.46.250.119

Warning: the minimum pass phrase length is 8 characters.

nagios configuration

example nagios configuration

add to configuration:

define command {
command_name check_snmp_openvz_on_port
# command_line /usr/local/bin/check_snmp_openvz.sh  $HOSTADDRESS$ PORT    USER    PASSWORD
command_line /usr/local/bin/check_snmp_openvz.sh  $HOSTADDRESS$ $ARG1$  $ARG2$  $ARG3$
}
define host {
        host_name   openvz-server
        alias       Serwer Openvz
        address     207.46.250.119
        use         generic-host
        contact_groups  admins
        }
define service{
        use                             generic-service
        host_name                       openvz-server
        service_description             Virtual Machines Limits
        check_command                   check_snmp_openvz_on_port!161!my_username!my_password
        max_check_attempts              1
        }

nagios plugin

It is a shell script:

# cat /usr/local/bin/check_snmp_openvz.sh
#!/bin/bash
# Nagios plugin: checks OpenVZ beancounters and vzquota of a remote hardware
# node over SNMPv3.
#
# Usage: check_snmp_openvz.sh HOST PORT USER PASSWORD
#
# Exit code: the highest code produced by the two checks below
#   0  nothing to report
#   1  held > barrier, quota usage >= softlimit, or snmpwalk failed
#   2  a failcnt increased since the previous run
#
# The beancounters of the previous run are kept in /tmp/HOST.beancounters.
HOST=$1
PORT=$2
USER=$3
PASS=$4
export FILE=/tmp/$HOST.beancounters
RET=0
# Both partial results start as OK.  On the very first run there is no state
# file, the beancounter check is skipped, and RET1 would otherwise be empty
# in the numeric comparison at the end.
RET1=0
RET2=0

DATA_TMP=`snmpwalk   -v 3  -u "$USER" -l authPriv   -a MD5 -A "$PASS" -x DES -X "$PASS" "$HOST:$PORT" .1.3.6.1.4.1.2021.51.4`
if [ "$?" != "0" ]; then
        echo "Unknown snmp error"
        exit 1
fi

# Keep only the quoted payload of every returned line.
DATA=`echo "$DATA_TMP"| perl -ne '/"(.*)"/ ; print "$1\n" ;'`

# NOTE(review): perl -n already wraps the program in a read loop.  The first
# input line is consumed by that implicit loop and the explicit
# while(<STDIN>) reads the remainder, so the program body runs exactly once.
if [ -f "$FILE" ]; then
echo "$DATA" | perl  -n -e'
use Data::Dumper;
my $file=$ENV{"FILE"};
my $ret=0 ;
my $vid ;
my $resource ;
my $held ;
my $maxheld ;
my $barrier ;
my $limit ;
my $failcnt ;
my %beancounters ;
my %beancounters_old ;
while(<STDIN>){
        my %vmachine;
        if ( /\D*(\d+):.*/ ){ $vid=$1; $beancounters{$vid}=\%vmachine ; }
        if ( /^[\W\d]+([a-z]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ) {
                $resource=$1 ;
                $held=$2 ;
                $maxheld=$3 ;
                $barrier=$4 ;
                $limit=$5 ;
                $failcnt=$6 ;
                ${beancounters{$vid}}{$resource}=[$held , $maxheld , $barrier , $limit ,$failcnt ];
                if ( ($held  > $barrier) && ($barrier != 0) ) {
                        print "WARNING: Limits on $vid: $resource  held->$held , barrier->$barrier ( limit->$limit ) " ;
                        $ret=1;
                }
        }
}

# read and parse old data
open(MYINPUTFILE, "<$file");
while(<MYINPUTFILE>){
        my %vmachine;
        if ( /\D*(\d+):.*/ ){ $vid=$1; $beancounters_old{$vid}=\%vmachine ; }
        if ( /^[\W\d]+([a-z]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ) {
                $resource=$1 ;
                $held=$2 ;
                $maxheld=$3 ;
                $barrier=$4 ;
                $limit=$5 ;
                $failcnt=$6 ;
                ${beancounters_old{$vid}}{$resource}=[$held , $maxheld , $barrier , $limit ,$failcnt ];
        }
}

foreach my $vmachine_id (keys %beancounters) {
        foreach my $resource (keys %{$beancounters{$vmachine_id}} ) {
                if ( defined($beancounters{$vmachine_id}{$resource}[4]) && defined($beancounters_old{$vmachine_id}{$resource}[4]) ){
                        my $failcnt=$beancounters{$vmachine_id}{$resource}[4];
                        my $failcnt_old=$beancounters_old{$vmachine_id}{$resource}[4];
                        my $held=$beancounters{$vmachine_id}{$resource}[0];
                        my $maxheld=$beancounters{$vmachine_id}{$resource}[1];
                        my $barrier=$beancounters{$vmachine_id}{$resource}[2];
                        my $limit=$beancounters{$vmachine_id}{$resource}[3];
                        if ( $failcnt_old < $failcnt ){
                                print "CRITICAL: Incrased failcnt  $vmachine_id: $resource from $failcnt_old to $failcnt (held->$held , maxheld->$maxheld , barrier->$barrier , limit->$limit ) " ;
                                $ret=2;
                        }
                }
        }

}

# if ($ret == 0 ) { print "Ok. \n" ; }
# print Dumper(%beancounters_old) ;
# print "\n";
exit($ret);
'

RET1=$?
fi

# Remember the current counters for the next run.
echo "$DATA" > "$FILE"
#####################################################################################
######### quota check
#####################################################################################

# Run snmpwalk on its own, as in the first check.  When it is piped straight
# into perl, $? holds the status of perl and an SNMP failure goes unnoticed.
DATA_TMP=`snmpwalk   -v 3  -u "$USER" -l authPriv   -a MD5 -A "$PASS" -x DES -X "$PASS" "$HOST:$PORT" .1.3.6.1.4.1.2021.52.4`
if [ "$?" != "0" ]; then
        echo "Unknown snmp error"
        exit 1
fi

DATA=`echo "$DATA_TMP"| perl -ne '/"(.*)"/ ; print "$1\n" ;'`


echo "$DATA" | perl  -n -e'
my $vid ;
my $ret=0 ;
while(<STDIN>){
        my %vid;
        if ( /\D*(\d+):.*/ ){ $vid=$1; }
        if ( /\s*(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ){
                $resource=$1 ;
                $usage=$2 ;
                $softlimit=$3 ;
                $hardlimit=$4 ;
                $time=$5 ;
                $expire=$6 ;
                if ( $usage >= $softlimit ){
                        print "WARNING: VZquota limit exceeded on $vid: $resource  usage->$usage, softlimit->$softlimit, hardlimit->$hardlimit, time->$time, expire->$expire  " ;
                        $ret=1;
                }
        }
}
exit($ret);
'
RET2=$?

#####################################################################################
########### return
#####################################################################################

# Report the more severe of the two results.
if [  "$RET1"  -gt "$RET2"  ]; then
        RET=$RET1
        else
        RET=$RET2
fi

if [  "$RET"  = 0  ]; then
        echo Ok.
fi
exit $RET