Editing Monitoring openvz resources using nagios and snmp

Jump to: navigation, search

Warning: You are not logged in. Your IP address will be publicly visible if you make any edits. If you log in or create an account, your edits will be attributed to your username, along with other benefits.

The edit can be undone. Please check the comparison below to verify that this is what you want to do, and then save the changes below to finish undoing the edit.
Latest revision Your text
Line 5: Line 5:
 
</pre>
 
</pre>
  
Edit '''/etc/default/snmpd''': remove ''-u snmp'' and replace ''127.0.0.1'' with your IP address (e.g. 207.46.250.119). Full '''/etc/default/snmpd''' example:
+
Edit '''/etc/default/snmpd''': remove ''-u snmp'' and replace ''127.0.0.1'' with your IP address. Full '''/etc/default/snmpd''' example:
 
<pre>
 
<pre>
 
export MIBDIRS=/usr/share/snmp/mibs
 
export MIBDIRS=/usr/share/snmp/mibs
Line 26: Line 26:
 
<pre>
 
<pre>
 
/etc/init.d/snmpd stop
 
/etc/init.d/snmpd stop
echo rouser my_username priv >> /etc/snmp/snmpd.conf
+
echo rouser my_username priv > /etc/snmp/snmpd.conf
 
echo "extend  .1.3.6.1.4.1.2021.51  beancounters  /bin/cat /proc/user_beancounters" >> /etc/snmp/snmpd.conf
 
echo "extend  .1.3.6.1.4.1.2021.51  beancounters  /bin/cat /proc/user_beancounters" >> /etc/snmp/snmpd.conf
 
echo "extend  .1.3.6.1.4.1.2021.52  vzquota  /bin/cat /proc/vz/vzquota" >> /etc/snmp/snmpd.conf
 
echo "extend  .1.3.6.1.4.1.2021.52  vzquota  /bin/cat /proc/vz/vzquota" >> /etc/snmp/snmpd.conf
Line 32: Line 32:
 
/etc/init.d/snmpd start  
 
/etc/init.d/snmpd start  
 
</pre>
 
</pre>
 
(Note that the createUser command goes into a separate file. On CentOS 5 this file is located at /var/net-snmp/snmpd.conf. Make sure you stop snmpd before putting the createUser command there!).
 
  
 
Testing snmp:
 
Testing snmp:
 
<pre>
 
<pre>
snmpwalk  -v 3  -u my_username -l authPriv  -a MD5 -A my_password -x DES -X my_password  $(hostname -i)
+
snmpwalk  -v 3  -u my_username -l authPriv  -a MD5 -A my_password -x DES -X my_password  207.46.250.119
 
</pre>
 
</pre>
  
Line 210: Line 208:
 
#####################################################################################
 
#####################################################################################
  
if [ $RET1 -gt $RET2 ]; then
+
if [ $RET1 -gt $RET2 ]; then
 
         RET=$RET1
 
         RET=$RET1
 
         else
 
         else
Line 222: Line 220:
 
</source>
 
</source>
  
=== check_vzquota Without SNMP ===
+
=== nagios plugin/failcnt locally, without snmp ===
<source lang="bash">
+
 
#!/bin/bash
+
To be added locally on the VZ HN to /etc/nagios/nrpe_local.conf<br>
RET=0
+
Works as a Nagios plugin with option '-f', or reports an increase of a failcnt value by mail when run with option '-t' (e.g. as a cron job). We use both modes so that we also catch a peak that happens between two Nagios checks:
DATA=`echo;sudo /usr/sbin/vzlist -1 2>/dev/null | xargs -I {} bash -c "echo {}:;sudo /usr/sbin/vzquota stat {} | sed 's/\*//g'"`
+
 
if [ -z "$DATA" ]; then
+
<source lang=python>
        VPS_err=$(sudo /usr/sbin/vzlist -1 2>&1 1>/dev/null)
+
#!/usr/bin/python
        if [ -n "$VPS_err" ] && [ "$VPS_err" == "Container(s) not found" ]; then
+
# Copyright (C) 2008 Christian Benke
                 echo "OK - $VPS_err";
+
# Distributed under the terms of the GNU General Public License v2
                 exit 0;
+
# v0.1 2008-04-03
         else
+
# Christian Benke <benkokakao  gmail  com>
                 if [ -n "$VPS_err" ]; then
+
 
                         echo "UNKNOWN - Error: $VPS_err";
+
import string
                 else
+
import pickle
                         echo "UNKNOWN - VZquota stats are not readable or empty. Maybe it is only readable for root and this script should be called by sudo.";
+
import sys
                 fi
+
import getopt
                 exit 3;
+
import re
        fi
+
import smtplib
fi
+
import socket
 +
 
 +
veid=''
 +
current_data=dict()
 +
opts=None
 +
beancounter_data=open('/proc/user_beancounters','r')
 +
picklefilepath='/tmp/beancounters_pickledump'
 +
 
 +
#-------- find the hostname for each veid ---:
 +
 
 +
def find_veid(veid):
 +
        veid_conf=open('/etc/vz/conf/' + str(veid) + '.conf','r')
 +
        for line in veid_conf:
 +
                if "HOSTNAME" in line:
 +
                        quotes=re.compile("\"")
 +
                        line=quotes.sub("",line)
 +
                        linefeed=re.compile("\n")
 +
                        line=linefeed.sub("",line)
 +
                        fqdn=re.split('=',line)
 +
                        hostname=re.split('\.',fqdn[1])[0]
 +
                        return hostname
 +
 
 +
# ---------- send mail in case of a counter-change
 +
def send_mail(count_change):
 +
        mailfrom = 'root@' + str(host)
 +
        mailto = 'to@example.com'
 +
        mailsubject = 'Beancounters changed in the last 5 minutes'
 +
        mailbody = 'The Beancounter-failcnt value of the following veid(s) and resource(s) has \nincreased in the last 5 minutes:\n\n'
 +
        server = smtplib.SMTP('localhost')
 +
        server.sendmail(mailfrom, [mailto], '''\
 +
From:''' + mailfrom + '''\
 +
\nTo:''' + mailto + '''\
 +
\nSubject:''' + mailsubject + '''\
 +
 
 +
\n''' + mailbody + count_change)
 +
        server.quit()
 +
 
 +
#------------read raw and compare data from user_beancounters
 +
 
 +
def compare_data(beancounter_data,data_read,count):
 +
        barrier_break=str()
 +
        count_change=str()
 +
        for line in beancounter_data:
 +
                if 'Version' in line or 'uid' in line or 'dummy' in line:
 +
                        continue
 +
                else:
 +
                        fields=line.split( )
 +
                        if len(fields) == 7:
 +
                                i=0
 +
                                veid=int(fields[0][:-1])
 +
                                fields.pop(0) #remove the first element
 +
                                current_data[veid]=dict()
 +
                                current_data[veid][fields[0]]=fields
 +
                        else:
 +
                                i=i+1
 +
                                current_data[veid][fields[0]]=fields
 +
                if data_read and count == True and data_read is not '0': #comparing counters of new data with previous run
 +
                        if data_read[veid][fields[0]][5] < current_data[veid][fields[0]][5]:
 +
                                if int(veid) != 0:
 +
                                        hostname=find_veid(veid)
 +
                                else:
 +
                                        hostname='OpenVZ Hardware Node'
 +
                                count_change=str(count_change) + str(hostname) + ': ' + str(fields[0]) + ' failcnt has changed from ' + data_read[veid][fields[0]][5]
 +
+ ' to ' + str(current_data[veid][fields[0]][5]) + '\n'
 +
 
 +
                if count == False:      #comparing current level with barrier/limit
 +
                        if current_data[veid][fields[0]][0] == 'oomguarpages': #for oomguarpages and physpages only the limit-value is relevant
 +
                                if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][4])*0.9:
 +
                                        barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
 +
                        elif current_data[veid][fields[0]][0] == 'physpages':
 +
                                if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][4])*0.9:
 +
                                        barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
 +
                        else:
 +
                                if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][3])*0.9:
 +
                                        barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
 +
        if barrier_break and count == False:
 +
                print barrier_break
 +
                sys.exit(2)
 +
        elif count == False:
 +
                 print 'All Beancounters OK'
 +
                 sys.exit(0)
 +
 
 +
        if count_change and count == True:
 +
                send_mail(count_change)
 +
                return current_data
 +
         elif count == True:
 +
                return current_data
 +
 
 +
 
 +
# ----- pickle data - read or write
 +
 
 +
def pickle_data(current_data,action,count,picklefilepath):
 +
        try:
 +
                picklefile = None
 +
                 if action == 'write':
 +
                        if current_data:
 +
                                picklefile=open(picklefilepath,'w')
 +
                                pickle.dump(current_data, picklefile)
 +
                                picklefile.close()
 +
                                return
 +
                         else:
 +
                                print 'current_data is empty: ' + str(current_data)
 +
                 elif action == 'read':
 +
                        picklefile=open(picklefilepath,'r')
 +
                         data_read=pickle.load(picklefile)
 +
                        picklefile.close()
 +
                        if data_read:
 +
                                return data_read
 +
                        else:
 +
                                print 'DATA_READ IS NONE:' + str(data_read)
 +
                                return data_read
 +
        except IOError:
 +
                current_data = compare_data(beancounter_data,'0',count)
 +
                picklefile=open(picklefilepath,'w')
 +
                 pickle.dump(current_data,picklefile)
 +
                 picklefile.close()
 +
 
 +
# ------- print script usage
  
echo "$DATA" | perl  -n -e'
+
def usage(prog="check_beancounters.py"):
my $vid ;
+
    print """
my $ret=0 ;
+
check_beancounters.py : Check if resource-values break barriers or limits and failcounters increase
my $crit="";
 
my $warn="";
 
my $ok="";
 
while(<STDIN>){
 
        my %vid;
 
        if ( /^(\d+):.*/ ){ $vid=$1; }
 
        if ( /\D*(\d+):.*/ ){ $vid=$1; }
 
        if ( /\s*(\S+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ){
 
                $resource=$1 ;
 
                $usage=$2 ;
 
                $softlimit=$3 ;
 
                $hardlimit=$4 ;
 
                if ( $usage >= $hardlimit ){
 
                        $crit=$crit."VZquota limit exceeded on $vid: $resource  usage->$usage, softlimit->$softlimit, hardlimit->$hardlimit, time->$time, expire->$expire  " ;
 
                        $ret=2;
 
                } elsif ( $usage >= $softlimit ){
 
                        $warn=$warn."VZquota limit exceeded on $vid: $resource  usage->$usage, softlimit->$softlimit, hardlimit->$hardlimit, time->$time, expire->$expire  " ;
 
                        $ret=1;
 
                }
 
                $ok=$ok."$vid:$resource $usage/$softlimit\n";
 
        }
 
}
 
if ($ret == 0) {
 
        print "OK - click on service-link for details...\n$ok";
 
} elsif ($ret == 1)  {
 
        print "WARNING - $warn\n";
 
} else {
 
        print "CRITICAL - $crit\n";
 
}
 
exit($ret);
 
'
 
RET=$?
 
exit $RET
 
</source>
 
The script calls <code>/usr/sbin/vzlist</code> and <code>/usr/sbin/vzquota</code> via sudo. Normally sudo asks for a password, which check_nrpe does not know. It is therefore necessary to append a line like the following to <code>/etc/sudoers</code> (adapt the user name and path to your system):
 
nagios  ALL=NOPASSWD: /usr/sbin/vzlist, /usr/sbin/vzquota
 
  
=== check_ubc Without SNMP ===
+
check_beancounters.py [-tfh]
<source lang="bash">
 
#!/bin/bash
 
# Servicestate description can have a http-link to the openvz-wiki
 
# in case that a ressource is warning/critical. To use it:
 
# 1. set "escape_html_tags=0" in nagios/etc/cgi.cfg
 
# 2. set "my $linked=1;" in the first perl lines in this script
 
#
 
export FILE=/tmp/check_ubc
 
RET=0
 
ubc_file='/proc/user_beancounters';
 
DATA='';
 
if [ -r $ubc_file ]; then
 
        DATA=`cat $ubc_file`
 
fi
 
if [ -z "$DATA" ]; then
 
        echo "UNKNOWN - $ubc_file is not readable or empty. Maybe it is only readable for root and this script should be called by sudo.";
 
        exit 3;
 
fi
 
  
if [ -f $FILE ]; then
+
  -h                  print this message
echo "$DATA" | perl -n -e'
 
use Data::Dumper;
 
my $linked=1;  # 0:plain text output, 1:resourcename is a http-link to OpenVZ-wiki
 
my $file=$ENV{"FILE"};
 
my $ret=0 ;
 
my $vid ;
 
my $resource ;
 
my $held ;
 
my $maxheld ;
 
my $barrier ;
 
my $limit ;
 
my $failcnt ;
 
my %beancounters ;
 
my %beancounters_old ;
 
while(<STDIN>){
 
        my %vmachine;
 
        if ( /\D*(\d+):.*/ ){ $vid=$1; $beancounters{$vid}=\%vmachine ; }
 
        if ( /^[\W\d]+([a-z]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ) {
 
                $resource=$1 ;
 
                $held=$2 ;
 
                $maxheld=$3 ;
 
                $barrier=$4 ;
 
                $limit=$5 ;
 
                $failcnt=$6 ;
 
                ${beancounters{$vid}}{$resource}=[$held , $maxheld , $barrier , $limit ,$failcnt ];
 
                if ( ($held  > $barrier) && ($barrier != 0) ) {
 
                        print "WARNING: Limits on $vid: ".&url($resource,$linked)."  held->$held , barrier->$barrier ( limit->$limit ) " ;
 
                        $ret=1;
 
                }
 
                                #print "$vid:$resource $held Barrier:$barrier ";
 
        }
 
}
 
  
# read and parse old data
+
-t                  Check if failcnt-values have increased since the last run
open(MYINPUTFILE, "<$file");
+
-f                  Check if current value of a resource is higher than barrier/limit
while(<MYINPUTFILE>){
+
"""
        my %vmachine;
 
        if ( /\D*(\d+):.*/ ){ $vid=$1; $beancounters_old{$vid}=\%vmachine ; }
 
        if ( /^[\W\d]+([a-z]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*/ ) {
 
                $resource=$1 ;
 
                $held=$2 ;
 
                $maxheld=$3 ;
 
                $barrier=$4 ;
 
                $limit=$5 ;
 
                $failcnt=$6 ;
 
                ${beancounters_old{$vid}}{$resource}=[$held , $maxheld , $barrier , $limit ,$failcnt ];
 
        }
 
}
 
  
foreach my $vmachine_id (keys %beancounters) {
 
        foreach my $resource (keys %{$beancounters{$vmachine_id}} ) {
 
                if ( defined($beancounters{$vmachine_id}{$resource}[4]) && defined($beancounters_old{$vmachine_id}{$resource}[4]) ){
 
                        my $failcnt=$beancounters{$vmachine_id}{$resource}[4];
 
                        my $failcnt_old=$beancounters_old{$vmachine_id}{$resource}[4];
 
                        my $held=$beancounters{$vmachine_id}{$resource}[0];
 
                        my $maxheld=$beancounters{$vmachine_id}{$resource}[1];
 
                        my $barrier=$beancounters{$vmachine_id}{$resource}[2];
 
                        my $limit=$beancounters{$vmachine_id}{$resource}[3];
 
                        if ( $failcnt_old < $failcnt ){
 
                                print "CRITICAL: Increased failcnt  $vmachine_id: ".url($resource,$linked)." from $failcnt_old to $failcnt (held->$held , maxheld->$maxheld , barrier->$barrier , limit->$limit ) " ;
 
                                $ret=2;
 
                        }
 
                                                #print "$vmachine_id: Old_Failcnt: $failcnt_old Failcnt: $failcnt \n";
 
                }
 
        }
 
  
}
+
opts=getopt.getopt(sys.argv[1:], 'thf')
sub url {
+
if opts:
        my ($name,$with_link) = @_;
+
         if opts[0]==[]:
         if ($with_link) {
+
                 usage(); sys.exit(0)
                return "<a target=\"_blank\" href=\"http://wiki.openvz.org/".$name."#".$name."\">$name</a>";
+
         elif opts[0][0][0]=='-h':
        } else {
+
                usage(); sys.exit(0)
                 return $name;
+
        elif opts[0][0][0]=='-t':
         }
+
                count=True
}
+
        elif opts[0][0][0]=='-f':
if ($ret == 0 ) { print "OK: All bean counters fine \n" ; }
+
                count=False
# print Dumper(%beancounters_old) ;
 
# print "\n";
 
exit($ret);
 
'
 
  
RET=$?
 
fi
 
  
echo "$DATA" > $FILE
+
data_read=pickle_data(current_data,'read',count,picklefilepath)
exit $RET
+
current_data = compare_data(beancounter_data,data_read,count)
 +
pickle_data(current_data,'write',count,picklefilepath)
 
</source>
 
</source>
The script needs to read the <code>/proc/user_beancounters</code> file, which is normally readable only by root. It is therefore necessary to append a line like the following to <code>/etc/sudoers</code> (adapt the user name and path to your system):
 
nagios  ALL=NOPASSWD: /usr/local/nagios/libexec/check_ubc
 
  
Also remember to reflect this in your <code>nrpe.cfg</code>, so that the script is called with sudo:
 
command[check_ubc]=sudo /usr/local/nagios/libexec/check_ubc
 
  
[[Category: Monitoring]]
+
 
 +
[[Category: HOWTO]]

Please note that all contributions to OpenVZ Virtuozzo Containers Wiki may be edited, altered, or removed by other contributors. If you don't want your writing to be edited mercilessly, then don't submit it here.
If you are going to add external links to an article, read the External links policy first!

To edit this page, please answer the question that appears below (more info):

Cancel Editing help (opens in new window)