Difference between revisions of "Monitoring /proc/user_beancounters with nagios"

From OpenVZ Virtuozzo Containers Wiki
Jump to: navigation, search
m (Nagios beancounters monitoring moved to Monitoring /proc/user beancounters with nagios: adapted to what seems to be the convention for monitoring articles)
Line 8: Line 8:
 
# Copyright (C) 2008 Christian Benke
 
# Copyright (C) 2008 Christian Benke
 
# Distributed under the terms of the GNU General Public License v2
 
# Distributed under the terms of the GNU General Public License v2
# v0.1 2008-04-03
+
# v0.2 2008-04-04
 
# Christian Benke <benkokakao  gmail  com>
 
# Christian Benke <benkokakao  gmail  com>
  
Line 29: Line 29:
 
def find_veid(veid):
 
def find_veid(veid):
 
         veid_conf=open('/etc/vz/conf/' + str(veid) + '.conf','r')
 
         veid_conf=open('/etc/vz/conf/' + str(veid) + '.conf','r')
         for line in veid_conf:
+
         if int(veid) != 0:
                if "HOSTNAME" in line:
+
                for line in veid_conf:
                        quotes=re.compile("\"")
+
                        if "HOSTNAME" in line:
                        line=quotes.sub("",line)
+
                                quotes=re.compile("\"")
                        linefeed=re.compile("\n")
+
                                line=quotes.sub("",line)
                        line=linefeed.sub("",line)
+
                                linefeed=re.compile("\n")
                        fqdn=re.split('=',line)
+
                                line=linefeed.sub("",line)
                        hostname=re.split('\.',fqdn[1])[0]
+
                                fqdn=re.split('=',line)
                        return hostname
+
                                hostname=re.split('\.',fqdn[1])[0]
 +
                                return hostname
 +
        else:
 +
                hostname='OpenVZ HN'
 +
                return hostname
  
 
# ---------- send mail in case of a counter-change
 
# ---------- send mail in case of a counter-change
 +
 
def send_mail(count_change):
 
def send_mail(count_change):
         mailfrom = 'root@' + str(host)
+
         sendmail = "/usr/lib/sendmail" # sendmail location
         mailto = 'to@example.com'
+
        import os
         mailsubject = 'Beancounters changed in the last 5 minutes'
+
        p = os.popen("%s -t" % sendmail, "w")
         mailbody = 'The Beancounter-failcnt value of the following veid(s) and resource(s) has \nincreased in the last 5 minutes:\n\n'
+
        p.write("From: root\n")
         server = smtplib.SMTP('localhost')
+
         p.write("To: benke@inqnet.at\n")
         server.sendmail(mailfrom, [mailto], '''\
+
         p.write("Subject: Beancounters changed in the last 5 minutes\n")
From:''' + mailfrom + '''\
+
        p.write("\n") # blank line separating headers from body
\nTo:''' + mailto + '''\
+
         p.write("The Beancounter-failcnt value of the following veid(s) and resource(s) has \n")
\nSubject:''' + mailsubject + '''\
+
        p.write("increased in the last 5 minutes:\n\n")
 +
        p.write(count_change)
 +
         sts = p.close()
 +
         if sts is not None:
 +
                print "Sendmail exit status", sts
 +
 
 +
#---------- compare the failcnt-values
 +
 
 +
def cntcheck(data_read,current_data,veid,fields,count,count_change):
 +
        if data_read and count == True and data_read is not '0': #comparing counters of new data with previous run
 +
                if data_read[veid][fields[0]][5] < current_data[veid][fields[0]][5]:
 +
                        hostname=find_veid(veid)
 +
                        count_change=str(count_change) + str(hostname) + ': ' + str(fields[0]) + ' failcnt has changed from ' + data_read[veid][fields[0]][5] + ' to ' + str(current_data[veid][fields[0]][5]) + '\n'
 +
        return count_change
 +
 
 +
#---------- compare the current value with barrier/limit value
 +
 
 +
def barriercheck(data_read,current_data,veid,fields,count,barrier_break):
 +
        if count == False:     #comparing current level with barrier/limit
 +
                if current_data[veid][fields[0]][0] == 'oomguarpages': #for oomguarpages and physpages only the limit-value is relevant
 +
                        if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][4])*0.9:
 +
                                hostname=find_veid(veid)
 +
                                barrier_break = str(barrier_break) + str(hostname) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
 +
                elif current_data[veid][fields[0]][0] == 'physpages':
 +
                        if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][4])*0.9:
 +
                                hostname=find_veid(veid)
 +
                                barrier_break = str(barrier_break) + str(hostname) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
 +
                else:
 +
                        if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][3])*0.9:
 +
                                hostname=find_veid(veid)
 +
                                barrier_break = str(barrier_break) + str(hostname) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
 +
        return barrier_break
  
\n''' + mailbody + count_change)
 
        server.quit()
 
  
#------------read raw and compare data from user_beancounters
+
#------------ read user_beancounter and handle the result of the comparison subroutines
  
 
def compare_data(beancounter_data,data_read,count):
 
def compare_data(beancounter_data,data_read,count):
 +
        count_change=str()
 
         barrier_break=str()
 
         barrier_break=str()
        count_change=str()
 
 
         for line in beancounter_data:
 
         for line in beancounter_data:
 
                 if 'Version' in line or 'uid' in line or 'dummy' in line:
 
                 if 'Version' in line or 'uid' in line or 'dummy' in line:
Line 73: Line 107:
 
                                 i=i+1
 
                                 i=i+1
 
                                 current_data[veid][fields[0]]=fields
 
                                 current_data[veid][fields[0]]=fields
                if data_read and count == True and data_read is not '0': #comparing counters of new data with previous run
 
                        if data_read[veid][fields[0]][5] < current_data[veid][fields[0]][5]:
 
                                if int(veid) != 0:
 
                                        hostname=find_veid(veid)
 
                                else:
 
                                        hostname='OpenVZ Hardware Node'
 
                                count_change=str(count_change) + str(hostname) + ': ' + str(fields[0]) + ' failcnt has changed from ' + data_read[veid][fields[0]][5]
 
+ ' to ' + str(current_data[veid][fields[0]][5]) + '\n'
 
  
                 if count == False:      #comparing current level with barrier/limit
+
                 # ------ check failcnt
                         if current_data[veid][fields[0]][0] == 'oomguarpages': #for oomguarpages and physpages only the limit-value is relevant
+
                         barrier_break=barriercheck(data_read,current_data,veid,fields,count,barrier_break)
                                if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][4])*0.9:
+
 
                                        barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
+
 
                        elif current_data[veid][fields[0]][0] == 'physpages':
+
                # ------ check barrier/limit
                                if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][4])*0.9:
+
                         count_change=cntcheck(data_read,current_data,veid,fields,count,count_change)
                                        barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
+
 
                         else:
 
                                if int(current_data[veid][fields[0]][1]) > int(current_data[veid][fields[0]][3])*0.9:
 
                                        barrier_break = str(barrier_break) + str(veid) + ': ' + str(current_data[veid][fields[0]][0]) + ' '
 
 
         if barrier_break and count == False:
 
         if barrier_break and count == False:
 
                 print barrier_break
 
                 print barrier_break
Line 100: Line 123:
  
 
         if count_change and count == True:
 
         if count_change and count == True:
 +
                print 'sending mail'
 
                 send_mail(count_change)
 
                 send_mail(count_change)
 +
                print 'mail sent'
 
                 return current_data
 
                 return current_data
 
         elif count == True:
 
         elif count == True:
Line 136: Line 161:
 
# ------- print script usage
 
# ------- print script usage
  
def usage(prog="check_beancounters.py"):
+
def usage(prog="check_beancounter.py"):
 
     print """
 
     print """
check_beancounters.py : Check if resource-values break barriers or limits and failcounters increase
+
check_beancounter.py : Check if failcounters increase or resource-values break barriers or limits
  
  check_beancounters.py [-tfh]
+
  check_beancounter.py [-tfh]
  
 
  -h                  print this message
 
  -h                  print this message

Revision as of 15:48, 4 April 2008

Monitoring /proc/user_beancounters with nagios

To be added locally on the VZ HN to /etc/nagios/nrpe_local.conf
Works as a Nagios plugin with option '-f', or reports an increase of a failcnt value by mail when run with option '-t', e.g. as a cron job. We use both modes so that we still notice a peak that occurred between two Nagios checks:

#!/usr/bin/python
# Copyright (C) 2008 Christian Benke
# Distributed under the terms of the GNU General Public License v2
# v0.2 2008-04-04
# Christian Benke <benkokakao  gmail  com>

import string
import pickle
import sys
import getopt
import re
import smtplib
import socket

# ---------- module-level state shared by the functions below

# Pickle file that carries the snapshot from one run to the next.
picklefilepath = '/tmp/beancounters_pickledump'
# Opened once when the script starts; compare_data() iterates over its lines.
beancounter_data = open('/proc/user_beancounters', 'r')
# {veid: {resource: fields}}, filled by compare_data().
current_data = {}
# Replaced by the getopt result at the bottom of the script.
opts = None
veid = ''

#-------- find the hostname for each veid ---:

def find_veid(veid):
        """Return the short hostname configured for container *veid*.

        veid 0 stands for the hardware node itself and maps to the fixed
        label 'OpenVZ HN' without touching any config file.  For every other
        veid the HOSTNAME= entry of /etc/vz/conf/<veid>.conf is read, quote
        characters are dropped and only the part before the first dot is
        returned.  If the file contains no HOSTNAME= entry, the veid itself
        is returned as a string so that reports still identify the container.

        Raises IOError if the container config file cannot be opened.
        """
        if int(veid) == 0:
                return 'OpenVZ HN'

        veid_conf = open('/etc/vz/conf/' + str(veid) + '.conf', 'r')
        # try/finally instead of "with" so that old interpreters keep working
        try:
                for line in veid_conf:
                        entry = line.strip()
                        # only a real assignment counts, not a commented-out one
                        if not entry.startswith('HOSTNAME='):
                                continue
                        fqdn = entry.split('=', 1)[1].replace('"', '').replace("'", '')
                        return fqdn.split('.')[0]
        finally:
                veid_conf.close()
        return str(veid)

# ---------- send mail in case of a counter-change

def send_mail(count_change, mailto="benke@inqnet.at", sendmail="/usr/lib/sendmail"):
        """Mail a failcnt-change report through the local sendmail binary.

        count_change -- report text appended to the fixed introduction
        mailto       -- address written into the To: header
        sendmail     -- path of the sendmail binary; it is started with the
                        -t flag through a shell, so the value must come from
                        trusted configuration only

        Prints the exit status if the mailer reports one; returns nothing.
        """
        import os
        p = os.popen("%s -t" % sendmail, "w")
        try:
                p.write("From: root\n")
                p.write("To: %s\n" % mailto)
                p.write("Subject: Beancounters changed in the last 5 minutes\n")
                p.write("\n") # blank line separating headers from body
                p.write("The Beancounter-failcnt value of the following veid(s) and resource(s) has \n")
                p.write("increased in the last 5 minutes:\n\n")
                p.write(count_change)
        finally:
                # always reap the child, even when one of the writes failed
                sts = p.close()
        if sts is not None:
                print("Sendmail exit status %s" % sts)

#---------- compare the failcnt-values

def cntcheck(data_read,current_data,veid,fields,count,count_change):
        """Append a report line if the failcnt of one resource has increased.

        data_read    -- snapshot of the previous run ({veid: {resource: fields}});
                        a false value or the marker string '0' means "none"
        current_data -- snapshot of the current run, same layout
        veid         -- container the current line belongs to
        fields       -- columns of the current line; fields[0] is the resource
                        name, index 5 of a stored entry is its failcnt
        count        -- the check only runs when this is True (option -t)
        count_change -- report text accumulated so far

        Returns count_change, extended by one line when the failcnt grew.
        A container or resource missing from the previous snapshot is
        skipped, since there is nothing to compare it with.
        """
        if not (data_read and count == True and data_read != '0'):
                return count_change

        resource = fields[0]
        try:
                previous = data_read[veid][resource][5]
        except (KeyError, IndexError):
                # e.g. a container that was started after the previous run
                return count_change
        current = current_data[veid][resource][5]

        # The counters are stored as text; compare them as numbers, because
        # as strings '9' < '10' is False and the increase would be missed.
        if int(previous) < int(current):
                hostname = find_veid(veid)
                count_change = (str(count_change) + str(hostname) + ': ' + str(resource)
                                + ' failcnt has changed from ' + str(previous)
                                + ' to ' + str(current) + '\n')
        return count_change

#---------- compare the current value with barrier/limit value

def barriercheck(data_read,current_data,veid,fields,count,barrier_break):
        """Append "<hostname>: <resource> " if a resource is above 90% of its bound.

        data_read     -- not used here
        current_data  -- snapshot of the current run ({veid: {resource: fields}})
        veid          -- container the current line belongs to
        fields        -- columns of the current line; fields[0] is the resource name
        count         -- the check only runs when this is False (option -f)
        barrier_break -- report text accumulated so far

        Returns barrier_break, possibly extended by one entry.
        """
        if count != False:
                return barrier_break

        entry = current_data[veid][fields[0]]
        resource = entry[0]
        # For oomguarpages and physpages only the limit (index 4) is relevant,
        # every other resource is measured against its barrier (index 3).
        if resource in ('oomguarpages', 'physpages'):
                bound_index = 4
        else:
                bound_index = 3

        if int(entry[1]) > int(entry[bound_index])*0.9:
                hostname = find_veid(veid)
                barrier_break = str(barrier_break) + str(hostname) + ': ' + str(resource) + ' '
        return barrier_break


#------------ read user_beancounter and handle the result of the comparison subroutines

def compare_data(beancounter_data,data_read,count):
        """Parse the beancounter lines and act on the result of both checks.

        beancounter_data -- iterable of lines in /proc/user_beancounters format
        data_read        -- snapshot of the previous run, handed to cntcheck()
        count            -- True: failcnt mode, False: barrier/limit mode

        Every parsed line is stored in the module-level current_data dict.
        With count False the function does not return: it prints the result
        and exits with status 2 (something is above its bound) or 0.
        With count True a mail is sent if a failcnt increased, and
        current_data is returned.
        """
        barrier_break=str()
        count_change=str()

        for line in beancounter_data:
                # header lines and the dummy resource carry nothing to check
                if 'Version' in line or 'uid' in line or 'dummy' in line:
                        continue

                fields=line.split( )
                if len(fields) == 7:
                        # first line of a container: "<veid>:" precedes the resource columns
                        veid_token=fields.pop(0)
                        veid=int(veid_token[:-1])
                        current_data[veid]=dict()
                current_data[veid][fields[0]]=fields

                # ------ check barrier/limit
                barrier_break=barriercheck(data_read,current_data,veid,fields,count,barrier_break)

                # ------ check failcnt
                count_change=cntcheck(data_read,current_data,veid,fields,count,count_change)

        if count == False:
                if barrier_break:
                        print(barrier_break)
                        sys.exit(2)
                print('All Beancounters OK')
                sys.exit(0)

        if count == True:
                if count_change:
                        print('sending mail')
                        send_mail(count_change)
                        print('mail sent')
                return current_data


# ----- pickle data - read or write

def _write_snapshot(snapshot, picklefilepath):
        """Dump *snapshot* to *picklefilepath*, closing the file even on errors."""
        # pickle data is a byte stream, hence binary mode
        picklefile = open(picklefilepath, 'wb')
        try:
                pickle.dump(snapshot, picklefile)
        finally:
                picklefile.close()


def pickle_data(current_data,action,count,picklefilepath):
        """Store or restore the beancounter snapshot between two runs.

        current_data   -- snapshot to store (only used by action 'write')
        action         -- 'write' dumps current_data, 'read' loads the
                          previous snapshot; anything else does nothing
        count          -- run mode, only handed on to compare_data() when the
                          snapshot has to be created from scratch
        picklefilepath -- location of the pickle file

        Returns the loaded snapshot for 'read'.  Returns None for 'write' and
        when the file was missing or truncated; in that case a fresh snapshot
        is taken and written instead.
        """
        if action == 'write':
                if current_data:
                        _write_snapshot(current_data, picklefilepath)
                else:
                        print('current_data is empty: ' + str(current_data))
                return
        if action != 'read':
                return

        try:
                picklefile = open(picklefilepath, 'rb')
                try:
                        # NOTE(security): unpickling can execute arbitrary code and the
                        # default path lives in world-writable /tmp.  Keep the file in
                        # a directory only the monitoring user can write to.
                        data_read = pickle.load(picklefile)
                finally:
                        picklefile.close()
        except (IOError, EOFError):
                # No usable snapshot (first run or truncated file): create one
                # now, so the next run has something to compare against.
                current_data = compare_data(beancounter_data,'0',count)
                _write_snapshot(current_data, picklefilepath)
                return

        if not data_read:
                print('DATA_READ IS NONE:' + str(data_read))
        return data_read

# ------- print script usage

def usage(prog="check_beancounter.py"):
    """Print the command-line help.

    prog -- script name shown in the help text
    """
    print("""
%(prog)s : Check if failcounters increase or resource-values break barriers or limits

 %(prog)s [-tfh]

 -h                  print this message

 -t                  Check if failcnt-values have increased since the last run
 -f                  Check if current value of a resource is higher than barrier/limit
 """ % {'prog': prog})


# ------- parse the command line and run the requested check

try:
        opts=getopt.getopt(sys.argv[1:], 'thf')
except getopt.GetoptError:
        # Unknown option: say what was wrong and show the help instead of a
        # traceback. Exit status 3 is "UNKNOWN" for Nagios plugins.
        print(str(sys.exc_info()[1]))
        usage(); sys.exit(3)

# getopt returns (option_pairs, remaining_args); only the first option is used.
if opts:
        if opts[0]==[]:
                usage(); sys.exit(0)
        elif opts[0][0][0]=='-h':
                usage(); sys.exit(0)
        elif opts[0][0][0]=='-t':
                count=True      # failcnt mode: compare with the previous run
        elif opts[0][0][0]=='-f':
                count=False     # barrier/limit mode: plugin-style check


# Load the previous snapshot, run the check, then store the new snapshot.
data_read=pickle_data(current_data,'read',count,picklefilepath)
current_data = compare_data(beancounter_data,data_read,count)
pickle_data(current_data,'write',count,picklefilepath)