Difference between revisions of "Monitoring /proc/user beancounters with nagios"

From OpenVZ Virtuozzo Containers Wiki
Jump to: navigation, search
Line 4: Line 4:
 
Works as nagios-plugin with option '-f' or reports an increase of a failcnt-value by mail if run e.g. as a cronjob with option '-t'. We use it with both cases to be sure that we see a peak in case it happened between the nagios-checks.
 
Works as nagios-plugin with option '-f' or reports an increase of a failcnt-value by mail if run e.g. as a cronjob with option '-t'. We use it with both cases to be sure that we see a peak in case it happened between the nagios-checks.
  
<source lang=python>
+
The most current version of this script is available at http://github.com/peletiah/openvz/tree/master
#!/usr/bin/python
 
# Copyright (C) 2008 Christian Benke
 
# Distributed under the terms of the GNU General Public License v2
 
# v0.2 2008-04-04
 
# Christian Benke <c.benke  gmail  com>
 
  
import string
 
import pickle
 
import sys
 
import getopt
 
import re
 
  
# ---- module-level state shared by the functions below ----

# Placeholder container id at module level.
veid = ''

# Parsed beancounter rows, keyed by container id and then by resource name;
# filled in by compare_data().
current_data = {}

# Result of getopt.getopt(); assigned by the script section at the bottom.
opts = None

# Open handle on /proc/user_beancounters; assigned by the script section.
beancounter_data = None

# Where the snapshot of the previous run is pickled to / loaded from.
picklefilepath = '/tmp/beancounters_pickledump'
 
  
#-------- find the hostname for each veid ---:
+
==Alternative Script==
 
 
def find_veid(veid):
    """Return the short hostname of the container identified by ``veid``.

    VEID 0 is reported as ``'OpenVZ HN'`` without touching the filesystem.
    For any other VEID, ``/etc/vz/conf/<veid>.conf`` is scanned for the first
    line that contains ``HOSTNAME`` and an ``=``.  Double quotes and
    surrounding whitespace are removed from the value, and everything from
    the first dot onwards is dropped (``web1.example.com`` -> ``web1``).

    Returns ``None`` when the config file has no usable HOSTNAME line.
    Raises ``IOError`` when the config file cannot be opened.
    """
    # Decide this first: VEID 0 needs no config file at all.
    if int(veid) == 0:
        return 'OpenVZ HN'

    conf_path = '/etc/vz/conf/' + str(veid) + '.conf'
    # The with-block guarantees the handle is closed on every exit path.
    with open(conf_path, 'r') as veid_conf:
        for line in veid_conf:
            if 'HOSTNAME' not in line:
                continue
            _key, separator, value = line.partition('=')
            if not separator:
                # Mentions HOSTNAME but is not an assignment; keep looking.
                continue
            fqdn = value.replace('"', '').strip()
            return fqdn.split('.')[0]
    return None
 
 
 
# ---------- send mail in case of a counter-change
 
 
 
def send_mail(count_change, recipient="to@example.com", sender="root",
              sendmail="/usr/lib/sendmail"):
    """Mail a report about increased failcnt values.

    The message is piped into ``<sendmail> -t``.

    count_change -- text listing the changed counters; used as the tail of
                    the message body.
    recipient    -- address written to the ``To:`` header.
    sender       -- address written to the ``From:`` header.
    sendmail     -- path of the sendmail binary to run.

    If closing the pipe reports a status, it is printed.
    """
    import os

    message = (
        "From: %s\n"
        "To: %s\n"
        "Subject: Beancounters changed in the last 5 minutes\n"
        "\n"  # blank line separating headers from body
        "The Beancounter-failcnt value of the following veid(s) and resource(s) has \n"
        "increased in the last 5 minutes:\n\n"
        "%s"
    ) % (sender, recipient, count_change)

    pipe = os.popen("%s -t" % sendmail, "w")
    try:
        pipe.write(message)
    finally:
        # Always close the pipe, even if the write failed.
        sts = pipe.close()

    if sts is not None:
        print("Sendmail exit status %s" % (sts,))
 
 
 
#---------- compare the failcnt-values
 
 
 
def cntcheck(data_read, current_data, veid, fields, count, count_change):
    """Append a report line if the failcnt of one resource has increased.

    data_read    -- snapshot of the previous run (same layout as
                    current_data), or a falsy value / the string '0' when
                    there is no previous snapshot.
    current_data -- rows of the current run: current_data[veid][resource]
                    is the list of columns, with failcnt at index 5.
    veid         -- container id of the row being checked.
    fields       -- columns of the row being checked; fields[0] is the
                    resource name.
    count        -- only when truthy (failcnt mode) is anything compared.
    count_change -- report text accumulated so far.

    Returns count_change, extended by one line when the failcnt grew.
    """
    # '0' marks "no previous snapshot"; compare by value, not identity.
    if not count or not data_read or data_read == '0':
        return count_change

    resource = fields[0]
    # A container or resource that did not exist during the previous run has
    # nothing to be compared against.
    previous = data_read.get(veid, {}).get(resource)
    if previous is None:
        return count_change

    old_failcnt = previous[5]
    new_failcnt = current_data[veid][resource][5]
    # The columns are strings; compare numerically so that e.g. 9 -> 10 is
    # recognised as an increase.
    if int(old_failcnt) < int(new_failcnt):
        hostname = find_veid(veid)
        count_change = (str(count_change) + str(hostname) + ': ' + str(resource)
                        + ' failcnt has changed from ' + str(old_failcnt)
                        + ' to ' + str(new_failcnt) + '\n')
    return count_change
 
 
 
#---------- compare the current value with barrier/limit value
 
 
 
def barriercheck(data_read, current_data, veid, fields, count, barrier_break,
                 threshold=0.9):
    """Append a warning if a resource is close to its barrier or limit.

    data_read     -- unused; kept so the signature stays unchanged.
    current_data  -- rows of the current run: current_data[veid][resource]
                     is the list of columns.
    veid          -- container id of the row being checked.
    fields        -- columns of the row being checked; fields[0] is the
                     resource name.
    count         -- the check only runs when this is falsy (barrier mode).
    barrier_break -- warning text accumulated so far.
    threshold     -- fraction of the barrier/limit above which the current
                     value is reported (default 0.9).

    Returns barrier_break, extended by '<hostname>: <resource> ' when the
    value in column 1 exceeds threshold times the reference column.
    """
    if count:
        return barrier_break

    entry = current_data[veid][fields[0]]
    resource = entry[0]
    # For oomguarpages and physpages only the limit-value (column 4) is
    # relevant; every other resource is measured against column 3.
    if resource in ('oomguarpages', 'physpages'):
        reference = int(entry[4])
    else:
        reference = int(entry[3])

    if int(entry[1]) > reference * threshold:
        hostname = find_veid(veid)
        barrier_break = str(barrier_break) + str(hostname) + ': ' + str(resource) + ' '
    return barrier_break
 
 
 
 
 
#------------ read user_beancounter and handle the result of the comparison subroutines
 
 
 
def compare_data(beancounter_data, data_read, count):
    """Parse the beancounter listing and act on the check results.

    beancounter_data -- iterable of lines in /proc/user_beancounters format.
    data_read        -- snapshot of the previous run, passed on to the
                        check functions.
    count            -- truthy: failcnt mode; falsy: barrier mode.

    Every parsed row is stored in the module-level ``current_data`` dict
    (current_data[veid][resource] = list of columns) and handed to
    barriercheck() and cntcheck().

    Barrier mode: prints the collected warnings and exits with status 2, or
    prints 'All Beancounters OK' and exits with status 0.
    Failcnt mode: mails the collected changes (if any) via send_mail() and
    returns ``current_data``.
    """
    count_change = ''
    barrier_break = ''
    veid = None  # id of the container whose rows are currently being read

    for line in beancounter_data:
        # Skip the version line, the column header and the dummy rows.
        if 'Version' in line or 'uid' in line or 'dummy' in line:
            continue

        fields = line.split()
        if len(fields) == 7:
            # First row of a container: "<veid>: resource held ...".
            veid = int(fields[0][:-1])
            fields.pop(0)  # drop the veid column
            current_data[veid] = {}
        elif veid is None or len(fields) < 6:
            # Blank/short line, or a row before any container header:
            # nothing that could be attributed to a container.
            continue
        current_data[veid][fields[0]] = fields

        # ------ check barrier/limit
        barrier_break = barriercheck(data_read, current_data, veid, fields,
                                     count, barrier_break)
        # ------ check failcnt
        count_change = cntcheck(data_read, current_data, veid, fields,
                                count, count_change)

    if not count:
        if barrier_break:
            print(barrier_break)
            sys.exit(2)
        print('All Beancounters OK')
        sys.exit(0)

    if count_change:
        send_mail(count_change)
    return current_data
 
 
 
 
 
# ----- pickle data - read or write
 
 
 
def pickle_data(current_data, action, count, picklefilepath):
    """Store or load the beancounter snapshot as a pickle file.

    current_data   -- snapshot to store (only used for action 'write').
    action         -- 'write' or 'read'; anything else does nothing.
    count          -- mode flag, passed on to compare_data() when the
                      snapshot has to be seeded.
    picklefilepath -- path of the pickle file.

    'write': dumps current_data; an empty snapshot is not written and a
    notice is printed instead.  Returns None.

    'read': returns the unpickled snapshot (a notice is printed if it is
    empty).  If the file cannot be opened or holds no usable pickle, a fresh
    snapshot is taken with compare_data() from the module-level
    ``beancounter_data``, written to picklefilepath, and None is returned.
    """
    # NOTE(review): pickle.load() runs code embedded in a crafted file; the
    # dump should live where only the user running this script can write.
    if action == 'write':
        if not current_data:
            print('current_data is empty: ' + str(current_data))
            return None
        # Binary mode is what pickle expects; the with-block closes the
        # handle even if dump() fails.
        with open(picklefilepath, 'wb') as picklefile:
            pickle.dump(current_data, picklefile)
        return None

    if action == 'read':
        try:
            with open(picklefilepath, 'rb') as picklefile:
                data_read = pickle.load(picklefile)
        except (IOError, EOFError, pickle.UnpicklingError):
            # No usable dump yet: seed it from the current values.  '0'
            # tells the check functions there is nothing to compare with.
            current_data = compare_data(beancounter_data, '0', count)
            with open(picklefilepath, 'wb') as picklefile:
                pickle.dump(current_data, picklefile)
            return None
        if not data_read:
            print('DATA_READ IS NONE:' + str(data_read))
        return data_read

    return None
 
 
 
# ------- print script usage
 
 
 
def usage(prog="check_beancounter.py"):
    """Print the command-line help text.

    prog -- script name shown in the title and synopsis lines.
    """
    print("""
%(prog)s : Check if failcounters increase or resource-values break barriers or limits

%(prog)s [-tfh]

-h                  print this message

-t                  Check if failcnt-values have increased since the last run
-f                  Check if current value of a resource is higher than barrier/limit
""" % {'prog': prog})
 
 
 
 
 
# ------- script entry: parse options, then run the selected check

try:
    opts = getopt.getopt(sys.argv[1:], 'thf')
except getopt.GetoptError:
    # Unknown option: show the help instead of a traceback.  3 is the
    # "UNKNOWN" status in the Nagios plugin convention.
    usage()
    sys.exit(3)

# opts[0] is the list of (option, value) pairs; only the first option
# selects the mode.
if not opts[0] or opts[0][0][0] == '-h':
    usage()
    sys.exit(0)

# -t: compare failcnt values with the previous run; -f: barrier/limit check.
count = (opts[0][0][0] == '-t')

beancounter_data = open('/proc/user_beancounters', 'r')
try:
    # The previous snapshot is only needed for the failcnt comparison.
    if count:
        data_read = pickle_data(current_data, 'read', count, picklefilepath)
    else:
        data_read = None
    current_data = compare_data(beancounter_data, data_read, count)
    pickle_data(current_data, 'write', count, picklefilepath)
finally:
    # Runs on sys.exit() from compare_data() as well.
    beancounter_data.close()
 
</source>
 
  
 
Here is an alternative script, also written in python:
 
Here is an alternative script, also written in python:

Revision as of 11:21, 24 November 2008

To be added locally on the OpenVZ hardware node (HN), in /etc/nagios/nrpe_local.conf.

Works as a Nagios plugin when run with option '-f'. When run with option '-t' (e.g. as a cron job), it reports any increase of a failcnt value by mail. We use both modes so that a peak which occurs between two Nagios checks is still noticed.

The most current version of this script is available at http://github.com/peletiah/openvz/tree/master


Alternative Script

Here is an alternative script, also written in python: OpenVZ Nagios Bean Counters Script