Monitoring /proc/user_beancounters with Nagios

From OpenVZ Virtuozzo Containers Wiki
Revision as of 08:54, 4 April 2008 by Benko (talk | contribs) (Nagios beancounters monitoring moved to Monitoring /proc/user beancounters with nagios: adapted to what seems to be the convention for monitoring articles)
Jump to: navigation, search

Monitoring /proc/user_beancounters with nagios

To use the script via NRPE, add it locally on the OpenVZ hardware node (HN) to /etc/nagios/nrpe_local.conf.
The script works as a Nagios plugin when called with option '-f', or reports an increase of a failcnt value by mail when run (e.g. as a cron job) with option '-t'. We use both modes, so that a peak that occurs between two Nagios checks is still noticed:

#!/usr/bin/python
# Copyright (C) 2008 Christian Benke
# Distributed under the terms of the GNU General Public License v2
# v0.1 2008-04-03
# Christian Benke <benkokakao  gmail  com>

import string
import pickle
import sys
import getopt
import re
import smtplib
import socket

# Module-level state used by the functions and the script body below.

# Path of the pickle dump that carries the counters from one run to the next.
picklefilepath = '/tmp/beancounters_pickledump'

# Beancounter table of the hardware node; opened once at start-up and later
# iterated line by line by compare_data().
beancounter_data = open('/proc/user_beancounters')

# Placeholder; replaced by the getopt result in the script body.
opts = None

# Parsed counters, keyed by veid and then by resource name.
current_data = {}

# NOTE(review): no visible code reads this module-level value (compare_data
# assigns its own local ``veid``) -- confirm before removing.
veid = ''

#-------- find the hostname for each veid ---:

def find_veid(veid, conf_dir='/etc/vz/conf'):
    """Return the short hostname configured for the container ``veid``.

    The container configuration ``<conf_dir>/<veid>.conf`` is scanned for
    the first ``HOSTNAME=...`` assignment.  Surrounding quotes are removed
    and only the part before the first dot is returned, e.g.
    ``HOSTNAME="web1.example.com"`` yields ``'web1'``.

    Parameters:
        veid     -- container id (int or str); used to build the file name.
        conf_dir -- directory holding the ``<veid>.conf`` files.

    Returns:
        The short hostname, or None when the file has no HOSTNAME
        assignment.

    Raises:
        IOError if the configuration file cannot be opened.
    """
    conf_path = conf_dir.rstrip('/') + '/' + str(veid) + '.conf'
    veid_conf = open(conf_path, 'r')
    try:
        for line in veid_conf:
            key, separator, value = line.partition('=')
            # Only a real "HOSTNAME=" assignment counts: comment lines or
            # other settings that merely mention HOSTNAME are skipped.
            if not separator or key.strip() != 'HOSTNAME':
                continue
            fqdn = value.strip().strip('"\'')
            return fqdn.split('.')[0]
    finally:
        # Close the handle on every exit path (match, no match, error).
        veid_conf.close()
    return None

# ---------- send mail in case of a counter-change
def send_mail(count_change, mailto='to@example.com', smtp_host='localhost'):
    """Mail a report about increased failcnt values.

    Parameters:
        count_change -- preformatted report text, appended to the mail body
                        as-is (the caller builds one line per changed
                        counter).
        mailto       -- recipient address.
        smtp_host    -- SMTP server the mail is handed to.

    The sender is ``root@<fully qualified name of this machine>``.  The
    original code referenced an undefined name ``host`` here and therefore
    failed with NameError on every call.

    Raises:
        Whatever smtplib raises when connecting or sending fails.
    """
    mailfrom = 'root@' + socket.getfqdn()
    # NOTE(review): the "5 minutes" wording presumes the script is run
    # every five minutes -- adjust if the cron interval differs.
    mailsubject = 'Beancounters changed in the last 5 minutes'
    mailbody = 'The Beancounter-failcnt value of the following veid(s) and resource(s) has \nincreased in the last 5 minutes:\n\n'
    # Header lines first, then an empty line, then the body.
    message = ('From:' + mailfrom
               + '\nTo:' + mailto
               + '\nSubject:' + mailsubject
               + '\n\n' + mailbody + count_change)
    server = smtplib.SMTP(smtp_host)
    try:
        server.sendmail(mailfrom, [mailto], message)
    finally:
        # End the SMTP session even when sending failed.
        server.quit()

#------------read raw and compare data from user_beancounters

# Resources for which the limit, not the barrier, is the relevant threshold.
_LIMIT_ONLY_RESOURCES = ('oomguarpages', 'physpages')


def _parse_beancounters(lines):
    """Turn beancounter lines into a list of ``(veid, fields)`` records."""
    records = []
    veid = None
    for line in lines:
        # Header lines and the "dummy" placeholder rows carry no counters.
        if 'Version' in line or 'uid' in line or 'dummy' in line:
            continue
        fields = line.split()
        if not fields:
            continue
        if len(fields) == 7:
            # First row of a container: the leading column is "<veid>:".
            veid = int(fields[0][:-1])
            fields = fields[1:]
        elif veid is None:
            # Counter row before any container header: cannot be assigned.
            continue
        records.append((veid, fields))
    return records


def _barrier_breaks(records):
    """Return the 'veid: resource ' items whose level exceeds 90% of the threshold."""
    breaks = []
    for veid, fields in records:
        if fields[0] in _LIMIT_ONLY_RESOURCES:
            threshold = int(fields[4])
        else:
            threshold = int(fields[3])
        if int(fields[1]) > threshold * 0.9:
            breaks.append(str(veid) + ': ' + str(fields[0]) + ' ')
    return ''.join(breaks)


def _failcnt_changes(previous, records):
    """Return one report line per resource whose failcnt grew since ``previous``."""
    changes = []
    for veid, fields in records:
        old_fields = previous.get(veid, {}).get(fields[0])
        if old_fields is None:
            # Container or resource was not present in the previous run.
            continue
        # Compare numerically: as strings, '5' < '10' is False.
        if int(old_fields[5]) < int(fields[5]):
            if int(veid) != 0:
                hostname = find_veid(veid)
            else:
                hostname = 'OpenVZ Hardware Node'
            changes.append(str(hostname) + ': ' + str(fields[0])
                           + ' failcnt has changed from ' + str(old_fields[5])
                           + ' to ' + str(fields[5]) + '\n')
    return ''.join(changes)


def compare_data(beancounter_data, data_read, count):
    """Parse the beancounter table and run the check selected by ``count``.

    Each counter row is stored as a list of strings with the veid column
    removed.  Index 0 is the resource name, 1 the current level, 3 the
    barrier, 4 the limit and 5 the failcnt (index 2 is not used here,
    presumably maxheld).

    Parameters:
        beancounter_data -- iterable of lines in /proc/user_beancounters
                            format.  If the object has a ``seek`` method it
                            is rewound first, so an already-read file
                            object can be passed again.
        data_read        -- counters of the previous run as returned by an
                            earlier call; None, an empty value or the
                            sentinel '0' mean "nothing to compare against".
        count            -- False: threshold check.  Prints the offending
                            'veid: resource ' items and exits with status 2
                            when a level exceeds 90% of its barrier (limit
                            for oomguarpages and physpages); otherwise
                            prints 'All Beancounters OK' and exits with
                            status 0.
                            True: failcnt tracking.  Mails a report via
                            send_mail() when a failcnt grew compared with
                            ``data_read``.

    Returns:
        For ``count == True`` a dict ``{veid: {resource: fields}}`` with
        the current counters.  With ``count == False`` the function does
        not return (sys.exit).  Any other ``count`` yields None.
    """
    rewind = getattr(beancounter_data, 'seek', None)
    if rewind is not None:
        # The same file object is handed in more than once per run; without
        # rewinding, a second call would see no lines at all.
        rewind(0)

    records = _parse_beancounters(beancounter_data)
    parsed = {}
    for veid, fields in records:
        parsed.setdefault(veid, {})[fields[0]] = fields

    if count == False:
        barrier_break = _barrier_breaks(records)
        if barrier_break:
            print(barrier_break)
            sys.exit(2)
        print('All Beancounters OK')
        sys.exit(0)

    if count == True:
        if data_read and data_read != '0':
            count_change = _failcnt_changes(data_read, records)
            if count_change:
                send_mail(count_change)
        return parsed
    return None


# ----- pickle data - read or write

def _dump_pickle(data, path):
    """Pickle ``data`` into ``path``, closing the file on every path."""
    picklefile = open(path, 'wb')
    try:
        pickle.dump(data, picklefile)
    finally:
        picklefile.close()


def pickle_data(current_data, action, count, picklefilepath):
    """Store or load the counters that are carried over between runs.

    Parameters:
        current_data   -- counters to store; only used for action 'write'.
        action         -- 'write' to dump ``current_data``, 'read' to load
                          the previous dump.
        count          -- run mode; only passed on to compare_data() when
                          the dump has to be created (see below).
        picklefilepath -- path of the dump file.

    Returns:
        'read':  the loaded data.  A message is printed when the loaded
                 value is empty.  When the dump is missing or unreadable,
                 a fresh dump is created from the current beancounters and
                 None is returned, so the caller has nothing to compare
                 against on this run.
        'write': always None.  Empty data is not written; a message is
                 printed instead.

    Raises:
        IOError if the dump file cannot be written.
    """
    if action == 'write':
        if current_data:
            _dump_pickle(current_data, picklefilepath)
        else:
            print('current_data is empty: ' + str(current_data))
        return None

    if action == 'read':
        # NOTE(security): pickle.load executes code embedded in the file.
        # The default dump path is a fixed name in world-writable /tmp; a
        # directory only this script's user can write to would be safer.
        try:
            picklefile = open(picklefilepath, 'rb')
            try:
                data_read = pickle.load(picklefile)
            finally:
                picklefile.close()
        except (IOError, EOFError, pickle.UnpicklingError):
            # First run, or a truncated/corrupt dump: start over with a
            # fresh snapshot ('0' tells compare_data there is no history).
            fresh_data = compare_data(beancounter_data, '0', count)
            _dump_pickle(fresh_data, picklefilepath)
            return None
        if not data_read:
            print('DATA_READ IS NONE:' + str(data_read))
        return data_read

    return None

# ------- print script usage

def usage(prog="check_beancounters.py"):
    """Print the command-line help to standard output.

    Parameters:
        prog -- program name shown in the help text.  The original ignored
                this argument; with the default value the output is
                unchanged.
    """
    print("""
%(prog)s : Check if resource-values break barriers or limits and failcounters increase

 %(prog)s [-tfh]

 -h                  print this message

 -t                  Check if failcnt-values have increased since the last run
 -f                  Check if current value of a resource is higher than barrier/limit
 """ % {'prog': prog})


def main(argv=None):
    """Command-line entry point.

    Parameters:
        argv -- option list; defaults to ``sys.argv[1:]``.

    Only the first option given is evaluated:
        (none) / -h  print the usage text and exit with status 0
        -t           failcnt tracking (count = True)
        -f           barrier/limit check (count = False)

    An unknown option prints the getopt error and the usage text, then
    exits with status 3, which Nagios treats as UNKNOWN.  Previously this
    ended in a traceback.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        options = getopt.getopt(argv, 'thf')[0]
    except getopt.GetoptError:
        # sys.exc_info() instead of "except ... as err" keeps this line
        # valid on old Python 2 interpreters as well.
        print(str(sys.exc_info()[1]))
        usage()
        sys.exit(3)

    if not options or options[0][0] == '-h':
        usage()
        sys.exit(0)
    # getopt only lets -t, -h and -f through, so anything else here is -f.
    count = options[0][0] == '-t'

    try:
        data_read = pickle_data(current_data, 'read', count, picklefilepath)
        fresh_data = compare_data(beancounter_data, data_read, count)
        pickle_data(fresh_data, 'write', count, picklefilepath)
    finally:
        # Also runs when compare_data() leaves through sys.exit().
        beancounter_data.close()


if __name__ == '__main__':
    main()