|
|
(5 intermediate revisions by 3 users not shown) |
Line 1: |
Line 1: |
− | ==== Monitoring /proc/user_beancounters with nagios ====
| + | {{DISPLAYTITLE:Monitoring /proc/user_beancounters with nagios}} |
| + | Add the check locally on the VZ hardware node (HN) in <code>/etc/nagios/nrpe_local.conf</code>, and additionally run it as a standalone script from cron. |
| | | |
− | To be added locally on the VZ HN to /etc/nagios/nrpe_local.conf<br>
| + | The script works as a Nagios plugin when called with option '-c'; when run with option '-i' (e.g. as a cron job), it reports any increase of a failcnt value by mail. We use both modes so that we also notice a peak that occurs between two Nagios checks. |
− | Works as nagios-plugin with option '-f' or reports an increase of a failcnt-value by mail if run e.g. as a cronjob with option '-t'. We use it with both cases to be sure that we see a peak in case it happened between the nagios-checks: | |
| | | |
− | <source lang=python>
| + | The most current version of this script is available at http://github.com/peletiah/openvz/tree/master |
− | #!/usr/bin/python
| |
− | # Copyright (C) 2008 Christian Benke
| |
− | # Distributed under the terms of the GNU General Public License v2
| |
− | # v0.2 2008-04-04
| |
− | # Christian Benke <benkokakao gmail com>
| |
| | | |
− | import string
| |
− | import pickle
| |
− | import sys
| |
− | import getopt
| |
− | import re
| |
| | | |
# ---- module-level state shared by the functions of this script

# Placeholder for a container id.
veid = ''

# Snapshot of the current beancounter values, keyed by veid and then by
# resource name.
current_data = {}

# Result of the command line parsing (set by the main program).
opts = None

# Open handle on the beancounter file (set by the main program).
beancounter_data = None

# Where the snapshot of the previous run is stored between invocations.
picklefilepath = '/tmp/beancounters_pickledump'
| |
| | | |
− | #-------- find the hostname for each veid ---:
| + | ==Alternative Script== |
− | | |
def find_veid(veid):
    """Return the short hostname configured for container ``veid``.

    The hardware node itself (veid 0) is reported as ``'OpenVZ HN'`` without
    touching any file.  For every other veid the first usable ``HOSTNAME``
    assignment in ``/etc/vz/conf/<veid>.conf`` is taken, double quotes are
    stripped and everything from the first dot onwards is cut off.

    veid -- container id, as an int or a numeric string.

    Returns the hostname as a string.  If the config file holds no usable
    HOSTNAME assignment, the veid itself is returned as a string so that
    callers always get something printable.

    Raises IOError if the config file of a container cannot be opened.
    """
    # Decide about the hardware node first: it needs no config file at all.
    if int(veid) == 0:
        return 'OpenVZ HN'

    veid_conf = open('/etc/vz/conf/' + str(veid) + '.conf', 'r')
    try:
        for line in veid_conf:
            if "HOSTNAME" not in line:
                continue
            fqdn = line.replace('"', '').replace('\n', '').split('=')
            if len(fqdn) < 2:
                # The line mentions HOSTNAME but is not an assignment
                # (for example a comment) -- keep looking.
                continue
            return fqdn[1].split('.')[0]
    finally:
        # try/finally rather than "with" so that the script keeps running
        # on old interpreters; the handle is closed on every exit path.
        veid_conf.close()

    # No HOSTNAME entry found: fall back to the numeric id.
    return str(veid)
| |
− | | |
− | # ---------- send mail in case of a counter-change
| |
− | | |
def send_mail(count_change, recipient="to@example.com",
              sendmail="/usr/lib/sendmail"):
    """Mail a report about increased failcnt values.

    The message is piped into ``<sendmail> -t``, which reads the recipient
    from the ``To:`` header.

    count_change -- preformatted text (one line per changed counter) that
                    becomes the tail of the mail body.
    recipient    -- address placed in the ``To:`` header.
    sendmail     -- path of the sendmail binary.

    Returns nothing.  If closing the pipe reports an exit status, that
    status is printed to stdout.
    """
    import os

    message = "".join([
        "From: root\n",
        "To: %s\n" % recipient,
        "Subject: Beancounters changed in the last 5 minutes\n",
        "\n",  # blank line separating headers from body
        "The Beancounter-failcnt value of the following veid(s) and resource(s) has \n",
        "increased in the last 5 minutes:\n\n",
        count_change,
    ])

    p = os.popen("%s -t" % sendmail, "w")
    try:
        p.write(message)
    finally:
        # Always close the pipe, even if writing failed, so that no
        # sendmail process is left waiting for input.
        sts = p.close()
    if sts is not None:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Sendmail exit status %s" % sts)
| |
− | | |
− | #---------- compare the failcnt-values
| |
− | | |
def cntcheck(data_read, current_data, veid, fields, count, count_change):
    """Append a report line if a failcnt value grew since the previous run.

    data_read     -- snapshot of the previous run (dict veid -> resource ->
                     field list), or a falsy value / the string '0' when no
                     previous snapshot is available.
    current_data  -- snapshot of the current run, same layout.
    veid          -- id of the container the current line belongs to.
    fields        -- field list of the current line; fields[0] is the
                     resource name.
    count         -- true when the script runs in failcnt mode; in any
                     other mode nothing is checked.
    count_change  -- report text collected so far.

    Returns ``count_change``, extended by one line if the failcnt (index 5
    of the field list) of this resource is numerically larger than in the
    previous snapshot.
    """
    # '0' is the marker passed when there is no previous snapshot; compare
    # by value, not by identity.
    if not count or not data_read or data_read == '0':
        return count_change

    resource = fields[0]
    try:
        previous = data_read[veid][resource][5]
    except (KeyError, IndexError):
        # Container or resource did not exist in the previous snapshot,
        # so there is nothing to compare against yet.
        return count_change
    current = current_data[veid][resource][5]

    # The values are read as strings; compare them as numbers, otherwise
    # '9' -> '10' would go unnoticed.
    if int(previous) < int(current):
        hostname = find_veid(veid)
        count_change = (str(count_change) + str(hostname) + ': ' +
                        str(resource) + ' failcnt has changed from ' +
                        str(previous) + ' to ' + str(current) + '\n')
    return count_change
| |
− | | |
− | #---------- compare the current value with barrier/limit value
| |
− | | |
def barriercheck(data_read, current_data, veid, fields, count, barrier_break,
                 threshold=0.9):
    """Append a warning if a resource is close to its barrier or limit.

    data_read      -- unused; kept so the signature matches cntcheck().
    current_data   -- snapshot of the current run (dict veid -> resource ->
                      field list).
    veid           -- id of the container the current line belongs to.
    fields         -- field list of the current line; fields[0] is the
                      resource name.
    count          -- the check only runs when this is false (barrier mode).
    barrier_break  -- warning text collected so far.
    threshold      -- fraction of the barrier/limit above which a warning is
                      raised (default 0.9, i.e. 90%).

    Returns ``barrier_break``, extended by ``'<hostname>: <resource> '`` if
    the current value (index 1) exceeds ``threshold`` times the reference
    value.  The reference is the limit (index 4) for oomguarpages and
    physpages and the barrier (index 3) for every other resource.
    """
    if count:
        # failcnt mode: barriers are not evaluated
        return barrier_break

    record = current_data[veid][fields[0]]
    resource = record[0]

    # for oomguarpages and physpages only the limit-value is relevant
    if resource in ('oomguarpages', 'physpages'):
        reference = record[4]
    else:
        reference = record[3]

    if int(record[1]) > int(reference) * threshold:
        hostname = find_veid(veid)
        barrier_break = (str(barrier_break) + str(hostname) + ': ' +
                         str(resource) + ' ')
    return barrier_break
| |
− | | |
− | | |
− | #------------ read user_beancounter and handle the result of the comparison subroutines
| |
− | | |
def compare_data(beancounter_data, data_read, count):
    """Parse the beancounter listing and evaluate every resource line.

    Each data line is stored in the module-level ``current_data`` dict
    (veid -> resource name -> field list) and handed to barriercheck() and
    cntcheck().

    beancounter_data -- iterable of lines in /proc/user_beancounters
                        format.
    data_read        -- snapshot of the previous run, passed through to the
                        check functions.
    count            -- true: failcnt mode, false: barrier mode.

    In barrier mode the function never returns: it prints the result and
    exits with status 2 if a barrier was broken, otherwise with status 0.
    In failcnt mode a mail is sent if any counter increased, and
    ``current_data`` is returned.
    """
    count_change = str()
    barrier_break = str()
    veid = None  # no container header line seen yet

    for line in beancounter_data:
        # skip the version line, the column header and dummy entries
        if 'Version' in line or 'uid' in line or 'dummy' in line:
            continue

        fields = line.split()
        if len(fields) == 7:
            # First line of a container: "<veid>: resource held ...".
            veid = int(fields[0][:-1])  # strip the trailing colon
            fields.pop(0)  # remove the first element
            current_data[veid] = dict()
        elif len(fields) != 6 or veid is None:
            # Blank or malformed line, or a resource line before any
            # container header: nothing sensible to record.
            continue
        current_data[veid][fields[0]] = fields

        # ------ check barrier/limit
        barrier_break = barriercheck(data_read, current_data, veid, fields,
                                     count, barrier_break)

        # ------ check failcnt
        count_change = cntcheck(data_read, current_data, veid, fields,
                                count, count_change)

    if not count:
        if barrier_break:
            print(barrier_break)
            sys.exit(2)
        print('All Beancounters OK')
        sys.exit(0)

    if count_change:
        send_mail(count_change)
    return current_data
| |
− | | |
− | | |
− | # ----- pickle data - read or write
| |
− | | |
def _write_snapshot(data, path):
    """Pickle ``data`` into ``path``, closing the file on every exit path."""
    # Pickle data is binary; text mode only works by accident on some
    # platforms and interpreter versions.
    picklefile = open(path, 'wb')
    try:
        pickle.dump(data, picklefile)
    finally:
        picklefile.close()


def pickle_data(current_data, action, count, picklefilepath):
    """Store or load the beancounter snapshot kept between two runs.

    current_data   -- snapshot to store (only used for action 'write').
    action         -- 'write' to store ``current_data``, 'read' to load the
                      snapshot of the previous run.
    count          -- mode flag, passed on to compare_data() when a fresh
                      snapshot has to be created.
    picklefilepath -- path of the snapshot file.

    'write' returns None; an empty snapshot is not written, a note is
    printed instead.  'read' returns the stored snapshot.  If the file is
    missing or unreadable, a fresh snapshot is created via compare_data(),
    written to the file, and None is returned.

    NOTE(review): pickle.load() can execute arbitrary code from the file it
    reads.  Anyone who can replace the snapshot file can therefore run code
    as the user running this script -- confirm that ``picklefilepath`` is
    not writable by other users.
    """
    if action == 'write':
        if not current_data:
            print('current_data is empty: ' + str(current_data))
            return None
        _write_snapshot(current_data, picklefilepath)
        return None

    if action == 'read':
        try:
            picklefile = open(picklefilepath, 'rb')
            try:
                data_read = pickle.load(picklefile)
            finally:
                picklefile.close()
        except (IOError, EOFError, pickle.UnpicklingError):
            # No usable snapshot yet (first run, or a truncated file):
            # build one from the current values so the next run has a
            # baseline to compare against.
            current_data = compare_data(beancounter_data, '0', count)
            _write_snapshot(current_data, picklefilepath)
            return None
        if not data_read:
            print('DATA_READ IS NONE:' + str(data_read))
        return data_read

    return None
| |
− | | |
− | # ------- print script usage
| |
− | | |
def usage(prog="check_beancounter.py"):
    """Print the command line help to stdout.

    prog -- program name shown in the help text.
    """
    # Single-argument print() behaves the same on Python 2 and 3.
    print("""
%(prog)s : Check if failcounters increase or resource-values break barriers or limits

%(prog)s [-tfh]

-h print this message

-t Check if failcnt-values have increased since the last run
-f Check if current value of a resource is higher than barrier/limit
""" % {'prog': prog})
| |
− | | |
− | | |
# ---- command line handling and main program flow

try:
    opts = getopt.getopt(sys.argv[1:], 'thf')
except getopt.GetoptError:
    # Unknown option: report it and show the help instead of a traceback.
    # Exit status 3 is "UNKNOWN" in the Nagios plugin convention.
    print(str(sys.exc_info()[1]))
    usage()
    sys.exit(3)

# Only the first option given on the command line decides the mode.
if not opts[0] or opts[0][0][0] == '-h':
    usage()
    sys.exit(0)
elif opts[0][0][0] == '-t':
    count = True   # failcnt mode: compare with the previous run
elif opts[0][0][0] == '-f':
    count = False  # barrier mode: compare current values with barrier/limit

# beancounter_data stays a module-level name on purpose: pickle_data()
# reads it when it has to create the first snapshot.
beancounter_data = open('/proc/user_beancounters', 'r')
try:
    data_read = pickle_data(current_data, 'read', count, picklefilepath)
    current_data = compare_data(beancounter_data, data_read, count)
    pickle_data(current_data, 'write', count, picklefilepath)
finally:
    # Also runs when compare_data() leaves through sys.exit().
    beancounter_data.close()
| |
− | </source>
| |
| | | |
| + | Here is an alternative script, also written in Python: |
| + | [http://www.kbrandt.com/2008/10/openvz-beancounters-nagios-script.html OpenVZ Nagios Bean Counters Script] |
| | | |
| | | |