Editing Monitoring /proc/user beancounters with nagios
Warning: You are not logged in. Your IP address will be publicly visible if you make any edits. If you log in or create an account, your edits will be attributed to your username, along with other benefits.
The edit can be undone.
Please check the comparison below to verify that this is what you want to do, and then save the changes below to finish undoing the edit.
Latest revision | Your text | ||
Line 1: | Line 1: | ||
− | + | ==== Monitoring /proc/user_beancounters with nagios ==== | |
− | |||
− | Works as nagios-plugin with option '- | + | To be added locally on the VZ HN to /etc/nagios/nrpe_local.conf<br> |
+ | Works as a nagios plugin with option '-f', or reports an increase of a failcnt value by mail when run (e.g. as a cronjob) with option '-t'. We use both modes so that a peak occurring between two nagios checks is still noticed: | ||
− | + | <source lang=python> | |
+ | #!/usr/bin/python | ||
+ | # Copyright (C) 2008 Christian Benke | ||
+ | # Distributed under the terms of the GNU General Public License v2 | ||
+ | # v0.1 2008-04-03 | ||
+ | # Christian Benke <benkokakao gmail com> | ||
+ | import string | ||
+ | import pickle | ||
+ | import sys | ||
+ | import getopt | ||
+ | import re | ||
+ | import smtplib | ||
+ | import socket | ||
# ---- module-level state shared by the functions below
veid = ''           # placeholder until a container id is parsed
current_data = {}   # container id -> {resource name -> list of column strings}
opts = None         # overwritten with the getopt result further down
# The counter table is opened once at start-up and handed to compare_data.
beancounter_data = open('/proc/user_beancounters', 'r')
# Snapshot of the previous run; used to detect failcnt increases.
picklefilepath = '/tmp/beancounters_pickledump'
− | == | + | #-------- find the hostname for each veid ---: |
+ | |||
def find_veid(veid, conf_dir='/etc/vz/conf/'):
    """Return the short hostname configured for container *veid*.

    The container configuration ``<conf_dir>/<veid>.conf`` is scanned for a
    ``HOSTNAME=`` assignment; surrounding quotes are dropped and only the
    first label of the name (the part before the first dot) is returned.

    veid     -- container id (int or str), used to build the file name
    conf_dir -- directory holding the ``<veid>.conf`` files

    Returns None when the file contains no HOSTNAME assignment.
    Raises IOError when the configuration file cannot be opened.
    """
    conf_path = conf_dir.rstrip('/') + '/' + str(veid) + '.conf'
    veid_conf = open(conf_path, 'r')
    # try/finally (not "with") so the script still runs on old Python 2
    # interpreters while the handle is closed on every path.
    try:
        for line in veid_conf:
            stripped = line.strip()
            # Ignore comments and anything that is not an assignment, so a
            # commented-out HOSTNAME or a similarly named key cannot match.
            if stripped.startswith('#') or '=' not in stripped:
                continue
            key, value = stripped.split('=', 1)
            if key.strip() != 'HOSTNAME':
                continue
            fqdn = value.strip().strip('"\'')
            return fqdn.split('.')[0]
    finally:
        veid_conf.close()
    return None
+ | |||
+ | # ---------- send mail in case of a counter-change | ||
def send_mail(count_change):
    """Mail a report of increased failcnt values via the local SMTP server.

    count_change -- preformatted text, one line per changed counter; it is
                    appended verbatim to the fixed introduction.

    The sender is ``root@<local host name>``; recipient and subject are the
    fixed values below. Errors from smtplib propagate to the caller.
    """
    # Sender domain is taken from the local host name.
    mailfrom = 'root@' + socket.gethostname()
    mailto = 'to@example.com'
    mailsubject = 'Beancounters changed in the last 5 minutes'
    mailbody = ('The Beancounter-failcnt value of the following veid(s) and '
                'resource(s) has \nincreased in the last 5 minutes:\n\n')
    # Headers, one empty line, then the body.
    message = ('From:' + mailfrom
               + '\nTo:' + mailto
               + '\nSubject:' + mailsubject
               + '\n\n' + mailbody + count_change)
    server = smtplib.SMTP('localhost')
    try:
        server.sendmail(mailfrom, [mailto], message)
    finally:
        # Close the connection even when delivery fails.
        server.quit()
+ | |||
+ | #------------read raw and compare data from user_beancounters | ||
+ | |||
def compare_data(beancounter_data, data_read, count):
    """Parse a beancounter table and either compare or check it.

    beancounter_data -- iterable of lines in /proc/user_beancounters layout
    data_read        -- snapshot of the previous run (a dict shaped like the
                        return value); any non-dict value such as None or
                        '0' means "no previous data"
    count            -- true:  compare every failcnt with the previous
                               snapshot, mail the increases via send_mail and
                               return the new snapshot
                        false: nagios check; print the resources whose held
                               value exceeds 90% of the barrier (limit for
                               oomguarpages/physpages) and exit with status
                               2, or print an OK line and exit with 0

    The snapshot maps container id (int) to a dict of resource name ->
    [name, held, maxheld, barrier, limit, failcnt], all kept as strings.
    """
    # The caller may pass a file object that was already iterated once;
    # rewind it so this pass sees the whole table. Plain lists have no seek.
    rewind = getattr(beancounter_data, 'seek', None)
    if rewind is not None:
        try:
            rewind(0)
        except (IOError, OSError, ValueError):
            pass

    previous = None
    if isinstance(data_read, dict):
        previous = data_read

    parsed = {}
    barrier_break = []
    count_change = []
    veid = None
    for line in beancounter_data:
        if 'Version' in line or 'uid' in line or 'dummy' in line:
            continue
        fields = line.split()
        if len(fields) == 7:
            # First row of a container: "<veid>:" precedes the resource.
            veid = int(fields[0][:-1])
            fields = fields[1:]
            parsed[veid] = {}
        elif len(fields) != 6 or veid is None:
            # Blank or malformed line, or data before any container header.
            continue
        resource = fields[0]
        parsed[veid][resource] = fields

        if count:
            old = None
            if previous:
                # Containers/resources that are new since the last run have
                # nothing to be compared against.
                old = previous.get(veid, {}).get(resource)
            # Compare numerically: as strings '9' < '10' would be False.
            if old is not None and int(old[5]) < int(fields[5]):
                if veid != 0:
                    hostname = find_veid(veid)
                else:
                    hostname = 'OpenVZ Hardware Node'
                count_change.append(
                    str(hostname) + ': ' + str(resource)
                    + ' failcnt has changed from ' + str(old[5])
                    + ' to ' + str(fields[5]) + '\n')
        else:
            # For oomguarpages and physpages only the limit is relevant.
            if resource in ('oomguarpages', 'physpages'):
                threshold = int(fields[4])
            else:
                threshold = int(fields[3])
            if int(fields[1]) > threshold * 0.9:
                barrier_break.append(str(veid) + ': ' + str(resource) + ' ')

    if not count:
        if barrier_break:
            print(''.join(barrier_break))
            sys.exit(2)
        print('All Beancounters OK')
        sys.exit(0)

    if count_change:
        send_mail(''.join(count_change))
    return parsed
+ | |||
+ | |||
+ | # ----- pickle data - read or write | ||
+ | |||
def pickle_data(current_data, action, count, picklefilepath):
    """Load or store the beancounter snapshot kept between runs.

    current_data   -- snapshot to store; only used when action is 'write'
    action         -- 'read' or 'write'; any other value does nothing
    count          -- not used here; accepted so existing calls keep working
    picklefilepath -- location of the snapshot file

    'read' returns the stored snapshot, or None when the file is missing,
    empty or not a valid pickle. 'write' returns None; an empty snapshot is
    reported on stdout and not written. IOError from writing propagates.

    NOTE: unpickling runs code embedded in the file, so picklefilepath must
    not be writable by untrusted users. The script's default path lives in
    /tmp, which is worth moving to a root-owned directory.
    """
    if action == 'read':
        try:
            picklefile = open(picklefilepath, 'rb')
        except IOError:
            # No snapshot yet (first run): nothing to compare against.
            return None
        # try/finally (not "with") so the script still runs on old Python 2
        # interpreters while the handle is closed on every path.
        try:
            try:
                data_read = pickle.load(picklefile)
            except (EOFError, ValueError, IndexError, pickle.UnpicklingError):
                # Truncated or corrupt snapshot: treat it as missing.
                return None
        finally:
            picklefile.close()
        if not data_read:
            print('DATA_READ IS NONE:' + str(data_read))
        return data_read

    if action == 'write':
        if not current_data:
            print('current_data is empty: ' + str(current_data))
            return None
        # Binary mode is required for pickles on Python 3 and harmless on 2.
        picklefile = open(picklefilepath, 'wb')
        try:
            pickle.dump(current_data, picklefile)
        finally:
            picklefile.close()
    return None
+ | |||
+ | # ------- print script usage | ||
+ | |||
def usage(prog="check_beancounters.py"):
    """Print the command line help to stdout.

    prog -- program name shown in the help text.
    """
    print("""
%(prog)s : Check if resource-values break barriers or limits and failcounters increase

%(prog)s [-tfh]

-h print this message

-t Check if failcnt-values have increased since the last run
-f Check if current value of a resource is higher than barrier/limit
""" % {'prog': prog})
+ | |||
+ | |||
# ------- command line handling and main flow

# getopt returns (options, remaining_args). An unknown switch raises
# GetoptError; show the help and exit with 3, the nagios UNKNOWN status,
# instead of dying with a traceback.
try:
    opts = getopt.getopt(sys.argv[1:], 'thf')
except getopt.GetoptError:
    usage()
    sys.exit(3)

# Only the first option selects the mode.
selected = opts[0]
if not selected or selected[0][0] == '-h':
    usage()
    sys.exit(0)
# '-t' compares failcnt values with the last run, '-f' checks current levels.
count = selected[0][0] == '-t'

data_read = pickle_data(current_data, 'read', count, picklefilepath)
try:
    current_data = compare_data(beancounter_data, data_read, count)
finally:
    # compare_data may leave through sys.exit; close the handle either way.
    beancounter_data.close()
pickle_data(current_data, 'write', count, picklefilepath)
+ | </source> | ||
− | |||
− | |||