From dd1ede06890103ffb95cd5684a8acd93b92b7d18 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Wed, 9 Apr 2014 16:56:27 +0530 Subject: [PATCH 01/35] Added separate class for generating checkresults for Nagios Passive checks --- ganglia-nagios-bridge.py | 91 ++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 35 deletions(-) mode change 100755 => 100644 ganglia-nagios-bridge.py diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py old mode 100755 new mode 100644 index 3e99d41..ad64999 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -41,19 +41,36 @@ def getByteStream(self): def read(self, buf_size): return self.socket.recv(buf_size) -# interprets metric values to generate Nagios passive notifications -class PassiveGenerator: - def __init__(self, force_dmax, tmax_grace): - self.force_dmax = force_dmax - self.tmax_grace = tmax_grace - - # Nagios is quite fussy about the filename, it must be + +class GenerateNagiosCheckResult: + + def __init__(self): + # Nagios is quite fussy about the filename, it must be # a 7 character name starting with 'c' - tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly self.fh = tmp_file[0] self.cmd_file = tmp_file[1] os.write(self.fh, "### Active Check Result File ###\n") os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") + self.return_codes = { 0 : 'OK', 1 : 'WARNING', 2 : 'CRITICAL', 3 : 'UNKNOWN' } + + # Writes to the checkresult file + def create(self, host, service_name, last_seen, service_state, metric_value, metric_units): + os.write(self.fh, "\n### Nagios Service Check Result ###\n") + os.write(self.fh, "# Time: " + time.asctime() + "\n") + os.write(self.fh, "host_name=" + host + "\n") + os.write(self.fh, "service_description=" + service_name + "\n") + os.write(self.fh, "check_type=0\n") + os.write(self.fh, "check_options=0\n") + 
os.write(self.fh, "scheduled_check=1\n") + os.write(self.fh, "reschedule_check=1\n") + os.write(self.fh, "latency=0.1\n") + os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "early_timeout=0\n") + os.write(self.fh, "exited_ok=1\n") + os.write(self.fh, "return_code=" + str(service_state) + "\n") + os.write(self.fh, "output=" + service_name + " " + self.return_codes[service_state] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") def done(self): os.close(self.fh) @@ -61,6 +78,15 @@ def done(self): ok_fh = file(ok_filename, 'a') ok_fh.close() + + +# interprets metric values to generate Nagios passive notifications +class PassiveGenerator: + def __init__(self, force_dmax, tmax_grace): + self.force_dmax = force_dmax + self.tmax_grace = tmax_grace + + def process(self, metric_def, service_name, host, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen): effective_dmax = metric_dmax if(self.force_dmax > 0): @@ -72,41 +98,30 @@ def process(self, metric_def, service_name, host, metric_name, metric_value, met service_state = 3 elif isinstance(metric_value, str): service_state = 0 - elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: + elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: service_state = 2 elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: - service_state = 1 + service_state = 1 elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']: service_state = 2 elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']: service_state = 1 else: service_state = 0 - #cmd = "[" + str(int(time.time())) + "] PROCESS_SERVICE_CHECK_RESULT;" + host + ";" + service_name + ";" + str(service_state) + ";Value = " + str(metric_value) - #os.write(self.fh, cmd + "\n") - os.write(self.fh, "\n### Nagios Service Check Result ###\n") - 
os.write(self.fh, "# Time: " + time.asctime() + "\n") - os.write(self.fh, "host_name=" + host + "\n") - os.write(self.fh, "service_description=" + service_name + "\n") - os.write(self.fh, "check_type=0\n") - os.write(self.fh, "check_options=0\n") - os.write(self.fh, "scheduled_check=1\n") - os.write(self.fh, "reschedule_check=1\n") - os.write(self.fh, "latency=0.1\n") - os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "early_timeout=0\n") - os.write(self.fh, "exited_ok=1\n") - os.write(self.fh, "return_code=" + str(service_state) + "\n") - os.write(self.fh, "output=" + service_name + " " + str(metric_value) + "\\n\n") - #os.write(self.fh, "\n") + return service_state + + + def done(self): + self.gn.done() + # SAX event handler for parsing the Ganglia XML stream class GangliaHandler(xml.sax.ContentHandler): - def __init__(self, clusters_c, value_handler): + def __init__(self, clusters_c, value_handler, checkresult_file_handler): self.clusters_c = clusters_c self.value_handler = value_handler + self.checkresult_file_handler = checkresult_file_handler self.clusters_cache = {} self.hosts_cache = {} self.metrics_cache = {} @@ -188,6 +203,7 @@ def handle_metric(self, metric_name, service_name, attrs): metric_tmax = int(attrs['TMAX']) metric_dmax = int(attrs['DMAX']) metric_type = attrs['TYPE'] + metric_units = attrs['UNITS'] # they metric_value has a dynamic type: if metric_type == 'string': metric_value = metric_value_raw @@ -196,9 +212,13 @@ def handle_metric(self, metric_name, service_name, attrs): else: metric_value = int(metric_value_raw) last_seen = self.cluster_localtime - metric_tn - # call the handler to process the value: - self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) - + #setting service state as 0 by default + service_state=0 + # call the handler to process the value and 
return service state after comparing metric value and threshold: + service_state = self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) + # write Passive checks to checkresult file + self.checkresult_file_handler.create(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) + # main program code if __name__ == '__main__': try: @@ -232,12 +252,13 @@ def handle_metric(self, metric_name, service_name, attrs): # set up the SAX parser parser = xml.sax.make_parser() pg = PassiveGenerator(force_dmax, tmax_grace) - parser.setContentHandler(GangliaHandler(clusters_c, pg)) + gn = GenerateNagiosCheckResult() + parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) # run the main program loop parser.parse(SocketInputSource(sock)) - + # write out for Nagios - pg.done() + gn.done() # all done sock.close() From 53a00c19e2d685cc8782925d6788eef6b9d16050 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Wed, 9 Apr 2014 17:05:19 +0530 Subject: [PATCH 02/35] Fixed minor tab issues --- ganglia-nagios-bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index ad64999..949beed 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -101,7 +101,7 @@ def process(self, metric_def, service_name, host, metric_name, metric_value, met elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: service_state = 2 elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: - service_state = 1 + service_state = 1 elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']: service_state = 2 elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']: @@ -215,7 +215,7 @@ def handle_metric(self, metric_name, service_name, attrs): #setting service state as 0 by default service_state=0 # call the handler to process the value 
and return service state after comparing metric value and threshold: - service_state = self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) + service_state = self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) # write Passive checks to checkresult file self.checkresult_file_handler.create(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) From 637c0fd07ac506b9a242d37eb98349aacdb08752 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Wed, 9 Apr 2014 17:14:07 +0530 Subject: [PATCH 03/35] fixed tabs --- ganglia-nagios-bridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index 949beed..664644d 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -101,7 +101,7 @@ def process(self, metric_def, service_name, host, metric_name, metric_value, met elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: service_state = 2 elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: - service_state = 1 + service_state = 1 elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']: service_state = 2 elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']: From 8a2013cee1dc38c76dc518933f3c762d2190c03b Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Wed, 9 Apr 2014 17:32:47 +0530 Subject: [PATCH 04/35] fixed tabs --- ganglia-nagios-bridge.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index 664644d..de5eff5 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -47,7 +47,7 @@ class GenerateNagiosCheckResult: def __init__(self): # Nagios is quite fussy about the filename, it must be # a 7 character name starting 
with 'c' - tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly self.fh = tmp_file[0] self.cmd_file = tmp_file[1] os.write(self.fh, "### Active Check Result File ###\n") @@ -98,7 +98,7 @@ def process(self, metric_def, service_name, host, metric_name, metric_value, met service_state = 3 elif isinstance(metric_value, str): service_state = 0 - elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: + elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: service_state = 2 elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: service_state = 1 @@ -255,8 +255,7 @@ def handle_metric(self, metric_name, service_name, attrs): gn = GenerateNagiosCheckResult() parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) # run the main program loop - parser.parse(SocketInputSource(sock)) - + parser.parse(SocketInputSource(sock)) # write out for Nagios gn.done() From b8f5e6c62aac76aaeaafcb4291d58e12efde0713 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Wed, 9 Apr 2014 17:43:44 +0530 Subject: [PATCH 05/35] fixed tabs --- ganglia-nagios-bridge.py~ | 268 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 ganglia-nagios-bridge.py~ diff --git a/ganglia-nagios-bridge.py~ b/ganglia-nagios-bridge.py~ new file mode 100644 index 0000000..949beed --- /dev/null +++ b/ganglia-nagios-bridge.py~ @@ -0,0 +1,268 @@ +#!/usr/bin/python +# +# ganglia-nagios-bridge - transfer Ganglia XML to Nagios checkresults file +# +# Project page: http://danielpocock.com/ganglia-nagios-bridge +# +# Copyright (C) 2010 Daniel Pocock http://danielpocock.com +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software 
Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################ + +import argparse +import os +import re +import socket +import tempfile +import time +import xml.sax + +# wrapper class so that the SAX parser can process data from a network +# socket +class SocketInputSource: + def __init__(self, socket): + self.socket = socket + + def getByteStream(self): + return self + + def read(self, buf_size): + return self.socket.recv(buf_size) + + +class GenerateNagiosCheckResult: + + def __init__(self): + # Nagios is quite fussy about the filename, it must be + # a 7 character name starting with 'c' + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly + self.fh = tmp_file[0] + self.cmd_file = tmp_file[1] + os.write(self.fh, "### Active Check Result File ###\n") + os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") + self.return_codes = { 0 : 'OK', 1 : 'WARNING', 2 : 'CRITICAL', 3 : 'UNKNOWN' } + + # Writes to the checkresult file + def create(self, host, service_name, last_seen, service_state, metric_value, metric_units): + os.write(self.fh, "\n### Nagios Service Check Result ###\n") + os.write(self.fh, "# Time: " + time.asctime() + "\n") + os.write(self.fh, "host_name=" + host + "\n") + os.write(self.fh, "service_description=" + service_name + "\n") + os.write(self.fh, "check_type=0\n") + os.write(self.fh, "check_options=0\n") + os.write(self.fh, "scheduled_check=1\n") + os.write(self.fh, "reschedule_check=1\n") + os.write(self.fh, 
"latency=0.1\n") + os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "early_timeout=0\n") + os.write(self.fh, "exited_ok=1\n") + os.write(self.fh, "return_code=" + str(service_state) + "\n") + os.write(self.fh, "output=" + service_name + " " + self.return_codes[service_state] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + + def done(self): + os.close(self.fh) + ok_filename = self.cmd_file + ".ok" + ok_fh = file(ok_filename, 'a') + ok_fh.close() + + + +# interprets metric values to generate Nagios passive notifications +class PassiveGenerator: + def __init__(self, force_dmax, tmax_grace): + self.force_dmax = force_dmax + self.tmax_grace = tmax_grace + + + def process(self, metric_def, service_name, host, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen): + effective_dmax = metric_dmax + if(self.force_dmax > 0): + effective_dmax = force_dmax + effective_tmax = metric_tmax + self.tmax_grace + if effective_dmax > 0 and metric_tn > effective_dmax: + service_state = 3 + elif metric_tn > effective_tmax: + service_state = 3 + elif isinstance(metric_value, str): + service_state = 0 + elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: + service_state = 2 + elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: + service_state = 1 + elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']: + service_state = 2 + elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']: + service_state = 1 + else: + service_state = 0 + return service_state + + + def done(self): + self.gn.done() + + + +# SAX event handler for parsing the Ganglia XML stream +class GangliaHandler(xml.sax.ContentHandler): + def __init__(self, clusters_c, value_handler, checkresult_file_handler): + self.clusters_c = clusters_c + self.value_handler = value_handler + 
self.checkresult_file_handler = checkresult_file_handler + self.clusters_cache = {} + self.hosts_cache = {} + self.metrics_cache = {} + + def startElement(self, name, attrs): + + # METRIC is the most common element, it is handled first, + # followed by HOST and CLUSTER + + # handle common elements that we ignore + if name == "EXTRA_ELEMENT": + return + if name == "EXTRA_DATA": + return + + # handle a METRIC element in the XML + if name == "METRIC" and self.metrics is not None: + metric_name = attrs['NAME'] + cache_key = (self.cluster_idx, self.host_idx, metric_name) + if cache_key in self.metrics_cache: + metric_info = self.metrics_cache[cache_key] + self.metric_idx = metric_info[0] + service_name = metric_info[1] + self.metric = self.clusters_c[self.cluster_idx][1][self.host_idx][1][self.metric_idx][1] + self.handle_metric(metric_name, service_name, attrs) + return + for idx, metric_def in enumerate(self.metrics): + match_result = metric_def[0].match(metric_name) + if match_result: + service_name_tmpl = metric_def[1]['service_name'] + if len(match_result.groups()) > 0: + service_name = match_result.expand(service_name_tmpl) + else: + service_name = service_name_tmpl + self.metrics_cache[cache_key] = (idx, service_name) + self.metric = metric_def[1] + self.handle_metric(metric_name, service_name, attrs) + return + + # handle a HOST element in the XML + if name == "HOST" and self.hosts is not None: + self.metrics = None + self.host_name = attrs['NAME'] + self.host_reported = long(attrs['REPORTED']) + if strip_domains: + self.host_name = self.host_name.partition('.')[0] + cache_key = (self.cluster_idx, self.host_name) + if cache_key in self.hosts_cache: + self.host_idx = self.hosts_cache[cache_key] + self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1] + return + for idx, host_def in enumerate(self.hosts): + if host_def[0].match(self.host_name): + self.hosts_cache[cache_key] = idx + self.host_idx = idx + self.metrics = host_def[1] + return + + # 
handle a CLUSTER element in the XML + if name == "CLUSTER": + self.hosts = None + self.cluster_name = attrs['NAME'] + self.cluster_localtime = long(attrs['LOCALTIME']) + if self.cluster_name in self.clusters_cache: + self.cluster_idx = self.clusters_cache[self.cluster_name] + self.hosts = self.clusters_c[self.cluster_idx][1] + return + for idx, cluster_def in enumerate(self.clusters_c): + if cluster_def[0].match(self.cluster_name): + self.clusters_cache[self.cluster_name] = idx + self.cluster_idx = idx + self.hosts = cluster_def[1] + return + + def handle_metric(self, metric_name, service_name, attrs): + # extract the metric attributes + metric_value_raw = attrs['VAL'] + metric_tn = int(attrs['TN']) + metric_tmax = int(attrs['TMAX']) + metric_dmax = int(attrs['DMAX']) + metric_type = attrs['TYPE'] + metric_units = attrs['UNITS'] + # they metric_value has a dynamic type: + if metric_type == 'string': + metric_value = metric_value_raw + elif metric_type == 'double' or metric_type == 'float': + metric_value = float(metric_value_raw) + else: + metric_value = int(metric_value_raw) + last_seen = self.cluster_localtime - metric_tn + #setting service state as 0 by default + service_state=0 + # call the handler to process the value and return service state after comparing metric value and threshold: + service_state = self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) + # write Passive checks to checkresult file + self.checkresult_file_handler.create(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) + +# main program code +if __name__ == '__main__': + try: + # parse command line + parser = argparse.ArgumentParser(description='read Ganglia XML and generate Nagios check results file') + parser.add_argument('config_file', nargs='?', + help='configuration file', default='/etc/ganglia/nagios-bridge.conf') + args = parser.parse_args() + + # read the 
configuration file, setting some defaults first + force_dmax = 0 + tmax_grace = 60 + execfile(args.config_file) + + # compile the regular expressions + clusters_c = [] + for cluster_def in clusters: + cluster_c = re.compile(cluster_def[0]) + hosts = [] + for host_def in cluster_def[1]: + host_c = re.compile(host_def[0]) + metrics = [] + for metric_def in host_def[1]: + metric_c = re.compile(metric_def[0]) + metrics.append((metric_c, metric_def[1])) + hosts.append((host_c, metrics)) + clusters_c.append((cluster_c, hosts)) + + # connect to the gmetad or gmond + sock = socket.create_connection((gmetad_host, gmetad_port)) + # set up the SAX parser + parser = xml.sax.make_parser() + pg = PassiveGenerator(force_dmax, tmax_grace) + gn = GenerateNagiosCheckResult() + parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) + # run the main program loop + parser.parse(SocketInputSource(sock)) + + # write out for Nagios + gn.done() + + # all done + sock.close() + except socket.error as e: + logging.warn('Failed to connect to gmetad: %s', e.strerror) + + From cfb2dfb4b85232ec9303d782212a3cd94e7dab84 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Tue, 29 Apr 2014 19:44:13 +0530 Subject: [PATCH 06/35] Added a module for Nagios checkresult class --- NagiosCheckResult.py | 52 +++++++++++++++++++++++++++++++ ganglia-nagios-bridge.py | 67 ++++++++-------------------------------- 2 files changed, 65 insertions(+), 54 deletions(-) create mode 100644 NagiosCheckResult.py diff --git a/NagiosCheckResult.py b/NagiosCheckResult.py new file mode 100644 index 0000000..f2c27cd --- /dev/null +++ b/NagiosCheckResult.py @@ -0,0 +1,52 @@ +#!/usr/bin/python +# +#Class that creates Nagios checkresult file and writes Passive checks to it +########################################################################### + +import os +import tempfile +import time + + +class GenerateNagiosCheckResult: + + def __init__(self): + self.return_codes = { 0 : 'OK', 1 : 'WARNING', 2 : 'CRITICAL', 3 : 'UNKNOWN' } + 
+ + #Creates a checkresult file + def Create(self,nagios_result_dir): + # Nagios is quite fussy about the filename, it must be + # a 7 character name starting with 'c' + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly + self.fh = tmp_file[0] + self.cmd_file = tmp_file[1] + os.write(self.fh, "### Active Check Result File ###\n") + os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") + + + # Writes to the checkresult file + def Build(self, host, service_name, last_seen, service_state, metric_value, metric_units): + os.write(self.fh, "\n### Nagios Service Check Result ###\n") + os.write(self.fh, "# Time: " + time.asctime() + "\n") + os.write(self.fh, "host_name=" + host + "\n") + os.write(self.fh, "service_description=" + service_name + "\n") + os.write(self.fh, "check_type=0\n") + os.write(self.fh, "check_options=0\n") + os.write(self.fh, "scheduled_check=1\n") + os.write(self.fh, "reschedule_check=1\n") + os.write(self.fh, "latency=0.1\n") + os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "early_timeout=0\n") + os.write(self.fh, "exited_ok=1\n") + os.write(self.fh, "return_code=" + str(service_state) + "\n") + os.write(self.fh, "output=" + service_name + " " + self.return_codes[service_state] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + + + def Submit(self): + os.close(self.fh) + ok_filename = self.cmd_file + ".ok" + ok_fh = file(ok_filename, 'a') + ok_fh.close() + diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index de5eff5..30e5d42 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -22,12 +22,10 @@ ############################################################################ import argparse -import os import re import socket -import tempfile -import time import xml.sax +import NagiosCheckResult # wrapper class so that 
the SAX parser can process data from a network # socket @@ -42,44 +40,6 @@ def read(self, buf_size): return self.socket.recv(buf_size) -class GenerateNagiosCheckResult: - - def __init__(self): - # Nagios is quite fussy about the filename, it must be - # a 7 character name starting with 'c' - tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly - self.fh = tmp_file[0] - self.cmd_file = tmp_file[1] - os.write(self.fh, "### Active Check Result File ###\n") - os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") - self.return_codes = { 0 : 'OK', 1 : 'WARNING', 2 : 'CRITICAL', 3 : 'UNKNOWN' } - - # Writes to the checkresult file - def create(self, host, service_name, last_seen, service_state, metric_value, metric_units): - os.write(self.fh, "\n### Nagios Service Check Result ###\n") - os.write(self.fh, "# Time: " + time.asctime() + "\n") - os.write(self.fh, "host_name=" + host + "\n") - os.write(self.fh, "service_description=" + service_name + "\n") - os.write(self.fh, "check_type=0\n") - os.write(self.fh, "check_options=0\n") - os.write(self.fh, "scheduled_check=1\n") - os.write(self.fh, "reschedule_check=1\n") - os.write(self.fh, "latency=0.1\n") - os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "early_timeout=0\n") - os.write(self.fh, "exited_ok=1\n") - os.write(self.fh, "return_code=" + str(service_state) + "\n") - os.write(self.fh, "output=" + service_name + " " + self.return_codes[service_state] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") - - def done(self): - os.close(self.fh) - ok_filename = self.cmd_file + ".ok" - ok_fh = file(ok_filename, 'a') - ok_fh.close() - - - # interprets metric values to generate Nagios passive notifications class PassiveGenerator: def __init__(self, force_dmax, tmax_grace): @@ -98,10 +58,10 @@ def process(self, metric_def, 
service_name, host, metric_name, metric_value, met service_state = 3 elif isinstance(metric_value, str): service_state = 0 - elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: + elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: service_state = 2 elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: - service_state = 1 + service_state = 1 elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']: service_state = 2 elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']: @@ -109,12 +69,7 @@ def process(self, metric_def, service_name, host, metric_name, metric_value, met else: service_state = 0 return service_state - - - def done(self): - self.gn.done() - - + # SAX event handler for parsing the Ganglia XML stream class GangliaHandler(xml.sax.ContentHandler): @@ -215,9 +170,9 @@ def handle_metric(self, metric_name, service_name, attrs): #setting service state as 0 by default service_state=0 # call the handler to process the value and return service state after comparing metric value and threshold: - service_state = self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) + service_state = self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) # write Passive checks to checkresult file - self.checkresult_file_handler.create(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) + self.checkresult_file_handler.Build(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) # main program code if __name__ == '__main__': @@ -252,12 +207,16 @@ def handle_metric(self, metric_name, service_name, attrs): # set up the SAX parser parser = xml.sax.make_parser() pg = PassiveGenerator(force_dmax, tmax_grace) - gn = 
GenerateNagiosCheckResult() + #Instantiate GenerateNagiosCheckResult class + gn = NagiosCheckResult.GenerateNagiosCheckResult() + #Create CheckResultFile + gn.Create(nagios_result_dir) parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) # run the main program loop - parser.parse(SocketInputSource(sock)) + parser.parse(SocketInputSource(sock)) + # write out for Nagios - gn.done() + gn.Submit() # all done sock.close() From 629b50bfa07001c2c1fc65966e06b60701c408cf Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Tue, 29 Apr 2014 19:48:51 +0530 Subject: [PATCH 07/35] Removed temp file --- ganglia-nagios-bridge.py~ | 268 -------------------------------------- 1 file changed, 268 deletions(-) delete mode 100644 ganglia-nagios-bridge.py~ diff --git a/ganglia-nagios-bridge.py~ b/ganglia-nagios-bridge.py~ deleted file mode 100644 index 949beed..0000000 --- a/ganglia-nagios-bridge.py~ +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/python -# -# ganglia-nagios-bridge - transfer Ganglia XML to Nagios checkresults file -# -# Project page: http://danielpocock.com/ganglia-nagios-bridge -# -# Copyright (C) 2010 Daniel Pocock http://danielpocock.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-# -############################################################################ - -import argparse -import os -import re -import socket -import tempfile -import time -import xml.sax - -# wrapper class so that the SAX parser can process data from a network -# socket -class SocketInputSource: - def __init__(self, socket): - self.socket = socket - - def getByteStream(self): - return self - - def read(self, buf_size): - return self.socket.recv(buf_size) - - -class GenerateNagiosCheckResult: - - def __init__(self): - # Nagios is quite fussy about the filename, it must be - # a 7 character name starting with 'c' - tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly - self.fh = tmp_file[0] - self.cmd_file = tmp_file[1] - os.write(self.fh, "### Active Check Result File ###\n") - os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") - self.return_codes = { 0 : 'OK', 1 : 'WARNING', 2 : 'CRITICAL', 3 : 'UNKNOWN' } - - # Writes to the checkresult file - def create(self, host, service_name, last_seen, service_state, metric_value, metric_units): - os.write(self.fh, "\n### Nagios Service Check Result ###\n") - os.write(self.fh, "# Time: " + time.asctime() + "\n") - os.write(self.fh, "host_name=" + host + "\n") - os.write(self.fh, "service_description=" + service_name + "\n") - os.write(self.fh, "check_type=0\n") - os.write(self.fh, "check_options=0\n") - os.write(self.fh, "scheduled_check=1\n") - os.write(self.fh, "reschedule_check=1\n") - os.write(self.fh, "latency=0.1\n") - os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "early_timeout=0\n") - os.write(self.fh, "exited_ok=1\n") - os.write(self.fh, "return_code=" + str(service_state) + "\n") - os.write(self.fh, "output=" + service_name + " " + self.return_codes[service_state] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") - - 
def done(self): - os.close(self.fh) - ok_filename = self.cmd_file + ".ok" - ok_fh = file(ok_filename, 'a') - ok_fh.close() - - - -# interprets metric values to generate Nagios passive notifications -class PassiveGenerator: - def __init__(self, force_dmax, tmax_grace): - self.force_dmax = force_dmax - self.tmax_grace = tmax_grace - - - def process(self, metric_def, service_name, host, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen): - effective_dmax = metric_dmax - if(self.force_dmax > 0): - effective_dmax = force_dmax - effective_tmax = metric_tmax + self.tmax_grace - if effective_dmax > 0 and metric_tn > effective_dmax: - service_state = 3 - elif metric_tn > effective_tmax: - service_state = 3 - elif isinstance(metric_value, str): - service_state = 0 - elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: - service_state = 2 - elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: - service_state = 1 - elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']: - service_state = 2 - elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']: - service_state = 1 - else: - service_state = 0 - return service_state - - - def done(self): - self.gn.done() - - - -# SAX event handler for parsing the Ganglia XML stream -class GangliaHandler(xml.sax.ContentHandler): - def __init__(self, clusters_c, value_handler, checkresult_file_handler): - self.clusters_c = clusters_c - self.value_handler = value_handler - self.checkresult_file_handler = checkresult_file_handler - self.clusters_cache = {} - self.hosts_cache = {} - self.metrics_cache = {} - - def startElement(self, name, attrs): - - # METRIC is the most common element, it is handled first, - # followed by HOST and CLUSTER - - # handle common elements that we ignore - if name == "EXTRA_ELEMENT": - return - if name == "EXTRA_DATA": - return - - # handle a METRIC element in the XML - if name == "METRIC" and self.metrics 
is not None: - metric_name = attrs['NAME'] - cache_key = (self.cluster_idx, self.host_idx, metric_name) - if cache_key in self.metrics_cache: - metric_info = self.metrics_cache[cache_key] - self.metric_idx = metric_info[0] - service_name = metric_info[1] - self.metric = self.clusters_c[self.cluster_idx][1][self.host_idx][1][self.metric_idx][1] - self.handle_metric(metric_name, service_name, attrs) - return - for idx, metric_def in enumerate(self.metrics): - match_result = metric_def[0].match(metric_name) - if match_result: - service_name_tmpl = metric_def[1]['service_name'] - if len(match_result.groups()) > 0: - service_name = match_result.expand(service_name_tmpl) - else: - service_name = service_name_tmpl - self.metrics_cache[cache_key] = (idx, service_name) - self.metric = metric_def[1] - self.handle_metric(metric_name, service_name, attrs) - return - - # handle a HOST element in the XML - if name == "HOST" and self.hosts is not None: - self.metrics = None - self.host_name = attrs['NAME'] - self.host_reported = long(attrs['REPORTED']) - if strip_domains: - self.host_name = self.host_name.partition('.')[0] - cache_key = (self.cluster_idx, self.host_name) - if cache_key in self.hosts_cache: - self.host_idx = self.hosts_cache[cache_key] - self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1] - return - for idx, host_def in enumerate(self.hosts): - if host_def[0].match(self.host_name): - self.hosts_cache[cache_key] = idx - self.host_idx = idx - self.metrics = host_def[1] - return - - # handle a CLUSTER element in the XML - if name == "CLUSTER": - self.hosts = None - self.cluster_name = attrs['NAME'] - self.cluster_localtime = long(attrs['LOCALTIME']) - if self.cluster_name in self.clusters_cache: - self.cluster_idx = self.clusters_cache[self.cluster_name] - self.hosts = self.clusters_c[self.cluster_idx][1] - return - for idx, cluster_def in enumerate(self.clusters_c): - if cluster_def[0].match(self.cluster_name): - 
self.clusters_cache[self.cluster_name] = idx - self.cluster_idx = idx - self.hosts = cluster_def[1] - return - - def handle_metric(self, metric_name, service_name, attrs): - # extract the metric attributes - metric_value_raw = attrs['VAL'] - metric_tn = int(attrs['TN']) - metric_tmax = int(attrs['TMAX']) - metric_dmax = int(attrs['DMAX']) - metric_type = attrs['TYPE'] - metric_units = attrs['UNITS'] - # they metric_value has a dynamic type: - if metric_type == 'string': - metric_value = metric_value_raw - elif metric_type == 'double' or metric_type == 'float': - metric_value = float(metric_value_raw) - else: - metric_value = int(metric_value_raw) - last_seen = self.cluster_localtime - metric_tn - #setting service state as 0 by default - service_state=0 - # call the handler to process the value and return service state after comparing metric value and threshold: - service_state = self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) - # write Passive checks to checkresult file - self.checkresult_file_handler.create(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) - -# main program code -if __name__ == '__main__': - try: - # parse command line - parser = argparse.ArgumentParser(description='read Ganglia XML and generate Nagios check results file') - parser.add_argument('config_file', nargs='?', - help='configuration file', default='/etc/ganglia/nagios-bridge.conf') - args = parser.parse_args() - - # read the configuration file, setting some defaults first - force_dmax = 0 - tmax_grace = 60 - execfile(args.config_file) - - # compile the regular expressions - clusters_c = [] - for cluster_def in clusters: - cluster_c = re.compile(cluster_def[0]) - hosts = [] - for host_def in cluster_def[1]: - host_c = re.compile(host_def[0]) - metrics = [] - for metric_def in host_def[1]: - metric_c = re.compile(metric_def[0]) - metrics.append((metric_c, 
metric_def[1])) - hosts.append((host_c, metrics)) - clusters_c.append((cluster_c, hosts)) - - # connect to the gmetad or gmond - sock = socket.create_connection((gmetad_host, gmetad_port)) - # set up the SAX parser - parser = xml.sax.make_parser() - pg = PassiveGenerator(force_dmax, tmax_grace) - gn = GenerateNagiosCheckResult() - parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) - # run the main program loop - parser.parse(SocketInputSource(sock)) - - # write out for Nagios - gn.done() - - # all done - sock.close() - except socket.error as e: - logging.warn('Failed to connect to gmetad: %s', e.strerror) - - From 0d2380148bd36bd6d7b878ada297ae297f01822c Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Tue, 29 Apr 2014 21:56:10 +0530 Subject: [PATCH 08/35] included GPL license --- NagiosCheckResult.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/NagiosCheckResult.py b/NagiosCheckResult.py index f2c27cd..d317705 100644 --- a/NagiosCheckResult.py +++ b/NagiosCheckResult.py @@ -1,6 +1,21 @@ #!/usr/bin/python # -#Class that creates Nagios checkresult file and writes Passive checks to it +# NagiosCheckResult- Class that creates Nagios checkresult file and +# writes Passive checks to it +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# ########################################################################### import os From fa1e390847dac50cefd444003b333fbb54d5132e Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Wed, 30 Apr 2014 02:07:30 +0530 Subject: [PATCH 09/35] removed NagiosCheckResult.py --- NagiosCheckResult.py | 67 -------------------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 NagiosCheckResult.py diff --git a/NagiosCheckResult.py b/NagiosCheckResult.py deleted file mode 100644 index d317705..0000000 --- a/NagiosCheckResult.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/python -# -# NagiosCheckResult- Class that creates Nagios checkresult file and -# writes Passive checks to it -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-# -########################################################################### - -import os -import tempfile -import time - - -class GenerateNagiosCheckResult: - - def __init__(self): - self.return_codes = { 0 : 'OK', 1 : 'WARNING', 2 : 'CRITICAL', 3 : 'UNKNOWN' } - - - #Creates a checkresult file - def Create(self,nagios_result_dir): - # Nagios is quite fussy about the filename, it must be - # a 7 character name starting with 'c' - tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly - self.fh = tmp_file[0] - self.cmd_file = tmp_file[1] - os.write(self.fh, "### Active Check Result File ###\n") - os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") - - - # Writes to the checkresult file - def Build(self, host, service_name, last_seen, service_state, metric_value, metric_units): - os.write(self.fh, "\n### Nagios Service Check Result ###\n") - os.write(self.fh, "# Time: " + time.asctime() + "\n") - os.write(self.fh, "host_name=" + host + "\n") - os.write(self.fh, "service_description=" + service_name + "\n") - os.write(self.fh, "check_type=0\n") - os.write(self.fh, "check_options=0\n") - os.write(self.fh, "scheduled_check=1\n") - os.write(self.fh, "reschedule_check=1\n") - os.write(self.fh, "latency=0.1\n") - os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "early_timeout=0\n") - os.write(self.fh, "exited_ok=1\n") - os.write(self.fh, "return_code=" + str(service_state) + "\n") - os.write(self.fh, "output=" + service_name + " " + self.return_codes[service_state] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") - - - def Submit(self): - os.close(self.fh) - ok_filename = self.cmd_file + ".ok" - ok_fh = file(ok_filename, 'a') - ok_fh.close() - From d39271f7f3b4093fede07055ea74822d058ce9f0 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Wed, 30 Apr 2014 02:08:40 +0530 Subject: 
[PATCH 10/35] Added nagios_checkresult.py --- ganglia-nagios-bridge.py | 12 ++++---- nagios_checkresult.py | 64 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) create mode 100644 nagios_checkresult.py diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index 30e5d42..f517ae3 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -25,7 +25,7 @@ import re import socket import xml.sax -import NagiosCheckResult +import nagios_checkresult # wrapper class so that the SAX parser can process data from a network # socket @@ -46,7 +46,6 @@ def __init__(self, force_dmax, tmax_grace): self.force_dmax = force_dmax self.tmax_grace = tmax_grace - def process(self, metric_def, service_name, host, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen): effective_dmax = metric_dmax if(self.force_dmax > 0): @@ -167,12 +166,13 @@ def handle_metric(self, metric_name, service_name, attrs): else: metric_value = int(metric_value_raw) last_seen = self.cluster_localtime - metric_tn + #setting service state as 0 by default service_state=0 # call the handler to process the value and return service state after comparing metric value and threshold: service_state = self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) # write Passive checks to checkresult file - self.checkresult_file_handler.Build(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) + self.checkresult_file_handler.build(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) # main program code if __name__ == '__main__': @@ -208,15 +208,15 @@ def handle_metric(self, metric_name, service_name, attrs): parser = xml.sax.make_parser() pg = PassiveGenerator(force_dmax, tmax_grace) #Instantiate GenerateNagiosCheckResult class - gn = NagiosCheckResult.GenerateNagiosCheckResult() + gn = 
nagios_checkresult.GenerateNagiosCheckResult() #Create CheckResultFile - gn.Create(nagios_result_dir) + gn.create(nagios_result_dir) parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) # run the main program loop parser.parse(SocketInputSource(sock)) # write out for Nagios - gn.Submit() + gn.submit() # all done sock.close() diff --git a/nagios_checkresult.py b/nagios_checkresult.py new file mode 100644 index 0000000..7d9a569 --- /dev/null +++ b/nagios_checkresult.py @@ -0,0 +1,64 @@ +#!/usr/bin/python +# +# NagiosCheckResult- Class that creates Nagios checkresult file and +# writes Passive checks to it +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +########################################################################### + +import os +import tempfile +import time + + +class GenerateNagiosCheckResult: + + def __init__(self): + self.return_codes = {0: 'OK', 1: 'WARNING', 2: 'CRITICAL', 3: 'UNKNOWN'} + + #Creates a checkresult file + def create(self, nagios_result_dir): + # Nagios is quite fussy about the filename, it must be + # a 7 character name starting with 'c' + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly + self.fh = tmp_file[0] + self.cmd_file = tmp_file[1] + os.write(self.fh, "### Active Check Result File ###\n") + os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") + + # Writes to the checkresult file + def build(self, host, service_name, last_seen, service_state, metric_value, metric_units): + os.write(self.fh, "\n### Nagios Service Check Result ###\n") + os.write(self.fh, "# Time: " + time.asctime() + "\n") + os.write(self.fh, "host_name=" + host + "\n") + os.write(self.fh, "service_description=" + service_name + "\n") + os.write(self.fh, "check_type=0\n") + os.write(self.fh, "check_options=0\n") + os.write(self.fh, "scheduled_check=1\n") + os.write(self.fh, "reschedule_check=1\n") + os.write(self.fh, "latency=0.1\n") + os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "early_timeout=0\n") + os.write(self.fh, "exited_ok=1\n") + os.write(self.fh, "return_code=" + str(service_state) + "\n") + os.write(self.fh, "output=" + service_name + " " + self.return_codes[service_state] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + + def submit(self): + os.close(self.fh) + ok_filename = self.cmd_file + ".ok" + ok_fh = file(ok_filename, 'a') + ok_fh.close() + From a7416f06ff7f8cc869ba7146a7f2b6a99fcbdba4 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Sat, 10 May 2014 21:50:15 +0530 Subject: [PATCH 
11/35] Added build_host and build_service --- ganglia-nagios-bridge.py | 52 ++++++++++++++++++++++++++-------------- nagios_checkresult.py | 37 +++++++++++++++++++++------- 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index f517ae3..1d18735 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -40,34 +40,34 @@ def read(self, buf_size): return self.socket.recv(buf_size) -# interprets metric values to generate Nagios passive notifications +# interprets metric values to generate service return codes class PassiveGenerator: def __init__(self, force_dmax, tmax_grace): self.force_dmax = force_dmax self.tmax_grace = tmax_grace - def process(self, metric_def, service_name, host, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen): + def process(self, metric_def, metric_value, metric_tn, metric_tmax, metric_dmax): effective_dmax = metric_dmax if(self.force_dmax > 0): effective_dmax = force_dmax effective_tmax = metric_tmax + self.tmax_grace if effective_dmax > 0 and metric_tn > effective_dmax: - service_state = 3 + service_return_code = 3 elif metric_tn > effective_tmax: - service_state = 3 + service_return_code = 3 elif isinstance(metric_value, str): - service_state = 0 + service_return_code = 0 elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: - service_state = 2 + service_return_code = 2 elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: - service_state = 1 + service_return_code = 1 elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']: - service_state = 2 + service_return_code = 2 elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']: - service_state = 1 + service_return_code = 1 else: - service_state = 0 - return service_state + service_return_code = 0 + return service_return_code # SAX event handler for parsing the Ganglia XML stream @@ -126,12 +126,14 @@ def 
startElement(self, name, attrs): if cache_key in self.hosts_cache: self.host_idx = self.hosts_cache[cache_key] self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1] + self.handle_host(host_name, attrs) return for idx, host_def in enumerate(self.hosts): if host_def[0].match(self.host_name): self.hosts_cache[cache_key] = idx self.host_idx = idx self.metrics = host_def[1] + self.handle_host(self.host_name, attrs) return # handle a CLUSTER element in the XML @@ -150,6 +152,18 @@ def startElement(self, name, attrs): self.hosts = cluster_def[1] return + # checks the state of host by comaring tmax and tn for the host + def handle_host(self, host_name, attrs): + host_tn = int(attrs['TN']) + host_tmax = int(attrs['TMAX']) + host_last_seen = self.cluster_localtime - host_tn + if host_tn > host_tmax*4 : + host_return_code = 1 #host down + else: + host_return_code = 0 #host up + # write host checks to Nagios checkresult file + self.checkresult_file_handler.build_host(self.host_name, host_last_seen, host_return_code) + def handle_metric(self, metric_name, service_name, attrs): # extract the metric attributes metric_value_raw = attrs['VAL'] @@ -158,21 +172,23 @@ def handle_metric(self, metric_name, service_name, attrs): metric_dmax = int(attrs['DMAX']) metric_type = attrs['TYPE'] metric_units = attrs['UNITS'] - # they metric_value has a dynamic type: + # the metric_value has a dynamic type: if metric_type == 'string': metric_value = metric_value_raw elif metric_type == 'double' or metric_type == 'float': metric_value = float(metric_value_raw) else: metric_value = int(metric_value_raw) - last_seen = self.cluster_localtime - metric_tn + service_last_seen = self.cluster_localtime - metric_tn - #setting service state as 0 by default - service_state=0 + #setting service return code as 0 by default + service_return_code=0 # call the handler to process the value and return service state after comparing metric value and threshold: - service_state = 
self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen) - # write Passive checks to checkresult file - self.checkresult_file_handler.build(self.host_name, service_name, last_seen, service_state, metric_value, metric_units) + service_return_code = self.value_handler.process(self.metric, metric_value, metric_tn, metric_tmax, metric_dmax) + # write Passive service checks to checkresult file + self.checkresult_file_handler.build_service(self.host_name, service_name, service_last_seen, service_return_code, metric_value, metric_units) + + # main program code if __name__ == '__main__': diff --git a/nagios_checkresult.py b/nagios_checkresult.py index 7d9a569..68d5f4e 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -1,7 +1,7 @@ #!/usr/bin/python # # NagiosCheckResult- Class that creates Nagios checkresult file and -# writes Passive checks to it +# writes Passive Host and Service checks to it # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -26,9 +26,10 @@ class GenerateNagiosCheckResult: def __init__(self): - self.return_codes = {0: 'OK', 1: 'WARNING', 2: 'CRITICAL', 3: 'UNKNOWN'} + self.service_state = {0: 'OK', 1: 'WARNING', 2: 'CRITICAL', 3: 'UNKNOWN'} + self.host_state = {0: 'UP', 1: 'DOWN', 2: 'DOWN', 3: 'DOWN'} - #Creates a checkresult file + # Creates a checkresult file def create(self, nagios_result_dir): # Nagios is quite fussy about the filename, it must be # a 7 character name starting with 'c' @@ -37,9 +38,28 @@ def create(self, nagios_result_dir): self.cmd_file = tmp_file[1] os.write(self.fh, "### Active Check Result File ###\n") os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") - - # Writes to the checkresult file - def build(self, host, service_name, last_seen, service_state, metric_value, metric_units): + + # Accepts host name, last seen time 
and return code for the host checkresult + # Writes host checks to checkresult file + def build_host(self, host, last_seen, host_return_code): + os.write(self.fh, "\n### Nagios Host Check Result ###\n") + os.write(self.fh, "# Time: " + time.asctime() + "\n") + os.write(self.fh, "host_name=" + host + "\n") + os.write(self.fh, "check_type=0\n") + os.write(self.fh, "check_options=0\n") + os.write(self.fh, "scheduled_check=1\n") + os.write(self.fh, "reschedule_check=1\n") + os.write(self.fh, "latency=0.1\n") + os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") + os.write(self.fh, "early_timeout=0\n") + os.write(self.fh, "exited_ok=1\n") + os.write(self.fh, "return_code=" + str(host_return_code) + "\n") + os.write(self.fh, "output=" + " " + "Host (" + host + ")" + self.host_state[host_return_code] + "\\n\n") + + # Accepts host name, service name, last seen time, metric values and units and return code for the service checkresult + # Writes service checks to the checkresult file + def build_service(self, host, service_name, last_seen, service_return_code, metric_value, metric_units): os.write(self.fh, "\n### Nagios Service Check Result ###\n") os.write(self.fh, "# Time: " + time.asctime() + "\n") os.write(self.fh, "host_name=" + host + "\n") @@ -53,9 +73,10 @@ def build(self, host, service_name, last_seen, service_state, metric_value, metr os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") os.write(self.fh, "early_timeout=0\n") os.write(self.fh, "exited_ok=1\n") - os.write(self.fh, "return_code=" + str(service_state) + "\n") - os.write(self.fh, "output=" + service_name + " " + self.return_codes[service_state] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + os.write(self.fh, "return_code=" + str(service_return_code) + "\n") + os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + 
str(metric_value) + " " + metric_units + "\\n\n") + # Close the file handle and create an ok-to-go indicator file def submit(self): os.close(self.fh) ok_filename = self.cmd_file + ".ok" From 0e73ac19d17aa1c5cfd664ab043fec228fc461f3 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Tue, 13 May 2014 19:15:42 +0530 Subject: [PATCH 12/35] Exception handle for temfile added --- nagios_checkresult.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/nagios_checkresult.py b/nagios_checkresult.py index 68d5f4e..acb595a 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -21,6 +21,7 @@ import os import tempfile import time +import sys class GenerateNagiosCheckResult: @@ -33,12 +34,17 @@ def __init__(self): def create(self, nagios_result_dir): # Nagios is quite fussy about the filename, it must be # a 7 character name starting with 'c' - tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly - self.fh = tmp_file[0] - self.cmd_file = tmp_file[1] - os.write(self.fh, "### Active Check Result File ###\n") - os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") - + try: + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly + self.fh = tmp_file[0] + self.cmd_file = tmp_file[1] + os.write(self.fh, "### Active Check Result File ###\n") + os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") + except OSError as e: + #print "OS error({0}): {1}".format(e.errno, e.strerror) + print "Failed to create tempfile at", nagios_result_dir + sys.exit(1) + # Accepts host name, last seen time and return code for the host checkresult # Writes host checks to checkresult file def build_host(self, host, last_seen, host_return_code): From 858f6aeb30f2b90dd05d772c2d97c5bbd84fc7a4 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Wed, 14 May 2014 23:48:56 +0530 Subject: [PATCH 13/35] Flexibility in Data sent to 
checkresult file --- ganglia-nagios-bridge.py | 13 +++++++----- nagios_checkresult.py | 45 ++++++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index 1d18735..d54c9d1 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -156,13 +156,15 @@ def startElement(self, name, attrs): def handle_host(self, host_name, attrs): host_tn = int(attrs['TN']) host_tmax = int(attrs['TMAX']) - host_last_seen = self.cluster_localtime - host_tn + last_seen = self.cluster_localtime - host_tn if host_tn > host_tmax*4 : host_return_code = 1 #host down else: host_return_code = 0 #host up + host_last_seen = str(last_seen) + '.0' + # write host checks to Nagios checkresult file - self.checkresult_file_handler.build_host(self.host_name, host_last_seen, host_return_code) + self.checkresult_file_handler.build_host(self.host_name, 0, 0, 1, 1, 0.1, host_last_seen, host_last_seen, 0, 1, host_return_code) def handle_metric(self, metric_name, service_name, attrs): # extract the metric attributes @@ -179,14 +181,15 @@ def handle_metric(self, metric_name, service_name, attrs): metric_value = float(metric_value_raw) else: metric_value = int(metric_value_raw) - service_last_seen = self.cluster_localtime - metric_tn + last_seen = self.cluster_localtime - metric_tn + service_last_seen = str(last_seen) + '.0' #setting service return code as 0 by default service_return_code=0 # call the handler to process the value and return service state after comparing metric value and threshold: - service_return_code = self.value_handler.process(self.metric, metric_value, metric_tn, metric_tmax, metric_dmax) + service_return_code = self.value_handler.process(self.metric, metric_value, metric_tn, metric_tmax, metric_dmax) # write Passive service checks to checkresult file - self.checkresult_file_handler.build_service(self.host_name, service_name, service_last_seen, service_return_code, metric_value, 
metric_units) + self.checkresult_file_handler.build_service(self.host_name, service_name, 0, 0, 1, 1, 0.1, service_last_seen, service_last_seen, 0, 1, service_return_code, metric_value, metric_units) diff --git a/nagios_checkresult.py b/nagios_checkresult.py index acb595a..e8dd1b2 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -45,40 +45,40 @@ def create(self, nagios_result_dir): print "Failed to create tempfile at", nagios_result_dir sys.exit(1) - # Accepts host name, last seen time and return code for the host checkresult + # Accepts parameters required for the host checkresult # Writes host checks to checkresult file - def build_host(self, host, last_seen, host_return_code): + def build_host(self, host, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, host_return_code): os.write(self.fh, "\n### Nagios Host Check Result ###\n") os.write(self.fh, "# Time: " + time.asctime() + "\n") os.write(self.fh, "host_name=" + host + "\n") - os.write(self.fh, "check_type=0\n") - os.write(self.fh, "check_options=0\n") - os.write(self.fh, "scheduled_check=1\n") - os.write(self.fh, "reschedule_check=1\n") - os.write(self.fh, "latency=0.1\n") - os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "early_timeout=0\n") - os.write(self.fh, "exited_ok=1\n") + os.write(self.fh, "check_type=" + str(check_type) + "\n") + os.write(self.fh, "check_options=" + str(check_options) + "\n") + os.write(self.fh, "scheduled_check=" + str(scheduled_check) + "\n") + os.write(self.fh, "reschedule_check=" + str(reschedule_check) + "\n") + os.write(self.fh, "latency=" + str(latency) + "\n") + os.write(self.fh, "start_time=" + str(start_time) + "\n") + os.write(self.fh, "finish_time=" + str(finish_time) + "\n") + os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n") + os.write(self.fh, "exited_ok=" + str(exited_ok) + 
"\n") os.write(self.fh, "return_code=" + str(host_return_code) + "\n") os.write(self.fh, "output=" + " " + "Host (" + host + ")" + self.host_state[host_return_code] + "\\n\n") - # Accepts host name, service name, last seen time, metric values and units and return code for the service checkresult + # Accepts parameters required for the service checkresult # Writes service checks to the checkresult file - def build_service(self, host, service_name, last_seen, service_return_code, metric_value, metric_units): + def build_service(self, host, service_name, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, service_return_code, metric_value, metric_units): os.write(self.fh, "\n### Nagios Service Check Result ###\n") os.write(self.fh, "# Time: " + time.asctime() + "\n") os.write(self.fh, "host_name=" + host + "\n") os.write(self.fh, "service_description=" + service_name + "\n") - os.write(self.fh, "check_type=0\n") - os.write(self.fh, "check_options=0\n") - os.write(self.fh, "scheduled_check=1\n") - os.write(self.fh, "reschedule_check=1\n") - os.write(self.fh, "latency=0.1\n") - os.write(self.fh, "start_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n") - os.write(self.fh, "early_timeout=0\n") - os.write(self.fh, "exited_ok=1\n") + os.write(self.fh, "check_type=" + str(check_type) + "\n") + os.write(self.fh, "check_options=" + str(check_options) + "\n") + os.write(self.fh, "scheduled_check=" + str(scheduled_check) + "\n") + os.write(self.fh, "reschedule_check=" + str(reschedule_check) + "\n") + os.write(self.fh, "latency=" + str(latency) + "\n") + os.write(self.fh, "start_time=" + str(start_time) + "\n") + os.write(self.fh, "finish_time=" + str(finish_time) + "\n") + os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n") + os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(service_return_code) + "\n") 
os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") @@ -86,6 +86,7 @@ def build_service(self, host, service_name, last_seen, service_return_code, metr def submit(self): os.close(self.fh) ok_filename = self.cmd_file + ".ok" + print self.cmd_file ok_fh = file(ok_filename, 'a') ok_fh.close() From cda8781979c3055d27f2d51a3d51d2210c934cca Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Thu, 15 May 2014 00:03:51 +0530 Subject: [PATCH 14/35] Accept output string --- ganglia-nagios-bridge.py | 4 ++-- nagios_checkresult.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index d54c9d1..d4016d3 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -164,7 +164,7 @@ def handle_host(self, host_name, attrs): host_last_seen = str(last_seen) + '.0' # write host checks to Nagios checkresult file - self.checkresult_file_handler.build_host(self.host_name, 0, 0, 1, 1, 0.1, host_last_seen, host_last_seen, 0, 1, host_return_code) + self.checkresult_file_handler.build_host(self.host_name, 0, 0, 1, 1, 0.1, host_last_seen, host_last_seen, 0, 1, host_return_code,"") def handle_metric(self, metric_name, service_name, attrs): # extract the metric attributes @@ -189,7 +189,7 @@ def handle_metric(self, metric_name, service_name, attrs): # call the handler to process the value and return service state after comparing metric value and threshold: service_return_code = self.value_handler.process(self.metric, metric_value, metric_tn, metric_tmax, metric_dmax) # write Passive service checks to checkresult file - self.checkresult_file_handler.build_service(self.host_name, service_name, 0, 0, 1, 1, 0.1, service_last_seen, service_last_seen, 0, 1, service_return_code, metric_value, metric_units) + self.checkresult_file_handler.build_service(self.host_name, service_name, 0, 0, 1, 1, 0.1, 
service_last_seen, service_last_seen, 0, 1, service_return_code, metric_value, metric_units,"") diff --git a/nagios_checkresult.py b/nagios_checkresult.py index e8dd1b2..44f152f 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -47,7 +47,7 @@ def create(self, nagios_result_dir): # Accepts parameters required for the host checkresult # Writes host checks to checkresult file - def build_host(self, host, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, host_return_code): + def build_host(self, host, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, host_return_code, output_string): os.write(self.fh, "\n### Nagios Host Check Result ###\n") os.write(self.fh, "# Time: " + time.asctime() + "\n") os.write(self.fh, "host_name=" + host + "\n") @@ -61,11 +61,14 @@ def build_host(self, host, check_type, check_options, scheduled_check, reschedul os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n") os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(host_return_code) + "\n") - os.write(self.fh, "output=" + " " + "Host (" + host + ")" + self.host_state[host_return_code] + "\\n\n") + if not output_string: + os.write(self.fh, "output=" + " " + "Host (" + host + ")" + " " + self.host_state[host_return_code] + "\\n\n") + else: + os.write(self.fh, "output=" + " " + output_string + "\\n\n") # Accepts parameters required for the service checkresult # Writes service checks to the checkresult file - def build_service(self, host, service_name, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, service_return_code, metric_value, metric_units): + def build_service(self, host, service_name, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, 
service_return_code, metric_value, metric_units, output_string): os.write(self.fh, "\n### Nagios Service Check Result ###\n") os.write(self.fh, "# Time: " + time.asctime() + "\n") os.write(self.fh, "host_name=" + host + "\n") @@ -80,7 +83,10 @@ def build_service(self, host, service_name, check_type, check_options, scheduled os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n") os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(service_return_code) + "\n") - os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + if not output_string: + os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + else: + os.write(self.fh, "output=" + " " + output_string + "\\n\n") # Close the file handle and create an ok-to-go indicator file def submit(self): From 80840763d0f1c08628b66556a47959685627adaa Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Fri, 16 May 2014 03:43:59 +0530 Subject: [PATCH 15/35] send file time and checkresult time --- ganglia-nagios-bridge.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index d4016d3..6e69d0f 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -25,6 +25,7 @@ import re import socket import xml.sax +import time import nagios_checkresult # wrapper class so that the SAX parser can process data from a network @@ -164,7 +165,7 @@ def handle_host(self, host_name, attrs): host_last_seen = str(last_seen) + '.0' # write host checks to Nagios checkresult file - self.checkresult_file_handler.build_host(self.host_name, 0, 0, 1, 1, 0.1, host_last_seen, host_last_seen, 0, 1, host_return_code,"") + self.checkresult_file_handler.build_host(time.asctime(), self.host_name, 0, 0, 1, 1, 
0.1, host_last_seen, host_last_seen, 0, 1, host_return_code,"") def handle_metric(self, metric_name, service_name, attrs): # extract the metric attributes @@ -189,7 +190,7 @@ def handle_metric(self, metric_name, service_name, attrs): # call the handler to process the value and return service state after comparing metric value and threshold: service_return_code = self.value_handler.process(self.metric, metric_value, metric_tn, metric_tmax, metric_dmax) # write Passive service checks to checkresult file - self.checkresult_file_handler.build_service(self.host_name, service_name, 0, 0, 1, 1, 0.1, service_last_seen, service_last_seen, 0, 1, service_return_code, metric_value, metric_units,"") + self.checkresult_file_handler.build_service(time.asctime(), self.host_name, service_name, 0, 0, 1, 1, 0.1, service_last_seen, service_last_seen, 0, 1, service_return_code, metric_value, metric_units,"") @@ -229,7 +230,7 @@ def handle_metric(self, metric_name, service_name, attrs): #Instantiate GenerateNagiosCheckResult class gn = nagios_checkresult.GenerateNagiosCheckResult() #Create CheckResultFile - gn.create(nagios_result_dir) + gn.create(nagios_result_dir, int(time.time())) parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) # run the main program loop parser.parse(SocketInputSource(sock)) From 8a292666ad910827021ef897b2dfd47b5ca9412f Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Fri, 16 May 2014 03:44:35 +0530 Subject: [PATCH 16/35] Accept file time and checkresult time --- nagios_checkresult.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/nagios_checkresult.py b/nagios_checkresult.py index 44f152f..e2841bf 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -20,7 +20,6 @@ import os import tempfile -import time import sys @@ -31,7 +30,7 @@ def __init__(self): self.host_state = {0: 'UP', 1: 'DOWN', 2: 'DOWN', 3: 'DOWN'} # Creates a checkresult file - def create(self, nagios_result_dir): + def create(self, 
nagios_result_dir, file_time): # Nagios is quite fussy about the filename, it must be # a 7 character name starting with 'c' try: @@ -39,7 +38,7 @@ def create(self, nagios_result_dir): self.fh = tmp_file[0] self.cmd_file = tmp_file[1] os.write(self.fh, "### Active Check Result File ###\n") - os.write(self.fh, "file_time=" + str(int(time.time())) + "\n") + os.write(self.fh, "file_time=" + str(file_time) + "\n") except OSError as e: #print "OS error({0}): {1}".format(e.errno, e.strerror) print "Failed to create tempfile at", nagios_result_dir @@ -47,9 +46,9 @@ def create(self, nagios_result_dir): # Accepts parameters required for the host checkresult # Writes host checks to checkresult file - def build_host(self, host, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, host_return_code, output_string): + def build_host(self, checkresult_time, host, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, host_return_code, output_string): os.write(self.fh, "\n### Nagios Host Check Result ###\n") - os.write(self.fh, "# Time: " + time.asctime() + "\n") + os.write(self.fh, "# Time: " + checkresult_time + "\n") os.write(self.fh, "host_name=" + host + "\n") os.write(self.fh, "check_type=" + str(check_type) + "\n") os.write(self.fh, "check_options=" + str(check_options) + "\n") @@ -68,9 +67,9 @@ def build_host(self, host, check_type, check_options, scheduled_check, reschedul # Accepts parameters required for the service checkresult # Writes service checks to the checkresult file - def build_service(self, host, service_name, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, service_return_code, metric_value, metric_units, output_string): + def build_service(self, checkresult_time, host, service_name, check_type, check_options, scheduled_check, reschedule_check, latency, 
start_time, finish_time, early_timeout, exited_ok, service_return_code, metric_value, metric_units, output_string): os.write(self.fh, "\n### Nagios Service Check Result ###\n") - os.write(self.fh, "# Time: " + time.asctime() + "\n") + os.write(self.fh, "# Time: " + checkresult_time + "\n") os.write(self.fh, "host_name=" + host + "\n") os.write(self.fh, "service_description=" + service_name + "\n") os.write(self.fh, "check_type=" + str(check_type) + "\n") @@ -92,7 +91,7 @@ def build_service(self, host, service_name, check_type, check_options, scheduled def submit(self): os.close(self.fh) ok_filename = self.cmd_file + ".ok" - print self.cmd_file ok_fh = file(ok_filename, 'a') ok_fh.close() + return self.cmd_file From e2ff3ca99411af00a4d775bbbf748b7bcadae8a9 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Sun, 18 May 2014 02:12:18 +0530 Subject: [PATCH 17/35] unit test for nagios_checkresult --- test_nagios_checkresult.py | 60 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 test_nagios_checkresult.py diff --git a/test_nagios_checkresult.py b/test_nagios_checkresult.py new file mode 100644 index 0000000..47ff5a2 --- /dev/null +++ b/test_nagios_checkresult.py @@ -0,0 +1,60 @@ +#! 
/usr/bin/python + +import unittest +import time +import textwrap +import nagios_checkresult + +class TestNagiosCheckResult(unittest.TestCase): + def setUp (self): + self.maxDiff = None + #this is how checkresult file should look like + self.checkresult = textwrap.dedent("""\ + ### Active Check Result File ### + file_time=1400347643.73 + + ### Nagios Host Check Result ### + # Time: Sat May 17 22:57:23 2014 + host_name=xyz + check_type=0 + check_options=0 + scheduled_check=1 + reschedule_check=1 + latency=0.1 + start_time=1399732963.0 + finish_time=1399732963.0 + early_timeout=0 + exited_ok=1 + return_code=0 + output= Host (xyz) UP\\n + + ### Nagios Service Check Result ### + # Time: Sat May 17 22:57:23 2014 + host_name=xyz + service_description=Total processes + check_type=0 + check_options=0 + scheduled_check=1 + reschedule_check=1 + latency=0.1 + start_time=1399732963.0 + finish_time=1399732963.0 + early_timeout=0 + exited_ok=1 + return_code=0 + output=Total processes OK- Total processes 288 \\n\n""") + + def test_checkresult(self): + #generate checkresult file by sending data to GenerateNagiosCheckResult + ng = nagios_checkresult.GenerateNagiosCheckResult() + ng.create('/var/lib/nagios3/spool/checkresults', 1400347643.73) + ng.build_host('Sat May 17 22:57:23 2014', 'xyz', 0, 0, 1, 1, 0.1, str(1399732963.0), str(1399732963.0), 0, 1, 0, "") + ng.build_service('Sat May 17 22:57:23 2014', 'xyz', 'Total processes', 0, 0, 1, 1, 0.1, str(1399732963.0), str(1399732963.0), 0, 1, 0, 288, "", "") + #fname is the name of checkresult file generated + fname = ng.submit() + self.testfile = open(fname).read() + #compare the expected checkresult file with generated checkresult file + self.assertMultiLineEqual(self.testfile, self.checkresult, msg=None) + +if __name__ == '__main__': + unittest.main() From 4be2d9fdcc721f37361ee5a2a522fbade6b57edd Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Thu, 29 May 2014 00:01:06 +0530 Subject: [PATCH 18/35] Added space in error message --- 
nagios_checkresult.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nagios_checkresult.py b/nagios_checkresult.py index e2841bf..b5509c3 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -41,7 +41,7 @@ def create(self, nagios_result_dir, file_time): os.write(self.fh, "file_time=" + str(file_time) + "\n") except OSError as e: #print "OS error({0}): {1}".format(e.errno, e.strerror) - print "Failed to create tempfile at", nagios_result_dir + print "Failed to create tempfile at ", nagios_result_dir sys.exit(1) # Accepts parameters required for the host checkresult From 76e8bfd840da02defe956123bfc20ff7614ec526 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Mon, 2 Jun 2014 18:54:29 +0530 Subject: [PATCH 19/35] Converted tabs to spaces --- nagios_checkresult.py | 44 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/nagios_checkresult.py b/nagios_checkresult.py index b5509c3..9be8023 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -26,28 +26,28 @@ class GenerateNagiosCheckResult: def __init__(self): - self.service_state = {0: 'OK', 1: 'WARNING', 2: 'CRITICAL', 3: 'UNKNOWN'} - self.host_state = {0: 'UP', 1: 'DOWN', 2: 'DOWN', 3: 'DOWN'} + self.service_state = {0: 'OK', 1: 'WARNING', 2: 'CRITICAL', 3: 'UNKNOWN'} + self.host_state = {0: 'UP', 1: 'DOWN', 2: 'DOWN', 3: 'DOWN'} # Creates a checkresult file def create(self, nagios_result_dir, file_time): - # Nagios is quite fussy about the filename, it must be + # Nagios is quite fussy about the filename, it must be # a 7 character name starting with 'c' - try: - tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly - self.fh = tmp_file[0] - self.cmd_file = tmp_file[1] - os.write(self.fh, "### Active Check Result File ###\n") - os.write(self.fh, "file_time=" + str(file_time) + "\n") - except OSError as e: - #print "OS error({0}): {1}".format(e.errno, 
e.strerror) - print "Failed to create tempfile at ", nagios_result_dir - sys.exit(1) + try: + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly + self.fh = tmp_file[0] + self.cmd_file = tmp_file[1] + os.write(self.fh, "### Active Check Result File ###\n") + os.write(self.fh, "file_time=" + str(file_time) + "\n") + except OSError as e: + #print "OS error({0}): {1}".format(e.errno, e.strerror) + print "Failed to create tempfile at ", nagios_result_dir + sys.exit(1) # Accepts parameters required for the host checkresult # Writes host checks to checkresult file def build_host(self, checkresult_time, host, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, host_return_code, output_string): - os.write(self.fh, "\n### Nagios Host Check Result ###\n") + os.write(self.fh, "\n### Nagios Host Check Result ###\n") os.write(self.fh, "# Time: " + checkresult_time + "\n") os.write(self.fh, "host_name=" + host + "\n") os.write(self.fh, "check_type=" + str(check_type) + "\n") @@ -60,15 +60,15 @@ def build_host(self, checkresult_time, host, check_type, check_options, schedule os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n") os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(host_return_code) + "\n") - if not output_string: - os.write(self.fh, "output=" + " " + "Host (" + host + ")" + " " + self.host_state[host_return_code] + "\\n\n") - else: - os.write(self.fh, "output=" + " " + output_string + "\\n\n") + if not output_string: + os.write(self.fh, "output=" + " " + "Host (" + host + ")" + " " + self.host_state[host_return_code] + "\\n\n") + else: + os.write(self.fh, "output=" + " " + output_string + "\\n\n") # Accepts parameters required for the service checkresult # Writes service checks to the checkresult file def build_service(self, checkresult_time, host, service_name, check_type, 
check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, service_return_code, metric_value, metric_units, output_string): - os.write(self.fh, "\n### Nagios Service Check Result ###\n") + os.write(self.fh, "\n### Nagios Service Check Result ###\n") os.write(self.fh, "# Time: " + checkresult_time + "\n") os.write(self.fh, "host_name=" + host + "\n") os.write(self.fh, "service_description=" + service_name + "\n") @@ -82,10 +82,10 @@ def build_service(self, checkresult_time, host, service_name, check_type, check_ os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n") os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(service_return_code) + "\n") - if not output_string: - os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + if not output_string: + os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") else: - os.write(self.fh, "output=" + " " + output_string + "\\n\n") + os.write(self.fh, "output=" + " " + output_string + "\\n\n") # Close the file handle and create an ok-to-go indicator file def submit(self): From 8c517d1fbba31ba05269f25e07563afcc3413810 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Mon, 2 Jun 2014 19:00:53 +0530 Subject: [PATCH 20/35] Corrected try block --- nagios_checkresult.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/nagios_checkresult.py b/nagios_checkresult.py index 9be8023..079ffc7 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -33,16 +33,16 @@ def __init__(self): def create(self, nagios_result_dir, file_time): # Nagios is quite fussy about the filename, it must be # a 7 character name starting with 'c' - try: - tmp_file = 
tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly - self.fh = tmp_file[0] - self.cmd_file = tmp_file[1] - os.write(self.fh, "### Active Check Result File ###\n") - os.write(self.fh, "file_time=" + str(file_time) + "\n") - except OSError as e: - #print "OS error({0}): {1}".format(e.errno, e.strerror) - print "Failed to create tempfile at ", nagios_result_dir - sys.exit(1) + try: + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly + self.fh = tmp_file[0] + self.cmd_file = tmp_file[1] + os.write(self.fh, "### Active Check Result File ###\n") + os.write(self.fh, "file_time=" + str(file_time) + "\n") + except OSError as e: + #print "OS error({0}): {1}".format(e.errno, e.strerror) + print "Failed to create tempfile at ", nagios_result_dir + sys.exit(1) # Accepts parameters required for the host checkresult # Writes host checks to checkresult file From f6778f9719916cf886c34b06194fa835cc92d521 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Mon, 2 Jun 2014 21:29:05 +0530 Subject: [PATCH 21/35] error in checkresult file creation handled by calling class --- nagios_checkresult.py | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/nagios_checkresult.py b/nagios_checkresult.py index 079ffc7..0889a4f 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -26,28 +26,23 @@ class GenerateNagiosCheckResult: def __init__(self): - self.service_state = {0: 'OK', 1: 'WARNING', 2: 'CRITICAL', 3: 'UNKNOWN'} - self.host_state = {0: 'UP', 1: 'DOWN', 2: 'DOWN', 3: 'DOWN'} + self.service_state = {0: 'OK', 1: 'WARNING', 2: 'CRITICAL', 3: 'UNKNOWN'} + self.host_state = {0: 'UP', 1: 'DOWN', 2: 'DOWN', 3: 'DOWN'} # Creates a checkresult file def create(self, nagios_result_dir, file_time): - # Nagios is quite fussy about the filename, it must be + # Nagios is quite fussy about the filename, it must be # a 
7 character name starting with 'c' - try: - tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly - self.fh = tmp_file[0] - self.cmd_file = tmp_file[1] - os.write(self.fh, "### Active Check Result File ###\n") - os.write(self.fh, "file_time=" + str(file_time) + "\n") - except OSError as e: - #print "OS error({0}): {1}".format(e.errno, e.strerror) - print "Failed to create tempfile at ", nagios_result_dir - sys.exit(1) + tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir) # specifies name and directory, check tempfile thoroughly + self.fh = tmp_file[0] + self.cmd_file = tmp_file[1] + os.write(self.fh, "### Active Check Result File ###\n") + os.write(self.fh, "file_time=" + str(file_time) + "\n") # Accepts parameters required for the host checkresult # Writes host checks to checkresult file def build_host(self, checkresult_time, host, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, host_return_code, output_string): - os.write(self.fh, "\n### Nagios Host Check Result ###\n") + os.write(self.fh, "\n### Nagios Host Check Result ###\n") os.write(self.fh, "# Time: " + checkresult_time + "\n") os.write(self.fh, "host_name=" + host + "\n") os.write(self.fh, "check_type=" + str(check_type) + "\n") @@ -60,15 +55,15 @@ def build_host(self, checkresult_time, host, check_type, check_options, schedule os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n") os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(host_return_code) + "\n") - if not output_string: - os.write(self.fh, "output=" + " " + "Host (" + host + ")" + " " + self.host_state[host_return_code] + "\\n\n") - else: - os.write(self.fh, "output=" + " " + output_string + "\\n\n") + if not output_string: + os.write(self.fh, "output=" + " " + "Host (" + host + ")" + " " + self.host_state[host_return_code] + "\\n\n") + else: + 
os.write(self.fh, "output=" + " " + output_string + "\\n\n") # Accepts parameters required for the service checkresult # Writes service checks to the checkresult file def build_service(self, checkresult_time, host, service_name, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, service_return_code, metric_value, metric_units, output_string): - os.write(self.fh, "\n### Nagios Service Check Result ###\n") + os.write(self.fh, "\n### Nagios Service Check Result ###\n") os.write(self.fh, "# Time: " + checkresult_time + "\n") os.write(self.fh, "host_name=" + host + "\n") os.write(self.fh, "service_description=" + service_name + "\n") @@ -82,10 +77,10 @@ def build_service(self, checkresult_time, host, service_name, check_type, check_ os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n") os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(service_return_code) + "\n") - if not output_string: - os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + if not output_string: + os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") else: - os.write(self.fh, "output=" + " " + output_string + "\\n\n") + os.write(self.fh, "output=" + " " + output_string + "\\n\n") # Close the file handle and create an ok-to-go indicator file def submit(self): From e4fb226e488ed7da13da6c0467895502a250e7d3 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Mon, 2 Jun 2014 21:29:42 +0530 Subject: [PATCH 22/35] Handles error during creation of checkresult file --- ganglia-nagios-bridge.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index 6e69d0f..c73f99b 100644 
--- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -230,16 +230,20 @@ def handle_metric(self, metric_name, service_name, attrs): #Instantiate GenerateNagiosCheckResult class gn = nagios_checkresult.GenerateNagiosCheckResult() #Create CheckResultFile - gn.create(nagios_result_dir, int(time.time())) - parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) - # run the main program loop - parser.parse(SocketInputSource(sock)) + try: + gn.create(nagios_result_dir, int(time.time())) + parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) + # run the main program loop + parser.parse(SocketInputSource(sock)) - # write out for Nagios - gn.submit() + # write out for Nagios + gn.submit() - # all done - sock.close() + # all done + sock.close() + except OSError as e: + print "Failed to create tempfile at", nagios_result_dir + except socket.error as e: logging.warn('Failed to connect to gmetad: %s', e.strerror) From 55aee797caee84ecec1992ba6c57a608303b78c0 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Mon, 2 Jun 2014 22:28:34 +0530 Subject: [PATCH 23/35] Fixed indentation --- ganglia-nagios-bridge.py | 6 +++--- nagios_checkresult.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index c73f99b..315dc14 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -36,7 +36,7 @@ def __init__(self, socket): def getByteStream(self): return self - + def read(self, buf_size): return self.socket.recv(buf_size) @@ -69,7 +69,7 @@ def process(self, metric_def, metric_value, metric_tn, metric_tmax, metric_dmax) else: service_return_code = 0 return service_return_code - + # SAX event handler for parsing the Ganglia XML stream class GangliaHandler(xml.sax.ContentHandler): @@ -200,7 +200,7 @@ def handle_metric(self, metric_name, service_name, attrs): # parse command line parser = argparse.ArgumentParser(description='read Ganglia XML and generate Nagios check results file') 
parser.add_argument('config_file', nargs='?', - help='configuration file', default='/etc/ganglia/nagios-bridge.conf') + help='configuration file', default='/etc/ganglia/nagios-bridge.conf') args = parser.parse_args() # read the configuration file, setting some defaults first diff --git a/nagios_checkresult.py b/nagios_checkresult.py index 0889a4f..09b41c8 100644 --- a/nagios_checkresult.py +++ b/nagios_checkresult.py @@ -56,10 +56,10 @@ def build_host(self, checkresult_time, host, check_type, check_options, schedule os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(host_return_code) + "\n") if not output_string: - os.write(self.fh, "output=" + " " + "Host (" + host + ")" + " " + self.host_state[host_return_code] + "\\n\n") + os.write(self.fh, "output=" + " " + "Host (" + host + ")" + " " + self.host_state[host_return_code] + "\\n\n") else: - os.write(self.fh, "output=" + " " + output_string + "\\n\n") - + os.write(self.fh, "output=" + " " + output_string + "\\n\n") + # Accepts parameters required for the service checkresult # Writes service checks to the checkresult file def build_service(self, checkresult_time, host, service_name, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, service_return_code, metric_value, metric_units, output_string): @@ -78,9 +78,9 @@ def build_service(self, checkresult_time, host, service_name, check_type, check_ os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n") os.write(self.fh, "return_code=" + str(service_return_code) + "\n") if not output_string: - os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") + os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n") else: - os.write(self.fh, 
"output=" + " " + output_string + "\\n\n") + os.write(self.fh, "output=" + " " + output_string + "\\n\n") # Close the file handle and create an ok-to-go indicator file def submit(self): From 73f4ab428493695e668606d66e03871e9b434645 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Sun, 27 Jul 2014 12:12:10 +0530 Subject: [PATCH 24/35] New config format --- conf_parser.py | 49 +++++++++++++++++++++++++++++++++ ganglia-nagios-bridge.py | 56 +++++++++++++++----------------------- sample.conf | 58 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 35 deletions(-) create mode 100644 conf_parser.py create mode 100644 sample.conf diff --git a/conf_parser.py b/conf_parser.py new file mode 100644 index 0000000..5daef96 --- /dev/null +++ b/conf_parser.py @@ -0,0 +1,49 @@ +#! /usr/bin/python + +from configobj import ConfigObj,ConfigObjError + + +class ConfigParser: + def __init__(self): + self.clusters =[] + + def parse (self,config_file): + try: + config = ConfigObj(config_file) + self.gmetad_host = config.pop('gmetad_host') + self.gmetad_port = config.pop('gmetad_port') + self.force_dmax = config.pop('force_dmax') + self.tmax_grace = config.pop('tmax_grace') + self.strip_domains = config.pop('strip_domains') + self.nagios_result_dir = config.pop('nagios_result_dir') + for cluster_name in config.keys(): + hosts = [] + for host_name in config[cluster_name].keys(): + host = [] + for x in host_name.split(','): + host.append(x) + metrics = [] + for metric_name in config[cluster_name][host_name].keys(): + metric_def = {} + metric_def['service_name'] = config[cluster_name][host_name][metric_name]['service_name'] + if 'crit_above' in config[cluster_name][host_name][metric_name].keys(): + metric_def['crit_above'] = config[cluster_name][host_name][metric_name]['crit_above'] + metric_def['crit_below'] = None + if 'crit_below' in config[cluster_name][host_name][metric_name].keys(): + metric_def['crit_below'] = 
config[cluster_name][host_name][metric_name]['crit_below'] + metric_def['crit_above'] = None + if 'warn_above' in config[cluster_name][host_name][metric_name].keys(): + metric_def['warn_above'] = config[cluster_name][host_name][metric_name]['warn_above'] + metric_def['warn_below'] = None + if 'warn_below' in config[cluster_name][host_name][metric_name].keys(): + metric_def['warn_below'] = config[cluster_name][host_name][metric_name]['warn_below'] + metric_def['warn_above'] = None + metrics.append((metric_name,metric_def)) + for hostn in host: + hosts.append((hostn,metrics)) + self.clusters.append((cluster_name,hosts)) + + except (ConfigObjError, IOError), e: + print 'Could not read %s' % (e) + + diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index 315dc14..e54ff95 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -27,6 +27,7 @@ import xml.sax import time import nagios_checkresult +import conf_parser # wrapper class so that the SAX parser can process data from a network # socket @@ -58,13 +59,13 @@ def process(self, metric_def, metric_value, metric_tn, metric_tmax, metric_dmax) service_return_code = 3 elif isinstance(metric_value, str): service_return_code = 0 - elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']: + elif metric_def['crit_below'] is not None and metric_value < float(metric_def['crit_below']): service_return_code = 2 - elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']: + elif metric_def['warn_below'] is not None and metric_value < float(metric_def['warn_below']): service_return_code = 1 - elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']: + elif metric_def['crit_above'] is not None and metric_value > float(metric_def['crit_above']): service_return_code = 2 - elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']: + elif metric_def['warn_above'] is not None and metric_value > float(metric_def['warn_above']): 
service_return_code = 1 else: service_return_code = 0 @@ -73,19 +74,20 @@ def process(self, metric_def, metric_value, metric_tn, metric_tmax, metric_dmax) # SAX event handler for parsing the Ganglia XML stream class GangliaHandler(xml.sax.ContentHandler): - def __init__(self, clusters_c, value_handler, checkresult_file_handler): + def __init__(self, clusters_c, value_handler, checkresult_file_handler, strip_domains): self.clusters_c = clusters_c self.value_handler = value_handler self.checkresult_file_handler = checkresult_file_handler self.clusters_cache = {} self.hosts_cache = {} self.metrics_cache = {} + self.strip_domains = strip_domains def startElement(self, name, attrs): # METRIC is the most common element, it is handled first, # followed by HOST and CLUSTER - + # handle common elements that we ignore if name == "EXTRA_ELEMENT": return @@ -104,13 +106,9 @@ def startElement(self, name, attrs): self.handle_metric(metric_name, service_name, attrs) return for idx, metric_def in enumerate(self.metrics): - match_result = metric_def[0].match(metric_name) + match_result = metric_def[0] == metric_name if match_result: - service_name_tmpl = metric_def[1]['service_name'] - if len(match_result.groups()) > 0: - service_name = match_result.expand(service_name_tmpl) - else: - service_name = service_name_tmpl + service_name = metric_def[1]['service_name'] self.metrics_cache[cache_key] = (idx, service_name) self.metric = metric_def[1] self.handle_metric(metric_name, service_name, attrs) @@ -121,7 +119,7 @@ def startElement(self, name, attrs): self.metrics = None self.host_name = attrs['NAME'] self.host_reported = long(attrs['REPORTED']) - if strip_domains: + if self.strip_domains: self.host_name = self.host_name.partition('.')[0] cache_key = (self.cluster_idx, self.host_name) if cache_key in self.hosts_cache: @@ -130,7 +128,7 @@ def startElement(self, name, attrs): self.handle_host(host_name, attrs) return for idx, host_def in enumerate(self.hosts): - if 
host_def[0].match(self.host_name): + if host_def[0] == self.host_name: self.hosts_cache[cache_key] = idx self.host_idx = idx self.metrics = host_def[1] @@ -147,7 +145,7 @@ def startElement(self, name, attrs): self.hosts = self.clusters_c[self.cluster_idx][1] return for idx, cluster_def in enumerate(self.clusters_c): - if cluster_def[0].match(self.cluster_name): + if cluster_def[0] == self.cluster_name: self.clusters_cache[self.cluster_name] = idx self.cluster_idx = idx self.hosts = cluster_def[1] @@ -200,30 +198,18 @@ def handle_metric(self, metric_name, service_name, attrs): # parse command line parser = argparse.ArgumentParser(description='read Ganglia XML and generate Nagios check results file') parser.add_argument('config_file', nargs='?', - help='configuration file', default='/etc/ganglia/nagios-bridge.conf') + help='configuration file', default='/etc/ganglia/sample.conf') args = parser.parse_args() # read the configuration file, setting some defaults first force_dmax = 0 tmax_grace = 60 - execfile(args.config_file) - - # compile the regular expressions - clusters_c = [] - for cluster_def in clusters: - cluster_c = re.compile(cluster_def[0]) - hosts = [] - for host_def in cluster_def[1]: - host_c = re.compile(host_def[0]) - metrics = [] - for metric_def in host_def[1]: - metric_c = re.compile(metric_def[0]) - metrics.append((metric_c, metric_def[1])) - hosts.append((host_c, metrics)) - clusters_c.append((cluster_c, hosts)) + #pasre config file + config_parse = conf_parser.ConfigParser() + config_parse.parse(args.config_file) # connect to the gmetad or gmond - sock = socket.create_connection((gmetad_host, gmetad_port)) + sock = socket.create_connection((config_parse.gmetad_host, config_parse.gmetad_port)) # set up the SAX parser parser = xml.sax.make_parser() pg = PassiveGenerator(force_dmax, tmax_grace) @@ -231,8 +217,8 @@ def handle_metric(self, metric_name, service_name, attrs): gn = nagios_checkresult.GenerateNagiosCheckResult() #Create CheckResultFile try: 
- gn.create(nagios_result_dir, int(time.time())) - parser.setContentHandler(GangliaHandler(clusters_c, pg,gn)) + gn.create(config_parse.nagios_result_dir, int(time.time())) + parser.setContentHandler(GangliaHandler(config_parse.clusters, pg, gn, config_parse.strip_domains)) # run the main program loop parser.parse(SocketInputSource(sock)) @@ -242,7 +228,7 @@ def handle_metric(self, metric_name, service_name, attrs): # all done sock.close() except OSError as e: - print "Failed to create tempfile at", nagios_result_dir + print "Failed to create tempfile at", config_parse.nagios_result_dir except socket.error as e: logging.warn('Failed to connect to gmetad: %s', e.strerror) diff --git a/sample.conf b/sample.conf new file mode 100644 index 0000000..4da8573 --- /dev/null +++ b/sample.conf @@ -0,0 +1,58 @@ +gmetad_host = '127.0.0.1' +gmetad_port = 8649 +# This overrides the DMAX attribute from all metrics in all hosts +# If DMAX > 0 and TN > DMAX, then a metric state is considered +# UNKNOWN and Nagios will potentially send an alert +force_dmax = 0 + +# Every collection group in gmond.conf defines a time_threshold +# This value appears as TMAX in the XML. +# The gmond process should normally send every metric again before +# the value timer TN > TMAX. +# If ganglia-nagios-bridge is polling a gmond collector +# then a very small tmax_grace period (perhaps 5 seconds) is used. +# If ganglia-nagios-bridge is polling a gmetad server then +# tmax_grace should be set higher than the polling interval configured +# in gmetad. +tmax_grace = 30 + +# Ganglia XML typically contains FQDNs for all hosts, as it obtains +# the hostnames using reverse DNS lookups. Nagios, on the other hand, +# is often configured with just the hostname and no domain. Setting +# strip_domains = True will ensure that the domain part is stripped from +# the hostname before passing it to Nagios. 
+strip_domains = True + +# This is the directory where Nagios expects to read checkresults +# submitted in batch +nagios_result_dir = '/var/lib/nagios3/spool/checkresults' + + +# Defining Cluster and hosts to be monitored along with their metrics + +[Production] + [[chandrika-HP-G42-Notebook-PC, virtualhost1]] + [[[proc_total]]] + service_name = Total processes + warn_above = 120 + crit_above = 150 + [[[load]]] + service_name = abc + warn_above = 30 + crit_above = 35 + [[virtualhost2]] + [[[cpu_idle]]] + service_name = CPU IDLE + warn_above = 100 + crit_above = 130 +[Production1] + [[virtualhost3, virtualhost12]] + [[[disk_free]]] + service_name = DISK FREE + warn_below = 80 + crit_below = 65 + [[virtualhost12]] + [[[cpu_speed]]] + service_name = CPU SPEED + warn_below = 2112 + crit_below = 2000 From 3d8526b3663cb8b1510fb1d649cc284d8cdab324 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Mon, 28 Jul 2014 19:07:29 +0530 Subject: [PATCH 25/35] Added cluster defintion config format --- sample.conf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sample.conf b/sample.conf index 4da8573..ba57cee 100644 --- a/sample.conf +++ b/sample.conf @@ -29,6 +29,14 @@ nagios_result_dir = '/var/lib/nagios3/spool/checkresults' # Defining Cluster and hosts to be monitored along with their metrics +# [ClusterName] +# [[Hostname(s) separated by ,]] +# [[[metric name]]] +# service_name = +# warn_above/below = +# crit_above/below = + + [Production] [[chandrika-HP-G42-Notebook-PC, virtualhost1]] From 857f1a220a39d9325df746a4a537682960557ff2 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Tue, 5 Aug 2014 20:34:57 +0530 Subject: [PATCH 26/35] Check for Hosts known by Nagios --- ganglia-nagios-bridge.py | 54 +++++++++++++++++++++++++++++----------- sample.conf | 14 +++-------- 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index e54ff95..146da22 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -28,6 
+28,7 @@ import time import nagios_checkresult import conf_parser +from pynag import Model # wrapper class so that the SAX parser can process data from a network # socket @@ -70,11 +71,24 @@ def process(self, metric_def, metric_value, metric_tn, metric_tmax, metric_dmax) else: service_return_code = 0 return service_return_code - + +# gets the hosts and services Nagios knows about +class NagiosHosts: + def __init__(self): + self.host_service = [] + + def process(self): + all_hosts = Model.Host.objects.all + for host in all_hosts: + service_name = [] + for service in host.get_effective_services(): + service_name.append(service.service_description) + self.host_service.append((host.host_name, service_name)) + # SAX event handler for parsing the Ganglia XML stream class GangliaHandler(xml.sax.ContentHandler): - def __init__(self, clusters_c, value_handler, checkresult_file_handler, strip_domains): + def __init__(self, clusters_c, value_handler, checkresult_file_handler, strip_domains, nagios_hosts): self.clusters_c = clusters_c self.value_handler = value_handler self.checkresult_file_handler = checkresult_file_handler @@ -82,6 +96,7 @@ def __init__(self, clusters_c, value_handler, checkresult_file_handler, strip_do self.hosts_cache = {} self.metrics_cache = {} self.strip_domains = strip_domains + self.host_service = nagios_hosts.host_service def startElement(self, name, attrs): @@ -109,31 +124,38 @@ def startElement(self, name, attrs): match_result = metric_def[0] == metric_name if match_result: service_name = metric_def[1]['service_name'] - self.metrics_cache[cache_key] = (idx, service_name) - self.metric = metric_def[1] - self.handle_metric(metric_name, service_name, attrs) - return + # if service is defined in Nagios for host_name + if service_name in self.nagios_service: + self.metrics_cache[cache_key] = (idx, service_name) + self.metric = metric_def[1] + self.handle_metric(metric_name, service_name, attrs) + return # handle a HOST element in the XML if name == 
"HOST" and self.hosts is not None: self.metrics = None self.host_name = attrs['NAME'] self.host_reported = long(attrs['REPORTED']) + self.nagios_service = None if self.strip_domains: self.host_name = self.host_name.partition('.')[0] cache_key = (self.cluster_idx, self.host_name) if cache_key in self.hosts_cache: - self.host_idx = self.hosts_cache[cache_key] + self.host_ix = self.hosts_cache[cache_key] self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1] self.handle_host(host_name, attrs) return for idx, host_def in enumerate(self.hosts): if host_def[0] == self.host_name: - self.hosts_cache[cache_key] = idx - self.host_idx = idx - self.metrics = host_def[1] - self.handle_host(self.host_name, attrs) - return + for host in self.host_service: + if host[0] == self.host_name: + self.hosts_cache[cache_key] = idx + self.host_idx = idx + self.metrics = host_def[1] + self.handle_host(self.host_name, attrs) + # get the services defined for the host in Nagios + self.nagios_service = host[1] + return # handle a CLUSTER element in the XML if name == "CLUSTER": @@ -151,7 +173,7 @@ def startElement(self, name, attrs): self.hosts = cluster_def[1] return - # checks the state of host by comaring tmax and tn for the host + # checks the state of host by comparing tmax and tn for the host def handle_host(self, host_name, attrs): host_tn = int(attrs['TN']) host_tmax = int(attrs['TMAX']) @@ -208,6 +230,10 @@ def handle_metric(self, metric_name, service_name, attrs): config_parse = conf_parser.ConfigParser() config_parse.parse(args.config_file) + #get hosts and associated services known to Nagios to prevent generating checkresult for hosts not known to Nagios + nagios_hosts = NagiosHosts() + nagios_hosts.process() + # connect to the gmetad or gmond sock = socket.create_connection((config_parse.gmetad_host, config_parse.gmetad_port)) # set up the SAX parser @@ -218,7 +244,7 @@ def handle_metric(self, metric_name, service_name, attrs): #Create CheckResultFile try: 
gn.create(config_parse.nagios_result_dir, int(time.time())) - parser.setContentHandler(GangliaHandler(config_parse.clusters, pg, gn, config_parse.strip_domains)) + parser.setContentHandler(GangliaHandler(config_parse.clusters, pg, gn, config_parse.strip_domains, nagios_hosts)) # run the main program loop parser.parse(SocketInputSource(sock)) diff --git a/sample.conf b/sample.conf index ba57cee..2c173c8 100644 --- a/sample.conf +++ b/sample.conf @@ -29,23 +29,15 @@ nagios_result_dir = '/var/lib/nagios3/spool/checkresults' # Defining Cluster and hosts to be monitored along with their metrics -# [ClusterName] -# [[Hostname(s) separated by ,]] -# [[[metric name]]] -# service_name = -# warn_above/below = -# crit_above/below = - - [Production] [[chandrika-HP-G42-Notebook-PC, virtualhost1]] [[[proc_total]]] - service_name = Total processes + service_name = Total Processes warn_above = 120 crit_above = 150 - [[[load]]] - service_name = abc + [[[load_one]]] + service_name = Current Load warn_above = 30 crit_above = 35 [[virtualhost2]] From 83f5ad9d071e7c8e367d5ac76197ff26b8532184 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Tue, 5 Aug 2014 20:39:31 +0530 Subject: [PATCH 27/35] Config format --- sample.conf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sample.conf b/sample.conf index 2c173c8..2648e7e 100644 --- a/sample.conf +++ b/sample.conf @@ -29,6 +29,12 @@ nagios_result_dir = '/var/lib/nagios3/spool/checkresults' # Defining Cluster and hosts to be monitored along with their metrics +# [ClusterName] +# [[Hostname(s) separated by ,]] +# [[[metric name]]] +# service_name = +# warn_above/below = +# crit_above/below = [Production] [[chandrika-HP-G42-Notebook-PC, virtualhost1]] From c6b48ce5dddb44c4e80b1a7394f0c028fb169b11 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Tue, 5 Aug 2014 20:45:01 +0530 Subject: [PATCH 28/35] Fixed indentation --- ganglia-nagios-bridge.py | 144 +++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 73 deletions(-) diff 
--git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index 146da22..dbf9f65 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -35,10 +35,10 @@ class SocketInputSource: def __init__(self, socket): self.socket = socket - + def getByteStream(self): return self - + def read(self, buf_size): return self.socket.recv(buf_size) @@ -48,7 +48,7 @@ class PassiveGenerator: def __init__(self, force_dmax, tmax_grace): self.force_dmax = force_dmax self.tmax_grace = tmax_grace - + def process(self, metric_def, metric_value, metric_tn, metric_tmax, metric_dmax): effective_dmax = metric_dmax if(self.force_dmax > 0): @@ -63,27 +63,27 @@ def process(self, metric_def, metric_value, metric_tn, metric_tmax, metric_dmax) elif metric_def['crit_below'] is not None and metric_value < float(metric_def['crit_below']): service_return_code = 2 elif metric_def['warn_below'] is not None and metric_value < float(metric_def['warn_below']): - service_return_code = 1 + service_return_code = 1 elif metric_def['crit_above'] is not None and metric_value > float(metric_def['crit_above']): service_return_code = 2 elif metric_def['warn_above'] is not None and metric_value > float(metric_def['warn_above']): service_return_code = 1 else: service_return_code = 0 - return service_return_code - + return service_return_code + # gets the hosts and services Nagios knows about -class NagiosHosts: +class NagiosHosts: def __init__(self): - self.host_service = [] - + self.host_service = [] + def process(self): - all_hosts = Model.Host.objects.all - for host in all_hosts: - service_name = [] - for service in host.get_effective_services(): - service_name.append(service.service_description) - self.host_service.append((host.host_name, service_name)) + all_hosts = Model.Host.objects.all + for host in all_hosts: + service_name = [] + for service in host.get_effective_services(): + service_name.append(service.service_description) + self.host_service.append((host.host_name, service_name)) # SAX 
event handler for parsing the Ganglia XML stream @@ -91,18 +91,18 @@ class GangliaHandler(xml.sax.ContentHandler): def __init__(self, clusters_c, value_handler, checkresult_file_handler, strip_domains, nagios_hosts): self.clusters_c = clusters_c self.value_handler = value_handler - self.checkresult_file_handler = checkresult_file_handler + self.checkresult_file_handler = checkresult_file_handler self.clusters_cache = {} self.hosts_cache = {} self.metrics_cache = {} - self.strip_domains = strip_domains - self.host_service = nagios_hosts.host_service + self.strip_domains = strip_domains + self.host_service = nagios_hosts.host_service def startElement(self, name, attrs): # METRIC is the most common element, it is handled first, # followed by HOST and CLUSTER - + # handle common elements that we ignore if name == "EXTRA_ELEMENT": return @@ -124,38 +124,38 @@ def startElement(self, name, attrs): match_result = metric_def[0] == metric_name if match_result: service_name = metric_def[1]['service_name'] - # if service is defined in Nagios for host_name - if service_name in self.nagios_service: - self.metrics_cache[cache_key] = (idx, service_name) - self.metric = metric_def[1] - self.handle_metric(metric_name, service_name, attrs) - return + # if service is defined in Nagios for host_name + if service_name in self.nagios_service: + self.metrics_cache[cache_key] = (idx, service_name) + self.metric = metric_def[1] + self.handle_metric(metric_name, service_name, attrs) + return # handle a HOST element in the XML if name == "HOST" and self.hosts is not None: self.metrics = None self.host_name = attrs['NAME'] self.host_reported = long(attrs['REPORTED']) - self.nagios_service = None + self.nagios_service = None if self.strip_domains: self.host_name = self.host_name.partition('.')[0] cache_key = (self.cluster_idx, self.host_name) if cache_key in self.hosts_cache: self.host_ix = self.hosts_cache[cache_key] self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1] - 
self.handle_host(host_name, attrs) + self.handle_host(host_name, attrs) return for idx, host_def in enumerate(self.hosts): if host_def[0] == self.host_name: - for host in self.host_service: - if host[0] == self.host_name: - self.hosts_cache[cache_key] = idx - self.host_idx = idx - self.metrics = host_def[1] - self.handle_host(self.host_name, attrs) - # get the services defined for the host in Nagios - self.nagios_service = host[1] - return + for host in self.host_service: + if host[0] == self.host_name: + self.hosts_cache[cache_key] = idx + self.host_idx = idx + self.metrics = host_def[1] + self.handle_host(self.host_name, attrs) + # get the services defined for the host in Nagios + self.nagios_service = host[1] + return # handle a CLUSTER element in the XML if name == "CLUSTER": @@ -176,16 +176,16 @@ def startElement(self, name, attrs): # checks the state of host by comparing tmax and tn for the host def handle_host(self, host_name, attrs): host_tn = int(attrs['TN']) - host_tmax = int(attrs['TMAX']) - last_seen = self.cluster_localtime - host_tn - if host_tn > host_tmax*4 : - host_return_code = 1 #host down - else: - host_return_code = 0 #host up - host_last_seen = str(last_seen) + '.0' - - # write host checks to Nagios checkresult file - self.checkresult_file_handler.build_host(time.asctime(), self.host_name, 0, 0, 1, 1, 0.1, host_last_seen, host_last_seen, 0, 1, host_return_code,"") + host_tmax = int(attrs['TMAX']) + last_seen = self.cluster_localtime - host_tn + if host_tn > host_tmax*4 : + host_return_code = 1 #host down + else: + host_return_code = 0 #host up + host_last_seen = str(last_seen) + '.0' + + # write host checks to Nagios checkresult file + self.checkresult_file_handler.build_host(time.asctime(), self.host_name, 0, 0, 1, 1, 0.1, host_last_seen, host_last_seen, 0, 1, host_return_code,"") def handle_metric(self, metric_name, service_name, attrs): # extract the metric attributes @@ -194,7 +194,7 @@ def handle_metric(self, metric_name, service_name, 
attrs): metric_tmax = int(attrs['TMAX']) metric_dmax = int(attrs['DMAX']) metric_type = attrs['TYPE'] - metric_units = attrs['UNITS'] + metric_units = attrs['UNITS'] # the metric_value has a dynamic type: if metric_type == 'string': metric_value = metric_value_raw @@ -203,17 +203,17 @@ def handle_metric(self, metric_name, service_name, attrs): else: metric_value = int(metric_value_raw) last_seen = self.cluster_localtime - metric_tn - service_last_seen = str(last_seen) + '.0' - - #setting service return code as 0 by default - service_return_code=0 + service_last_seen = str(last_seen) + '.0' + + #setting service return code as 0 by default + service_return_code=0 # call the handler to process the value and return service state after comparing metric value and threshold: - service_return_code = self.value_handler.process(self.metric, metric_value, metric_tn, metric_tmax, metric_dmax) - # write Passive service checks to checkresult file - self.checkresult_file_handler.build_service(time.asctime(), self.host_name, service_name, 0, 0, 1, 1, 0.1, service_last_seen, service_last_seen, 0, 1, service_return_code, metric_value, metric_units,"") + service_return_code = self.value_handler.process(self.metric, metric_value, metric_tn, metric_tmax, metric_dmax) + # write Passive service checks to checkresult file + self.checkresult_file_handler.build_service(time.asctime(), self.host_name, service_name, 0, 0, 1, 1, 0.1, service_last_seen, service_last_seen, 0, 1, service_return_code, metric_value, metric_units,"") + + - - # main program code if __name__ == '__main__': try: @@ -226,37 +226,35 @@ def handle_metric(self, metric_name, service_name, attrs): # read the configuration file, setting some defaults first force_dmax = 0 tmax_grace = 60 - #pasre config file - config_parse = conf_parser.ConfigParser() - config_parse.parse(args.config_file) + #pasre config file + config_parse = conf_parser.ConfigParser() + config_parse.parse(args.config_file) - #get hosts and associated 
services known to Nagios to prevent generating checkresult for hosts not known to Nagios - nagios_hosts = NagiosHosts() - nagios_hosts.process() + #get hosts and associated services known to Nagios to prevent generating checkresult for hosts not known to Nagios + nagios_hosts = NagiosHosts() + nagios_hosts.process() # connect to the gmetad or gmond sock = socket.create_connection((config_parse.gmetad_host, config_parse.gmetad_port)) # set up the SAX parser parser = xml.sax.make_parser() pg = PassiveGenerator(force_dmax, tmax_grace) - #Instantiate GenerateNagiosCheckResult class - gn = nagios_checkresult.GenerateNagiosCheckResult() - #Create CheckResultFile - try: - gn.create(config_parse.nagios_result_dir, int(time.time())) + #Instantiate GenerateNagiosCheckResult class + gn = nagios_checkresult.GenerateNagiosCheckResult() + #Create CheckResultFile + try: + gn.create(config_parse.nagios_result_dir, int(time.time())) parser.setContentHandler(GangliaHandler(config_parse.clusters, pg, gn, config_parse.strip_domains, nagios_hosts)) # run the main program loop parser.parse(SocketInputSource(sock)) - + # write out for Nagios gn.submit() # all done sock.close() - except OSError as e: - print "Failed to create tempfile at", config_parse.nagios_result_dir - + except OSError as e: + print "Failed to create tempfile at", config_parse.nagios_result_dir + except socket.error as e: logging.warn('Failed to connect to gmetad: %s', e.strerror) - - From cffd0d68871fbd0b4f66c23315810416b47e3819 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Thu, 7 Aug 2014 01:42:04 +0530 Subject: [PATCH 29/35] Fixed bugs in the parser for adding multiple metrics for same host --- conf_parser.py | 81 +++++++++++++++++++--------------------- ganglia-nagios-bridge.py | 8 +++- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/conf_parser.py b/conf_parser.py index 5daef96..ab9eecd 100644 --- a/conf_parser.py +++ b/conf_parser.py @@ -4,46 +4,41 @@ class ConfigParser: - def __init__(self): - 
self.clusters =[] - - def parse (self,config_file): - try: - config = ConfigObj(config_file) - self.gmetad_host = config.pop('gmetad_host') - self.gmetad_port = config.pop('gmetad_port') - self.force_dmax = config.pop('force_dmax') - self.tmax_grace = config.pop('tmax_grace') - self.strip_domains = config.pop('strip_domains') - self.nagios_result_dir = config.pop('nagios_result_dir') - for cluster_name in config.keys(): - hosts = [] - for host_name in config[cluster_name].keys(): - host = [] - for x in host_name.split(','): - host.append(x) - metrics = [] - for metric_name in config[cluster_name][host_name].keys(): - metric_def = {} - metric_def['service_name'] = config[cluster_name][host_name][metric_name]['service_name'] - if 'crit_above' in config[cluster_name][host_name][metric_name].keys(): - metric_def['crit_above'] = config[cluster_name][host_name][metric_name]['crit_above'] - metric_def['crit_below'] = None - if 'crit_below' in config[cluster_name][host_name][metric_name].keys(): - metric_def['crit_below'] = config[cluster_name][host_name][metric_name]['crit_below'] - metric_def['crit_above'] = None - if 'warn_above' in config[cluster_name][host_name][metric_name].keys(): - metric_def['warn_above'] = config[cluster_name][host_name][metric_name]['warn_above'] - metric_def['warn_below'] = None - if 'warn_below' in config[cluster_name][host_name][metric_name].keys(): - metric_def['warn_below'] = config[cluster_name][host_name][metric_name]['warn_below'] - metric_def['warn_above'] = None - metrics.append((metric_name,metric_def)) - for hostn in host: - hosts.append((hostn,metrics)) - self.clusters.append((cluster_name,hosts)) - - except (ConfigObjError, IOError), e: - print 'Could not read %s' % (e) - - + def __init__(self): + self.clusters =[] + + def parse (self,config_file): + try: + config = ConfigObj(config_file) + self.gmetad_host = config.pop('gmetad_host') + self.gmetad_port = config.pop('gmetad_port') + self.force_dmax = config.pop('force_dmax') + 
self.tmax_grace = config.pop('tmax_grace') + self.strip_domains = config.pop('strip_domains') + self.nagios_result_dir = config.pop('nagios_result_dir') + for cluster_name in config.keys(): + cluster_hosts = {} + for host_name in config[cluster_name].keys(): + metrics = [] + for metric_name in config[cluster_name][host_name].keys(): + metric_def = {} + metric_def['service_name'] = config[cluster_name][host_name][metric_name]['service_name'] + if 'crit_above' in config[cluster_name][host_name][metric_name].keys(): + metric_def['crit_above'] = config[cluster_name][host_name][metric_name]['crit_above'] + metric_def['crit_below'] = None + if 'crit_below' in config[cluster_name][host_name][metric_name].keys(): + metric_def['crit_below'] = config[cluster_name][host_name][metric_name]['crit_below'] + metric_def['crit_above'] = None + if 'warn_above' in config[cluster_name][host_name][metric_name].keys(): + metric_def['warn_above'] = config[cluster_name][host_name][metric_name]['warn_above'] + metric_def['warn_below'] = None + if 'warn_below' in config[cluster_name][host_name][metric_name].keys(): + metric_def['warn_below'] = config[cluster_name][host_name][metric_name]['warn_below'] + metric_def['warn_above'] = None + metrics.append((metric_name,metric_def)) + for host in host_name.split(','): + cluster_hosts.setdefault(host, []).append(metrics) + self.clusters.append((cluster_name,cluster_hosts)) + + except (ConfigObjError, IOError), e: + print 'Could not read %s' % (e) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index dbf9f65..f8861cf 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -151,7 +151,9 @@ def startElement(self, name, attrs): if host[0] == self.host_name: self.hosts_cache[cache_key] = idx self.host_idx = idx - self.metrics = host_def[1] + self.metrics = [] + for metric_tuple in host_def[1]: + self.metrics += metric_tuple self.handle_host(self.host_name, attrs) # get the services defined for the host in Nagios 
self.nagios_service = host[1] @@ -170,7 +172,9 @@ def startElement(self, name, attrs): if cluster_def[0] == self.cluster_name: self.clusters_cache[self.cluster_name] = idx self.cluster_idx = idx - self.hosts = cluster_def[1] + self.hosts = [] + for host_name in cluster_def[1]: + self.hosts.append((host_name, cluster_def[1][host_name])) return # checks the state of host by comparing tmax and tn for the host From 9e13e8b9c6b46e6edadbfb3cc17a1b78757ebf92 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Thu, 7 Aug 2014 23:21:47 +0530 Subject: [PATCH 30/35] Fixed error when no host matches --- ganglia-nagios-bridge.py | 51 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index f8861cf..1c4d34c 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -132,32 +132,33 @@ def startElement(self, name, attrs): return # handle a HOST element in the XML - if name == "HOST" and self.hosts is not None: + if name == "HOST": self.metrics = None - self.host_name = attrs['NAME'] - self.host_reported = long(attrs['REPORTED']) - self.nagios_service = None - if self.strip_domains: - self.host_name = self.host_name.partition('.')[0] - cache_key = (self.cluster_idx, self.host_name) - if cache_key in self.hosts_cache: - self.host_ix = self.hosts_cache[cache_key] - self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1] - self.handle_host(host_name, attrs) - return - for idx, host_def in enumerate(self.hosts): - if host_def[0] == self.host_name: - for host in self.host_service: - if host[0] == self.host_name: - self.hosts_cache[cache_key] = idx - self.host_idx = idx - self.metrics = [] - for metric_tuple in host_def[1]: - self.metrics += metric_tuple - self.handle_host(self.host_name, attrs) - # get the services defined for the host in Nagios - self.nagios_service = host[1] - return + if self.hosts is not None: + self.host_name = attrs['NAME'] + 
self.host_reported = long(attrs['REPORTED']) + self.nagios_service = None + if self.strip_domains: + self.host_name = self.host_name.partition('.')[0] + cache_key = (self.cluster_idx, self.host_name) + if cache_key in self.hosts_cache: + self.host_ix = self.hosts_cache[cache_key] + self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1] + self.handle_host(host_name, attrs) + return + for idx, host_def in enumerate(self.hosts): + if host_def[0] == self.host_name: + for host in self.host_service: + if host[0] == self.host_name: + self.hosts_cache[cache_key] = idx + self.host_idx = idx + self.metrics = [] + for metric_tuple in host_def[1]: + self.metrics += metric_tuple + self.handle_host(self.host_name, attrs) + # get the services defined for the host in Nagios + self.nagios_service = host[1] + return # handle a CLUSTER element in the XML if name == "CLUSTER": From 1d6515fa56951733ec3c19885d95b7e9789c0e5d Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Sat, 9 Aug 2014 23:06:40 +0530 Subject: [PATCH 31/35] Added comments --- conf_parser.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conf_parser.py b/conf_parser.py index ab9eecd..87871c9 100644 --- a/conf_parser.py +++ b/conf_parser.py @@ -10,16 +10,22 @@ def __init__(self): def parse (self,config_file): try: config = ConfigObj(config_file) + + #get gmetad host information and nagios checkresult directory self.gmetad_host = config.pop('gmetad_host') self.gmetad_port = config.pop('gmetad_port') self.force_dmax = config.pop('force_dmax') self.tmax_grace = config.pop('tmax_grace') self.strip_domains = config.pop('strip_domains') self.nagios_result_dir = config.pop('nagios_result_dir') + + for cluster_name in config.keys(): cluster_hosts = {} + #get hosts in the cluster for host_name in config[cluster_name].keys(): metrics = [] + #collect metric for each host in the cluster for metric_name in config[cluster_name][host_name].keys(): metric_def = {} metric_def['service_name'] = 
config[cluster_name][host_name][metric_name]['service_name'] From 7a759ed8920c731e5a222667d45c90d270e6f166 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Thu, 14 Aug 2014 19:01:57 +0530 Subject: [PATCH 32/35] Remove leading space while collecting hostnames from conf --- conf_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf_parser.py b/conf_parser.py index 87871c9..cb1c055 100644 --- a/conf_parser.py +++ b/conf_parser.py @@ -43,7 +43,7 @@ def parse (self,config_file): metric_def['warn_above'] = None metrics.append((metric_name,metric_def)) for host in host_name.split(','): - cluster_hosts.setdefault(host, []).append(metrics) + cluster_hosts.setdefault(host.lstrip(), []).append(metrics) self.clusters.append((cluster_name,cluster_hosts)) except (ConfigObjError, IOError), e: From d12788a76504475870dbc8bf6554c4058b2ddff9 Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Sat, 16 Aug 2014 01:45:23 +0530 Subject: [PATCH 33/35] added default configuration file --- ganglia-nagios-bridge.conf | 87 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100755 ganglia-nagios-bridge.conf diff --git a/ganglia-nagios-bridge.conf b/ganglia-nagios-bridge.conf new file mode 100755 index 0000000..e08b137 --- /dev/null +++ b/ganglia-nagios-bridge.conf @@ -0,0 +1,87 @@ +gmetad_host = '127.0.0.1' +gmetad_port = 8649 +# This overrides the DMAX attribute from all metrics in all hosts +# If DMAX > 0 and TN > DMAX, then a metric state is considered +# UNKNOWN and Nagios will potentially send an alert +force_dmax = 0 + +# Every collection group in gmond.conf defines a time_threshold +# This value appears as TMAX in the XML. +# The gmond process should normally send every metric again before +# the value timer TN > TMAX. +# If ganglia-nagios-bridge is polling a gmond collector +# then a very small tmax_grace period (perhaps 5 seconds) is used. 
+# If ganglia-nagios-bridge is polling a gmetad server then +# tmax_grace should be set higher than the polling interval configured +# in gmetad. +tmax_grace = 30 + +# Ganglia XML typically contains FQDNs for all hosts, as it obtains +# the hostnames using reverse DNS lookups. Nagios, on the other hand, +# is often configured with just the hostname and no domain. Setting +# strip_domains = True will ensure that the domain part is stripped from +# the hostname before passing it to Nagios. +strip_domains = True + +# This is the directory where Nagios expects to read checkresults +# submitted in batch +nagios_result_dir = '/var/lib/nagios3/spool/checkresults' + +# This is where we select the metrics that we want to map from +# Ganglia to Nagios service names +# Any metric not matched in the configuration will be ignored and +# not passed to Nagios. +# Definition for multiple clusters and their hosts to be monitored +# along with their metrics is added in a nested format +# +# Format overview : +# cluster definition specifying the clustername and hostnames and +# associated metrics to be monitored +# +# can add multiple cluster names +# [cluster_name] +# Add comma separated host name(s) and the common metrics to be monitored +# [[hostname(s) separated by ,]] +# Metric name of the metric to be monitored for the hostnames +# [[[metric name]]] +# metric attributes: corresponding service name and threshold values +# service_name = +# warn_above/below = +# crit_above/below = + +# Sample configuration + +[cluster_name] + [[host_01, host_02]] + [[[proc_total]]] + service_name = Total Processes + warn_above = 180 + crit_above = 200 + [[[load_one]]] + service_name = Current Load + warn_above = 0.1 + crit_above = 0.3 + [[host_02]] + [[[cpu_idle]]] + service_name = CPU IDLE + warn_above = 85 + crit_above = 90 + [[[disk_free]]] + service_name = DISK FREE + warn_below = 5 + crit_below = 2 + [[[cpu_speed]]] + service_name = CPU SPEED + warn_below = 2112 + crit_below = 2000 
+[Production1] + [[host3, host12]] + [[[disk_free]]] + service_name = DISK FREE + warn_below = 10 + crit_below = 5 + [[host1]] + [[[cpu_speed]]] + service_name = CPU SPEED + warn_below = 2000 + crit_below = 1890 From 38195a67d3399ef3dc4d4199399acb0a038b811d Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Sat, 16 Aug 2014 01:47:28 +0530 Subject: [PATCH 34/35] Removed sample.conf --- sample.conf | 64 ----------------------------------------------------- 1 file changed, 64 deletions(-) delete mode 100644 sample.conf diff --git a/sample.conf b/sample.conf deleted file mode 100644 index 2648e7e..0000000 --- a/sample.conf +++ /dev/null @@ -1,64 +0,0 @@ -gmetad_host = '127.0.0.1' -gmetad_port = 8649 -# This overrides the DMAX attribute from all metrics in all hosts -# If DMAX > 0 and TN > DMAX, then a metric state is considered -# UNKNOWN and Nagios will potentially send an alert -force_dmax = 0 - -# Every collection group in gmond.conf defines a time_threshold -# This value appears as TMAX in the XML. -# The gmond process should normally send every metric again before -# the value timer TN > TMAX. -# If ganglia-nagios-bridge is polling a gmond collector -# then a very small tmax_grace period (perhaps 5 seconds) is used. -# If ganglia-nagios-bridge is polling a gmetad server then -# tmax_grace should be set higher than the polling interval configured -# in gmetad. -tmax_grace = 30 - -# Ganglia XML typically contains FQDNs for all hosts, as it obtains -# the hostnames using reverse DNS lookups. Nagios, on the other hand, -# is often configured with just the hostname and no domain. Setting -# strip_domains = True will ensure that the domain part is stripped from -# the hostname before passing it to Nagios. 
-strip_domains = True - -# This is the directory where Nagios expects to read checkresults -# submitted in batch -nagios_result_dir = '/var/lib/nagios3/spool/checkresults' - - -# Defining Cluster and hosts to be monitored along with their metrics -# [ClusterName] -# [[Hostname(s) separated by ,]] -# [[[metric name]]] -# service_name = -# warn_above/below = -# crit_above/below = - -[Production] - [[chandrika-HP-G42-Notebook-PC, virtualhost1]] - [[[proc_total]]] - service_name = Total Processes - warn_above = 120 - crit_above = 150 - [[[load_one]]] - service_name = Current Load - warn_above = 30 - crit_above = 35 - [[virtualhost2]] - [[[cpu_idle]]] - service_name = CPU IDLE - warn_above = 100 - crit_above = 130 -[Production1] - [[virtualhost3, virtualhost12]] - [[[disk_free]]] - service_name = DISK FREE - warn_below = 80 - crit_below = 65 - [[virtualhost12]] - [[[cpu_speed]]] - service_name = CPU SPEED - warn_below = 2112 - crit_below = 2000 From 6f5162119b6908e8a765af1bb1e9860a2a1ab56c Mon Sep 17 00:00:00 2001 From: MirrorZ Date: Sat, 16 Aug 2014 01:49:12 +0530 Subject: [PATCH 35/35] Added default path for conf file --- ganglia-nagios-bridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py index 1c4d34c..36f335f 100644 --- a/ganglia-nagios-bridge.py +++ b/ganglia-nagios-bridge.py @@ -225,7 +225,7 @@ def handle_metric(self, metric_name, service_name, attrs): # parse command line parser = argparse.ArgumentParser(description='read Ganglia XML and generate Nagios check results file') parser.add_argument('config_file', nargs='?', - help='configuration file', default='/etc/ganglia/sample.conf') + help='configuration file', default='/etc/ganglia/ganglia-nagios-bridge.conf') args = parser.parse_args() # read the configuration file, setting some defaults first