User Tools

Site Tools


progetti:cloud-areapd:ced-c:nfs_cluster_monitoring

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
progetti:cloud-areapd:ced-c:nfs_cluster_monitoring [2015/07/20 15:29] – [Allow nrpe to run the check as root] mazzon@infn.itprogetti:cloud-areapd:ced-c:nfs_cluster_monitoring [2015/10/14 13:29] (current) – [Reload nagios] mazzon@infn.it
Line 1: Line 1:
 +====== Monitoring the NFS cluster with Nagios ======
  
 +The NFS service provided by the two-node cluster is an "active/passive" one. In this case the normal behaviour is:
 +
 +  * one node is actually running the nfsd daemon
 +  * the other node is in standby
 +  * takeover of the service is handled by the cluster daemons 
 +
 +Therefore we decided to monitor the situation by:
 +
 +  * checking the **cluster** daemons are running on each node
 +  * checking the ''nfsclusterserver'' service is running on the cluster
 +  * if the service is running on the local host, checking the detailed status of the **nfs** daemons
 +  * if the local host is a standby node, but the cluster is OK and nfs is running on another node, returning OK
 +
 +===== Install needed packages =====
 +
 +On all the monitored nodes:
 +
 +<code bash>
 +# yum -y install nrpe nagios-plugins-perl perl-Nagios-Plugin
 +</code>
 +
 +Obtain the latest version of the monitoring scripts from [[https://exchange.nagios.org/directory/Plugins/Operating-Systems/Linux/check_nfs4/details|here]]
 +and [[https://exchange.nagios.org/directory/Plugins/Clustering-and-High-2DAvailability/Check-CRM/details|here]] and copy them to the relevant directory:
 +
 +<code>
 +# cp check_nfs4.0.2.pl /usr/lib64/nagios/plugins/check_nfs4
 +# cp check_crm_v0_7 /usr/lib64/nagios/plugins/check_crm
 +# chmod +rx /usr/lib64/nagios/plugins/check_nfs4
 +# chmod +rx /usr/lib64/nagios/plugins/check_crm
 +</code>
 +
 +Since all nodes on the cluster share the same domain and users we do not use the idmapd daemon. Its absence is therefore not critical:
 +
 +<code bash>
 +sed -i 's/^if (!$idmapd_d) { $daelist/# if (!$idmapd_d) { $daelist/' /usr/lib64/nagios/plugins/check_nfs4
 +</code>
 +
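 +===== Create a helper script =====
 +
 +To implement the nagios check as designed we use a helper script that checks whether the nfs daemon is running on the tested host or not. 
 +In the former case the result of the check is handed over to the ''check_nfs4'' script. The helper must be installed as ''/usr/lib64/nagios/plugins/check_my_nfs'' (the path referenced by the nrpe and sudoers configuration below) and be executable: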
 +
 +<code bash check_my_nfs>
 +#!/bin/bash
 +
 +monitor="/usr/sbin/crm_mon -1"
 +
 +# check cluster is healthy
 +${monitor} -s 1>/dev/null
 +if [ "$?" != "0" ];
 +then
 +   echo "Cluster is not OK!"
 +   exit 2
 +else
 +   #
 +   # check if there is at least one nfs server active
 +   #
 +   ${monitor} | grep nfsclusterserver | grep -i started 1>/dev/null
 +   if [ "$?" != "0" ];
 +   then
 +      echo "NFS server is not running anywhere!"
 +      exit 2
 +   else
 +      hname=$(hostname -s)
 +      ${monitor} | grep $hname | grep nfsclusterserver 1>/dev/null
 +      if [ "$?" = "0" ];
 +      then
 +         #
 +         # I am the nfs server: check if I'm healthy
 +         #
 +         exec /usr/lib64/nagios/plugins/check_nfs4
 +      else
 +         #
 +         # I am not the nfs server but:
 +         # - the cluster is ok
 +         # - the service is running
 +         #
 +         echo "NFS is running somewhere..."
 +         exit 0
 +      fi
 +   fi
 +fi
 +</code>
 +===== Setup nrpe on monitored hosts =====
 +
 +==== nrpe directives ====
 +
 +On all the hosts composing the cluster create the file ''/etc/nrpe.d/check_nfs4.cfg'' containing the following directives:
 +
 +<code>
 +# Allow requests from cld-nagios by adding the cld-nagios IP to the list of allowed hosts
 +allowed_hosts=127.0.0.1,192.168.60.32
 +
 +# Define the check_crm command:
 +command[check_crm]=/usr/lib64/nagios/plugins/check_crm
 +
 +# Define the check_nfs4 command:
 +# On CentOS the file '/var/log/messages' is readable only 
 +# by root so we run this check through 'sudo'
 +command[check_nfs4]=sudo /usr/lib64/nagios/plugins/check_my_nfs  </code>
 +
 +
 +==== Allow nrpe to run the checks as root ====
 +
 +  * Create the file ''/etc/sudoers.d/nrpe'' containing
 +<code>
 +Defaults:nrpe !requiretty
 +
 +nrpe ALL = (root) NOPASSWD: /usr/sbin/crm_mon
 +nrpe ALL = (root) NOPASSWD: /usr/lib64/nagios/plugins/check_my_nfs
 +nrpe ALL = (root) NOPASSWD: /usr/lib64/nagios/plugins/check_nfs4 -v
 +</code>
 +
 +  * Give the file the correct permissions
 +<code bash>
 +chmod 440 /etc/sudoers.d/nrpe </code>
 +
 +
 +==== Open firewall port 5666 ====
 +
 +<code bash>
 +firewall-cmd --add-port=5666/tcp
 +firewall-cmd --permanent --add-port=5666/tcp
 +</code>
 +
 +==== Start and enable the nrpe daemon ====
 +
 +<code bash>
 +systemctl start nrpe
 +systemctl enable nrpe
 +</code>
 +===== Define needed commands on cld-nagios =====
 +
 +  * Make sure nrpe is installed on the nagios server
 +<code># rpm -qa | grep nrpe
 +nrpe-2.15-2.el6.x86_64
 +nagios-plugins-nrpe-2.15-2.el6.x86_64</code>
 +
 +  * Make sure a command to exec checks using nrpe is defined (check the ''commands.cfg'' file)
 +<code>
 +define command{
 +    command_name            check_nrpe_cedc
 +    command_line            $USER1$/check_nrpe  -H $HOSTADDRESS$ -t 480 -c $ARG1$
 +}
 +</code>
 +
 +  * Create the new command that execs check_nfs4 on the monitored host
 +<code>
 +define command{
 +        command_name        check_nfs4
 +        contact_groups      cedc-admins
 +        command_line        $USER1$/check_nrpe_cedc -H $HOSTADDRESS$ -c check_nfs4
 +}
 +</code>
 +
 +  * Add it to the list of the scheduled checks for every node in the cluster
 +<code>
 +define service{
 +        use                             server-service         ; Name of service template to use
 +        contact_groups                  cedc-admins
 +        host_name                       cld-blu-01
 +        service_description             NFSv4 Status
 +        check_command                   check_nrpe_cedc!check_nfs4
 +        }  
 +</code>
 +
 +  * Create the new command that execs check_crm on the monitored host
 +<code>
 +define command{
 +        command_name        check_crm
 +        contact_groups      cedc-admins
 +        command_line        $USER1$/check_nrpe_cedc -H $HOSTADDRESS$ -c check_crm
 +}
 +</code>
 +
 +  * Add it to the list of the scheduled checks for every node in the cluster
 +<code>
 +define service{
 +        use                             server-service         ; Name of service template to use
 +        contact_groups                  cedc-admins
 +        host_name                       cld-blu-01
 +        service_description             CFS Cluster Status
 +        check_command                   check_nrpe_cedc!check_crm
 +        }  
 +</code>
 +===== Reload nagios =====
 +<code>/etc/init.d/nagios reload</code>

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki