#!/usr/bin/perl
# -----------------------------------------------------------------------------
#  check_nodes_status : Nagios plugin for Stratus Node status check
# -----------------------------------------------------------------------------
#  Created by: Andriy Fomenko
#  Authors: Alex Titov, Alex Tsibulnik, Andriy Fomenko
#  QA by:
#  Copyright: videoNEXT Federal, Inc., 2015
# -----------------------------------------------------------------------------

use strict;
use warnings;

use Nagios::Plugin;
use Nagios::Plugin::Getopt;

# Build the plugin object and let it parse the command line.
my $np = Nagios::Plugin->new( usage => "Usage: %s [ -v|--verbose ]" );
$np->getopts;

# APL must be present in the environment; without it report UNKNOWN and stop.
my $APL = $ENV{APL};
$np->nagios_exit( UNKNOWN, 'APL environment variable is not defined' ) unless $APL;

use SKM::DB;
use SKM::Common "ArrayTable";
use Master::Conf;
use Node::Conf ":all";

# Only the master evaluates the node list; on any other node report OK and stop.
$np->nagios_exit( OK, 'skipped on slave node' ) if ! am_I_master();

my $LOADAVG_ALERT  = 4;   # load average value at/above which a node is flagged
my $nlist = NodeList; # preload Node list
my @nodes = ();
my $UNI=UNI;
my ($system_status, $notes) = ('OK', '');

# Fetch the own node entry with a single-level lookup. The chained form
# $nlist->{$UNI}->{VERID} would autovivify an empty entry when the own UNI
# is missing, and that phantom entry would then be walked as a real node.
my $own_node = ($nlist && defined $UNI) ? $nlist->{$UNI} : undef;
$np->nagios_exit( UNKNOWN, 'Own node is not found in the node list' ) if ! $own_node;

my $verid = $own_node->{VERID};   # own VERID

# Walk every node in the list and fold three checks into the overall status:
#   - node not ALIVE                          -> CRITICAL
#   - install result not SUCCESS (not own)    -> CRITICAL
#   - VERID differs from own VERID            -> ALERT (unless already CRITICAL)
# Keys are sorted so that the order of the notes is stable between runs.
my $own_verid = defined $verid ? $verid : '';

foreach my $uni (sort keys %$nlist) {
    my $node = $nlist->{$uni};
    my $status = $node->{ALIVE} ? 'ALIVE' : $node->{DEAD} ? 'DEAD' : 'UNKNOWN';
    push @nodes, [$node->{IP}, $node->{VERID}, $status];

    if (not $node->{ALIVE}) {
        $notes .= "Node $node->{HOST} is OFFLINE\n";
        $system_status = 'CRITICAL';
    }

    # A missing install result is treated the same as a non-SUCCESS one.
    my $install_result = defined $node->{INSTALL_RESULT} ? $node->{INSTALL_RESULT} : '';

    # UNIs are hash keys, i.e. strings: compare them with 'ne'. A numeric
    # '!=' turns any non-numeric UNI into 0 and would never see a difference.
    if ($install_result ne 'SUCCESS' and $uni ne $UNI) {   # do not report master second time
        $system_status = 'CRITICAL';
        $notes .= "Installation status for node $node->{HOST} is FAILED\n";
    }

    my $node_verid = defined $node->{VERID} ? $node->{VERID} : '';
    if ($node_verid ne $own_verid) {
        $notes .= "Node $node->{HOST} has software version mismatch: $node_verid\n";
        $system_status = 'ALERT' if $system_status ne 'CRITICAL';
    }
}

# Try to connect to db.
# The eval block returns 1 only when DBMaster did not die; the handle itself
# is checked as well, so a DBMaster that returns nothing without raising is
# still reported as a database outage rather than slipping through.
my $dbh;
my $db_connected = eval {
    $dbh = DBMaster({ PrintError=>0, RaiseError=>1, FetchHashKeyName=>'NAME_uc' });
    1;
};

$np->nagios_exit( CRITICAL, 'Database is DOWN' ) if !$db_connected || !$dbh;

# Read the (uni, loadavg) rows for attribute STAT_LOAD_AVERAGE_CORE from the DB.
# A failing query is not fatal: it is noted and raises the status to ALERT
# unless something CRITICAL has already been recorded.
my $stats_sql =
        "select o.name as uni, a.val as loadavg 
           from _objs o inner join _obj_attr a on o.obj=a.obj
          where otype='D' and subtype='N' and deleted=0
            and a.attr='STAT_LOAD_AVERAGE_CORE'";

# On failure eval yields undef, so $ra stays undefined exactly as before.
my $ra = eval { $dbh->selectall_arrayref($stats_sql) };
my $stats_error = $@;

if ($stats_error) {
    $notes .= "Failed to read node statistics from the DB\n";
    $system_status = 'ALERT' unless $system_status eq 'CRITICAL';
}

# Flag CPU overload for every known node whose load average value from the DB
# reaches $LOADAVG_ALERT. Rows for unknown nodes and rows whose value cannot
# be parsed are skipped.
if ($ra and @$ra) {
    require Scalar::Util;   # core module, loaded here so the block is self-contained

    foreach my $row (@$ra) {
        my ($row_uni, $row_val) = @$row;
        next unless defined $row_uni and defined $row_val;

        my $node = $nlist->{$row_uni};
        next unless $node;

        # split ' ' ignores leading whitespace and collapses runs of
        # whitespace, so field positions do not shift on double spaces.
        my @loadavg = split ' ', $row_val;

        # Look at 5-min load average (second field of the stored value)
        my $avg5 = $loadavg[1];
        next unless defined $avg5 and Scalar::Util::looks_like_number($avg5);

        if ($avg5 >= $LOADAVG_ALERT) {
            $notes .= "CPU overload is detected on node $node->{HOST}\n";
            $system_status = 'ALERT' if $system_status ne 'CRITICAL';
        }
    }
}

# TODO: make it parameterized: ArrayTable(['Node IP', 'SKM Version', 'Status'], \@nodes) if not $silent;

# Translate the accumulated status into the plugin exit:
# 'OK' -> OK, 'ALERT' -> WARNING, anything else -> CRITICAL.
if ($system_status eq 'OK') {
    my $message = length($notes) ? $notes : 'All nodes run normally';
    $np->nagios_exit( OK, $message );
}
elsif ($system_status eq 'ALERT') {
    $np->nagios_exit( WARNING, $notes );
}
$np->nagios_exit( CRITICAL, $notes );


