#!/usr/bin/perl
# -----------------------------------------------------------------------------
#  check_cache_status : Nagios plugin for Stratus in-memory-cache health check
# -----------------------------------------------------------------------------
#  Created by: Andriy Fomenko
#  Authors: Alex Titov, Alex Tsibulnik, Andriy Fomenko
#  QA by:
#  Copyright: videoNEXT Federal, Inc., 2015
# -----------------------------------------------------------------------------

use strict;
use warnings;

use Nagios::Plugin;
use Nagios::Plugin::Getopt;

# Create the plugin object and let it process the command line.
my $np = Nagios::Plugin->new(
    usage => "Usage: %s [ -v|--verbose ]",
);
$np->getopts;

# APL must be present (and true) in the environment; otherwise report UNKNOWN.
# nagios_exit() prints the status line and terminates the process.
my $APL = $ENV{APL};
$np->nagios_exit( UNKNOWN, 'APL environment variable is not defined' )
    unless $APL;

use SKM::DB;
use Master::Conf;
use Node::Conf ":all";

# Only the master node runs this check; any other node reports OK right away.
$np->nagios_exit( OK, 'skipped on slave node' ) if ! am_I_master();

# Try connecting to db.
# RaiseError is on, so a failed connect is expected to die inside the eval;
# a false handle returned without an exception is treated as DOWN as well.
my $dbh;
eval {
    $dbh = DBMaster({ PrintError=>0, RaiseError=>1, FetchHashKeyName=>'NAME_uc' });
};
$np->nagios_exit( CRITICAL, 'Database is DOWN' ) if $@ or not $dbh;

my $nlist = NodeList;

# Note accumulators start as empty strings (not undef) so they can be
# interpolated into the final status message without "uninitialized" warnings.
# An empty string is still false, so "no notes" tests keep working.
my $cache;
my $alert_notes    = '';
my $critical_notes = '';

# Read one row per node from sm_cache, keyed by NODEID (column names are
# upper-cased because of FetchHashKeyName => 'NAME_uc').
# SINCELOSS is the number of seconds between now (UTC) and chunk_loss_ts.
# NOTE(review): Slice is not a documented attribute of selectall_hashref;
# it is kept as-is because it was passed originally -- confirm it can be dropped.
eval {
    $cache=$dbh->selectall_hashref(
        "select nodeid, size, usage, stime, chunk_loss_ts, EXTRACT(EPOCH from now() at time zone 'UTC' - chunk_loss_ts) as sinceloss from sm_cache",
        'NODEID',
        {Slice=>{}}
    );
};
$np->nagios_exit( CRITICAL, 'Can not read cache status from DB' ) if $@;

# Walk every live node and collect WARNING ($alert_notes) and CRITICAL
# ($critical_notes) remarks about its cache record.
# Keys are sorted so the order of the notes is the same on every run.
foreach my $uni (sort keys %$nlist) {
    my $node=$nlist->{$uni};
    next if $node->{DEAD};       # skip the dead

    # A live node with no record in the cache table: state cannot be judged -> warning
    if (not exists $cache->{$uni}) {
        $alert_notes.="Cache status for node $node->{HOST} is UNKNOWN\n";
        next;
    }
    my $ncache=$cache->{$uni};

    # SIZE of 0 means the cache is turned off; a NULL SIZE is treated the same
    # way (it compared equal to 0 before, but with an "uninitialized" warning).
    if (!defined $ncache->{SIZE} || $ncache->{SIZE} == 0) {
        $critical_notes.="Cache status for node $node->{HOST} is OFFLINE\n";
        next;
    }

    #------------------------------------ Analyze cache Usage
    # Multiply BEFORE truncating: int(USAGE/SIZE)*100 collapses every partial
    # fill level to 0, so the 75% / 90% thresholds below could never trigger.
    my $usage = defined $ncache->{USAGE} ? $ncache->{USAGE} : 0;
    my $pct   = int($usage * 100 / $ncache->{SIZE});
    if ($pct > 90) {
        $critical_notes.="Cache status for node $node->{HOST} is FULL (${pct}%)\n";
    }
    elsif ($pct > 75) {
        $alert_notes.="Cache status for node $node->{HOST} is DEGRADED (${pct}%)\n";
    }

    #------------------------------------ Analyze chunk loss
    next unless defined $ncache->{CHUNK_LOSS_TS};
    # Check definedness before int(): int(undef) is 0, which would hide a
    # missing value and report a loss "1 minutes ago".
    next unless defined $ncache->{SINCELOSS};
    my $sinceloss=int($ncache->{SINCELOSS});   # seconds since the loss
    next if $sinceloss < 0;                    # loss timestamp lies in the future
    $sinceloss = int($sinceloss/60) || 1; # In minutes, never less than 1

    # Losses older than 30 minutes are ignored; under 5 minutes is critical,
    # anything in between is only a warning.
    if ($sinceloss < 30) {
        my $note="Video Archive loss detected $sinceloss minutes ago\n";
        if ($sinceloss < 5) {
            $critical_notes.=$note;
        }
        else {
            $alert_notes.=$note;
        }
    }
}

# Report the worst state found. nagios_exit() terminates the process, so the
# first matching branch wins: CRITICAL, then WARNING, otherwise OK.
if ($critical_notes) {
    my $message = "Cache is in CRITICAL state\nCritical notes:\n$critical_notes";
    # Append the warnings section only when there are warnings, and put the
    # header on its own line (it used to be glued to the first warning and
    # interpolated a possibly undefined value).
    $message .= "Warnings:\n$alert_notes" if $alert_notes;
    $np->nagios_exit( CRITICAL, $message );
}
$np->nagios_exit( WARNING, "Cache warnings detected:\n$alert_notes" ) if $alert_notes;
$np->nagios_exit( OK, 'Memory cache is running normally' );
