#!/usr/bin/perl
#
#  $Id: master_collector 21985 2011-02-23 18:18:15Z teetov $
# -----------------------------------------------------------------------------
#  COLLECTOR
# -----------------------------------------------------------------------------
#  Author: Alex Teetov
#  QA by:
#  Copyright: videoNEXT LLC
# -----------------------------------------------------------------------------
#
# for REQ see: Rally S915: Storage Manager TA902: SM tools
# notes:
#  1. Once every 60 seconds, collect health information for:
#     - DB
#     - Nodes
#     - Cameras
#     - Storage
#  2. Health info published:
#     http://<MASTER>/hm/collector_details.json  [ $APL/www/hm/collector_details.json ]
#  
#  3. Summary info published 
#     http://<MASTER>/hm/collector_summary.json  [ $APL/www/hm/collector_summary.json ]
#  
#  4. Log is placed in
#     $APL/var/log/system_collector.log     
#
#
# usage:
#  started from procctl
#
#

use warnings;
use strict;
use Data::Dumper;
use JSON;
use Master::Conf;
use SKM::DB;
use lib "$ENV{APL}/sm/lib";              # find  SM::Config here
use SM::Config ':all';


# CONS ------------------------------------------
my $APL     = $ENV{APL};                                        # installation root, taken from the environment
my $DETAILS = join '/', $APL, 'www/hm/collector_details.json';  # per-check report written by publish()
my $SUMMARY = join '/', $APL, 'www/hm/collector_summary.json';  # overall state written by publish()


# State shared by the check_* subs within one collection cycle.
my ($dbm,           # DB handler, (re)opened by check_database
    $node_list,     # node table, loaded by check_nodes
    $camera_list,   # camera rows, loaded by check_database
    $wheel_list,    # disk volume rows, loaded by check_database
    $cache_list);   # SM cache rows, loaded by check_database
# SUBS ------------------------------------------


#------------------------------------------------
# check_database  Possible targets for a check:
#    egrep max_connection /var/opt/sarch/pgsql/postgresql.conf
#    

# check_database()
#   Opens a fresh connection through DBMaster() (kept in the file-level $dbm)
#   and reloads the snapshots consumed by the other checks:
#     $camera_list - hash ref keyed by OBJ  (OBJ, UNI, DESCRIPTION, STATUS)
#     $wheel_list  - hash ref keyed by ID   (ID, UNI, NAME, OST, DELTA)
#     $cache_list  - array ref of row hashes (NODEID, SIZE, USAGE, CHUNK_LOSS_TS)
#   Column names are upper-cased because FetchHashKeyName is set to 'NAME_uc'.
#   Returns a hash ref {name,state,description}: state is CRITICAL when the
#   connection or any query fails, NORMAL otherwise. Failures are printed to
#   STDOUT and never propagated to the caller.
sub check_database {
 my %result=(name=>"database",state=>"NORMAL",description=>"");
 # Start every cycle from EMPTY CONTAINERS, not from '': the other checks
 # dereference these variables (%$camera_list, @$cache_list, ...), and under
 # "strict refs" dereferencing a plain string dies, which would kill the
 # collector exactly when the database is unavailable.
 $camera_list={};
 $wheel_list={};
 $cache_list=[];
 eval {
   $dbm=DBMaster({PrintError=>1,'RaiseError' => 1});
   $dbm->{FetchHashKeyName} = 'NAME_uc';
   $dbm->{ShowErrorStatement}=1;
 };
 if($@) {
        $result{state}='CRITICAL';
        $result{description}='cannot establish database connection';
        print "$result{description}: $@\n";
        return \%result;
 }
 # RaiseError is on, so a failing query dies inside this eval; the snapshot
 # it was meant to fill simply stays empty.
 eval {
  
  #----------------------------------------------------------------------
  # list of all cameras in the system. It looks like
  # obj |          uni           |     description      | status 
  #-----+------------------------+----------------------+--------
  # 122 | vDCnsDRf4mtWqCH95l1VaQ | DCC - Fixed          | BROKEN
  # 107 | vDCnsDRf4mtWqCH95l1VaQ | West Main Door       | ON 
  # 136 | vDCnsDRf4mtWqCH95l1VaQ | Fedex PTZ            | OFF
  $camera_list=$dbm->selectall_hashref(qq{
    select o.obj,o.node_ip as uni,o.description,a.val as status 
    from _objs o,_obj_attr a 
    where o.obj=a.obj and attr='STATUS' and o.deleted=0 and o.subtype='C'
  },'OBJ',{Slice=>{}});
  #----------------------------------------------------------------------
  # list of all configured disk volumes in the system. It looks like
  #                  id                  |         uni            |  name  |      delta       |  ost   
  #--------------------------------------+------------------------+--------+------------------+--------
  # bd8b61bf-cc8e-4fa6-bb57-aba720559423 | vDCnsDRf4mtWqCH95l1VaQ | vol001 | 19.6001582145691 | Online
  # ceb03df6-f163-4d6b-b8ba-8eaaa441fed1 | vDCnsDRf4mtWqCH95l1VaQ | vol003 | 19.6001582145691 | Broken
  # delta = seconds elapsed since the node's "alive" timestamp

  $wheel_list=$dbm->selectall_hashref(qq{
    select w.id, w.nodeid as uni, w.name,o.ost,
    EXTRACT(EPOCH from now() at time zone 'UTC')- EXTRACT(EPOCH from n.alive) as delta
    from sm_wheels w, sm_nodes n, sm_ost o where w.nodeid=n.nodeid and w.id=o.id
  } ,'ID',{Slice=>{}});
  
  #-------------------------------------------------------------------------------------------
  # SM cache status of all nodes in security domain. It looks like
  # nodeid                 | size | usage |           stime            |    chunk_loss_ts
  #- ------------------------+------+-------+----------------------------+---------------------
  # OJ4xFECoqzjeVg24Sqp2Tw |    5 |     5 | 2011-02-17 11:32:10.154918 | 2011-02-17 11:32:03
  # (chunk_loss_ts is selected as epoch seconds)
  
  $cache_list=$dbm->selectall_arrayref(qq{
    select nodeid, size, usage, EXTRACT(EPOCH from chunk_loss_ts) as chunk_loss_ts from sm_cache
  }, {Slice=>{}});
  
 };
 if($@) {
      $result{state}='CRITICAL';
      $result{description}='cannot select information from database';
      print "$result{description}: $@\n";
      return \%result;
 }
 return \%result;
}

# check_nodes()
#   Refreshes the file-level $node_list from NodeList(), zeroes the per-node
#   volume counters that check_storage fills in afterwards, and reports
#   CRITICAL when at least one node carries a true DEAD flag.
#   Returns a hash ref {name,state,description}.
sub check_nodes {
 $node_list=NodeList();

 my @unis=keys %$node_list;
 my $total=scalar @unis;
 my $not_responding=0;
 for my $id (@unis) {
   $not_responding++ if $node_list->{$id}->{DEAD};
   # volume counters start from zero on every collection cycle
   @{ $node_list->{$id} }{qw(WHEELS_ONLINE WHEELS_BROKEN WHEELS_DEGRADED WHEELS_TOTAL)}=(0,0,0,0);
 }

 my %result=(name=>"nodes",state=>"NORMAL",description=>"");
 if ($not_responding>0) {
   $result{state}='CRITICAL';
   $result{description}=$not_responding==1
     ? "$not_responding node out of $total is not responding"
     : "$not_responding nodes out of $total are not responding";
 }
 return \%result;
}

# check_cameras()
#   Counts the cameras in $camera_list that are unusable: either their STATUS
#   attribute is 'BROKEN' or the node they are assigned to (UNI) is flagged
#   DEAD in $node_list.
#   Returns a hash ref {name,state,description}; state is CRITICAL when at
#   least one camera is counted as broken.
sub check_cameras { 
 my %result=(name=>"cameras",state=>"NORMAL",description=>"");
 my ($broken,$cameras)=(0,0);
 # Treat a missing snapshot (e.g. the database could not be read) as "no
 # cameras" instead of dying on a non-reference under "strict refs".
 my $cams =ref($camera_list) ? $camera_list : {};
 my $nodes=ref($node_list)   ? $node_list   : {};
 foreach my $obj (keys %$cams) {
   my $camera=$cams->{$obj};
   $cameras++;
   my $status=defined $camera->{STATUS} ? $camera->{STATUS} : '';
   my $uni=$camera->{UNI};
   # Look the node up with exists(): a plain $nodes->{$uni}->{DEAD} would
   # autovivify an empty entry for every unknown node, and that phantom
   # entry would later be seen by check_storage as a node without volumes.
   my $node_dead=(defined $uni and exists $nodes->{$uni} and $nodes->{$uni}->{DEAD});
   $broken++ if $status eq 'BROKEN' or $node_dead;
 }
 if($broken) {
   $result{state}='CRITICAL';
   $result{description}="$broken camera out of $cameras is broken" if $broken==1;
   $result{description}="$broken cameras out of $cameras are broken" if $broken>1;
 }
 return \%result;
}

# check_storage()
#   Classifies every volume in $wheel_list and accumulates the result both
#   globally and per node (WHEELS_TOTAL/ONLINE/DEGRADED/BROKEN in $node_list).
#   A volume counts as broken when its node is DEAD or when DELTA (seconds
#   since the node's "alive" timestamp) exceeds 100; otherwise its OST value
#   ('Online', 'Degraded' or 'Broken') decides. Other OST values only add to
#   the totals.
#   Result state: CRITICAL if any node has no online volume, WARNING if any
#   node has degraded or broken volumes, NORMAL otherwise.
#   Returns a hash ref {name,state,description}.
sub check_storage {
 my %result=(name=>"storage",state=>"NORMAL",description=>"");
 my @counters=qw(WHEELS_ONLINE WHEELS_BROKEN WHEELS_DEGRADED WHEELS_TOTAL);
 my %node_key=(Online=>'WHEELS_ONLINE',Degraded=>'WHEELS_DEGRADED',Broken=>'WHEELS_BROKEN');
 my %seen=(Online=>0,Degraded=>0,Broken=>0);
 my $wheels=0;

 # Treat a missing snapshot (e.g. the database could not be read) as "no
 # volumes" instead of dying on a non-reference under "strict refs".
 my $volumes=ref($wheel_list) ? $wheel_list : {};
 $node_list={} unless ref($node_list);

 foreach my $uuid (keys %$volumes) {
   my $wheel=$volumes->{$uuid};
   my $uni=defined $wheel->{UNI} ? $wheel->{UNI} : '';
   # A volume may belong to a node that is not in $node_list yet: register
   # it with zeroed counters so the arithmetic below never touches undef.
   my $node=($node_list->{$uni} ||= {});
   foreach my $key (@counters) {
     $node->{$key}=0 unless defined $node->{$key};
   }
   $node->{WHEELS_TOTAL}++;
   $wheels++;

   my $ost  =defined $wheel->{OST}   ? $wheel->{OST}   : '';
   my $delta=defined $wheel->{DELTA} ? $wheel->{DELTA} : 0;
   # all volumes are assumed broken for a dead node, or when the storage
   # manager has not reported for more than 100 seconds
   $ost='Broken' if $node->{DEAD} or $delta>100;

   next unless exists $node_key{$ost};
   $node->{$node_key{$ost}}++;
   $seen{$ost}++;
 }
 foreach my $uni (keys %$node_list) {
   my $node=$node_list->{$uni};
   # Entries without WHEELS_TOTAL were neither initialised by check_nodes
   # nor own a volume (e.g. created by autovivification during a lookup);
   # they are not real nodes and must not raise a storage alarm.
   next unless ref($node) and exists $node->{WHEELS_TOTAL};
   if(!$node->{WHEELS_ONLINE}) {
     $result{state}="CRITICAL";
   }elsif(($node->{WHEELS_DEGRADED}||0)+($node->{WHEELS_BROKEN}||0)>0){
     $result{state}="WARNING" if $result{state} ne 'CRITICAL';
   }
 }
 my ($broken,$degraded,$online)=@seen{qw(Broken Degraded Online)};
 if($result{state} ne 'NORMAL') {
    $result{description}="broken: $broken; "           if $broken>0;
    $result{description}.="degraded: $degraded; "      if $degraded>0;
    $result{description}.="online: $online"            if $degraded+$broken==0;
    $result{description}.=" total: $wheels";
 }
 return \%result;
}


# check_storage_cirrus()
#   Storage check used when APL_MOD is 'CIRRUS': inspects the volumes
#   returned by SM_Wheels() and looks only at their "ost" state.
#     at least one Online/Degraded volume -> NORMAL, or WARNING if any
#                                            of them is Degraded
#     otherwise                           -> CRITICAL, with the reason chosen
#                                            in the order broken, full, missing
#   Returns a hash ref {name,state,description}.
sub check_storage_cirrus {
 my %result=(name=>"storage",state=>"NORMAL",description=>"");
 my ($usable,$slow,$failed,$no_space)=(0,0,0,0);

 my $vol=SM_Wheels();  # load the local wheels
 for my $name (keys %$vol) {
   my $disk=$vol->{$name};
   my $ost=$disk->{ost};
   $failed++   if $ost=~/^(Broken|undef)$/;
   $slow++     if $ost eq 'Degraded';
   $no_space++ if $ost eq 'Full';
   $usable++   if $ost=~/^(Online|Degraded)$/;   # a degraded volume still counts as usable
 }

 if($usable) {
   if($slow) {
     $result{state}="WARNING";
     $result{description}.="degraded performance";
   }
 }
 else {
   # nothing usable: always CRITICAL, only the explanation differs
   $result{state}="CRITICAL";
   if($failed) {
     $result{description}.="broken volume";
   }
   elsif($no_space) {
     $result{description}.="Full";
   }
   else {
     $result{description}.="va-cirrus volume is missing";
   }
 }
 return \%result;
}

# check_cache_usage()
#   Evaluates the fill level of every SM cache row in $cache_list:
#     SIZE is 0 (cache turned off)  -> broken
#     USAGE/SIZE above 90%          -> full
#     USAGE/SIZE above 75%          -> degraded
#   State is CRITICAL if any cache is broken or full, WARNING if caches are
#   only degraded, NORMAL otherwise.
#   Returns a hash ref {name,state,description}.
sub check_cache_usage {
 my %result=(name=>"cache usage",state=>"NORMAL",description=>"");
 my ($broken,$degraded,$full) = (0,0,0);

 # Treat a missing snapshot (e.g. the database could not be read) as "no
 # caches" instead of dying on a non-reference under "strict refs".
 my $caches=ref($cache_list) ? $cache_list : [];

 foreach my $cache (@$caches) {
    my $sz  =$cache->{SIZE}  || 0;
    my $used=$cache->{USAGE} || 0;
    if ($sz == 0) { # Cache is turned off
        $broken++;
        next;
    }
    my $pct = ($used / $sz) * 100;
    if ($pct > 90) {
        $full++;
    }
    elsif ($pct > 75) {
        $degraded++;
    }
 }

 if ($broken) {
    $result{state}="CRITICAL";
    $result{description}.="$broken broken; ";
 }
 if ($full) {
    $result{state}="CRITICAL";
    $result{description}.="$full full; ";
 }
 if ($degraded) {
    # never downgrade a CRITICAL raised by a broken or full cache
    $result{state}="WARNING" if $result{state} ne 'CRITICAL';
    $result{description}.="$degraded degraded;";
 }

 return \%result;
}

# check_varchive_loss()
#   Finds the most recent CHUNK_LOSS_TS (epoch seconds) among the rows of
#   $cache_list and reports how long ago it happened:
#     up to 5 minutes ago   -> CRITICAL
#     up to 30 minutes ago  -> WARNING
#     older, or no loss recorded -> NORMAL
#   Returns a hash ref {name,state,description}.
sub check_varchive_loss {
 my %result=(name=>"video archive loss",state=>"NORMAL",description=>"");

 # Treat a missing snapshot (e.g. the database could not be read) as "no
 # caches" instead of dying on a non-reference under "strict refs".
 my $caches=ref($cache_list) ? $cache_list : [];

 my $last_drop=0;
 foreach my $cache (@$caches) {
    next unless defined $cache->{CHUNK_LOSS_TS};
    $last_drop = $cache->{CHUNK_LOSS_TS} if $cache->{CHUNK_LOSS_TS} > $last_drop;
 }
 return \%result unless $last_drop;   # no loss has ever been recorded

 my $min=int((time - $last_drop) / 60);
 # Report at least one minute. This also covers a loss timestamp slightly
 # ahead of the local clock, which would otherwise print a negative age.
 $min=1 if $min<1;
 if ($min <= 30) {
    $result{state}=$min<=5 ? "CRITICAL" : "WARNING";
    $result{description}="archive chunk[s] loss detected $min  min ago";
 }
 return \%result;
}


# publish(\@info)
#   Publishes the outcome of one collection cycle as two JSON documents:
#     $DETAILS - {error=>'',report=>[ ...every check result... ]}
#     $SUMMARY - {error=>'',report=>{summary=>STATE}} where STATE is the
#                worst individual state (CRITICAL > WARNING > NORMAL)
#   $info is an array ref of the {name,state,description} hashes returned by
#   the check_* subs. Write failures are printed to STDOUT and do not stop
#   the collector. Returns nothing.
sub publish {
 my $info=shift;
 my %details=(error=>'',report=>$info);
 my %summary=(error=>'',report=>{summary=>"NORMAL"});

 _publish_json($DETAILS,encode_json(\%details));

 foreach my $check (@$info) {
    $summary{report}{summary}='CRITICAL' if $check->{state} eq 'CRITICAL';
    $summary{report}{summary}='WARNING'  if $check->{state} eq 'WARNING' and $summary{report}{summary} ne 'CRITICAL';
 }
 _publish_json($SUMMARY,encode_json(\%summary));

 return;
}

# _publish_json($path,$json)
#   Writes "$json\n" to "$path.tmp" and renames it over $path, so readers
#   never see a half-written file. The rename happens only after print and
#   close both succeeded. Returns 1 on success, 0 after printing the reason
#   of the failure to STDOUT.
sub _publish_json {
 my ($path,$json)=@_;
 my $tmp="$path.tmp";

 my $fh;
 unless(open $fh,'>',$tmp) {
    print "cannot open $tmp: $!\n";
    return 0;
 }
 my $written=print {$fh} "$json\n";
 my $closed=close $fh;          # buffered write errors surface here
 unless($written and $closed) {
    print "cannot write $tmp: $!\n";
    unlink $tmp;                # do not leave a truncated file behind
    return 0;
 }
 unless(rename $tmp,$path) {
    print "cannot rename $tmp to $path: $!\n";
    return 0;
 }
 return 1;
}

# MAIN ==========================================


# Endless collection loop: run every check, publish the results, then sleep
# for 60 seconds. check_database must run first because it opens $dbm and
# loads the snapshots the other checks read.
for(;;) {
  # APL_MOD may be absent from the environment; test it once, with a defined
  # check, so an unset variable does not raise "uninitialized" warnings on
  # every cycle.
  my $cirrus=(defined $ENV{APL_MOD} and $ENV{APL_MOD} eq 'CIRRUS');

  my @info;
  push @info,check_database();     # open $dbm
  push @info,check_nodes();
  push @info,check_cameras();
  push @info,($cirrus ? check_storage_cirrus() : check_storage());
  push @info,check_cache_usage();
  push @info,check_varchive_loss();
  $dbm->disconnect() if ($dbm);    # do not keep open connection when sleep
  publish(\@info);

  sleep 60;
}


