#!/usr/bin/perl

use strict;

use lib "/opt/sarch/fw/bin/perl/FwIface";
use lib "/opt/sarch/fw/bin/perl";
use lib "/opt/sarch/elog/bin/perl";
use lib "/opt/sarch/elog/bin/perl/ELogIface";

use Data::Dumper;
use NextCAM::Servicing::Client;
use FwIface::Types;
use ELog;
use Log::Log4perl "get_logger";

# Installation root, taken from the environment; every path below is derived
# from it, so refuse to start when it is missing instead of silently building
# paths rooted at "/".
my $APL = $ENV{APL};
die "APL environment variable is not set\n"
    unless defined $APL && length $APL;

my $ELSERVER = "$APL/elog/bin/elserver"; # path used to find elserver in the process list
my $HOST = 's_master';                   # host passed to NextCAM::Servicing::Client->new
my $PORT = 10000;                        # port passed to NextCAM::Servicing::Client->new
my $PULSE_TMT = 20000; # wait for elserver answer (millis)
my $SLEEP = 10;        # seconds to sleep between monitor iterations

# NOTE(review): presumably initializes the Log4perl configuration used by
# get_logger() below -- confirm against logger.patrol.
require "$APL/common/bin/logger.patrol";

# MAIN
#
# Watchdog loop for elserver.  On every iteration:
#   1. connect to the service bus ($HOST:$PORT);
#   2. make sure an elserver process is present in the process list;
#   3. submit a 'check_pulse' action carrying a random id and expect the
#      same id back in the response;
#   4. if the pulse check failed, terminate elserver (SIGTERM, then SIGKILL
#      for whatever is still alive one second later).
# Consecutive failures are throttled: the second failure in a row does not
# kill anything, it resets the counter and sleeps for 30 seconds instead.
# The script only kills elserver; nothing here starts it again.
# The loop never terminates; the `continue` block sleeps $SLEEP seconds
# between iterations (including iterations ended early with `next`).

# Return the PIDs of every process whose `ps ax` line contains $needle as a
# literal substring.  The match is done in Perl, so the path is never handed
# to a shell or interpreted as a regular expression, and all matching
# processes are reported (not just the first one).
sub _elserver_pids {
    my ($needle) = @_;
    return unless defined $needle && length $needle;

    my @pids;
    for my $line (`ps ax`) {
        # The PID is the first column; the header line has no leading number.
        next unless $line =~ /^\s*(\d+)\s/;
        my $pid = $1;
        next if $pid == $$;    # never report the monitor itself
        push @pids, $pid if index($line, $needle) >= 0;
    }
    return @pids;
}

my $log = get_logger('NEXTCAM::MGEARS::ELOG_PATROL');
my $kill_num = 0;    # number of consecutive failed pulse checks
$log->info("Starting up elog monitor");
while (1) {
    my $svc_cli = NextCAM::Servicing::Client->new($HOST, $PORT);
    unless ($svc_cli) {
        $log->error("MBus connection failed");
        next;
    }
    unless (_elserver_pids($ELSERVER)) {
        $log->error("Elserver is absent in memory");
        next;
    }

    # NOTE(review): an exception thrown here is not caught and ends the
    # monitor -- confirm that this is the intended behaviour.
    my $elog_cli = ELogClient->new($svc_cli->getProtocol("skm.eventlog", $PULSE_TMT));

    my $pulse_ok = 0;
    my $pulseid = int( rand( 0xffffffff ) );
    my $action = Action->new( {
        name => 'check_pulse',
        parameters => {
            pulseid => $pulseid
        }
    } );

    # Submit the pulse check and verify that elserver echoes our id back
    eval {
        my $resp = $elog_cli->submitAction($action);
        if ($resp and ref($resp) eq 'ActionResponse') {
            my $got = $resp->{parameters}{pulseid};
            if ($got eq $pulseid) {
                $pulse_ok = 1;
                $log->debug("Response OK (id=$pulseid)");
            }
            else {
                # Report the id that actually came back, not the one we sent.
                my $shown = defined $got ? $got : 'undef';
                $log->warn("Wrong pulse id received from elserver: $shown (expected $pulseid)");
            }
        }
        else {
            $log->warn("Bad response from elserver: $resp");
        }
    };
    if ( my $err = $@ ) {
        my $msg = UNIVERSAL::isa($err, 'Thrift::TException') ? $err->{message} : $err;
        $log->warn("Exception in submitAction: $msg");
    }

    # Restart elserver unless pulse check succeeded
    unless ($pulse_ok) {
        $log->warn("Pulse check failed: restart elserver!");
        $kill_num++;

        # Do not kill elserver too often
        if ($kill_num > 1) {
            $log->warn("Elserver pulse check failed $kill_num times in a row. Sleep for 30 sec");
            $kill_num = 0;
            sleep 30;
            next;
        }

        my @pids = _elserver_pids($ELSERVER);
        if (@pids) {
            kill 15 => @pids;
            sleep 1;
            # Signal 0 only probes whether the process still exists.
            my @alive = grep { kill 0 => $_ } @pids;
            if (@alive) {
                $log->warn("Elserver still running after SIGTERM. Will now send SIGKILL");
                kill 9 => @alive;
            }
        }
        else {
            $log->error("Try to kill elserver but it isn't running");
        }
    }
    else {
        $log->info("Elserver's pulse got normalized") if $kill_num;
        $kill_num = 0;
    }
}
continue {
    sleep $SLEEP;
}
