#!/usr/bin/perl
#  $Id: sm_spindle 31951 2015-03-15 23:05:23Z afomenko $
# -----------------------------------------------------------------------------
#  SM_SPINDLE   - Spindle handler:
#               - keep spindle (disk resource) in target state (uuid.cst)
#               - monitor spindle health 
#               - publish spindle health in uuid.hm
# -----------------------------------------------------------------------------
#  Author: Alex Titov
#  QA by:
#  Copyright: videoNEXT LLC
# -----------------------------------------------------------------------------
# usage:   sm_spindle uuid            - start a spindle
#
#    spindle looks in $APL/var/sm/stat/uuid.cst for the target state ONLINE|OFFLINE
#    spindle publishes its operational state in $APL/var/sm/stat/uuid.ost
#    spindle publishes health information in $APL/var/sm/stat/uuid.hm
#    spindle uses $APL/var/sm/stat/uuid.io for IO information
#
# for REQ see: Rally S623:Storage Manager TA923: SM Engine
#
# TBD: mount with device and options if defined in wheels/uuid
use warnings;
use strict;
use POE qw/Wheel::FollowTail/;
use File::Basename qw(dirname);                  # core
use lib dirname(__FILE__).'/../lib';             # find  Spindle & Monitor here
use SM::Spindle;
use SM::SpindleMonitor;
use SM::Config ':all';

# CONS ------------------------------------------------------------------------
my $SPIN   =(split(/\//,$0))[-1];                # the actual name of the prog
my $APL    =$ENV{APL};                           # root that all helper paths below hang off
# Fail early: with APL unset every command/path below would silently be built
# relative to "/" (and emit "uninitialized value" warnings).
die "$SPIN: APL environment variable is not set\n" unless defined $APL;
my $MOUNT  ="sudo $APL/sm/sbin/sm_mount";        # mount helper, run via sudo
my $UMOUNT ="sudo $APL/sm/sbin/sm_umount";       # unmount helper, run via sudo
my $FSCK   ="sudo $APL/sm/sbin/sm_fsck";         # filesystem check
my $SPACE  ="$APL/sm/bin/sm_space";              # space scan
my $FSLOG  ="$APL/var/log/sm/fsck";              # directory for fsck logs
my $MOUNTPOINT=SM_MNT;                           # volumes are accessed as $MOUNTPOINT/<id>
my $MONCYCLE=30;                                 # monitor cycle passed to the monitor as CYCLE
                                                 # (presumably seconds - TODO confirm)
my $MONWRITETIMEOUT=40;                          # default timeout (sec) for write probing
# Default limits; same-named keys in wheels/<id> override them in act_monitor.
my %LIMITS =(LIMIT_UTIL=>100,LIMIT_AWAIT=>300,LIMIT_WRITE=>20);

# ARGS ------------------------------------------------------------------------
my $uuid   = $ARGV[0]; # disk's ID
die "Usage: $SPIN <ID>\n"                     unless $uuid;
die "$SPIN: $uuid: Cannot be found in wheels" unless -e SM_CONF."/wheels/$uuid";
die "$SPIN: $uuid: Permission denied"         unless -r SM_CONF."/wheels/$uuid";

# VARS ------------------------------------------------------------------------
my $last_mtime=0;                 # mtime of the last stat file seen by stat_available
my $last_known_stat='Cool';       # last state returned by act_monitor
my $Prf='['.substr($uuid,0,3).'-'.substr($uuid,-3,3).']'; # Prefix: first and last 3 chars of the ID
# Accessor for the short log prefix "[abc-xyz]" derived from the spindle ID.
sub Prf {
  return $Prf;
}
# MAIN ========================================================================
# Create the spindle session.  The act_* subs defined below are handed over
# as code references under the ACT_* keys.
SM::Spindle->spawn(
    ID          => $uuid,
    LOG         => SM_LOG,
    SMSTAT      => SM_STAT,
    ACT_MOUNT   => \&act_mount,
    ACT_START   => \&act_start,
    ACT_STOP    => \&act_stop,
    ACT_PID     => \&act_pid,
    ACT_INFO    => \&act_info,
    ACT_ISEMPTY => \&act_isempty,
);

# Create the health monitor session; act_monitor is its probe callback.
SM::SpindleMonitor->spawn(
    ID          => $uuid,
    LOG         => SM_LOG,
    CYCLE       => $MONCYCLE,
    SMSTAT      => SM_STAT,
    ACT_MONITOR => \&act_monitor,
);

# Enter the POE event loop.  It is not expected to return; if it does,
# the fact is logged before the process exits.
SM_LOG->info(Prf()."Starting POE::Kernel");
POE::Kernel->run();
SM_LOG->info(Prf()."POE::Kernel's run() method finishes (that's wrong!)");
exit 0;

# ACTS ========================================================================
# ACT_PID callback: hands the name "<prog>.<id>" to SM_WritePid and returns
# its result.  (Original note: called every 30 seconds.)
sub act_pid {
  my ($id) = @_;
  return SM_WritePid("$SPIN.$id");
}
# ACT_INFO callback: forwards all of its arguments unchanged to SM_info.
sub act_info {
  return SM_info(@_);
}

# act_mount($id) - (re)mount the volume and verify that it is the expected one.
#
# Everything below runs under one alarm()-based timeout (option MOUNT_TIMEOUT
# when present and positive, otherwise 20 seconds):
#   1. best-effort unmount, then mount through the sudo helper;
#   2. take the first "ID=<mark>" / "MSG=<mark>" from the helper output and
#      compare the mark with $id;
#   3. make sure the directory for the current version (SM_VER) exists;
#   4. save all KEY=VALUE lines of the helper output to <SM_STAT>/$id.minfo
#      (failure to write that file is logged but does not fail the mount).
#
# Returns 'SUCCESS', or the failure class 'Broken' | 'Unmarked' | 'Confound' |
# 'Alien'.  Failures are logged and published through SM_error.
sub act_mount   {             # return SUCCESS|Broken|Unmarked|Confound|Alien
  my $id=shift;               # TBD: load configuration, Alien
  my $options=SM_Options;
  my $timeout=(exists $options->{MOUNT_TIMEOUT} and $options->{MOUNT_TIMEOUT}>0)?$options->{MOUNT_TIMEOUT}:20;
  eval {
    # Every die() message starts with the failure class followed by ':';
    # the handler after the eval relies on that prefix.
    local $SIG{ALRM}= sub{ die "Broken: TIMEOUT $!" }; alarm $timeout;
    SM_LOG->info("mount $id , timeout=$timeout");
    `$UMOUNT $id 2>&1`;       # try to unmount; the result is deliberately ignored
    my $out=`$MOUNT $id 2>&1`;
    die("Broken: mount $id error: $out")         if $?!=0;
    die("Broken: mount $id message: $out")       if not $out=~/(ID|MSG)=(.+)/;
    my $mntid=$2;
    die("Broken: mount $id error: $out")         if $mntid eq 'MOUNT_ERROR';
    die("Unmarked: volume $id is unmarked: $out")if $mntid eq 'volume_unmarked';
    die("Confound: volume $id wrong mark =$mntid=")if $mntid ne $id;
    #     TBD: Version check.
    my $voldir="$MOUNTPOINT/$id";
    if(! -d "$voldir/".SM_VER) {                 # version dir is absent
      opendir(my $dh,$voldir) || die("Broken: volume $id cannot read");
      # Look for directories of other versions ("va-N.N.N..."), newest name first.
      my @files = sort {$b cmp $a} grep { /^va-\d\.\d\.\d/ and -d "$voldir/$_"} readdir($dh);
      closedir $dh;
      die("Confound: volume $id does not have directory:".SM_VER) if ! @files;
      die("Alien: volume $id has wrong $files[0] directory");
    }
    #--------------------------------------- minfo
    my %minfo=map{/(^\w+)=(.+)/} grep {/^\w+=.+/} split /\n/,$out;
    my $smstat=SM_STAT;
    if (open(my $minfo_fh,'>',"$smstat/$id.minfo")) {
      print {$minfo_fh} "$_=$minfo{$_}\n"  foreach (keys %minfo);
      # Buffered write errors only surface at close.
      close($minfo_fh) || SM_LOG->error("Cannot write $smstat/$id.minfo: $!");
    }else{
      SM_LOG->error("Cannot open $smstat/$id.minfo");
    }
  };
  alarm 0;                             # remove alarm
  if ($@) { #------------------------  # error handling
      my $msg=$@; $msg=~s/\n/ /g;
      # Capture the failure class into a lexical right away instead of
      # relying on $1 surviving the logging calls below.
      my ($class)= $msg=~/^(Confound|Unmarked|Broken|Alien):.+$/;
      if (not defined $class) {
         SM_LOG->error(Prf."Code error;  unexpected '$msg'");
         SM_error($id,"Mount fails;  unexpected '$msg'");
         return 'Broken';
      }
      SM_LOG->warn(Prf."$msg");
      SM_error($id,"Volume $msg");
      return $class;
  }
  SM_info ($id,"scanning for space usage");
  # The space rescan itself is currently disabled:
  #  system("$SPACE rescan $id >$FSLOG/$id.<time>.scan 2>&1");
  SM_info ($id,"Volume is mounted successfully");
  return 'SUCCESS';
}

# act_start($id) - run the filesystem check helper before the volume is used.
#
# Runs "$FSCK $id <logfile>", where the log file is $FSLOG/$id.<epoch>.
# The helper's exit code is interpreted as:
#   0      - filesystem is clean                         -> 'SUCCESS'
#   1      - errors were found and corrected             -> 'SUCCESS'
#   >= 2   - cannot be cleaned automatically             -> 'Broken'
# A helper that cannot be started or is killed by a signal -> 'Broken'.
#
# Returns 'SUCCESS' or 'Broken'.
sub act_start   {                       # return SUCCESS or Broken
  my $id=shift;
  my $log="$FSLOG/$id.".time();
  SM_LOG->info("Check FS $id");
  SM_info ($id,"Check FS consistency (fsck)");
  system("$FSCK $id $log");
  my ($status,$oserr)=($?,"$!");        # raw wait status, NOT the exit code
  if($status==-1 or $status & 127) {    # not started, or terminated by a signal
     my $why=($status==-1) ? "cannot execute: $oserr"
                           : "killed by signal ".($status & 127);
     SM_LOG->error("Check FS $id is not completed: $why");
     SM_error($id,"Check FS is not completed: $why");
     return 'Broken';
  }
  my $ret=$status>>8;                   # the exit code is the high byte of $?
  SM_LOG->info("Check FS $id is completed with code=$ret");
  SM_info ($id,"Check FS is completed with code=$ret");
  if($ret<2) {
     SM_LOG->info("FS $id is cleaned and errors are corrected. Details: $log") if $ret;
     SM_info ($id,"FS is cleaned and errors are corrected")                    if $ret;
     return 'SUCCESS';
  }
  SM_LOG->error("Manual actions required! FS $id is dirty and cannot be cleaned automatically. Details: $log");
  SM_error($id,"Manual actions required! Filesystem is dirty and cannot be cleaned automatically. Details: $log");
  return 'Broken'
}

# act_stop($id) - unmount the volume through the sudo helper.
#
# Unmounting is best effort: the sub always returns 'SUCCESS'.  A failing
# helper is written to the log so the failure is not lost.
sub act_stop    {                       # always return SUCCESS
 my $id=shift;
 SM_LOG->info("umount $id");
 SM_info ($id,"unmount");
 my $out=`$UMOUNT $id 2>&1`;
 my $status=$?;
 if($status!=0) {
   $out='' if not defined $out;         # backticks yield undef if nothing could be started
   $out=~s/\s+/ /g;                     # keep the log entry on one line
   SM_LOG->warn(Prf."umount $id fails (status=$status): $out");
 }
 return 'SUCCESS';
}

# write_probe($id) - check that the volume accepts writes.
#
# Creates, writes, closes and removes the file <mount>/$id/<SM_VER>/test under
# an alarm()-based timeout (option WRITE_TIMEOUT when present and positive,
# otherwise $MONWRITETIMEOUT).
#
# Returns:
#   'SUCCESS' - all steps worked;
#   'Tight'   - the error text contains "No space left on device";
#   'Failed'  - any other error or timeout (logged and published via SM_warn).
#
# NOTE(review): 'Tight' is detected by matching the English errno text, so it
# presumably depends on the process running in a C/English locale.
sub write_probe {                       # try writing into $id/<SM_VER>/test
   my $id=shift;
   my $test="$MOUNTPOINT/$id/".SM_VER.'/test';
   my $write_timeout=$MONWRITETIMEOUT;  # default value 40
   my $options=SM_Options;
   $write_timeout=$options->{WRITE_TIMEOUT} if exists $options->{WRITE_TIMEOUT} and $options->{WRITE_TIMEOUT}>0;
   eval {
     local $SIG{ALRM}= sub{  die "Cannot write into $test, TIMEOUT $write_timeout sec. $!" }; alarm $write_timeout;
     unlink($test) if -f  $test;        # remove a leftover of a previous failed probe
     # Lexical handle: closed automatically if any step below dies.
     open(my $fh,'>',$test) || die "Cannot open for writing $test $!";
     print {$fh} "test"     or die "Cannot write into $test $!";
     close($fh)             || die "Cannot close file $test $!";
     unlink($test)          || die "Cannot remove file $test $!";
   };
   alarm 0;                             # turn off alarm
   if($@) {                             # catch an error
      my $msg="$@";
      return 'Tight' if $msg=~/No space left on device/;
      SM_LOG->warn(Prf.$msg);
      SM_warn ($id,$msg);
      return 'Failed';
   }
   return 'SUCCESS';                    # success
}

#---------------------------------------------------------------------
# Act "is empty"
#
# act_isempty($id) - tell whether <mount>/$id/<SM_VER> holds any entries other
# than ".", ".." and "TO-BE-REMOVED".
#
# Returns 'Empty', or a string starting with "Used since ..." that names the
# reason.  A directory that cannot be read is reported as used: an unreadable
# volume must never be mistaken for an empty one.
sub act_isempty    {                    # Returns: Empty|Used
  my $id=shift;
  my $dir="$MOUNTPOINT/$id/".SM_VER;
  my $ret='Empty';
  if(opendir(my $dh,$dir)) {
    my @files= grep {! /^\.\.?$/ and ! /^TO-BE-REMOVED$/ } readdir($dh);
    closedir $dh;
    $ret="Used since present: ".join(',',@files) if(@files);
  }else{
    my $oserr="$!";                     # keep errno text before anything can reset it
    SM_LOG->error(Prf."is empty? cannot read $dir: $oserr");
    $ret="Used since cannot read $dir: $oserr";
  }
  SM_LOG->info("is empty? $id : $ret");
  SM_info ($id,"is empty? $ret");
  return $ret;
}
#---------------------------------------------------------------------
# stat_available($id) - indicate whether a new stat file is available.
#
# Compares the modification time of <SM_STAT>/$id.stat with the one remembered
# in $last_mtime.  Returns 1 (and remembers the new mtime) when the file exists,
# is a regular file and its mtime differs from the remembered one; 0 otherwise.
sub stat_available  {                   # indicate that a new stat is available
  my $id=shift;
  my $stat_name=SM_STAT."/$id.stat";
  # One stat() call serves both the existence check and the mtime, so the
  # file cannot disappear between the two.
  my @st=stat($stat_name);
  return 0 if not @st or not -f _;      # stat is absent ("_" reuses the stat above)
  my $mtime=$st[9];
  return 0 if $last_mtime == $mtime;    # not available (still old)
  $last_mtime=$mtime;
  return 1;
}

# act_monitor($id, ...) - one health-monitoring pass for the volume.
#
#   1. write_probe(): any result other than 'SUCCESS' ('Failed' or 'Tight')
#      is returned immediately.
#   2. If no new <SM_STAT>/$id.stat is available, the previous state is returned.
#   3. Otherwise the stat values UTIL, AWAIT and WRITE are compared with the
#      limits LIMIT_UTIL, LIMIT_AWAIT and LIMIT_WRITE (defaults from %LIMITS,
#      overridden by wheels/$id; the value 'unlimited' disables a check), and
#      FREE/SIZE is checked for a nearly full disk.
#
# Returns 'Failed' | 'Cool' | 'Weak' | 'Tight' and remembers the result in
# $last_known_stat.  Warnings are logged only when the state changes.
# If reading the configuration or stat fails, the problem is reported and, when
# nothing else was detected, the previous state is kept: a failed measurement
# must not clear a previously reported problem.
# Only the first argument is used; any further arguments are ignored.
sub act_monitor {                       # possible Failed,Cool,Weak,Tight
  my ($id)=@_;
  my $start=time;
  my $probe=write_probe($id);           # lexical: do not clobber the caller's $_
  my $spent=time-$start;
  SM_LOG->info("write_probe on $id returns: $probe, spent $spent sec");
  return $probe if $probe ne 'SUCCESS';
  return $last_known_stat if not stat_available($id);
  my $msg='';
  #----------------------------------- a new stat is available
  eval {
    local $SIG{ALRM}=  sub{ die "ERR-MON01: TIMEOUT $!" }; alarm 10;
    #--------------------------------- get configuration
    my ($confname,$statname)=(SM_CONF."/wheels/$id",SM_STAT."/$id.stat");
    open(my $conf_fh,'<',$confname) || die "ERR-MON03: Cannot open $confname";
    my %cc=map{/(^\w+)=(.+)/} grep {/^\w+=.+/} <$conf_fh>;
    close $conf_fh;
    my %conf=(%LIMITS,%cc);              # combine with default limits
    #--------------------------------- get stat
    open(my $stat_fh,'<',$statname) || die "ERR-MON03: Cannot open $statname";
    my %stat=map{/(^\w+)=(.+)/} grep {/^\w+=.+/} <$stat_fh>;
    close $stat_fh;
    #--------------------------------- compare with limits for decision
    foreach my $check (['UTIL', 'high disk utilization'],
                       ['AWAIT','high disk waits'],
                       ['WRITE','high disk writes']) {
      my ($key,$text)=@$check;
      my ($limit,$value)=($conf{"LIMIT_$key"},$stat{$key});
      next if $limit eq 'unlimited';
      next if not defined $value;        # value is missing in stat: nothing to compare
      $msg.="$text $value>$limit; " if $value>$limit;
    }
    #--------------------------------- disk full check
    my ($free,$size)=@stat{qw(FREE SIZE)};
    die "ERR-MON03: Invalid FREE/SIZE in $statname"
        if not defined $free or not defined $size or $size<=0;
    if(int(1000*$free/$size)<=5)         # free space below 0.6% (permille, truncated)
           {$msg.="DISK FULL: $free/$size;"}
  };
  alarm 0;                             # remove alarm
  if(my $err=$@) {
    SM_LOG->error(Prf."act_monitor gets a problem:$err");
    SM_error($id,"Monitoring malfunctions: :$err");
    return $last_known_stat if not $msg; # unknown health: keep the previous state
  }
  if(not $msg) {                       # nothing to complain about
    if($last_known_stat ne 'Cool') {   # switch from problem to Cool
      SM_LOG->info(Prf."Volume $id is OK($last_known_stat is cleared)");
      SM_info($id,"Volume is OK ($last_known_stat is cleared)");
    }
    return $last_known_stat='Cool';
  }
  if($msg=~/DISK FULL/) {               # disk full
    SM_LOG->warn(Prf.$msg) if $last_known_stat ne 'Tight'; # don't litter log
    SM_warn ($id,$msg)     if $last_known_stat ne 'Tight';
    return $last_known_stat='Tight';
  }else {                               # disk weak
    SM_LOG->warn(Prf.$msg) if $last_known_stat ne 'Weak'; # don't litter log
    SM_warn ($id,$msg)     if $last_known_stat ne 'Weak';
    return $last_known_stat='Weak';
  }
}
