#!/bin/bash

# check_enabled_rsyncs_health.sh
# This script depends on the rsyncwrapper script being properly set up on the
# remote servers, so that the rsync user can log in remotely via ssh key,
# without a password.
# The logic is:
# 1. The sync jobs are dumped using crontab -u root -l and some regex
# 2. get_dir is checked locally for existence
# 3. put_dir is checked remotely for existence
# 4. ssh access to put_server is checked with the defined user (the uptime command is run on the remote)
# 5. ping is run on put_server to see whether the NFS NetApp filer is reachable
# 6. The put_nfs remote filer is checked to be mounted
# 7. The get_nfs local filer is checked for existence and for being properly mounted
# 8. Success or failure is logged to the defined $out_log and a mail alert is triggered
# Author: Copyright (C) Georgi Dimitrov Georgiev georgi.georgiev05@sap.com

# Log file every check appends its result to (the mail alert greps it for FAILED)
out_log='/home/user/rsync_server-dir_jobs_check.log'

# rsync.pl config
rsync_conf='/opt/imal/etc/rsync.conf'

# User for the remote ssh logins and recipient of the alert mails
user='rsync'
MAIL='georgi.georgievXX@email-addr.com'

# Marker file touched after an alert mail was sent; while it exists no further
# alert mail is sent
mail_sent_pid='/tmp/rsync_mail.pid'

# delete mail_sent_pid once it is older than this many minutes
older_than='180'

# Enable (1) or disable /tmp/rsync.lock* and /opt/imal/etc/rsync.lock*
# pid removal if no rsync job is running in process list
CLEAN_OBSOLETE_PIDS='1'

# Commands executed over ssh on put_server by the checks
CHECK_CMD='uptime'
CHECK_CMD_P='ping -c 1'
CHECK_CMD_L='ls -d'

# rsync.conf greppable strings ("key|" prefixes of the config lines).
# NOTE: g_dir and p_dir are later also used as arrays by
# loop_over_cron_rsync_jobs; these scalar values then live at index 0.
g_serv='get_server|'
p_serv='put_server|'
g_nfs='get_nfs|'
p_nfs='put_nfs|'
p_dir='put_dir|'
g_dir='get_dir|'

# string to grep in processes
check_proc_string='if \[.*\]'

# Timestamps use %M (minute); the former %m printed the month a second time.
echo "######## $(date +%d-%m-%y-%H:%M): Check Running RSYNC JOBS health Script ########"
echo '####'
echo "### Log Started on: $(date +%d-%m-%y-%H:%M) ###" | tee -a "$out_log"


check_ps_rsyncs_for_duplicates () {
  # Check that no rsync job is running multiple times (e.g. due to network
  # problems or rsync.pl bugs).
  #
  # The job names are cut out of the "if [ ... /tmp/rsync.lock.<job> ]"
  # fragments found in the process list. For every job the number of process
  # list lines mentioning /tmp/rsync.lock.<job> is counted.
  #
  # Globals: check_proc_string (read), out_log (appended to)
  # Outputs: one "[ OK ]" or "[ FAILED ]" line per job on stdout and in $out_log
  local jobs job proc_cnt status

  # Take the job list once so the printed list and the checked list agree.
  jobs=$(ps axu | grep -oh "$check_proc_string" | cut -d '.' -f 3 \
    | awk '{ print $1 }' | sed -e 's#]##g' | sort -rn | uniq)

  echo
  echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" | tee -a "$out_log"
  echo "Running: ps axu|grep -i /tmp/rsync.lock to check for duplicate processes" | tee -a "$out_log"
  echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" | tee -a "$out_log"
  echo | tee -a "$out_log"
  echo -e "Current Running Rsync Jobs are: \n\n${jobs}\n"
  echo "++++++++++++++++++++++++++++++++++++++" | tee -a "$out_log"

  # Word splitting of $jobs is intended: one job name per line.
  for job in $jobs; do

    sleep 1

    # The former code sampled the count five times back to back and kept only
    # the last sample; a single sample gives the same verdict.
    proc_cnt=$(ps axu | grep -i -- "/tmp/rsync.lock.$job" | grep -vc grep)

    # rsync.pl if condition shows duplicate in ps list thus check for cnt 2;
    # a count of 0 means the job ended in the meantime.
    if (( proc_cnt == 2 || proc_cnt == 0 )); then
      status='ok'
    else
      status='failed'
    fi

    if [[ "$status" == 'ok' ]]; then
      echo "$(date +%d-%m-%y-%H:%M): Rsync Job: $job is FINE on $(hostname) as shown in  ps axu (process list) [ OK ]" | tee -a "$out_log"
    else
      echo "$(date +%d-%m-%y-%H:%M)  Rsync Job: $job seems BROKEN [ FAILED ] Please check !" | tee -a "$out_log"
    fi

  done
}


check_if_get_dir_exists () {
  # Check that the get_dir of the current job (index $f) exists as a local
  # directory and log the result.
  #
  # Globals: f, g_dir, g_server, rsync_conf (read), out_log (appended to)
  local dir=${g_dir[$f]}

  if [[ -d "$dir" ]]; then
    echo "$(date +%d-%m-%y-%H:%M): get_dir directory in $rsync_conf existing on ${g_server[$f]} [ OK ]" | tee -a "$out_log"
  else
    # The [ FAILED ] token is what the mail alert greps the log for; without
    # it a missing get_dir never raised an alert.
    echo "$(date +%d-%m-%y-%H:%M): $dir not existing [ FAILED ] ... Please check." | tee -a "$out_log"
  fi
}


check_if_put_dirs_exists () {
  # Check over ssh that the put_dir of the current job (index $f) exists on
  # its put_server and log the result.
  #
  # Globals: f, p_dir, p_server, user, CHECK_CMD_L (read), out_log (appended to)
  local host=${p_server[$f]}
  local dir=${p_dir[$f]}
  local rem_cmd2

  echo "P_DIR ${dir}"
  echo "Running: ssh -q -i /opt/imal/etc/rsync_key  -l $user $host $CHECK_CMD_L $dir 2>&1" | tee -a "$out_log"

  # Decide on the exit status of the remote command. The former check counted
  # the lines of `echo $output`, which is always 1, so it could never fail.
  # $CHECK_CMD_L is left unquoted on purpose: it holds a command plus options.
  # NOTE(review): assumes the remote rsyncwrapper passes the exit status of
  # the command through - confirm.
  if rem_cmd2=$(ssh -q -i /opt/imal/etc/rsync_key -l "$user" "$host" $CHECK_CMD_L "$dir" 2>&1); then
    echo "$(date +%d-%m-%y-%H:%M): Directory put_dir $dir on put_server $host Exists [ OK ]" | tee -a "$out_log"
  else
    echo "$(date +%d-%m-%y-%H:%M): Directory put_dir $dir on put_server $host Not Exists [ FAILED ] Please check !" | tee -a "$out_log"
  fi
}


check_and_log_if_put_server_pingable () {
  # Check and log if the put_server of the current job (index $f) is pingable.
  #
  # Globals: f, p_server (read), out_log (appended to)
  local host=${p_server[$f]}

  echo "Running: ping -c 1 $host" | tee -a "$out_log"

  # Use ping's exit status. The former test only checked that ping printed
  # something on stdout, which it also does when every packet is lost.
  if ping -c 1 "$host" >/dev/null 2>&1; then
    echo "$(date +%d-%m-%y-%H:%M): ping to $host Successful [ OK ]" | tee -a "$out_log"
  else
    echo "$(date +%d-%m-%y-%H:%M): ping to $host [ FAILED ] ! Please check !" | tee -a "$out_log"
  fi
}


check_if_remote_cmd_returns_positive () {
  # Check if a remote command returns positive on the put_server of the
  # current job (index $f), i.e. the server is up and the ssh key works.
  #
  # Globals: f, p_server, user, CHECK_CMD (read), out_log (appended to)
  local host=${p_server[$f]}

  echo "Running: ssh -q -i /opt/imal/etc/rsync_key -l $user $host $CHECK_CMD;" | tee -a "$out_log"

  # Decide on the exit status of ssh. The former check counted the lines of
  # `echo $output`, which is always 1, so a failed login was reported as OK.
  # $CHECK_CMD is left unquoted on purpose so a command with options works.
  if ssh -q -i /opt/imal/etc/rsync_key -l "$user" "$host" $CHECK_CMD >/dev/null; then
    echo "$(date +%d-%m-%y-%H:%M): $host $CHECK_CMD Successful [ OK ]" | tee -a "$out_log"
  else
    echo "$(date +%d-%m-%y-%H:%M): $host $CHECK_CMD Unsucessful [ FAILED ] Please check !" | tee -a "$out_log"
  fi
}


check_only_cron_existing_rsync_conf_sets () {

# Check Only for cron existing rsync.pl entries (skip non-used rsync.conf sets)

if [[ $i == ${job[$f]} ]]; then

echo "Running: ssh -q -i /opt/imal/etc/rsync_key  -l $user ${p_server[$f]} $CHECK_CMD_P ${p_nfs_filer[$f]}" | tee -a $out_log;

p_nfs_result=$(ssh -q -i /opt/imal/etc/rsync_key  -l $user ${p_server[$f]} $CHECK_CMD_P ${p_nfs_filer[$f]} 2>&1);
p_nfs_result_cnt=$(echo $p_nfs_result |grep 'unknown'|wc -l);

echo "P_NFS $p_nfs_result_cnt";
# if cmd returns 1 then p_nfs_filer is not pingable else ok
if [ $p_nfs_result_cnt == '1' ]; then
echo "$(date +%d-%m-%y-%H:%m): ping from src: ${p_server[$f]} to ${p_nfs_filer[$f]} [ FAILED ]  Please check !" | tee -a $out_log
else
echo "$(date +%d-%m-%y-%H:%m): ping from src: ${p_server[$f]} to ${p_nfs_filer[$f]} Sucessful [ OK ]" | tee -a $out_log
fi

fi

}


check_if_get_nfs_filers_locally_mounted () {
  # Check if the get_nfs storage filer of the current job (index $f) shows up
  # in the local mount table and log the result.
  #
  # Globals: f, g_nfs_filer, g_server (read), out_log (appended to)
  local filer=${g_nfs_filer[$f]}
  local mount_res=''

  echo "Running: mount|grep -i $filer |wc -l" | tee -a "$out_log"

  # An empty filer name would match every mount line, so it counts as
  # "not mounted". The name is matched literally (-F), ignoring case.
  if [[ -n "$filer" ]]; then
    mount_res=$(mount | grep -iF -- "$filer")
  fi

  # Decide on whether any mount line matched. The former check counted the
  # lines of `echo $output`, which is always 1, so it could never fail.
  if [[ -n "$mount_res" ]]; then
    echo "$(date +%d-%m-%y-%H:%M): $filer mounted on ${g_server[$f]} [ OK ]" | tee -a "$out_log"
    echo -e "Mount Info:\n $mount_res\n" | tee -a "$out_log"
  else
    echo "$(date +%d-%m-%y-%H:%M): $filer Not mounted on ${g_server[$f]} [ FAILED ] Please check !" | tee -a "$out_log"
  fi
}


_rsync_conf_value () {
  # Print the value of the "key|value" line(s) of one rsync.conf section.
  #   $1 - text of the section
  #   $2 - key prefix including the trailing '|' (e.g. 'get_server|')
  printf '%s\n' "$1" | grep -i -- "$2" | sed -e "s#$2##" | grep -E -v 'grep|#'
}

loop_over_cron_rsync_jobs () {
  # Loop over the rsync.pl jobs found in root's crontab (only active jobs are
  # checked). For each job the matching section of $rsync_conf is extracted,
  # its parameters are stored at index $f of the global arrays job, g_server,
  # p_server, g_nfs_vol, p_nfs_vol, g_nfs_filer, p_nfs_filer, p_dir and g_dir,
  # the parameters are logged, and all per-job checks are run.
  #
  # Globals: rsync_conf, g_serv, p_serv, g_nfs, p_nfs, g_dir[0], p_dir[0]
  #          (read); i, f and the arrays listed above (written; the check
  #          functions read them); out_log (appended to)
  local conf_block get_nfs put_nfs

  # Job slots start at 1: index 0 of g_dir/p_dir holds the 'get_dir|' and
  # 'put_dir|' grep keys and must not be overwritten.
  f=0

  for i in $(crontab -u root -l | grep -oh '/opt/imal/bin/rsync.pl.*) ' | grep -v grep | cut -d ' ' -f 2 | sed -e 's#)##g'); do

    f=$(( f + 1 ))

    # remove comments in rsync.conf and extract the block from the first line
    # matching the job name up to the next "exclude" line
    conf_block=$(grep -v '#' "$rsync_conf" | perl -ne "print if /$i/../exclude/")
    sleep 1

    # section name: the "[name]" line without its brackets
    job[$f]=$(printf '%s\n' "$conf_block" | grep -E '\[.*\]' | sed -e 's#\[##g' -e 's#\]##g' | awk '{ print $1 }' | grep -E -v 'grep|#')

    # def arr strings taken from rsync.conf
    g_server[$f]=$(_rsync_conf_value "$conf_block" "$g_serv")
    p_server[$f]=$(_rsync_conf_value "$conf_block" "$p_serv")

    # get_nfs / put_nfs are "filer:volume" pairs
    get_nfs=$(_rsync_conf_value "$conf_block" "$g_nfs")
    put_nfs=$(_rsync_conf_value "$conf_block" "$p_nfs")
    g_nfs_vol[$f]=$(printf '%s\n' "$get_nfs" | cut -d ':' -f2)
    p_nfs_vol[$f]=$(printf '%s\n' "$put_nfs" | cut -d ':' -f2)
    g_nfs_filer[$f]=$(printf '%s\n' "$get_nfs" | cut -d ':' -f1)
    p_nfs_filer[$f]=$(printf '%s\n' "$put_nfs" | cut -d ':' -f1)

    # put_dir keeps the part after the first ':', get_dir the part before it
    p_dir[$f]=$(_rsync_conf_value "$conf_block" "${p_dir[0]}" | cut -d ':' -f2)
    g_dir[$f]=$(_rsync_conf_value "$conf_block" "${g_dir[0]}" | cut -d ':' -f1)

    echo -e "### Checking Job Name ${job[$f]} on $(date +%d-%m-%y-%H:%M) ###" | tee -a "$out_log"
    echo -e "Job Name: ${job[$f]}" | tee -a "$out_log"
    echo -e '# Check Parameters #' | tee -a "$out_log"
    echo -e "get_server: ${g_server[$f]}" | tee -a "$out_log"
    echo -e "get_nfs_vol: ${g_nfs_vol[$f]}" | tee -a "$out_log"
    echo -e "put_nfs_vol: ${p_nfs_vol[$f]}" | tee -a "$out_log"
    echo -e "get_nfs: ${g_nfs_filer[$f]}" | tee -a "$out_log"
    echo -e "get_dir: ${g_dir[$f]}" | tee -a "$out_log"
    echo -e "put_server: ${p_server[$f]}" | tee -a "$out_log"
    echo -e "put_nfs ${p_nfs_filer[$f]}" | tee -a "$out_log"
    echo -e "put_dir ${p_dir[$f]}" | tee -a "$out_log"
    echo -e '###########################' | tee -a "$out_log"

    check_if_get_dir_exists

    check_if_put_dirs_exists

    check_if_get_nfs_filers_locally_mounted

    check_and_log_if_put_server_pingable

    check_if_remote_cmd_returns_positive

    check_only_cron_existing_rsync_conf_sets

  done
}


check_tmp_pid () {
  # For every running /opt/imal/bin/rsync.pl process found in ps -afe, check
  # that the lock file rsync.lock.<job> exists in the lock directory. 10
  # passes are made to mitigate false positives; the verdict of the last pass
  # that saw running jobs counts. If a lock is missing a mail is sent to
  # $MAIL (the pid has to be deleted manually and the job reviewed /
  # restarted if hanged).
  #
  # Globals: MAIL, HOSTNAME (read), PID_NOT_EXIST (written: 1 = a lock is
  #          missing, 0 = all locks found, empty = no running job seen),
  #          RSYNC_ETC_LOCK_DIR (optional override of the lock directory,
  #          default /opt/imal/etc)
  local lock_dir=${RSYNC_ETC_LOCK_DIR:-/opt/imal/etc}
  local pass running_job pass_result

  PID_NOT_EXIST=''

  for (( pass = 1; pass <= 10; pass++ )); do

    # A single job without a lock marks the whole pass as failed. The former
    # code let the last job of the pass overwrite the result, which hid
    # missing locks of earlier jobs.
    pass_result=''

    # timeout ps output to be wiped out due to timeout 86000 returned results
    for running_job in $(ps -afe | grep /opt/imal/bin/rsync.pl | grep -v grep | grep -v log | grep -v timeout | grep rsync | awk '{ print $10 }' | sort); do
      if [[ ! -f "$lock_dir/rsync.lock.$running_job" ]]; then
        pass_result='1'
      elif [[ -z "$pass_result" ]]; then
        pass_result='0'
      fi
    done

    # A pass that saw no running job leaves the previous verdict untouched.
    if [[ -n "$pass_result" ]]; then
      PID_NOT_EXIST=$pass_result
    fi

    sleep 3

  done

  if [[ "$PID_NOT_EXIST" == '1' ]]; then
    echo 'Sending mail to notify'
    echo "check rsync lock files $HOSTNAME" | mail -s "ERROR HEC RSYNC $HOSTNAME" "$MAIL"
  fi

  if [[ "$PID_NOT_EXIST" == '0' ]]; then
    echo "$(date +%d-%m-%y-%H:%M): All seems good with Rsync Jobs processes and /opt/imal/etc/rsync.lock.* Pids [ OK ]"
  fi
}


clean_obsolete_pids () {
  # Remove obsolete rsync.lock.set* files left behind while no process is
  # running for them (that might occur if a cron job or rsync crashes). A
  # lock is obsolete when it exists in only one of the two lock directories
  # and no line of the process list mentions its name.
  # Runs only if CLEAN_OBSOLETE_PIDS is set to 1 in the script header.
  #
  # Globals: CLEAN_OBSOLETE_PIDS (read), out_log (appended to),
  #          RSYNC_TMP_LOCK_DIR / RSYNC_ETC_LOCK_DIR (optional overrides of
  #          the lock directories, defaults /tmp and /opt/imal/etc)
  local tmp_dir=${RSYNC_TMP_LOCK_DIR:-/tmp}
  local etc_dir=${RSYNC_ETC_LOCK_DIR:-/opt/imal/etc}
  local file name

  # The former test compared the literal word CLEAN_OBSOLETE_PIDS with '1'
  # and was therefore never true.
  [[ "$CLEAN_OBSOLETE_PIDS" == '1' ]] || return 0

  # Walk the locks of both directories; looping over /tmp only could never
  # find a lock that exists in /opt/imal/etc alone.
  for file in "$tmp_dir"/rsync.lock.set* "$etc_dir"/rsync.lock.set*; do

    # skips an unmatched glob pattern and files removed meanwhile
    [[ -f "$file" ]] || continue
    name=${file##*/}

    # keep every lock that is still mentioned in the process list
    if ps axu | grep -i -- "$name" | grep -qv grep; then
      continue
    fi

    if [[ -f "$tmp_dir/$name" && ! -f "$etc_dir/$name" ]]; then
      echo "rsync Job | $name | and pid $name missing in $etc_dir/$name but existing in $tmp_dir/$name"
      echo "Found PROBLEM !! Running: rm -f $tmp_dir/$name . Cleaning obsolete left pid $name" | tee -a "$out_log"
      rm -f -- "$tmp_dir/$name"
    elif [[ -f "$etc_dir/$name" && ! -f "$tmp_dir/$name" ]]; then
      echo "rsync Job | $name | and pid missing in $tmp_dir/$name but existing in $etc_dir/$name"
      echo "Found PROBLEM !! Running: rm -f $etc_dir/$name . Cleaning obsolete left pid $name" | tee -a "$out_log"
      rm -f -- "$etc_dir/$name"
    fi

  done
}


check_tmp_mail_alert () {
  # Run the lock file checks, then send one alert mail if $out_log contains
  # FAILED lines. After a mail the marker file $mail_sent_pid is touched and
  # suppresses further mails until it is older than $older_than minutes.
  #
  # NOTE(review): this function only greps $out_log and does not truncate it,
  # so FAILED lines of earlier runs are matched as well.
  #
  # Globals: out_log (read, appended to), mail_sent_pid, older_than, MAIL (read)
  local failed_lines subject

  check_tmp_pid
  clean_obsolete_pids

  # Expire an old marker first. The former code did this right after touching
  # the marker, inside the "no marker exists" branch, so the marker was never
  # old enough and no second mail could ever be sent.
  if [[ -f "$mail_sent_pid" ]] && [[ -n "$(find "$mail_sent_pid" -type f -mmin +"$older_than")" ]]; then
    echo "Pid older than $older_than deleting ..."
    rm -f -- "$mail_sent_pid"
  fi

  # Check error log and send mail
  failed_lines=$(grep -i 'FAILED' "$out_log")
  if [[ -n "$failed_lines" ]] && [[ ! -f "$mail_sent_pid" ]]; then
    # A mail subject has to be a single line: use the most recent FAILED line
    # as subject and list all FAILED lines in the mail body.
    subject=$(printf '%s\n' "$failed_lines" | tail -n 1)
    printf '%s\n\n%s\n' "Check rsync on $(hostname) check $out_log" "$failed_lines" | mail -s "$subject" "$MAIL"
    echo "MAIL SENT to $MAIL on $(date +%d-%m-%y-%H:%M)" | tee -a "$out_log"
    touch "$mail_sent_pid"
  fi
}


main () {
  # Entry point: run the three top-level stages one after another -
  # duplicate process check, per-job checks for every cron rsync job,
  # lock file checks with the mail alert.
  local stage
  for stage in \
    check_ps_rsyncs_for_duplicates \
    loop_over_cron_rsync_jobs \
    check_tmp_mail_alert
  do
    "$stage"
  done
}

main
