#!/bin/bash

# On an interrupt signal (INT, e.g. Ctrl-C) run message3, which prints a
# farewell notice and terminates the script.
trap message3 INT

# Functions
#######################################
# Print the first-time setup guide: how to make the initial manual sftp
# connection to odds.cpos.hku.hk so the host key is recorded in known_hosts.
# Commands the user has to type are underlined with ANSI escape sequences.
# Arguments: none
# Outputs:   14 lines of instructions on stdout
#######################################
message0(){
    # '%s' lines are printed verbatim; '%b' lines have their \e escapes expanded.
    printf '%s\n' "WELCOME for the first-time using cpos-sftp2hpcf!"
    printf '%b\n' \
        "For security reasons, you are required to establish your \e[4mfirst connection\e[0m to odds.cpos.hku.hk \e[4mmanually\e[0m." \
        "Kindly follow the instructions below to finish the first-time one-off setup and then restart cpos-sftp2hpcf." \
        "" \
        "Instructions (input the \e[4munderlined commands to the terminal\e[0m):" \
        "\e[4msftp -oConnectTimeout=120 odds.cpos.hku.hk\e[0m"
    printf '%s\n' \
        "#The authenticity of host 'odds.cpos.hku.hk' can't be established." \
        "#RSA key fingerprint is SHA256:483ukXiNDjGARDNKQi5Y2yxy9CUjcYGGXHLnGPH1jgc." \
        "#RSA key fingerprint is MD5:bb:4c:86:43:62:e1:89:ef:d7:21:de:ab:54:0d:19:c8." \
        "#Are you sure you want to continue connecting (yes/no)?"
    printf '%b\n' "\e[4myes\e[0m"
    printf '%s\n' \
        "#Warning: Permanently added 'odds.cpos.hku.hk' (RSA) to the list of known hosts." \
        "#<username>@odds.cpos.hku.hk's password:"
    printf '%b\n' "\e[4mCTRL-C\e[0m"
}

#######################################
# Print the tool banner: version, contact address and a short introduction.
# Arguments: none
# Outputs:   three lines on stdout
#######################################
message1(){
    local version_line="cpos-sftp2hpcf version 1.0 (04-08-2022)"
    local contact_line="Contact us: bioinfo.cpos@hku.hk"
    local intro_line="Introduction: cpos-sftp2hpcf is a tool for CPOS HPCF users to download, check md5sum and unzip data from CPOS sFTP server. The procedures briefly, you are required to prepare a config file and then start the tool. Once the tool is started, it will download the data in the master node and subsequently submit job(s) for check md5sum and unzip in the compute node."
    printf '%s\n' "${version_line}" "${contact_line}" "${intro_line}"
}
#######################################
# Print the usage line, surrounded by one blank line before and after.
# Arguments: none
# Outputs:   usage text on stdout
#######################################
message2(){
    local usage_line="Usage: sh cpos-sftp2hpcf.sh <config filename>"
    printf '\n%s\n\n' "${usage_line}"
}

#######################################
# Interrupt handler: print a blank line, an interrupt notice and a farewell,
# then terminate the script.
# Arguments: none
# Outputs:   three lines on stdout
#######################################
message3(){
    printf '\n%s\n%s\n' \
        "cpos-sftp2hpcf receives an interrupt signal. Exiting..." \
        "THANK YOU for using cpos-sftp2hpcf! Bye!"
    exit
}

#######################################
# Report that a file or folder from a previous run is in the way.
# Arguments: $1 - the path that already exists
#            $2 - what the user has to remove it from (e.g. "it" or a folder)
# Outputs:   one "[ERROR] ..." line on stdout
#######################################
message4(){
    local existing_path="${1}"
    local remove_target="${2}"
    printf '[ERROR] %s already exists! You have to remove %s from this working directory to proceed.\n' \
        "${existing_path}" "${remove_target}"
}

# Main
## Helpers used by the main flow

#######################################
# Map the total size of a project's archive parts to the name of the
# resource tier in the config (.resources.small / .medium / .large).
# Arguments: $1 - total size in bytes
# Outputs:   "small"  for sizes up to 536870912000 bytes (500 GiB),
#            "medium" for sizes up to 1073741824000 bytes (1000 GiB),
#            "large"  for anything bigger
#######################################
_resource_tier(){
    local size_bytes="${1:-0}"
    if (( size_bytes <= 536870912000 )); then
        echo "small"
    elif (( size_bytes <= 1073741824000 )); then
        echo "medium"
    else
        echo "large"
    fi
}

#######################################
# Extract one field from a JSON/YAML document with yq.
# Arguments: $1 - the document text
#            $2 - yq expression, e.g. '.project_id'
# Outputs:   the selected value on stdout
#######################################
_json_field(){
    printf '%s\n' "${1}" | yq e "${2}"
}

## Entry point
script_path="$(readlink -f "${0}")"
script_folder="$(dirname "${script_path}")"
work_folder="${PWD}"
if [[ "${1}" == "-h" || "${1}" == "--help" ]]; then
    message1
    message2
elif [[ "${1}" == "-v" || "${1}" == "--version" ]]; then
    message1
elif [[ -f "${1}" ]]; then
    config="${1}"
    module load yq/4.25.2 > /dev/null 2>&1
    module load expect/5.45 > /dev/null 2>&1
    echo "Checking..."
    ## Prerequisite
    ### The module output is silenced above, so verify the tools really are available.
    for tool in yq expect; do
        if ! command -v "${tool}" > /dev/null 2>&1; then
            echo "[ERROR] Required tool '${tool}' not found. Check that its module can be loaded."
            exit 1
        fi
    done

    keep=$(yq e -o=j -I=0 '.keep_downloaded_md5sum_zip_files' "${config}")
    if [[ "${keep}" != "false" && "${keep}" != "true" ]]; then
        echo "[ERROR] Unknown option for keep."
        echo 'Input true/false to "keep_downloaded_md5sum_zip_files" section in config. true: Keep the downloaded md5sum and zip file(s) after everything is ok. false: Vice versa.'
        exit 1
    fi

    if [[ ! -f "${script_folder}/pbs_cpos-md5sumN7za" ]]; then
        echo "[ERROR] Include our required script 'pbs_cpos-md5sumN7za' to the script directory: '${script_folder}'."
        exit 1
    fi

    ### The sFTP host must already be in known_hosts (by IP or by name);
    ### otherwise show the one-off manual setup guide and stop.
    if [[ ! -f ~/.ssh/known_hosts ]] \
        || ! grep -q -F -e '172.30.30.16' -e 'odds.cpos.hku.hk' ~/.ssh/known_hosts; then
        message0
        exit
    fi

    ## Read the project list once: one compact JSON document per line.
    ## Reading by line keeps values that contain whitespace intact.
    mapfile -t projects < <(yq e -o=j -I=0 '.projects[]' "${config}")

    ## Session folders left by earlier runs (nothing creates one during the checks).
    session_folders=()
    for folder in cpos_sftp2hpcf_session_*; do
        if [[ -d "${folder}" ]]; then
            session_folders+=("${folder}")
        fi
    done

    ### 1st loop
    for project in "${projects[@]}"; do
        ## Parse yaml config
        project_id=$(_json_field "${project}" '.project_id')
        sftp_username=$(_json_field "${project}" '.sftp_username')
        sftp_password=$(_json_field "${project}" '.sftp_password')
        unzip_password=$(_json_field "${project}" '.unzip_password')

        ## Avoid repeat download
        possible_names=("${project_id}" "${project_id}_AnalysisData" "${project_id}_AnalysisResult")
        status="proceed"
        for name in "${possible_names[@]}"; do
            ### Check work dir
            while IFS= read -r -d '' file; do
                message4 "${file}" "it"
                status="exit"
            done < <(find . -maxdepth 1 \
                \( -name "${name}" -type d \
                -o -name "${name}.md5sum" -type f \
                -o -name "${name}.zip.???" -type f \) -print0)
            ### Check sub dir
            if (( ${#session_folders[@]} > 0 )); then
                while IFS= read -r -d '' file; do
                    message4 "${file}" "$(dirname "${file}")"
                    status="exit"
                done < <(find "${session_folders[@]}" -maxdepth 1 \
                    \( -name "${name}" -type d \
                    -o -name "${name}.md5sum" -type f \
                    -o -name "${name}.zip.???" -type f \) -print0)
            fi
            if [[ "${status}" == "exit" ]]; then
                exit 1
            fi
        done

        ## Check destination
        ### The here-document is kept at column 0 because its closing EOC must not be indented.
        ### NOTE(review): username/password are interpolated into the expect (Tcl) script as-is;
        ### values containing characters such as '"', '[' or '$' will likely break it.
        expect << EOC
log_user 0
set timeout 120
spawn sftp -oConnectTimeout=120 ${sftp_username}@odds.cpos.hku.hk
expect "${sftp_username}@odds.cpos.hku.hk's password:"
send "${sftp_password}\n"
expect {
"Permission denied, please try again." { puts "\[ERROR\] ${project_id} sFTP username and/or password not correct! Update your config and try again!"; exit 1 }
"sftp>" { set counter 0 }
}
foreach name {${possible_names[@]}} {
send "ls data/\${name}\n"
sleep 5
expect {
"Can't ls*" { set counter [ expr \$counter + 1] }
"*No such file or directory" { set counter [ expr \$counter + 1] }
"sftp>" { continue }
}
send "ls data/\${name}.zip.001\n"
sleep 5
expect {
"Can't ls*" { set counter [ expr \$counter + 1] }
"*No such file or directory" { set counter [ expr \$counter + 1] }
"sftp>" { continue }
}
}
send "bye\n"
if { \$counter == 6 } {
    puts "\[ERROR\] ${project_id} not in destination! Update your config and try again!"
    exit 1
}
EOC
        expect_status=$?
        ### Any failure of the check (not only exit code 1) must stop the run.
        if (( expect_status != 0 )); then
            exit 1
        fi
    done

    ## Start
    ### 2nd loop
    ### ---
    ### The two loop approach will prevent errors thrown between download sessions.
    ###     one loop: (project1) check errors --> download --> (project2) check errors --> download --> ...
    ###     two loop: (project1..project2..) check errors --> (project1..project2..) download
    ### ---
    for project in "${projects[@]}"; do
        ## Parse yaml config
        project_id=$(_json_field "${project}" '.project_id')
        sftp_username=$(_json_field "${project}" '.sftp_username')
        sftp_password=$(_json_field "${project}" '.sftp_password')
        unzip_password=$(_json_field "${project}" '.unzip_password')

        ## Download
        download_folder="cpos_sftp2hpcf_session_$(date +%y.%m.%d.%H.%M.%S)_$(echo "${RANDOM}" | md5sum | head -c 10)"
        echo "$(date) Downloading ${project_id} to ${download_folder}..."
        ### Never start sftp unless we are inside the fresh session folder,
        ### otherwise the download would land in the working directory.
        if ! { mkdir -- "${download_folder}" && cd -- "${download_folder}"; }; then
            echo "[ERROR] Cannot create or enter ${download_folder}."
            exit 1
        fi
        ### The here-document is kept at column 0 because its closing EOC must not be indented.
        expect << EOC
set timeout -1
spawn sftp -oConnectTimeout=120 ${sftp_username}@odds.cpos.hku.hk
expect "${sftp_username}@odds.cpos.hku.hk's password:"
send "${sftp_password}\n"
expect "sftp>"
send "get -r data/${project_id}*\n"
expect "sftp>"
send "bye\n"
EOC
        echo ""
        if ! cd -- "${work_folder}"; then
            echo "[ERROR] Cannot return to ${work_folder}."
            exit 1
        fi

        ## Clean up
        ### Flatten the download: move every nested file to the top of the session folder.
        ### The list is collected first so that moving files does not disturb the traversal.
        nested_files=()
        while IFS= read -r -d '' file; do
            nested_files+=("${file}")
        done < <(find "${download_folder}" -mindepth 2 -type f -print0)
        for file in "${nested_files[@]}"; do
            filename=$(basename "${file}")
            if [[ ! -f "${download_folder}/${filename}" ]]; then
                mv -- "${file}" "${download_folder}"
            else
                echo "[ERROR] Found two ${filename} files. Is there a problem with the download?"
                exit 1
            fi
        done
        possible_basenames=()
        while IFS= read -r -d '' file; do
            possible_basenames+=("$(basename "${file}")")
        done < <(find "${download_folder}" -maxdepth 1 -type f -print0)
        ### Remove the now file-less sub-folders; a no-op when there are none.
        find "${download_folder}" -mindepth 1 -maxdepth 1 -type d -exec rm -r -- {} +

        ## Prepare for md5sum and 7za
        possible_names=("${project_id}" "${project_id}_AnalysisData" "${project_id}_AnalysisResult")
        ### Basenames that belong to a recognised archive set (exact names).
        claimed_basenames=()
        for name in "${possible_names[@]}"; do
            project_archive=()
            while IFS= read -r -d '' file; do
                project_archive+=("${file}")
            done < <(find "${download_folder}" -maxdepth 1 -name "${name}.zip.???" -print0)
            if (( ${#project_archive[@]} == 0 )); then
                continue
            fi

            for file in "${project_archive[@]}"; do
                claimed_basenames+=("$(basename "${file}")")
            done
            project_md5sum="${download_folder}/${name}.md5sum"
            if [[ ! -e "${project_md5sum}" ]]; then
                echo "[ERROR] ${name} md5sum not found. Is there a problem with the download?"
                exit 1
            fi
            claimed_basenames+=("${name}.md5sum")

            ## Adjust resources for md5sum and 7za
            ### Sum the archive part sizes in bash (64-bit integers).
            project_size=0
            while IFS= read -r part_size; do
                (( project_size += part_size ))
            done < <(stat -c '%s' -- "${project_archive[@]}")
            resource_tier=$(_resource_tier "${project_size}")
            extract_resources=$(yq e -o=j -I=0 ".resources.${resource_tier}" "${config}")

            q=$(_json_field "${extract_resources}" '.queue')
            p=$(_json_field "${extract_resources}" '.ppn')
            m=$(_json_field "${extract_resources}" '.mem')
            w=$(_json_field "${extract_resources}" '.walltime')
            e=$(_json_field "${extract_resources}" '.mail')

            echo "$(date) Submitting job for check md5sum and unzip of ${name}..."
            qsub -q "${q}" -l nodes=1:ppn="${p}",mem="${m}",walltime="${w}" -m "${e}" -N "log.cpos-md5sumN7za.${name}" "${script_folder}/pbs_cpos-md5sumN7za" -v download_folder="${download_folder}",keep="${keep}",name="${name}",unzip_password="${unzip_password}"
        done

        ## Unused files
        ### Warn about every downloaded file that no archive set claimed.
        for file in "${possible_basenames[@]}"; do
            is_claimed="false"
            for claimed in "${claimed_basenames[@]}"; do
                if [[ "${file}" == "${claimed}" ]]; then
                    is_claimed="true"
                    break
                fi
            done
            if [[ "${is_claimed}" == "false" ]]; then
                echo '[WARN] Found unknown cache file, "'"${download_folder}/${file}"'", cpos-sftp2hpcf will ignore and proceed per usual...'
            fi
        done
    done
    echo "THANK YOU for using cpos-sftp2hpcf! Bye!"
else
    echo "[ERROR] Unknown option for cpos-sftp2hpcf."
    message2
fi
