#!/bin/bash
# From here on, append everything this script writes to stdout and stderr
# to the orchestrator log file.
exec >> /marcell/logs/orchestrator.log 2>&1

# Uncomment the next line to enable bash command tracing (written to the log).
#set -x

# Name this script was started under; pidof looks up other instances by it.
self="$(basename "$0")"

# Single-instance guard: as soon as pidof reports a PID for this script name
# that is not our own, leave silently with status 1.
for pid in $(pidof -x "$self"); do
    [[ $pid == "$$" ]] && continue
    # echo_log is only defined further down, so the message stays disabled:
    # echo_log "[$(date)] : $self : Process is already running with PID $pid"
    exit 1
done

# The start time in epoch seconds serves as the run id.
runId="$(date +%s)"

# echo_log MESSAGE
# Print MESSAGE on stdout, prefixed with the current date, the global run id
# ($runId) and the script tag. A MESSAGE that starts with "ERROR" is fatal:
# after printing it the whole script exits with status 1.
function echo_log() {
	local message=$1
	printf '%s\n' "$(date) [$runId]:(orchestrator.sh) > $message"
	case $message in
		ERROR*) exit 1 ;;
	esac
}

# Decide between a full run (fullRun=1) and an incremental run (fullRun=0).
# Versions are scanned upwards from 1 for as long as a config directory or a
# resources directory exists for that version. The first version whose stored
# md5sum manifest still verifies is reused for an incremental run; otherwise
# configVersion ends up on the first version that has neither directory and
# fullRun stays 1.
fullRun=1
configVersion=1

until [ ! -d "/marcell/mnt/configs/config_v$configVersion" ] && [ ! -d "/marcell/mnt/resources/Clusterization_Resources_v$configVersion" ]; do
	if ! md5sum -c --status "/marcell/mnt/configs/config_v$configVersion/md5sum"; then
		# Manifest missing or not matching: try the next version.
		(( configVersion += 1 ))
		continue
	fi
	fullRun=0
	echo_log "INFO: found previous run with the same resources (/marcell/mnt/configs/config_v$configVersion) .Started incremental run"
	break
done

# Report an empty initial feeds directory.
# fullRun=1 means a full run is about to start, which cannot work without
# feeds (config.txt would be missing), so that case is the fatal one.
# The original test was inverted: it logged "Aborted full run" for an
# incremental run and "Started incremental run" right before a full run.
if [ "$(find /marcell/Initial_Feeds/ -mindepth 1 | wc -l)" -eq 0 ]; then
	if [ "$fullRun" -eq 1 ]; then
		# echo_log exits the script on messages starting with ERROR.
		echo_log "ERROR: empty initial feeds directory. Aborted full run"
	else
		echo_log "INFO: empty initial feeds directory. Started incremental run"
	fi
fi

# Configuration file of the selected config version.
CONFIGFULLPATH="/marcell/mnt/configs/config_v$configVersion/config.txt"

# Locations used while preparing a full run.
feedsDir="/marcell/Initial_Feeds"
configDir="/marcell/mnt/configs/config_v$configVersion"
resourcesDir="/marcell/mnt/resources/Clusterization_Resources_v$configVersion"

# feed_config_basename KEY [FILE]
# Print, for every line of FILE (default: the feed config.txt) that contains
# KEY, the text after the last "/" with trailing whitespace (including a CR)
# removed. Prints nothing when KEY is absent.
feed_config_basename() {
   grep -- "$1" "${2:-$feedsDir/config.txt}" | sed -e 's/.*\///' -e 's/[[:space:]]*$//'
}

# import_feed_folder KEY
# Resolve the folder name configured under KEY, abort (through echo_log) when
# it is not a directory inside the feeds, copy it into the versioned resources
# directory, append the new location to the generated config and store the
# bare folder name in the global variable named KEY.
import_feed_folder() {
   local key=$1 name
   name=$(feed_config_basename "$key")
   if [ -z "$name" ] || [ ! -d "$feedsDir/$name" ]; then
      echo_log "ERROR: config dir $key does not exist."
   fi
   cp -r "$feedsDir/$name" "$resourcesDir/"
   echo "$key = $resourcesDir/$name" >> "$CONFIGFULLPATH"
   printf -v "$key" '%s' "$name"
}

# import_feed_file KEY FOLDER_KEY
# Same as import_feed_folder, but for a single file that lives inside the
# folder previously imported under FOLDER_KEY.
import_feed_file() {
   local key=$1 folderKey=$2 folder name
   folder=${!folderKey}
   name=$(feed_config_basename "$key")
   if [ -z "$name" ] || [ ! -f "$feedsDir/$folder/$name" ]; then
      echo_log "ERROR: config file $folderKey/$name does not exist."
   fi
   cp "$feedsDir/$folder/$name" "$resourcesDir/$folder/$name"
   echo "$key = $resourcesDir/$folder/$name" >> "$CONFIGFULLPATH"
   printf -v "$key" '%s' "$name"
}

# add_generated_folder KEY DIR
# Create DIR (with parents) and record it under KEY in the generated config.
add_generated_folder() {
   mkdir -p "$2"
   echo "$1 = $2" >> "$CONFIGFULLPATH"
}

if [ "$fullRun" -eq 1 ]; then
   if [ ! -f "$feedsDir/config.txt" ]; then
      echo_log "ERROR: config file config.txt does not exist."
   fi

   # Start the generated config from the feed config, minus blank lines and
   # every line mentioning _FOLDER or _FILE; those are rewritten below.
   mkdir -p "$configDir"
   grep -v -P "(_FOLDER|_FILE|^$)" "$feedsDir/config.txt" > "$CONFIGFULLPATH"

   # Append a default for each setting the feed config does not mention.
   for default in \
      "IATE_FILTER_DUPLICATES = 1" \
      "FIRST_MATCH = 0" \
      "CLUSTERS_COUNT = 5000" \
      "DATA_EUROVOC_2 = 0" \
      "USE_INTERSECT = 0"; do
      grep -q "${default%% *}" "$CONFIGFULLPATH" || echo "$default" >> "$CONFIGFULLPATH"
   done

   mkdir -p "$resourcesDir"

   # The corpora are expected under /marcell/mnt and are referenced in place,
   # not copied.
   CORPORA_FOLDER=$(feed_config_basename CORPORA_FOLDER)
   if [ -z "$CORPORA_FOLDER" ] || [ ! -d "/marcell/mnt/$CORPORA_FOLDER" ]; then
      echo_log "ERROR: config dir CORPORA_FOLDER /marcell/mnt/$CORPORA_FOLDER does not exist."
   fi
   echo "CORPORA_FOLDER = /marcell/mnt/$CORPORA_FOLDER" >> "$CONFIGFULLPATH"

   import_feed_folder EUROVOC_FOLDER
   import_feed_folder IATE_FOLDER
   import_feed_folder TERM_FOLDER
   import_feed_folder LEMMA_FOLDER

   import_feed_file LEMMAS_FILE LEMMA_FOLDER
   import_feed_file TERMS_FILE TERM_FOLDER

   add_generated_folder RESULTS_FOLDER /marcell/mnt/Clusterization_Results
   add_generated_folder MULTI_CORPUS_FOLDER "$resourcesDir/Multi_Corpus"
   add_generated_folder DOCUMENT_CATEGORIES_FOLDER "$resourcesDir/Document_Categories"
   add_generated_folder DATA_FOLDER "$resourcesDir/Corpora_Representation/Baseline"
   add_generated_folder DATA_ANNOTATED_FOLDER "$resourcesDir/Corpora_Representation/Baseline_Annotated"

   # Drop Subversion metadata from the copied resources. -prune keeps find
   # from descending into a directory that is about to be deleted.
   find "$resourcesDir/" -type d -name ".svn" -prune -exec rm -rf -- '{}' +

   # Store the checksum manifest of the feeds; the version scan verifies it
   # with "md5sum -c" to recognise an unchanged set of resources.
   find "$feedsDir/" -type f | grep -v ".svn" | while IFS= read -r f; do md5sum "$f"; done > "$configDir/md5sum"

fi

# fix_conllup_names CORPORA_DIR
# For every direct sub-directory <lang> of CORPORA_DIR, rename each
# *.conllup file below <lang>/conllup/ whose name does not start with
# "<lang>-" to "<lang>-<name>", logging every rename through echo_log.
# Files are renamed inside the directory they were found in, and names are
# read NUL-delimited so that spaces in them are handled.
fix_conllup_names() {
   local corpora=$1 lang path dir base
   while IFS= read -r -d '' lang; do
      while IFS= read -r -d '' path; do
         dir=${path%/*}
         base=${path##*/}
         echo_log "INFO: mv $path $dir/${lang}-$base"
         mv -- "$path" "$dir/${lang}-$base"
      done < <(find "$corpora/$lang/conllup/" -type f -not -name "${lang}-*" -and -name "*.conllup" -print0)
   done < <(find "$corpora" -mindepth 1 -maxdepth 1 -type d -printf '%f\0')
}

# Workaround for nested directory relations: run from the processing directory
# with a local copy of the selected configuration. Both steps are fatal on
# failure, otherwise the following steps would read a stale or missing
# config.txt.
cd /marcell/Process_Resources/ || echo_log "ERROR: cannot change directory to /marcell/Process_Resources/"
cp "$CONFIGFULLPATH" config.txt || echo_log "ERROR: cannot copy $CONFIGFULLPATH to /marcell/Process_Resources/config.txt"

CONFIG="config.txt"

# Fix files with wrong naming convention
echo_log "INFO: Fixing files with wrong naming convention"
# The value is taken from the config with all spaces removed.
CORPORA_FOLDER=$(grep CORPORA_FOLDER "$CONFIG" | sed 's/ //g' | cut -d "=" -f2)
fix_conllup_names "$CORPORA_FOLDER"



# run_logged LABEL COMMAND [ARG...]
# Run COMMAND and copy every line of its stdout and stderr to this script's
# stdout, prefixed with the current date, the run id ($runId) and LABEL.
# The exit status of COMMAND is not propagated.
run_logged() {
        local label=$1 line
        shift
        # IFS= and -r keep indentation and backslashes of the child's output
        # intact; the extra -n test also emits a last line that has no
        # trailing newline instead of dropping it.
        "$@" |& while IFS= read -r line || [[ -n "$line" ]]; do
                echo "$(date) [$runId]:($label) > $line"
        done
}

# Full run only: start the four resource update scripts in parallel and wait
# for all of them before the corpora are processed.
if [ "$fullRun" -eq 1 ]; then
        echo_log "INFO: Started resources update"
        for script in update_eurovoc.php update_iate.php update_terms.php update_lemmas.php; do
                echo_log "INFO: Exec php /marcell/Process_Resources/$script $CONFIGFULLPATH"
                run_logged "$script" php "/marcell/Process_Resources/$script" "$CONFIG" &
        done

        wait
fi

# The remaining steps run one after another.
echo_log "INFO: Started corpora processing"
echo_log "INFO: Exec php /marcell/Process_Resources/update_multi_corpus.php $CONFIGFULLPATH"
run_logged update_multi_corpus.php php /marcell/Process_Resources/update_multi_corpus.php "$CONFIG"

echo_log "INFO: Started representation generation"
echo_log "INFO: exec php /marcell/Process_Resources/update_representation.php $CONFIGFULLPATH"
run_logged update_representation.php php /marcell/Process_Resources/update_representation.php "$CONFIG"

echo_log "INFO: Started clusterisation"
echo_log "INFO: exec python3 /marcell/Process_Resources/cluster_tfidf_kmeans.py $CONFIGFULLPATH"
run_logged cluster_tfidf_kmeans.py python3 /marcell/Process_Resources/cluster_tfidf_kmeans.py "$CONFIG"

echo_log "INFO: Completed"

