import sys, os, subprocess, configparser

from datetime import datetime
from time import time
from statistics import mean

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

from lib.corpus import Corpus
from lib.embeddings import Embeddings
from lib.stopwords import StopWords
from lib.dictionary import Dictionary

def _feature_names(vectorizer):
	"""Return the vectorizer's terms, indexable by TF-IDF column number."""
	# get_feature_names() was removed in scikit-learn 1.2; use its
	# replacement when available and fall back on older releases.
	getter = getattr(vectorizer, 'get_feature_names_out', None)
	if getter is None:
		getter = vectorizer.get_feature_names
	return getter()

def _save_tfidf(path, tfidf_matrix, document_names, term_names):
	"""Write one 'document<TAB>term<TAB>weight' line per non-zero matrix cell."""
	# One COO conversion yields (row, column, value) triples directly, which
	# avoids an expensive sparse-matrix lookup for every single cell.
	cells = tfidf_matrix.tocoo()
	with open(path, 'w', encoding='utf-8') as out:
		for document_index, term_index, value in zip(cells.row, cells.col, cells.data):
			if value == 0:
				# explicitly stored zeros are skipped, as with nonzero()
				continue
			print(document_names[document_index] + '\t' + term_names[term_index] + '\t' + str(value), file=out)

def _write_rows(path, rows):
	"""Write every row as one line of space-separated values."""
	with open(path, 'w', encoding='utf-8') as out:
		for row in rows:
			out.write(' '.join(map(str, row)) + '\n')

def _run_php(script, *arguments, stdout=None):
	"""Run 'php <script> <arguments...>' without going through a shell.

	Arguments are passed verbatim, so paths containing spaces or shell
	metacharacters are safe. The exit status is ignored and a 'php'
	executable that cannot be started is only reported on stderr, so a
	failing helper script never aborts the run.
	"""
	try:
		subprocess.run(['php', script, *arguments], stdout=stdout)
	except OSError as error:
		print('could not run php ' + script + ': ' + str(error), file=sys.stderr)

def main() :
	"""Cluster the configured corpus with TF-IDF + mini-batch k-means.

	Reads the config file named by the first command-line argument (keys
	used: languages, clusters_count, data_folder, data_annotated_folder,
	results_folder), then creates a timestamped results folder and writes
	the TF-IDF values, the clusters, the centroids, the language
	distribution and a metadata file into it. Several PHP helper scripts
	are invoked along the way to post-process and evaluate the results.

	Prints the usage text and exits when no config file argument is given.
	"""

	if len(sys.argv) < 2 :
		usage()
		sys.exit()

	config_file				= sys.argv[1]
	config					= load_config(config_file)
	languages				= [name.strip() for name in config['languages'].split(',')]
	clusters_count			= int(config['clusters_count'])

	data_folder				= config['data_folder']
	data_annotated_folder	= config['data_annotated_folder']
	# the epoch timestamp keeps the results of separate runs apart
	results_folder			= config['results_folder'] + '/' + str(int(time())) + '_all'
	tfidf_file				= results_folder + '/tfidf.txt'
	centroids_file			= results_folder + '/centroids.txt'
	clusters_file			= results_folder + '/clusters.txt'
	distribution_file		= results_folder + '/language_distribution.txt'
	metadata_file			= results_folder + '/metadata.txt'

	os.makedirs(results_folder)

	print('loading corpus...', file=sys.stderr)
	corpus = Corpus()
	for language in languages:
		corpus.load_plain_content(data_folder + '/' + language)
	print('DONE', file=sys.stderr)

	documents = corpus.get_documents()
	document_names = corpus.get_document_names()

	print('calculating corpus TF-IDFs...', file=sys.stderr)
	tfidf_vectorizer = TfidfVectorizer(stop_words=[], lowercase=False)
	tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
	print('DONE', file=sys.stderr)

	print('saving corpus TF-IDFs...', file=sys.stderr)
	_save_tfidf(tfidf_file, tfidf_matrix, document_names, _feature_names(tfidf_vectorizer))
	print('DONE', file=sys.stderr)

	print('cluster documents...', file=sys.stderr)
	# a batch size of 0 (clusters_count < 3) is not a valid setting
	batch_size			= max(1, int(clusters_count / 2.5))
	kmeans_model		= MiniBatchKMeans(init='k-means++', n_clusters=clusters_count, batch_size=batch_size)
	vectors				= tfidf_matrix

	kmeans_model.fit(vectors)
	print('DONE', file=sys.stderr)

	cluster_labels		= kmeans_model.labels_
	cluster_centroids	= kmeans_model.cluster_centers_

	# group the document names by the cluster each document was assigned to
	label_clusters = [[] for _ in range(clusters_count)]
	for document_index, cluster_index in enumerate(cluster_labels):
		label_clusters[cluster_index].append(document_names[document_index])

	print('saving clusters and centroids...', file=sys.stderr)
	_write_rows(clusters_file, label_clusters)
	_write_rows(centroids_file, cluster_centroids)
	print('DONE', file=sys.stderr)

	_run_php('reorder_clusters.php', clusters_file)

	print('saving metadata...', file=sys.stderr)
	silhouette_index = silhouette_score(vectors, cluster_labels, metric='euclidean')
	# Densify once as a plain ndarray: both scores below need dense input,
	# and recent scikit-learn releases reject the np.matrix from todense().
	dense_vectors = vectors.toarray()
	davies_bouldin_index = davies_bouldin_score(dense_vectors, cluster_labels)
	calinski_harabasz_index = calinski_harabasz_score(dense_vectors, cluster_labels)

	# the helper's standard output becomes the language distribution file
	with open(distribution_file, 'wb') as distribution_out:
		_run_php('multilingual_clusters_distributions.php', clusters_file, stdout=distribution_out)

	metadata_lines = [
		'Description = Eurovoc from lemmas, Eurovoc with synonyms, IATE from lemmas, Intersection, TF-IDF vectors, All languages, All documents, k-Means' + '\n',
		'Clusters Count = ' + str(clusters_count) + '\n',
		'Documents Count = ' + str(len(documents)) + '\n',
		'Algorithm = k-Means\n',
		'Language = ALL' + '\n',
		'Date = ' + str(datetime.now()) + '\n',
		'Silhouette Index = ' + str(silhouette_index) + '\n',
		'Davies–Bouldin Index = ' + str(davies_bouldin_index) + '\n',
		'Calinski–Harabasz Index = ' + str(calinski_harabasz_index) + '\n',
		'Data Folder = ' + data_folder + '\n',
		'Annotated Data Folder = ' + data_annotated_folder + '\n',
		'Results Folder = ' + results_folder + '\n',
		'TF-IDF File = ' + tfidf_file + '\n',
		'Centroids File = ' + centroids_file + '\n',
		'Clusters File = ' + clusters_file + '\n',
		'Language Distribution File = ' + distribution_file + '\n',
		'Metadata File = ' + metadata_file + '\n',
		'Publish = 0\n',
	]
	with open(metadata_file, 'w', encoding='utf-8') as metadata_out:
		metadata_out.writelines(metadata_lines)
	print('DONE', file=sys.stderr)

	_run_php('groups_evaluate.php', results_folder)
	_run_php('clusters_domains.php', results_folder, config_file)

def usage() :
	"""Print the command-line usage banner to standard output."""
	rule = '---------------------------------------------------------'
	banner_lines = (
		'',
		rule,
		'Clusters multilingual corpora',
		'',
		'Usage:',
		'',
		'./python3 cluster_tfidf_kmeans.py <config_file>',
		'',
		'     <config_file>    - path to config',
		rule,
		'',
	)
	# print() appends the final newline after the closing rule's own newline
	print('\n'.join(banner_lines))

def load_config(config_file):
	"""Load a section-less 'key = value' config file into a plain dict.

	The file content is wrapped in a synthetic '[main]' section so that
	configparser accepts it. Keys come back lower-cased (configparser's
	default option transform) and all values are strings.

	config_file -- path to the config file
	Returns a dict mapping option names to their string values.
	Raises OSError if the file cannot be opened and configparser.Error if
	its content cannot be parsed.
	"""

	# Decode explicitly instead of relying on the platform default encoding.
	# 'utf-8-sig' also drops a leading BOM, which would otherwise end up
	# inside the first key name once the section header is prepended.
	with open(config_file, encoding='utf-8-sig') as f:
		file_content = '[main]\n' + f.read()

	config_parser = configparser.RawConfigParser()
	config_parser.read_string(file_content)

	return dict(config_parser.items('main'))

# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()