import sys, os, subprocess, configparser

from datetime import datetime
from time import time
from statistics import mean

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans

from lib.corpus import Corpus
from lib.embeddings import Embeddings
from lib.stopwords import StopWords
from lib.dictionary import Dictionary

def main():
	"""Cluster the configured corpus with k-means at several cluster counts.

	Reads the config file path from ``sys.argv[1]``, loads the plain content
	of every language listed under the ``languages`` config key from
	``<data_folder>/<language>``, turns the documents into TF-IDF vectors and
	fits one ``MiniBatchKMeans`` model per candidate cluster count
	(8000, 8500, 9000 and 9500).

	One ``<cluster count> <inertia>`` line per model is printed to stdout;
	progress messages go to stderr so the results can be redirected alone.

	Raises:
		SystemExit: if no config file path was given on the command line.
		KeyError: if the config lacks ``languages`` or ``data_folder``.
	"""
	if len(sys.argv) < 2:
		# Fail with a readable usage line instead of a bare IndexError.
		program = sys.argv[0] if sys.argv else 'script'
		sys.exit('usage: {} <config_file>'.format(program))

	config = load_config(sys.argv[1])
	# Skip empty entries (e.g. from a trailing comma); an empty name would
	# otherwise make the loader read the data folder itself.
	languages = [name.strip() for name in config['languages'].split(',') if name.strip()]
	data_folder = config['data_folder']

	print('loading corpus...', file=sys.stderr)
	corpus = Corpus()
	for language in languages:
		corpus.load_plain_content(os.path.join(data_folder, language))
	print('DONE', file=sys.stderr)

	documents = corpus.get_documents()

	print('calculating corpus TF-IDFs...', file=sys.stderr)
	# Empty stop-word list and lowercase=False: the vectorizer neither drops
	# stop words nor changes the case of the documents.
	tfidf_vectorizer = TfidfVectorizer(stop_words=[], lowercase=False)
	vectors = tfidf_vectorizer.fit_transform(documents)
	print('DONE', file=sys.stderr)

	print('cluster documents...', file=sys.stderr)
	for clusters_count in range(8000, 10000, 500):
		# Mini-batch size scales with the number of clusters being fitted.
		batch_size = int(clusters_count / 2.5)
		kmeans_model = MiniBatchKMeans(init='k-means++', n_clusters=clusters_count, batch_size=batch_size)
		kmeans_model.fit(vectors)
		# Report the inertia so the different cluster counts can be compared.
		print(clusters_count, kmeans_model.inertia_)
	print('DONE', file=sys.stderr)


def load_config(config_file):
	"""Parse a section-less ``key = value`` file into a plain dict.

	The file content is prefixed with a synthetic ``[main]`` header before it
	is handed to ``configparser``, and the options of that section are
	returned.

	Args:
		config_file: path of the configuration file to read.

	Returns:
		dict mapping option names (lowercased by configparser's default
		option transform) to their raw string values.
	"""
	parser = configparser.RawConfigParser()

	with open(config_file) as handle:
		# configparser refuses options outside a section, so supply one.
		parser.read_string('[main]\n' + handle.read())

	return {option: value for option, value in parser.items('main')}

# Run the clustering pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()