<?php

	// Computes, for one language, how well the lemmas attached to each document's
	// annotations (IATE, EuroVoc, terms, lemmas) cover the lemmas returned by
	// load_document_lemmas() for that document.
	//
	// Usage: php <script> <language> <config.ini>
	//
	// Output: 'missing_lemmas_<language>.txt' in the working directory (sorted,
	// de-duplicated document lemmas that no annotation covered) and the average
	// per-document coverage on STDERR.
	mb_internal_encoding('UTF-8');

	if ( $argc < 3 )
		die('ERROR: Not enough parameters.' . "\n");

	$language = $argv[1];

	$config_file = $argv[2];
	if ( !file_exists($config_file) )
		die('ERROR: Cannot open file: ' . $config_file . "\n");

	$config = parse_ini_file($config_file);
	if ( $config === false )
		die('ERROR: Cannot parse file: ' . $config_file . "\n");

	$language_positions = get_language_positions($config);
	$corpora_folder = rtrim($config['CORPORA_FOLDER'], '/');
	$corpus_folder = $corpora_folder . '/' . $language . '/conllup';

	// Create the per-language output folder; recursive so a missing parent does not fail.
	$annotated_folder = $config['DATA_ANNOTATED_FOLDER'] . '/' . $language;
	if ( !file_exists($annotated_folder) )
		mkdir($annotated_folder, 0777, true);

	if ( $config['USE_INTERSECT'] == 1 )
		$multi_corpus_file = $config['MULTI_CORPUS_FOLDER']  . '/multi_corpus_intersected_' . $language . '.json';
	else
		$multi_corpus_file = $config['MULTI_CORPUS_FOLDER']  . '/multi_corpus_' . $language . '.json';

	$iate_lemmas_file = $config['IATE_FOLDER']  . '/iate_lemmas_' . $language . '.json';
	$eurovoc_lemmas_file = $config['EUROVOC_FOLDER']  . '/eurovoc_lemmas_' . $language . '.json';

	// Reads a JSON file into an associative array; returns null when the file is
	// unreadable or does not decode to an array.
	$read_json = function ($path) {
		if ( !is_file($path) || !is_readable($path) )
			return null;
		$decoded = json_decode(file_get_contents($path), true);
		return is_array($decoded) ? $decoded : null;
	};

	fwrite(STDERR, "Loading data...\n");

	// The IATE / EuroVoc dictionaries are optional: without them the matching codes
	// simply contribute no lemmas.
	$iate_lemmas = $read_json($iate_lemmas_file);
	if ( $iate_lemmas === null ) {
		fwrite(STDERR, 'WARNING: Cannot load file: ' . $iate_lemmas_file . "\n");
		$iate_lemmas = array();
	}
	$eurovoc_lemmas = $read_json($eurovoc_lemmas_file);
	if ( $eurovoc_lemmas === null ) {
		fwrite(STDERR, 'WARNING: Cannot load file: ' . $eurovoc_lemmas_file . "\n");
		$eurovoc_lemmas = array();
	}

	$terms_simple = load_terms($config['TERMS_FILE'], $language);
	$lemmas_simple = load_lemmas($config['LEMMAS_FILE'], $language);

	// Nothing can be computed without the document -> annotation-codes map.
	$multi_corpus = $read_json($multi_corpus_file);
	if ( $multi_corpus === null )
		die('ERROR: Cannot load file: ' . $multi_corpus_file . "\n");
	fwrite(STDERR, "DONE\n");

	fwrite(STDERR, "Calculating coverage...\n");
	$document_names = array_keys($multi_corpus);
	$total = count($document_names);
	$total_documents = 0;
	$total_coverage = 0;
	$missing_lemmas = array();
	foreach ( $document_names as $i => $document_name ) {

		// Start from an empty set for every document so nothing leaks from the previous one.
		$annotation_lemmas = array();

		$document_data = $multi_corpus[$document_name];
		if ( !is_array($document_data) )
			$document_data = array();

		foreach ( $document_data as $code => $count ) {

			// The first two characters of the code select the dictionary it belongs to.
			$prefix = mb_strtolower(mb_substr($code, 0, 2));
			$names = array();

			if ( $prefix == 'ia' ) {

				$iate_id = str_replace('IATE_', '', $code);
				if ( isset($iate_lemmas[$iate_id]) && is_array($iate_lemmas[$iate_id]) )
					$names = $iate_lemmas[$iate_id];

			} else if ( $prefix == 'eu' ) {

				$eurovoc_id = str_replace('Eurovoc_', '', $code);
				if ( isset($eurovoc_lemmas[$eurovoc_id]) && is_array($eurovoc_lemmas[$eurovoc_id]) )
					$names = $eurovoc_lemmas[$eurovoc_id];

			} else if ( $prefix == 'te' ) {

				// Codes without an entry contribute nothing (previously they added an empty-string lemma).
				if ( isset($terms_simple[$code]) )
					$names = array($terms_simple[$code]);

			} else if ( $prefix == 'le' ) {

				if ( isset($lemmas_simple[$code]) )
					$names = array($lemmas_simple[$code]);
			}

			// Each name is a space-separated sequence of lemmas.
			foreach ( $names as $name )
				foreach ( explode(' ', $name) as $lemma )
					$annotation_lemmas[] = $lemma;
		}

		$annotation_lemmas = array_unique($annotation_lemmas);

		// Documents without any annotation lemma are left out of the average.
		if ( count($annotation_lemmas) == 0 )
			continue;

		$document_lemmas = load_document_lemmas($corpus_folder . '/' . $document_name . '.conllup');
		$lemmas_count = count($document_lemmas);

		// A document with no lemmas (e.g. missing file) has no defined coverage;
		// skipping it avoids a division by zero.
		if ( $lemmas_count == 0 )
			continue;

		$total_documents++;

		$uncovered = array_diff($document_lemmas, $annotation_lemmas);
		foreach ( $uncovered as $lemma )
			$missing_lemmas[] = $lemma;

		$total_coverage += 1 - count($uncovered) / $lemmas_count;

		if ( $i % 100 == 0 )
			fwrite(STDERR, round($i * 100 / $total, 2) . "%      \r");
	}
	fwrite(STDERR, "100%    \n");
	fwrite(STDERR, "DONE\n");

	$missing_lemmas = array_unique($missing_lemmas);
	sort($missing_lemmas);
	file_put_contents('missing_lemmas_' . $language . '.txt', implode("\n", $missing_lemmas));

	// Average of the per-document coverages, reported as a percentage; 0 when no document qualified.
	$coverage = ( $total_documents > 0 ) ? round($total_coverage / $total_documents, 4) * 100 : 0;
	fwrite(STDERR, 'Coverage: ' . $coverage . "%\n");

	/**
	 * Collects the distinct, lower-cased lemmas of one tab-separated corpus file.
	 *
	 * Only rows with at least 14 tab-separated fields are considered. The lemma is
	 * read from field 2 and the part-of-speech tag from field 3 (field 4 when the
	 * global $language is 'hr'). Rows whose tag is not one of
	 * NOUN, ADJ, VERB, N, A, V are ignored.
	 *
	 * @param string $document_path Path of the file to read.
	 * @return array Sorted, re-indexed list of unique lower-cased lemmas; empty
	 *               when the file cannot be read.
	 */
	function load_document_lemmas($document_path) {

		global $language;

		// An unreadable file yields no lemmas instead of iterating over file()'s false.
		$lines = ( is_file($document_path) && is_readable($document_path) ) ? file($document_path) : false;
		if ( $lines === false ) {
			fwrite(STDERR, 'WARNING: Cannot read file: ' . $document_path . "\n");
			return array();
		}

		// The tag column depends only on the language, so pick it once.
		$pos_column = ( $language == 'hr' ) ? 4 : 3;
		$accepted_tags = array('NOUN', 'ADJ', 'VERB', 'N', 'A', 'V');

		$document_lemmas = array();
		foreach ( $lines as $line ) {

			$entry = explode("\t", trim($line));
			if ( count($entry) < 14 )
				continue;

			if ( !in_array($entry[$pos_column], $accepted_tags, true) )
				continue;

			$document_lemmas[] = mb_strtolower($entry[2]);
		}

		$document_lemmas = array_unique($document_lemmas);
		sort($document_lemmas);

		return $document_lemmas;
	}

	/**
	 * Loads the terms of one language from a tab-separated file.
	 *
	 * Every row must have exactly 7 tab-separated fields; the field to read is
	 * selected through the global $language_positions map (language => index).
	 * The result is keyed by 'TERM_' followed by the zero-based line index.
	 *
	 * @param string $terms_file Path of the terms file.
	 * @param string $language   Language code to look up in $language_positions.
	 * @return array Map 'TERM_<line index>' => term; empty when the language is
	 *               unknown or the file cannot be read.
	 */
	function load_terms($terms_file, $language) {

		global $language_positions;

		$terms = array();

		// An unknown language has no column to read from.
		if ( !isset($language_positions[$language]) )
			return $terms;
		$column = $language_positions[$language];

		$lines = ( is_file($terms_file) && is_readable($terms_file) ) ? file($terms_file) : false;
		if ( $lines === false ) {
			fwrite(STDERR, 'WARNING: Cannot read file: ' . $terms_file . "\n");
			return $terms;
		}

		foreach ( $lines as $i => $line ) {

			// NOTE(review): trim() also removes leading/trailing tabs, so a row whose
			// first or last field is empty no longer has 7 fields and is skipped —
			// confirm this is intended.
			$entry = explode("\t", trim($line));
			if ( count($entry) != 7 || !isset($entry[$column]) )
				continue;

			// Compare against '' explicitly: empty() would also discard the term "0".
			$term = $entry[$column];
			if ( $term !== '' )
				$terms['TERM_' . $i] = $term;
		}

		return $terms;
	}

	/**
	 * Loads the lemmas of one language from a tab-separated file.
	 *
	 * Every row must have exactly 7 tab-separated fields; the field to read is
	 * selected through the global $language_positions map (language => index).
	 * The result is keyed by 'LEMMA_' followed by the zero-based line index.
	 *
	 * @param string $lemmas_file Path of the lemmas file.
	 * @param string $language    Language code to look up in $language_positions.
	 * @return array Map 'LEMMA_<line index>' => lemma; empty when the language is
	 *               unknown or the file cannot be read.
	 */
	function load_lemmas($lemmas_file, $language) {

		global $language_positions;

		$lemmas = array();

		// An unknown language has no column to read from.
		if ( !isset($language_positions[$language]) )
			return $lemmas;
		$column = $language_positions[$language];

		$lines = ( is_file($lemmas_file) && is_readable($lemmas_file) ) ? file($lemmas_file) : false;
		if ( $lines === false ) {
			fwrite(STDERR, 'WARNING: Cannot read file: ' . $lemmas_file . "\n");
			return $lemmas;
		}

		foreach ( $lines as $i => $line ) {

			// NOTE(review): trim() also removes leading/trailing tabs, so a row whose
			// first or last field is empty no longer has 7 fields and is skipped —
			// confirm this is intended.
			$entry = explode("\t", trim($line));
			if ( count($entry) != 7 || !isset($entry[$column]) )
				continue;

			// Compare against '' explicitly: empty() would also discard the lemma "0".
			$lemma = $entry[$column];
			if ( $lemma !== '' )
				$lemmas['LEMMA_' . $i] = $lemma;
		}

		return $lemmas;
	}

	/**
	 * Builds a lookup from language code to its zero-based position in the
	 * comma-separated 'LANGUAGES' configuration entry.
	 *
	 * @param array $config Parsed configuration; must contain the key 'LANGUAGES'.
	 * @return array Map language code => position; surrounding whitespace of each
	 *               code is stripped.
	 */
	function get_language_positions($config) {

		$codes = array();
		foreach ( explode(',', $config['LANGUAGES']) as $raw_code )
			$codes[] = trim($raw_code);

		// Swap values and keys: the list index becomes the position of the code.
		return array_flip($codes);
	}

?>