<?php

	mb_internal_encoding('UTF-8');

	// Expected invocation: php <this script> <language> <config .ini file>
	if ( $argc < 3 )
		die('ERROR: Not enough parameters.' . "\n");

	$language				= $argv[1];
	$config_file			= $argv[2];
	if ( !file_exists($config_file) )
		die('ERROR: Cannot open file: ' . $config_file . "\n");

	// parse_ini_file() returns false on a malformed file; stop here instead of
	// indexing into false further down.
	$config					= parse_ini_file($config_file);
	if ( $config === false )
		die('ERROR: Cannot parse file: ' . $config_file . "\n");

	$corpora_folder			= rtrim($config['CORPORA_FOLDER'], '/');
	$corpus_folder			= $corpora_folder . '/' . $language . '/conllup';

	// Reads one JSON search dictionary and aborts with a clear message when the
	// file is unreadable or does not decode to an array (a null dictionary would
	// otherwise only fail later, inside process_sentence()).
	$load_dictionary = function($path) {

		$json = file_get_contents($path);
		if ( $json === false )
			die('ERROR: Cannot open file: ' . $path . "\n");

		$dictionary = json_decode($json, true);
		if ( !is_array($dictionary) )
			die('ERROR: Invalid JSON dictionary: ' . $path . "\n");

		return $dictionary;
	};

	// Each dictionary is loaded only when its USE_* switch is set to 1 in the config.
	if ( $config['USE_EUROVOC'] == 1 )
		$eurovoc_search		= $load_dictionary($config['EUROVOC_FOLDER'] . '/eurovoc_lemmas_search_' . $language . '.json');
	if ( $config['USE_IATE'] == 1 )
		$iate_search		= $load_dictionary($config['IATE_FOLDER'] . '/iate_lemmas_search_' . $language . '.json');
	if ( $config['USE_TERM'] == 1 ) {
		// An earlier, disabled variant of this line read 'term_lemma_search_<language>.json' instead.
		$terms_search		= $load_dictionary($config['TERM_FOLDER'] . '/term_search_' . $language . '.json');
	}
	if ( $config['USE_LEMMA'] == 1 )
		$lemmas_search		= $load_dictionary($config['LEMMA_FOLDER'] . '/lemma_search_' . $language . '.json');

	$multi_corpus			= array();

	$folder_entries			= scandir($corpus_folder);
	if ( $folder_entries === false )
		die('ERROR: Cannot open folder: ' . $corpus_folder . "\n");

	// array_diff() keeps the scandir() keys, so re-index to get a 0-based
	// counter that is usable for the progress percentage below.
	$document_names			= array_values(array_diff($folder_entries, array('.', '..')));
	$total					= count($document_names);

	fwrite(STDERR, "Extracting multi corpus...\n");
	foreach ( $document_names as $i => $document_name ) {

		$sentences = get_sentences($corpus_folder . '/' . $document_name);
		$basename = pathinfo($document_name, PATHINFO_FILENAME);

		// Per-document map code => count, filled by process_sentence() through the global.
		$multi_corpus[$basename] = array();

		foreach ( $sentences as $sentence ) {

			// One flag per token; each dictionary pass receives the tokens already
			// claimed by the previous passes, so earlier dictionaries take priority.
			$positions = array_fill(0, count($sentence), 0);
			if ( $config['USE_EUROVOC'] == 1 )
				$positions = process_sentence($sentence, $basename, $positions, $eurovoc_search);
			if ( $config['USE_IATE'] == 1 )
				$positions = process_sentence($sentence, $basename, $positions, $iate_search);
			if ( $config['USE_TERM'] == 1 )
				$positions = process_sentence($sentence, $basename, $positions, $terms_search);
			// Last pass: the updated positions are not needed afterwards.
			if ( $config['USE_LEMMA'] == 1 )
				process_sentence($sentence, $basename, $positions, $lemmas_search);
		}

		if ( $i % 100 == 0 )
			fwrite(STDERR, round($i * 100 / $total, 2) . "%      \r");
	}
	fwrite(STDERR, "100%    \n");
	fwrite(STDERR, "DONE\n");

	// Order the codes of every document by descending count.
	foreach ( $multi_corpus as $document_name => $document_data )
		arsort($multi_corpus[$document_name]);

	$output_file = $config['MULTI_CORPUS_FOLDER'] . '/multi_corpus_' . $language . '.json';
	$output_json = json_encode($multi_corpus, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
	if ( $output_json === false )
		die('ERROR: Cannot encode multi corpus as JSON.' . "\n");
	if ( file_put_contents($output_file, $output_json) === false )
		die('ERROR: Cannot write file: ' . $output_file . "\n");

	/**
	 * Looks up dictionary terms in one sentence and adds the codes of the
	 * accepted matches to the global $multi_corpus entry of the document.
	 *
	 * For every token, the candidates stored under that lemma are compared with
	 * the space-joined lemmas starting at that token. Matches are then accepted
	 * longest first, and a match is skipped when any of its tokens is already
	 * flagged in $positions. Which codes are counted depends on the global
	 * $config['FIRST_MATCH']: 1 counts only the matched candidate's code, 0
	 * counts the code of every homonym of the candidate.
	 *
	 * @param array  $sentence          List of lemmas of one sentence.
	 * @param string $document_name     Key of the document inside $multi_corpus.
	 * @param array  $positions         One flag per token; 1 means the token is already covered by a match.
	 * @param array  $dictionary_search Map lemma => list of candidates, each with 'name', 'length' and 'code'.
	 * @return array $positions with the tokens of the newly accepted matches set to 1.
	 */
	function process_sentence($sentence, $document_name, $positions, $dictionary_search) {

		global $multi_corpus, $config;

		$matches = array();

		$length = count($sentence);
		foreach ( $sentence as $i => $lemma ) {

			if ( !array_key_exists($lemma, $dictionary_search) )
				continue;

			foreach ( $dictionary_search[$lemma] as $candidate ) {

				// The candidate would run past the end of the sentence.
				if ( $i + $candidate['length'] > $length )
					continue;

				// Compare as strings: a loose comparison would treat numeric-looking
				// phrases such as '1e3' and '1000' as equal.
				$phrase = implode(' ', array_slice($sentence, $i, $candidate['length']));
				if ( (string) $candidate['name'] !== $phrase )
					continue;

				$matches[] = array(
					'start' => $i,
					'candidate' => $candidate,
					'homonyms' => get_homonyms($candidate, $dictionary_search[$lemma]),
					// Discovery index, used as a tie-breaker when sorting.
					'order' => count($matches)
				);
			}
		}

		// Longest candidates first. Equal lengths keep their discovery order
		// explicitly, because usort() is only guaranteed to be stable since PHP 8.0.
		usort($matches, function($a, $b) {

			if ( $a['candidate']['length'] != $b['candidate']['length'] )
				return ( $a['candidate']['length'] < $b['candidate']['length'] ) ? 1 : -1;

			return $a['order'] - $b['order'];
		});

		$codes = array();

		foreach ( $matches as $match ) {

			// Skip matches that overlap tokens claimed by an earlier (longer or
			// higher-priority) match.
			if ( !check_match($match, $positions) )
				continue;

			$positions = mark_match($match, $positions);

			if ( $config['FIRST_MATCH'] == 1 )
				$counted = array($match['candidate']);
			else if ( $config['FIRST_MATCH'] == 0 )
				$counted = $match['homonyms'];
			else
				$counted = array();

			foreach ( $counted as $entry ) {

				$code = $entry['code'];
				if ( !array_key_exists($code, $codes) )
					$codes[$code] = 0;

				$codes[$code]++;
			}
		}

		// isset() also copes with a document entry that has not been created yet.
		foreach ( $codes as $code => $count ) {

			if ( !isset($multi_corpus[$document_name][$code]) )
				$multi_corpus[$document_name][$code] = 0;

			$multi_corpus[$document_name][$code] += $count;
		}

		return $positions;
	}

	/**
	 * Returns every entry of $candidates whose 'name' is the same string as the
	 * 'name' of $candidate, in their original order.
	 *
	 * Names are compared as strings; a loose comparison would make
	 * numeric-looking names such as '1e3' and '1000' count as homonyms.
	 *
	 * @param array $candidate  Entry with a 'name' key.
	 * @param array $candidates List of entries, each with a 'name' key.
	 * @return array List of the matching entries (includes $candidate itself when it is in $candidates).
	 */
	function get_homonyms($candidate, $candidates) {

		$name = (string) $candidate['name'];
		$homonyms = array();

		foreach ( $candidates as $c )
			if ( (string) $c['name'] === $name )
				$homonyms[] = $c;

		return $homonyms;
	}

	/**
	 * Flags the token positions covered by a match as taken.
	 *
	 * @param array $match     Match with 'start' and 'candidate' => array('length' => ...).
	 * @param array $positions One flag per token.
	 * @return array Copy of $positions with the covered positions set to 1.
	 */
	function mark_match($match, $positions) {

		$cursor = $match['start'];
		$end = $cursor + $match['candidate']['length'];

		while ( $cursor < $end ) {

			$positions[$cursor] = 1;
			++$cursor;
		}

		return $positions;
	}

	/**
	 * Tells whether a match can still be placed, i.e. none of the token
	 * positions it covers is already flagged with 1.
	 *
	 * @param array $match     Match with 'start' and 'candidate' => array('length' => ...).
	 * @param array $positions One flag per token.
	 * @return bool False as soon as one covered position is flagged, true otherwise.
	 */
	function check_match($match, $positions) {

		$first = $match['start'];
		$span = $match['candidate']['length'];

		for ( $offset = 0; $offset < $span; $offset++ ) {

			if ( $positions[$first + $offset] == 1 )
				return false;
		}

		return true;
	}

	/**
	 * Reads a tab-separated annotation file and returns its sentences as lists
	 * of lemmas.
	 *
	 * A row with at least 10 tab-separated fields is a token row; its third
	 * field (index 2) is taken as the lemma, lower-cased with mb_strtolower()
	 * and stripped of the '│' character. A blank line, or any row with fewer
	 * than 10 fields, closes the sentence collected so far.
	 *
	 * @param string $document_path Path of the file to read.
	 * @return array List of sentences, each a list of lemma strings; empty when the file cannot be read.
	 */
	function get_sentences($document_path) {

		$sentences = array();
		$sentence = array();

		$lines = file($document_path);
		// file() has already emitted a warning; do not iterate over false.
		if ( $lines === false )
			return $sentences;

		foreach ( $lines as $line ) {

			$line = trim($line);

			// A blank line has no fields, so it ends the sentence like any other
			// non-token row. The explicit '' test keeps a line made of '0' from
			// being mistaken for a blank one, as empty() would do.
			$entry = ( $line === '' ) ? array() : explode("\t", $line);
			if ( count($entry) < 10 ) {

				if ( count($sentence) != 0 ) {

					$sentences[] = $sentence;
					$sentence = array();
				}

				continue;
			}

			$lemma = mb_strtolower($entry[2]);
			$lemma = str_replace('│', '', $lemma);

			$sentence[] = $lemma;
		}

		// The last sentence may not be followed by a separator row.
		if ( count($sentence) != 0 )
			$sentences[] = $sentence;

		return $sentences;
	}

?>