<?php

	mb_internal_encoding('UTF-8');

	// Command-line entry point.
	//
	// Arguments: <language> <config.ini>
	//
	// Reads every file in <CORPORA_FOLDER>/<language>/conllup, counts the
	// descriptors from EUROVOC_FOLDER/eurovoc_lemmas_search_<language>.json that
	// occur in each document and writes the per-document counts (sorted by
	// descending count) to EUROVOC_FOLDER/eurovoc_lemmas_corpus_<language>.json.

	// Reports a fatal problem on STDERR and stops with a non-zero exit status,
	// so that a calling shell script can tell the run failed.
	$fail = function ($message) {

		fwrite(STDERR, 'ERROR: ' . $message . "\n");
		exit(1);
	};

	if ( $argc < 3 )
		$fail('Not enough parameters.');

	$language = $argv[1];
	$config_file = $argv[2];

	$config = parse_ini_file($config_file);
	if ( $config === false )
		$fail('Cannot read configuration file: ' . $config_file);

	foreach ( array('CORPORA_FOLDER', 'EUROVOC_FOLDER') as $required_key )
		if ( !isset($config[$required_key]) )
			$fail('Missing configuration key: ' . $required_key);

	$corpora_folder = rtrim($config['CORPORA_FOLDER'], '/');
	$corpus_folder = $corpora_folder . '/' . $language . '/conllup';

	// The search index is read by process_sentence() through `global`, so it
	// must be a valid array before any sentence is processed.
	$search_path = $config['EUROVOC_FOLDER'] . '/eurovoc_lemmas_search_' . $language . '.json';
	$search_json = file_get_contents($search_path);
	if ( $search_json === false )
		$fail('Cannot read search index: ' . $search_path);

	$eurovoc_search = json_decode($search_json, true);
	if ( !is_array($eurovoc_search) )
		$fail('Search index is not valid JSON: ' . $search_path);

	$eurovoc_corpus = array();

	$entries = scandir($corpus_folder);
	if ( $entries === false )
		$fail('Cannot list corpus folder: ' . $corpus_folder);

	// array_diff() keeps the original scandir() keys; re-index so that the
	// progress counter below starts at 0 and never exceeds the total.
	$document_names = array_values(array_diff($entries, array('.', '..')));
	$total = count($document_names);

	fwrite(STDERR, "Extracting lemma corpus...\n");
	foreach ( $document_names as $i => $document_name ) {

		$sentences = get_sentences($corpus_folder . '/' . $document_name);
		$basename = pathinfo($document_name, PATHINFO_FILENAME);

		// Every document gets an entry, even when nothing matches in it.
		$eurovoc_corpus[$basename] = array();

		foreach ( $sentences as $sentence )
			process_sentence($sentence, $basename);

		if ( $i % 100 == 0 )
			fwrite(STDERR, round($i * 100 / $total, 2) . "%\r");
	}
	fwrite(STDERR, "100%    \n");
	fwrite(STDERR, "DONE\n");

	// Most frequent descriptors first within each document.
	foreach ( $eurovoc_corpus as $document_name => $document_data )
		arsort($eurovoc_corpus[$document_name]);

	// json_encode() returns false on failure (e.g. invalid UTF-8); writing that
	// value would silently produce an empty output file.
	$output_path = $config['EUROVOC_FOLDER'] . '/eurovoc_lemmas_corpus_' . $language . '.json';
	$output_json = json_encode($eurovoc_corpus, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
	if ( $output_json === false )
		$fail('Cannot encode result as JSON: ' . json_last_error_msg());

	if ( file_put_contents($output_path, $output_json) === false )
		$fail('Cannot write output file: ' . $output_path);

	/**
	 * Finds descriptor phrases in one sentence and adds their occurrence counts
	 * to the totals of the given document.
	 *
	 * Reads the global $eurovoc_search, which maps a lemma to a list of
	 * candidates; each candidate has a 'name' (lemmas joined by single spaces),
	 * a 'length' (number of lemmas) and a 'code'. A candidate matches at
	 * position $i when the $length lemmas starting there, joined by spaces, are
	 * exactly its name.
	 *
	 * Matches may overlap. They are resolved greedily: longer matches win, and
	 * among equally long ones the leftmost wins. Each accepted match increments
	 * $eurovoc_corpus[$document_name][code] (global, modified in place).
	 *
	 * @param array  $sentence      List of lemma strings, indexed from 0.
	 * @param string $document_name Key of the document in $eurovoc_corpus.
	 * @return void
	 */
	function process_sentence($sentence, $document_name) {

		global $eurovoc_search, $eurovoc_corpus;

		$length = count($sentence);
		$matches = array();

		foreach ( $sentence as $i => $lemma ) {

			if ( !isset($eurovoc_search[$lemma]) )
				continue;

			foreach ( $eurovoc_search[$lemma] as $candidate ) {

				// A non-positive length would count a descriptor without
				// covering any token; a too long one runs past the sentence.
				if ( $candidate['length'] < 1 || $i + $candidate['length'] > $length )
					continue;

				// Strict comparison: with `!=` two numeric-looking strings such
				// as "1e1" and "10" would be considered equal.
				$phrase = implode(' ', array_slice($sentence, $i, $candidate['length']));
				if ( (string) $candidate['name'] !== $phrase )
					continue;

				$matches[] = array(
					'start' => $i,
					'candidate' => $candidate
				);
			}
		}

		// Longest first. Ties are broken by start position so the outcome does
		// not depend on usort() being stable (it is only stable since PHP 8.0).
		usort($matches, function($a, $b) {

			if ( $a['candidate']['length'] != $b['candidate']['length'] )
				return ( $a['candidate']['length'] < $b['candidate']['length'] ) ? 1 : -1;

			if ( $a['start'] != $b['start'] )
				return ( $a['start'] < $b['start'] ) ? -1 : 1;

			return 0;
		});

		if ( !isset($eurovoc_corpus[$document_name]) )
			$eurovoc_corpus[$document_name] = array();

		// 1 marks a token already covered by an accepted match.
		$positions = array_fill(0, $length, 0);

		foreach ( $matches as $match ) {

			if ( !check_match($match, $positions) )
				continue;

			$positions = mark_match($match, $positions);

			$code = $match['candidate']['code'];
			if ( !array_key_exists($code, $eurovoc_corpus[$document_name]) )
				$eurovoc_corpus[$document_name][$code] = 0;

			$eurovoc_corpus[$document_name][$code]++;
		}
	}

	/**
	 * Flags every token position covered by a match as taken.
	 *
	 * @param array $match     Match with 'start' and 'candidate' => array('length' => ...).
	 * @param array $positions Occupancy flags indexed by token position.
	 * @return array Copy of $positions with the covered positions set to 1.
	 */
	function mark_match($match, $positions) {

		$first = $match['start'];
		$end = $first + $match['candidate']['length'];

		$position = $first;
		while ( $position < $end ) {

			$positions[$position] = 1;
			++$position;
		}

		return $positions;
	}

	/**
	 * Tells whether a match can still be placed, i.e. none of the token
	 * positions it covers is already flagged as taken.
	 *
	 * @param array $match     Match with 'start' and 'candidate' => array('length' => ...).
	 * @param array $positions Occupancy flags indexed by token position.
	 * @return bool False as soon as one covered position equals 1, true otherwise.
	 */
	function check_match($match, $positions) {

		$position = $match['start'];
		$end = $position + $match['candidate']['length'];

		while ( $position < $end ) {

			if ( $positions[$position] == 1 )
				return false;

			++$position;
		}

		return true;
	}

	/**
	 * Reads a tab-separated corpus file and returns its sentences as lists of
	 * lower-cased lemmas.
	 *
	 * A line with at least 10 tab-separated fields is a token; its third field
	 * is taken as the lemma, lower-cased, with every '│' character removed.
	 * Any other line (a blank line, or one with fewer than 10 fields such as a
	 * comment) closes the sentence collected so far.
	 * NOTE(review): the input is presumably CoNLL-U Plus (the folder is named
	 * "conllup"), where blank lines separate sentences — confirm.
	 *
	 * @param string $document_path Path of the file to read.
	 * @return array List of sentences, each a list of lemma strings; empty when
	 *               the file cannot be read.
	 */
	function get_sentences($document_path) {

		$sentences = array();
		$sentence = array();

		// file() returns false (and raises a warning) when the path cannot be
		// read, e.g. for a sub-directory of the corpus folder.
		$lines = file($document_path);
		if ( $lines === false )
			return $sentences;

		foreach ( $lines as $line ) {

			$line = trim($line);

			// Blank lines must end the current sentence too; skipping them
			// would glue neighbouring sentences together whenever no comment
			// line stands between them. The explicit comparison also avoids
			// empty() treating the line "0" as blank.
			$entry = ( $line === '' ) ? array() : explode("\t", $line);

			if ( count($entry) < 10 ) {

				if ( count($sentence) != 0 ) {

					$sentences[] = $sentence;
					$sentence = array();
				}

				continue;
			}

			$lemma = mb_strtolower($entry[2], 'UTF-8');
			$lemma = str_replace('│', '', $lemma);

			$sentence[] = $lemma;
		}

		// The last sentence is not necessarily followed by a boundary line.
		if ( count($sentence) != 0 )
			$sentences[] = $sentence;

		return $sentences;
	}

?>