<?php

	mb_internal_encoding('UTF-8');

	// Usage: php <this script> <language> <config.ini>
	if ( $argc < 3 )
		die('ERROR: Not enough parameters.' . "\n");

	$language				= $argv[1];
	$config_file			= $argv[2];

	// parse_ini_file() returns false when the file is unreadable or malformed;
	// stop here instead of continuing with paths built from missing settings.
	$config					= parse_ini_file($config_file);
	if ( !is_array($config) || !isset($config['CORPORA_FOLDER'], $config['EUROVOC_FOLDER']) )
		die('ERROR: Cannot read CORPORA_FOLDER and EUROVOC_FOLDER from ' . $config_file . '.' . "\n");

	$corpora_folder			= rtrim($config['CORPORA_FOLDER'], '/');
	$eurovoc_folder			= rtrim($config['EUROVOC_FOLDER'], '/');
	$corpus_folder			= $corpora_folder . '/' . $language . '/conllup';

	// Lookup table read by process_sentence(): first form of a term => list of
	// candidates, each with the keys 'code', 'name' and 'length'.
	$eurovoc_search_file	= $eurovoc_folder . '/eurovoc_form_search_' . $language . '.json';
	$eurovoc_search_json	= is_file($eurovoc_search_file) ? file_get_contents($eurovoc_search_file) : false;
	$eurovoc_search			= ( $eurovoc_search_json === false ) ? null : json_decode($eurovoc_search_json, true);
	if ( !is_array($eurovoc_search) )
		die('ERROR: Cannot load ' . $eurovoc_search_file . '.' . "\n");

	// Filled by process_sentence(): code => (lemma sequence => number of occurrences).
	$eurovoc_lemmas			= array();

	$corpus_entries			= is_dir($corpus_folder) ? scandir($corpus_folder) : false;
	if ( $corpus_entries === false )
		die('ERROR: Cannot read corpus folder ' . $corpus_folder . '.' . "\n");

	// Keep regular files only. This drops '.' and '..' (and any sub-directory) and
	// yields a 0-based list, so the progress counter below starts at 0%.
	$document_names			= array();
	foreach ( $corpus_entries as $corpus_entry )
		if ( is_file($corpus_folder . '/' . $corpus_entry) )
			$document_names[] = $corpus_entry;
	$total					= count($document_names);

	fwrite(STDERR, "Extracting lemmas from forms...\n");
	foreach ( $document_names as $i => $document_name ) {

		list($sentences_lemmas, $sentences_forms) = get_sentences($corpus_folder . '/' . $document_name);

		foreach ( $sentences_lemmas as $j => $sentence_lemmas )
			process_sentence($sentence_lemmas, $sentences_forms[$j]);

		if ( $i % 100 == 0 )
			fwrite(STDERR, round($i * 100 / $total, 2) . "%\r");
	}
	fwrite(STDERR, "100%    \n");
	fwrite(STDERR, "DONE\n");

	// Codes with the most distinct lemma sequences come first.
	uasort($eurovoc_lemmas, function($a, $b) {

		return count($b) - count($a);
	});

	// Within each code, the most frequent lemma sequence comes first.
	foreach ( $eurovoc_lemmas as $eurovoc_id => $lemmas )
		arsort($eurovoc_lemmas[$eurovoc_id]);

	$output_file			= $eurovoc_folder . '/eurovoc_forms_lemmas_' . $language . '.json';
	$output_json			= json_encode($eurovoc_lemmas, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);

	// json_encode() returns false e.g. on invalid UTF-8; do not overwrite the
	// output file with an empty document in that case.
	if ( $output_json === false )
		die('ERROR: Cannot encode the extracted lemmas as JSON.' . "\n");

	if ( file_put_contents($output_file, $output_json) === false )
		die('ERROR: Cannot write ' . $output_file . '.' . "\n");

	/**
	 * Counts the lemma sequences of every EuroVoc term found in one sentence.
	 *
	 * For each position in $sentence_forms, the global $eurovoc_search table is
	 * looked up by the form at that position. Every candidate whose 'name' equals
	 * the space-joined forms starting there (over 'length' tokens) is a match; the
	 * space-joined lemmas of the same span are then counted in the global
	 * $eurovoc_lemmas under the candidate's 'code'.
	 *
	 * @param array $sentence_lemmas Lemmas of the sentence, parallel to $sentence_forms.
	 * @param array $sentence_forms  Forms of the sentence, indexed from 0.
	 * @return void Results are accumulated in the global $eurovoc_lemmas
	 *              (code => (lemma sequence => occurrences)).
	 */
	function process_sentence($sentence_lemmas, $sentence_forms) {

		global $eurovoc_search, $eurovoc_lemmas;

		$length = count($sentence_forms);
		foreach ( $sentence_forms as $i => $form ) {

			if ( !array_key_exists($form, $eurovoc_search) )
				continue;

			foreach ( $eurovoc_search[$form] as $candidate ) {

				// The candidate would run past the end of the sentence.
				if ( $i + $candidate['length'] > $length )
					continue;

				// Strict string comparison: with a loose != two numeric strings are
				// compared as numbers, so e.g. '1e3' would match the term '1000'.
				$phrase = implode(' ', array_slice($sentence_forms, $i, $candidate['length']));
				if ( (string) $candidate['name'] !== $phrase )
					continue;

				$code = $candidate['code'];
				if ( !array_key_exists($code, $eurovoc_lemmas) )
					$eurovoc_lemmas[$code] = array();

				$lemmas_key = implode(' ', array_slice($sentence_lemmas, $i, $candidate['length']));
				if ( !array_key_exists($lemmas_key, $eurovoc_lemmas[$code]) )
					$eurovoc_lemmas[$code][$lemmas_key] = 0;

				$eurovoc_lemmas[$code][$lemmas_key]++;
			}
		}
	}

	/**
	 * Reads a tab-separated CoNLL-U (Plus) document and returns its sentences as
	 * parallel lists of lower-cased lemmas and forms.
	 *
	 * Lines with at least 10 tab-separated columns are token lines: column 2 is
	 * read as the form and column 3 as the lemma. Any other line (blank line,
	 * comment, short line) closes the sentence collected so far, and so does the
	 * end of the file, so the last sentence of a document is not lost.
	 *
	 * @param string $document_path Path of the document to read.
	 * @return array Two-element array: list of sentences as lists of lemmas, and
	 *               list of sentences as lists of forms, with matching indexes.
	 */
	function get_sentences($document_path) {

		$sentences_lemmas = array();
		$sentences_forms = array();
		$sentence_lemmas = array();
		$sentence_forms = array();

		// file() already emits a warning when the document cannot be read.
		$lines = file($document_path);
		if ( $lines === false )
			return array($sentences_lemmas, $sentences_forms);

		foreach ( $lines as $line ) {

			$line = trim($line);
			$entry = ( $line === '' ) ? array() : explode("\t", $line);

			if ( count($entry) < 10 ) {

				// Sentence boundary: store the pending sentence, if there is one.
				if ( count($sentence_forms) != 0 ) {

					$sentences_lemmas[] = $sentence_lemmas;
					$sentences_forms[] = $sentence_forms;
					$sentence_lemmas = array();
					$sentence_forms = array();
				}

				continue;
			}

			// The '│' character is removed from both values
			// (NOTE(review): presumably a corpus-specific marker; confirm).
			$sentence_lemmas[] = str_replace('│', '', mb_strtolower($entry[2], 'UTF-8'));
			$sentence_forms[] = str_replace('│', '', mb_strtolower($entry[1], 'UTF-8'));
		}

		// The last sentence is normally followed only by a blank line or by
		// nothing at all, so it has to be stored here.
		if ( count($sentence_forms) != 0 ) {

			$sentences_lemmas[] = $sentence_lemmas;
			$sentences_forms[] = $sentence_forms;
		}

		return array($sentences_lemmas, $sentences_forms);
	}

?>