<?php

	/*
	 * Command-line entry point.
	 *
	 * Arguments: $argv[1] is the language whose multi-corpus file is read and
	 * whose output sub-folder is written; $argv[2] is the path of an INI
	 * configuration file.
	 *
	 * For every document in the multi-corpus JSON, one text file is written to
	 * DATA_ANNOTATED_FOLDER/<language>/<document name>.txt. Each line of that
	 * file is "<label>\t<code>\t<count>", where the label is looked up in the
	 * IATE / Eurovoc / terms / lemmas dictionaries and is omitted (together
	 * with its tab) when the code has no dictionary entry.
	 */
	mb_internal_encoding('UTF-8');

	if ( $argc < 3 )
		die('ERROR: Not enough parameters.' . "\n");

	$language 							= $argv[1];

	$config_file						= $argv[2];
	if ( !file_exists($config_file) )
		die('ERROR: Cannot open file: ' . $config_file . "\n");

	$config								= parse_ini_file($config_file);
	// parse_ini_file() returns false when the file cannot be parsed; indexing
	// false below would only produce empty paths, so stop here instead.
	if ( $config === false )
		die('ERROR: Cannot parse file: ' . $config_file . "\n");

	// Must live in the global scope: load_terms() / load_lemmas() read it via "global".
	$language_positions					= get_language_positions($config);

	// Create the output folder (including missing parents) and verify it exists.
	$output_folder						= $config['DATA_ANNOTATED_FOLDER'] . '/' . $language;
	if ( !is_dir($output_folder) && !mkdir($output_folder, 0777, true) && !is_dir($output_folder) )
		die('ERROR: Cannot create folder: ' . $output_folder . "\n");

	if ( isset($config['USE_INTERSECT']) && $config['USE_INTERSECT'] == 1 )
		$multi_corpus_file				= $config['MULTI_CORPUS_FOLDER']  . '/multi_corpus_intersected_' . $language . '.json';
	else
		$multi_corpus_file				= $config['MULTI_CORPUS_FOLDER']  . '/multi_corpus_' . $language . '.json';

	$eurovoc_extended_bg_file			= $config['EUROVOC_FOLDER']  . '/eurovoc_extended_' . $config['ANNOTATION_LANGUAGE'] . '.json';
	$iate_simple_bg_file				= $config['IATE_FOLDER']  . '/iate_simple_' . $config['ANNOTATION_LANGUAGE'] . '.json';

	fwrite(STDERR, "Loading corpora...\n");
	// The dictionaries are optional (a warning is printed and an empty array is
	// used); the multi-corpus is required because there is nothing to do without it.
	$iate_simple_bg						= load_json_array($iate_simple_bg_file);
	$eurovoc_extended_bg				= load_json_array($eurovoc_extended_bg_file);
	$terms_simple_bg					= load_terms($config['TERMS_FILE'], $config['ANNOTATION_LANGUAGE']);
	$lemmas_simple_bg					= load_lemmas($config['LEMMAS_FILE'], $config['ANNOTATION_LANGUAGE']);
	$multi_corpus						= load_json_array($multi_corpus_file, true);
	// Flipped so that membership can be tested with array_key_exists().
	// The path is relative and therefore resolved against the current working directory.
	$eurovoc_disapproved_ids			= array_flip(load_json_array('eurovoc_disapproved_ids.json'));
	fwrite(STDERR, "DONE\n");

	fwrite(STDERR, "Generating annotated representation...\n");
	$documents_count			= 0;
	foreach ( $multi_corpus as $document_name => $document_data ) {

		// A document entry that is not a code => count map cannot be annotated.
		if ( !is_array($document_data) )
			continue;

		$document_content = '';
		$document_count = $document_unique_count = 0;

		foreach ( $document_data as $code => $count ) {

			// The first two characters of the code select the dictionary.
			$prefix = mb_strtolower(mb_substr($code, 0, 2));

			if ( $prefix == 'ia' ) {

				$iate_id = str_replace('IATE_', '', $code);
				$document_content .=  ( array_key_exists($iate_id, $iate_simple_bg) ? implode(' | ', $iate_simple_bg[$iate_id]) . "\t" : '' ) . $code . "\t" . $count . "\n";

			} else if ( $prefix == 'eu' ) {

				$eurovoc_id = str_replace('Eurovoc_', '', $code);
				// Disapproved Eurovoc ids are dropped entirely: no line and no count.
				if ( array_key_exists($eurovoc_id, $eurovoc_disapproved_ids) )
					continue;

				$document_content .= ( array_key_exists($eurovoc_id, $eurovoc_extended_bg) ? implode(' | ', $eurovoc_extended_bg[$eurovoc_id]) . "\t" : '' ) . $code . "\t" . $count . "\n";

			} else if ( $prefix == 'te' )
				$document_content .= ( array_key_exists($code, $terms_simple_bg) ? $terms_simple_bg[$code] . "\t" : '' ) . $code . "\t" . $count . "\n";
			else if ( $prefix == 'le' )
				$document_content .= ( array_key_exists($code, $lemmas_simple_bg) ? $lemmas_simple_bg[$code] . "\t" : '' ) . $code . "\t" . $count . "\n";

			// NOTE(review): a code with an unrecognised prefix still adds to both
			// counters although it produces no output line - confirm this is intended.
			$document_count += $count;
			$document_unique_count++;
		}

		// Skip documents whose total count is zero.
		if ( $document_count == 0 )
			continue;

		// Disabled filter on the number of distinct codes per document:
// 		if ( $document_unique_count  < 10 || $document_unique_count > 40 )
// 			continue;

		// Only count a document as generated when its file was really written.
		if ( file_put_contents($output_folder . '/' . $document_name . '.txt', rtrim($document_content)) === false ) {
			fwrite(STDERR, 'WARNING: Cannot write document: ' . $document_name . "\n");
			continue;
		}

		$documents_count++;
	}
	fwrite(STDERR, "DONE\n");

	fwrite(STDERR, "Generated Documents:\t$documents_count\n");

	/**
	 * Reads a JSON file and decodes it into a PHP array.
	 *
	 * @param string $json_file Path of the JSON file.
	 * @param bool   $required  When true the script terminates if the file
	 *                          cannot be read or does not decode to an array.
	 * @return array The decoded data, or an empty array (after a warning on
	 *               STDERR) when an optional file could not be loaded.
	 */
	function load_json_array($json_file, $required = false) {

		$decoded = null;

		if ( is_readable($json_file) ) {
			$raw = file_get_contents($json_file);
			if ( $raw !== false )
				$decoded = json_decode($raw, true);
		}

		if ( is_array($decoded) )
			return $decoded;

		if ( $required )
			die('ERROR: Cannot load JSON file: ' . $json_file . "\n");

		fwrite(STDERR, 'WARNING: Cannot load JSON file: ' . $json_file . "\n");

		return array();
	}

	/**
	 * Loads the terms of one language from a tab-separated file.
	 *
	 * Only lines with exactly 7 tab-separated columns are used. The column that
	 * is read is the one the global $language_positions maps $language to.
	 *
	 * @param string $terms_file Path of the tab-separated terms file.
	 * @param string $language   Language key looked up in the global $language_positions.
	 * @return array Map of 'TERM_<zero-based line index>' => term. Empty when the
	 *               file cannot be read or the language has no known column.
	 */
	function load_terms($terms_file, $language) {

		global $language_positions;

		$terms = array();

		// Resolve the column once; it does not change from line to line.
		if ( !isset($language_positions[$language]) ) {
			fwrite(STDERR, 'WARNING: Unknown language for terms: ' . $language . "\n");
			return $terms;
		}
		$position = $language_positions[$language];

		$lines = is_readable($terms_file) ? file($terms_file) : false;
		if ( $lines === false ) {
			fwrite(STDERR, 'WARNING: Cannot read terms file: ' . $terms_file . "\n");
			return $terms;
		}

		foreach ( $lines as $i => $line ) {

			// Strip the default trim() characters except the tab: removing an outer
			// tab would drop an empty first/last column and make the row fail the
			// column-count check.
			$entry = explode("\t", trim($line, " \n\r\0\x0B"));
			if ( count($entry) != 7 )
				continue;

			// Compare against '' rather than empty() so the term "0" is kept.
			if ( !isset($entry[$position]) || $entry[$position] === '' )
				continue;

			$terms['TERM_' . $i] = $entry[$position];
		}

		return $terms;
	}

	/**
	 * Loads the lemmas of one language from a tab-separated file.
	 *
	 * Only lines with exactly 7 tab-separated columns are used. The column that
	 * is read is the one the global $language_positions maps $language to.
	 *
	 * @param string $lemmas_file Path of the tab-separated lemmas file.
	 * @param string $language    Language key looked up in the global $language_positions.
	 * @return array Map of 'LEMMA_<zero-based line index>' => lemma. Empty when the
	 *               file cannot be read or the language has no known column.
	 */
	function load_lemmas($lemmas_file, $language) {

		global $language_positions;

		$lemmas = array();

		// Resolve the column once; it does not change from line to line.
		if ( !isset($language_positions[$language]) ) {
			fwrite(STDERR, 'WARNING: Unknown language for lemmas: ' . $language . "\n");
			return $lemmas;
		}
		$position = $language_positions[$language];

		$lines = is_readable($lemmas_file) ? file($lemmas_file) : false;
		if ( $lines === false ) {
			fwrite(STDERR, 'WARNING: Cannot read lemmas file: ' . $lemmas_file . "\n");
			return $lemmas;
		}

		foreach ( $lines as $i => $line ) {

			// Strip the default trim() characters except the tab: removing an outer
			// tab would drop an empty first/last column and make the row fail the
			// column-count check.
			$entry = explode("\t", trim($line, " \n\r\0\x0B"));
			if ( count($entry) != 7 )
				continue;

			// Compare against '' rather than empty() so the lemma "0" is kept.
			if ( !isset($entry[$position]) || $entry[$position] === '' )
				continue;

			$lemmas['LEMMA_' . $i] = $entry[$position];
		}

		return $lemmas;
	}

	/**
	 * Builds a lookup from language name to its index in the configured list.
	 *
	 * @param array $config Configuration array; its 'LANGUAGES' entry is a
	 *                      comma-separated list of language names.
	 * @return array Map of trimmed language name => zero-based index in that list.
	 */
	function get_language_positions($config) {

		$positions = array();

		// A repeated name keeps its first slot in the map but ends up with the last index.
		foreach ( explode(',', $config['LANGUAGES']) as $index => $name )
			$positions[trim($name)] = $index;

		return $positions;
	}

?>