<?php

	/*
	 * Writes one text file per document of a language's multi-corpus.
	 *
	 * Usage: php <this script> <language> <config.ini>
	 *
	 * Config keys read: DATA_FOLDER, MULTI_CORPUS_FOLDER, USE_INTERSECT.
	 *
	 * For every document in the multi-corpus JSON, each code is repeated
	 * "count" times (space-separated) on its own line and the result is saved
	 * to <DATA_FOLDER>/<language>/<document name>.txt. Codes whose first two
	 * characters are "eu" (case-insensitive) are dropped when their id (the
	 * code with "Eurovoc_" removed) is listed in eurovoc_disapproved_ids.json.
	 * Documents whose total count ends up at 0 are not written.
	 *
	 * Progress messages go to STDERR. Failures detected while loading the
	 * configuration or the input files abort with exit status 1.
	 */

	mb_internal_encoding('UTF-8');

	if ( $argc < 3 )
		die('ERROR: Not enough parameters.' . "\n");

	$language		= $argv[1];
	$config_file	= $argv[2];
	if ( !file_exists($config_file) )
		die('ERROR: Cannot open file: ' . $config_file . "\n");

	// parse_ini_file() returns false on a syntax error; indexing false below
	// would silently yield null paths.
	$config = parse_ini_file($config_file);
	if ( $config === false ) {
		fwrite(STDERR, 'ERROR: Cannot parse file: ' . $config_file . "\n");
		exit(1);
	}

	foreach ( array('DATA_FOLDER', 'MULTI_CORPUS_FOLDER') as $required_key ) {
		if ( !isset($config[$required_key]) ) {
			fwrite(STDERR, 'ERROR: Missing configuration key: ' . $required_key . "\n");
			exit(1);
		}
	}

	// Create the output folder recursively so a missing DATA_FOLDER is not fatal.
	// The trailing is_dir() covers another process creating it concurrently.
	$output_folder = $config['DATA_FOLDER']  . '/' . $language;
	if ( !is_dir($output_folder) && !mkdir($output_folder, 0777, true) && !is_dir($output_folder) ) {
		fwrite(STDERR, 'ERROR: Cannot create folder: ' . $output_folder . "\n");
		exit(1);
	}

	// A missing USE_INTERSECT key selects the non-intersected corpus.
	if ( isset($config['USE_INTERSECT']) && $config['USE_INTERSECT'] == 1 )
		$multi_corpus_file = $config['MULTI_CORPUS_FOLDER']  . '/multi_corpus_intersected_' . $language . '.json';
	else
		$multi_corpus_file = $config['MULTI_CORPUS_FOLDER']  . '/multi_corpus_' . $language . '.json';

	fwrite(STDERR, "Loading corpora...\n");

	// Load both JSON inputs the same way: each must be readable and must
	// decode to an array, otherwise array_flip()/foreach below would fail.
	$decoded_inputs = array();
	foreach ( array($multi_corpus_file, 'eurovoc_disapproved_ids.json') as $json_file ) {

		$json_text = file_get_contents($json_file);
		if ( $json_text === false ) {
			fwrite(STDERR, 'ERROR: Cannot read file: ' . $json_file . "\n");
			exit(1);
		}

		$decoded = json_decode($json_text, true);
		if ( !is_array($decoded) ) {
			fwrite(STDERR, 'ERROR: Invalid JSON in file: ' . $json_file . "\n");
			exit(1);
		}

		$decoded_inputs[] = $decoded;
	}

	$multi_corpus				= $decoded_inputs[0];
	// Flipped so that the disapproved ids become keys (constant-time lookup).
	$eurovoc_disapproved_ids	= array_flip($decoded_inputs[1]);
	unset($decoded_inputs);

	fwrite(STDERR, "DONE\n");

	fwrite(STDERR, "Generating representation...\n");
	$documents_count = 0;
	foreach ( $multi_corpus as $document_name => $document_data ) {

		// Entries that are not code => count maps contribute nothing.
		if ( !is_array($document_data) )
			continue;

		$document_content = '';
		$document_count = $document_unique_count = 0;

		foreach ( $document_data as $code => $count ) {

			// JSON keys that look like integers arrive as ints; normalise to string.
			$code = (string) $code;

			$prefix = mb_strtolower(mb_substr($code, 0, 2));
			if ( $prefix == 'eu' ) {

				$eurovoc_id = str_replace('Eurovoc_', '', $code);
				if ( array_key_exists($eurovoc_id, $eurovoc_disapproved_ids) )
					continue;
			}

			$document_count += $count;
			$document_unique_count++;

			// One line per code: the code repeated $count times.
			$document_content .= implode(' ', array_fill(0, $count, $code))  . "\n";
		}

		if ( $document_count == 0 )
			continue;

		// Optional filter on the number of distinct codes (currently disabled):
// 		if ( $document_unique_count  < 10 || $document_unique_count > 40 )
// 			continue;

		// Only count documents that were actually written to disk.
		$document_file = $output_folder . '/' . $document_name . '.txt';
		if ( file_put_contents($document_file, rtrim($document_content)) === false ) {
			fwrite(STDERR, 'WARNING: Cannot write file: ' . $document_file . "\n");
			continue;
		}

		$documents_count++;
	}
	fwrite(STDERR, "DONE\n");

	fwrite(STDERR, "Generated Documents:\t$documents_count\n");
?>