<?php

	// All mb_* calls in this script operate on UTF-8 text.
	mb_internal_encoding('UTF-8');

	// Command line: <folder containing clusters.txt> <configuration .ini file>
	if ( $argc < 3 )
		die('ERROR: Not enough parameters.' . "\n");

	$folder = $argv[1];
	$clusters_file = $folder . '/clusters.txt';

	if ( !file_exists($clusters_file) )
		die('ERROR: Cannot open filename: ' . $clusters_file . "\n");

	$config_file = $argv[2];
	if ( !file_exists($config_file) )
		die('ERROR: Cannot open file: ' . $config_file . "\n");

	// parse_ini_file() returns false on a syntax error; stop here instead of
	// indexing into a non-array below.
	$config = parse_ini_file($config_file);
	if ( $config === false )
		die('ERROR: Cannot parse file: ' . $config_file . "\n");

	// Without these two keys no corpus file name can be built.
	foreach ( array('LANGUAGES', 'MULTI_CORPUS_FOLDER') as $required_key ) {

		if ( !isset($config[$required_key]) )
			die('ERROR: Missing configuration key: ' . $required_key . "\n");
	}

	// LANGUAGES is a comma-separated list; surrounding whitespace is ignored.
	$languages = array_map('trim', explode(',', $config['LANGUAGES']));

	// USE_INTERSECT is optional: anything other than 1 selects the plain corpus files.
	$use_intersect = isset($config['USE_INTERSECT']) && $config['USE_INTERSECT'] == 1;
	$multi_corpus_prefix = $config['MULTI_CORPUS_FOLDER'] . ( $use_intersect ? '/multi_corpus_intersected_' : '/multi_corpus_' );

	// One decoded corpus per configured language, keyed by language code.
	$multi_corpus = array();
	foreach ( $languages as $language )
		$multi_corpus[$language] = load_json_file($multi_corpus_prefix . $language . '.json');

	// Reference tables; the relative paths are resolved against the current working directory.
	// The disapproved ids are flipped so that they can be looked up by key.
	$eurovoc_disapproved_ids = array_flip(load_json_file('eurovoc_disapproved_ids.json'));
	$iate_eurovoc_domains = load_json_file('iate_eurovoc_domains.json');
	$eurovoc_3_2 = load_json_file('eurovoc_3_2.json');

	// Compute the domain counts of every cluster, keyed by cluster name.
	$clusters_domains = array();
	$clusters = load_clusters($clusters_file);
	foreach ( $clusters as $cluster_name => $cluster )
		$clusters_domains[$cluster_name] = process_cluster($cluster);

	// Write the result next to clusters.txt and report a failed encode/write.
	$output_file = $folder . '/clusters_domains.json';
	$output_json = json_encode($clusters_domains, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
	if ( $output_json === false || file_put_contents($output_file, $output_json) === false )
		die('ERROR: Cannot write file: ' . $output_file . "\n");

	/**
	 * Read a JSON file and decode it into an associative array.
	 *
	 * Terminates the script with an error message when the file cannot be read
	 * or when its content does not decode to an array, so that callers never
	 * receive null or false in place of the expected data.
	 *
	 * @param string $json_file Path of the JSON file.
	 * @return array Decoded content.
	 */
	function load_json_file($json_file) {

		$contents = is_readable($json_file) ? file_get_contents($json_file) : false;
		if ( $contents === false )
			die('ERROR: Cannot open file: ' . $json_file . "\n");

		$data = json_decode($contents, true);
		if ( !is_array($data) )
			die('ERROR: Invalid JSON in file: ' . $json_file . "\n");

		return $data;
	}

	/**
	 * Tally the domain ids referenced by the documents of one cluster.
	 *
	 * A document is taken into account only when its language (see
	 * get_document_language()) is one of the global $languages and the document
	 * is present in the global $multi_corpus for that language. Every term code
	 * of such a document counts once, whatever value is stored for it:
	 *  - codes starting with "eu" (case-insensitive): "Eurovoc_" is stripped, ids
	 *    listed in $eurovoc_disapproved_ids are skipped, and the domain id is the
	 *    first two characters of $eurovoc_3_2[id];
	 *  - codes starting with "ia": "IATE_" is stripped and every domain id listed
	 *    in $iate_eurovoc_domains[id] is counted.
	 * Codes with any other prefix, and ids missing from the lookup tables, are
	 * ignored.
	 *
	 * @param array $cluster List of document names.
	 * @return array At most the six highest counts keyed by domain id, in
	 *               descending order, followed by a 'total' entry holding the
	 *               sum over all domains (computed before the list is cut).
	 */
	function process_cluster($cluster) {

		global $multi_corpus, $eurovoc_disapproved_ids, $languages, $iate_eurovoc_domains, $eurovoc_3_2;

		$cluster_domains = array();
		foreach ( $cluster as $document_name ) {

			$document_language = get_document_language($document_name);
			if ( !in_array($document_language, $languages, true) )
				continue;

			// Also covers a corpus that is not an array or a document without data.
			if ( !isset($multi_corpus[$document_language][$document_name]) )
				continue;

			$document_data = $multi_corpus[$document_language][$document_name];
			if ( !is_array($document_data) )
				continue;

			$document_domains = array();
			foreach ( array_keys($document_data) as $code ) {

				// Numeric-looking keys are turned into integers by PHP.
				$code = (string) $code;
				$prefix = mb_strtolower(mb_substr($code, 0, 2));

				if ( $prefix == 'eu' ) {

					$eurovoc_id = str_replace('Eurovoc_', '', $code);
					if ( array_key_exists($eurovoc_id, $eurovoc_disapproved_ids) )
						continue;

					// An id without a known mapping must not be counted under an empty domain id.
					if ( !isset($eurovoc_3_2[$eurovoc_id]) )
						continue;

					// NOTE(review): assumes the mapped value is a string whose first two characters are the domain id - confirm.
					$domain_ids = array(mb_substr($eurovoc_3_2[$eurovoc_id], 0, 2));

				} else if ( $prefix == 'ia' ) {

					$iate_id = str_replace('IATE_', '', $code);
					if ( !isset($iate_eurovoc_domains[$iate_id]) || !is_array($iate_eurovoc_domains[$iate_id]) )
						continue;

					$domain_ids = $iate_eurovoc_domains[$iate_id];

				} else {

					continue;
				}

				foreach ( $domain_ids as $domain_id ) {

					if ( !array_key_exists($domain_id, $document_domains) )
						$document_domains[$domain_id] = 0;

					$document_domains[$domain_id]++;
				}
			}

			// Merge the counts of this document into the cluster totals.
			foreach ( $document_domains as $domain_id => $count ) {

				if ( !array_key_exists($domain_id, $cluster_domains) )
					$cluster_domains[$domain_id] = 0;

				$cluster_domains[$domain_id] += $count;
			}
		}

		// Highest counts first; the total covers every domain, not only the ones kept.
		arsort($cluster_domains);
		$total = array_sum($cluster_domains);
		$cluster_domains = array_slice($cluster_domains, 0, 6, true);

		$cluster_domains['total'] = $total;
		return $cluster_domains;
	}

	/**
	 * Derive the language code of a document from its name.
	 *
	 * The first two characters of the name are the language code, except for a
	 * fixed set of prefixes that are mapped to 'hu' and the prefixes 'x0', 'x1'
	 * and 'x2' that are mapped to 'sl'.
	 * NOTE(review): presumably these prefixes are the leading letters of
	 * Hungarian and Slovenian document names that carry no language code -
	 * confirm against the corpus.
	 *
	 * @param string $document_name Document name without path and extension.
	 * @return string Language code.
	 */
	function get_document_language($document_name) {

		// Prefix => language lookup table.
		$prefix_languages =
			array_fill_keys(array('ha', 'rn', 'ut', 'tr', 'in', 'ko', 've', 'pa', 'ta', 'al', 'ny'), 'hu')
			+ array_fill_keys(array('x0', 'x1', 'x2'), 'sl');

		$prefix = mb_substr($document_name, 0, 2);

		return isset($prefix_languages[$prefix]) ? $prefix_languages[$prefix] : $prefix;
	}

	/**
	 * Read the cluster definitions from a text file.
	 *
	 * Every non-blank line describes one cluster as a space-separated list of
	 * document file names. Each name is reduced to its base name without
	 * directory and extension. Clusters are named after the 1-based number of
	 * their line in the file, so blank lines leave gaps in the numbering.
	 *
	 * @param string $clusters_file Path of the clusters file.
	 * @return array Map of 'cluster_<line number>' to the list of document names;
	 *               empty when the file cannot be read.
	 */
	function load_clusters($clusters_file) {

		$clusters = array();

		$lines = file($clusters_file);
		if ( $lines === false )
			return $clusters;

		foreach ( $lines as $i => $line ) {

			// Compare against '' explicitly: empty() would also drop a line that is just "0".
			$line = trim($line);
			if ( $line === '' )
				continue;

			// Runs of several spaces must not produce empty document names.
			$document_names = preg_split('/ +/', $line, -1, PREG_SPLIT_NO_EMPTY);
			foreach ( $document_names as $j => $document_name )
				$document_names[$j] = pathinfo($document_name, PATHINFO_FILENAME);

			$cluster_name = 'cluster_' . ( $i + 1 );
			$clusters[$cluster_name] = $document_names;
		}

		return $clusters;
	}

?>