<?php

	// Command-line report on the language make-up of document clusters.
	//
	// Usage: php <this script> <clusterization_file>
	//
	// Input: a text file with one cluster per line; each line holds document
	// names separated by single spaces. The language of a document is derived
	// from its name by get_document_language().
	//
	// Output (stdout):
	//   - first line: number of clusters (non-blank lines);
	//   - then one tab-separated row per distribution key, sorted by count,
	//     descending. Keys are either "<lang>#<lang>..._<n>" (a concrete set of
	//     n distinct languages) or "group_<n>" (all clusters with n distinct
	//     languages). Every row carries the count and its share of all clusters.
	//     Rows for single-language sets additionally carry the total number of
	//     documents in that language and the rounded percentage of
	//     single-language clusters relative to that total.

	mb_internal_encoding('UTF-8');

	if ( $argc < 2 ) {
		// Report usage errors on stderr with a non-zero status so that callers
		// and shell pipelines can detect the failure.
		fwrite(STDERR, 'ERROR: Not enough parameters.' . "\n");
		exit(1);
	}

	$clusterization_file = $argv[1];

	$lines = is_readable($clusterization_file) ? file($clusterization_file) : false;
	if ( $lines === false ) {
		fwrite(STDERR, 'ERROR: Cannot read file: ' . $clusterization_file . "\n");
		exit(1);
	}

	$counts = language_counts($clusterization_file);

	$total = 0;
	$distribution = array();
	// Maps the distribution key of every single-language set to its language,
	// so the report does not have to parse the language back out of the key.
	$single_language_keys = array();
	foreach ( $lines as $line ) {

		$line = trim($line);

		// Strict comparison: empty() would also skip a line consisting of "0".
		if ( $line === '' )
			continue;

		$total++;
		$cluster_languages = array();
		foreach ( explode(' ', $line) as $document_name ) {

			// Repeated spaces produce empty tokens; they are not documents.
			if ( $document_name === '' )
				continue;

			$cluster_languages[] = get_document_language($document_name);
		}

		// A non-blank trimmed line always starts with a non-empty token, so
		// $cluster_languages holds at least one entry here.
		$cluster_languages = array_unique($cluster_languages);
		sort($cluster_languages); // also re-indexes the array from 0
		$group_count = count($cluster_languages);

		$key = implode('#', $cluster_languages) . '_' . $group_count;
		$group_key = 'group_' . $group_count;

		if ( !array_key_exists($key, $distribution) )
			$distribution[$key] = 0;

		if ( !array_key_exists($group_key, $distribution) )
			$distribution[$group_key] = 0;

		if ( $group_count == 1 ) {

			$language = $cluster_languages[0];
			$counts[$language]['group_1']++;
			$single_language_keys[$key] = $language;
		}

		$distribution[$key]++;
		$distribution[$group_key]++;
	}

	echo $total . "\n";

	arsort($distribution);
	foreach ( $distribution as $key => $count ) {

		$row = $key . "\t" . $count . "\t" . round($count * 100 / $total, 2) . '%';

		if ( isset($single_language_keys[$key]) ) {

			$language = $single_language_keys[$key];
			$language_total = $counts[$language]['total'];
			$single_share = round($counts[$language]['group_1'] * 100 / $language_total);

			// Two tabs on purpose: the original report leaves one column empty here.
			$row .= "\t" . "\t" . $language_total . "\t" . $single_share . '%';
		}

		echo $row . "\n";
	}

// 	$codes = array_keys($codes_counts);
// 	rsort($codes);
// 	foreach ( $codes as $code )
// 		echo $code . "\t" . $codes_counts[$code] . "\n";

	/**
	 * Derives the language code of a document from its name.
	 *
	 * The code is the first two characters of the name, except for a fixed set
	 * of prefixes that are rewritten to 'hu' or 'sl'.
	 * NOTE(review): presumably these prefixes belong to Hungarian / Slovene
	 * documents whose names do not start with a language code - confirm.
	 *
	 * @param string $document_name Name of the document.
	 * @return string The two-character prefix, or its replacement code.
	 */
	function get_document_language($document_name) {

		// Prefix => language code it stands for.
		static $prefix_aliases = array(
			'ha' => 'hu',
			'rn' => 'hu',
			'ut' => 'hu',
			'tr' => 'hu',
			'in' => 'hu',
			'ko' => 'hu',
			've' => 'hu',
			'pa' => 'hu',
			'ta' => 'hu',
			'al' => 'hu',
			'ny' => 'hu',
			'x0' => 'sl',
			'x1' => 'sl',
			'x2' => 'sl',
		);

		$prefix = mb_substr($document_name, 0, 2);

		if ( isset($prefix_aliases[$prefix]) )
			return $prefix_aliases[$prefix];

		return $prefix;
	}

	/**
	 * Counts the documents of each language listed in a clusterization file.
	 *
	 * Every line of the file is split on single spaces; each non-empty token is
	 * treated as a document name and its language is taken from
	 * get_document_language(). Blank lines and the empty tokens produced by
	 * repeated spaces are ignored, so they are not counted as documents of an
	 * empty language.
	 *
	 * @param string $clusterization_file Path of the file to read.
	 * @return array Map of language => array('total' => number of documents,
	 *               'group_1' => 0). 'group_1' is only initialised here; it is
	 *               meant to be filled in by the caller. The map is empty when
	 *               the file cannot be read.
	 */
	function language_counts($clusterization_file) {

		$counts = array();

		$lines = file($clusterization_file);

		// file() returns false on failure; there is nothing to count then.
		if ( $lines === false )
			return $counts;

		foreach ( $lines as $line ) {

			foreach ( explode(' ', trim($line)) as $document_name ) {

				if ( $document_name === '' )
					continue;

				$language = get_document_language($document_name);
				if ( !array_key_exists($language, $counts) )
					$counts[$language] = array(
						'total' => 0,
						'group_1' => 0
					);

				$counts[$language]['total']++;
			}
		}

		return $counts;
	}
?>