import os

class Corpus:
	"""In-memory collection of documents loaded from a folder tree.

	Attributes:
		stopwords: Optional object exposing ``exists(lemma) -> bool``; lemmas
			for which it returns true are dropped by the CoNLL-U Plus loader.
			``None`` disables stopword filtering.
		documents: One entry per loaded file, in load order. Each entry is a
			list of tokens (``load_conllup`` / ``load_plain``) or the raw file
			text (``load_plain_content``).
		document_names: Base file name of each entry in ``documents``
			(same order, same length).
		dictionary: Maps each lemma kept by the CoNLL-U Plus loader to the
			part-of-speech tag of its most recently seen occurrence.
	"""

	# Number of tab-separated columns a token row must have to be parsed.
	_CONLLUP_COLUMNS = 14
	# Zero-based column positions of the fields that are read from a row.
	_LEMMA_COLUMN = 2
	_POS_COLUMN = 3
	# Only lemmas carrying one of these part-of-speech tags are kept.
	_KEPT_POS = frozenset(('NOUN', 'VERB', 'ADJ'))

	def __init__(self, stopwords=None):
		"""Create an empty corpus.

		Args:
			stopwords: Optional object with an ``exists(lemma)`` method used
				to filter lemmas while loading CoNLL-U Plus files. Defaults to
				``None`` (no filtering), so ``Corpus()`` keeps working.
		"""
		# Always define the attribute: load_document_conllup() reads it, and
		# leaving it unset made every CoNLL-U Plus load raise AttributeError.
		self.stopwords = stopwords
		self.documents = []
		self.document_names = []
		self.dictionary = dict()

	@staticmethod
	def _walk_files(base_folder):
		"""Yield ``(full_path, file_name)`` for every file under base_folder."""
		for folder_path, _folder_names, file_names in os.walk(base_folder):
			for file_name in file_names:
				yield os.path.join(folder_path, file_name), file_name

	def load_conllup(self, base_folder):
		"""Load every file under ``base_folder`` with ``load_document_conllup``.

		The resulting lemma list and the file's base name are appended to
		``documents`` and ``document_names`` respectively.
		"""
		for file_path, file_name in self._walk_files(base_folder):
			document = self.load_document_conllup(file_path)
			self.documents.append(document)
			self.document_names.append(file_name)

	def load_document_conllup(self, file_path):
		"""Extract the filtered lemma sequence from one tab-separated file.

		Lines that do not split into exactly 14 tab-separated fields are
		skipped. From the remaining rows the lemma (third field) is kept when
		it is not a stopword and its part-of-speech tag (fourth field) is
		NOUN, VERB or ADJ. Every kept lemma is also recorded in
		``self.dictionary`` together with its tag.

		Args:
			file_path: Path of the UTF-8 encoded file to read.

		Returns:
			The kept lemmas, in file order (duplicates preserved).
		"""
		document = []
		stopwords = self.stopwords

		with open(file_path, encoding='utf-8') as handle:
			for line in handle:
				entries = line.rstrip("\n").split("\t")
				if len(entries) != self._CONLLUP_COLUMNS:
					continue

				lemma = entries[self._LEMMA_COLUMN]
				# Stopword filtering is optional; skip it when none was given.
				if stopwords is not None and stopwords.exists(lemma):
					continue

				pos = entries[self._POS_COLUMN]
				if pos not in self._KEPT_POS:
					continue

				document.append(lemma)
				self.dictionary[lemma] = pos

		return document

	def load_plain_content(self, base_folder):
		"""Load every file under ``base_folder`` as one raw text string.

		The unmodified file content and the file's base name are appended to
		``documents`` and ``document_names`` respectively.
		"""
		for file_path, file_name in self._walk_files(base_folder):
			with open(file_path, encoding='utf-8') as handle:
				content = handle.read()
			self.documents.append(content)
			self.document_names.append(file_name)

	def load_plain(self, base_folder):
		"""Load every file under ``base_folder`` with ``load_plain_document``.

		The resulting word list and the file's base name are appended to
		``documents`` and ``document_names`` respectively.
		"""
		for file_path, file_name in self._walk_files(base_folder):
			document = self.load_plain_document(file_path)
			self.documents.append(document)
			self.document_names.append(file_name)

	def load_plain_document(self, file_path):
		"""Split one UTF-8 text file into words.

		Each line is stripped of its trailing newline and split on single
		space characters, so consecutive spaces and empty lines produce empty
		strings in the result.

		Args:
			file_path: Path of the file to read.

		Returns:
			All words of the file, in order, as one flat list.
		"""
		document = []

		with open(file_path, encoding='utf-8') as handle:
			for line in handle:
				document.extend(line.rstrip("\n").split(' '))

		return document

	def save_dictionary(self, file_name):
		"""Write ``self.dictionary`` as ``lemma<TAB>pos`` lines (UTF-8).

		Args:
			file_name: Path of the file to create or overwrite.
		"""
		# The context manager closes the file even if a write fails.
		with open(file_name, 'w', encoding='utf-8') as handle:
			for lemma, pos in self.dictionary.items():
				handle.write(lemma + '\t' + pos + '\n')

	def get_documents(self):
		"""Return the list of loaded documents (the live list, not a copy)."""
		return self.documents

	def get_document_names(self):
		"""Return the file names of the loaded documents (the live list)."""
		return self.document_names