import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold

def _read_emails(directory):
	"""Return the decoded, stripped text of every file in *directory*.

	Each file is read as raw bytes and decoded as UTF-8, silently dropping
	byte sequences that are not valid UTF-8. Leading and trailing
	whitespace is removed from every message.
	"""
	emails = []
	# os.listdir() returns entries in arbitrary order; sorting makes the
	# position-based train/test split done later in this script reproducible.
	for file_name in sorted(os.listdir(directory)):
		# Binary mode: the content is decoded explicitly below, and the
		# context manager closes the handle as soon as the file is read.
		with open(os.path.join(directory, file_name), "rb") as email_file:
			raw = email_file.read()
		emails.append(raw.decode("utf-8", errors="ignore").strip())
	return emails


ham = _read_emails("./data/enron1/ham")
spam = _read_emails("./data/enron1/spam")

# Ham and spam messages are interleaved (ham at even positions, spam at odd
# positions) so that every contiguous slice of the data -- the train/test
# split and each cross-validation fold -- stays class-balanced.
# zip() stops at the shorter list, so when the classes differ in size the
# surplus messages of the larger class are dropped instead of breaking the
# interleaving.
text_data = [text for pair in zip(ham, spam) for text in pair]

# The first documents (at most 1000) are used for training, the rest for test.
n_train = min(1000, len(text_data))
n_test = len(text_data) - n_train
print("We have %d files. %d will be used for training, %d for test" % (len(text_data), n_train, n_test))

# Labels follow the interleaving: ham is 0, spam is 1.
y = np.tile([0, 1], len(text_data) // 2)
y_train = y[:n_train]
y_test = y[n_train:]

tf_idf_vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training documents only, so
# that no statistics of the held-out test documents leak into the features.
tf_idf_vectorizer.fit(text_data[:n_train])
X = tf_idf_vectorizer.transform(text_data)
X_train = X[:n_train, :]
X_test = X[n_train:, :]

# Candidate models, keyed by the name used in the printed report.
svm = SVC()
logistic_regression = LogisticRegression()
classifiers = {
	"Support vector machine": svm,
	"Logistic regression": logistic_regression
}
performances = {}  # model name -> mean of its per-fold scores
n_folds = 10
print("Evaluating models")
for name, classifier in classifiers.items():
	performance = []
	fold_indices_generator = KFold(X_train.shape[0], n_folds=n_folds)
	for (train, test) in fold_indices_generator:
		# Fit on this fold's training rows, score on its held-out rows.
		classifier.fit(X_train[train], y_train[train])
		performance.append(classifier.score(X_train[test], y_train[test]))
	performances[name] = np.mean(performance)
	print("%s: %s" % (name, performances[name]))

# Choose the classifier with the highest mean cross-validation score; on a
# tie the first one in the iteration order of `classifiers` wins.
best_classifier_name = max(classifiers, key=lambda model_name: performances[model_name])
max_performance = performances[best_classifier_name]
print("Winner: %s" % best_classifier_name)

# After the loop above each classifier still holds the fit from its last
# fold, which saw only part of the training rows. Refit the winner on the
# whole training set before measuring it on the test set.
best_classifier = classifiers[best_classifier_name]
best_classifier.fit(X_train, y_train)
print("Performace on test set: %s" % best_classifier.score(X_test, y_test))