# Clasificación 
# Autor: Felipe Bravo-Marquez

# Assorted benchmark datasets from the UCI repository
library(mlbench)
## Diabetes
data(list = "PimaIndiansDiabetes", package = "mlbench")

# Models in R are specified with formulas of the form y ~ x1 + x2 ...
# Writing y ~ . uses every other column of the data frame as a predictor.
# See ?formula for details.


# Attribute selection
library(FSelector)

# Subset selection with cfs (runs a best-first search)
subset <- cfs(formula = diabetes ~ ., data = PimaIndiansDiabetes)

# Filter methods: score each attribute against the class
information.gain(formula = diabetes ~ ., data = PimaIndiansDiabetes)
gain.ratio(formula = diabetes ~ ., data = PimaIndiansDiabetes)
chi.squared(formula = diabetes ~ ., data = PimaIndiansDiabetes)

# Drop pressure and triceps; keep the remaining predictors and the label
PimaIndiansDiabetes <- PimaIndiansDiabetes[
  ,
  c("pregnant", "glucose", "insulin", "mass", "pedigree", "age", "diabetes")
]

# Split the dataset into a training and a test partition.
# Fixed seed so the random split, and every result derived from it, can be
# reproduced from run to run.
set.seed(123)
fraction <- 0.7  # share of the rows used for training; the rest is the test set
n_rows <- nrow(PimaIndiansDiabetes)
train_rows <- sample(seq_len(n_rows), size = round(n_rows * fraction),
                     replace = FALSE)
# Pick the predictors and the label by name rather than by column position,
# so the split does not depend on how the columns happen to be ordered.
feature_cols <- setdiff(names(PimaIndiansDiabetes), "diabetes")
train <- PimaIndiansDiabetes[train_rows, feature_cols]
train_labels <- PimaIndiansDiabetes[train_rows, "diabetes"]
test <- PimaIndiansDiabetes[-train_rows, feature_cols]
test_label <- PimaIndiansDiabetes[-train_rows, "diabetes"]



# k-nearest neighbours with k = 2, fitted on train and applied to test
library(class)
kvec <- knn(train = train, test = test, cl = train_labels, k = 2, prob = TRUE)
table(actual = test_label, predicted = kvec)

# knn.cv: leave-one-out cross-validation over the whole dataset
predc_knn_cv <- knn.cv(
  train = PimaIndiansDiabetes[, 1:6],
  cl = PimaIndiansDiabetes[, 7],
  k = 2,
  l = 0,
  prob = FALSE,
  use.all = TRUE
)
table(actual = PimaIndiansDiabetes$diabetes, predicted = predc_knn_cv)

# Decision trees with CART
library(rpart)
library(rpart.plot)

# Same train/test rows as before, this time keeping the label column so the
# formula interface can be used
train_frame <- PimaIndiansDiabetes[train_rows, ]
test_frame <- PimaIndiansDiabetes[-train_rows, ]

# Three classification trees: default settings, information split, gini split
cart_tree <- rpart(diabetes ~ ., data = train_frame, method = "class")
cart_tree_inf <- rpart(
  diabetes ~ .,
  data = train_frame,
  method = "class",
  parms = list(split = "information")
)
cart_tree_gini <- rpart(
  diabetes ~ .,
  data = train_frame,
  method = "class",
  parms = list(split = "gini")
)

# rpart takes its tuning parameters through rpart.control
rpart.plot(cart_tree)
rpart.plot(cart_tree_inf)
rpart.plot(cart_tree_gini)

# Predicted classes on the test partition and their confusion matrix
fitted_rpart <- predict(object = cart_tree, newdata = test_frame, type = "class")
tab <- table(test_label, fitted_rpart)

# Evaluation based on ROC curves
library(ROCR)
# Second column of the class-probability matrix predicted for the test rows
fitted_rpart_prob <- predict(
  object = cart_tree,
  newdata = test_frame,
  type = "prob"
)[, 2]
pred <- prediction(predictions = fitted_rpart_prob, labels = test_label)
# True positive rate against false positive rate
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
# Precision against recall
perf1 <- performance(pred, measure = "prec", x.measure = "rec")
plot(perf1)



# Decision trees with RWeka's J48 learner
library("RWeka")

m1 <- J48(diabetes ~ ., data = train_frame)
table(actual = test_label, predict = predict(m1, newdata = test_frame))

## print and summary
m1
summary(m1) # calls evaluate_Weka_classifier()
# Confusion matrix on the training data, built by hand. predict(m1) without
# newdata yields one prediction per row the model was fitted on, so the
# reference labels must come from train_frame; using the labels of the full
# dataset makes table() fail because the two vectors differ in length.
table(train_frame$diabetes, predict(m1))

## visualization
## use party package
# NOTE(review): recent RWeka releases may draw trees through partykit instead
# of party -- confirm which package the installed version needs.
if (require("party", quietly = TRUE)) plot(m1)

# 10-fold cross-validated evaluation of m1 with a cost matrix, per-class
# statistics and complexity statistics; the seed fixes the fold assignment.
e <- evaluate_Weka_classifier(m1,
                              cost = matrix(c(0, 2, 1, 0), ncol = 2),
                              numFolds = 10, complexity = TRUE,
                              seed = 123, class = TRUE)
e
summary(e)
e$confusionMatrix
e$details


# e1071 bundles several learning algorithms and also provides tune(), a
# helper for calibrating model parameters.
library(e1071)

# ?tune

# Naive Bayes with Laplace smoothing, evaluated on the test partition
mynaive <- naiveBayes(diabetes ~ ., data = train_frame, laplace = 1)
table(
  actual = test_label,
  predict = predict(object = mynaive, newdata = test_frame)
)

# SVM with gamma = 0.1, evaluated on the same test partition
mysvm <- svm(diabetes ~ ., data = train_frame, gamma = 0.1)
table(
  actual = test_label,
  predict = predict(object = mysvm, newdata = test_frame)
)




# Tune the SVM's gamma and cost by grid search, scoring every combination
# with 3-fold cross-validation on the training data.
#
# The fitting function is passed positionally so the call does not depend on
# the name of tune()'s first formal argument.
# The number of folds is set in tune.control(): a `cross` argument handed to
# tune() directly is forwarded to svm() and leaves tune()'s own fold count at
# its default.
obj <- tune(
  svm, diabetes ~ .,
  data = train_frame,
  ranges = list(gamma = 2^(-1:1), cost = 2^(2:4)),
  tunecontrol = tune.control(sampling = "cross", cross = 3)
)

# Model kept by tune() for the best-scoring parameter combination
bestsvm <- obj$best.model


# Runs k-fold cross-validation of a classifier over a data frame.
#
# Arguments:
#   k      number of folds; a whole number between 2 and nrow(data)
#   data   data frame holding the predictors and the target column
#   label  name of the target column, as a string
#   method model-fitting function, called as method(formula, data = train, ...)
#   ...    additional arguments forwarded unchanged to method
#
# Returns the rows of data, grouped by the fold in which they were held out,
# with an extra column `predicted` holding the out-of-fold predictions.
mycv <- function(k, data, label, method, ...) {
  n <- nrow(data)
  stopifnot(
    is.numeric(k), length(k) == 1, k == round(k), k >= 2, k <= n,
    is.character(label), length(label) == 1, label %in% names(data),
    is.function(method)
  )
  model_formula <- as.formula(paste(label, "~ ."))
  # Shuffled, balanced fold labels: fold sizes differ by at most one row and
  # no fold can be empty, unlike thresholding independent uniform draws.
  fold_id <- sample(rep_len(seq_len(k), n))
  # One data frame per fold, bound once at the end instead of growing the
  # result with rbind() on every iteration.
  fold_results <- lapply(seq_len(k), function(i) {
    is_test <- fold_id == i
    held_out <- data[is_test, , drop = FALSE]
    model <- method(model_formula, data = data[!is_test, , drop = FALSE], ...)
    # type is spelled out in full rather than relying on partial matching
    cbind(held_out, predicted = predict(model, held_out, type = "class"))
  })
  do.call(rbind, fold_results)
}

# Example runs: 5-fold cross-validation on iris with several classifiers
ej <- mycv(k = 5, data = iris, label = "Species", method = rpart)
ej2 <- mycv(k = 5, data = iris, label = "Species", method = svm)
ej3 <- mycv(k = 5, data = iris, label = "Species", method = svm, gamma = 0.1)
ej4 <- mycv(k = 5, data = iris, label = "Species", method = naiveBayes)

