Skip to content
Snippets Groups Projects
Commit f9515957 authored by Pedro Manuel Moreno Marcos's avatar Pedro Manuel Moreno Marcos
Browse files

añadir ficheros de predicción

parent 8a73d0f9
No related branches found
No related tags found
No related merge requests found
entrenar_modelos <- function(dropout_pred = TRUE, sem = 100, fichero = "dataset_uach_lala_info.csv")
{
  # Train predictive models on the UACH indicator dataset.
  #
  # dropout_pred = TRUE : trains 5 classifiers (rf, glm, svmRadial, rpart,
  #   nnet) for the binary `dropout` target, keeps the one with the highest
  #   cross-validated AUC and saves it to "dropout_model.Rda".
  # dropout_pred = FALSE: trains 5 regressors (rf, lm, svmPoly, rpart, nnet)
  #   for the `demora` (delay) target on non-dropout students, keeps the one
  #   with the lowest cross-validated RMSE and saves it to "modelo_demora.Rda".
  # sem / fichero are forwarded to obtenerIndicadoresUACH().
  #
  # Side effects: loads "dropout_uach_lala_info.Rda" (object `dropout`) and
  # "target_uach_lala_info.Rda" (object `target`) from the working directory,
  # and writes one of the two model .Rda files listed above.
  library("dplyr")
  library("caret")

  # Accuracy / recall / precision / F1 / Cohen's kappa from a 2x2 confusion
  # matrix table. Every metric is a ratio of cells, so the formulas work both
  # on raw counts and on the percentage table confusionMatrix(<train>) returns.
  metricas_cm <- function(tab) {
    total     <- tab[1, 1] + tab[2, 1] + tab[1, 2] + tab[2, 2]
    acc       <- (tab[1, 1] + tab[2, 2]) / total
    recall    <- tab[1, 1] / (tab[1, 1] + tab[2, 1])
    precision <- tab[1, 1] / (tab[1, 1] + tab[1, 2])
    f1        <- 2 * (precision * recall) / (precision + recall)
    # Cohen's kappa from the expected agreement of the marginals.
    pr_A <- (tab[1, 1] + tab[1, 2]) / total
    pr_B <- (tab[1, 1] + tab[2, 1]) / total
    pr_e <- pr_A * pr_B + (1 - pr_A) * (1 - pr_B)
    list(acc = acc, recall = recall, precision = precision, f1 = f1,
         kappa = (acc - pr_e) / (1 - pr_e))
  }

  df = obtenerIndicadoresUACH(sem, fichero)

  # Attach the dropout label; the .Rda provides a named list `dropout`
  # (names = student ids, values = 0/1).
  load("dropout_uach_lala_info.Rda")
  dr = as.data.frame(unlist(dropout))
  names(dr)[1] = "dropout"
  names <- rownames(dr)
  rownames(dr) <- NULL
  dr <- cbind(names, dr)
  names(dr)[1] = "id"
  df = merge(df, dr, by = "id")

  # `target` is a named list with one trajectory vector per student; it drives
  # both the delay ("demora") computation and the valid-user filter.
  load("target_uach_lala_info.Rda")
  demora_bach = numeric()
  valid_user = data.frame(id = character(), valid_user = logical(),
                          stringsAsFactors = FALSE)
  for (j in seq_along(target)) {
    usuario = names(target[j])
    # Keep only students with more than 3 entries coded as -1.
    cond_4bach = sum(target[[j]] == -1) > 3
    idx = which(target[[j]] %in% c(-1))
    if (length(idx) == 0) {
      demora = 0
    } else {
      # Delay = position of the last -1 in the trajectory.
      demora = tail(idx, n = 1)
    }
    demora_bach[j] = demora
    # NOTE: c() coerces the logical flag to character ("TRUE"/"FALSE");
    # the filters below rely on that coercion.
    valid_user[j, ] = c(usuario, cond_4bach)
  }
  df = merge(df, valid_user, by = "id")
  df = df[df$valid_user == TRUE, ]
  df2 = df

  # Shared 10-fold CV setup for the classifiers (ROC needs class probs).
  ctrl_clas <- trainControl(method = "repeatedcv", number = 10, repeats = 1,
                            classProbs = TRUE, summaryFunction = twoClassSummary)

  if (dropout_pred == TRUE) {
    df = subset(df, select = -c(id, valid_user))
    df = subset(df, select = -npromcurs)
    # Reorder columns to match conf_norm so results reproduce exactly.
    df = df[, c(5, 1, 2, 4, 3, 6)]
    df$dropout = as.factor(df$dropout)
    levels(df$dropout) <- make.names(levels(factor(df$dropout)))
    df[is.na(df)] <- 0

    # ---- Random Forest ----
    set.seed(2)
    modFit <- train(dropout ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_clas, method = "rf", metric = "ROC")
    best = modFit$results$mtry == as.numeric(modFit$bestTune)
    AUC_rf  = modFit$results[best, ]$ROC
    sens_rf = modFit$results[best, ]$Sens
    spec_rf = modFit$results[best, ]$Spec
    # rf exposes out-of-bag predictions, so use the exact count-based matrix.
    cm = confusionMatrix(modFit$finalModel$predicted, modFit$finalModel$y)
    acc_rf   = as.numeric(cm$overall["Accuracy"])
    kappa_rf = as.numeric(cm$overall["Kappa"])
    m = metricas_cm(cm$table)
    recall_rf    = m$recall
    precision_rf = m$precision
    f1score_rf   = m$f1
    modFit_rf = modFit

    # ---- Logistic Regression (glm) ----
    set.seed(2)
    modFit <- train(dropout ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_clas, method = "glm", metric = "ROC")
    AUC_glm  = modFit$results$ROC
    sens_glm = modFit$results$Sens
    spec_glm = modFit$results$Spec
    m = metricas_cm(confusionMatrix(modFit)$table)
    acc_glm       = m$acc
    recall_glm    = m$recall
    precision_glm = m$precision
    f1score_glm   = m$f1
    kappa_glm     = m$kappa
    modFit_glm = modFit

    # ---- SVM (RBF kernel) ----
    set.seed(2)
    modFit <- train(dropout ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_clas, method = "svmRadial", metric = "ROC")
    # BUGFIX: svmRadial tunes sigma and C; the previous code filtered on
    # `degree`/`scale` columns that only exist for svmPoly, which made
    # sens_svm/spec_svm empty. Select the best row by sigma and C.
    best = modFit$results$sigma == as.numeric(modFit$bestTune$sigma) &
           modFit$results$C == as.numeric(modFit$bestTune$C)
    AUC_svm  = modFit$results[best, ]$ROC
    sens_svm = modFit$results[best, ]$Sens
    spec_svm = modFit$results[best, ]$Spec
    m = metricas_cm(confusionMatrix(modFit)$table)
    acc_svm       = m$acc
    recall_svm    = m$recall
    precision_svm = m$precision
    f1score_svm   = m$f1
    kappa_svm     = m$kappa
    modFit_svm = modFit

    # ---- Decision Tree (rpart) ----
    set.seed(2)
    modFit <- train(dropout ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_clas, method = "rpart", metric = "ROC")
    best = modFit$results$cp == as.numeric(modFit$bestTune)
    AUC_dt  = modFit$results[best, ]$ROC
    sens_dt = modFit$results[best, ]$Sens
    spec_dt = modFit$results[best, ]$Spec
    m = metricas_cm(confusionMatrix(modFit)$table)
    acc_dt       = m$acc
    recall_dt    = m$recall
    precision_dt = m$precision
    f1score_dt   = m$f1
    kappa_dt     = m$kappa
    modFit_dt = modFit

    # ---- Neural Network (nnet) ----
    set.seed(2)
    modFit <- train(dropout ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_clas, method = "nnet", metric = "ROC")
    AUC_nnet  = max(modFit$results$ROC)
    sens_nnet = modFit$results$Sens
    spec_nnet = modFit$results$Spec
    # BUGFIX: these metrics previously reused the glm accuracy and glm
    # precision/recall by copy-paste; compute them from nnet's own matrix.
    m = metricas_cm(confusionMatrix(modFit)$table)
    acc_nnet       = m$acc
    recall_nnet    = m$recall
    precision_nnet = m$precision
    f1score_nnet   = m$f1
    kappa_nnet     = m$kappa
    # BUGFIX: was assigned to the misspelled `modFit_nntet`, so selecting
    # nnet as the best model crashed below.
    modFit_nnet = modFit

    # Keep the classifier with the highest cross-validated AUC.
    AUC_max = max(AUC_dt, AUC_glm, AUC_nnet, AUC_rf, AUC_svm)
    if (AUC_max == AUC_dt) {
      modFit = modFit_dt
    } else if (AUC_max == AUC_glm) {
      modFit = modFit_glm
    } else if (AUC_max == AUC_nnet) {
      modFit = modFit_nnet
    } else if (AUC_max == AUC_rf) {
      modFit = modFit_rf
    } else {
      modFit = modFit_svm
    }
    save(modFit, file = "dropout_model.Rda")
  } else {
    # Delay regression: restrict to valid, non-dropout students.
    dimdf = dim(df2)[2]
    df2[, dimdf + 1] = demora_bach[as.logical(valid_user$valid_user)]
    names(df2)[dimdf + 1] = "demora"
    df3 = df2[df2$dropout == 0, ]
    df = subset(df3, select = -c(id, valid_user, dropout))
    df = subset(df, select = -npromcurs)
    df[is.na(df)] <- 0
    # Drop zero-variance predictors.
    x = nearZeroVar(df, saveMetrics = TRUE)
    df = df[x[, "zeroVar"] == FALSE]

    # Shared 10-fold CV setup for the regressors.
    ctrl_reg <- trainControl(method = "repeatedcv", number = 10, repeats = 1)

    # ---- Random Forest ----
    set.seed(2)
    modFit <- train(demora ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_reg, method = "rf")
    best = modFit$results$mtry == as.numeric(modFit$bestTune)
    RMSE_rf = modFit$results[best, ]$RMSE
    MAE = modFit$results[best, ]$MAE
    modFit_rf = modFit

    # ---- Linear Regression ----
    set.seed(2)
    modFit <- train(demora ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_reg, method = "lm")
    RMSE_lm = modFit$results$RMSE
    modFit_lm = modFit

    # ---- SVM (polynomial kernel; tunes C, degree, scale) ----
    set.seed(2)
    modFit <- train(demora ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_reg, method = "svmPoly")
    best = modFit$results$C == as.numeric(modFit$bestTune$C) &
           modFit$results$degree == as.numeric(modFit$bestTune$degree) &
           modFit$results$scale == as.numeric(modFit$bestTune$scale)
    RMSE_svm = modFit$results[best, ]$RMSE
    modFit_svm = modFit

    # ---- Decision Tree (rpart) ----
    set.seed(2)
    modFit <- train(demora ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = ctrl_reg, method = "rpart")
    RMSE_dt = modFit$results[modFit$results$cp == as.numeric(modFit$bestTune), ]$RMSE
    modFit_dt = modFit

    # ---- Neural Network (linout = TRUE for a linear output unit) ----
    set.seed(2)
    modFit <- train(demora ~ ., data = df, preProcess = c("center", "scale"),
                    trControl = trainControl(method = "repeatedcv", number = 10,
                                             repeats = 1, savePredictions = TRUE),
                    method = "nnet", linout = TRUE)
    RMSE_nnet = min(modFit$results$RMSE)
    modFit_nnet = modFit

    # Keep the regressor with the lowest cross-validated RMSE.
    RMSE_min = min(RMSE_dt, RMSE_lm, RMSE_nnet, RMSE_rf, RMSE_svm)
    if (RMSE_min == RMSE_dt) {
      modFit = modFit_dt
    } else if (RMSE_min == RMSE_lm) {
      modFit = modFit_lm
    } else if (RMSE_min == RMSE_nnet) {
      modFit = modFit_nnet
    } else if (RMSE_min == RMSE_rf) {
      modFit = modFit_rf
    } else {
      modFit = modFit_svm
    }
    save(modFit, file = "modelo_demora.Rda")
  }
}
\ No newline at end of file
source("obtenerIndicadoresUACH.R", encoding="UTF-8")
source("entrenar_modelos.R")

# Build the indicator dataset once; both models score the same students.
df = obtenerIndicadoresUACH()

# Train and apply the dropout classifier (probability of class X1).
entrenar_modelos(TRUE)
load("dropout_model.Rda")    # loads `modFit`
dropout = predict(modFit, newdata = df, type = "prob")$X1

# Train and apply the delay ("demora") regressor.
entrenar_modelos(FALSE)
load("modelo_demora.Rda")    # loads `modFit`
demora = predict(modFit, newdata = df)

# BUGFIX: the previous cbind() coerced every column to character through the
# id column, so `dropout` was written as text and ignored dec = ",", and
# `demora` needed a character->numeric round-trip. Build the data frame
# directly so both prediction columns stay numeric.
predictions = data.frame(id = df$id, dropout = as.numeric(dropout),
                         demora = as.numeric(demora), stringsAsFactors = FALSE)
write.table(predictions, file="predicciones.csv", quote=FALSE, dec=",", sep=";", row.names=FALSE)
\ No newline at end of file
obtenerIndicadoresUACH <- function(sem = 100, fichero = "dataset_uach_lala_info.csv")
{
# Builds one row of normalized academic indicators per student from the raw
# UACH course-record CSV.
#
# sem     : keep only each student's first `sem` configurations (terms);
#           the default 100 effectively keeps everything.
# fichero : path to the semicolon-separated, comma-decimal input CSV.
#
# Returns a data.frame with columns: id, rcursada, raprobadas, repeticiones,
# ranuladas, npromcurs, npromedio (all NAs replaced by 0).
#
# NOTE(review): the student column is referenced as "ï..Estudiante" — the
# UTF-8 BOM artifact read.csv produces for a first header "Estudiante"; the
# input file is assumed to carry that BOM. TODO confirm against the data file.
library("dplyr")
library("caret")
# TRADITIONAL VARIABLES
path = file.path(fichero)
data = read.csv(path, sep = ";", dec = ",", stringsAsFactors = FALSE)
# BUILD A PER-STUDENT CONFIGURATION (TERM) INDEX in column 13 ("nconf").
# Assumes rows are ordered by student and term: the index increments every
# time the (id, year, semester) key changes and resets to 1 on a new student.
data[,13] = 0
names(data)[13] = "nconf"
for(i in 1:dim(data)[1]){
id = data[i,1]
anho = data[i,4]
semestre = data[i,5]
key = paste(id,anho,semestre)
if(i == 1){
key_prev = key
id_prev = id
index = 1
}
if(key != key_prev) {
index = index + 1
}
if(id != id_prev) {
index = 1
}
data[i,13] = index
key_prev = key
id_prev = id
}
# FILTER BY CONFIGURATION: keep only the first `sem` terms of each student.
data = data[data$nconf <= sem,]
data_cursada = data[data$Concepto=="CURSADA",]
data_anulada = data[data$Concepto=="ANULADA",]
# Normalized grade averages (grades divided by the maximum grade, 7.0):
# over all records and over taken ("CURSADA") records only.
npromedio = as.data.frame(summarize(group_by(data, ï..Estudiante), npromedio=mean(Calificación)/7.0))
npromedio_curs = as.data.frame(summarize(group_by(data_cursada, ï..Estudiante), npromcurs=mean(Calificación)/7.0))
# Normalized average over passed courses (grade >= 4.0 counts as a pass).
# NOTE: npromedio_ap is computed but its merge below is commented out.
data_aprobada = data_cursada[data_cursada$Calificación >= 4.0,]
npromedio_ap = as.data.frame(summarize(group_by(data_aprobada, ï..Estudiante), npromap=mean(Calificación)/7.0))
# Per-student record counts: all, taken, and withdrawn ("ANULADA").
nasignaturas = as.data.frame(summarize(group_by(data, ï..Estudiante), asignaturas=n()))
ncursadas = as.data.frame(summarize(group_by(data_cursada, ï..Estudiante), cursadas=n()))
nanuladas = as.data.frame(summarize(group_by(data_anulada, ï..Estudiante), anuladas=n()))
# Taken rate: taken records / all records.
rcursada = merge(nasignaturas, ncursadas, by="ï..Estudiante", all.x=TRUE)
rcursada[,4] = rcursada$cursadas/rcursada$asignaturas
names(rcursada)[4] = "rcursada"
# Pass rate: passed courses / taken courses.
naprobadas = as.data.frame(summarize(group_by(data_cursada, ï..Estudiante), aprobadas=sum(Calificación>=4)))
df = merge(rcursada, naprobadas, by="ï..Estudiante", all.x=TRUE)
df[,6] = df$aprobadas/df$cursadas
names(df)[6] = "raprobadas"
# Repetition rate: distinct courses / taken records (1 means no repeats).
ndistintas = as.data.frame(summarize(group_by(data_cursada, ï..Estudiante), ndistintas=length(unique(Asignatura))))
df = merge(df, ndistintas, by="ï..Estudiante", all.x=TRUE)
df[,8] = df$ndistintas/df$cursadas
names(df)[8] = "repeticiones"
# Withdrawal rate: withdrawn / all records (students with none get 0).
df = merge(df, nanuladas, by="ï..Estudiante", all.x=TRUE)
df$anuladas[is.na(df$anuladas)] <- 0
df[,10] = df$anuladas/df$asignaturas
names(df)[10] = "ranuladas"
df = merge(df, npromedio_curs, by="ï..Estudiante", all.x=TRUE)
df = merge(df, npromedio, by="ï..Estudiante", all.x=TRUE)
#df = merge(df, npromedio_ap, by="ï..Estudiante", all.x=TRUE)
# Drop the raw counts; keep only rates and averages, rename the key to "id".
df = subset(df,select=-c(asignaturas,cursadas,aprobadas, ndistintas,anuladas))
names(df)[1] = "id"
df[is.na(df)] = 0
return(df)
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment