Commit e001cc9d authored by Marlon Ulloa Amaya

Merge branch 'master' of https://git.cti.espol.edu.ec/LALA-Project-EN/UCUENCA

Push of client and api source code
parents 1d48a1ce 983c4b80
import os


def _run_step(script):
    """Run one pipeline script with ``python`` and abort the pipeline if it fails.

    Every later step reads the CSV files written by the earlier ones, so
    continuing after a failure would only process stale or missing data.
    """
    status = os.system("python " + script)
    if status != 0:
        raise SystemExit(
            "Pipeline step failed (status {}): {}".format(status, script)
        )


# Step 1: convert the JSON exports into CSV files.
print("Prediciendo Dropout...\n")
_run_step("dropout/jsontocsv.py")
print("Archivos descargados de BD\n")

# Step 2: clean each CSV and compute the model input variables.
for _script in (
    "dropout/get_calificaciones.py",
    "dropout/get_mallas.py",
    "dropout/get_estudiantes.py",
    "dropout/get_pga.py",
    "dropout/calculosasigmallas.py",
):
    _run_step(_script)
print("Variables de entrada calculadas\n")

# Step 3: assemble the final table, then run the prediction.
_run_step("dropout/final.py")
print("Datos finales listos\n")
_run_step("dropout/predict.py")
print("Predicción acabada.")
\ No newline at end of file
import numpy as np
import csv
import pandas as pd
import string
import time
def main():
    """Derive per-student academic indicators from ``asigMallas.csv``.

    Reads ``asigMallas.csv`` and ``estudPGA.csv`` from the working directory
    and writes three files:

    * ``dropout5Years.csv``   - per student, a flag/count that is non-zero when
      two consecutive records are 4 or more years apart, or when the student
      has no record in the last 3 years and is not marked as graduated.
    * ``varHistCredMedia.csv`` - per (id, CARRERA): share of attempted credits
      that were passed (``passDone``) and credit-weighted mean grade
      (``gradeMean``).
    * ``credPassYear.csv``    - per (id, CARRERA): years enrolled relative to
      the nominal duration, passed credits relative to the curriculum total,
      curriculum id and semester progress.
    """
    keys = ['id', 'CARRERA']

    data = pd.read_csv('asigMallas.csv')
    data = data.drop(columns=[
        "ASIGNATURA_CODIGO", "GRUPO", "PERLEC_ID", "FORMA_APROBACION",
        "NOTA1", "NOTA2", "NOTA3", "NOTA4", "NOTA5", "NOTA6", "RESP_ID",
        "MALLA_ANIO", "EJE_FORMACION", "OPTATIVO", "ELECTIVO",
    ])
    data = data.fillna(0)
    # Rows whose id was missing (filled with 0 above) carry no student.
    data = data[data['id'] != 0].copy()
    data['TOTAL_HORAS_MALLA'] = data['TOTAL_HORAS_MALLA'].astype(int)
    data = data.drop(columns=["Unnamed: 0"])

    # Courses without credits fall back to their hours per term.
    data['CREDITOS'] = data['CREDITOS'].where(
        data['CREDITOS'] != 0, data['TOTAL_HORAS_CICLO'])
    data = data.drop(columns=["TOTAL_HORAS_CICLO"])
    data = data.sort_values(['id', 'ANIO'], ascending=[True, True])

    # Remove every student whose records do not start at the first semester.
    first_semester = data.groupby(keys, as_index=False).SEMESTRE_x.min()
    late_ids = first_semester.loc[first_semester['SEMESTRE_x'] != 1, 'id']
    data = data[~data['id'].isin(late_ids)].copy()

    # Dropout by enrolment gap: consecutive records of the same student that
    # are 4 or more years apart. diff() is computed per student, so the first
    # record of a student is never compared with another student's row.
    year_gap = data['ANIO'].astype(int).groupby(data['id']).diff()
    gap_flags = pd.DataFrame({
        'id': data['id'].values,
        'dropout': (year_gap >= 4).astype(int).values,
    })
    dropout = gap_flags.groupby(['id'], as_index=False)['dropout'].sum()

    # Dropout by inactivity: last record at least 3 years old and the student
    # is not marked as graduated (DROPOUT == 1 in estudPGA.csv).
    last_year = data.groupby(['id'], as_index=False).ANIO.max()
    last_year.columns = ['id', 'year']
    current_year = int(time.strftime("%Y"))
    students = pd.read_csv('estudPGA.csv').drop(columns=[
        "Unnamed: 0", "FSE", "ANIO_EGRESO", "ANIO_INGRESO", "CARRERA_ID",
        "DURACION_ANIOS", "SEMESTRE_EGRESO", "SEMESTRE_INGRESO",
    ])
    stateStud = pd.merge(last_year, students, how='inner', on='id')
    stateStud = pd.merge(stateStud, dropout, how='inner', on='id')
    stateStud = stateStud[stateStud['year'].fillna(0) != 0]
    stateStud = stateStud.drop_duplicates(subset='id', keep='first')
    inactive = (
        (current_year >= stateStud['year'] + 3) & (stateStud['DROPOUT'] != 1)
    )
    # Match by student id: stateStud and dropout need not have the same rows.
    inactive_ids = stateStud.loc[inactive, 'id']
    dropout.loc[dropout['id'].isin(inactive_ids), 'dropout'] = 1
    dropout.to_csv("dropout5Years.csv")

    # Weighted grade: second and third enrolments are penalised.
    data['NUMERO_MATRICULA'] = data['NUMERO_MATRICULA'].replace(
        {2: 0.85, 3: 0.75})
    data['NOTA_FINAL'] = (
        data['NUMERO_MATRICULA'] * data['NOTA_FINAL'] * data['CREDITOS'])

    # Highest semester reached relative to the semesters of the curriculum.
    semestres = data.groupby(keys, as_index=False).SEMESTRE_x.max()
    semTot = data.groupby(keys, as_index=False).SEMESTRE_y.max()
    semestres = pd.merge(semestres, semTot, how='outer', on=keys)
    semestres['SEMESTRE_x'] = semestres['SEMESTRE_x'] / semestres['SEMESTRE_y']
    semestres = semestres.drop(columns=["SEMESTRE_y"])

    # Years enrolled divided by the years the degree should take
    # (semesters of the curriculum / 2).
    datamax = data.groupby(keys, as_index=False).ANIO.max()
    datamin = data.groupby(keys, as_index=False).ANIO.min()
    dataMaxMin = pd.merge(datamax, datamin, how='outer', on=keys)
    dataMaxMin['ANIO_x'] = (
        dataMaxMin['ANIO_x'].astype(int) - dataMaxMin['ANIO_y'].astype(int) + 1)
    dataMaxMin = pd.merge(dataMaxMin, semTot, how='outer', on=keys)
    dataMaxMin['ANIO_x'] = dataMaxMin['ANIO_x'] / (dataMaxMin['SEMESTRE_y'] / 2)
    dataMaxMin = dataMaxMin.drop(columns=["ANIO_y", "SEMESTRE_y"])

    # Credit-weighted mean over passed courses.
    # NOTE(review): the status is compared with the string '1', which only
    # matches while the column is read as text - confirm against the data.
    passed = data[data.ESTADO_APROBACION == '1']
    credDone = data.groupby(keys, as_index=False).CREDITOS.sum()
    credSum = passed.groupby(keys, as_index=False).CREDITOS.sum()
    gradeSum = passed.groupby(keys, as_index=False).NOTA_FINAL.sum()
    dataMedia = pd.merge(credSum, gradeSum, how='outer', on=keys)
    # After this merge CREDITOS_x = attempted credits, CREDITOS_y = passed.
    dataMedia = pd.merge(credDone, dataMedia, how='outer', on=keys)
    dataMedia = dataMedia.fillna(0)
    dataMedia['NOTA_FINAL'] = dataMedia['NOTA_FINAL'] / dataMedia['CREDITOS_y']
    dataMedia['CREDITOS_y'] = dataMedia['CREDITOS_y'] / dataMedia['CREDITOS_x']
    dataMedia = dataMedia.fillna(0)  # 0/0 for students with nothing passed
    dataMedia = dataMedia.drop(columns=["CREDITOS_x"])
    dataMedia.columns = ["id", "CARRERA", "passDone", "gradeMean"]
    dataMedia.to_csv("varHistCredMedia.csv")

    # Passed credits relative to the total of the curriculum.
    dataPass = pd.merge(dataMaxMin, credSum, how='outer', on=keys)
    dataPass = dataPass.fillna(0)
    totCred = data.groupby(keys, as_index=False).TOTAL_HORAS_MALLA.max()
    mallaId = data.groupby(keys, as_index=False).MALLA_ID.max()
    dataPass = pd.merge(dataPass, totCred, how='outer', on=keys)
    dataPass = pd.merge(dataPass, mallaId, how='outer', on=keys)
    dataPass = pd.merge(dataPass, semestres, how='outer', on=keys)
    dataPass['CREDITOS'] = dataPass['CREDITOS'] / dataPass['TOTAL_HORAS_MALLA']
    dataPass = dataPass.drop(columns=["TOTAL_HORAS_MALLA"])
    dataPass.to_csv("credPassYear.csv")
    # NOTE: a per-student count of failed courses used to be merged into
    # `data` here, but that result was never written or returned, so the
    # computation has been dropped.


if __name__ == "__main__":
    main()
\ No newline at end of file
import numpy as np
import csv
import pandas as pd
import string
def main():
    """Assemble the model input table and write it to ``Final.csv``.

    Joins ``cambioEstudiantes.csv`` with ``varHistCredMedia.csv`` (on ``id``)
    and ``credPassYear.csv`` (on ``id`` and ``CARRERA``), applies the flags
    from ``dropout5Years.csv``, adds a per (CARRERA, MALLA_ID) rate of
    students with DROPOUT == 0, and keeps only rows that have a CARRERA.
    All files are read from and written to the working directory.
    """
    group_keys = ['CARRERA', 'MALLA_ID']

    dataMedia = pd.read_csv('varHistCredMedia.csv').drop(columns=["Unnamed: 0"])
    dataEstudiante = pd.read_csv('cambioEstudiantes.csv').drop(columns=["Unnamed: 0"])
    dataCredCurs = pd.read_csv('credPassYear.csv').drop(columns=["Unnamed: 0"])

    data = pd.merge(dataEstudiante, dataMedia, how='outer', on=['id'])
    data = pd.merge(data, dataCredCurs, how='outer', on=['id', 'CARRERA'])
    data = data.fillna({'gradeMean': 0, 'CREDITOS': 0, 'FSE': 5})

    # Only used so every (CARRERA, MALLA_ID) pair gets a row in dataRate.
    dataTot = data.groupby(group_keys, as_index=False).id.count()
    dataTot.columns = ['CARRERA', 'MALLA_ID', 'TOT']

    # Students flagged in dropout5Years.csv get DROPOUT = 0.
    dropout = pd.read_csv('dropout5Years.csv').drop(columns=["Unnamed: 0"])
    data = pd.merge(data, dropout, how='outer', on=['id'])
    data.loc[data['dropout'] >= 1, 'DROPOUT'] = 0
    data = data.drop(columns=["dropout"])

    dataPass = data[data.DROPOUT == 1]
    dataPass = dataPass.groupby(group_keys, as_index=False).id.count()
    dataPass.columns = ['CARRERA', 'MALLA_ID', 'PASS']
    dataFail = data[data.DROPOUT == 0]
    dataFail = dataFail.groupby(group_keys, as_index=False).id.count()
    dataFail.columns = ['CARRERA', 'MALLA_ID', 'FAIL']

    dataRate = pd.merge(dataTot, dataPass, how='outer', on=group_keys)
    dataRate = pd.merge(dataRate, dataFail, how='outer', on=group_keys)
    dataRate = dataRate.fillna(0)
    # Students with no DROPOUT value yet are left out of the denominator.
    dataRate['rate'] = dataRate.FAIL / (dataRate.FAIL + dataRate.PASS)
    dataRate = dataRate.drop(columns=["TOT", "PASS", "FAIL"])

    data = pd.merge(data, dataRate, how='outer', on=group_keys)
    data = data.drop_duplicates()
    data = data.sort_values(by='id', ascending=True)
    data = data.drop(columns=[
        "ANIO_EGRESO", "CARRERA_ID", "DURACION_ANIOS", "NOMBRE", "NOTA_FINAL",
        "SEMESTRE_EGRESO", "SEMESTRE_INGRESO", "ANIO_INGRESO",
    ])
    # Positional rename: relies on the column order left by the merges above.
    data.columns = [
        'FSE', 'ID', 'DROPOUT', 'CARRERA', 'PASSDONE', 'GRADEMEAN',
        'YEARSMAT/YEARSDEGREE', 'CREDITSPASSEDDEGREE', 'MALLA_ID',
        'SEMESTERMAT/SEMESTERDEG', 'ABANDONMENTRATE',
    ]
    data['CARRERA'] = data['CARRERA'].fillna(0)
    data = data[data['CARRERA'] != 0]
    data.to_csv("Final.csv")


if __name__ == "__main__":
    main()
\ No newline at end of file
import numpy as np
import csv
import pandas as pd
import string
def main():
    """Clean the grades export and write it to ``cambioCal.csv``.

    Reads ``calificaciones1.csv`` from the working directory, sorts it by
    ``id``, drops the ``Unnamed: 0`` and ``NOTA7`` columns, recodes
    ``ESTADO_APROBACION`` ("APROBADO" -> 1, "REPROBADO" and
    "REPROBADO POR FALTAS" -> 0, any other status is left as is) and fills
    missing grades with 0.
    """
    grades = pd.read_csv('calificaciones1.csv')
    grades = grades.sort_values(by='id', ascending=True)
    grades = grades.drop(columns=["Unnamed: 0", "NOTA7"])

    # Assign the result back instead of mutating a column selection in place,
    # which is not guaranteed to reach the parent frame.
    grades['ESTADO_APROBACION'] = grades['ESTADO_APROBACION'].replace({
        "APROBADO": 1,
        "REPROBADO": 0,
        "REPROBADO POR FALTAS": 0,
    })
    grade_columns = [
        'NOTA1', 'NOTA2', 'NOTA3', 'NOTA4', 'NOTA5', 'NOTA6', 'NOTA_FINAL',
    ]
    grades[grade_columns] = grades[grade_columns].fillna(0)
    grades.to_csv("cambioCal.csv")


if __name__ == "__main__":
    main()
\ No newline at end of file
import numpy as np
import csv
import pandas as pd
import string
def main():
    """Join students with their graduation records into ``cambioEstudiantes.csv``.

    Reads ``estudiantes.csv`` and ``graduados.csv`` from the working
    directory. Every row of ``graduados.csv`` is tagged with ``DROPOUT = 1``
    before the outer join on ``id``, so students without a graduation record
    end up with an empty ``DROPOUT`` value.
    """
    students = pd.read_csv('estudiantes.csv', encoding="ISO-8859-1")
    students = students.sort_values(by='id', ascending=True)
    students = students.drop(columns=["TIPO", "COLEGIO"])

    graduates = pd.read_csv('graduados.csv')
    graduates = graduates.sort_values(by='id', ascending=True)
    graduates = graduates.drop(columns=[
        "Unnamed: 0", "SEMESTRE_INGRESO_DESC", "SEMESTRE_EGRESO_DESC",
    ])
    graduates['DROPOUT'] = 1

    merged = pd.merge(students, graduates, how='outer', on=['id'])
    merged.to_csv("cambioEstudiantes.csv")


if __name__ == "__main__":
    main()
\ No newline at end of file
import numpy as np
import csv
import pandas as pd
import string
def main():
    """Attach curriculum data to every grade record.

    Reads ``mallas.csv`` and ``cambioCal.csv`` from the working directory and
    writes:

    * ``cambioMallas.csv`` - the curriculum sorted by ``CARRERA`` without
      ``NOMBRE_ASIGNATURA``.
    * ``asigMallas.csv``   - grades joined with their curriculum entry on
      (ASIGNATURA_CODIGO, CARRERA). ``SEMESTRE_x`` is the semester of the
      course and ``SEMESTRE_y`` the highest semester of that
      (CARRERA, MALLA_ID) curriculum.
    """
    curricula = pd.read_csv('mallas.csv', encoding="ISO-8859-1")
    curricula = curricula.sort_values(by='CARRERA', ascending=True)
    curricula = curricula.drop(columns=["NOMBRE_ASIGNATURA"])
    curricula.to_csv("cambioMallas.csv")

    grades = pd.read_csv('cambioCal.csv')

    # Keep the group keys as ordinary columns so the merge below joins on
    # columns rather than on index level names.
    last_semester = curricula.groupby(
        ['CARRERA', 'MALLA_ID'], as_index=False)['SEMESTRE'].max()

    merged = pd.merge(
        grades, curricula, how='inner', on=['ASIGNATURA_CODIGO', 'CARRERA'])
    merged = merged.drop(columns=["Unnamed: 0", "DESCRIPCIONPERIODO"])
    # Both sides carry SEMESTRE, so pandas suffixes them _x (course) and
    # _y (curriculum maximum).
    merged = pd.merge(
        merged, last_semester, how='outer', on=['CARRERA', 'MALLA_ID'])
    merged = merged.sort_values(by='id', ascending=True)
    merged.to_csv("asigMallas.csv")


if __name__ == "__main__":
    main()
\ No newline at end of file
import numpy as np
import csv
import pandas as pd
import string
def main():
    """Compute each student's mean PGA and attach it to the student table.

    Reads ``pga.csv`` and ``cambioEstudiantes.csv`` from the working
    directory and writes:

    * ``cambioPGA.csv`` - mean PGA per (id, CARRERA) in column ``MEDIA``.
    * ``estudPGA.csv``  - ``cambioEstudiantes.csv`` outer-joined with those
      means on ``id``, without the ``Unnamed: 0`` and ``NOMBRE`` columns.
    """
    pga = pd.read_csv('pga.csv')
    pga = pga.sort_values(by='id', ascending=True)

    # Sum divided by the number of rows in the group (not by the number of
    # non-missing PGA values), as a single aligned Series operation.
    per_career = pga.groupby(['id', 'CARRERA'])['PGA']
    means = (per_career.sum() / per_career.size()).reset_index(name='MEDIA')
    means.to_csv("cambioPGA.csv")

    means = means.drop(columns=["CARRERA"])
    students = pd.read_csv('cambioEstudiantes.csv')
    merged = pd.merge(students, means, how='outer', on=['id'])
    merged = merged.drop(columns=["Unnamed: 0", "NOMBRE"])
    merged.to_csv("estudPGA.csv")


if __name__ == "__main__":
    main()
\ No newline at end of file
import csv
import json
import pandas as pd
import numpy as np
# Record used for students that have no graduation entry. Its CARRERA_ID of 0
# is what lets those rows be filtered out again before graduados.csv is written.
_NO_GRADUATION = {'CARRERA_ID': 0, 'NOMBRE': 0, 'DURACION_ANIOS': 0, 'ANIO_INGRESO': 0, 'SEMESTRE_INGRESO': 0, 'SEMESTRE_INGRESO_DESC': 0, 'ANIO_EGRESO': 0, 'SEMESTRE_EGRESO': 115, 'SEMESTRE_EGRESO_DESC': 'SEPTIEMBRE 2016-FEBRERO 2017', 'NOTA_FINAL': 0}

_STUDENT_FIELDS = ["COLEGIO", "TIPO", "FSE", "id"]
_MALLA_HEADER = ['MALLA_ID', 'CARRERA', 'MALLA_ANIO', 'SEMESTRE', 'ASIGNATURA_CODIGO', 'NOMBRE_ASIGNATURA', 'CREDITOS', 'TOTAL_HORAS_CICLO', 'EJE_FORMACION', 'OPTATIVO', 'ELECTIVO', 'TOTAL_HORAS_MALLA']
# The JSON export names the first field "MALLA_ID_" (trailing underscore).
_MALLA_KEYS = ["MALLA_ID_"] + _MALLA_HEADER[1:]
_PGA_FIELDS = ["CARRERA", "PERLEC_ID", "DESCRIPCIONPERIODO", "PGA", "id"]
_GRADE_FIELDS = ['CARRERA', 'ASIGNATURA_CODIGO', 'NUMERO_MATRICULA', 'GRUPO', 'ANIO', 'PERLEC_ID', 'DESCRIPCIONPERIODO',
                 'ESTADO_APROBACION', 'FORMA_APROBACION', 'NOTA1', 'NOTA2', 'NOTA3', 'NOTA4', 'NOTA5', 'NOTA6', 'NOTA7', 'NOTA_FINAL', 'RESP_ID', 'id']


def _load_json(path):
    """Parse the JSON document stored at *path* and close the file."""
    with open(path) as handle:
        return json.load(handle)


def _write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path*.

    The file is closed before returning, so a later ``pd.read_csv`` on the
    same path sees every row instead of whatever had been flushed so far.
    """
    with open(path, "w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(rows)


def _convert_students():
    """estudiantes.json -> estudiantes.csv and estudiantes1.csv."""
    students = _load_json('estudiantes.json')
    # The student id is the 1-based position in the export.
    for position, student in enumerate(students, start=1):
        student['id'] = position
    _write_csv("estudiantes.csv", _STUDENT_FIELDS,
               [[student[field] for field in _STUDENT_FIELDS] for student in students])
    pd.read_csv('estudiantes.csv', encoding="ISO-8859-1").to_csv("estudiantes1.csv")


def _convert_graduates():
    """graduados.json -> graduados.csv (only students with a graduation record)."""
    frame = pd.DataFrame(_load_json('graduados.json'))
    frame.columns = ['dict']
    records = [_NO_GRADUATION if entry is None else entry
               for entry in frame['dict'].tolist()]
    frame = pd.DataFrame.from_records(records)
    # Assign the id by column name instead of by a hard-coded column position.
    frame['id'] = range(1, len(frame) + 1)
    frame = frame[frame.CARRERA_ID != 0]
    frame.to_csv("graduados.csv")


def _convert_curricula():
    """mallas.json -> mallas.csv and mallas1.csv (de-duplicated, numbered)."""
    mallas = _load_json('mallas.json')
    _write_csv("mallas.csv", _MALLA_HEADER,
               [[entry[key] for key in _MALLA_KEYS] for entry in mallas])
    frame = pd.read_csv('mallas.csv', encoding="ISO-8859-1")
    frame = frame.drop_duplicates().reset_index(drop=True)
    frame['id'] = range(1, len(frame) + 1)
    frame.to_csv("mallas1.csv")


def _convert_pga():
    """pga.json -> pga.csv and pga1.csv."""
    pga = _load_json('pga.json')
    for position, periods in enumerate(pga, start=1):
        for period in periods:
            period['id'] = position
        if len(periods) == 0:
            # Students without any PGA still get one all-zero row.
            pga[position - 1] = [{"CARRERA": 0, "PERLEC_ID": 0, "DESCRIPCIONPERIODO": 0, "PGA": 0, "id": position}]
    _write_csv("pga.csv", _PGA_FIELDS,
               [[period[field] for field in _PGA_FIELDS]
                for periods in pga for period in periods])
    pd.read_csv('pga.csv', index_col='id').to_csv("pga1.csv")


def _convert_grades():
    """calificaciones.json -> calificaciones.csv and calificaciones1.csv."""
    calif = _load_json('calificaciones.json')
    rows = []
    for position, records in enumerate(calif, start=1):
        # An empty row precedes each student's records, as in the file this
        # script has always produced; pandas skips blank lines when reading.
        rows.append([])
        for record in records:
            record['id'] = position
            rows.append([record[field] for field in _GRADE_FIELDS])
    _write_csv("calificaciones.csv", _GRADE_FIELDS, rows)
    pd.read_csv('calificaciones.csv', encoding="ISO-8859-1").to_csv("calificaciones1.csv")


def main():
    """Convert every JSON export in the working directory to its CSV files."""
    _convert_students()
    _convert_graduates()
    _convert_curricula()
    _convert_pga()
    _convert_grades()


if __name__ == "__main__":
    main()
\ No newline at end of file
import numpy as np
import csv
import pandas as pd
import string
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import metrics
def main():
    """Train a random forest on labelled students and score the unlabelled ones.

    Reads ``Final.csv`` from the working directory. Rows with a ``DROPOUT``
    value are the training set; rows without one are scored. The predicted
    class (``CLAS``) and the probability of the second class (``PROB``) are
    written next to the student data in ``ResultsRF.xlsx``.
    """
    feature_columns = [
        'PASSDONE', 'GRADEMEAN', 'YEARSMAT/YEARSDEGREE', 'SEMESTERMAT/SEMESTERDEG',
    ]

    data = pd.read_csv('Final.csv')
    data = data.drop(columns=["Unnamed: 0"])
    # 2 marks students whose outcome is not known yet.
    data['DROPOUT'] = data['DROPOUT'].fillna(2)
    data = data.dropna()
    data['DROPOUT'] = data['DROPOUT'].replace("", 2)

    labelled = data[data['DROPOUT'] != 2].copy()
    labelled['DROPOUT'] = labelled['DROPOUT'].astype(int)
    pending = data[data['DROPOUT'] == 2].reset_index(drop=True)

    x_train = labelled[feature_columns].values
    y_train = labelled['DROPOUT'].values
    x_pending = pending[feature_columns].values

    # Fit the scaler on the training data only and reuse it for the rows to
    # score, so both sets are expressed on the same scale.
    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_pending = scaler.transform(x_pending)

    # Hyper-parameters are fixed here; presumably they came from an earlier
    # grid search - TODO confirm. 'sqrt' is what 'auto' meant for classifiers.
    forest = RandomForestClassifier(
        bootstrap=True, max_depth=13, max_features='sqrt',
        min_samples_leaf=7, min_samples_split=15, n_estimators=400)
    # Fit once so the class and the probability come from the same forest.
    forest.fit(x_train, y_train)
    probabilities = forest.predict_proba(x_pending)
    classes = forest.predict(x_pending)

    results = pd.concat(
        [
            pending.drop(columns=["DROPOUT"]),
            pd.DataFrame(classes),
            pd.DataFrame(probabilities[:, 1]),
        ],
        axis=1, ignore_index=True)
    results.columns = ['FSE','ID','CARRERA','PASSDONE','MEDIA','YEARSMAT/DEGREE','CREDITPASSDEGREE','MALLA_ID','SEMESTERMAT/DEG','ABANDRATE','CLAS','PROB']
    results.to_excel("ResultsRF.xlsx")


if __name__ == "__main__":
    main()
\ No newline at end of file
import numpy as np
import csv
import pandas as pd
import string
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import metrics
def main():
file = 'Final.csv'
data = pd.read_csv(file)
dataPred = data.drop("id",1).drop("CARRERA",1).drop("Unnamed: 0",1)
#dataPred['DROPOUT'] = dataPred['DROPOUT'].str.strip()
dataPred['DROPOUT'].fillna(2,inplace = True)
#print(dataPred.stateStudent)
dataPred['DROPOUT'].replace("",2,inplace = True)
dataPred = dataPred.fillna(0)
dataTrain = dataPred[dataPred.DROPOUT != 2]
dataTrain.DROPOUT = dataTrain.DROPOUT.astype(int)
dataTest = dataPred[dataPred.DROPOUT == 2]
dataTest['DROPOUT'].replace(2,"",inplace = True)
#dataTrain.to_excel("dataTrain.xlsx")
#dataTest.to_excel("dataTest.xlsx")
x_train = dataTrain.values[:,[0,2,3,4,5,6]]
y_train = dataTrain.values[:,1]
x_test = dataTest.values[:,[0,2,3,4,5,6]]
y_test = dataTest.values[:,1]
robust_scaler = preprocessing.StandardScaler()
x_train = robust_scaler.fit_transform(x_train)
print("Starting cross-validation (" + str(len(x_train)) + ' learners)')
#cfr = DecisionTreeRegressor()
cfr2 = DecisionTreeClassifier()
kf = model_selection.KFold(n_splits=10)
cv = kf.split(x_train)
results = []
res_ce = []
A_A = 0
A_S = 0
S_A = 0
S_S = 0
y_pred_list = list()
y_true_list = list()
y_true2_list = list()
for traincv, testcv in cv:
y_pred = cfr2.fit(x_train[traincv], y_train[traincv]).predict_proba(x_train[testcv])
#results.append(np.sqrt(np.mean((y_pred[:,1] - y_train[testcv])**2)))
y_pr2 = cfr2.fit(x_train[traincv], y_train[traincv]).predict(x_train[testcv])
res_ce.append(np.mean(np.abs(y_pr2 - y_train[testcv])))
y_pr1 = pd.DataFrame(y_train[testcv])
x_pr1 = pd.DataFrame(x_train[testcv])
y_pred2 = pd.DataFrame(y_pr2)
df = pd.DataFrame(y_pred[:,1] )
df = pd.concat([y_pr1,y_pred2, df], axis=1,)
print(df)
df.to_excel("ResultsSVC.xlsx")
# Store results for AUC
for i, v in enumerate(y_pred[:,1]):
y_pred_list.append(v)
y_true_list.append(y_train[testcv][i])