diff --git a/Early Dropout Prediction System/EJECUTABLE.py b/Early Dropout Prediction System/EJECUTABLE.py new file mode 100644 index 0000000000000000000000000000000000000000..b637c3217a2edf291b6892d0c8e086da962600b2 --- /dev/null +++ b/Early Dropout Prediction System/EJECUTABLE.py @@ -0,0 +1,15 @@ +import os + +print("Prediciendo Dropout...\n") +os.system("python dropout/jsontocsv.py") +print("Archivos descargados de BD\n") +os.system("python dropout/get_calificaciones.py") +os.system("python dropout/get_mallas.py") +os.system("python dropout/get_estudiantes.py") +os.system("python dropout/get_pga.py") +os.system("python dropout/calculosasigmallas.py") +print("Variables de entrada calculadas\n") +os.system("python dropout/final.py") +print("Datos finales listos\n") +os.system("python dropout/predict.py") +print("Predicción acabada.") \ No newline at end of file diff --git a/Early Dropout Prediction System/dropout/calculosasigmallas.py b/Early Dropout Prediction System/dropout/calculosasigmallas.py new file mode 100644 index 0000000000000000000000000000000000000000..e570655134fe638994bd4c86ee6811ae3ab90c1c --- /dev/null +++ b/Early Dropout Prediction System/dropout/calculosasigmallas.py @@ -0,0 +1,142 @@ +import numpy as np +import csv +import pandas as pd +import string +import time + +def main(): + + file = 'asigMallas.csv' + data = pd.read_csv(file) + data = data.drop("ASIGNATURA_CODIGO",1).drop("GRUPO",1).drop("PERLEC_ID",1).drop("FORMA_APROBACION",1) + data = data.drop("NOTA1",1).drop("NOTA2",1).drop("NOTA3",1).drop("NOTA4",1).drop("NOTA5",1).drop("NOTA6",1).drop("RESP_ID",1) + data = data.drop("MALLA_ANIO",1).drop("EJE_FORMACION",1).drop("OPTATIVO",1).drop("ELECTIVO",1) + data = data.fillna(0) + data = data[data['id']!=0] + data['TOTAL_HORAS_MALLA'] = data['TOTAL_HORAS_MALLA'].astype(int) + data.drop("Unnamed: 0",1,inplace=True) + for i in range(len(data)): + if data['CREDITOS'].values[i] == 0: + data['CREDITOS'].values[i] = 
data['TOTAL_HORAS_CICLO'].values[i] + data.drop("TOTAL_HORAS_CICLO",1,inplace = True) + data = data.sort_values(['id', 'ANIO'], ascending=[True, True]) + + #Quitamos todos los estudiantes que no tengan datos desde el primer semestre + malSem = data.groupby(['id','CARRERA'],as_index=False).SEMESTRE_x.min() + malSem = malSem[malSem['SEMESTRE_x']!=1] + + for i in range(len(malSem)): + data = data[data['id']!=malSem['id'].values[i]] + + + #Calculamos numero de suspensos por estudiante + fail = data[data.ESTADO_APROBACION == '0'] + fail['ESTADO_APROBACION'].replace(0,1,inplace = True) + numFail = fail.groupby(['id','CARRERA'],as_index=False).size().to_frame() + numFail.columns = ['Count'] + + #Dropout despues de 2 años sin matricularse + years = np.array((data.ANIO).astype(int)) + ids = np.array(data.id) + dropout = [] + for i in range(len(years)): + if ids[i] == ids[i-1]: + if (years[i]-years[i-1])>=4: + dropout.append(1) + else: + dropout.append(0) + else: + dropout.append(0) + + dropout = pd.DataFrame(dropout) + ids = pd.DataFrame(ids) + dropid =pd.concat([ids, dropout], axis=1,) + dropid.columns = ['id','dropout'] + dropout = dropid.groupby(['id'],as_index=False).dropout.sum() + + #Si lleva mas de los ultimos 2 años sin ir y no se ha graduado + datamax = data.groupby(['id'],as_index=False).ANIO.max() + datamax.columns = ['id','year'] + añoActual = int(time.strftime("%Y")) + + fileE = 'estudPGA.csv' + df = pd.read_csv(fileE) + dataEstudiante = df.drop("Unnamed: 0",1).drop("FSE",1).drop("ANIO_EGRESO",1).drop("ANIO_INGRESO",1).drop("CARRERA_ID",1).drop("DURACION_ANIOS",1) + dataEstudiante = dataEstudiante.drop("SEMESTRE_EGRESO",1).drop("SEMESTRE_INGRESO",1) + + stateStud = pd.merge(datamax,dataEstudiante,how='inner',on = 'id') + stateStud = pd.merge(stateStud,dropout,how='inner',on = 'id') + stateStud.year.fillna(0,inplace = True) + stateStud = stateStud[stateStud['year']!=0] + stateStud = stateStud.drop_duplicates(subset='id', keep='first', inplace=False) + + for i in 
range(len(stateStud)): + if añoActual >= (stateStud.year.values[i]+3) and stateStud.DROPOUT.values[i] != 1: + dropout.dropout.values[i] = 1 + + dropout.to_csv("dropout5Years.csv") + + #Media ponderada + data['NUMERO_MATRICULA'].replace(2,0.85,inplace = True) + data['NUMERO_MATRICULA'].replace(3,0.75,inplace = True) + data.NOTA_FINAL = data.NUMERO_MATRICULA*data.NOTA_FINAL*data.CREDITOS + + + #Semestres que lleva un estudiante en la universidad + semestres = data.groupby(['id','CARRERA'],as_index=False).SEMESTRE_x.max() + semTot = data.groupby(['id','CARRERA'],as_index=False).SEMESTRE_y.max() + semestres = pd.merge(semestres, semTot, how='outer', on=['id','CARRERA']) + semestres.SEMESTRE_x = semestres.SEMESTRE_x/semTot.SEMESTRE_y + semestres.drop("SEMESTRE_y",1,inplace = True) + + #Años desde que empezo + datamax = data.groupby(['id','CARRERA'],as_index=False).ANIO.max() + datamin = data.groupby(['id','CARRERA'],as_index=False).ANIO.min() + dataMaxMin = pd.merge(datamax, datamin, how='outer', on=['id','CARRERA']) + dataMaxMin.ANIO_x = (dataMaxMin.ANIO_x).astype(int) + dataMaxMin.ANIO_y = (dataMaxMin.ANIO_y).astype(int) + dataMaxMin.ANIO_x = dataMaxMin.ANIO_x - dataMaxMin.ANIO_y + 1 + dataMaxMin = pd.merge(dataMaxMin, semTot, how='outer', on=['id','CARRERA']) + #años que lleva/años que deberia durar la carrera + dataMaxMin.ANIO_x = dataMaxMin.ANIO_x/(dataMaxMin.SEMESTRE_y/2) + dataMaxMin = dataMaxMin.drop("ANIO_y",1).drop("SEMESTRE_y",1) + + + #Empezamos a calcular la media + dataS = data[data.ESTADO_APROBACION == '1'] + credDone = data.groupby(['id','CARRERA'],as_index=False).CREDITOS.sum() + credSum = dataS.groupby(['id','CARRERA'],as_index=False).CREDITOS.sum() + gradeSum = dataS.groupby(['id','CARRERA'],as_index=False).NOTA_FINAL.sum()#Suma de las notas poderadas + dataMedia = pd.merge(credSum, gradeSum, how='outer', on=['id','CARRERA']) + dataMedia = pd.merge(credDone, dataMedia, how='outer', on=['id','CARRERA']) + dataMedia = dataMedia.fillna(0) + 
dataMedia.NOTA_FINAL = dataMedia.NOTA_FINAL / dataMedia.CREDITOS_y + dataMedia.CREDITOS_y = dataMedia.CREDITOS_y/dataMedia.CREDITOS_x + dataMedia = dataMedia.fillna(0) + + dataMedia.drop("CREDITOS_x",1,inplace = True) + namesTot = ["id","CARRERA", "passDone","gradeMean"] + dataMedia.columns = namesTot + dataMedia.to_csv("varHistCredMedia.csv")#Media de todos los estudiantes + + + #Numero de creditos aprobados + dataPass = pd.merge(dataMaxMin,credSum, how='outer', on=['id','CARRERA']) + dataPass = dataPass.fillna(0) + totCred = data.groupby(['id','CARRERA'],as_index=False).TOTAL_HORAS_MALLA.max() + mallaId = data.groupby(['id','CARRERA'],as_index=False).MALLA_ID.max() + dataPass = pd.merge(dataPass,totCred,how = 'outer', on = ['id','CARRERA']) + dataPass = pd.merge(dataPass,mallaId,how = 'outer', on = ['id','CARRERA']) + dataPass = pd.merge(dataPass,semestres,how = 'outer', on = ['id','CARRERA']) + dataPass['CREDITOS'] = dataPass.CREDITOS/dataPass.TOTAL_HORAS_MALLA + dataPass.drop("TOTAL_HORAS_MALLA",1,inplace = True) + dataPass.to_csv("credPassYear.csv")#creditos totales aprobados + + + data = pd.merge(data, numFail, how='outer', on=['id']) + data['Count'] = data['Count'].fillna(0) + data = data[data.ESTADO_APROBACION != '0'] + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/dropout/final.py b/Early Dropout Prediction System/dropout/final.py new file mode 100644 index 0000000000000000000000000000000000000000..f447b1921c1108e96af103e2ba072d9d80cdaf1d --- /dev/null +++ b/Early Dropout Prediction System/dropout/final.py @@ -0,0 +1,69 @@ +import numpy as np +import csv +import pandas as pd +import string + +def main(): + file = 'varHistCredMedia.csv' + dataMedia = pd.read_csv(file) + dataMedia = dataMedia.drop("Unnamed: 0",1) + + fileE = 'cambioEstudiantes.csv' + df = pd.read_csv(fileE) + dataEstudiante = df.drop("Unnamed: 0",1) + + fileC = 'credPassYear.csv' + dataCredCurs = pd.read_csv(fileC) + 
dataCredCurs = dataCredCurs.drop("Unnamed: 0",1) + + data = pd.merge(dataEstudiante, dataMedia, how='outer', on=['id']) + data = pd.merge(data, dataCredCurs, how='outer', on=['id','CARRERA']) + data['gradeMean'].fillna(0,inplace = True) + data['CREDITOS'].fillna(0,inplace = True) + data['FSE'].fillna(5,inplace = True) + + dataTot = data.groupby(['CARRERA','MALLA_ID'],as_index=False).id.count() + dataTot.columns = ['CARRERA','MALLA_ID','TOT'] + + #Ponemos el dropout de los 3 años sin acudir a clase + dropout = 'dropout5Years.csv' + dropout = pd.read_csv(dropout) + dropout.drop("Unnamed: 0",1,inplace = True) + data = pd.merge(data, dropout, how='outer', on=['id']) + + for i in range(len(data)): + if data.dropout.values[i] >= 1 : + data.DROPOUT.values[i] = 0 + data.drop("dropout",1,inplace=True) + dataPass = data[data.DROPOUT == 1] + dataPass = dataPass.groupby(['CARRERA','MALLA_ID'],as_index=False).id.count() + dataPass.columns = ['CARRERA','MALLA_ID','PASS'] + dataFail = data[data.DROPOUT == 0] + dataFail = dataFail.groupby(['CARRERA','MALLA_ID'],as_index=False).id.count() + dataFail.columns = ['CARRERA','MALLA_ID','FAIL'] + + dataRate = pd.merge(dataTot, dataPass, how='outer', on=['CARRERA','MALLA_ID']) + dataRate = pd.merge(dataRate, dataFail, how='outer', on=['CARRERA','MALLA_ID']) + + dataRate.fillna(0,inplace = True) + + + #No se tiene en cuenta el número de estudiantes que están cursando ahora para el total de los alumnos + dataRate['rate'] = dataRate.FAIL/(dataRate.FAIL+dataRate.PASS) + dataRate = dataRate.drop("TOT",1).drop("PASS",1).drop("FAIL",1) + data = pd.merge(data, dataRate, how='outer', on=['CARRERA','MALLA_ID']) + data = data.drop_duplicates() + + + data = data.sort_values(by='id', ascending=True) + data = data.drop("ANIO_EGRESO",1).drop("CARRERA_ID",1).drop("DURACION_ANIOS",1).drop("NOMBRE",1).drop("NOTA_FINAL",1).drop("SEMESTRE_EGRESO",1) + data = data.drop("SEMESTRE_INGRESO",1).drop("ANIO_INGRESO",1) + columnas = 
['FSE','ID','DROPOUT','CARRERA','PASSDONE','GRADEMEAN','YEARSMAT/YEARSDEGREE','CREDITSPASSEDDEGREE','MALLA_ID','SEMESTERMAT/SEMESTERDEG','ABANDONMENTRATE'] + data.columns = columnas + data.CARRERA = data['CARRERA'].fillna(0) + data = data[data['CARRERA']!= 0] + data.to_csv("Final.csv") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/dropout/get_calificaciones.py b/Early Dropout Prediction System/dropout/get_calificaciones.py new file mode 100644 index 0000000000000000000000000000000000000000..105e29626898ef6cffd56c87f820a82ed26a9739 --- /dev/null +++ b/Early Dropout Prediction System/dropout/get_calificaciones.py @@ -0,0 +1,30 @@ +import numpy as np +import csv +import pandas as pd +import string + + +def main(): + + file = 'calificaciones1.csv' + df = pd.read_csv(file) + df = df.sort_values(by='id', ascending=True) + df.drop("Unnamed: 0",1,inplace = True) + df = df.drop("NOTA7",1) + + df['ESTADO_APROBACION'].replace("APROBADO",1,inplace = True) + df['ESTADO_APROBACION'].replace("REPROBADO",0,inplace = True) + df['ESTADO_APROBACION'].replace("REPROBADO POR FALTAS",0,inplace = True) + + df['NOTA1'].fillna(0,inplace = True) + df['NOTA2'].fillna(0,inplace = True) + df['NOTA3'].fillna(0,inplace = True) + df['NOTA4'].fillna(0,inplace = True) + df['NOTA5'].fillna(0,inplace = True) + df['NOTA6'].fillna(0,inplace = True) + df['NOTA_FINAL'].fillna(0,inplace = True) + + df.to_csv("cambioCal.csv") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/dropout/get_estudiantes.py b/Early Dropout Prediction System/dropout/get_estudiantes.py new file mode 100644 index 0000000000000000000000000000000000000000..5fdfe1dd3514bec52a577477f4c2d1cce97f281d --- /dev/null +++ b/Early Dropout Prediction System/dropout/get_estudiantes.py @@ -0,0 +1,26 @@ +import numpy as np +import csv +import pandas as pd +import string + + +def main(): + + file = 'estudiantes.csv' + 
df = pd.read_csv(file,encoding = "ISO-8859-1") + df = df.sort_values(by='id', ascending=True) + df = df.drop("TIPO",1).drop("COLEGIO",1) + + file1 = 'graduados.csv' + df1 = pd.read_csv(file1) + df1 = df1.sort_values(by='id', ascending=True) + df1.drop("Unnamed: 0",1,inplace = True) + df1 = df1.drop("SEMESTRE_INGRESO_DESC",1).drop("SEMESTRE_EGRESO_DESC",1) + + df1['DROPOUT'] = 1 + df = pd.merge(df, df1, how='outer', on=['id']) + + df.to_csv("cambioEstudiantes.csv") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/dropout/get_mallas.py b/Early Dropout Prediction System/dropout/get_mallas.py new file mode 100644 index 0000000000000000000000000000000000000000..a2865eb82d180c6d1eab09347ce381ea9c28360a --- /dev/null +++ b/Early Dropout Prediction System/dropout/get_mallas.py @@ -0,0 +1,27 @@ +import numpy as np +import csv +import pandas as pd +import string + + +def main(): + + file = 'mallas.csv' + df = pd.read_csv(file,encoding = "ISO-8859-1") + df = df.sort_values(by='CARRERA', ascending=True) + df = df.drop("NOMBRE_ASIGNATURA",1) + + df.to_csv("cambioMallas.csv") + + file = 'cambioCal.csv' + df1 = pd.read_csv(file) + + semestresMalla = df.groupby(['CARRERA','MALLA_ID']).SEMESTRE.max().to_frame() + + df = pd.merge(df1,df,how='inner', on=['ASIGNATURA_CODIGO','CARRERA']).drop("Unnamed: 0",1).drop("DESCRIPCIONPERIODO",1) + df = pd.merge(df,semestresMalla, how= 'outer', on = ['CARRERA','MALLA_ID']) + df = df.sort_values(by='id', ascending=True) + + df.to_csv("asigMallas.csv") +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/dropout/get_pga.py b/Early Dropout Prediction System/dropout/get_pga.py new file mode 100644 index 0000000000000000000000000000000000000000..a2d0f27c30cdc2e68e7f6bbd93d0f91f43cd24de --- /dev/null +++ b/Early Dropout Prediction System/dropout/get_pga.py @@ -0,0 +1,37 @@ +import numpy as np +import csv +import pandas as pd +import 
string + + +def main(): + + file = 'pga.csv' + df = pd.read_csv(file) + df = df.sort_values(by='id', ascending=True) + + df1 = df.groupby(['id','CARRERA']).PGA.sum().to_frame() + df2 = df.groupby(['id','CARRERA']).id.count() + df1 = df1.reset_index() + + df2 = df2.to_frame() + df2.columns = ['COUNT'] + + df1 = pd.merge(df1, df2, how='outer', on=['id','CARRERA']) + + df1.PGA = df1.PGA/df1.COUNT + df1 = df1.drop("COUNT",1) + df1.columns = ['id','CARRERA','MEDIA'] + + df1.to_csv("cambioPGA.csv") + + df1.drop("CARRERA",1,inplace = True) + + file = 'cambioEstudiantes.csv' + df = pd.read_csv(file) + + df = pd.merge(df, df1, how='outer', on=['id']).drop("Unnamed: 0",1).drop("NOMBRE",1) + + df.to_csv("estudPGA.csv") +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/dropout/jsontocsv.py b/Early Dropout Prediction System/dropout/jsontocsv.py new file mode 100644 index 0000000000000000000000000000000000000000..37f9e5f85dcbf93c9dd125691da1dc9b38d48e76 --- /dev/null +++ b/Early Dropout Prediction System/dropout/jsontocsv.py @@ -0,0 +1,149 @@ +import csv +import json +import pandas as pd +import numpy as np + +est = json.loads(open('estudiantes.json').read()) + +f = csv.writer(open("estudiantes.csv", "w+")) + + +# Write CSV Header, If you dont need that, remove this line +for y in range(len(est)): + + est[y]['id'] = y+1 + +f.writerow(["COLEGIO", "TIPO", "FSE","id"]) + +for x in est: + f.writerow([x["COLEGIO"], + x["TIPO"], + x["FSE"], + x["id"]]) +file = 'estudiantes.csv' +df = pd.read_csv(file,encoding = "ISO-8859-1") +df.to_csv("estudiantes1.csv") + + +grad = json.loads(open('graduados.json').read()) + +df = pd.DataFrame(grad) +df.columns = ['dict'] + +arr = np.array(df['dict']).tolist() +arr = [{'CARRERA_ID': 0, 'NOMBRE': 0, 'DURACION_ANIOS': 0, 'ANIO_INGRESO': 0, 'SEMESTRE_INGRESO': 0, 'SEMESTRE_INGRESO_DESC': 0, 'ANIO_EGRESO': 0, 'SEMESTRE_EGRESO': 115, 'SEMESTRE_EGRESO_DESC': 'SEPTIEMBRE 2016-FEBRERO 2017', 
'NOTA_FINAL': 0} if v is None else v for v in arr] + +df = pd.DataFrame.from_records(arr) + +df['id'] = 1 +arr = np.array(df) +for i in range(len(arr)): + arr[i][10] = i+1 +df = pd.DataFrame(arr, columns=df.columns) +df = df[df.CARRERA_ID !=0] +df.to_csv("graduados.csv") + + + +mallas = json.loads(open('mallas.json').read()) + + +f = csv.writer(open("mallas.csv", "w+")) +# Write CSV Header, If you dont need that, remove this line + +f.writerow(['MALLA_ID', 'CARRERA', 'MALLA_ANIO', 'SEMESTRE', 'ASIGNATURA_CODIGO', 'NOMBRE_ASIGNATURA', 'CREDITOS', 'TOTAL_HORAS_CICLO', 'EJE_FORMACION', 'OPTATIVO', 'ELECTIVO','TOTAL_HORAS_MALLA']) + +for x in mallas: + f.writerow([x["MALLA_ID_"], + x["CARRERA"], + x["MALLA_ANIO"], + x["SEMESTRE"], + x["ASIGNATURA_CODIGO"], + x["NOMBRE_ASIGNATURA"], + x["CREDITOS"], + x["TOTAL_HORAS_CICLO"], + x["EJE_FORMACION"], + x["OPTATIVO"], + x["ELECTIVO"], + x["TOTAL_HORAS_MALLA"]]) + +file = 'mallas.csv' +df = pd.read_csv(file,encoding = "ISO-8859-1") +df = df.drop_duplicates() +df['id'] = 1 +arr = np.array(df) +for i in range(len(arr)): + arr[i][12] = i+1 +df = pd.DataFrame(arr, columns=df.columns) +df.to_csv("mallas1.csv") + + + +pga = json.loads(open('pga.json').read()) + +f = csv.writer(open("pga.csv", "w+")) + +for y in range(len(pga)): + for x in range(len(pga[y])): + pga[y][x]['id'] = y+1 + if len(pga[y]) == 0: + pga[y] = [{"CARRERA":0, "PERLEC_ID":0, "DESCRIPCIONPERIODO":0,"PGA":0,"id":y+1}] + + + +# Write CSV Header, If you dont need that, remove this line + +f.writerow(["CARRERA", "PERLEC_ID", "DESCRIPCIONPERIODO","PGA","id"]) + +for y in pga: + for x in y: + f.writerow([x["CARRERA"], + x["PERLEC_ID"], + x["DESCRIPCIONPERIODO"], + x["PGA"], + x["id"]]) + + +file = 'pga.csv' +df = pd.read_csv(file,index_col='id') +df.to_csv("pga1.csv") + +calif = json.loads(open('calificaciones.json').read()) + + +f = csv.writer(open("calificaciones.csv", "w+")) + +for y in range(len(calif)): + for x in range(len(calif[y])): + calif[y][x]['id'] = y+1 + 
+# Write CSV Header, If you dont need that, remove this line + +f.writerow(['CARRERA', 'ASIGNATURA_CODIGO', 'NUMERO_MATRICULA', 'GRUPO', 'ANIO', 'PERLEC_ID', 'DESCRIPCIONPERIODO', + 'ESTADO_APROBACION', 'FORMA_APROBACION', 'NOTA1', 'NOTA2', 'NOTA3', 'NOTA4', 'NOTA5', 'NOTA6', 'NOTA7', 'NOTA_FINAL', 'RESP_ID','id']) + +for y in calif: + f.writerow("") + for x in y: + f.writerow([x["CARRERA"], + x["ASIGNATURA_CODIGO"], + x["NUMERO_MATRICULA"], + x["GRUPO"], + x["ANIO"], + x["PERLEC_ID"], + x["DESCRIPCIONPERIODO"], + x["ESTADO_APROBACION"], + x["FORMA_APROBACION"], + x["NOTA1"], + x["NOTA2"], + x["NOTA3"], + x["NOTA4"], + x["NOTA5"], + x["NOTA6"], + x["NOTA7"], + x["NOTA_FINAL"], + x["RESP_ID"], + x["id"]]) +file = 'calificaciones.csv' +df = pd.read_csv(file,encoding = "ISO-8859-1") +df.to_csv("calificaciones1.csv") \ No newline at end of file diff --git a/Early Dropout Prediction System/dropout/predict.py b/Early Dropout Prediction System/dropout/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..9b59a6b4fb8c526c1a36fc377e289550c7c2b4fc --- /dev/null +++ b/Early Dropout Prediction System/dropout/predict.py @@ -0,0 +1,81 @@ +import numpy as np +import csv +import pandas as pd +import string +from sklearn import model_selection +from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import RandomForestClassifier +from sklearn import preprocessing +import matplotlib.pyplot as plt +from sklearn import metrics + +def main(): + file = 'Final.csv' + data = pd.read_csv(file) + data.drop("Unnamed: 0",1,inplace=True) + data['DROPOUT'].fillna(2,inplace = True) + data = data.dropna() + data['DROPOUT'].replace("",2,inplace = True) + + dataPred = data.fillna(0) + + dataTrain = dataPred[dataPred.DROPOUT != 2] + dataTrain.DROPOUT = dataTrain.DROPOUT.astype(int) + dataTrain = dataTrain.drop("ID",1).drop("CARRERA",1) + + dataF = dataPred[dataPred.DROPOUT == 2] + dataTest = dataPred[dataPred.DROPOUT == 2] + 
dataTest['DROPOUT'].replace(2,"",inplace = True) + dataTest = dataTest.drop("ID",1).drop("CARRERA",1) + + x_train = dataTrain.values[:,[2,3,4,7]] + y_train = dataTrain.values[:,1] + + x_test1 = dataTest.values[:,[2,3,4,7]] + y_test = dataTest.values[:,1] + + + robust_scaler = preprocessing.StandardScaler() + x_train = robust_scaler.fit_transform(x_train) + x_test = robust_scaler.fit_transform(x_test1) + + """ + cfr = RandomForestClassifier() + + from sklearn.model_selection import GridSearchCV + # Create the parameter grid based on the results of random search + param_grid = { + 'max_depth': range(3,20,1), + 'max_features': ['auto'], + 'min_samples_leaf': range(1,20,1), + 'min_samples_split': range(3,20,1), + 'n_estimators': range(300,600,25) + } + + # Instantiate the grid search model + grid_search = GridSearchCV(estimator = cfr, param_grid = param_grid, + cv = 15, n_jobs = -1, verbose = 2) + # Fit the grid search to the data + grid_search.fit(x_train, y_train) + print("\n") + print(grid_search.best_params_) + print("\n") + """ + cfr2 = RandomForestClassifier(bootstrap = True, max_depth = 13, max_features='auto',min_samples_leaf = 7, min_samples_split = 15, n_estimators = 400) + y_pred = cfr2.fit(x_train, y_train).predict_proba(x_test) + y_pr2 = cfr2.fit(x_train, y_train).predict(x_test) + + #y_pred = grid_search.fit(x_train, y_train).predict_proba(x_test) + #y_pr2 = grid_search.fit(x_train, y_train).predict(x_test) + + x_pr1 = pd.DataFrame(x_test1) + y_pred2 = pd.DataFrame(y_pr2) + df = pd.DataFrame(y_pred[:,1] ) + dataF = dataF.reset_index(drop=True) + dataF.drop("DROPOUT",1,inplace=True) + df = pd.concat([dataF,y_pred2, df], axis=1, ignore_index=True) + df.columns = ['FSE','ID','CARRERA','PASSDONE','MEDIA','YEARSMAT/DEGREE','CREDITPASSDEGREE','MALLA_ID','SEMESTERMAT/DEG','ABANDRATE','CLAS','PROB'] + df.to_excel("ResultsRF.xlsx") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/predDecTree.py b/Early 
Dropout Prediction System/predDecTree.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9fdb48ce0cc9e2076bd3128c5299cfbc95c53f --- /dev/null +++ b/Early Dropout Prediction System/predDecTree.py @@ -0,0 +1,167 @@ +import numpy as np +import csv +import pandas as pd +import string +from sklearn import model_selection +from sklearn.tree import DecisionTreeRegressor +from sklearn.tree import DecisionTreeClassifier +from sklearn import preprocessing +import matplotlib.pyplot as plt +from sklearn import metrics + +def main(): + + file = 'Final.csv' + data = pd.read_csv(file) + dataPred = data.drop("id",1).drop("CARRERA",1).drop("Unnamed: 0",1) + #dataPred['DROPOUT'] = dataPred['DROPOUT'].str.strip() + + + dataPred['DROPOUT'].fillna(2,inplace = True) + #print(dataPred.stateStudent) + dataPred['DROPOUT'].replace("",2,inplace = True) + dataPred = dataPred.fillna(0) + + dataTrain = dataPred[dataPred.DROPOUT != 2] + dataTrain.DROPOUT = dataTrain.DROPOUT.astype(int) + + dataTest = dataPred[dataPred.DROPOUT == 2] + dataTest['DROPOUT'].replace(2,"",inplace = True) + + #dataTrain.to_excel("dataTrain.xlsx") + #dataTest.to_excel("dataTest.xlsx") + + x_train = dataTrain.values[:,[0,2,3,4,5,6]] + y_train = dataTrain.values[:,1] + + x_test = dataTest.values[:,[0,2,3,4,5,6]] + y_test = dataTest.values[:,1] + + robust_scaler = preprocessing.StandardScaler() + x_train = robust_scaler.fit_transform(x_train) + + + print("Starting cross-validation (" + str(len(x_train)) + ' learners)') + + #cfr = DecisionTreeRegressor() + cfr2 = DecisionTreeClassifier() + + kf = model_selection.KFold(n_splits=10) + cv = kf.split(x_train) + + results = [] + res_ce = [] + A_A = 0 + A_S = 0 + S_A = 0 + S_S = 0 + + y_pred_list = list() + y_true_list = list() + y_true2_list = list() + + for traincv, testcv in cv: + y_pred = cfr2.fit(x_train[traincv], y_train[traincv]).predict_proba(x_train[testcv]) + #results.append(np.sqrt(np.mean((y_pred[:,1] - y_train[testcv])**2))) + y_pr2 = 
cfr2.fit(x_train[traincv], y_train[traincv]).predict(x_train[testcv]) + res_ce.append(np.mean(np.abs(y_pr2 - y_train[testcv]))) + y_pr1 = pd.DataFrame(y_train[testcv]) + x_pr1 = pd.DataFrame(x_train[testcv]) + y_pred2 = pd.DataFrame(y_pr2) + df = pd.DataFrame(y_pred[:,1] ) + df = pd.concat([y_pr1,y_pred2, df], axis=1,) + print(df) + df.to_excel("ResultsSVC.xlsx") + + # Store results for AUC + for i, v in enumerate(y_pred[:,1]): + y_pred_list.append(v) + y_true_list.append(y_train[testcv][i]) + y_true2_list.append(y_train[testcv][i]) + # Certificate earners + for i, val in enumerate(y_pr2): + if y_pr2[i] == 1 and y_train[testcv][i] == 1: + A_A += 1 + if y_pr2[i] == 0 and y_train[testcv][i] == 1: + A_S += 1 + if y_pr2[i] == 1 and y_train[testcv][i] == 0: + S_A += 1 + if y_pr2[i] == 0 and y_train[testcv][i] == 0: + S_S += 1 + #print out the mean of the cross-validated results + RMSE = np.array(results).mean() + #print("RMSE: " + str( RMSE)) + accuracy = (A_A+S_S)/((A_A+A_S+S_A+S_S)*1.0) + print("Results CE: " + str(1-np.array(res_ce).mean()) + " / " + str(accuracy)) + # Results about certificate earners + print(str(A_A) + "\t" + str(A_S)) + print(str(S_A) + "\t" + str(S_S)) + TP = A_A + FP = A_S + FN = S_A + TN = S_S + try: + recall = TP / ((TP+FN)*1.0); + except: + recall = 0 + try: + precision = TP / ((TP+FP)*1.0); + except: + precision = 0 + try: + specificity = TN / ((TN+FP)*1.0) + except: + specificicty = 0 + try: + NPV = TN / ((FN+TN)*1.0); + except: + NPV = 0 + try: + F_score = (2*TP)/((2*TP+FP+FN)*1.0) + except: + F_score = 0 + + print('Recall: ' + str(recall)) + print('Precision: ' + str(precision)) + print('Specificity: ' + str(specificity)) + print('NVP:' + str(NPV)) + print('F-score: ' + str(F_score)) + + # Compute AUC + y = np.array(y_true_list) + pred = np.array(y_pred_list) + y_true = np.array(y_true2_list) + fpr, tpr, thresholds = metrics.roc_curve(y, pred) + + AUC = metrics.auc(fpr, tpr) + RMSEsk = np.sqrt(metrics.mean_squared_error(y_true, pred)) + 
MAE = metrics.mean_absolute_error(y_true, pred) + print('AUC: ' + str(AUC)) + + plt.figure() + lw = 2 + plt.plot(fpr, tpr, color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % metrics.auc(fpr, tpr)) + plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver operating characteristic example') + plt.legend(loc="lower right") + plt.show() + + results = dict() + results['RMSE'] = RMSEsk + results['MAE'] = MAE + results['AUC'] = AUC + results['F1'] = F_score + results['recall'] = recall + results['precision'] = precision + results['accuracy'] = accuracy + + print(results) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/predGradBoost.py b/Early Dropout Prediction System/predGradBoost.py new file mode 100644 index 0000000000000000000000000000000000000000..a05fe44d905f3960a0f8a354a26279192ee76b7a --- /dev/null +++ b/Early Dropout Prediction System/predGradBoost.py @@ -0,0 +1,166 @@ +import numpy as np +import csv +import pandas as pd +import string +from sklearn import model_selection +from sklearn.ensemble import GradientBoostingClassifier +from sklearn import preprocessing +import matplotlib.pyplot as plt +from sklearn import metrics + +def main(): + + file = 'Final.csv' + data = pd.read_csv(file) + dataPred = data.drop("id",1).drop("CARRERA",1).drop("Unnamed: 0",1) + #dataPred['DROPOUT'] = dataPred['stateStudent'].str.strip() + + + dataPred['DROPOUT'].fillna(2,inplace = True) + #print(dataPred.stateStudent) + dataPred['DROPOUT'].replace("",2,inplace = True) + dataPred = dataPred.fillna(0) + + dataTrain = dataPred[dataPred.DROPOUT != 2] + dataTrain.DROPOUT = dataTrain.DROPOUT.astype(int) + + dataTest = dataPred[dataPred.DROPOUT == 2] + dataTest['DROPOUT'].replace(2,"",inplace = True) + + #dataTrain.to_excel("dataTrain.xlsx") + 
#dataTest.to_excel("dataTest.xlsx") + + x_train = dataTrain.values[:,[0,2,3,4,5,6]] + y_train = dataTrain.values[:,1] + + x_test = dataTest.values[:,[0,2,3,4,5,6]] + y_test = dataTest.values[:,1] + + robust_scaler = preprocessing.StandardScaler() + x_train = robust_scaler.fit_transform(x_train) + + + print("Starting cross-validation (" + str(len(x_train)) + ' learners)') + + #cfr = DecisionTreeRegressor() + cfr2 = GradientBoostingClassifier() + + kf = model_selection.KFold(n_splits=10) + cv = kf.split(x_train) + + results = [] + res_ce = [] + A_A = 0 + A_S = 0 + S_A = 0 + S_S = 0 + + y_pred_list = list() + y_true_list = list() + y_true2_list = list() + + for traincv, testcv in cv: + y_pred = cfr2.fit(x_train[traincv], y_train[traincv]).predict_proba(x_train[testcv]) + #results.append(np.sqrt(np.mean((y_pred[:,1] - y_train[testcv])**2))) + y_pr2 = cfr2.fit(x_train[traincv], y_train[traincv]).predict(x_train[testcv]) + res_ce.append(np.mean(np.abs(y_pr2 - y_train[testcv]))) + y_pr1 = pd.DataFrame(y_train[testcv]) + x_pr1 = pd.DataFrame(x_train[testcv]) + y_pred2 = pd.DataFrame(y_pr2) + df = pd.DataFrame(y_pred[:,1] ) + df = pd.concat([y_pr1,y_pred2, df], axis=1,) + print(df) + df.to_excel("ResultsDT.xlsx") + + # Store results for AUC + for i, v in enumerate(y_pred[:,1]): + y_pred_list.append(v) + y_true_list.append(y_train[testcv][i]) + y_true2_list.append(y_train[testcv][i]) + # Certificate earners + for i, val in enumerate(y_pr2): + if y_pr2[i] == 1 and y_train[testcv][i] == 1: + A_A += 1 + if y_pr2[i] == 0 and y_train[testcv][i] == 1: + A_S += 1 + if y_pr2[i] == 1 and y_train[testcv][i] == 0: + S_A += 1 + if y_pr2[i] == 0 and y_train[testcv][i] == 0: + S_S += 1 + #print out the mean of the cross-validated results + #RMSE = np.array(results).mean() + #print("RMSE: " + str( RMSE)) + accuracy = (A_A+S_S)/((A_A+A_S+S_A+S_S)*1.0) + print("Results CE: " + str(1-np.array(res_ce).mean()) + " / " + str(accuracy)) + # Results about certificate earners + print(str(A_A) + 
"\t" + str(A_S)) + print(str(S_A) + "\t" + str(S_S)) + TP = A_A + FP = A_S + FN = S_A + TN = S_S + try: + recall = TP / ((TP+FN)*1.0); + except: + recall = 0 + try: + precision = TP / ((TP+FP)*1.0); + except: + precision = 0 + try: + specificity = TN / ((TN+FP)*1.0) + except: + specificicty = 0 + try: + NPV = TN / ((FN+TN)*1.0); + except: + NPV = 0 + try: + F_score = (2*TP)/((2*TP+FP+FN)*1.0) + except: + F_score = 0 + + print('Recall: ' + str(recall)) + print('Precision: ' + str(precision)) + print('Specificity: ' + str(specificity)) + print('NVP:' + str(NPV)) + print('F-score: ' + str(F_score)) + + # Compute AUC + y = np.array(y_true_list) + pred = np.array(y_pred_list) + y_true = np.array(y_true2_list) + fpr, tpr, thresholds = metrics.roc_curve(y, pred) + + AUC = metrics.auc(fpr, tpr) + RMSEsk = np.sqrt(metrics.mean_squared_error(y_true, pred)) + MAE = metrics.mean_absolute_error(y_true, pred) + print('AUC: ' + str(AUC)) + + plt.figure() + lw = 2 + plt.plot(fpr, tpr, color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % metrics.auc(fpr, tpr)) + plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver operating characteristic example') + plt.legend(loc="lower right") + plt.show() + + results = dict() + results['RMSE'] = RMSEsk + results['MAE'] = MAE + results['AUC'] = AUC + results['F1'] = F_score + results['recall'] = recall + results['precision'] = precision + results['accuracy'] = accuracy + + print(results) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Early Dropout Prediction System/predLinLog.py b/Early Dropout Prediction System/predLinLog.py new file mode 100644 index 0000000000000000000000000000000000000000..c32aaa2c24f373cfb2588aa5563d5d30f9446cfe --- /dev/null +++ b/Early Dropout Prediction System/predLinLog.py @@ -0,0 +1,165 @@ +import numpy as np +import csv 
def main():
    """Cross-validate a logistic-regression dropout classifier on ``Final.csv``.

    Rows whose ``DROPOUT`` value is missing are tagged with the sentinel 2
    and excluded from training. The remaining rows are standardised and
    scored with 10-fold cross-validation. The confusion matrix, the rates
    derived from it, AUC, RMSE and MAE are printed and the ROC curve is shown.

    Side effects: reads ``Final.csv``, writes ``ResultsSVC.xlsx``, prints to
    stdout and opens a matplotlib window.
    """

    def ratio(numerator, denominator):
        # Rates with an empty denominator are reported as 0.
        return numerator / (denominator * 1.0) if denominator else 0

    data = pd.read_csv('Final.csv')
    dataPred = data.drop(columns=["id", "CARRERA", "Unnamed: 0"])

    # 2 marks "no DROPOUT label"; every other gap is filled with 0.
    dataPred['DROPOUT'] = dataPred['DROPOUT'].fillna(2).replace("", 2)
    dataPred = dataPred.fillna(0)

    # Copy so the dtype change below does not write into a slice of dataPred.
    dataTrain = dataPred[dataPred.DROPOUT != 2].copy()
    dataTrain['DROPOUT'] = dataTrain['DROPOUT'].astype(int)

    # Column 1 is the target; the listed columns are the features.
    train_values = dataTrain.values
    x_train = train_values[:, [0, 2, 3, 4, 5, 6]]
    y_train = train_values[:, 1]

    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)

    print("Starting cross-validation (" + str(len(x_train)) + ' learners)')

    cfr2 = linear_model.LogisticRegression()
    kf = model_selection.KFold(n_splits=10)

    res_ce = []
    # Confusion counts named <actual>_<predicted>, with A = class 1, S = class 0.
    A_A = A_S = S_A = S_S = 0
    y_pred_list = []
    y_true_list = []

    for traincv, testcv in kf.split(x_train):
        # Fit once per fold; probabilities and labels come from the same model.
        cfr2.fit(x_train[traincv], y_train[traincv])
        y_fold = y_train[testcv]
        y_pred = cfr2.predict_proba(x_train[testcv])
        y_pr2 = cfr2.predict(x_train[testcv])
        res_ce.append(np.mean(np.abs(y_pr2 - y_fold)))

        df = pd.concat(
            [pd.DataFrame(y_fold), pd.DataFrame(y_pr2), pd.DataFrame(y_pred[:, 1])],
            axis=1,
        )
        print(df)
        # NOTE(review): every fold overwrites this file, so only the last fold
        # survives; the "SVC" name also looks copied from the SVM script.
        df.to_excel("ResultsSVC.xlsx")

        # Pooled out-of-fold scores for AUC / RMSE / MAE.
        y_pred_list.extend(y_pred[:, 1])
        y_true_list.extend(y_fold)

        A_A += int(np.count_nonzero((y_pr2 == 1) & (y_fold == 1)))
        A_S += int(np.count_nonzero((y_pr2 == 0) & (y_fold == 1)))
        S_A += int(np.count_nonzero((y_pr2 == 1) & (y_fold == 0)))
        S_S += int(np.count_nonzero((y_pr2 == 0) & (y_fold == 0)))

    accuracy = ratio(A_A + S_S, A_A + A_S + S_A + S_S)
    print("Results CE: " + str(1-np.array(res_ce).mean()) + " / " + str(accuracy))
    # Confusion matrix: rows are the actual class, columns the predicted class.
    print(str(A_A) + "\t" + str(A_S))
    print(str(S_A) + "\t" + str(S_S))

    # Class 1 is the positive class: actual 1 / predicted 0 is a false
    # negative, actual 0 / predicted 1 is a false positive.
    TP = A_A
    FN = A_S
    FP = S_A
    TN = S_S
    recall = ratio(TP, TP + FN)
    precision = ratio(TP, TP + FP)
    specificity = ratio(TN, TN + FP)
    NPV = ratio(TN, FN + TN)
    F_score = ratio(2 * TP, 2 * TP + FP + FN)

    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    print('Specificity: ' + str(specificity))
    print('NVP:' + str(NPV))
    print('F-score: ' + str(F_score))

    # Compute AUC and the error of the predicted probabilities.
    y = np.array(y_true_list)
    pred = np.array(y_pred_list)
    fpr, tpr, thresholds = metrics.roc_curve(y, pred)

    AUC = metrics.auc(fpr, tpr)
    RMSEsk = np.sqrt(metrics.mean_squared_error(y, pred))
    MAE = metrics.mean_absolute_error(y, pred)
    print('AUC: ' + str(AUC))

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % AUC)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    results = {
        'RMSE': RMSEsk,
        'MAE': MAE,
        'AUC': AUC,
        'F1': F_score,
        'recall': recall,
        'precision': precision,
        'accuracy': accuracy,
    }
    print(results)
def main():
    """Cross-validate an MLP dropout classifier on ``Final.csv``.

    Rows whose ``DROPOUT`` value is missing are tagged with the sentinel 2
    and excluded from training. The remaining rows are standardised and
    scored with 10-fold cross-validation. The confusion matrix, the rates
    derived from it, AUC, RMSE and MAE are printed and the ROC curve is shown.

    Side effects: reads ``Final.csv``, writes ``ResultsDT.xlsx``, prints to
    stdout and opens a matplotlib window.
    """

    def ratio(numerator, denominator):
        # Rates with an empty denominator are reported as 0.
        return numerator / (denominator * 1.0) if denominator else 0

    data = pd.read_csv('Final.csv')
    dataPred = data.drop(columns=["id", "CARRERA", "Unnamed: 0"])

    # 2 marks "no DROPOUT label"; every other gap is filled with 0.
    dataPred['DROPOUT'] = dataPred['DROPOUT'].fillna(2).replace("", 2)
    dataPred = dataPred.fillna(0)

    # Copy so the dtype change below does not write into a slice of dataPred.
    dataTrain = dataPred[dataPred.DROPOUT != 2].copy()
    dataTrain['DROPOUT'] = dataTrain['DROPOUT'].astype(int)

    # Column 1 is the target; the listed columns are the features.
    train_values = dataTrain.values
    x_train = train_values[:, [0, 2, 3, 4, 5, 6]]
    y_train = train_values[:, 1]

    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)

    print("Starting cross-validation (" + str(len(x_train)) + ' learners)')

    cfr2 = MLPClassifier()
    kf = model_selection.KFold(n_splits=10)

    res_ce = []
    # Confusion counts named <actual>_<predicted>, with A = class 1, S = class 0.
    A_A = A_S = S_A = S_S = 0
    y_pred_list = []
    y_true_list = []

    for traincv, testcv in kf.split(x_train):
        # Fit once per fold: the network is randomly initialised, so a second
        # fit would make the labels and the probabilities come from two
        # different models.
        cfr2.fit(x_train[traincv], y_train[traincv])
        y_fold = y_train[testcv]
        y_pred = cfr2.predict_proba(x_train[testcv])
        y_pr2 = cfr2.predict(x_train[testcv])
        res_ce.append(np.mean(np.abs(y_pr2 - y_fold)))

        df = pd.concat(
            [pd.DataFrame(y_fold), pd.DataFrame(y_pr2), pd.DataFrame(y_pred[:, 1])],
            axis=1,
        )
        print(df)
        # NOTE(review): every fold overwrites this file, so only the last fold
        # survives; the "DT" name does not match the MLP model either.
        df.to_excel("ResultsDT.xlsx")

        # Pooled out-of-fold scores for AUC / RMSE / MAE.
        y_pred_list.extend(y_pred[:, 1])
        y_true_list.extend(y_fold)

        A_A += int(np.count_nonzero((y_pr2 == 1) & (y_fold == 1)))
        A_S += int(np.count_nonzero((y_pr2 == 0) & (y_fold == 1)))
        S_A += int(np.count_nonzero((y_pr2 == 1) & (y_fold == 0)))
        S_S += int(np.count_nonzero((y_pr2 == 0) & (y_fold == 0)))

    accuracy = ratio(A_A + S_S, A_A + A_S + S_A + S_S)
    print("Results CE: " + str(1-np.array(res_ce).mean()) + " / " + str(accuracy))
    # Confusion matrix: rows are the actual class, columns the predicted class.
    print(str(A_A) + "\t" + str(A_S))
    print(str(S_A) + "\t" + str(S_S))

    # Class 1 is the positive class: actual 1 / predicted 0 is a false
    # negative, actual 0 / predicted 1 is a false positive.
    TP = A_A
    FN = A_S
    FP = S_A
    TN = S_S
    recall = ratio(TP, TP + FN)
    precision = ratio(TP, TP + FP)
    specificity = ratio(TN, TN + FP)
    NPV = ratio(TN, FN + TN)
    F_score = ratio(2 * TP, 2 * TP + FP + FN)

    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    print('Specificity: ' + str(specificity))
    print('NVP:' + str(NPV))
    print('F-score: ' + str(F_score))

    # Compute AUC and the error of the predicted probabilities.
    y = np.array(y_true_list)
    pred = np.array(y_pred_list)
    fpr, tpr, thresholds = metrics.roc_curve(y, pred)

    AUC = metrics.auc(fpr, tpr)
    RMSEsk = np.sqrt(metrics.mean_squared_error(y, pred))
    MAE = metrics.mean_absolute_error(y, pred)
    print('AUC: ' + str(AUC))

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % AUC)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    results = {
        'RMSE': RMSEsk,
        'MAE': MAE,
        'AUC': AUC,
        'F1': F_score,
        'recall': recall,
        'precision': precision,
        'accuracy': accuracy,
    }
    print(results)
def main():
    """Cross-validate a random-forest dropout classifier and score unlabeled rows.

    Rows whose ``DROPOUT`` value is missing are tagged with the sentinel 2;
    rows with any other missing value are dropped. Labeled rows are
    standardised and scored with 10-fold cross-validation (confusion matrix,
    derived rates, AUC, RMSE, MAE, ROC plot). The forest is then refitted on
    all labeled rows, and the class probabilities of the unlabeled rows are
    written to ``RF6.xlsx``.

    Side effects: reads ``Final.csv``, may write ``RF6.xlsx``, prints to
    stdout and opens a matplotlib window.
    """

    def ratio(numerator, denominator):
        # Rates with an empty denominator are reported as 0.
        return numerator / (denominator * 1.0) if denominator else 0

    data = pd.read_csv('Final.csv')
    # NOTE(review): the sibling scripts drop "id" (lower case) and use other
    # feature columns; confirm which Final.csv layout this script expects.
    dataPred = data.drop(columns=["ID", "CARRERA", "Unnamed: 0"])

    # 2 marks "no DROPOUT label"; rows with any other gap are discarded.
    dataPred['DROPOUT'] = dataPred['DROPOUT'].fillna(2)
    dataPred = dataPred.dropna()
    print(len(dataPred))
    dataPred['DROPOUT'] = dataPred['DROPOUT'].replace("", 2)
    dataPred = dataPred.fillna(0)

    # Copy so the dtype change below does not write into a slice of dataPred.
    dataTrain = dataPred[dataPred.DROPOUT != 2].copy()
    dataTrain['DROPOUT'] = dataTrain['DROPOUT'].astype(int)
    dataTest = dataPred[dataPred.DROPOUT == 2]

    # Column 1 is the target; the listed columns are the features.
    feature_columns = [2, 3, 4, 5, 6, 7]
    train_values = dataTrain.values
    x_train = train_values[:, feature_columns]
    y_train = train_values[:, 1]
    x_test = dataTest.values[:, feature_columns]

    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)

    print("Starting cross-validation (" + str(len(x_train)) + ' learners)')

    cfr2 = RandomForestClassifier(n_estimators = 500)
    kf = model_selection.KFold(n_splits=10)

    res_ce = []
    # Confusion counts named <actual>_<predicted>, with A = class 1, S = class 0.
    A_A = A_S = S_A = S_S = 0
    y_pred_list = []
    y_true_list = []

    for traincv, testcv in kf.split(x_train):
        # Fit once per fold: the forest is randomised, so a second fit would
        # make the labels and the probabilities come from two different models.
        cfr2.fit(x_train[traincv], y_train[traincv])
        y_fold = y_train[testcv]
        y_pred = cfr2.predict_proba(x_train[testcv])
        y_pr2 = cfr2.predict(x_train[testcv])
        res_ce.append(np.mean(np.abs(y_pr2 - y_fold)))

        df = pd.concat(
            [pd.DataFrame(y_fold), pd.DataFrame(y_pr2), pd.DataFrame(y_pred[:, 1])],
            axis=1,
        )
        print(df)

        # Pooled out-of-fold scores for AUC / RMSE / MAE.
        y_pred_list.extend(y_pred[:, 1])
        y_true_list.extend(y_fold)

        A_A += int(np.count_nonzero((y_pr2 == 1) & (y_fold == 1)))
        A_S += int(np.count_nonzero((y_pr2 == 0) & (y_fold == 1)))
        S_A += int(np.count_nonzero((y_pr2 == 1) & (y_fold == 0)))
        S_S += int(np.count_nonzero((y_pr2 == 0) & (y_fold == 0)))

    accuracy = ratio(A_A + S_S, A_A + A_S + S_A + S_S)
    print("Results CE: " + str(1-np.array(res_ce).mean()) + " / " + str(accuracy))
    # Confusion matrix: rows are the actual class, columns the predicted class.
    print(str(A_A) + "\t" + str(A_S))
    print(str(S_A) + "\t" + str(S_S))

    # Class 1 is the positive class: actual 1 / predicted 0 is a false
    # negative, actual 0 / predicted 1 is a false positive.
    TP = A_A
    FN = A_S
    FP = S_A
    TN = S_S
    recall = ratio(TP, TP + FN)
    precision = ratio(TP, TP + FP)
    specificity = ratio(TN, TN + FP)
    NPV = ratio(TN, FN + TN)
    F_score = ratio(2 * TP, 2 * TP + FP + FN)

    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    print('Specificity: ' + str(specificity))
    print('NVP:' + str(NPV))
    print('F-score: ' + str(F_score))

    # Compute AUC and the error of the predicted probabilities.
    y = np.array(y_true_list)
    pred = np.array(y_pred_list)
    fpr, tpr, thresholds = metrics.roc_curve(y, pred)

    AUC = metrics.auc(fpr, tpr)
    RMSEsk = np.sqrt(metrics.mean_squared_error(y, pred))
    MAE = metrics.mean_absolute_error(y, pred)
    print('AUC: ' + str(AUC))

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % AUC)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    results = {
        'RMSE': RMSEsk,
        'MAE': MAE,
        'AUC': AUC,
        'F1': F_score,
        'recall': recall,
        'precision': precision,
        'accuracy': accuracy,
    }
    print(results)

    # Score the unlabeled rows. Skipped when there are none, because the
    # scaler and the forest both reject empty input.
    if len(x_test):
        # Reuse the training statistics: refitting the scaler on the test rows
        # would shift them relative to the thresholds the trees learned.
        x_test = scaler.transform(x_test)
        # Train on every labeled row instead of reusing the last CV fold's model.
        cfr2.fit(x_train, y_train)
        pd.DataFrame(cfr2.predict_proba(x_test)).to_excel("RF6.xlsx")
def main():
    """Cross-validate an SVM dropout classifier on ``Final.csv``.

    Rows whose ``DROPOUT`` value is missing are tagged with the sentinel 2
    and excluded from training. The remaining rows are standardised and
    scored with 25-fold cross-validation. The mean per-fold RMSE, the
    confusion matrix, the rates derived from it, AUC, pooled RMSE and MAE
    are printed and the ROC curve is shown.

    Side effects: reads ``Final.csv``, writes ``ResultsSVC.xlsx``, prints to
    stdout and opens a matplotlib window.
    """

    def ratio(numerator, denominator):
        # Rates with an empty denominator are reported as 0.
        return numerator / (denominator * 1.0) if denominator else 0

    data = pd.read_csv('Final.csv')
    dataPred = data.drop(columns=["id", "CARRERA", "Unnamed: 0"])

    # 2 marks "no DROPOUT label"; every other gap is filled with 0.
    dataPred['DROPOUT'] = dataPred['DROPOUT'].fillna(2).replace("", 2)
    dataPred = dataPred.fillna(0)

    # Copy so the dtype change below does not write into a slice of dataPred.
    dataTrain = dataPred[dataPred.DROPOUT != 2].copy()
    dataTrain['DROPOUT'] = dataTrain['DROPOUT'].astype(int)

    # Column 1 is the target; the listed columns are the features.
    train_values = dataTrain.values
    x_train = train_values[:, [0, 2, 3, 4, 5, 6]]
    y_train = train_values[:, 1]
    print(y_train)

    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)

    print("Starting cross-validation (" + str(len(x_train)) + ' learners)')

    cfr2 = SVC(probability = True)
    kf = model_selection.KFold(n_splits=25)

    results = []
    res_ce = []
    # Confusion counts named <actual>_<predicted>, with A = class 1, S = class 0.
    A_A = A_S = S_A = S_S = 0
    y_pred_list = []
    y_true_list = []

    for traincv, testcv in kf.split(x_train):
        # Fit once per fold: probability calibration is randomised and costly,
        # so a second fit would double the work and make the labels and the
        # probabilities come from two different models.
        cfr2.fit(x_train[traincv], y_train[traincv])
        y_fold = y_train[testcv]
        y_pred = cfr2.predict_proba(x_train[testcv])
        y_pr2 = cfr2.predict(x_train[testcv])
        # Per-fold RMSE of the class-1 probability; without it the RMSE
        # printed below is the mean of an empty list, i.e. nan.
        results.append(np.sqrt(np.mean((y_pred[:, 1] - y_fold) ** 2)))
        res_ce.append(np.mean(np.abs(y_pr2 - y_fold)))

        df = pd.concat(
            [pd.DataFrame(y_fold), pd.DataFrame(y_pr2), pd.DataFrame(y_pred[:, 1])],
            axis=1,
        )
        print(df)
        # NOTE(review): every fold overwrites this file, so only the last
        # fold survives.
        df.to_excel("ResultsSVC.xlsx")

        # Pooled out-of-fold scores for AUC / RMSE / MAE.
        y_pred_list.extend(y_pred[:, 1])
        y_true_list.extend(y_fold)

        A_A += int(np.count_nonzero((y_pr2 == 1) & (y_fold == 1)))
        A_S += int(np.count_nonzero((y_pr2 == 0) & (y_fold == 1)))
        S_A += int(np.count_nonzero((y_pr2 == 1) & (y_fold == 0)))
        S_S += int(np.count_nonzero((y_pr2 == 0) & (y_fold == 0)))

    # Mean of the cross-validated per-fold errors.
    RMSE = np.array(results).mean()
    print("RMSE: " + str( RMSE))
    accuracy = ratio(A_A + S_S, A_A + A_S + S_A + S_S)
    print("Results CE: " + str(1-np.array(res_ce).mean()) + " / " + str(accuracy))
    # Confusion matrix: rows are the actual class, columns the predicted class.
    print(str(A_A) + "\t" + str(A_S))
    print(str(S_A) + "\t" + str(S_S))

    # Class 1 is the positive class: actual 1 / predicted 0 is a false
    # negative, actual 0 / predicted 1 is a false positive.
    TP = A_A
    FN = A_S
    FP = S_A
    TN = S_S
    recall = ratio(TP, TP + FN)
    precision = ratio(TP, TP + FP)
    specificity = ratio(TN, TN + FP)
    NPV = ratio(TN, FN + TN)
    F_score = ratio(2 * TP, 2 * TP + FP + FN)

    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    print('Specificity: ' + str(specificity))
    print('NVP:' + str(NPV))
    print('F-score: ' + str(F_score))

    # Compute AUC and the error of the predicted probabilities.
    y = np.array(y_true_list)
    pred = np.array(y_pred_list)
    fpr, tpr, thresholds = metrics.roc_curve(y, pred)

    AUC = metrics.auc(fpr, tpr)
    RMSEsk = np.sqrt(metrics.mean_squared_error(y, pred))
    MAE = metrics.mean_absolute_error(y, pred)
    print('AUC: ' + str(AUC))

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % AUC)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    results = {
        'RMSE': RMSEsk,
        'MAE': MAE,
        'AUC': AUC,
        'F1': F_score,
        'recall': recall,
        'precision': precision,
        'accuracy': accuracy,
    }
    print(results)