Reputation: 1
I'm working with a random forest algorithm in Python to predict college dropouts. The algorithm is finished, and now I need to take that file and run it from a website. I'm using Django, but I don't know how to make it work: when I import the file in views, it only displays a single line, without the structure the output has when I run the file in Jupyter. If anyone knows how to do this, I'd be very thankful.
Sorry if this is a little difficult to understand; English is not my first language.
This is the algorithm
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd
from sklearn.feature_selection import SelectKBest
# Load the survey data; the file uses ';' as the field delimiter.
dataset = pd.read_csv('C:/Users/danni/OneDrive/Documents/Universidad/2018/Tesis/encuestas/Nueva carpeta/Nueva carpeta/SinRanking/2005_2017_SOLO_PRIMERO_Y_SEGUNDO.csv', delimiter=";")

# Compute the age at university admission: admission year minus birth year.
# Characters 6..9 of 'F_Nac' are parsed as the birth year
# (assumes a fixed-width date string such as dd-mm-yyyy -- TODO confirm).
# This is a column-wise replacement for the former row-by-row while loop,
# which also re-read the 'A_Ingreso' column on every iteration.
anio_nacimiento = dataset['F_Nac'].str[6:10].astype(int)
dataset['edad_ingreso'] = dataset['A_Ingreso'] - anio_nacimiento
def calcula_dif_years_eg_in(anio, cuando):
    """Return the difference ``anio - cuando``.

    The script uses it to get the number of years between leaving school
    ('A_Egreso_Colegio') and university admission ('A_Ingreso').

    Args:
        anio: The later year (minuend).
        cuando: The earlier year (subtrahend).

    Returns:
        ``anio - cuando``; negative when ``cuando`` is later than ``anio``.
    """
    return anio - cuando
# Years between leaving school and university admission. The helper only
# subtracts its arguments, so it is applied to the whole columns at once
# instead of row by row through DataFrame.apply.
dataset['a_egresado_colegio'] = calcula_dif_years_eg_in(
    dataset['A_Ingreso'], dataset['A_Egreso_Colegio']
)

# Drop the raw columns that are not used as predictors.
dataset = dataset.drop(["F_Nac", "A_Ingreso", "A_Egreso_Colegio", "Via_Ingreso"], axis=1)

# Load the predictor variables.
predictors = dataset.drop(['Deserto'], axis=1)
# These are the observed outcomes, in the same row order.
targets = dataset['Deserto']

# Keep the 10 best-scoring predictors (SelectKBest with its default
# scoring function).
# NOTE(review): the selector is fitted on every row, including the rows that
# are later sliced off as the evaluation set -- confirm this is acceptable.
best = SelectKBest(k=10)
X_new = best.fit_transform(predictors, targets)
selected = best.get_support(indices=True)
print(predictors.columns[selected])
# Positional train / evaluation split. The original notes label the data as
# "2005 to 2015" and "2016"; presumably the first range trains the model and
# the second evaluates it -- confirm against the CSV.
# NOTE(review): the slices [0:567] and [568:632] leave row 567 in neither
# set (slice ends are exclusive). Boundaries are kept as they were; decide
# which side row 567 belongs to and make the split contiguous.
X_train = predictors[0:567]
X_test = predictors[568:632]
y_train = targets[0:567]
y_test = targets[568:632]

# Names of the columns kept by the feature selector; reused below so the
# model is always fitted, scored and queried with the same DataFrame layout.
columnas_seleccionadas = predictors.columns[selected]

modelo = RandomForestClassifier(
    random_state=1,         # initial random seed of the algorithm
    n_estimators=5,         # number of trees to build
    min_samples_split=0.5,  # minimum samples to split a node; a float is
                            # read by scikit-learn as a fraction of the
                            # training samples
    min_samples_leaf=8,     # minimum observations allowed in a leaf
    n_jobs=-1,              # parallel jobs; -1 uses every available core
)

# Fit and predict on DataFrames (not .values) so that the later score and
# predict calls, which pass DataFrames, match what the model was fitted on.
modelo.fit(X_train[columnas_seleccionadas], y_train)
prediccion = modelo.predict(X_test[columnas_seleccionadas])

# The accuracies were previously computed and thrown away, which shows
# nothing when the file runs as a script; print them explicitly.
print('Train accuracy:', modelo.score(X_train[columnas_seleccionadas], y_train))
print('Test accuracy:', modelo.score(X_test[columnas_seleccionadas], y_test))

print(metrics.classification_report(y_true=y_test, y_pred=prediccion))
print(pd.crosstab(y_test, prediccion, rownames=['REAL'], colnames=['PREDICCION']))

# Importance of each selected feature, most important first.
var_imp = pd.DataFrame({
    'feature': columnas_seleccionadas,
    'v_importance': modelo.feature_importances_.tolist(),
})
print(var_imp.sort_values(by='v_importance', ascending=False))
# Learning curves
# learning_curve lives in sklearn.model_selection; the former
# sklearn.learning_curve module was removed in scikit-learn 0.20.
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluate the curve on the same selected columns the model is trained on
# (previously every predictor column was passed here).
columnas_curva = predictors.columns[selected]
train_sizes, train_scores, test_scores = learning_curve(
    estimator=modelo,
    X=X_train[columnas_curva],
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=-1,
)

# Mean and spread across the cross-validation folds for each training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# The inline-plot magic only exists inside IPython/Jupyter; skip it when the
# file runs as a plain script (for example when imported from a web view).
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})

# Plot the curves: mean score with a +/- one standard deviation band.
plt.plot(train_sizes, train_mean, color='r', marker='o', markersize=5,
         label='entrenamiento')
plt.fill_between(train_sizes, train_mean + train_std,
                 train_mean - train_std, alpha=0.15, color='r')
plt.plot(train_sizes, test_mean, color='b', linestyle='--',
         marker='s', markersize=5, label='evaluacion')
plt.fill_between(train_sizes, test_mean + test_std,
                 test_mean - test_std, alpha=0.15, color='b')
plt.grid()
plt.title('Curva de aprendizaje')
plt.legend(loc='upper right')
plt.xlabel('Cant de ejemplos de entrenamiento')
plt.ylabel('Precision')
plt.show()
# Predict the whole evaluation set with a single call. The former loop
# re-ran predict on the full set once per row just to pick one element.
a = list(modelo.predict(X_test[predictors.columns[selected]]))
print(a)

# X_test is a slice of `predictors`; take a copy before adding the column so
# the assignment does not target a view of the original frame.
X_test = X_test.copy()
X_test['x'] = a

# Number of evaluation rows assigned to each predicted class.
print(X_test.groupby('x').size())
Upvotes: 0
Views: 587
Reputation: 8307
There are a couple of options.
Upvotes: 1