Reputation: 595
I am trying to develop this Game Recommendation project. However, I'm having difficulty loading the machine learning models that I saved using the joblib
library.
I have added the notebook to Google Colab: Google Colab - Games.ipynb
The df_gamesplayed
dataset, refers to all the games that some user has added to his profile that he has already played, so he will receive a recommendation according to those games. (I don't know if it's the right way, because each user has a profile with different games already played).
But, after I did some tests (the model is not perfect yet), but the point is that I'm having difficulty loading the saved model, and reading the dataset to show me which games are recommended and the score for each game.
I noticed that if I run the title vectorizer, it generates an array of (1, 36949)
, apparently different from what I trained.
And my array of features of the num_features
variable is (2, 11429)
, also apparently different.
And when I do the hstack it presents me with the error below, because the number of lines is different.
hstack Error:
ValueError: blocks [0 ,:] has incompatible row dimensions. Got blocks [0,1] .shape [0] == 1, expected 2.
And if I remove one feature from the num_features
variable, it runs hstack
normally, but it gives me an error when I try to run predict_proba
.
predict_proba Error:
ValueError: Number of features of the model must match the input. Model n_features is 2672 and input n_features is 48378.
For the title vectorizer I was using the NM_GAME
column only, but I ended up concatenating it with other columns to create the IMPORTANT_FEATURES
column, however the recommendation game name, must come from the NM_GAME
column.
How can I bring the game recommendation to the user?
Script:
from os.path import join
import pandas as pd
import numpy as np
import os
import sqlite3 as sql
import matplotlib.pyplot as plt
import joblib as jb
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix
# Database and Datasets
# Opens the connection to the database according to the DB file
conn = sql.connect('C:\\Users\\guilh\\OneDrive\\Documentos\\Cursos\\Mario Filho\\Games\\Database\\Games.db')
# Create dataframe
df_gamesbyplatform = pd.read_sql_query("SELECT * FROM V_GAMESBYPLATFORM", conn)
df_gamesdetails = pd.read_sql_query("SELECT * FROM V_GAMES_DETAILS", conn)
df_gamesplayed = pd.read_sql_query("SELECT * FROM V_GAMESPLAYED", conn)
# Closes the connection to the database
conn.close()
# Check the created dataframe
print(df_gamesplayed.head())
# Features
features = pd.DataFrame(index=df_gamesplayed.index)
# Feature if the user has played the game or not, 1 = yes and 0 = no
y = df_gamesplayed["IC_PLAYED"].copy()
# Separates the necessary features of the dataframe for training
features["NR_CRITICSCORE"] = df_gamesplayed["NR_CRITICSCORE"]
features["DT_YEAROFRELEASE"] = df_gamesplayed["DT_YEAROFRELEASE"]
# Show the new features dataframe
print(features.head())
print(y)
mask_train = df_gamesplayed['DT_YEAROFRELEASE'] < median(features["DT_YEAROFRELEASE"])
mask_val = (df_gamesplayed['DT_YEAROFRELEASE'] >= median(features["DT_YEAROFRELEASE"]))
Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]
print(Xtrain.shape, Xval.shape, ytrain.shape, yval.shape)
title_train = df_gamesplayed[mask_train]['IMPORTANT_FEATURES']
title_val = df_gamesplayed[mask_val]['IMPORTANT_FEATURES']
title_vec = TfidfVectorizer(min_df=4, ngram_range=(1,3))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)
print(title_bow_train.shape, title_bow_val.shape)
print([Xtrain, title_bow_train])
print([Xtrain.shape, title_bow_train.shape], [Xval.shape, title_bow_val.shape])
from scipy.sparse import hstack, vstack
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])
print(Xtrain_wtitle.shape, Xval_wtitle.shape)
# Random Forest
mdl_rf = RandomForestClassifier(n_estimators=1000, random_state=0, min_samples_leaf=2, class_weight="balanced", n_jobs=6)
mdl_rf.fit(Xtrain_wtitle, ytrain)
print(Xval_wtitle.shape)
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:, 1]
print(average_precision_score(yval, p_rf), roc_auc_score(yval, p_rf))
# LightGBM
from lightgbm import LGBMClassifier
mdl = LGBMClassifier(random_state=0, class_weight="balanced", n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)
p = mdl.predict_proba(Xval_wtitle)[:, 1]
print(average_precision_score(yval, p), roc_auc_score(yval, p))
# Bayesian Optimization
from skopt import forest_minimize
def tune_lgbm(params):
print(params)
lr = params[0]
max_depth = params[1]
min_child_samples = params[2]
subsample = params[3]
colsample_bytree = params[4]
n_estimators = params[5]
min_df = params[6]
ngram_range = (1, params[7])
title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])
mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
min_child_samples=min_child_samples, subsample=subsample,
colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0,
class_weight="balanced", n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)
p = mdl.predict_proba(Xval_wtitle)[:, 1]
print(roc_auc_score(yval, p))
return -average_precision_score(yval, p)
space = [(1e-3, 1e-1, 'log-uniform'), # lr
(1, 15), # max_depth
(1, 20), # min_child_samples
(0.05, 1.), # subsample
(0.05, 1.), # colsample_bytree
(100,1000), # n_estimators
(1,5), # min_df
(1,5)] # ngram_range
res = forest_minimize(tune_lgbm, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)
print(res.x, res.fun)
# LightGBM after optimization
params = [0.059718899146636396, 2, 3, 0.5850311564924788, 0.8012679059209196, 688, 1, 3]
lr = params[0]
max_depth = params[1]
min_child_samples = params[2]
subsample = params[3]
colsample_bytree = params[4]
n_estimators = params[5]
min_df = params[6]
ngram_range = (1, params[7])
title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])
mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
min_child_samples=min_child_samples, subsample=subsample,
colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0,
class_weight="balanced", n_jobs=6)
mdl_lgbm.fit(Xtrain_wtitle, ytrain)
p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]
print(average_precision_score(yval, p_lgbm), roc_auc_score(yval, p_lgbm))
# Logistic Reg
from sklearn.pipeline import make_pipeline
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())
#scaler = StandardScaler()
#scaler = MaxAbsScaler()
#Xtrain_wtitle2[:, :2] = scaler.fit_transform(Xtrain_wtitle2[:, :2].todense())
#Xval_wtitle2[:, :2] = scaler.transform(Xval_wtitle2[:, :2].todense())
#Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle2)
#Xval_wtitle2 = scaler.transform(Xval_wtitle2)
lr_pipeline = make_pipeline(MaxAbsScaler(), LogisticRegression(C=0.5, penalty='l2',n_jobs=6, random_state=0))
lr_pipeline.fit(Xtrain_wtitle2, ytrain)
p_lr = lr_pipeline.predict_proba(Xval_wtitle2)[:, 1]
print(average_precision_score(yval, p_lr), roc_auc_score(yval, p_lr))
# Ensemble
p = (p_lr + p_rf + p_lgbm)/3
print(average_precision_score(yval, p), roc_auc_score(yval, p))
print(pd.DataFrame({"LR": p_lr, "RF": p_rf, "LGBM": p_lgbm}).corr())
p = 0.1*p_rf + 0.9*p_lgbm
print(average_precision_score(yval, p), roc_auc_score(yval, p))
# Save machine learning models
# v1 = 08/03/2021
jb.dump(mdl_rf, "random_forest_v1.pkl.z")
jb.dump(mdl_lgbm, "lgbm_v1.pkl.z")
jb.dump(lr_pipeline, "logistic_reg_v1.pkl.z")
jb.dump(title_vec, "title_vectorizer_v1.pkl.z")
# Testing of saved machine learning models
mdl_rf = jb.load("random_forest_v1.pkl.z")
mdl_lgbm = jb.load("lgbm_v1.pkl.z")
title_vec = jb.load("title_vectorizer_v1.pkl.z")
lr_pipeline = jb.load("logistic_reg_v1.pkl.z")
title = df_gamesplayed['IMPORTANT_FEATURES'].to_string()
# Features
features = dict()
#features = pd.DataFrame(index=df_gamesplayed.index)
# Separates the necessary features of the dataframe for training variables
features['NR_CRITICSCORE'] = df_gamesplayed["NR_CRITICSCORE"]
features['DT_YEAROFRELEASE'] = df_gamesplayed["DT_YEAROFRELEASE"]
vectorized_title = title_vec.transform([title])
print(vectorized_title)
print(vectorized_title.shape)
array = np.array([features['NR_CRITICSCORE'], features['DT_YEAROFRELEASE']])
num_features = csr_matrix(array)
print(num_features.shape)
from scipy.sparse import hstack, vstack
feature_array = hstack([num_features, vectorized_title])
print(feature_array.shape)
p_rf = mdl_rf.predict_proba(feature_array)[0][1]
p_lgbm = mdl_lgbm.predict_proba(feature_array)[0][1]
Upvotes: 0
Views: 140
Reputation: 595
The problem was that I had more than on title_vec variable, so I had to reorganize the script and it worked!
Upvotes: 0