Python error on hstack and predict_proba after loading the machine learning model

Question

I am trying to develop this Game Recommendation project. However, I'm having difficulty loading the machine learning models that I saved using the joblib library.

I have added the notebook to Google Colab: Google Colab - Games.ipynb

The df_gamesplayed dataset contains all the games that a user has added to their profile as already played, so that the user receives recommendations based on those games. (I don't know if this is the right approach, because each user has a profile with different games already played.)

After running some tests (the model is not perfect yet), my main difficulty is loading the saved models and scoring the dataset so that I can see which games are recommended and the score for each game.

I noticed that if I run the title vectorizer, it generates an array of shape (1, 36949), which is apparently different from what I trained on.

My array of numeric features, the num_features variable, has shape (2, 11429), which is also apparently different. When I call hstack it raises the error below, because the number of rows is different.

hstack Error:

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 2.

And if I remove one feature from the num_features variable, it runs hstack normally, but it gives me an error when I try to run predict_proba.

predict_proba Error:

ValueError: Number of features of the model must match the input. Model n_features is 2672 and input n_features is 48378.

For the title vectorizer I was originally using only the NM_GAME column, but I ended up concatenating it with other columns to create the IMPORTANT_FEATURES column. However, the name of the recommended game must come from the NM_GAME column.

How can I bring the game recommendation to the user?

Script:

from os.path import join
import pandas as pd
import numpy as np
import os
import sqlite3 as sql
import matplotlib.pyplot as plt
import joblib as jb
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

# Database and Datasets

# Open the connection to the SQLite database file.
# The path must be a raw string: in a normal string literal "\U" starts a
# unicode escape, so 'C:\Users\...' is a SyntaxError on Python 3.
conn = sql.connect(r'C:\Users\guilh\OneDrive\Documentos\Cursos\Mario Filho\Games\Database\Games.db')

# Load each database view into its own dataframe. The try/finally makes sure
# the connection is closed even if one of the queries fails.
try:
    df_gamesbyplatform = pd.read_sql_query("SELECT * FROM V_GAMESBYPLATFORM", conn)
    df_gamesdetails = pd.read_sql_query("SELECT * FROM V_GAMES_DETAILS", conn)
    df_gamesplayed = pd.read_sql_query("SELECT * FROM V_GAMESPLAYED", conn)
finally:
    # Close the connection to the database
    conn.close()

# Check the created dataframe
print(df_gamesplayed.head())

# Features

# Numeric feature table, aligned row-for-row with df_gamesplayed
features = pd.DataFrame(index=df_gamesplayed.index)

# Target: whether the user has played the game or not, 1 = yes and 0 = no
y = df_gamesplayed["IC_PLAYED"].copy()

# Numeric columns used for training (this column order is what the models see)
features["NR_CRITICSCORE"] = df_gamesplayed["NR_CRITICSCORE"]
features["DT_YEAROFRELEASE"] = df_gamesplayed["DT_YEAROFRELEASE"]

# Show the new features dataframe and the target
print(features.head())
print(y)

# Split on release year: games released before the median year train the
# model, the rest validate it. The original called a bare `median(...)`,
# which is never defined or imported (NameError); Series.median() is used
# instead and evaluated once so both masks share the same threshold.
# NOTE(review): rows with a missing release year compare False on both
# sides and therefore land in neither split - confirm this is acceptable.
year_median = features["DT_YEAROFRELEASE"].median()
mask_train = df_gamesplayed['DT_YEAROFRELEASE'] < year_median
mask_val = df_gamesplayed['DT_YEAROFRELEASE'] >= year_median

Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]
print(Xtrain.shape, Xval.shape, ytrain.shape, yval.shape)

# Text column fed to the TF-IDF vectorizer, split with the same row masks
# as the numeric features so both stay aligned.
title_train = df_gamesplayed.loc[mask_train, 'IMPORTANT_FEATURES']
title_val = df_gamesplayed.loc[mask_val, 'IMPORTANT_FEATURES']

# The vocabulary is learned from the training text only; the validation text
# is projected onto that same vocabulary.
title_vec = TfidfVectorizer(ngram_range=(1, 3), min_df=4)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

print(title_bow_train.shape, title_bow_val.shape)

print([Xtrain, title_bow_train])

train_shapes = [Xtrain.shape, title_bow_train.shape]
val_shapes = [Xval.shape, title_bow_val.shape]
print(train_shapes, val_shapes)

from scipy.sparse import hstack, vstack

# Final design matrices: numeric columns first, then the TF-IDF columns
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

print(Xtrain_wtitle.shape, Xval_wtitle.shape)

# Random Forest

# Baseline classifier trained on the numeric + TF-IDF matrix
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl_rf.fit(Xtrain_wtitle, ytrain)

print(Xval_wtitle.shape)

# Column 1 of predict_proba is kept as the score for every validation row
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:, 1]
rf_avg_precision = average_precision_score(yval, p_rf)
rf_roc_auc = roc_auc_score(yval, p_rf)
print(rf_avg_precision, rf_roc_auc)

# LightGBM

from lightgbm import LGBMClassifier

# Gradient-boosting baseline with default hyper-parameters, trained and
# scored on the same matrices as the random forest
mdl = LGBMClassifier(class_weight="balanced", random_state=0, n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)

# Column 1 of predict_proba is kept as the score for every validation row
p = mdl.predict_proba(Xval_wtitle)[:, 1]

lgbm_avg_precision = average_precision_score(yval, p)
lgbm_roc_auc = roc_auc_score(yval, p)
print(lgbm_avg_precision, lgbm_roc_auc)

# Bayesian Optimization

from skopt import forest_minimize

def tune_lgbm(params):
    """Objective for the hyper-parameter search: fit LightGBM, return -AP.

    ``params`` is read positionally:
    0 learning rate, 1 max depth, 2 min child samples, 3 subsample,
    4 colsample_bytree, 5 number of estimators, 6 TF-IDF ``min_df``,
    7 upper bound of the TF-IDF n-gram range.

    Uses the module-level ``title_train``/``title_val``, ``Xtrain``/``Xval``
    and ``ytrain``/``yval``. Prints the parameters and the validation ROC AUC,
    and returns the negated validation average precision, so that a
    minimizer maximizes average precision.
    """
    print(params)

    # Read the eight positions in order before doing any work
    (
        learning_rate,
        depth,
        child_samples,
        row_fraction,
        col_fraction,
        n_trees,
        min_doc_freq,
        max_ngram,
    ) = (params[i] for i in range(8))

    # A fresh vectorizer per trial: its settings are part of the search
    vectorizer = TfidfVectorizer(min_df=min_doc_freq, ngram_range=(1, max_ngram))
    bow_train = vectorizer.fit_transform(title_train)
    bow_val = vectorizer.transform(title_val)

    train_matrix = hstack([Xtrain, bow_train])
    val_matrix = hstack([Xval, bow_val])

    booster = LGBMClassifier(
        learning_rate=learning_rate,
        num_leaves=2 ** depth,
        max_depth=depth,
        min_child_samples=child_samples,
        subsample=row_fraction,
        colsample_bytree=col_fraction,
        bagging_freq=1,
        n_estimators=n_trees,
        random_state=0,
        class_weight="balanced",
        n_jobs=6,
    )
    booster.fit(train_matrix, ytrain)

    val_scores = booster.predict_proba(val_matrix)[:, 1]

    print(roc_auc_score(yval, val_scores))

    return -average_precision_score(yval, val_scores)


# Search space: one entry per position that tune_lgbm reads from `params`
space = [
    (1e-3, 1e-1, 'log-uniform'),  # lr
    (1, 15),                      # max_depth
    (1, 20),                      # min_child_samples
    (0.05, 1.),                   # subsample
    (0.05, 1.),                   # colsample_bytree
    (100, 1000),                  # n_estimators
    (1, 5),                       # min_df
    (1, 5),                       # ngram_range
]

# 50 evaluations of tune_lgbm, the first 20 at random points
res = forest_minimize(
    tune_lgbm,
    space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

# Best parameter vector found and its objective value
best_params, best_objective = res.x, res.fun
print(best_params, best_objective)

# LightGBM after optimization

# Hard-coded parameter vector, in the same positional layout tune_lgbm uses
params = [0.059718899146636396, 2, 3, 0.5850311564924788, 0.8012679059209196, 688, 1, 3]
(
    lr,
    max_depth,
    min_child_samples,
    subsample,
    colsample_bytree,
    n_estimators,
    min_df,
    ngram_upper,
) = params
ngram_range = (1, ngram_upper)

# Note: this rebinds `title_vec` and the *_wtitle matrices with the tuned
# vectorizer settings; everything below uses these new matrices.
title_vec = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

mdl_lgbm = LGBMClassifier(
    learning_rate=lr,
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight="balanced",
    n_jobs=6,
)
mdl_lgbm.fit(Xtrain_wtitle, ytrain)

# Column 1 of predict_proba is kept as the score for every validation row
p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]
tuned_avg_precision = average_precision_score(yval, p_lgbm)
tuned_roc_auc = roc_auc_score(yval, p_lgbm)
print(tuned_avg_precision, tuned_roc_auc)

# Logistic Reg

from sklearn.pipeline import make_pipeline

# CSR copies of the current design matrices for the linear model
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

# Earlier experiments scaled the matrices by hand with StandardScaler or
# MaxAbsScaler (whole matrix, or only the first two columns); scaling is
# now done inside the pipeline instead.
scaled_logreg = LogisticRegression(C=0.5, penalty='l2', n_jobs=6, random_state=0)
lr_pipeline = make_pipeline(MaxAbsScaler(), scaled_logreg)
lr_pipeline.fit(Xtrain_wtitle2, ytrain)

# Column 1 of predict_proba is kept as the score for every validation row
p_lr = lr_pipeline.predict_proba(Xval_wtitle2)[:, 1]
lr_avg_precision = average_precision_score(yval, p_lr)
lr_roc_auc = roc_auc_score(yval, p_lr)
print(lr_avg_precision, lr_roc_auc)

# Ensemble

# Plain average of the three models' validation scores
p = (p_lr + p_rf + p_lgbm) / 3
mean_avg_precision = average_precision_score(yval, p)
mean_roc_auc = roc_auc_score(yval, p)
print(mean_avg_precision, mean_roc_auc)

# Pairwise correlation between the three models' scores
model_scores = pd.DataFrame({"LR": p_lr, "RF": p_rf, "LGBM": p_lgbm})
print(model_scores.corr())

# Weighted blend of the random forest and the tuned LightGBM only
p = 0.1 * p_rf + 0.9 * p_lgbm
blend_avg_precision = average_precision_score(yval, p)
blend_roc_auc = roc_auc_score(yval, p)
print(blend_avg_precision, blend_roc_auc)

# Save machine learning models

# v1 = 08/03/2021

# `title_vec` was refit in the "LightGBM after optimization" step with
# min_df=1, so it matches mdl_lgbm and lr_pipeline but NOT mdl_rf, which was
# trained on the earlier vectorizer (min_df=4, ngram_range=(1, 3)). Using the
# wrong vectorizer is what produces "Number of features of the model must
# match the input". Refitting with the random forest's settings on the same
# training text rebuilds the vocabulary that model expects.
# NOTE(review): keep these settings in sync with the vectorizer used to
# train mdl_rf.
title_vec_rf = TfidfVectorizer(min_df=4, ngram_range=(1, 3)).fit(title_train)

jb.dump(mdl_rf, "random_forest_v1.pkl.z")
jb.dump(mdl_lgbm, "lgbm_v1.pkl.z")
jb.dump(lr_pipeline, "logistic_reg_v1.pkl.z")
jb.dump(title_vec, "title_vectorizer_v1.pkl.z")
jb.dump(title_vec_rf, "title_vectorizer_rf_v1.pkl.z")

# Testing of saved machine learning models

mdl_rf = jb.load("random_forest_v1.pkl.z")
mdl_lgbm = jb.load("lgbm_v1.pkl.z")
title_vec = jb.load("title_vectorizer_v1.pkl.z")
title_vec_rf = jb.load("title_vectorizer_rf_v1.pkl.z")
lr_pipeline = jb.load("logistic_reg_v1.pkl.z")

# One text document per game. The original used .to_string(), which
# collapses the whole column into a single document and therefore yields a
# single-row matrix.
titles = df_gamesplayed['IMPORTANT_FEATURES']

# Numeric features as (n_games, 2), same columns and order as in training.
# The original np.array([col_a, col_b]) had shape (2, n_games), which is why
# hstack complained about incompatible row dimensions.
num_features = df_gamesplayed[["NR_CRITICSCORE", "DT_YEAROFRELEASE"]]
print(num_features.shape)

# Each model gets the text features from the vectorizer it was trained with
vectorized_title = title_vec.transform(titles)
vectorized_title_rf = title_vec_rf.transform(titles)
print(vectorized_title.shape, vectorized_title_rf.shape)

feature_array = hstack([num_features, vectorized_title])
feature_array_rf = hstack([num_features, vectorized_title_rf])
print(feature_array.shape, feature_array_rf.shape)

# Score every game (column 1 of predict_proba), not just the first row
p_rf = mdl_rf.predict_proba(feature_array_rf)[:, 1]
p_lgbm = mdl_lgbm.predict_proba(feature_array)[:, 1]

# Ranked recommendations, blended with the same weights as on validation.
# NOTE(review): assumes df_gamesplayed exposes the NM_GAME column mentioned
# in the question - confirm against the V_GAMESPLAYED view.
recommendations = pd.DataFrame(
    {
        "NM_GAME": df_gamesplayed["NM_GAME"].to_numpy(),
        "P_RF": p_rf,
        "P_LGBM": p_lgbm,
        "SCORE": 0.1 * p_rf + 0.9 * p_lgbm,
    },
    index=df_gamesplayed.index,
).sort_values("SCORE", ascending=False)

print(recommendations.head(10))

Guilherme Matheus · Accepted Answer

The problem was that I had more than one title_vec variable, so I had to reorganize the script, and then it worked!

Python error on hstack and predict_proba after loading the machine learning model

Answers (1)

Related Questions