Reputation: 1
I'm trying to solve a problem of recommending a doctor based on a user's symptoms and location using a hybridized collaborative filtering and sentence similarity-based recommender system that follow these steps:
Here's the code I've written
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
# Load the dataset
file_path = "Egyptian Doctors.csv"
df = pd.read_csv(file_path)
# Data Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='median')
df['avg_rate'] = imputer.fit_transform(df[['avg_rate']])
df['Wait_time_Minutes'] = imputer.fit_transform(df[['Wait_time_Minutes']])
df['doctor_visitors'] = imputer.fit_transform(df[['doctor_visitors']])
# Encode categorical features for non-BERT parts
label_encoder = LabelEncoder()
df['specialization_encoded'] = label_encoder.fit_transform(df['specialization'])
df['location_encoded'] = label_encoder.fit_transform(df['location'])
# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['avg_rate', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']
df[numerical_features] = scaler.fit_transform(df[numerical_features])
# Generate BERT embeddings for text features
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def get_bert_embeddings(text_list):
text_list = [str(text) for text in text_list] # Convert all elements to strings
inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=128)
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state[:, 0, :].numpy() # Use [CLS] token embeddings
# Combine specialization and location for BERT embeddings
df['text_features'] = df['specialization'] + " " + df['location']
df['symptoms'] = df['specialization'] # Assuming symptoms are similar to specialization for this example
# Generate BERT embeddings
embeddings = get_bert_embeddings(df['text_features'].tolist())
embeddings_df = pd.DataFrame(embeddings, index=df.index)
# Split the data into train and test sets
# Keep 'avg_rate' for collaborative filtering
X = pd.concat([df.drop(columns=['text_features', 'specialization', 'location']), embeddings_df], axis=1)
X.columns = X.columns.astype(str)
y = df['avg_rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]
# Impute missing values in X_train_ml and X_test_ml
imputer = SimpleImputer(strategy='median') # Or any other strategy you prefer
X_train_ml = imputer.fit_transform(X_train_ml)
X_test_ml = imputer.transform(X_test_ml)
# Train the RandomForestRegressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train_ml, y_train)
# Collaborative Filtering
user_item_matrix = X_train.pivot_table(index='specialization_encoded', columns='location_encoded', values='avg_rate')
user_item_matrix.fillna(0, inplace=True)
def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
# Find the nearest doctors based on user symptoms and location
user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]
cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
df['similarity'] = cosine_similarities
# Get top 10 similar doctors
similar_doctors = df.nlargest(10, 'similarity')
recommendations = []
for _, row in similar_doctors.iterrows():
doctor_id = row.name
doctor_specialization = row['specialization_encoded']
doctor_location = row['location_encoded']
doctor_rating = row['avg_rate']
# Collaborative Filtering Score
cf_score = matrix.loc[doctor_specialization, doctor_location]
# RandomForest Score
rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + list(embeddings_df.columns)].values.reshape(1, -1)
rf_score = model_rf.predict(rf_features)[0]
# Combine scores
final_score = (cf_score + rf_score) / 2
recommendations.append((doctor_id, final_score))
recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
return recommendations
# User input for recommendations
user_symptoms = "Skin Rash"
user_location = "El-Mansoura"
recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)
Here's the error
KeyError Traceback (most recent call last)
<ipython-input-7-6399ed6786be> in <cell line: 113>()
111 user_symptoms = "Skin Rash"
112 user_location = "El-Mansoura"
--> 113 recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)
114
115 print("Top 5 Recommendations:")
8 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in _raise_if_missing(self, key, indexer, axis_name)
5939
5940 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 5941 raise KeyError(f"{not_found} not in index")
5942
5943 @overload
KeyError: '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316,...
Upvotes: -2
Views: 42
Reputation: 1
The error is in the hybrid_recommendation function. The code is trying to predict data that is different from what it was trained on. The change in data results from not properly concatenating the embedding data to the training data used in the step.
# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]
Here is the modification to the hybrid_recommendation function to use only the data present in train data:
def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
# Find the nearest doctors based on user symptoms and location
user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]
cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
df['similarity'] = cosine_similarities
# Get top 10 similar doctors
similar_doctors = df.nlargest(10, 'similarity')
recommendations = []
for _, row in similar_doctors.iterrows():
doctor_id = row.name
doctor_specialization = row['specialization_encoded']
doctor_location = row['location_encoded']
doctor_rating = row['avg_rate']
# Collaborative Filtering Score
cf_score = matrix.loc[doctor_specialization, doctor_location]
# RandomForest Score
# rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + list(embeddings_df.columns)].values.reshape(1, -1)
#This is where the error is, the data used to train the model is different from what you are trying to predict.
rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']].values.reshape(1, -1)
rf_features = imputer.transform(rf_features)
rf_score = model_rf.predict(rf_features)[0]
# Combine scores
final_score = (cf_score + rf_score) / 2
recommendations.append((doctor_id, final_score))
recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
return recommendations
Upvotes: 0