Hybridized collaborative filtering and sentence similarity-based system for doctor recommendation based on user input of symptoms and location

Question

I'm trying to solve a problem of recommending a doctor based on a user's symptoms and location using a hybridized collaborative filtering and sentence similarity-based recommender system that follow these steps:

Handle missing values.
Encode categorical features.
Normalize numerical features.
Generate Sentence Embeddings: Use BERT to generate embeddings for the text features (symptoms and doctor specialization).
Collaborative Filtering Model: Create a user-item matrix based on doctor specializations and locations.
Sentence Similarity-Based Recommender: Calculate the similarity between user symptoms and doctor specializations using cosine similarity of BERT embeddings.
Hybrid Model: Combine the collaborative filtering scores and sentence similarity scores to provide personalized recommendations.
Evaluate the recommendation quality using metrics like precision, recall, F1-score, and accuracy.

Here's the code I've written

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
file_path = "Egyptian Doctors.csv"
df = pd.read_csv(file_path)

# Data Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='median')
df['avg_rate'] = imputer.fit_transform(df[['avg_rate']])
df['Wait_time_Minutes'] = imputer.fit_transform(df[['Wait_time_Minutes']])
df['doctor_visitors'] = imputer.fit_transform(df[['doctor_visitors']])

# Encode categorical features for non-BERT parts
label_encoder = LabelEncoder()
df['specialization_encoded'] = label_encoder.fit_transform(df['specialization'])
df['location_encoded'] = label_encoder.fit_transform(df['location'])

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['avg_rate', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Generate BERT embeddings for text features
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text_list):
    text_list = [str(text) for text in text_list]  # Convert all elements to strings
    inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # Use [CLS] token embeddings

# Combine specialization and location for BERT embeddings
df['text_features'] = df['specialization'] + " " + df['location']
df['symptoms'] = df['specialization']  # Assuming symptoms are similar to specialization for this example

# Generate BERT embeddings
embeddings = get_bert_embeddings(df['text_features'].tolist())
embeddings_df = pd.DataFrame(embeddings, index=df.index)

# Split the data into train and test sets
# Keep 'avg_rate' for collaborative filtering
X = pd.concat([df.drop(columns=['text_features', 'specialization', 'location']), embeddings_df], axis=1)
X.columns = X.columns.astype(str)
y = df['avg_rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]

# Impute missing values in X_train_ml and X_test_ml
imputer = SimpleImputer(strategy='median') # Or any other strategy you prefer
X_train_ml = imputer.fit_transform(X_train_ml)
X_test_ml = imputer.transform(X_test_ml)

# Train the RandomForestRegressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train_ml, y_train)

# Collaborative Filtering
user_item_matrix = X_train.pivot_table(index='specialization_encoded', columns='location_encoded', values='avg_rate')
user_item_matrix.fillna(0, inplace=True)

def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
    # Find the nearest doctors based on user symptoms and location
    user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]
    
    cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
    df['similarity'] = cosine_similarities

    # Get top 10 similar doctors
    similar_doctors = df.nlargest(10, 'similarity')

    recommendations = []

    for _, row in similar_doctors.iterrows():
        doctor_id = row.name
        doctor_specialization = row['specialization_encoded']
        doctor_location = row['location_encoded']
        doctor_rating = row['avg_rate']

        # Collaborative Filtering Score
        cf_score = matrix.loc[doctor_specialization, doctor_location]

        # RandomForest Score
        rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + list(embeddings_df.columns)].values.reshape(1, -1)
        rf_score = model_rf.predict(rf_features)[0]

        # Combine scores
        final_score = (cf_score + rf_score) / 2

        recommendations.append((doctor_id, final_score))

    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

    return recommendations

# User input for recommendations
user_symptoms = "Skin Rash"
user_location = "El-Mansoura"
recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)

Here's the error

KeyError                                  Traceback (most recent call last)
 in ()
    111 user_symptoms = "Skin Rash"
    112 user_location = "El-Mansoura"
--> 113 recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)
    114 
    115 print("Top 5 Recommendations:")

8 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in _raise_if_missing(self, key, indexer, axis_name)
   5939 
   5940             not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 5941             raise KeyError(f"{not_found} not in index")
   5942 
   5943     @overload

KeyError: '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316,...

Hybridized collaborative filtering and sentence similarity-based system for doctor recommendation based on user input of symptoms and location

Answers (1)

Related Questions