Sadura Akinrinwa
Sadura Akinrinwa

Reputation: 1

Hybridized collaborative filtering and sentence similarity-based system for doctor recommendation based on user input of symptoms and location

I'm trying to solve a problem of recommending a doctor based on a user's symptoms and location using a hybridized collaborative filtering and sentence similarity-based recommender system that follow these steps:

  1. Handle missing values.
  2. Encode categorical features.
  3. Normalize numerical features.
  4. Generate Sentence Embeddings: Use BERT to generate embeddings for the text features (symptoms and doctor specialization).
  5. Collaborative Filtering Model: Create a user-item matrix based on doctor specializations and locations.
  6. Sentence Similarity-Based Recommender: Calculate the similarity between user symptoms and doctor specializations using cosine similarity of BERT embeddings.
  7. Hybrid Model: Combine the collaborative filtering scores and sentence similarity scores to provide personalized recommendations.
  8. Evaluate the recommendation quality using metrics like precision, recall, F1-score, and accuracy.

Here's the code I've written

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
file_path = "Egyptian Doctors.csv"
df = pd.read_csv(file_path)

# Data Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='median')
df['avg_rate'] = imputer.fit_transform(df[['avg_rate']])
df['Wait_time_Minutes'] = imputer.fit_transform(df[['Wait_time_Minutes']])
df['doctor_visitors'] = imputer.fit_transform(df[['doctor_visitors']])

# Encode categorical features for non-BERT parts
label_encoder = LabelEncoder()
df['specialization_encoded'] = label_encoder.fit_transform(df['specialization'])
df['location_encoded'] = label_encoder.fit_transform(df['location'])

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['avg_rate', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Generate BERT embeddings for text features
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text_list):
    text_list = [str(text) for text in text_list]  # Convert all elements to strings
    inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # Use [CLS] token embeddings

# Combine specialization and location for BERT embeddings
df['text_features'] = df['specialization'] + " " + df['location']
df['symptoms'] = df['specialization']  # Assuming symptoms are similar to specialization for this example

# Generate BERT embeddings
embeddings = get_bert_embeddings(df['text_features'].tolist())
embeddings_df = pd.DataFrame(embeddings, index=df.index)

# Split the data into train and test sets
# Keep 'avg_rate' for collaborative filtering
X = pd.concat([df.drop(columns=['text_features', 'specialization', 'location']), embeddings_df], axis=1)
X.columns = X.columns.astype(str)
y = df['avg_rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]

# Impute missing values in X_train_ml and X_test_ml
imputer = SimpleImputer(strategy='median') # Or any other strategy you prefer
X_train_ml = imputer.fit_transform(X_train_ml)
X_test_ml = imputer.transform(X_test_ml)

# Train the RandomForestRegressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train_ml, y_train)

# Collaborative Filtering
user_item_matrix = X_train.pivot_table(index='specialization_encoded', columns='location_encoded', values='avg_rate')
user_item_matrix.fillna(0, inplace=True)

def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
    # Find the nearest doctors based on user symptoms and location
    user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]
    
    cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
    df['similarity'] = cosine_similarities

    # Get top 10 similar doctors
    similar_doctors = df.nlargest(10, 'similarity')

    recommendations = []

    for _, row in similar_doctors.iterrows():
        doctor_id = row.name
        doctor_specialization = row['specialization_encoded']
        doctor_location = row['location_encoded']
        doctor_rating = row['avg_rate']

        # Collaborative Filtering Score
        cf_score = matrix.loc[doctor_specialization, doctor_location]

        # RandomForest Score
        rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + list(embeddings_df.columns)].values.reshape(1, -1)
        rf_score = model_rf.predict(rf_features)[0]

        # Combine scores
        final_score = (cf_score + rf_score) / 2

        recommendations.append((doctor_id, final_score))

    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

    return recommendations

# User input for recommendations
user_symptoms = "Skin Rash"
user_location = "El-Mansoura"
recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)

Here's the error

KeyError                                  Traceback (most recent call last)
<ipython-input-7-6399ed6786be> in <cell line: 113>()
    111 user_symptoms = "Skin Rash"
    112 user_location = "El-Mansoura"
--> 113 recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)
    114 
    115 print("Top 5 Recommendations:")

8 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in _raise_if_missing(self, key, indexer, axis_name)
   5939 
   5940             not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 5941             raise KeyError(f"{not_found} not in index")
   5942 
   5943     @overload

KeyError: '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316,...

Upvotes: -2

Views: 42

Answers (1)

The error is in the hybrid_recommendation function. The code is trying to predict data that is different from what it was trained on. The change in data results from not properly concatenating the embedding data to the training data used in the step.

# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]

Here is the modification to the hybrid_recommendation function to use only the data present in train data:

def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
# Find the nearest doctors based on user symptoms and location
user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]

cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
df['similarity'] = cosine_similarities

# Get top 10 similar doctors
similar_doctors = df.nlargest(10, 'similarity')

recommendations = []

for _, row in similar_doctors.iterrows():
    doctor_id = row.name
    doctor_specialization = row['specialization_encoded']
    doctor_location = row['location_encoded']
    doctor_rating = row['avg_rate']

    # Collaborative Filtering Score
    cf_score = matrix.loc[doctor_specialization, doctor_location]

    # RandomForest Score
    # rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + list(embeddings_df.columns)].values.reshape(1, -1)

    #This is where the error is, the data used to train the model is different from what you are trying to predict.

    rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']].values.reshape(1, -1)
    rf_features = imputer.transform(rf_features)
    rf_score = model_rf.predict(rf_features)[0]

    # Combine scores
    final_score = (cf_score + rf_score) / 2

    recommendations.append((doctor_id, final_score))

recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

return recommendations

Upvotes: 0

Related Questions