Rashid

Reputation: 1384

Using Kaggle code/model to predict classifications for unseen dataset

I have obtained the following code along with a dataset from a Kaggle notebook: https://www.kaggle.com/code/danofer/predicting-protein-classification/notebook

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Import Datasets
df_seq = pd.read_csv('pdb_data_seq.csv')
df_char = pd.read_csv('pdb_data_no_dups.csv')

print('Datasets have been loaded...')

# 2). ----- Filter and Process Dataset ------

# Filter for only proteins
protein_char = df_char[df_char.macromoleculeType == 'Protein']
protein_seq = df_seq[df_seq.macromoleculeType == 'Protein']

print(protein_char.head())

print(protein_seq.describe(include="all"))

print(protein_char.columns)

# Select some variables to join
protein_char = protein_char[['structureId', 'classification', 'residueCount', 'resolution',
                             'structureMolecularWeight', 'crystallizationTempK',
                             'densityMatthews', 'densityPercentSol', 'phValue']]
protein_seq = protein_seq[['structureId','sequence']]
print(protein_seq.head())

print(protein_char.head())

# Join two datasets on structureId
model_f = protein_char.set_index('structureId').join(protein_seq.set_index('structureId'))
print(model_f.head())

print('%d is the number of rows in the joined dataset' %model_f.shape[0])

# Check NA counts
print(model_f.isnull().sum())

# Drop rows with missing values
model_f = model_f.dropna()
print('%d is the number of proteins that have a classification and sequence' %model_f.shape[0])

# Look at classification type counts
counts = model_f.classification.value_counts()
print(counts)

# Plot counts as a KDE curve (sns.distplot is deprecated in recent seaborn;
# kdeplot draws the same hist=False curve)
plt.figure()
sns.kdeplot(counts[counts > 1000], color='purple')
plt.title('Count Distribution for Family Types')
plt.ylabel('% of records')
plt.show()

# Get classification types where counts are over 1000
types = np.asarray(counts[(counts > 1000)].index)
print(len(types))

# Filter dataset's records for classification types > 1000
data = model_f[model_f.classification.isin(types)].copy()
# The join can introduce duplicate rows, so drop duplicate (classification, sequence) pairs
data = data.drop_duplicates(subset=["classification", "sequence"])
print(types)

print('%d is the number of records in the final filtered dataset' %data.shape[0])

print(data.shape)

## Could add n-grams
## https://stackoverflow.com/questions/18658106/quick-implementation-of-character-n-grams-using-python
# jump_size !=1 -> less overlap in n-grams. 
def char_grams(text,n=3,jump_size=2):
    return [text[i:i+n] for i in range(0,len(text)-n+1,jump_size)]
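# e.g. char_grams("ABCDEFG") -> ['ABC', 'CDE', 'EFG']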

print(data.head(3).sequence.apply(char_grams))  # preview the character n-grams

data["3mers"] = data.sequence.apply(char_grams)

print(data.tail())

data.to_csv("protein_classification_46k_ngrams.csv.gz",compression="gzip")

# 3). ----- Train Test Split -----

# Split Data
X_train, X_test, y_train, y_test = train_test_split(data['sequence'], data['classification'], test_size = 0.2, random_state = 1)

# Create a CountVectorizer that builds a vocabulary of character 4-grams of the sequences
vect = CountVectorizer(analyzer = 'char_wb', ngram_range = (4,4))

# Fit and Transform CountVectorizer
vect.fit(X_train)
X_train_df = vect.transform(X_train)
X_test_df = vect.transform(X_test)
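# X_train_df and X_test_df are sparse count matrices of shape (n_samples, n_features)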

# Print a few of the features
print(vect.get_feature_names_out()[-20:])
# 4). ------ Machine Learning Models ------

# Make a prediction dictionary to store accuracies
prediction = dict()

# Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_df, y_train)
NB_pred = model.predict(X_test_df)
prediction["MultinomialNB"] = accuracy_score(NB_pred, y_test)
print(prediction['MultinomialNB'])

# Adaboost
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier()
model.fit(X_train_df,y_train)
ADA_pred = model.predict(X_test_df)
prediction["Adaboost"] = accuracy_score(ADA_pred , y_test)
print(prediction["Adaboost"])

# 5). ----- Plot Confusion Matrix for NB -----

# Plot confusion matrix
conf_mat = confusion_matrix(y_test, NB_pred, labels = types)

# Normalize the confusion matrix so each row sums to 1
conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]

# Plot Heat Map
fig , ax = plt.subplots()
fig.set_size_inches(13, 8)
sns.heatmap(conf_mat)
plt.show()

print(types[3])
#print(types[38])

#Print F1 score metrics
print(classification_report(y_test, NB_pred, target_names = types))

However, my dataset is different: it is a CSV that contains only sequences, and it shares just one column with the train/test data. How can I use this code/model to predict a classification for each of my sequences and write the predictions into a new column? Any guidance would be appreciated.

Upvotes: -1

Views: 142

Answers (1)

Harshad Patil

Reputation: 261

While training a model, you need both X_train and y_train, because that is how the model learns the relationship between features and labels; training requires labeled data. In the prediction phase, the model applies what it has learned to new, unlabeled data. It's like school: you learn everything from books (the training phase), but in the exam you don't get the books and have to answer from your own knowledge (the prediction phase). To apply this to your code: after training the model with model.fit(), call model.predict() on your new data.
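Here is a minimal sketch of that last step, assuming the fitted vect and model objects from your script are still in scope. The file name new_sequences.csv and its sequence column are placeholders; rename them to match your own dataset:

import pandas as pd

# Load the unseen dataset (file and column names are placeholders)
new_df = pd.read_csv('new_sequences.csv')

# Encode the new sequences with the vocabulary learned during training.
# Use transform(), not fit_transform(), so the features line up with the model.
X_new = vect.transform(new_df['sequence'])

# Predict a classification for every sequence and store it in a new column
new_df['predicted_classification'] = model.predict(X_new)
new_df.to_csv('new_sequences_with_predictions.csv', index=False)

The key point is that the unseen data only needs the sequence column; the classification column is exactly what the model fills in for you.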

Upvotes: 0
