Reputation: 9
I am getting 100% accuracy on both the training and testing sets across multiple models—Decision Tree, Random Forest, SVM, KNN, and Logistic Regression. I am trying to predict phishing websites, which is a classification problem. I expected the accuracy to differ between the training and testing sets, and to vary from one model to another, but every model consistently reports 100% accuracy on both sets. What could be the issue with my code?
machine learning language
# -*- coding: utf-8 -*-
"""Detection_phishing_website.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1X3FW4ETVRWcRa0nn_crBcBqf-6gwWRP4
"""
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be opened by path from this notebook
from google.colab import drive
drive.mount('/content/drive')
#importing numpy and pandas which are required for data pre-processing
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read the PhiUSIIL phishing-URL CSV from the mounted Drive folder
data1 = pd.read_csv(
    "/content/drive/MyDrive/Dataset/PhiUSIIL_Phishing_URL_Dataset (1).csv"
)
data1.head()
data1.shape

# Remove the five listed columns so they are not used as features; keep the
# result as an independent copy so data1 stays untouched
data = data1.drop(columns=['Domain', 'URL', 'TLD', 'Title', 'FILENAME']).copy()

# Structural checks: column dtypes, summary statistics, missing values per column
data.info()
data.describe()
data.isnull().sum()

# Plot one histogram per column to inspect the data distribution
data.hist(bins=50, figsize=(25, 25))
plt.show()
# Separating the target column ('label') from the feature columns: X & y
y = data['label']
X = data.drop(columns='label')
X.shape, y.shape

# NOTE(review): if unrelated model families all score ~100% on BOTH splits,
# the cause is usually in the data rather than in the split itself - e.g. a
# feature that (almost) encodes the label, or duplicated rows landing in both
# splits. Worth checking X.corrwith(y) and data.duplicated().sum() - confirm
# against the dataset documentation.

# Splitting the dataset into train and test sets: 80-20 split (test_size=0.2),
# seeded so the same rows land in each split on every run
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape
"""**Machine Learning Models & Training**"""
#importing packages
from sklearn.metrics import accuracy_score
# Creating holders to store the model performance results.
# The three lists are kept in lockstep: index i of each list describes the
# same model, so they can be combined into one table later.
ML_Model = []
acc_train = []
acc_test = []


# function to call for storing the results
def storeResults(model, a, b):
    """Append one model's name and accuracies to the module-level result lists.

    Args:
        model: Label identifying the model (e.g. 'Decision Tree').
        a: Accuracy on the training data.
        b: Accuracy on the test data.

    Both accuracies are rounded to three decimals before being stored.
    Every call appends a new entry, so calling it twice for the same model
    produces a duplicate entry.
    """
    ML_Model.append(model)
    acc_train.append(round(a, 3))
    acc_test.append(round(b, 3))
"""**Decision Tree Classifier**"""
# Decision Tree model
from sklearn.tree import DecisionTreeClassifier

# instantiate the model
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run; without it the reported accuracies can change
# between executions even though the train/test split itself is seeded
tree = DecisionTreeClassifier(max_depth=7, random_state=42)
# fit the model
tree.fit(X_train, y_train)
print(tree)
# Alternatively, print the model's parameters explicitly using get_params
params = tree.get_params()
print(params)

# predicting the target value from the model for the samples
y_test_tree = tree.predict(X_test)
y_train_tree = tree.predict(X_train)

# computing the accuracy of the model performance
acc_train_tree = accuracy_score(y_train, y_train_tree)
acc_test_tree = accuracy_score(y_test, y_test_tree)
print("Decision Tree: Accuracy on training Data: {:.3f}".format(acc_train_tree))
print("Decision Tree: Accuracy on test Data: {:.3f}".format(acc_test_tree))

# checking the feature importance in the model: one horizontal bar per
# feature column, labelled with the column name
plt.figure(figsize=(16, 18))
n_features = X_train.shape[1]
plt.barh(range(n_features), tree.feature_importances_, align='center')
plt.yticks(np.arange(n_features), X_train.columns)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
plt.show()

# storing the results. Argument order matters: name, train accuracy, test accuracy.
# Caution: Execute only once to avoid duplications.
storeResults('Decision Tree', acc_train_tree, acc_test_tree)
"""Random Forest Classifier"""
# Random Forest model
from sklearn.ensemble import RandomForestClassifier

# instantiate the model
# random_state pins the bootstrap sampling and per-split feature sampling, so
# the forest (and its reported accuracies) is reproducible across runs
forest = RandomForestClassifier(max_depth=5, random_state=42)
# fit the model
forest.fit(X_train, y_train)

# predicting the target value from the model for the samples
y_test_forest = forest.predict(X_test)
y_train_forest = forest.predict(X_train)

# computing the accuracy of the model performance
acc_train_forest = accuracy_score(y_train, y_train_forest)
acc_test_forest = accuracy_score(y_test, y_test_forest)
print("Random forest: Accuracy on training Data: {:.3f}".format(acc_train_forest))
print("Random forest: Accuracy on test Data: {:.3f}".format(acc_test_forest))

# checking the feature importance in the model: one horizontal bar per
# feature column, labelled with the column name
plt.figure(figsize=(16, 18))
n_features = X_train.shape[1]
plt.barh(range(n_features), forest.feature_importances_, align='center')
plt.yticks(np.arange(n_features), X_train.columns)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
plt.show()

# storing the results. Argument order matters: name, train accuracy, test accuracy.
# Caution: Execute only once to avoid duplications.
storeResults('Random Forest', acc_train_forest, acc_test_forest)
""" Gradient Boost Classsification"""
# Gradient Boosting Classifier Model
from sklearn.ensemble import GradientBoostingClassifier

# instantiate the model
# random_state pins the estimator's internal randomness so repeated runs
# report the same accuracies
gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7, random_state=42)
# fit the model
gbc.fit(X_train, y_train)

# predicting the target value from the model for the samples
y_train_gbc = gbc.predict(X_train)
y_test_gbc = gbc.predict(X_test)

# computing the accuracy of the model performance
from sklearn import metrics  # Import the metrics module
acc_train_gbc = metrics.accuracy_score(y_train, y_train_gbc)
acc_test_gbc = metrics.accuracy_score(y_test, y_test_gbc)
print("Gradient Boosting Classifier : Accuracy on training Data: {:.3f}".format(acc_train_gbc))
print("Gradient Boosting Classifier : Accuracy on test Data: {:.3f}".format(acc_test_gbc))

# storing the results. Argument order matters: name, train accuracy, test accuracy.
# Caution: Execute only once to avoid duplications.
storeResults('Gradient Boosting', acc_train_gbc, acc_test_gbc)
#Support vector machine model
from sklearn.svm import SVC
# instantiate the model: linear kernel, regularisation strength C=1.0, seeded
# NOTE(review): a linear-kernel SVC fitted on unscaled features can be very
# slow on large datasets - consider scaling the features first; confirm
# against the actual dataset size
svm = SVC(kernel='linear', C=1.0, random_state=12)
#fit the model
svm.fit(X_train, y_train)
#predicting the target value from the model for the samples
y_test_svm = svm.predict(X_test)
y_train_svm = svm.predict(X_train)
#computing the accuracy of the model performance
acc_train_svm = accuracy_score(y_train,y_train_svm)
acc_test_svm = accuracy_score(y_test,y_test_svm)
print("SVM: Accuracy on training Data: {:.3f}".format(acc_train_svm))
print("SVM : Accuracy on test Data: {:.3f}".format(acc_test_svm))
#storing the results. The below mentioned order of parameter passing is important.
#Caution: Execute only once to avoid duplications.
storeResults('SVM', acc_train_svm, acc_test_svm) # first argument is the model's display name, followed by train and test accuracy
"""K-Nearest Neighbors : Classifier"""
# K-Nearest Neighbors Classifier model
from sklearn.neighbors import KNeighborsClassifier

# instantiate the model
# With n_neighbors=1 every training sample is its own nearest neighbour
# (distance 0), so the training accuracy is 1.0 by construction and carries
# no information. Use scikit-learn's default of 5 neighbours so the training
# score actually reflects how well the model generalises over a neighbourhood.
knn = KNeighborsClassifier(n_neighbors=5)
# fit the model
knn.fit(X_train, y_train)

# predicting the target value from the model for the samples
y_train_knn = knn.predict(X_train)
y_test_knn = knn.predict(X_test)

# computing the accuracy of the model performance
from sklearn import metrics  # Import the metrics module
acc_train_knn = metrics.accuracy_score(y_train, y_train_knn)
acc_test_knn = metrics.accuracy_score(y_test, y_test_knn)
print("K-Nearest Neighbors : Accuracy on training Data: {:.3f}".format(acc_train_knn))
print("K-Nearest Neighbors : Accuracy on test Data: {:.3f}".format(acc_test_knn))

# storing the results. Argument order matters: name, train accuracy, test accuracy.
# Caution: Execute only once to avoid duplications.
storeResults('KNN', acc_train_knn, acc_test_knn)
"""Logistic Regression Classifier"""
# Logistic regression model (a classifier, despite the "regression" in its name)
from sklearn.linear_model import LogisticRegression
# Imported here so this section does not depend on an earlier section having
# already brought `metrics` into scope
from sklearn import metrics

# instantiate the model
# NOTE(review): the features are passed in unscaled; if the solver emits a
# ConvergenceWarning, scale the features (e.g. StandardScaler in a Pipeline)
# or raise max_iter - confirm by checking the fit output.
log = LogisticRegression()
# fit the model
log.fit(X_train, y_train)

# predicting the target value from the model for the samples
y_train_log = log.predict(X_train)
y_test_log = log.predict(X_test)

# computing the accuracy of the model performance
acc_train_log = metrics.accuracy_score(y_train, y_train_log)
acc_test_log = metrics.accuracy_score(y_test, y_test_log)
print("Logistic Regression : Accuracy on training Data: {:.3f}".format(acc_train_log))
print("Logistic Regression : Accuracy on test Data: {:.3f}".format(acc_test_log))

# storing the results. Argument order matters: name, train accuracy, test accuracy.
# Caution: Execute only once to avoid duplications.
storeResults('Logistic Regression', acc_train_log, acc_test_log)
"""Comparision model"""
# Assemble the collected per-model scores into one comparison table:
# one row per stored model, with its train and test accuracy side by side
results = pd.DataFrame(
    data={
        'ML Model': ML_Model,
        'Train Accuracy': acc_train,
        'Test Accuracy': acc_test,
    }
)
results
Upvotes: 0
Views: 23