Reputation: 865
I have a dataset in .csv format that looks like this:
id,interaction_flag,x_coordinate,y_coordinate,z_coordinate,hydrophobicity_kd,hydrophobicity_ww,hydrophobicity_hh,surface_tension,charge_cooh,charge_nh3,charge_r,alpha_helix,beta_strand,turn,van_der_walls,mol_wt,solublity
229810,1,-33.8675148907451,-110.273691995647,100.021824089754,0.129381338742408,0.129381338742408,0.129381338742408,57.9996957403639,2.20539553752535,9.55985801217038,4.47146044624688,1.08064908722114,1.20135902636915,0.611653144016251,145.232251521298,107.951643002026,21.5344036511141
229811,1,-26.9070290467923,-117.172163712053,106.980243932766,0.922048681541592,0.922048681541592,0.922048681541592,58.5383367139972,2.03983772819472,9.23210953346856,1.58401622717997,0.84178498985806,1.0387626774848,0.921703853955354,124.73630831643,84.1570182555755,10.7648600405665
I am trying to compute a Receiver Operating Characteristic (ROC) curve from this data, following this example: http://scikit-learn.org/0.11/auto_examples/plot_roc.html
My target is the interaction_flag column, and the features are all the columns after interaction_flag. However, my program keeps running and never finishes.
When I run the test example given in that link, it runs within a moment.
Can anyone tell me what I am doing wrong? Or do I need to do something else to load my data the way the iris dataset is loaded?
my code :
import numpy as np
import pylab as pl
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
training = 'dataset/training_5000_col.csv'
test = 'dataset/test_5000_col.csv'
random_state = np.random.RandomState(0)


def _load_csv(path, label_col=1, first_feature_col=2):
    """Read one CSV file into a feature matrix and a label vector.

    The defaults follow the documented file layout
    ``id,interaction_flag,<feature columns...>``: the label is taken from
    column 1 (``interaction_flag``) and every column from index 2 onward
    is used as a feature.

    Parameters
    ----------
    path : str
        Path of the comma-separated file to read.
    label_col : int
        Index of the column holding the integer class label.
    first_feature_col : int
        Index of the first feature column; all later columns are features.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The feature matrix and the label vector, in file order.

    Raises
    ------
    ValueError
        If a label is not an integer or a feature is not a number.
    """
    features = []
    labels = []
    # `with` guarantees the handle is closed even if a row fails to parse.
    with open(path) as handle:
        for line in handle:
            fields = line.rstrip().split(',')
            # Skip blank lines and the header row (it starts with "id");
            # int()/float() would otherwise raise on the column names.
            if not fields[0] or fields[0] == 'id':
                continue
            labels.append(int(fields[label_col]))
            features.append([float(v) for v in fields[first_feature_col:]])
    return np.array(features), np.array(labels)


# Load the training and test sets with the same column layout.
X_train, y_train = _load_csv(training)
X_test, y_test = _load_csv(test)

# A ROC curve is only defined when both classes are present, so fail early
# with a clear message instead of letting the classifier or metric choke.
for split_name, split_labels in (('training', y_train), ('test', y_test)):
    if len(np.unique(split_labels)) < 2:
        raise ValueError(
            "%s labels contain a single class; ROC analysis needs both "
            "positive and negative examples" % split_name)

# Run classifier
# NOTE(review): the features are on very different scales (e.g. mol_wt vs.
# hydrophobicity); an SVM on unscaled data can take very long to converge.
# Consider standardizing the features first -- TODO confirm this is the
# cause of the long run time.
classifier = svm.SVC(kernel='linear', probability=True)
probas_ = classifier.fit(X_train, y_train).predict_proba(X_test)

# Compute ROC curve and area under the curve, scoring with the predicted
# probability of the positive class (second column).
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
# Single-argument print(...) produces the same text on Python 2 and 3.
print("y_test :  %s" % (y_test,))
print("fpr :  %s" % (fpr,))
print("tpr :  %s" % (tpr,))
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

# Plot ROC curve together with the chance diagonal
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()
My .csv file is at: http://pastebin.com/iet5xQW2. How can I plot a ROC curve from this .csv?
Upvotes: 0
Views: 1160
Reputation: 17015
You need to have two different labels in order to plot the ROC curve. The following example works for me if I add some 0 labels to your data. I used pandas to read the data; the rest is the same as the sklearn example.
Also, you need to split the dataset into training and test sets so that the ROC curve is plotted on the test set.
import pandas as pd
import numpy as np
from scipy import interp
import pylab as pl
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold
def data(filename):
    """Load a comma-separated file and split it into features and labels.

    The first row of the file is treated as the header (pandas default).
    Column 0 is ignored, column 1 is returned as the labels and every
    column from index 2 onward is returned as the feature matrix. The
    distinct label values are printed so a single-class file is easy to
    spot.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(features, labels)`` in file order.
    """
    # Malformed rows still raise: that is the pandas default. The explicit
    # error_bad_lines/warn_bad_lines flags only restated that default and
    # were removed in pandas 2.0, where passing them is a TypeError.
    frame = pd.read_csv(filename, sep=',', low_memory=False)
    table = np.asarray(frame)
    features = table[:, 2:]
    labels = table[:, 1]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(np.unique(labels))
    return features, labels
filename = '../data/sodata.csv'
X, y = data(filename)

###############################################################################
# Cross-validated ROC analysis
# Fit the classifier on each stratified fold, draw that fold's ROC curve and
# accumulate the curves on a common FPR grid to obtain a mean ROC.
cv = StratifiedKFold(y, n_folds=6)
classifier = svm.SVC(kernel='linear', probability=True, random_state=0)

# Common grid of false-positive rates on which every fold is resampled.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = 0.0
all_tpr = []

for i, (train_idx, test_idx) in enumerate(cv):
    # Train on this fold's training part, score its held-out part.
    classifier.fit(X[train_idx], y[train_idx])
    probas_ = classifier.predict_proba(X[test_idx])

    # ROC of the fold, scored with the second probability column.
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])

    # Resample the fold's curve onto the common grid and pin its start to 0.
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0

    roc_auc = auc(fpr, tpr)
    fold_label = 'ROC fold %d (area = %0.2f)' % (i, roc_auc)
    pl.plot(fpr, tpr, lw=1, label=fold_label)

# Diagonal reference line, labelled 'Luck' in the legend.
pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

# Average the accumulated curves over the folds and pin the end point to 1.
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
pl.plot(mean_fpr, mean_tpr, 'k--',
        label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

# Leave a small margin around the unit square, then label and show the plot.
pl.xlim([-0.05, 1.05])
pl.ylim([-0.05, 1.05])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()
Upvotes: 2