veena
veena

Reputation: 865

working with dataset in sklearn

I have a dataset in the following format, stored in a .csv file:

id,interaction_flag,x_coordinate,y_coordinate,z_coordinate,hydrophobicity_kd,hydrophobicity_ww,hydrophobicity_hh,surface_tension,charge_cooh,charge_nh3,charge_r,alpha_helix,beta_strand,turn,van_der_walls,mol_wt,solublity  
229810,1,-33.8675148907451,-110.273691995647,100.021824089754,0.129381338742408,0.129381338742408,0.129381338742408,57.9996957403639,2.20539553752535,9.55985801217038,4.47146044624688,1.08064908722114,1.20135902636915,0.611653144016251,145.232251521298,107.951643002026,21.5344036511141        
229811,1,-26.9070290467923,-117.172163712053,106.980243932766,0.922048681541592,0.922048681541592,0.922048681541592,58.5383367139972,2.03983772819472,9.23210953346856,1.58401622717997,0.84178498985806,1.0387626774848,0.921703853955354,124.73630831643,84.1570182555755,10.7648600405665

I am trying to get Receiver Operating Characteristics (ROC) from this data using this link : http://scikit-learn.org/0.11/auto_examples/plot_roc.html

My target is the interaction_flag column, and the test data is all the columns after interaction_flag. However, my program keeps running and never finishes.

When I run the test example given in that link, it runs within a moment.

Can anyone let me know what I am doing wrong? Or do I need to do something else to load my data the way the iris dataset is loaded?

my code :

import numpy as np
import pylab as pl
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc

training = 'dataset/training_5000_col.csv'
test = 'dataset/test_5000_col.csv'

random_state = np.random.RandomState(0)

# Import some data to play with
#iris = datasets.load_iris()
#X = iris.data
#y = iris.target
# Parse the training CSV: every row contributes one integer label (column
# index 2) and ten float features (column indices 5 through 14).
# NOTE(review): in the sample header shown in the question, interaction_flag
# is at index 1 -- confirm that index 2 is the intended label column for
# this particular file.
X = []
y = []
# The per-row parsing has to live inside the loop body; with it dedented,
# only the last line of the file would ever be parsed. The `with` block also
# makes sure the file handle is closed once reading is done.
with open(training) as training_file:
    for line in training_file:
        z = line.rstrip().split(',')
        y.append(int(z[2]))
        X.append([float(z[a]) for a in range(5, 15)])
X_train = np.array(X)
y_train = np.array(y)



# Parse the test CSV the same way as the training data: one integer label
# (column index 2) and ten float features (column indices 5 through 14)
# per row.
# NOTE(review): in the sample header shown in the question, interaction_flag
# is at index 1 -- confirm that index 2 is the intended label column for
# this particular file.
X1 = []
y1 = []
# The loop body must be indented under the `for`; without it the script does
# not even parse. The `with` block closes the file handle after reading.
with open(test) as test_file:
    for line in test_file:
        z = line.rstrip().split(',')
        y1.append(int(z[2]))
        X1.append([float(z[a]) for a in range(5, 15)])
X_test = np.array(X1)
y_test = np.array(y1)

# Run classifier
classifier = svm.SVC(kernel='linear', probability=True)
probas_ = classifier.fit(X_train, y_train).predict_proba(X_test)

# Compute ROC curve and area the curve
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
print "y_test : ", y_test
print "fpr : ", fpr
print "tpr : ", tpr
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

# Plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()

My .csv file is at: http://pastebin.com/iet5xQW2. How can I plot the ROC curve with this .csv?

Upvotes: 0

Views: 1160

Answers (1)

Abhishek Thakur
Abhishek Thakur

Reputation: 17015

You need to have two different labels in order to plot the ROC curve. The following example works for me if I add some 0 labels to your data. I have used pandas to read the data; the rest is the same as the sklearn example.

Also, you need to split the dataset into training and test sets so that the ROC curve is plotted on the test set.

import pandas as pd
import numpy as np
from scipy import interp
import pylab as pl

from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold




def data(filename):
    """Read a comma-separated file and split it into features and labels.

    The file is loaded with pandas and converted to a NumPy array. The
    distinct label values are printed, then a ``(features, labels)`` tuple
    is returned, where ``labels`` is column 1 of the table and ``features``
    is every column from index 2 onward.
    """
    frame = pd.read_table(filename, sep=',', warn_bad_lines=True,
                          error_bad_lines=True, low_memory=False)
    table = np.asarray(frame)

    features, labels = table[:, 2:], table[:, 1]
    # Show which label values are present in the file.
    print(np.unique(labels))

    return features, labels




filename = '../data/sodata.csv'
X, y = data(filename)

###############################################################################
# Classification and ROC analysis

# Six-fold stratified cross-validation with a linear-kernel SVM that
# produces probability estimates; one ROC curve is plotted per fold.
cv = StratifiedKFold(y, n_folds=6)
classifier = svm.SVC(kernel='linear', probability=True, random_state=0)

# Accumulator for the TPR values interpolated onto a common 100-point FPR grid.
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for fold, (train_idx, test_idx) in enumerate(cv):
    classifier.fit(X[train_idx], y[train_idx])
    probas_ = classifier.predict_proba(X[test_idx])
    # ROC curve for this fold, built from the second probability column
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])
    # Add this fold's curve, resampled on the shared FPR grid, to the running
    # sum and force the first point to 0.0
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    # Area under this fold's curve, shown in the legend entry
    roc_auc = auc(fpr, tpr)
    fold_label = 'ROC fold %d (area = %0.2f)' % (fold, roc_auc)
    pl.plot(fpr, tpr, lw=1, label=fold_label)

# Turn the accumulated TPR sum into a per-fold average, pin the last point
# to 1.0, and compute the area under the averaged curve.
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)

# Grey dashed diagonal labelled 'Luck', followed by the mean ROC curve.
pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
mean_label = 'Mean ROC (area = %0.2f)' % mean_auc
pl.plot(mean_fpr, mean_tpr, 'k--', label=mean_label, lw=2)

# Both axes use the same slightly padded range.
for set_limits in (pl.xlim, pl.ylim):
    set_limits([-0.05, 1.05])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()

Upvotes: 2

Related Questions