Hanley Soilsmith
Hanley Soilsmith

Reputation: 629

Using my own .csv in tensorflow

I asked a previous question about the same code here: "Where and how to put the filename in this tensorflow code?"

Not sure if I should merge that into this question or leave it as is.

The following code is from Sirajology's GitHub. I have not found a straightforward tutorial on how to get one's own .csv file into a simple tensorflow neural network, so my hope is that this thread might provide that instruction for future searchers.

The code is as follows

import tensorflow.python.platform

import numpy as np
import tensorflow as tf

# Global variables.
NUM_LABELS = 2  # Number of distinct labels (width of the one-hot label matrix).
BATCH_SIZE = 5  # Number of training examples used per training step.

# Flags usable from the command line.
tf.app.flags.DEFINE_string(
    'train', None,
    'File containing the training data (labels & features).')
tf.app.flags.DEFINE_string(
    'test', None,
    'File containing the test data (labels & features).')
# The training loop runs num_epochs * train_size // BATCH_SIZE steps, so this
# flag is the number of passes over the training data.
tf.app.flags.DEFINE_integer(
    'num_epochs', 1,
    'Number of passes over the training data.')
tf.app.flags.DEFINE_boolean('verbose', False, 'Produce verbose output.')
FLAGS = tf.app.flags.FLAGS

def extract_data(filename, num_labels=None):
    """Load labels and feature vectors from a CSV file.

    Every non-blank row of the file must have the form:
        label, feat_0, feat_1, ..., feat_n

    Args:
        filename: Path of the CSV file to read.
        num_labels: Number of classes, i.e. the number of columns of the
            one-hot label matrix. Defaults to the module-level NUM_LABELS.

    Returns:
        A pair (features, labels_onehot): a float32 numpy matrix with one row
        per example, and a float32 one-hot array with one row per example and
        num_labels columns.

    Raises:
        ValueError: If a label is not an integer or a feature is not a float.
    """
    if num_labels is None:
        num_labels = NUM_LABELS

    # Lists to hold the labels and feature vectors.
    labels = []
    fvecs = []

    # open() replaces the Python 2 only file() builtin; the with-block makes
    # sure the handle is closed even if parsing a row fails.
    with open(filename) as csv_file:
        for line in csv_file:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline at the end of the file)
            # instead of failing on int('').
            if not line:
                continue
            row = line.split(",")
            labels.append(int(row[0]))
            fvecs.append([float(x) for x in row[1:]])

    # Convert the list of float lists into a numpy float matrix.
    fvecs_np = np.matrix(fvecs).astype(np.float32)

    # Convert the list of int labels into a numpy array.
    labels_np = np.array(labels).astype(dtype=np.uint8)

    # Broadcasting a row of class indices against a column of labels yields a
    # boolean matrix that is True exactly at each example's label.
    labels_onehot = (np.arange(num_labels) == labels_np[:, None]).astype(np.float32)

    # Return a pair of the feature matrix and the one-hot label matrix.
    return fvecs_np, labels_onehot

def main(argv=None):
    """Train a softmax classifier on the --train file and score it on --test.

    Args:
        argv: Unused. Accepted because tf.app.run() calls main with the
            leftover command-line arguments.

    Raises:
        ValueError: If the --train or --test flag was not supplied.
    """
    # Be verbose?
    verbose = FLAGS.verbose

    # Get the data.
    train_data_filename = FLAGS.train
    test_data_filename = FLAGS.test
    if not train_data_filename or not test_data_filename:
        # Both flags default to None; fail with a clear message rather than
        # an obscure error from trying to open None.
        raise ValueError('Both --train and --test must name a data file.')

    # Extract it into numpy matrices.
    train_data, train_labels = extract_data(train_data_filename)
    test_data, test_labels = extract_data(test_data_filename)

    # Get the shape of the training data.
    train_size, num_features = train_data.shape

    # Get the number of epochs for training.
    num_epochs = FLAGS.num_epochs

    # This is where training samples and labels are fed to the graph.
    # These placeholder nodes will be fed a batch of training data at each
    # training step using the feed_dict argument of the run() calls below.
    x = tf.placeholder("float", shape=[None, num_features])
    y_ = tf.placeholder("float", shape=[None, NUM_LABELS])

    # Define and initialize the network.

    # These are the weights that inform how much each feature contributes to
    # the classification.
    W = tf.Variable(tf.zeros([num_features, NUM_LABELS]))
    b = tf.Variable(tf.zeros([NUM_LABELS]))
    y = tf.nn.softmax(tf.matmul(x, W) + b)

    # Optimization.
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

    # Evaluation.
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    # Create a local session to run this computation.
    with tf.Session() as s:
        # Run all the initializers to prepare the trainable parameters.
        # NOTE(review): later TensorFlow 1.x releases deprecate this call in
        # favour of tf.global_variables_initializer() - confirm against the
        # installed version before switching.
        tf.initialize_all_variables().run()
        if verbose:
            print('Initialized!')
            print()
            print('Training.')

        # Iterate and train. range() replaces the Python 2 only xrange().
        num_steps = num_epochs * train_size // BATCH_SIZE
        for step in range(num_steps):
            if verbose:
                # Keep the step counters on one line; the line is ended
                # below once the last batch of an epoch has been fed.
                print(step, end=' ')

            # Wrap around the training set so every epoch reuses the same rows.
            offset = (step * BATCH_SIZE) % train_size
            batch_data = train_data[offset:(offset + BATCH_SIZE), :]
            batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
            train_step.run(feed_dict={x: batch_data, y_: batch_labels})

            if verbose and offset >= train_size - BATCH_SIZE:
                print()

        # Give very detailed output.
        if verbose:
            print()
            print('Weight matrix.')
            print(s.run(W))
            print()
            print('Bias vector.')
            print(s.run(b))
            print()
            print("Applying model to first test instance.")
            first = test_data[:1]
            print("Point =", first)
            print("Wx+b = ", s.run(tf.matmul(first, W) + b))
            print("softmax(Wx+b) = ", s.run(tf.nn.softmax(tf.matmul(first, W) + b)))
            print()

        print("Accuracy:", accuracy.eval(feed_dict={x: test_data, y_: test_labels}))


# Script entry point: hand control to tf.app.run() when executed directly.
if __name__ == '__main__':
    tf.app.run()

When I run the code from the terminal (Windows 10 cmd line) with the following command: python YourScript.py --train FileName.csv --test TestName.csv --num_epochs 5 --verbose True, I get these errors. Any help is greatly appreciated!

Error #1 File "softmax.py", line 133, in <module>

tf.app.run()

Error #2 File "C:\app.py", line 43, in run sys.exit(main(sys.argv[:1] + flags_passthrough))

labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32) 

Error #3 File "softmax.py", line 57, in main train_data,train_labels = extract_data(train_data_filename)

train_data,train_labels = extract_data(train_data_filename) 
test_data, test_labels = extract_data(test_data_filename)

Error #4 File "softmax.py", line 31, in extract_data for line in file(filename): NameError: name 'file' is not defined

for line in file(filename):
        row = line.split(",")
        labels.append(int(row[7]))
        fvecs.append([float(x) for x in row[1:6]])

Upvotes: 0

Views: 455

Answers (1)

mrry
mrry

Reputation: 126194

It looks like the problem stems from this line, which uses a built-in function (file()) that is not available in Python 3.5:

for line in file(filename):

Replacing it with the following line should fix the error:

for line in open(filename):

Upvotes: 1

Related Questions