sklearn oneclass svm KeyError

Question

My Dataset is a set of system calls for both malware and benign, I preprocessed it and now it looks like this

NtQueryPerformanceCounter
NtProtectVirtualMemory
NtProtectVirtualMemory
NtQuerySystemInformation
NtQueryVirtualMemory
NtQueryVirtualMemory
NtProtectVirtualMemory
NtOpenKey
NtOpenKey
NtOpenKey
NtQuerySecurityAttributesToken
NtQuerySecurityAttributesToken
NtQuerySystemInformation
NtQuerySystemInformation
NtAllocateVirtualMemory
NtFreeVirtualMemory

Now I'm using tfidf to extract the features and then use ngram to make a sequence of them

from __future__ import print_function

import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.svm import OneClassSVM

nGRAM1 = 8
nGRAM2 = 10
weight = 4

main_corpus_MAL = []
main_corpus_target_MAL = []
main_corpus_BEN = []
main_corpus_target_BEN = []

my_categories = ['benign', 'malware']

# feeding corpus the testing data

print("Loading system call database for categories:")
print(my_categories if my_categories else "all")

import glob
import os

malCOUNT = 0
benCOUNT = 0
for filename in glob.glob(os.path.join('C:\Users\alika\Documents\testingSVM\sysMAL', '*.txt')):
    fMAL = open(filename, "r")
    aggregate = ""
    for line in fMAL:
        linea = line[:(len(line)-1)]
        aggregate += " " + linea
    main_corpus_MAL.append(aggregate)
    main_corpus_target_MAL.append(1)
    malCOUNT += 1

for filename in glob.glob(os.path.join('C:\Users\alika\Documents\testingSVM\sysBEN', '*.txt')):
    fBEN = open(filename, "r")
    aggregate = ""
    for line in fBEN:
        linea = line[:(len(line) - 1)]
        aggregate += " " + linea
    main_corpus_BEN.append(aggregate)
    main_corpus_target_BEN.append(0)
    benCOUNT += 1

# weight as determined in the top of the code
train_corpus = main_corpus_BEN[:(weight*len(main_corpus_BEN)//(weight+1))]
train_corpus_target = main_corpus_target_BEN[:(weight*len(main_corpus_BEN)//(weight+1))]
test_corpus = main_corpus_MAL[(len(main_corpus_MAL)-(len(main_corpus_MAL)//(weight+1))):]
test_corpus_target = main_corpus_target_MAL[(len(main_corpus_MAL)-len(main_corpus_MAL)//(weight+1)):]

def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

# size of datasets
train_corpus_size_mb = size_mb(train_corpus)
test_corpus_size_mb = size_mb(test_corpus)

print("%d documents - %0.3fMB (training set)" % (
    len(train_corpus_target), train_corpus_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(test_corpus_target), test_corpus_size_mb))
print("%d categories" % len(my_categories))
print()
print("Benign Traces: "+str(benCOUNT)+" traces")
print("Malicious Traces: "+str(malCOUNT)+" traces")
print()

print("Extracting features from the training data using a sparse vectorizer...")
t0 = time()

vectorizer = TfidfVectorizer(ngram_range=(nGRAM1, nGRAM2), min_df=1, use_idf=True, smooth_idf=True) ##############

analyze = vectorizer.build_analyzer()

X_train = vectorizer.fit_transform(train_corpus)

duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, train_corpus_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer...")
t0 = time()
X_test = vectorizer.transform(test_corpus)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, test_corpus_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

The output is:

Loading system call database for categories:
['benign', 'malware']
177 documents - 45.926MB (training set)
44 documents - 12.982MB (test set)
2 categories

Benign Traces: 72 traces
Malicious Traces: 150 traces

Extracting features from the training data using a sparse vectorizer...
done in 7.831695s at 5.864MB/s
n_samples: 177, n_features: 603170

Extracting features from the test data using the same vectorizer...
done in 1.624100s at 7.993MB/s
n_samples: 44, n_features: 603170

Now for the learning section I'm trying to use sklearn OneClassSVM:

print("==================
")
print("Training: ")
classifier = OneClassSVM(kernel='linear', gamma='auto')
classifier.fit(X_test)

fraud_pred = classifier.predict(X_test)

unique, counts = np.unique(fraud_pred, return_counts=True)
print (np.asarray((unique, counts)).T)

fraud_pred = pd.DataFrame(fraud_pred)
fraud_pred= fraud_pred.rename(columns={0: 'prediction'})
main_corpus_target = pd.DataFrame(main_corpus_target)
main_corpus_target= main_corpus_target.rename(columns={0: 'Category'})

this the output to fraud_pred and main_corpus_target

prediction
0   1
1  -1
2   1
3   1
4   1
5  -1
6   1
7  -1
...
30 rows * 1 column
====================
Category
0   1
1   1
2   1
3   1
4   1
...
217 0
218 0
219 0
220 0
221 0
222 rows * 1 column

but when i try to calculate TP,TN,FP,FN:

##Performance check of the model

TP = FN = FP = TN = 0
for j in range(len(main_corpus_target)):
    if main_corpus_target['Category'][j]== 0 and fraud_pred['prediction'][j] == 1:
        TP = TP+1
    elif main_corpus_target['Category'][j]== 0 and fraud_pred['prediction'][j] == -1:
        FN = FN+1
    elif main_corpus_target['Category'][j]== 1 and fraud_pred['prediction'][j] == 1:
        FP = FP+1
    else:
        TN = TN +1
print (TP,  FN,  FP,  TN)

I get this error:

KeyError                                  Traceback (most recent call last)
 in 
      7     elif main_corpus_target['Category'][j]== 0 and fraud_pred['prediction'][j] == -1:
      8         FN = FN+1
----> 9     elif main_corpus_target['Category'][j]== 1 and fraud_pred['prediction'][j] == 1:
     10         FP = FP+1
     11     else:

c:\users\alika\appdata\local\programs\python\python36\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
   1069         key = com.apply_if_callable(key, self)
   1070         try:
-> 1071             result = self.index.get_value(self, key)
   1072 
   1073             if not is_scalar(result):

c:\users\alika\appdata\local\programs\python\python36\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
   4728         k = self._convert_scalar_indexer(k, kind="getitem")
   4729         try:
-> 4730             return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
   4731         except KeyError as e1:
   4732             if len(self) > 0 and (self.holds_integer() or self.is_boolean()):

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 30

1) I know the error is because it's trying to access a key that isn’t in a dictionary, but i can't just insert some numbers in the fraud_pred to handle this issue, any suggestions??
2) Am i doing anything wrong that they don't match?
3) I want to compare the results to other one class classification algorithms, Due to my method, what are the best ones that i can use??

sklearn oneclass svm KeyError

Answers (1)

Related Questions