Reputation: 17647
I am trying to create a database table from a pandas DataFrame. I receive a strange error that I am not able to replicate with a simple example.
This is the code
import pandas as pd
import os
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
path = 'sqlite://'
engine = create_engine(path, echo=False)
df_kerbs, df_flows = get_dataframes()
df = pd.merge(df_kerbs, df_flows, on=['Src', 'SrcPort', 'Dst', 'DstPort', 'First'], how='right')
df.to_sql(name='FlowsKerbsMerged', con=engine, index=False, if_exists='append',
index_label='First')
And this is the output:
Traceback (most recent call last):
File "merge_data.py", line 61, in <module>
index_label='First')
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/core/generic.py", line 1201, in to_sql
chunksize=chunksize, dtype=dtype)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/io/sql.py", line 470, in to_sql
chunksize=chunksize, dtype=dtype)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/io/sql.py", line 1147, in to_sql
table.create()
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/io/sql.py", line 596, in create
self._execute_create()
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/io/sql.py", line 581, in _execute_create
self.table.create()
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/sql/schema.py", line 756, in create
checkfirst=checkfirst)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1929, in _run_visitor
conn._run_visitor(visitorcallable, element, **kwargs)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1538, in _run_visitor
**kwargs).traverse_single(element)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/sql/visitors.py", line 121, in traverse_single
return meth(obj, **kw)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/sql/ddl.py", line 767, in visit_table
include_foreign_key_constraints=include_foreign_key_constraints
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 945, in execute
return meth(self, multiparams, params)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/sql/ddl.py", line 68, in _execute_on_connection
return connection._execute_ddl(self, multiparams, params)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1002, in _execute_ddl
compiled
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1189, in _execute_context
context)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1402, in _handle_dbapi_exception
exc_info
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/util/compat.py", line 203, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1182, in _execute_context
context)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/default.py", line 470, in do_execute
cursor.execute(statement, parameters)
sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) duplicate column name: Application [SQL: u'\nCREATE TABLE "FlowsKerbsMerged" (\n\t"ClientIP" FLOAT, \n\t"ConversationIdentifier" FLOAT, \n\t"CorvilOrderId" FLOAT, \n\t"DomainName" FLOAT, \n\t"DomainNameSalt" FLOAT, \n\t"HostName" FLOAT, \n\t"HostNameAugmented" FLOAT, \n\t"HostNameSalt" FLOAT, \n\t"KRB AP-REQ.ap-options" FLOAT, \n\t"KRB AP-REQ.msg-type" FLOAT, \n\t"KRB AP-REQ.pvno" FLOAT, \n\t"KRB ETYPE-INFO2-ENTRY.etype" FLOAT, \n\t"KRB ETYPE-INFO2-ENTRY.salt" FLOAT, \n\t"KRB EncryptedData.cipher" FLOAT, \n\t"KRB EncryptedData.etype" FLOAT, \n\t"KRB EncryptedData.kvno" FLOAT, \n\t"KRB HostAddress.addr-type" FLOAT, \n\t"KRB HostAddress.address" FLOAT, \n\t"KRB KDC-REP.crealm" FLOAT, \n\t"KRB KDC-REP.msg-type" FLOAT, \n\t"KRB KDC-REP.pvno" FLOAT, \n\t"KRB KDC-REQ-BODY.etype.values" FLOAT, \n\t"KRB KDC-REQ-BODY.etype.values:repeating" FLOAT, \n\t"KRB KDC-REQ-BODY.kdc-options" FLOAT, \n\t"KRB KDC-REQ-BODY.nonce" FLOAT, \n\t"KRB KDC-REQ-BODY.realm" FLOAT, \n\t"KRB KDC-REQ-BODY.rtime" FLOAT, \n\t"KRB KDC-REQ-BODY.till" FLOAT, \n\t"KRB KDC-REQ.msg-type" FLOAT, \n\t"KRB KDC-REQ.pvno" FLOAT, \n\t"KRB KERB-PA-PAC-REQUEST.include-pac" FLOAT, \n\t"KRB KRB-ERROR.e-data" FLOAT, \n\t"KRB KRB-ERROR.error-code" FLOAT, \n\t"KRB KRB-ERROR.error-code_string" FLOAT, \n\t"KRB KRB-ERROR.error-description_string" FLOAT, \n\t"KRB KRB-ERROR.msg-type" FLOAT, \n\t"KRB KRB-ERROR.pvno" FLOAT, \n\t"KRB KRB-ERROR.realm" FLOAT, \n\t"KRB KRB-ERROR.stime" FLOAT, \n\t"KRB KRB-ERROR.susec" FLOAT, \n\t"KRB PA-DATA.padata-type" FLOAT, \n\t"KRB PA-DATA.padata-value" FLOAT, \n\t"KRB PA-ENC-TS-ENC.patimestamp" FLOAT, \n\t"KRB PA-PAC-OPTIONS.flags" FLOAT, \n\t"KRB PrincipalName.name-string.values" FLOAT, \n\t"KRB PrincipalName.name-string.values:repeating" FLOAT, \n\t"KRB PrincipalName.name-type" FLOAT, \n\t"KRB Ticket.realm" FLOAT, \n\t"KRB Ticket.tkt-vno" FLOAT, \n\t"Msg Decoder" FLOAT, \n\t"Msg ID" FLOAT, \n\t"Msg Type" FLOAT, \n\t"RequestReply" FLOAT, \n\t"ServerIP" FLOAT, \n\t"UserName" FLOAT, \n\t"UserNameAugmented" FLOAT, \n\t"UserNameSalt" FLOAT, \n\t"Watchlist Match" FLOAT, \n\tapplication FLOAT, \n\tbytes FLOAT, \n\tclient_port FLOAT, \n\tdir FLOAT, \n\t"Dst" TEXT, \n\t"DstPort" FLOAT, \n\tserver_port FLOAT, \n\t"Src" TEXT, \n\t"SrcPort" FLOAT, \n\ttime DATETIME, \n\tts_ns FLOAT, \n\t"First" BIGINT, \n\t"Protocol" TEXT, \n\t"Group ID" BIGINT, \n\t"Port" TEXT, \n\t"VPort" BIGINT, \n\t"IP TOS" TEXT, \n\t"VLAN ID" FLOAT, \n\t"VLAN Pri" FLOAT, \n\t"MPLS Exp" FLOAT, \n\t"Application" TEXT, \n\t"Packets" BIGINT, \n\t"Messages" BIGINT, \n\t"Bytes" BIGINT, \n\t"Last" BIGINT, \n\t"LearnedIPs alertable" FLOAT, \n\t"LearnedIPs learned-ip" FLOAT, \n\t"LearnedIPs new-ips" FLOAT, \n\t"LearnedIPs subnet-name" FLOAT, \n\t"LearnedIPs timestamp-sec" FLOAT, \n\t"LearnedIPs total-ips" FLOAT, \n\t"SrcSubnet" TEXT, \n\t"DstSubnet" TEXT\n)\n\n']
(scientific_python_2_7) [lpuggini@machinelearn-1 Kerberos]$
Further, I do not receive the error if before:
df.to_sql(name='FlowsKerbsMerged', con=engine, index=False, if_exists='append', index_label='First')
I rename two of the columns i.e.
df = df.rename(columns={'Application': 'Application2', 'Bytes': 'Bytes2'}
In conclusion it seems like in my df
I have repeated columns but this does not seem to be the case as:
In [6]: len(df.columns)
Out[6]: 91
In [7]: len(set(df.columns))
Out[7]: 91
What may the reason of the error be?
EDIT:
This is the list of columns:
In [5]: df.columns
Out[5]:
Index([u'ClientIP', u'ConversationIdentifier', u'CorvilOrderId', u'DomainName',
u'DomainNameSalt', u'HostName', u'HostNameAugmented', u'HostNameSalt',
u'KRB AP-REQ.ap-options', u'KRB AP-REQ.msg-type', u'KRB AP-REQ.pvno',
u'KRB ETYPE-INFO2-ENTRY.etype', u'KRB ETYPE-INFO2-ENTRY.salt',
u'KRB EncryptedData.cipher', u'KRB EncryptedData.etype',
u'KRB EncryptedData.kvno', u'KRB HostAddress.addr-type',
u'KRB HostAddress.address', u'KRB KDC-REP.crealm',
u'KRB KDC-REP.msg-type', u'KRB KDC-REP.pvno',
u'KRB KDC-REQ-BODY.etype.values',
u'KRB KDC-REQ-BODY.etype.values:repeating',
u'KRB KDC-REQ-BODY.kdc-options', u'KRB KDC-REQ-BODY.nonce',
u'KRB KDC-REQ-BODY.realm', u'KRB KDC-REQ-BODY.rtime',
u'KRB KDC-REQ-BODY.till', u'KRB KDC-REQ.msg-type', u'KRB KDC-REQ.pvno',
u'KRB KERB-PA-PAC-REQUEST.include-pac', u'KRB KRB-ERROR.e-data',
u'KRB KRB-ERROR.error-code', u'KRB KRB-ERROR.error-code_string',
u'KRB KRB-ERROR.error-description_string', u'KRB KRB-ERROR.msg-type',
u'KRB KRB-ERROR.pvno', u'KRB KRB-ERROR.realm', u'KRB KRB-ERROR.stime',
u'KRB KRB-ERROR.susec', u'KRB PA-DATA.padata-type',
u'KRB PA-DATA.padata-value', u'KRB PA-ENC-TS-ENC.patimestamp',
u'KRB PA-PAC-OPTIONS.flags', u'KRB PrincipalName.name-string.values',
u'KRB PrincipalName.name-string.values:repeating',
u'KRB PrincipalName.name-type', u'KRB Ticket.realm',
u'KRB Ticket.tkt-vno', u'Msg Decoder', u'Msg ID', u'Msg Type',
u'RequestReply', u'ServerIP', u'UserName', u'UserNameAugmented',
u'UserNameSalt', u'Watchlist Match', u'application', u'bytes',
u'client_port', u'dir', u'Dst', u'DstPort', u'server_port', u'Src',
u'SrcPort', u'time', u'ts_ns', u'First', u'Protocol', u'Group ID',
u'Port', u'VPort', u'IP TOS', u'VLAN ID', u'VLAN Pri', u'MPLS Exp',
u'Application', u'Packets', u'Messages', u'Bytes', u'Last',
u'LearnedIPs alertable', u'LearnedIPs learned-ip',
u'LearnedIPs new-ips', u'LearnedIPs subnet-name',
u'LearnedIPs timestamp-sec', u'LearnedIPs total-ips', u'SrcSubnet',
u'DstSubnet'],
dtype='object')
Upvotes: 0
Views: 3351
Reputation: 27899
At index 58
of your list you have a column named application
.
At index 78
of your list you have a column named Application
.
At index 59
of your list you have a column named bytes
.
At index 81
of your list you have a column named Bytes
.
Python doesn't see these as duplicates because set()
is case sensitive e.g.:
set(['Foo', 'foo', 'Foo'])
#{'Foo', 'foo'}
You can check this also with:
len(set(map(lambda x: x.lower(), df.columns)))
#89
Upvotes: 5