Reputation: 993
Hoping for a few pointers on how I can optimise this code. Ideally I'd like to stick with pandas, but I assume there are some nifty SQLite tricks I can use to get a good speed-up. For additional "points", I'd love to know whether Cython could help at all here.
In case it's not obvious from the code: for context, I'm having to read millions of very small SQLite files (the files in "unCompressedDir") and merge their contents into one much larger "master" SQLite DB ("6thJan.db").
Thanks in advance everyone!
%%cython -a
import os
import pandas as pd
import sqlite3
import time
import sys
def main(root_dir="/Users/harryrobinson/Desktop/dataForMartin/",
         uncompressed_dir="/Users/harryrobinson/Desktop/dataForMartin/unCompressedSqlFiles/"):
    """Merge every small SQLite file in *uncompressed_dir* into one master
    database, ``root_dir/6thJan.db`` (table ``userData``).

    Each small file is expected to hold an ``Events`` table (a leading
    row-id column followed by seven data columns) and a ``Global`` table
    whose first row carries formatVersion, appVersion, userID and
    operatingSystem.  Files that cannot be processed are reported and
    skipped.

    Parameters
    ----------
    root_dir : str
        Directory where the master DB ``6thJan.db`` is created/appended.
    uncompressed_dir : str
        Directory containing the per-user SQLite files to merge.
    """
    event_cols = ['TimeStamp', 'Category', 'Action',
                  'Parameter1Name', 'Parameter1Value',
                  'Parameter2Name', 'Parameter2Value']
    master = sqlite3.connect(os.path.join(root_dir, '6thJan.db'))
    try:
        master.execute(
            "CREATE TABLE IF NOT EXISTS userData("
            "TimeStamp, Category, Action, Parameter1Name, Parameter1Value, "
            "Parameter2Name, Parameter2Value, formatVersion, appVersion, "
            "userID, operatingSystem)"
        )
        for name in os.listdir(uncompressed_dir):
            try:
                source = sqlite3.connect(os.path.join(uncompressed_dir, name))
                try:
                    df_events = pd.read_sql_query('SELECT * FROM Events', source)
                    df_global = pd.read_sql_query('SELECT * FROM Global', source)
                finally:
                    # Explicitly close: sqlite3's context manager only
                    # commits, it does NOT close, which leaks handles
                    # across millions of files.
                    source.close()
                # The first Events column is a row id we do not keep.
                df_events = df_events.iloc[:, 1:]
                df_events.columns = event_cols
                # Global holds one row of per-file metadata; replicate it
                # onto every event row.
                df_events['formatVersion'] = df_global.iloc[0, 0]
                df_events['appVersion'] = df_global.iloc[0, 1]
                df_events['userID'] = df_global.iloc[0, 2]
                df_events['operatingSystem'] = df_global.iloc[0, 3]
                # Insert inside the same try: previously a failed read
                # silently re-inserted the PREVIOUS file's dataframe.
                df_events.to_sql("userData", master, if_exists="append",
                                 index=False)
            except Exception as e:
                # Best-effort: report which file failed and keep going.
                print("Error processing {0}: {1} - line {2}".format(
                    name, e, sys.exc_info()[-1].tb_lineno))
        master.commit()
    finally:
        master.close()
UPDATE: halved the time by adding a transaction instead of to_sql
Upvotes: 2
Views: 1059
Reputation: 107652
Reconsider using Pandas as a staging tool (leave the library for data analysis). Simply write pure SQL queries which can be accommodated by using SQLite's ATTACH to query external databases.
# Pure-SQL merge: ATTACH each small DB to the master connection and copy
# rows with a single INSERT ... SELECT, avoiding the pandas round trip.
with sqlite3.connect(os.path.join(rootDir, '6thJan.db')) as conn:
    destCursor = conn.cursor()
    createTable = """CREATE TABLE IF NOT EXISTS userData(
                        TimeStamp TEXT, Category TEXT, Action TEXT, Parameter1Name TEXT,
                        Parameter1Value TEXT, Parameter2Name TEXT, Parameter2Value TEXT,
                        formatVersion TEXT, appVersion TEXT, userID TEXT, operatingSystem TEXT
                     );"""
    destCursor.execute(createTable)
    conn.commit()

    for i in os.listdir(unCompressedDir):
        # Fix 1: execute() parameters must be a sequence, not a bare str.
        # Fix 2: ATTACH needs the full path; the bare file name would be
        # resolved relative to the current working directory.
        destCursor.execute("ATTACH ? AS curr_db;",
                           (os.path.join(unCompressedDir, i),))
        # NOTE(review): the question's Events table appears to carry a
        # leading row-id column (the pandas version drops column 0); if
        # so, e.* yields one column too many for userData — list the
        # seven event columns explicitly once the schema is confirmed.
        sql = """INSERT INTO userData
                 SELECT e.*, g.formatVersion, g.appVersion, g.userID, g.operatingSystem
                 FROM curr_db.[events] e
                 CROSS JOIN (SELECT * FROM curr_db.[global] LIMIT 1) g;"""
        destCursor.execute(sql)
        conn.commit()
        destCursor.execute("DETACH curr_db;")
Upvotes: 1