Reputation: 129
I have 5 attributes I'm wanting to compare across a few thousand records with a view to splitting the population of records into a discrete set of records whose attributes match a particular combination.
Each of the attributes can take one of three values when compared against an existing dataset containing the same attributes, viz. =, > or <
In essence the number of combinations comprises a Cartesian product which I've generated using itertools.product, with the following code snippet:
from itertools import product
tc = ("=", ">", "<")
dr = ("=", ">", "<")
bd = ("=", ">", "<")
fr = ("=", ">", "<")
ch = ("=", ">", "<")
permutations = list(product(tc, dr, bd, fr, ch))
the resulting list takes the form of:
('=', '=', '=', '=', '=')
('=', '=', '=', '=', '>')
('=', '=', '=', '=', '<')
('=', '=', '=', '>', '=')
...
I'd previously coded a Python script to dynamically generate the required SQL queries, run them and write the results out to text file (CSV)
The code relies on iterating through a list containing a dictionary that was at the time defined as follows (clearly this approach is not viable where more attributes are introduced.):
permutations = []
permutations.append({'tc': "=", 'dr': "=", 'bd': "=", 'fr': "="})
permutations.append({'tc': ">", 'dr': "=", 'bd': "=", 'fr': "="})
permutations.append({'tc': "<", 'dr': "=", 'bd': "=", 'fr': "="})
permutations.append({'tc': "=", 'dr': ">", 'bd': "=", 'fr': "="})
permutations.append({'tc': ">", 'dr': ">", 'bd': "=", 'fr': "="})
I'd then iterate through the list dynamically using the respective 'tc', 'dr', 'bd' and 'fr' values in each iteration and then generate the SQL query snippet:
for permutation in permutations:
tc = permutation.get('tc')
dr = permutation.get('dr')
bd = permutation.get('bd')
fr = permutation.get('fr')
criteria = (f"a.f1 {tc} b.f1 AND "
f"a.f2 {dr} b.f2 AND "
f"a.f3 {bd} b.f3 AND "
f"a.f4 {fr} b.f4")
Question: what is the most efficient way of turning the list generated by itertools.product into the equivalent list with embedded dictionary my code has been leveraging - or would it be better to modify my code to reference elements of the list directly?
Upvotes: 0
Views: 79
Reputation: 129
import pandas as pd
import sqlite3
from itertools import product
def run_query(query1, query2, iteration, dbconn):
""" generate a dataframe containing the query results and if a result is returned,
write it out to file and delete the corresponding records from the table """
df = pd.read_sql( query1, dbconn)
if len(df) > 0:
df.to_csv(f"/tmp/db/results/{iteration}alib_has_{tc}tc_{dr}dr_{bd}bd_{fr}fr_{ch}ch.csv", index = False, sep="|")
dbcursor.execute( query2 )
""" set up all the global variables for the script """
""" define the fields we want to return from the query """
fields = ("b.__dirpath jbod_dirpath")
""" name the tables we will be iterating """
table1 = "alib_dirnames"
table2 = "jbod_dirnames"
""" Define the datapoints and their possible values to iterate """
tc = ("=", ">", "<")
dr = ("=", ">", "<")
bd = ("=", ">", "<")
fr = ("=", ">", "<")
ch = ("=", ">", "<")
""" Generate the resulting Cartesian product """
permutations = list(product(tc, dr, bd, fr, ch))
""" establish database connection """
conn = sqlite3.connect("/tmp/db/alib_jbod.db")
dbcursor = conn.cursor()
""" now iterate the queries """
for tc, dr, bd, fr, ch in permutations:
""" define select criteria - 1st query is highest order match: albumartist, album & version"""
criteria = (f"a.albumartist IS NOT NULL AND "
f"b.albumartist IS NOT NULL AND "
f"a.album IS NOT NULL AND "
f"b.album IS NOT NULL AND "
f"a.version IS NOT NULL AND "
f"b.version IS NOT NULL AND "
f"a.albumartist = b.albumartist COLLATE NOCASE AND "
f"a.album = b.album COLLATE NOCASE AND "
f"a.version = b.version COLLATE NOCASE AND "
f"a.track_count {tc} b.track_count AND "
f"a.dynamicrange {dr} b.dynamicrange AND "
f"a.__bitspersample {bd} b.__bitspersample AND "
f"a.__frequency_num {fr} b.__frequency_num AND "
f"a.__channels {ch} b.__channels")
""" define the SQL query to be run """
query1 = (f"SELECT {fields} "
f"FROM {table1} a "
f"INNER JOIN {table2} b "
f"ON ( {criteria} ) "
f"WHERE a.__dirname NOT LIKE 'CD%' "
f"ORDER BY b.__dirpath;")
""" define the corresponding delete query """
query2 = (f"DELETE FROM {table2} "
f"WHERE {table2}.__dirpath IN ( "
f"SELECT b.__dirpath "
f"FROM {table1} a "
f"INNER JOIN {table2} b "
f"ON ( {criteria} ) "
f"WHERE a.__dirname NOT LIKE 'CD%');")
run_query(query1, query2, 1, conn)
Upvotes: 1