Reputation: 89
I am trying the code below, but I am getting an error:
import os
import datetime as dt
import pandas as pd
from sqlalchemy import create_engine

# 29 named features, followed by the 96 '_x' and 96 '_y' numbered columns.
names = ['Unnamed: 0', 'id', 'is_duplicate', 'cwc_min', 'cwc_max', 'csc_min', 'csc_max',
         'ctc_min', 'ctc_max', 'last_word_eq', 'first_word_eq', 'abs_len_diff', 'mean_len',
         'token_set_ratio', 'token_sort_ratio', 'fuzz_ratio', 'fuzz_partial_ratio',
         'longest_substr_ratio', 'freq_qid1', 'freq_qid2', 'q1len', 'q2len', 'q1_n_words',
         'q2_n_words', 'word_Common', 'word_Total', 'word_share', 'freq_q1+q2', 'freq_q1-q2']
names += ['{}_x'.format(i) for i in range(96)] + ['{}_y'.format(i) for i in range(96)]

if not os.path.isfile('train.db'):
    disk_engine = create_engine('sqlite:///train.db')
    start = dt.datetime.now()
    chunksize = 15000
    j = 0
    index_start = 1
    # Read the wide CSV in chunks and append each chunk to the SQLite table.
    for df in pd.read_csv('final_features.csv', names=names, chunksize=chunksize,
                          iterator=True, encoding='utf-8'):
        df.index += index_start
        j += 1
        print('{} rows'.format(j * chunksize))
        df.to_sql('data', disk_engine, if_exists='append')
        index_start = df.index[-1] + 1
This is the output that I am getting:
15000 rows
---------------------------------------------------------------------------
OperationalError                          Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
   1192                         parameters,
-> 1193                         context)
   1194         except BaseException as e:

~\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
    506     def do_execute(self, cursor, statement, parameters, context=None):
--> 507         cursor.execute(statement, parameters)
    508

OperationalError: too many SQL variables

The above exception was the direct cause of the following exception:

OperationalError                          Traceback (most recent call last)
<ipython-input-83-b376654c990a> in <module>()
     14     j+=1
     15     print('{} rows'.format(j*chunksize))
---> 16     df.to_sql('data', disk_engine, if_exists='append')
     17     index_start = df.index[-1] + 1

(intermediate pandas and SQLAlchemy frames omitted; the chain ends in the same call)

~\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
    506     def do_execute(self, cursor, statement, parameters, context=None):
--> 507         cursor.execute(statement, parameters)
    508

    509     def do_execute_no_params(self, cursor, statement, context=None):
I have already tried various chunksize values, but it didn't work. Can anyone please suggest how to fix this error? I am running this code in a Jupyter notebook. I already have updated versions of pandas and the other libraries, so there is no compatibility issue.
Upvotes: 1
Views: 3583
Reputation: 11
Based on a few tests, I'm not sure that @Scratch'N'Purr's answer is completely correct.
For instance, define a SQLite table with, say, 58 columns. Then try to insert a pandas DataFrame with only, say, 830 rows and 58 columns, using a statement like:
df.to_sql("tableName", conn, if_exists="append", index=False, method="multi")
You will get the operational error: sqlite3.OperationalError: too many SQL variables
The traceback also points at:
line 2143, in _execute_insert_multi
    conn.execute(self.insert_statement(num_rows=len(data_list)), flattened_data)
I'm not an expert here, but maybe (could someone enlighten me on this?) all the placeholders for every row that must be inserted end up "flattened" into the same query? As a result, we end up with more placeholders than SQLite can accept.
In my case, by modifying the call and setting a chunksize limit, I could get rid of the error:
df.to_sql("tableName", conn, if_exists="append", chunksize=100, index=False, method="multi")
From that, I must infer that chunksize does impact the number of placeholders used in the query. So yes, it seems that chunksize could, under some circumstances, resolve such a problem.
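For what it's worth, here is a minimal sketch of the arithmetic, assuming SQLite's classic default limit of 999 variables per statement (builds before 3.32.0; from 3.32.0 on the default is 32,766). The safe_chunksize helper below is hypothetical, not part of pandas:

SQLITE_MAX_VARS = 999  # default before SQLite 3.32.0; 32766 from 3.32.0 on

def safe_chunksize(n_columns, max_vars=SQLITE_MAX_VARS):
    # With method="multi", each row in a chunk contributes one "?" per
    # column, so a chunk of k rows binds k * n_columns variables.
    return max_vars // n_columns

print(safe_chunksize(58))  # 17, e.g. df.to_sql(..., method="multi", chunksize=17)

On a 3.32.0+ build this math would also explain the observations above: 830 rows x 58 columns = 48,140 variables exceeds even the 32,766 ceiling, while chunksize=100 binds only 100 x 58 = 5,800 per INSERT, which fits.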
Upvotes: 0
Reputation: 10429
This error is related to the number of parameters being passed to sqlite3. In essence, what's happening behind the scenes is that a SQL query is being issued to the db engine: INSERT INTO myTable (col1, col2, col3, ..., col_n) VALUES (?, ?, ?, ..., ?), where the ? are the values from your dataframe being passed to the database.
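You can reproduce the limit in isolation with a small self-contained sketch (note the 999-variable ceiling applies to SQLite builds before 3.32.0, so on a newer build you would need a larger n to trigger it):

import sqlite3

conn = sqlite3.connect(':memory:')
n = 1000  # one more than the pre-3.32.0 default limit of 999
placeholders = ', '.join('?' * n)
try:
    conn.execute('SELECT ' + placeholders, list(range(n)))  # binds n variables
    print('no error: this SQLite build allows more variables (3.32.0+)')
except sqlite3.OperationalError as e:
    print(e)  # "too many SQL variables"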
This error occurred because your dataframe is very wide (it has lots of columns), so during insertion many parameters are passed into the SQL statement. You can actually see the line in the error stack that supports this explanation:
--> 507 cursor.execute(statement, parameters)
SQLite can only handle a limited number of parameters per statement; this is simply a limitation of SQLite. You can scroll down to #9 on this page for more info.
Setting chunksize will not resolve your problem. My suggestion is to use another db like Postgres or MySQL.
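If you go that route, switching is mostly a one-line change in your script. A minimal sketch, with a hypothetical connection string (user, password, host, and database name are placeholders, and a driver such as psycopg2 must be installed):

import pandas as pd
from sqlalchemy import create_engine

# PostgreSQL allows up to 65,535 bind parameters per statement,
# far above SQLite's default, so wide frames insert without issue.
disk_engine = create_engine('postgresql://user:password@localhost:5432/mydb')

for df in pd.read_csv('final_features.csv', chunksize=15000):
    df.to_sql('data', disk_engine, if_exists='append')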
Upvotes: 1