Reputation: 7251
I have a 97-2003 .xls
excel file that I need to converted to a .xlsx
. The operational sense is that I have a plethora of these I need to convert.
My issue is that each .xls
file has various sheets; both the number and sheet names change.
I am trying to convert the .xls
to .xlsx
extension using Python. This question and this question and this pandas
page have led me to believe using Openpyxl
is a good solution for this. My current code looks like:
import pandas as pd, xlrd, openpyxl
filename = 'STAGING FILE1.XLS'
xlsx_filename = "temp_output.xlsx"
xls_file = pd.ExcelFile(filename)
with pd.ExcelWriter(xlsx_filename) as writer:
for sheet in xls_file.sheet_names:
df = pd.read_excel(pd.read_excel(filename, sheet_name=sheet))
df.to_excel(writer, sheet_name=sheet)
Please note, none of these libraries are hard requirements; just what I think is my best chance at solving this problem.
However, I am getting IndexError: At least one sheet must be visible
, with a full traceback below. How can I properly accomplish switching the extension from .XLS
to .xlsx
?
Full traceback:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [13], in <module>
----> 1 dma_hh = wb[wb.worksheets[0]]
File filepath\Excel Formatting\_venv\lib\site-packages\openpyxl\workbook\workbook.py:288, in Workbook.__getitem__(self, key)
286 if sheet.title == key:
287 return sheet
--> 288 raise KeyError("Worksheet {0} does not exist.".format(key))
KeyError: 'Worksheet <Worksheet "Sheet1"> does not exist.'
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [5], in <module>
4 for sheet in xls_file.sheet_names:
----> 5 df = pd.read_excel(pd.read_excel(filename, sheet_name=sheet))
6 df.to_excel(writer, sheet_name=sheet)
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\io\excel\_base.py:364, in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
363 should_close = True
--> 364 io = ExcelFile(io, storage_options=storage_options, engine=engine)
365 elif engine and engine != io.engine:
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\io\excel\_base.py:1191, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options)
1190 else:
-> 1191 ext = inspect_excel_format(
1192 content_or_path=path_or_buffer, storage_options=storage_options
1193 )
1194 if ext is None:
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\io\excel\_base.py:1070, in inspect_excel_format(content_or_path, storage_options)
1068 content_or_path = BytesIO(content_or_path)
-> 1070 with get_handle(
1071 content_or_path, "rb", storage_options=storage_options, is_text=False
1072 ) as handle:
1073 stream = handle.handle
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\io\common.py:609, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
608 # open URLs
--> 609 ioargs = _get_filepath_or_buffer(
610 path_or_buf,
611 encoding=encoding,
612 compression=compression,
613 mode=mode,
614 storage_options=storage_options,
615 )
617 handle = ioargs.filepath_or_buffer
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\io\common.py:396, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
395 msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
--> 396 raise ValueError(msg)
398 return IOArgs(
399 filepath_or_buffer=filepath_or_buffer,
400 encoding=encoding,
(...)
403 mode=mode,
404 )
ValueError: Invalid file path or buffer object type: <class 'pandas.core.frame.DataFrame'>
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
Input In [5], in <module>
4 for sheet in xls_file.sheet_names:
5 df = pd.read_excel(pd.read_excel(filename, sheet_name=sheet))
----> 6 df.to_excel(writer, sheet_name=sheet)
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\io\excel\_base.py:1020, in ExcelWriter.__exit__(self, exc_type, exc_value, traceback)
1019 def __exit__(self, exc_type, exc_value, traceback):
-> 1020 self.close()
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\io\excel\_base.py:1024, in ExcelWriter.close(self)
1022 def close(self):
1023 """synonym for save, to make it more file-like"""
-> 1024 content = self.save()
1025 self.handles.close()
1026 return content
File filepath\Excel Formatting\_venv\lib\site-packages\pandas\io\excel\_openpyxl.py:80, in OpenpyxlWriter.save(self)
76 def save(self):
77 """
78 Save workbook to disk.
79 """
---> 80 self.book.save(self.handles.handle)
81 if "r+" in self.mode and not isinstance(self.handles.handle, mmap.mmap):
82 # truncate file to the written content
83 self.handles.handle.truncate()
File filepath\Excel Formatting\_venv\lib\site-packages\openpyxl\workbook\workbook.py:407, in Workbook.save(self, filename)
405 if self.write_only and not self.worksheets:
406 self.create_sheet()
--> 407 save_workbook(self, filename)
File filepath\Excel Formatting\_venv\lib\site-packages\openpyxl\writer\excel.py:293, in save_workbook(workbook, filename)
291 archive = ZipFile(filename, 'w', ZIP_DEFLATED, allowZip64=True)
292 writer = ExcelWriter(workbook, archive)
--> 293 writer.save()
294 return True
File filepath\Excel Formatting\_venv\lib\site-packages\openpyxl\writer\excel.py:275, in ExcelWriter.save(self)
273 def save(self):
274 """Write data into the archive."""
--> 275 self.write_data()
276 self._archive.close()
File filepath\Excel Formatting\_venv\lib\site-packages\openpyxl\writer\excel.py:89, in ExcelWriter.write_data(self)
87 writer = WorkbookWriter(self.workbook)
88 archive.writestr(ARC_ROOT_RELS, writer.write_root_rels())
---> 89 archive.writestr(ARC_WORKBOOK, writer.write())
90 archive.writestr(ARC_WORKBOOK_RELS, writer.write_rels())
92 self._merge_vba()
File filepath\Excel Formatting\_venv\lib\site-packages\openpyxl\workbook\_writer.py:148, in WorkbookWriter.write(self)
146 self.write_names()
147 self.write_pivots()
--> 148 self.write_views()
149 self.write_refs()
151 return tostring(self.package.to_tree())
File filepath\Excel Formatting\_venv\lib\site-packages\openpyxl\workbook\_writer.py:135, in WorkbookWriter.write_views(self)
134 def write_views(self):
--> 135 active = get_active_sheet(self.wb)
136 if self.wb.views:
137 self.wb.views[0].activeTab = active
File filepath\Excel Formatting\_venv\lib\site-packages\openpyxl\workbook\_writer.py:33, in get_active_sheet(wb)
31 visible_sheets = [idx for idx, sheet in enumerate(wb._sheets) if sheet.sheet_state == "visible"]
32 if not visible_sheets:
---> 33 raise IndexError("At least one sheet must be visible")
35 idx = wb._active_sheet_index
36 sheet = wb.active
IndexError: At least one sheet must be visible
Upvotes: 0
Views: 1223
Reputation: 7251
The following solution works and was inspired by this post and this post:
import pandas as pd, gc as gc
gc.enable()
filename = 'STAGING FILE1.XLS'
xlsx_filename = "temp_output.xlsx"
xls_file = pd.ExcelFile(filename)
# Create ExcelFile object and retrieve sheet names
xls_file = pd.ExcelFile(filename)
sheet_names = xls_file.sheet_names
# Create dict
res = {}
# Build dict of sheetname: dataframe of each sheet
for sheet in sheet_names:
res[sheet] = pd.read_excel(filename, sheet_name=sheet, header=None)
# Create ExcelWriter object
writer = pd.ExcelWriter(xlsx_filename, engine='xlsxwriter')
# Loop through dict, and have the writer write them to a single file
for sheet, frame in res.items():
frame.to_excel(writer, sheet_name=sheet, header=None, index=None)
# Save off
writer.save()
# Delete objects and free up memory
del xls_file, sheet_names, res, writer
gc.collect()
Upvotes: 1