AmirReza
AmirReza

Reputation: 365

CompDocError when importing .xls file format to python using pandas read_excel()

I'm trying to import the IMF's GDP data. It is an excel file with .xls extension format. when I try to import it using the code below:

imf_data = pd.read_excel('C:\\Users\\amir\\Downloads\\imf_data.xls', sheet_name=None)

I get the following error:

_locate_stream(Workbook): seen
    0  5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 
   20  4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 
  340= 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 
  360  4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 2 
  380  2 2 
---------------------------------------------------------------------------
CompDocError                              Traceback (most recent call last)
<ipython-input-6-784f71031ca1> in <module>
----> 1 imf_data = pd.read_excel('C:\\Users\\amir\\Downloads\\imf_data.xls', sheet_name=None)

~\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    186                 else:
    187                     kwargs[new_arg_name] = new_arg_value
--> 188             return func(*args, **kwargs)
    189         return wrapper
    190     return _deprecate_kwarg

~\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    186                 else:
    187                     kwargs[new_arg_name] = new_arg_value
--> 188             return func(*args, **kwargs)
    189         return wrapper
    190     return _deprecate_kwarg

~\Anaconda3\lib\site-packages\pandas\io\excel.py in read_excel(io, sheet_name, header, names, index_col, parse_cols, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skip_footer, skipfooter, convert_float, mangle_dupe_cols, **kwds)
    348 
    349     if not isinstance(io, ExcelFile):
--> 350         io = ExcelFile(io, engine=engine)
    351 
    352     return io.parse(

~\Anaconda3\lib\site-packages\pandas\io\excel.py in __init__(self, io, engine)
    651         self._io = _stringify_path(io)
    652 
--> 653         self._reader = self._engines[engine](self._io)
    654 
    655     def __fspath__(self):

~\Anaconda3\lib\site-packages\pandas\io\excel.py in __init__(self, filepath_or_buffer)
    422             self.book = xlrd.open_workbook(file_contents=data)
    423         elif isinstance(filepath_or_buffer, compat.string_types):
--> 424             self.book = xlrd.open_workbook(filepath_or_buffer)
    425         else:
    426             raise ValueError('Must explicitly set engine if not passing in'

~\Anaconda3\lib\site-packages\xlrd\__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
    155         formatting_info=formatting_info,
    156         on_demand=on_demand,
--> 157         ragged_rows=ragged_rows,
    158     )
    159     return bk

~\Anaconda3\lib\site-packages\xlrd\book.py in open_workbook_xls(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
     86             formatting_info=formatting_info,
     87             on_demand=on_demand,
---> 88             ragged_rows=ragged_rows,
     89         )
     90         t1 = perf_counter()

~\Anaconda3\lib\site-packages\xlrd\book.py in biff2_8_load(self, filename, file_contents, logfile, verbosity, use_mmap, encoding_override, formatting_info, on_demand, ragged_rows)
    634                 for qname in ['Workbook', 'Book']:
    635                     self.mem, self.base, self.stream_len = \
--> 636                                 cd.locate_named_stream(UNICODE_LITERAL(qname))
    637                     if self.mem: break
    638                 else:

~\Anaconda3\lib\site-packages\xlrd\compdoc.py in locate_named_stream(self, qname)
    397             result = self._locate_stream(
    398                 self.mem, 512, self.SAT, self.sec_size, d.first_SID,
--> 399                 d.tot_size, qname, d.DID+6)
    400             if self.DEBUG:
    401                 print("\nseen", file=self.logfile)

~\Anaconda3\lib\site-packages\xlrd\compdoc.py in _locate_stream(self, mem, base, sat, sec_size, start_sid, expected_stream_size, qname, seen_id)
    425             if self.seen[s]:
    426                 print("_locate_stream(%s): seen" % qname, file=self.logfile); dump_list(self.seen, 20, self.logfile)
--> 427                 raise CompDocError("%s corruption: seen[%d] == %d" % (qname, s, self.seen[s]))
    428             self.seen[s] = seen_id
    429             tot_found += 1

CompDocError: Workbook corruption: seen[2] == 4

why such a thing is happening and how it can be resolved?

Upvotes: 0

Views: 3600

Answers (1)

Julio Batista Silva
Julio Batista Silva

Reputation: 2089

In my case it was a Compound Document format with a .xls extension.

We can open those files in Python using the OleFileIO_PL package. My real example:

import pandas as pd
import requests
import OleFileIO_PL

url = "https://www.cepea.esalq.usp.br/br/indicador/series/milho.aspx?id=77"
resp = requests.get(url)
ole = OleFileIO_PL.OleFileIO(resp.content)
df = pd.read_excel(ole.openstream('Workbook'), skiprows=3)

Upvotes: 3

Related Questions