istwine
istwine

Reputation: 31

How to read/parse an .xls file in Python (XML schema)

how to read this structure in python?

<?xml version="1.0" encoding="ISO-8859-1"?>

-<Workbook xmlns:html="http://www.w3.org/TR/REC-html40" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:msxsl="urn:schemas-microsoft-com:xslt" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:schemas-microsoft-com:office:spreadsheet">


-<Styles>


-<Style ss:ID="VIEW">

<Font ss:Bold="1"/>

</Style>


-<Style ss:ID="HEADER">

<Font ss:Bold="1" ss:Color="#FFFFFF"/>

<Interior ss:Color="#666699" ss:Pattern="Solid"/>

</Style>


-<Style ss:ID="DOUBLE_0">

<NumberFormat ss:Format="0"/>

</Style>


-<Style ss:ID="DOUBLE_2">

<NumberFormat ss:Format="0.00"/>

</Style>


-<Style ss:ID="DOUBLE_3">

<NumberFormat ss:Format="0.000"/>

</Style>


-<Style ss:ID="DOUBLE_4">

<NumberFormat ss:Format="0.0000"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_2">

<NumberFormat ss:Format="0.00%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_3">

<NumberFormat ss:Format="0.000%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_4">

<NumberFormat ss:Format="0.0000%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_5">

<NumberFormat ss:Format="0.00000%"/>

</Style>


-<Style ss:ID="DATE">

<NumberFormat ss:Format="yyyy\-mm\-dd;@"/>

</Style>

<ss:Style ss:ID="STRING"/>

</Styles>


-<Worksheet ss:Name="MSFRE">


-<Table x:FullRows="1" x:FullColumns="1">


-<Row>


-<Cell ss:StyleID="VIEW" ss:Index="1">

<Data ss:Type="String">Geo</Data>

</Cell>

</Row>


-<Row>

<Cell ss:StyleID="HEADER"/>

<Cell ss:StyleID="HEADER"/>


-<Cell ss:StyleID="HEADER" ss:Index="3" ss:MergeAcross="5">

<Data ss:Type="String"/>

</Cell>

</Row>


-<Row>

<Cell ss:StyleID="HEADER"/>

<Cell ss:StyleID="HEADER"/>


-<Cell ss:StyleID="HEADER" ss:Index="3">

<Data ss:Type="String">Holding Date</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="4">

<Data ss:Type="String">Fund code</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="5">

<Data ss:Type="String">Fund name</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="6">

<Data ss:Type="String">PTF</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="7">

<Data ss:Type="String">REF</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="8">

<Data ss:Type="String">PTF-REF</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="1">

<Data ss:Type="String">Geo</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.18545250736645816</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.18545250736645816</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Asia ex-Japan</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">9.356235001537855E-4</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">9.356235001537855E-4</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">EMU</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.10654090959320628</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.10654090959320628</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Emerging Countries</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.00294017805163712</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.00294017805163712</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Europe ex-EMU</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.02354783768818136</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.02354783768818136</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Japan</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.005898729959204227</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.005898729959204227</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">North America</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.044037874185699856</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.044037874185699856</Data>

</Cell>

</Row>


</Table>

</Worksheet>

</Workbook>

if I use this code the cell index collapse and can't recreate the table:

import pandas as pd
import xml.etree.cElementTree as ET

#dict_bd_area = 
ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet"}

tree = ET.parse("filepath")
root = tree.getroot()

def getvalueofnode(node):
    """ return node text or None """
    return node.text if node is not None else None


def main():
    """ main """
    parsed_xml = tree

    data = []
    for i, node in enumerate(root.findall('.//doc:Row', ns)):
        if i > 3:
            data.append({'Geo': getvalueofnode(node.find('doc:Cell[1]/doc:Data', ns)),
                         'Holding - Date': getvalueofnode(node.find('doc:Cell[2]/doc:Data', ns)),
                         'Fund Code': getvalueofnode(node.find('doc:Cell[3]/doc:Data', ns)),
                         'Name': getvalueofnode(node.find('doc:Cell[4]/doc:Data', ns)), 
                         'PTF': getvalueofnode(node.find('doc:Cell[5]/doc:Data', ns)),
                         'REF': getvalueofnode(node.find('doc:Cell[6]/doc:Data', ns)),
                         'PTF-REF':getvalueofnode(node.find('doc:Cell[7]/doc:Data', ns)),
                         'SPS PTF': getvalueofnode(node.find('doc:Cell[8]/doc:Data', ns)),
                         'SPS REF': getvalueofnode(node.find('doc:Cell[9]/doc:Data', ns))
                        })

return(pd.DataFrame(data))

How to handle the cell index if is empty or null as in this case? (in this case cell index 7 doesn't always exists). The Win32 library cannot help me in this case, I tried but it didn't have performance required to handle thousands of file

UPDATE: 2

What if I want to parse the parentcolDimension? This is the beginning of the XML file:

<loadingDate>2021-08-10T00:00:00+02:00</loadingDate>
<exportDate>2021-08-11T18:43:22.513+02:00</exportDate>
<exportParameters key="occurrenceKey" value="Family_occurrence"/>
<exportParameters key="occurrenceName" value="Family occurrence"/>
<exportParameters key="ConfigurationPath" value="/usr/users/reporting/configurations"/>
<exportParameters key="scenarioKey" value="mult"/>
<exportParameters key="perspective" value="Mulnt"/>
<dataViews viewHashcode="-75742857" showRootLine="true">
<viewKey>Flat</viewKey>
<viewName>Flat</viewName>
<viewType>tableSectionViewType</viewType>
<sectionInfos>
<sectionInfo sectionKey="MultiView" showSectionLine="false" startLine="0"/>
</sectionInfos>
<viewId>3</viewId>
<headers>
<header id="1" key="GF 350006|" desc="<html><font color=#00A0E3><b>Weight% (PTF) groupName="GF 350006" Key="WEIGHT#PTF" dataType="Number" dataFormat="PERCENT_FIXED_2">
<parentColDimension label="GF 350006"/>
</header>
<header id="2" key="GF TOP PRIVATE 350007 groupName="GF 350007"Key="WEIGHT#PTF" dataType="Number" dataFormat="PERCENT_FIXED_2">
<parentColDimension label="GF 350007"/>

Update 3: How could I retrieve how many times this cell <Cell ss:StyleID="s63"/> (the ones with no data) occurs, for example, in the second row?

<?xml version="1.0"?>
<?mso-application progid="Excel.Sheet"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
 xmlns:o="urn:schemas-microsoft-com:office:office"
 xmlns:x="urn:schemas-microsoft-com:office:excel"
 xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
 xmlns:html="http://www.w3.org/TR/REC-html40"
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xmlns:msxsl="urn:schemas-microsoft-com:xslt">
 <DocumentProperties xmlns="urn:schemas-microsoft-com:office:office">
  <Version>16.00</Version>
 </DocumentProperties>
 <OfficeDocumentSettings xmlns="urn:schemas-microsoft-com:office:office">
  <AllowPNG/>
 </OfficeDocumentSettings>
 <ExcelWorkbook xmlns="urn:schemas-microsoft-com:office:excel">
  <WindowHeight>6760</WindowHeight>
  <WindowWidth>15530</WindowWidth>
  <WindowTopX>0</WindowTopX>
  <WindowTopY>0</WindowTopY>
  <ProtectStructure>False</ProtectStructure>
  <ProtectWindows>False</ProtectWindows>
 </ExcelWorkbook>
 <Styles>
  <Style ss:ID="Default" ss:Name="Normal">
   <Alignment ss:Vertical="Bottom"/>
   <Borders/>
   <Font ss:FontName="Calibri" x:Family="Swiss" ss:Size="11" ss:Color="#000000"/>
   <Interior/>
   <NumberFormat/>
   <Protection/>
  </Style>
  <Style ss:ID="s62">
   <Alignment ss:Vertical="Bottom"/>
   <Borders/>
   <Font ss:FontName="Arial" ss:Bold="1"/>
   <Interior/>
   <NumberFormat/>
   <Protection/>
  </Style>
  <Style ss:ID="s63">
   <Alignment ss:Vertical="Bottom"/>
   <Borders/>
   <Font ss:FontName="Arial" ss:Color="#FFFFFF" ss:Bold="1"/>
   <Interior ss:Color="#666699" ss:Pattern="Solid"/>
   <NumberFormat/>
   <Protection/>
  </Style>
  <Style ss:ID="s65">
   <Alignment ss:Vertical="Bottom"/>
   <Borders/>
   <Font ss:FontName="Calibri" x:Family="Swiss" ss:Size="11" ss:Color="#000000"/>
   <Interior/>
   <NumberFormat ss:Format="Percent"/>
   <Protection/>
  </Style>
 </Styles>
 <Worksheet ss:Name="- Family split occurrence-3- ">
  <Table ss:ExpandedColumnCount="262" ss:ExpandedRowCount="2460" x:FullColumns="1"
   x:FullRows="1" ss:DefaultRowHeight="14.5">
   <Row>
    <Cell ss:StyleID="s62"><Data ss:Type="String">Transparency Equity</Data></Cell>
   </Row>
   <Row>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:MergeAcross="4" ss:StyleID="s63"><Data ss:Type="String">Date 0</Data></Cell>
    <Cell ss:MergeAcross="4" ss:StyleID="s63"><Data ss:Type="String">Date 1</Data></Cell>

Upvotes: 1

Views: 879

Answers (1)

Alexandra Dudkina
Alexandra Dudkina

Reputation: 4482

Try to use "Index" attribute instead of cell element index:

# add "ss" namespace declaration to the namespaces map
ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet", "ss": "urn:schemas-microsoft-com:office:spreadsheet"}

# in function call reference element "Cell" having an attribute "Index" with value "7"
getvalueofnode(node.find('doc:Cell[@ss:Index="7"]/doc:Data', ns))

Same approach could be used for other cells as well.

This code will try to find cell element with the given index attribute. If not found function getvalueofnode() will return None.

To get parentColDimension following code could be used:

for parentColDimension in root.findall('.//doc:header/doc:parentColDimension', ns):
  print(parentColDimension.get('label'))

Upvotes: 1

Related Questions