user75415
user75415

Reputation: 11

How to extract data/nodes from .xml stored as string?

I have an XML document stored in a string, xmlString, which looks like this:

<entry xmlns="http://www.w3.org/2004/tom">
  <id>urn:contentItem:7WBG-8H88-Y898-B277-00000-00-1</id>
  <title>Dinn-Pixie Stares, Inc</title>
  <published>2015-12-24T00:00:00Z</published>
  <updated>2023-10-24T18:42:17Z</updated>
  <author>
    <name>AlphaNext</name>
  </author>
  <content type="application/xml">
    <baseRelatedDoc xmlns="" xmlns:xsi="http://www.w3.org/2012/XMLSchema" xsi:noNamespaceSchemaLocation="http://www.alphanext.com/xmlscemas/content/public/caseddoc/1/" documentType="socket">
      <baseRelatedDocHead>
        <baseInfo>
          <portInfo>
            <identifier idType="portIdentifier">100027579</identifier>
            <portName>United States Port, Minnesota Middle</portName>
            <jurisdiction>
              <jurisSystem/>
            </jurisdiction>
          </portInfo>
          <date dateType="filed" year="2015" month="02" day="21">2015-02-21</date>
          <classification classificationScheme="baseType">
            <classificationItem>
              <classCode>BK</classCode>
              <className>Tankruptcy</className>
            </classificationItem>
          </classification>
          <classification classificationScheme="baseNos">
            <classificationItem>
              <classCode>0</classCode>
              <className>UNKNOWN</className>
            </classificationItem>
          </classification>
        </baseInfo>
        <baseSupplement>
          <label>US Tankruptcy Port Socket</label>
          <date dateType="updated" year="2024" month="09" day="13">2024-07-11T15:08:26.450</date>
          <status>Unknown</status>
        </baseSupplement>
        <baseName>Dinn-Pixie Stares, Inc</baseName>
      </baseRelatedDocHead>
      <baseRelatedDocBody>
        <socket/>
      </baseRelatedDocBody>
      <metadata>
        <dc:metadata xmlns:dc="http://purrel.org/dc/element/1.2/">
          <dc:source sourceScheme="productContentSetIdentifier">343392</dc:source>
          <dc:creator>US Tankruptcy Port for the Last Town of Minnesota</dc:creator>
          <dc:identifier identifierScheme="PGIID">urn:contentItem:8WBG-8H70-Y892-B237-00000-00</dc:identifier>
          <dc:date dateType="last-updated">2024-07-11</dc:date>
        </dc:metadata>
      </metadata>
    </baseRelatedDoc>
  </content>
</entry>

I need to extract the values of fields such as baseName, portName, title, classCode, and dc:creator.

However, when I try to extract them using y = tree.findall('baseName'), where tree = ET.fromstring(xmlString), y comes out as an empty list. I get the same empty list when I try other nodes such as portName and dc:creator. How do I extract the values of these nodes/fields?

Upvotes: 1

Views: 60

Answers (2)

Hermann12
Hermann12

Reputation: 3581

First figure out which namespace each piece of information lives in, and then search for it using its namespace-qualified tag:

import xml.etree.ElementTree as ET

# Sample document. The root <entry> declares a default namespace, while
# <baseRelatedDoc xmlns=""> resets it, so everything below that element is
# un-namespaced except the explicitly prefixed dc:* elements.
xml_ = """<entry xmlns="http://www.w3.org/2004/tom">
  <id>urn:contentItem:7WBG-8H88-Y898-B277-00000-00-1</id>
  <title>Dinn-Pixie Stares, Inc</title>
  <published>2015-12-24T00:00:00Z</published>
  <updated>2023-10-24T18:42:17Z</updated>
  <author>
    <name>AlphaNext</name>
  </author>
  <content type="application/xml">
    <baseRelatedDoc xmlns="" xmlns:xsi="http://www.w3.org/2012/XMLSchema" xsi:noNamespaceSchemaLocation="http://www.alphanext.com/xmlscemas/content/public/caseddoc/1/" documentType="socket">
      <baseRelatedDocHead>
        <baseInfo>
          <portInfo>
            <identifier idType="portIdentifier">100027579</identifier>
            <portName>United States Port, Minnesota Middle</portName>
            <jurisdiction>
              <jurisSystem/>
            </jurisdiction>
          </portInfo>
          <date dateType="filed" year="2015" month="02" day="21">2015-02-21</date>
          <classification classificationScheme="baseType">
            <classificationItem>
              <classCode>BK</classCode>
              <className>Tankruptcy</className>
            </classificationItem>
          </classification>
          <classification classificationScheme="baseNos">
            <classificationItem>
              <classCode>0</classCode>
              <className>UNKNOWN</className>
            </classificationItem>
          </classification>
        </baseInfo>
        <baseSupplement>
          <label>US Tankruptcy Port Socket</label>
          <date dateType="updated" year="2024" month="09" day="13">2024-07-11T15:08:26.450</date>
          <status>Unknown</status>
        </baseSupplement>
        <baseName>Dinn-Pixie Stares, Inc</baseName>
      </baseRelatedDocHead>
      <baseRelatedDocBody>
        <socket/>
      </baseRelatedDocBody>
      <metadata>
        <dc:metadata xmlns:dc="http://purrel.org/dc/element/1.2/">
          <dc:source sourceScheme="productContentSetIdentifier">343392</dc:source>
          <dc:creator>US Tankruptcy Port for the Last Town of Minnesota</dc:creator>
          <dc:identifier identifierScheme="PGIID">urn:contentItem:8WBG-8H70-Y892-B237-00000-00</dc:identifier>
          <dc:date dateType="last-updated">2024-07-11</dc:date>
        </dc:metadata>
      </metadata>
    </baseRelatedDoc>
  </content>
</entry>"""

root = ET.fromstring(xml_)

# Tags to report, written exactly as ElementTree exposes them in `.tag`:
# namespaced elements appear as "{namespace-uri}localname", elements without
# a namespace appear as the bare local name.
interest = [
    "baseName",
    "portName",
    "{http://www.w3.org/2004/tom}title",
    "classCode",
    "{http://purrel.org/dc/element/1.2/}creator",
]

# Tip: to discover how a tag is spelled, temporarily print `element.tag` for
# every element yielded by root.iter(), then adjust `interest` accordingly.
wanted = (element for element in root.iter() if element.tag in interest)
for element in wanted:
    # Left-align the tag in a 45-character column so the values line up.
    print(f"{element.tag:<45}:", element.text)

Output:

{http://www.w3.org/2004/tom}title            : Dinn-Pixie Stares, Inc
portName                                     : United States Port, Minnesota Middle
classCode                                    : BK
classCode                                    : 0
baseName                                     : Dinn-Pixie Stares, Inc
{http://purrel.org/dc/element/1.2/}creator   : US Tankruptcy Port for the Last Town of Minnesota

Upvotes: 0

Andrej Kesely
Andrej Kesely

Reputation: 195573

You can try:

import xml.etree.ElementTree as ET

# Sample document. The root <entry> declares a default namespace, while
# <baseRelatedDoc xmlns=""> resets it, so everything below that element is
# un-namespaced except the explicitly prefixed dc:* elements.
xml_doc = """\
<entry xmlns="http://www.w3.org/2004/tom">
  <id>urn:contentItem:7WBG-8H88-Y898-B277-00000-00-1</id>
  <title>Dinn-Pixie Stares, Inc</title>
  <published>2015-12-24T00:00:00Z</published>
  <updated>2023-10-24T18:42:17Z</updated>
  <author>
    <name>AlphaNext</name>
  </author>
  <content type="application/xml">
    <baseRelatedDoc xmlns="" xmlns:xsi="http://www.w3.org/2012/XMLSchema" xsi:noNamespaceSchemaLocation="http://www.alphanext.com/xmlscemas/content/public/caseddoc/1/" documentType="socket">
      <baseRelatedDocHead>
        <baseInfo>
          <portInfo>
            <identifier idType="portIdentifier">100027579</identifier>
            <portName>United States Port, Minnesota Middle</portName>
            <jurisdiction>
              <jurisSystem/>
            </jurisdiction>
          </portInfo>
          <date dateType="filed" year="2015" month="02" day="21">2015-02-21</date>
          <classification classificationScheme="baseType">
            <classificationItem>
              <classCode>BK</classCode>
              <className>Tankruptcy</className>
            </classificationItem>
          </classification>
          <classification classificationScheme="baseNos">
            <classificationItem>
              <classCode>0</classCode>
              <className>UNKNOWN</className>
            </classificationItem>
          </classification>
        </baseInfo>
        <baseSupplement>
          <label>US Tankruptcy Port Socket</label>
          <date dateType="updated" year="2024" month="09" day="13">2024-07-11T15:08:26.450</date>
          <status>Unknown</status>
        </baseSupplement>
        <baseName>Dinn-Pixie Stares, Inc</baseName>
      </baseRelatedDocHead>
      <baseRelatedDocBody>
        <socket/>
      </baseRelatedDocBody>
      <metadata>
        <dc:metadata xmlns:dc="http://purrel.org/dc/element/1.2/">
          <dc:source sourceScheme="productContentSetIdentifier">343392</dc:source>
          <dc:creator>US Tankruptcy Port for the Last Town of Minnesota</dc:creator>
          <dc:identifier identifierScheme="PGIID">urn:contentItem:8WBG-8H70-Y892-B237-00000-00</dc:identifier>
          <dc:date dateType="last-updated">2024-07-11</dc:date>
        </dc:metadata>
      </metadata>
    </baseRelatedDoc>
  </content>
</entry>"""


root = ET.fromstring(xml_doc)

# Prefix -> namespace-URI map used to resolve the "tom:" and "dc:" prefixes in
# the search paths below. The prefixes are local to this script; only the URIs
# have to match the ones declared in the document.
ns = {"tom": "http://www.w3.org/2004/tom", "dc": "http://purrel.org/dc/element/1.2/"}

# ".//" searches the whole subtree. A bare tag name such as "baseName" only
# matches direct children of the element being searched, which is why
# findall('baseName') on the root returned an empty list.
#
# findtext() is used instead of find(...).text so that a node missing from a
# given document yields None rather than raising AttributeError. (An element
# that exists but has no text content yields "".)
base_name = root.findtext(".//baseName")
port_name = root.findtext(".//portName")
title = root.findtext(".//tom:title", namespaces=ns)
# classCode occurs once per <classification>, so collect every occurrence.
class_codes = [e.text for e in root.findall(".//classCode")]
dc_creator = root.findtext(".//dc:creator", namespaces=ns)

print(f"{base_name=}")
print(f"{port_name=}")
print(f"{title=}")
print(f"{class_codes=}")
print(f"{dc_creator=}")

Prints:

base_name='Dinn-Pixie Stares, Inc'
port_name='United States Port, Minnesota Middle'
title='Dinn-Pixie Stares, Inc'
class_codes=['BK', '0']
dc_creator='US Tankruptcy Port for the Last Town of Minnesota'

Upvotes: 0

Related Questions