an_ja
an_ja

Reputation: 427

Convert xml without namespaces

Thanks to Dave2e's answer (Convert XML to dataframe), I managed to convert most of my XML files to a data frame. However, I have several files that don't use namespaces. The script doesn't show any error messages, but it doesn't finish either (I cancelled the process after 48 hours). The files are about 58 MB each.

How do I need to change the code so that it runs to completion?

  # Flatten every <reportedPerson> element of the document into a one-row data
  # frame (one column per leaf element) and stack the rows into `d.test`.
  doc <- read_xml('/path/to/Quelle/Response.xml')
  
  # Identify the namespaces. xml2 gives generated prefixes (d1, d2, ...) to
  # default namespaces, i.e. those declared with a bare xmlns="..." attribute.
  # NOTE(review): d1 is assumed to be the namespace of the root element.
  ns <- xml_ns(doc)
  
  reportedPersons <- xml_find_all(doc, ".//d1:reportedPerson", ns)
  
  # Name of the ancestor `levels` steps above each node (1 = direct parent).
  # vapply() guarantees a character vector even when `nodes` is empty.
  ancestor_name <- function(nodes, levels) {
    vapply(nodes, function(node) {
      for (i in seq_len(levels)) {
        node <- xml_parent(node)
      }
      xml_name(node)
    }, character(1))
  }
  
  dfs <- lapply(reportedPersons, function(reportedPerson) {
    
    # Keep only the lowest level: descendants that have no element children.
    # A single XPath query replaces scanning all descendants twice and then
    # filtering them by child count.
    leafs <- xml_find_all(reportedPerson, ".//*[not(*)]")
    
    # Get values and names of the leafs.
    values <- xml_text(leafs)
    names <- xml_name(leafs) # xml_name(ns = ns)
    
    # Get names of the parents - still results in a few duplicates.
    parentNames_top <- ancestor_name(leafs, 4L)
    parentNames <- ancestor_name(leafs, 1L)
    
    # Put into a data frame: one row, one column per leaf.
    df <- as.data.frame(t(values), cut.names = FALSE)
    names(df) <- paste(parentNames_top, parentNames, names, sep = "_")
    # There are still a few duplicates; full path names would avoid them:
    # names(df) <- xml_path(leafs)
    
    df
  })
  
  d.test <- bind_rows(dfs)

Excerpt from a file:

<?xml version="1.0" encoding="utf-8"?>
<delivery xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2.1" xmlns="http://www.ech.ch/xmlns/eCH-0099/2">
  <deliveryHeader>
    <senderId xmlns="http://www.ech.ch/xmlns/eCH-0058/4">1-2k78-1</senderId>
    <recipientId xmlns="http://www.ech.ch/xmlns/eCH-0058/4">3-LI-1</recipientId>
    <recipientId xmlns="http://www.ech.ch/xmlns/eCH-0058/4">2-TZ-3</recipientId>
    <messageId xmlns="http://www.ech.ch/xmlns/eCH-0058/4">53453-FGHRFTZH</messageId>
    <messageType xmlns="http://www.ech.ch/xmlns/eCH-0058/4">1</messageType>
    <sendingApplication xmlns="http://www.ech.ch/xmlns/eCH-0058/4">
      <manufacturer>In AG</manufacturer>
      <product>tres</product>
      <productVersion>2020</productVersion>
    </sendingApplication>
    <partialDelivery xmlns="http://www.ech.ch/xmlns/eCH-0058/4">
      <uniqueIdDelivery>45der4</uniqueIdDelivery>
      <totalNumberOfPackages>1</totalNumberOfPackages>
      <numberOfActualPackage>1</numberOfActualPackage>
    </partialDelivery>
    <messageDate xmlns="http://www.ech.ch/xmlns/eCH-0058/4">2021-04-12T11:31:37.52</messageDate>
    <eventDate xmlns="http://www.ech.ch/xmlns/eCH-0058/4">2021-03-31</eventDate>
    <modificationDate xmlns="http://www.ech.ch/xmlns/eCH-0058/4">2021-04-12</modificationDate>
    <action xmlns="http://www.ech.ch/xmlns/eCH-0058/4">1</action>
    <testDeliveryFlag xmlns="http://www.ech.ch/xmlns/eCH-0058/4">false</testDeliveryFlag>
  </deliveryHeader>
  <reportedPerson>
    <baseData>
      <person xmlns="http://www.ech.ch/xmlns/eCH-0011/8">
        <personIdentification>
          <vn xmlns="http://www.ech.ch/xmlns/eCH-0044/4">6664442863124</vn>
          <localPersonId xmlns="http://www.ech.ch/xmlns/eCH-0044/4">
            <personIdCategory>WW.454</personIdCategory>
            <personId>3335</personId>
          </localPersonId>
          <otherPersonId xmlns="http://www.ech.ch/xmlns/eCH-0044/4">
            <personIdCategory>RT.5674</personIdCategory>
            <personId>99999999</personId>
          </otherPersonId>
          <officialName xmlns="http://www.ech.ch/xmlns/eCH-0044/4">Test</officialName>
          <firstName xmlns="http://www.ech.ch/xmlns/eCH-0044/4">Wels</firstName>
          <sex xmlns="http://www.ech.ch/xmlns/eCH-0044/4">9</sex>
          <dateOfBirth xmlns="http://www.ech.ch/xmlns/eCH-0044/4">
            <yearMonthDay>1733-01-01</yearMonthDay>
          </dateOfBirth>
        </personIdentification>
        <nameData>
          <officialName>Test</officialName>
          <firstName>Wels</firstName>
          <callName>Welsi</callName>
        </nameData>
        <birthData>
          <dateOfBirth>
            <yearMonthDay xmlns="http://www.ech.ch/xmlns/eCH-0044/4">1733-01-01</yearMonthDay>
          </dateOfBirth>
          <placeOfBirth>
            <foreignCountry>
              <country>
                <countryId xmlns="http://www.ech.ch/xmlns/eCH-0008/3">4353</countryId>
                <countryIdISO2 xmlns="http://www.ech.ch/xmlns/eCH-0008/3">YX</countryIdISO2>
                <countryNameShort xmlns="http://www.ech.ch/xmlns/eCH-0008/3">Yukiland</countryNameShort>
              </country>
              <town>Eikwrl</town>
            </foreignCountry>
          </placeOfBirth>
          <sex>9</sex>
        </birthData>
        <religionData>
          <religion>000</religion>
          <religionValidFrom>1733-01-01</religionValidFrom>
        </religionData>
        <maritalData>
          <maritalStatus>9</maritalStatus>
        </maritalData>
        <nationalityData>
          <nationalityStatus>9</nationalityStatus>
          <countryInfo>
            <country>
              <countryId xmlns="http://www.ech.ch/xmlns/eCH-0008/3">9655</countryId>
              <countryIdISO2 xmlns="http://www.ech.ch/xmlns/eCH-0008/3">ZT</countryIdISO2>
              <countryNameShort xmlns="http://www.ech.ch/xmlns/eCH-0008/3">Zentralris</countryNameShort>
            </country>
            <nationalityValidFrom>1733-01-01</nationalityValidFrom>
          </countryInfo>
        </nationalityData>
        <languageOfCorrespondance>de</languageOfCorrespondance>
        <placeOfOrigin>
          <originName>Goswiesl</originName>
          <canton>ZT</canton>
          <placeOfOriginId>893</placeOfOriginId>
        </placeOfOrigin>
      </person>
      <hasMainResidence xmlns="http://www.ech.ch/xmlns/eCH-0011/8">
        <mainResidence>
          <reportingMunicipality>
            <municipalityId xmlns="http://www.ech.ch/xmlns/eCH-0007/5">3453</municipalityId>
            <municipalityName xmlns="http://www.ech.ch/xmlns/eCH-0007/5">Twers</municipalityName>
            <cantonAbbreviation xmlns="http://www.ech.ch/xmlns/eCH-0007/5">ZT</cantonAbbreviation>
            <historyMunicipalityId xmlns="http://www.ech.ch/xmlns/eCH-0007/5">93434</historyMunicipalityId>
          </reportingMunicipality>
          <arrivalDate>1733-01-01</arrivalDate>
          <comesFrom>
            <swissTown>
              <municipalityId xmlns="http://www.ech.ch/xmlns/eCH-0007/5">45335</municipalityId>
              <municipalityName xmlns="http://www.ech.ch/xmlns/eCH-0007/5">Wekwr</municipalityName>
              <cantonAbbreviation xmlns="http://www.ech.ch/xmlns/eCH-0007/5">EO</cantonAbbreviation>
              <historyMunicipalityId xmlns="http://www.ech.ch/xmlns/eCH-0007/5">234923</historyMunicipalityId>
            </swissTown>
          </comesFrom>
          <dwellingAddress>
            <EGID>99999999</EGID>
            <EWID>999</EWID>
            <householdID>99999</householdID>
            <address>
              <street xmlns="http://www.ech.ch/xmlns/eCH-0010/5">Werkhof</street>
              <houseNumber xmlns="http://www.ech.ch/xmlns/eCH-0010/5">1</houseNumber>
              <town xmlns="http://www.ech.ch/xmlns/eCH-0010/5">Tewrw</town>
              <swissZipCode xmlns="http://www.ech.ch/xmlns/eCH-0010/5">5656</swissZipCode>
              <swissZipCodeAddOn xmlns="http://www.ech.ch/xmlns/eCH-0010/5">00</swissZipCodeAddOn>
              <swissZipCodeId xmlns="http://www.ech.ch/xmlns/eCH-0010/5">23423</swissZipCodeId>
              <country xmlns="http://www.ech.ch/xmlns/eCH-0010/5">UZ</country>
            </address>
            <typeOfHousehold>1</typeOfHousehold>
            <movingDate>1733-01-01</movingDate>
          </dwellingAddress>
        </mainResidence>
      </hasMainResidence>
    </baseData>
  </reportedPerson>
</delivery>

Upvotes: 1

Views: 306

Answers (1)

Parfait
Parfait

Reputation: 107652

Consider XSLT (a special-purpose language designed to transform XML files) to flatten your XML at the <reportedPerson> node across all levels of its descendants. To avoid repeated names, the grandparent and parent node names are concatenated to the current node name. The stylesheet below works on this XML and on your previously posted XML. R can run XSLT 1.0 with the xslt package (a sister package to xml2):

XSLT (save as .xsl file, a special .xml file)

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Flattens each <reportedPerson> of an eCH-0099 <delivery> into a single level:
  every descendant element that carries text becomes a direct child named
  grandparent_parent_element, placed in the eCH-0099 namespace.

  Descendants are selected at any depth, so leaves that sit closer to (or
  further from) <reportedPerson> than usual are not dropped. Output elements
  appear in document order.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                              xmlns:doc="http://www.ech.ch/xmlns/eCH-0099/2">
  <xsl:output method="xml" indent="yes"/>
  <!-- Drop whitespace-only text nodes so container elements have no text()
       child and are therefore skipped by the [text()] predicates below. -->
  <xsl:strip-space elements="*"/>

  <!-- Root: keep <delivery>, but only process its <reportedPerson> children. -->
  <xsl:template match="/doc:delivery">
    <xsl:copy>
      <xsl:apply-templates select="doc:reportedPerson"/>
    </xsl:copy>
  </xsl:template>

  <!-- One output <reportedPerson> per input one, holding the flattened leaves. -->
  <xsl:template match="doc:reportedPerson">
    <xsl:copy>
      <xsl:apply-templates select=".//*[text()]"/>
    </xsl:copy>
  </xsl:template>

  <!-- Any text-bearing descendant of <reportedPerson>, whatever its depth. -->
  <xsl:template match="doc:reportedPerson//*[text()]">
    <xsl:call-template name="flatten_nodes"/>
  </xsl:template>

  <!-- Emit the current node under a name built from the local names of its
       grandparent, its parent and itself; the value is its first text node. -->
  <xsl:template name="flatten_nodes">
    <xsl:element name="{concat(local-name(../..), '_', local-name(..), '_', local-name())}"
                 namespace="http://www.ech.ch/xmlns/eCH-0099/2">
      <xsl:value-of select="text()"/>
    </xsl:element>
  </xsl:template>

</xsl:stylesheet>

Online Demo

R

library(xml2)
library(xslt)

# Flatten the input with the XSLT stylesheet, then build one data-frame row
# per <reportedPerson> and stack the rows into `reportedPerson_df`.

# LOAD XML AND XSL
doc <- read_xml("/path/to/Input.xml")
# The stylesheet is read like any other XML file; `package =` belongs to
# system.file(), not read_xml(), so it is not passed here.
style <- read_xml("/path/to/Flatten/Script.xsl")

# RUN TRANSFORMATION
new_xml <- xml_xslt(doc, style)

# RETRIEVE reportedPerson NODES
nmsp <- c(doc = "http://www.ech.ch/xmlns/eCH-0099/2")
recs <- xml2::xml_find_all(new_xml, "//doc:reportedPerson", ns = nmsp)

# BIND EACH CHILD TEXT AND NAME
# Each record becomes a one-row data frame whose columns are named after the
# flattened child elements.
df_list <- lapply(recs, function(r) {
  vals <- xml2::xml_children(r)

  data.frame(rbind(setNames(c(xml2::xml_text(vals)),
                            c(xml2::xml_name(vals)))))
})

# ALIGN COLUMNS
# rbind() on data frames requires identical columns, but records may carry
# different sets of elements; add the absent ones as NA before stacking.
all_cols <- unique(unlist(lapply(df_list, names), use.names = FALSE))
df_list <- lapply(df_list, function(d) {
  absent <- setdiff(all_cols, names(d))
  if (length(absent) > 0) {
    d[absent] <- NA_character_
  }
  d[all_cols]
})

# COMBINE ALL DFS
reportedPerson_df <- do.call(rbind.data.frame, df_list)

# REMOVE HELPER OBJECTS
rm(recs, df_list, all_cols)

Alternatively, if you have your own XSLT processor and can run the raw XML through it, or can call it from R with system() at the command line, the original XML package works as well, with its handy xmlToDataFrame. This demonstrates the portability of XSLT!

library(XML)

# COMMAND LINE CALL TO UNIX'S XSLTPROC (ALTERNATIVE TO xslt PACKAGE)
# Argument order is: xsltproc -o <output> <stylesheet> <input>.
# `-o` names the file to write, so the input must come last; otherwise the
# source document would be overwritten.
status <- system(
  "xsltproc -o /path/to/output.xml /path/to/xslt_script.xsl /path/to/input.xml"
)
# system() returns the command's exit status; stop before parsing a missing
# or stale output file.
if (status != 0L) {
  stop("xsltproc failed with exit status ", status, call. = FALSE)
}

# Parse the flattened document and turn each <reportedPerson> into a row.
doc <- xmlParse("/path/to/output.xml")
nmsp <- c(doc = "http://www.ech.ch/xmlns/eCH-0099/2")

reportedPerson_df <- xmlToDataFrame(
  doc,
  nodes = getNodeSet(doc, "//doc:reportedPerson", namespaces = nmsp)
)

Upvotes: 1

Related Questions