topchef
topchef

Reputation: 19783

Parsing XML with R - is it always so difficult?

I spent more time than expected to transfer xml into dataframe (code snippet contains both xml and xmlTreeParse to make post smaller, the whole solution is after this):

users = xmlTreeParse(file=
'<?xml version="1.0" encoding="utf-8"?>
<users>
  <row Id="-1" Reputation="1" CreationDate="2010-07-19T06:55:26.860" DisplayName="Community" LastAccessDate="2010-07-19T06:55:26.860" Location="on the server farm" AboutMe="some text" Views="0" UpVotes="4382" DownVotes="771" EmailHash="a007be5a61f6aa8f3e85ae2fc18dd66e" />
  <row Id="2" Reputation="101" CreationDate="2010-07-19T14:01:36.697" DisplayName="Geoff Dalgas" LastAccessDate="2012-09-13T17:41:48.300" WebsiteUrl="http://stackoverflow.com" Location="Corvallis, OR" AboutMe="some text 2" Views="7" UpVotes="3" DownVotes="0" EmailHash="b437f461b3fd27387c5d8ab47a293d35" Age="36" />
  <row Id="3" Reputation="101" CreationDate="2010-07-19T15:34:50.507" DisplayName="Jarrod Dixon" LastAccessDate="2013-01-15T03:28:47.657" WebsiteUrl="http://stackoverflow.com" Location="New York, NY" AboutMe="some text 3" Views="9" UpVotes="19" DownVotes="0" EmailHash="2dfa19bf5dc5826c1fe54c2c049a1ff1" Age="34" />
  <row Id="4" Reputation="101" CreationDate="2010-07-19T19:03:27.400" DisplayName="Emmett" LastAccessDate="2013-04-16T16:51:04.780" WebsiteUrl="http://minesweeperonline.com" Location="New York, NY" AboutMe="some text 4" Views="3" UpVotes="0" DownVotes="0" EmailHash="129bc58fc3f1e3853cdd3cefc75fe1a0" Age="27" />
  <row Id="5" Reputation="6182" CreationDate="2010-07-19T19:03:57.227" DisplayName="Shane" LastAccessDate="2013-02-05T11:23:09.587" WebsiteUrl="http://www.statalgo.com" Location="New York, NY" AboutMe="some text 5" Views="605" UpVotes="659" DownVotes="5" EmailHash="0cee97ffd90277bf4ac753331d50af60" Age="34" />
  <row Id="6" Reputation="442" CreationDate="2010-07-19T19:04:07.647" DisplayName="Harlan" LastAccessDate="2013-05-09T13:11:29.027" WebsiteUrl="http://www.harlan.harris.name" Location="District of Columbia" AboutMe="some text 6" Views="30" UpVotes="42" DownVotes="0" EmailHash="9f1a68b9e623be5da422b44e733fa8bc" Age="40" />
  <row Id="7" Reputation="329" CreationDate="2010-07-19T19:04:37.257" DisplayName="Vince" LastAccessDate="2013-05-21T22:49:10.237" WebsiteUrl="http://bioinformatics.ucdavis.edu" Location="Davis, CA" AboutMe="some text 7" Views="21" UpVotes="14" DownVotes="0" EmailHash="4f7cebc8ac200d15bac5dcff51469425" Age="27" />
  <row Id="8" Reputation="6104" CreationDate="2010-07-19T19:04:52.280" DisplayName="csgillespie" LastAccessDate="2013-05-21T17:32:58.693" WebsiteUrl="http://www.mas.ncl.ac.uk/~ncsg3/" Location="Newcastle, United Kingdom" AboutMe="some text 8" Views="399" UpVotes="576" DownVotes="18" EmailHash="3c3eea4eda77ffe95ae18c78c3fc7e55" Age="35" />
  <row Id="10" Reputation="121" CreationDate="2010-07-19T19:05:40.403" DisplayName="Pierre" LastAccessDate="2012-10-04T17:17:01.430" WebsiteUrl="http://plindenbaum.blogspot.com" Location="France" AboutMe="some text 10" Views="8" UpVotes="2" DownVotes="0" EmailHash="61200477cf8983809ec152f484750204" Age="43" />
  <row Id="11" Reputation="136" CreationDate="2010-07-19T19:06:02.713" DisplayName="wahalulu" LastAccessDate="2013-05-26T20:36:24.567" WebsiteUrl="http://www.linkedin.com/in/marckvaisman" Location="Washington, DC" AboutMe="some text 11" Views="2" UpVotes="10" DownVotes="0" EmailHash="9a9a05e41ae6e3b127697967cea5f8fb" Age="39" />
  <row Id="12" Reputation="101" CreationDate="2010-07-19T19:06:34.507" DisplayName="Jin" LastAccessDate="2013-04-11T18:31:58.360" WebsiteUrl="http://www.8164.org" Location="Raleigh, NC" AboutMe="some text 12" Views="5" UpVotes="4" DownVotes="0" EmailHash="70ad2c2830eb9a7753bd6312f3811e3e" Age="37" />
  <row Id="13" Reputation="677" CreationDate="2010-07-19T19:06:49.527" DisplayName="Sharpie" LastAccessDate="2012-01-02T22:55:04.743" WebsiteUrl="http://www.sharpsteen.net" Location="United States" AboutMe="Undergraduate studying Environmental Engineering and Applied Mathematics." Views="37" UpVotes="44" DownVotes="1" EmailHash="a52001938ed33a87334447413cc5beaa" Age="27" />
  <row Id="15" Reputation="11" CreationDate="2010-07-19T19:07:32.537" DisplayName="hannes.koller" LastAccessDate="2010-08-24T14:23:18.050" WebsiteUrl="http://soma.denkt.org" Location="Vienna, Austria" AboutMe="" Views="2" UpVotes="0" DownVotes="0" EmailHash="0ecd144e2f3d05e6ee6b89404d1d4c53" Age="34" />
  <row Id="16" Reputation="101" CreationDate="2010-07-19T19:08:13.957" DisplayName="slashnick" LastAccessDate="2010-08-19T20:40:59.080" Location="London, United Kingdom" Views="2" UpVotes="7" DownVotes="0" EmailHash="5691ff74e21c78cd1563b5123254cbd6" Age="30" />
  <row Id="17" Reputation="192" CreationDate="2010-07-19T19:08:28.243" DisplayName="Random" LastAccessDate="2010-09-10T07:34:36.123" AboutMe="" Views="6" UpVotes="13" DownVotes="1" EmailHash="5a3c78de1408aae57797dffd0782b992" />
  <row Id="18" Reputation="128" CreationDate="2010-07-19T19:08:29.070" DisplayName="grokus" LastAccessDate="2012-08-09T15:02:00.600" WebsiteUrl="http://wikipedia.org" Location="United States" AboutMe="about me 18" Views="6" UpVotes="16" DownVotes="0" EmailHash="7d1f931327bfab7b214758be17627adc" Age="43" />
  <row Id="19" Reputation="101" CreationDate="2010-07-19T19:08:45.250" DisplayName="Noah Snyder" LastAccessDate="2012-06-17T15:53:43.550" WebsiteUrl="http://sbseminar.wordpress.com" Location="New York, NY" AboutMe="about me 19" Views="11" UpVotes="2" DownVotes="0" EmailHash="895385d49eb1f04c5ee1f8d7734f3a62" Age="33" />
</users>',
          asText=TRUE)

XML is simply representation of Users table from the stackexchange data dump:

<users>
  <row Id=..... />
  <row Id=..... />
  .....
  <row Id=..... />
</users>

The mapping to dataframe is the same as if I mapped the table. This is the code that gets job done for me:

require(XML)
require(plyr)

# insert xmlTreeParse here

r = xmlRoot(users)

attrs = c('Id', 'Reputation', 'CreationDate', 'DisplayName', 'LastAccessDate',
          'WebsiteUrl', 'Location', 'AboutMe',  'Views', 'UpVotes', 'DownVotes', 
          'EmailHash', 'Age')

mapUserAttrs = function(x, colNames) {
  t = data.frame(as.integer(x['Id']), 
           as.integer(x['Reputation']), 
           strptime(x['CreationDate'], '%Y-%m-%dT%H:%M:%OS'), 
           as.character(x['DisplayName']), 
           strptime(x['LastAccessDate'], '%Y-%m-%dT%H:%M:%OS'), 
           as.character(x['WebsiteUrl']), 
           as.character(x['Location']), 
           as.character(x['AboutMe']),
           as.integer(x['Views']), 
           as.integer(x['UpVotes']), 
           as.integer(x['DownVotes']), 
           as.character(x['EmailHash']), 
           as.integer(x['Age']))
  names(t) = colNames
  return(t)
}

result = ldply(lapply(xmlChildren(r), xmlAttrs), mapUserAttrs, attrs)

It looks excessively busy to me - but I found no better way to accomplish the task with XML package and amount of examples and documentation I found.

I'd like to know if there is less complex (or shorter) way to accomplish the same?

Upvotes: 4

Views: 483

Answers (1)

SchaunW
SchaunW

Reputation: 3601

You can do this with the xmlToList function in the XML package and, because some of your nodes contain options that others don't, you'll also need the rbind.fill function from the plyr package.

The line of code below converts your XML to a list, loops through the nodes and turns the character strings into data frames, and then rbinds all those data frames together.

require(xml)
require(plyr)

out <- do.call("rbind.fill",
  lapply(xmlToList(users), 
    function(x) as.data.frame(as.list(x), stringsAsFactors = FALSE)))


head(out)
  Id Reputation            CreationDate  DisplayName          LastAccessDate             Location     AboutMe Views UpVotes DownVotes
1 -1          1 2010-07-19T06:55:26.860    Community 2010-07-19T06:55:26.860   on the server farm   some text     0    4382       771
2  2        101 2010-07-19T14:01:36.697 Geoff Dalgas 2012-09-13T17:41:48.300        Corvallis, OR some text 2     7       3         0
3  3        101 2010-07-19T15:34:50.507 Jarrod Dixon 2013-01-15T03:28:47.657         New York, NY some text 3     9      19         0
4  4        101 2010-07-19T19:03:27.400       Emmett 2013-04-16T16:51:04.780         New York, NY some text 4     3       0         0
5  5       6182 2010-07-19T19:03:57.227        Shane 2013-02-05T11:23:09.587         New York, NY some text 5   605     659         5
6  6        442 2010-07-19T19:04:07.647       Harlan 2013-05-09T13:11:29.027 District of Columbia some text 6    30      42         0
                         EmailHash                    WebsiteUrl  Age
1 a007be5a61f6aa8f3e85ae2fc18dd66e                          <NA> <NA>
2 b437f461b3fd27387c5d8ab47a293d35      http://stackoverflow.com   36
3 2dfa19bf5dc5826c1fe54c2c049a1ff1      http://stackoverflow.com   34
4 129bc58fc3f1e3853cdd3cefc75fe1a0  http://minesweeperonline.com   27
5 0cee97ffd90277bf4ac753331d50af60       http://www.statalgo.com   34
6 9f1a68b9e623be5da422b44e733fa8bc http://www.harlan.harris.name   40

EDIT

The resulting data frame will be composed entirely of character vectors. If you want to convert those vectors to dates, date-times, numerics, etc., you an either do it one-by-one, or you can write a function that specifies what classes ought to be assigned to columns with certain names, or you can write a function to try to infer the proper class from the data. Below is an example of that last option:

giveClasses <- function(df, threshold = 0.1) {
  df_classes <- sapply(df, class)

  df_alpha <- sapply(df, function(x) {
    mean(grepl("[[:alpha:]]", x)) >= threshold}) &
    df_classes == "character"

  df_digits <- sapply(df, function(x) mean(grepl("\\d", x))) >= threshold &
    df_classes == "character" &
    !df_alpha

  df_percent <- sapply(df, function(x) mean(grepl("%", x))) >= threshold &
    df_classes == "character" &
    !df_alpha &
    df_digits

  df_digits[df_percent] <- FALSE

  df_decimal <- sapply(df, function(x) mean(grepl("\\.", x))) >= threshold &
    df_classes == "character" &
    !df_percent &
    df_digits &
    !df_alpha

  df_dates <- sapply(df, function(x) {
    mean(grepl(
      "^\\d{2,4}[[:punct:]]\\d{2}[[:punct:]]\\d{2,4}$", x)) >= threshold}) &
    df_classes == "character"

  df_datetime <- sapply(df, function(x) {
    mean(grepl(
      "^\\d{2,4}[[:punct:]]\\d{2}[[:punct:]]\\d{2,4}\\D\\d{2}:\\d{2}(:\\d{2})?(\\.\\d{1,})?$", x)) >= threshold}) &
    df_classes == "character"

  # convert character data to appropriate classes
  df_logical <- sapply(df, function(x) {
    y <- unique(na.omit(x))
    length(y) == 2 & 
      mean(grepl("^n", y, ignore.case = TRUE) |
          grepl("^y", y, ignore.case = TRUE)) == 1
  })

  df_digits[df_dates | df_datetime] <- FALSE

  df[,df_percent] <- lapply(df[,df_percent, drop = FALSE], function(x) {
    as.numeric(gsub("[^[:digit:].]", "", x)) / 100})

  df[,df_logical] <- lapply(df[,df_logical, drop = FALSE], function(x) {
    x[grep("^y", x, ignore.case = TRUE)] <- TRUE
    x[grep("^n", x, ignore.case = TRUE)] <- FALSE
    as.logical(x)
  })

  df[,df_decimal] <- lapply(df[,df_decimal, drop = FALSE], function(x) {
    as.numeric(gsub("[^[:digit:].]", "", x))})

  df[,df_digits] <- lapply(df[,df_digits, drop = FALSE], function(x) {
    as.integer(gsub("[^[:digit:]]", "", x))})

  df[,df_dates] <- lapply(df[,df_dates, drop = FALSE], function(x) {
    as.Date(x)})

  df[,df_datetime] <- lapply(df[,df_datetime, drop = FALSE], function(x) {
    strptime(x, '%Y-%m-%dT%H:%M:%OS')})

  df_ischaracter <- sapply(df, function(x) any(class(x) == "character"))

  df[,df_ischaracter] <- lapply(df[,df_ischaracter, drop = FALSE], function(x) {
    x <- gsub("^\\s+|\\s+$|(?<=\\s)\\s+", "", x, perl = TRUE)})

  df
}

The above function assigns a class to a column if over 90% of the values in that column fit the pattern appropriate to that class. Otherwise, it keeps them as characters. It addresses patterns not found in your example data set - I just copied the code from another project I'm working on. So:

str(giveClasses(out))

'data.frame':   17 obs. of  13 variables:
 $ Id            : int  1 2 3 4 5 6 7 8 10 11 ...
 $ Reputation    : int  1 101 101 101 6182 442 329 6104 121 136 ...
 $ CreationDate  : POSIXlt, format: "2010-07-19 06:55:26" "2010-07-19 14:01:36" "2010-07-19 15:34:50" "2010-07-19 19:03:27" ...
 $ DisplayName   : chr  "Community" "Geoff Dalgas" "Jarrod Dixon" "Emmett" ...
 $ LastAccessDate: POSIXlt, format: "2010-07-19 06:55:26" "2012-09-13 17:41:48" "2013-01-15 03:28:47" "2013-04-16 16:51:04" ...
 $ Location      : chr  "on the server farm" "Corvallis, OR" "New York, NY" "New York, NY" ...
 $ AboutMe       : chr  "some text" "some text 2" "some text 3" "some text 4" ...
 $ Views         : int  0 7 9 3 605 30 21 399 8 2 ...
 $ UpVotes       : int  4382 3 19 0 659 42 14 576 2 10 ...
 $ DownVotes     : int  771 0 0 0 5 0 0 18 0 0 ...
 $ EmailHash     : chr  "a007be5a61f6aa8f3e85ae2fc18dd66e" "b437f461b3fd27387c5d8ab47a293d35" "2dfa19bf5dc5826c1fe54c2c049a1ff1" "129bc58fc3f1e3853cdd3cefc75fe1a0" ...
 $ WebsiteUrl    : chr  NA "http://stackoverflow.com" "http://stackoverflow.com" "http://minesweeperonline.com" ...
 $ Age           : int  NA 36 34 27 34 40 27 35 43 39 ...

Upvotes: 3

Related Questions