Reputation: 19783
I spent more time than expected to transfer xml into dataframe (code snippet contains both xml and xmlTreeParse
to make post smaller, the whole solution is after this):
users = xmlTreeParse(file=
'<?xml version="1.0" encoding="utf-8"?>
<users>
<row Id="-1" Reputation="1" CreationDate="2010-07-19T06:55:26.860" DisplayName="Community" LastAccessDate="2010-07-19T06:55:26.860" Location="on the server farm" AboutMe="some text" Views="0" UpVotes="4382" DownVotes="771" EmailHash="a007be5a61f6aa8f3e85ae2fc18dd66e" />
<row Id="2" Reputation="101" CreationDate="2010-07-19T14:01:36.697" DisplayName="Geoff Dalgas" LastAccessDate="2012-09-13T17:41:48.300" WebsiteUrl="http://stackoverflow.com" Location="Corvallis, OR" AboutMe="some text 2" Views="7" UpVotes="3" DownVotes="0" EmailHash="b437f461b3fd27387c5d8ab47a293d35" Age="36" />
<row Id="3" Reputation="101" CreationDate="2010-07-19T15:34:50.507" DisplayName="Jarrod Dixon" LastAccessDate="2013-01-15T03:28:47.657" WebsiteUrl="http://stackoverflow.com" Location="New York, NY" AboutMe="some text 3" Views="9" UpVotes="19" DownVotes="0" EmailHash="2dfa19bf5dc5826c1fe54c2c049a1ff1" Age="34" />
<row Id="4" Reputation="101" CreationDate="2010-07-19T19:03:27.400" DisplayName="Emmett" LastAccessDate="2013-04-16T16:51:04.780" WebsiteUrl="http://minesweeperonline.com" Location="New York, NY" AboutMe="some text 4" Views="3" UpVotes="0" DownVotes="0" EmailHash="129bc58fc3f1e3853cdd3cefc75fe1a0" Age="27" />
<row Id="5" Reputation="6182" CreationDate="2010-07-19T19:03:57.227" DisplayName="Shane" LastAccessDate="2013-02-05T11:23:09.587" WebsiteUrl="http://www.statalgo.com" Location="New York, NY" AboutMe="some text 5" Views="605" UpVotes="659" DownVotes="5" EmailHash="0cee97ffd90277bf4ac753331d50af60" Age="34" />
<row Id="6" Reputation="442" CreationDate="2010-07-19T19:04:07.647" DisplayName="Harlan" LastAccessDate="2013-05-09T13:11:29.027" WebsiteUrl="http://www.harlan.harris.name" Location="District of Columbia" AboutMe="some text 6" Views="30" UpVotes="42" DownVotes="0" EmailHash="9f1a68b9e623be5da422b44e733fa8bc" Age="40" />
<row Id="7" Reputation="329" CreationDate="2010-07-19T19:04:37.257" DisplayName="Vince" LastAccessDate="2013-05-21T22:49:10.237" WebsiteUrl="http://bioinformatics.ucdavis.edu" Location="Davis, CA" AboutMe="some text 7" Views="21" UpVotes="14" DownVotes="0" EmailHash="4f7cebc8ac200d15bac5dcff51469425" Age="27" />
<row Id="8" Reputation="6104" CreationDate="2010-07-19T19:04:52.280" DisplayName="csgillespie" LastAccessDate="2013-05-21T17:32:58.693" WebsiteUrl="http://www.mas.ncl.ac.uk/~ncsg3/" Location="Newcastle, United Kingdom" AboutMe="some text 8" Views="399" UpVotes="576" DownVotes="18" EmailHash="3c3eea4eda77ffe95ae18c78c3fc7e55" Age="35" />
<row Id="10" Reputation="121" CreationDate="2010-07-19T19:05:40.403" DisplayName="Pierre" LastAccessDate="2012-10-04T17:17:01.430" WebsiteUrl="http://plindenbaum.blogspot.com" Location="France" AboutMe="some text 10" Views="8" UpVotes="2" DownVotes="0" EmailHash="61200477cf8983809ec152f484750204" Age="43" />
<row Id="11" Reputation="136" CreationDate="2010-07-19T19:06:02.713" DisplayName="wahalulu" LastAccessDate="2013-05-26T20:36:24.567" WebsiteUrl="http://www.linkedin.com/in/marckvaisman" Location="Washington, DC" AboutMe="some text 11" Views="2" UpVotes="10" DownVotes="0" EmailHash="9a9a05e41ae6e3b127697967cea5f8fb" Age="39" />
<row Id="12" Reputation="101" CreationDate="2010-07-19T19:06:34.507" DisplayName="Jin" LastAccessDate="2013-04-11T18:31:58.360" WebsiteUrl="http://www.8164.org" Location="Raleigh, NC" AboutMe="some text 12" Views="5" UpVotes="4" DownVotes="0" EmailHash="70ad2c2830eb9a7753bd6312f3811e3e" Age="37" />
<row Id="13" Reputation="677" CreationDate="2010-07-19T19:06:49.527" DisplayName="Sharpie" LastAccessDate="2012-01-02T22:55:04.743" WebsiteUrl="http://www.sharpsteen.net" Location="United States" AboutMe="Undergraduate studying Environmental Engineering and Applied Mathematics." Views="37" UpVotes="44" DownVotes="1" EmailHash="a52001938ed33a87334447413cc5beaa" Age="27" />
<row Id="15" Reputation="11" CreationDate="2010-07-19T19:07:32.537" DisplayName="hannes.koller" LastAccessDate="2010-08-24T14:23:18.050" WebsiteUrl="http://soma.denkt.org" Location="Vienna, Austria" AboutMe="" Views="2" UpVotes="0" DownVotes="0" EmailHash="0ecd144e2f3d05e6ee6b89404d1d4c53" Age="34" />
<row Id="16" Reputation="101" CreationDate="2010-07-19T19:08:13.957" DisplayName="slashnick" LastAccessDate="2010-08-19T20:40:59.080" Location="London, United Kingdom" Views="2" UpVotes="7" DownVotes="0" EmailHash="5691ff74e21c78cd1563b5123254cbd6" Age="30" />
<row Id="17" Reputation="192" CreationDate="2010-07-19T19:08:28.243" DisplayName="Random" LastAccessDate="2010-09-10T07:34:36.123" AboutMe="" Views="6" UpVotes="13" DownVotes="1" EmailHash="5a3c78de1408aae57797dffd0782b992" />
<row Id="18" Reputation="128" CreationDate="2010-07-19T19:08:29.070" DisplayName="grokus" LastAccessDate="2012-08-09T15:02:00.600" WebsiteUrl="http://wikipedia.org" Location="United States" AboutMe="about me 18" Views="6" UpVotes="16" DownVotes="0" EmailHash="7d1f931327bfab7b214758be17627adc" Age="43" />
<row Id="19" Reputation="101" CreationDate="2010-07-19T19:08:45.250" DisplayName="Noah Snyder" LastAccessDate="2012-06-17T15:53:43.550" WebsiteUrl="http://sbseminar.wordpress.com" Location="New York, NY" AboutMe="about me 19" Views="11" UpVotes="2" DownVotes="0" EmailHash="895385d49eb1f04c5ee1f8d7734f3a62" Age="33" />
</users>',
asText=TRUE)
XML is simply representation of Users table from the stackexchange data dump:
<users>
<row Id=..... />
<row Id=..... />
.....
<row Id=..... />
</users>
The mapping to dataframe is the same as if I mapped the table. This is the code that gets job done for me:
require(XML)
require(plyr)
# insert xmlTreeParse here
r = xmlRoot(users)
attrs = c('Id', 'Reputation', 'CreationDate', 'DisplayName', 'LastAccessDate',
'WebsiteUrl', 'Location', 'AboutMe', 'Views', 'UpVotes', 'DownVotes',
'EmailHash', 'Age')
mapUserAttrs = function(x, colNames) {
t = data.frame(as.integer(x['Id']),
as.integer(x['Reputation']),
strptime(x['CreationDate'], '%Y-%m-%dT%H:%M:%OS'),
as.character(x['DisplayName']),
strptime(x['LastAccessDate'], '%Y-%m-%dT%H:%M:%OS'),
as.character(x['WebsiteUrl']),
as.character(x['Location']),
as.character(x['AboutMe']),
as.integer(x['Views']),
as.integer(x['UpVotes']),
as.integer(x['DownVotes']),
as.character(x['EmailHash']),
as.integer(x['Age']))
names(t) = colNames
return(t)
}
result = ldply(lapply(xmlChildren(r), xmlAttrs), mapUserAttrs, attrs)
It looks excessively busy to me - but I found no better way to accomplish the task with XML package and amount of examples and documentation I found.
I'd like to know if there is less complex (or shorter) way to accomplish the same?
Upvotes: 4
Views: 483
Reputation: 3601
You can do this with the xmlToList
function in the XML
package and, because some of your nodes contain options that others don't, you'll also need the rbind.fill
function from the plyr
package.
The line of code below converts your XML to a list, loops through the nodes and turns the character strings into data frames, and then rbinds all those data frames together.
require(xml)
require(plyr)
out <- do.call("rbind.fill",
lapply(xmlToList(users),
function(x) as.data.frame(as.list(x), stringsAsFactors = FALSE)))
head(out)
Id Reputation CreationDate DisplayName LastAccessDate Location AboutMe Views UpVotes DownVotes
1 -1 1 2010-07-19T06:55:26.860 Community 2010-07-19T06:55:26.860 on the server farm some text 0 4382 771
2 2 101 2010-07-19T14:01:36.697 Geoff Dalgas 2012-09-13T17:41:48.300 Corvallis, OR some text 2 7 3 0
3 3 101 2010-07-19T15:34:50.507 Jarrod Dixon 2013-01-15T03:28:47.657 New York, NY some text 3 9 19 0
4 4 101 2010-07-19T19:03:27.400 Emmett 2013-04-16T16:51:04.780 New York, NY some text 4 3 0 0
5 5 6182 2010-07-19T19:03:57.227 Shane 2013-02-05T11:23:09.587 New York, NY some text 5 605 659 5
6 6 442 2010-07-19T19:04:07.647 Harlan 2013-05-09T13:11:29.027 District of Columbia some text 6 30 42 0
EmailHash WebsiteUrl Age
1 a007be5a61f6aa8f3e85ae2fc18dd66e <NA> <NA>
2 b437f461b3fd27387c5d8ab47a293d35 http://stackoverflow.com 36
3 2dfa19bf5dc5826c1fe54c2c049a1ff1 http://stackoverflow.com 34
4 129bc58fc3f1e3853cdd3cefc75fe1a0 http://minesweeperonline.com 27
5 0cee97ffd90277bf4ac753331d50af60 http://www.statalgo.com 34
6 9f1a68b9e623be5da422b44e733fa8bc http://www.harlan.harris.name 40
EDIT
The resulting data frame will be composed entirely of character vectors. If you want to convert those vectors to dates, date-times, numerics, etc., you an either do it one-by-one, or you can write a function that specifies what classes ought to be assigned to columns with certain names, or you can write a function to try to infer the proper class from the data. Below is an example of that last option:
giveClasses <- function(df, threshold = 0.1) {
df_classes <- sapply(df, class)
df_alpha <- sapply(df, function(x) {
mean(grepl("[[:alpha:]]", x)) >= threshold}) &
df_classes == "character"
df_digits <- sapply(df, function(x) mean(grepl("\\d", x))) >= threshold &
df_classes == "character" &
!df_alpha
df_percent <- sapply(df, function(x) mean(grepl("%", x))) >= threshold &
df_classes == "character" &
!df_alpha &
df_digits
df_digits[df_percent] <- FALSE
df_decimal <- sapply(df, function(x) mean(grepl("\\.", x))) >= threshold &
df_classes == "character" &
!df_percent &
df_digits &
!df_alpha
df_dates <- sapply(df, function(x) {
mean(grepl(
"^\\d{2,4}[[:punct:]]\\d{2}[[:punct:]]\\d{2,4}$", x)) >= threshold}) &
df_classes == "character"
df_datetime <- sapply(df, function(x) {
mean(grepl(
"^\\d{2,4}[[:punct:]]\\d{2}[[:punct:]]\\d{2,4}\\D\\d{2}:\\d{2}(:\\d{2})?(\\.\\d{1,})?$", x)) >= threshold}) &
df_classes == "character"
# convert character data to appropriate classes
df_logical <- sapply(df, function(x) {
y <- unique(na.omit(x))
length(y) == 2 &
mean(grepl("^n", y, ignore.case = TRUE) |
grepl("^y", y, ignore.case = TRUE)) == 1
})
df_digits[df_dates | df_datetime] <- FALSE
df[,df_percent] <- lapply(df[,df_percent, drop = FALSE], function(x) {
as.numeric(gsub("[^[:digit:].]", "", x)) / 100})
df[,df_logical] <- lapply(df[,df_logical, drop = FALSE], function(x) {
x[grep("^y", x, ignore.case = TRUE)] <- TRUE
x[grep("^n", x, ignore.case = TRUE)] <- FALSE
as.logical(x)
})
df[,df_decimal] <- lapply(df[,df_decimal, drop = FALSE], function(x) {
as.numeric(gsub("[^[:digit:].]", "", x))})
df[,df_digits] <- lapply(df[,df_digits, drop = FALSE], function(x) {
as.integer(gsub("[^[:digit:]]", "", x))})
df[,df_dates] <- lapply(df[,df_dates, drop = FALSE], function(x) {
as.Date(x)})
df[,df_datetime] <- lapply(df[,df_datetime, drop = FALSE], function(x) {
strptime(x, '%Y-%m-%dT%H:%M:%OS')})
df_ischaracter <- sapply(df, function(x) any(class(x) == "character"))
df[,df_ischaracter] <- lapply(df[,df_ischaracter, drop = FALSE], function(x) {
x <- gsub("^\\s+|\\s+$|(?<=\\s)\\s+", "", x, perl = TRUE)})
df
}
The above function assigns a class to a column if over 90% of the values in that column fit the pattern appropriate to that class. Otherwise, it keeps them as characters. It addresses patterns not found in your example data set - I just copied the code from another project I'm working on. So:
str(giveClasses(out))
'data.frame': 17 obs. of 13 variables:
$ Id : int 1 2 3 4 5 6 7 8 10 11 ...
$ Reputation : int 1 101 101 101 6182 442 329 6104 121 136 ...
$ CreationDate : POSIXlt, format: "2010-07-19 06:55:26" "2010-07-19 14:01:36" "2010-07-19 15:34:50" "2010-07-19 19:03:27" ...
$ DisplayName : chr "Community" "Geoff Dalgas" "Jarrod Dixon" "Emmett" ...
$ LastAccessDate: POSIXlt, format: "2010-07-19 06:55:26" "2012-09-13 17:41:48" "2013-01-15 03:28:47" "2013-04-16 16:51:04" ...
$ Location : chr "on the server farm" "Corvallis, OR" "New York, NY" "New York, NY" ...
$ AboutMe : chr "some text" "some text 2" "some text 3" "some text 4" ...
$ Views : int 0 7 9 3 605 30 21 399 8 2 ...
$ UpVotes : int 4382 3 19 0 659 42 14 576 2 10 ...
$ DownVotes : int 771 0 0 0 5 0 0 18 0 0 ...
$ EmailHash : chr "a007be5a61f6aa8f3e85ae2fc18dd66e" "b437f461b3fd27387c5d8ab47a293d35" "2dfa19bf5dc5826c1fe54c2c049a1ff1" "129bc58fc3f1e3853cdd3cefc75fe1a0" ...
$ WebsiteUrl : chr NA "http://stackoverflow.com" "http://stackoverflow.com" "http://minesweeperonline.com" ...
$ Age : int NA 36 34 27 34 40 27 35 43 39 ...
Upvotes: 3