Canovice
Canovice

Reputation: 10163

In R, improving efficiency / speed parsing a long XML document

I have my hands on a bit of sportVU data (coordinate player tracking of basketball players) in XML format, and am trying to find an efficient way to parse this into R dataframes. For reference, here is the type of data I am working with:

xml_str = '<sequences period="1">
             <moment game-clock="720.00" time="1453777809409" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37744,29.70218,4.18768;23,262882,20.60259,20.94833,0;23,266358,44.9396,19.39261,0;23,509450,29.94114,25.65156,0;23,604898,46.12643,31.1069,0;23,699950,65.75548,29.05962,0;5312,263884,63.23074,19.75536,0;5312,329824,46.12643,31.1069,0;5312,340730,65.75548,29.05962,0;5312,462980,65.2449,25.5923,0;5312,609567,44.9396,19.39261,0"/>
             <moment game-clock="720.00" time="1453777809448" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37601,29.74255,4.25405;23,262882,20.47625,21.10303,0;23,266358,44.94889,19.41382,0;23,509450,29.95246,25.6483,0;23,604898,46.13548,31.11234,0;23,699950,65.54889,29.13574,0;5312,263884,63.19786,19.78119,0;5312,329824,46.13548,31.11234,0;5312,340730,65.54889,29.13574,0;5312,462980,65.22661,25.67978,0;5312,609567,44.94889,19.41382,0"/>
             <moment game-clock="720.00" time="1453777809489" game-event-id="" shot-clock="24.00" locations="-1,-1,49.27471,30.13754,4.01725;23,262882,20.34594,21.24155,0;23,266358,44.94417,19.49824,0;23,509450,29.972,25.6471,0;23,604898,46.14312,31.10966,0;23,699950,65.33004,29.25136,0;5312,263884,63.16254,19.80861,0;5312,329824,46.14312,31.10966,0;5312,340730,65.33004,29.25136,0;5312,462980,65.2156,25.73148,0;5312,609567,44.94417,19.49824,0"/>
             <moment game-clock="720.00" time="1453777809529" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37694,29.72015,4.22522;23,262882,20.21595,21.39361,0;23,266358,44.94711,19.5666,0;23,509450,29.99479,25.64543,0;23,604898,46.15027,31.12124,0;23,699950,65.11179,29.36794,0;5312,263884,63.12304,19.83758,0;5312,329824,46.15027,31.12124,0;5312,340730,65.11179,29.36794,0;5312,462980,65.20283,25.7811,0;5312,609567,44.94711,19.5666,0"/>
             <moment game-clock="720.00" time="1453777809569" game-event-id="" shot-clock="24.00" locations="-1,-1,49.24878,29.90948,4.104;23,262882,20.08169,21.52092,0;23,266358,44.95597,19.55453,0;23,509450,30.01362,25.65796,0;23,604898,46.15381,31.12995,0;23,699950,64.87379,29.54509,0;5312,263884,63.08375,19.86668,0;5312,329824,46.15381,31.12995,0;5312,340730,64.87379,29.54509,0;5312,462980,65.19289,25.82793,0;5312,609567,44.95597,19.55453,0"/>
             <moment game-clock="720.00" time="1453777809609" game-event-id="" shot-clock="24.00" locations="-1,-1,49.34944,29.53386,4.37756;23,262882,19.93925,21.57641,0;23,266358,44.95367,19.55548,0;23,509450,30.02373,25.68459,0;23,604898,46.15968,31.14234,0;23,699950,64.63944,29.72012,0;5312,263884,63.03494,19.95414,0;5312,329824,46.15968,31.14234,0;5312,340730,64.63944,29.72012,0;5312,462980,65.18154,25.87779,0;5312,609567,44.95367,19.55548,0"/>
             <moment game-clock="720.00" time="1453777809649" game-event-id="" shot-clock="24.00" locations="-1,-1,49.29842,29.74836,4.29728;23,262882,19.7968,21.62425,0;23,266358,44.97828,19.51615,0;23,509450,30.02255,25.64997,0;23,604898,46.1634,31.14518,0;23,699950,64.44716,29.84182,0;5312,263884,62.98457,20.04141,0;5312,329824,46.1634,31.14518,0;5312,340730,64.44716,29.84182,0;5312,462980,65.15931,25.93168,0;5312,609567,44.97828,19.51615,0"/>
             <moment game-clock="720.00" time="1453777809689" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37601,29.74255,4.25405;23,262882,19.65811,21.66964,0;23,266358,45.05661,19.46085,0;23,509450,30.01974,25.62911,0;23,604898,46.19779,31.17625,0;23,699950,64.29233,29.91812,0;5312,263884,62.93202,20.1291,0;5312,329824,46.19779,31.17625,0;5312,340730,64.29233,29.91812,0;5312,462980,65.14666,25.96346,0;5312,609567,45.05661,19.46085,0"/>
             <moment game-clock="720.00" time="1453777809728" game-event-id="" shot-clock="24.00" locations="-1,-1,49.42696,29.528,4.33445;23,262882,19.52976,21.72166,0;23,266358,45.07626,19.45181,0;23,509450,30.01819,25.62626,0;23,604898,46.20207,31.19139,0;23,699950,64.10344,29.96732,0;5312,263884,62.87931,20.21896,0;5312,329824,46.20207,31.19139,0;5312,340730,64.10344,29.96732,0;5312,462980,65.12518,26.01317,0;5312,609567,45.07626,19.45181,0"/>
             <moment game-clock="720.00" time="1453777809769" game-event-id="" shot-clock="24.00" locations="-1,-1,49.42696,29.528,4.33445;23,262882,19.40259,21.77219,0;23,266358,45.01633,19.41528,0;23,509450,30.01316,25.61478,0;23,604898,46.20748,31.21032,0;23,699950,63.94112,30.00692,0;5312,263884,62.82417,20.31417,0;5312,329824,46.20748,31.21032,0;5312,340730,63.94112,30.00692,0;5312,462980,65.10361,26.05668,0;5312,609567,45.01633,19.41528,0"/>
             <moment game-clock="720.00" time="1453777809809" game-event-id="" shot-clock="24.00" locations="-1,-1,49.40035,29.31999,4.45764;23,262882,19.26225,21.74403,0;23,266358,45.03757,19.35151,0;23,509450,30.00625,25.60483,0;23,604898,46.21089,31.22762,0;23,699950,63.76999,30.04214,0;5312,263884,62.7722,20.407,0;5312,329824,46.21089,31.22762,0;5312,340730,63.76999,30.04214,0;5312,462980,65.0792,26.10155,0;5312,609567,45.03757,19.35151,0"/>
             <moment game-clock="720.00" time="1453777809849" game-event-id="" shot-clock="24.00" locations="-1,-1,49.17246,29.8749,4.08078;23,262882,19.13395,21.78725,0;23,266358,45.04715,19.21508,0;23,509450,29.9845,25.51368,0;23,604898,46.26867,31.28159,0;23,699950,63.61645,30.0221,0;5312,263884,62.71828,20.50007,0;5312,329824,46.26867,31.28159,0;5312,340730,63.61645,30.0221,0;5312,462980,65.04425,26.18745,0;5312,609567,45.04715,19.21508,0"/>
             <moment game-clock="720.00" time="1453777809889" game-event-id="" shot-clock="24.00" locations="-1,-1,49.24667,29.2957,4.46901;23,262882,19.00912,21.82333,0;23,266358,45.02115,19.10097,0;23,509450,29.96265,25.43249,0;23,604898,46.26457,31.32984,0;23,699950,63.46754,29.99107,0;5312,263884,62.66985,20.58952,0;5312,329824,46.26457,31.32984,0;5312,340730,63.46754,29.99107,0;5312,462980,65.01059,26.25959,0;5312,609567,45.02115,19.10097,0"/>
             <moment game-clock="720.00" time="1453777809928" game-event-id="" shot-clock="24.00" locations="-1,-1,49.01807,29.85051,4.09221;23,262882,18.88202,21.79858,0;23,266358,44.95699,19.07127,0;23,509450,29.93991,25.35426,0;23,604898,46.25301,31.35973,0;23,699950,63.29378,30.00075,0;5312,263884,62.61706,20.68884,0;5312,329824,46.25301,31.35973,0;5312,340730,63.29378,30.00075,0;5312,462980,64.98385,26.29224,0;5312,609567,44.95699,19.07127,0"/>
             <moment game-clock="720.00" time="1453777809969" game-event-id="" shot-clock="24.00" locations="-1,-1,48.93946,29.87874,4.1644;23,262882,18.75688,21.75509,0;23,266358,44.95388,19.00199,0;23,509450,29.91814,25.27759,0;23,604898,46.23982,31.38857,0;23,699950,63.12091,30.01038,0;5312,263884,62.57866,20.72805,0;5312,329824,46.23982,31.38857,0;5312,340730,63.12091,30.01038,0;5312,462980,64.95801,26.3292,0;5312,609567,44.95388,19.00199,0"/>
          </sequences>'

This is a subset of the actual data, where 1 game of basketball corresponds with roughly (25 per second) * (10 players) * (48 * 60 seconds in game) = 720,000 moment nodes per game. My goal is to parse this into an organized dataframe - the first 4 columns are the first 4 attributes of the moment nodes (game-clock, time, game-event-id, shot-clock), and then there are several columns of the data that arise from the locations attribute, which is storing coordinate data for 10 different players plus the ball, all in one string (each location attribute has 55 numbers, 5 for each of the 10 players and 5 for the ball).

Here is my code, which is working okay-ish but needs to be sped up quite a bit for efficiency for what I'm doing. I use the XML package, and simple functions within, but am okay with switching to xml2 or doing anything else needed to speed this up.

sportvu = xmlParse(xml_str)

# quick load the data into a dataframe
sportvu_df <- data.frame(
  game_clock = sapply(sportvu["//sequences/moment/@game-clock"], as, "numeric"),
  time = sapply(sportvu["//sequences/moment/@time"], as, "character"),
  game_event_id = sapply(sportvu["//sequences/moment/@game-event-id"], as, "integer"),
  shot_clock = sapply(sportvu["//sequences/moment/@shot-clock"], as, "numeric"), 
  locations = sapply(sportvu["//sequences/moment/@locations"], as, "character")
)

# convert locations into what we want, many more columns 
locations = as.character(sportvu_df$locations)
locations = strsplit(gsub(";", ",", locations), ",")
reps = sapply(locations, length) / 5
locations = as.data.frame(matrix(unlist(locations), ncol = 5, byrow = TRUE), stringsAsFactors = FALSE)
colnames(locations) = c("team_id", "global.player.id", "x_loc", "y_loc", "radius")

locations$global.player.id = as.integer(locations$global.player.id); locations$team_id = as.integer(locations$team_id); locations$x_loc = as.numeric(locations$x_loc); locations$y_loc = as.numeric(locations$y_loc); locations$radius = as.numeric(locations$radius)

# connect locations back with sportvu_df
sportvu_df = sportvu_df[!(names(sportvu_df) %in% "locations")]
sportvu_df = sportvu_df[rep(row.names(sportvu_df), reps), ]
sportvu_df = cbind(sportvu_df, locations)
sportvu_df$order = seq(1,nrow(sportvu_df),by=1)

Any thoughts are appreciated. Thanks!

Upvotes: 0

Views: 106

Answers (1)

Dave2e
Dave2e

Reputation: 24079

There is no need to wrap the type conversions into an sapply function. They are vectorized thus a direct call should be enough:

sportvu_df <- data.frame(
  game_clock = as.numeric(sportvu["//sequences/moment/@game-clock"]),
  time = as.character(sportvu["//sequences/moment/@time"]),
  game_event_id = as.integer(sportvu["//sequences/moment/@game-event-id"]),
  shot_clock = as.numeric(sportvu["//sequences/moment/@shot-clock"]), 
  locations = as.character(sportvu["//sequences/moment/@locations"])
)

By eliminating the 5 sapply (i.e. loops) this should speed up your routine.

Upvotes: 2

Related Questions