Reputation: 33
I am trying to generate a dataframe basic from a dataframe which contains inside another dataframe and a list
Sample file https://drive.google.com/file/d/1dfdM-GVuYz7q5Nk_KktUabrWqlpDMFt1/view?usp=sharing
Code starts reading a json with jsonlite stream_in and generating result object.
result is dataframe containig a sub dataframe ball which contains a list xyz and a column speed
# result (dataframe)
# |
# |
# ball (dataframe) contains xyz list and speed numeric so
# |
# |
# xyz (list) elements
# [[1]]
# [1] -0.08 0.17 0.00
#
# speed (numeric column)
My requirement is to get a dataframe with below listed columns and WITHOUT homePlayers and awayPlayers lists.
required df columns period,frameIdx, gameClock , x,y,z,speed ,live,lastTouch
My main problem is extracting ball contents in columns
library(jsonlite)
library(plyr)
library(tidyr) # unnest
library(dplyr) # for %>% and other dplyr functions
file_match_json <- 'match.jsonl'
result <- as_tibble(jsonlite::stream_in(file(file_match_json)))
Here it is the result object type with this underlying structure
> result
# A tibble: 30 x 8
period frameIdx gameClock homePlayers awayPlayers ball$xyz $speed live lastTouch
<int> <int> <dbl> <list> <list> <list> <dbl> <lgl> <chr>
1 1 0 0 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 12.6 FALSE home
2 1 1 0.04 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 12.8 TRUE home
3 1 2 0.08 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 12.8 TRUE home
4 1 3 0.12 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 12.8 TRUE home
5 1 4 0.16 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 12.7 TRUE home
6 1 5 0.2 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 12.5 TRUE home
7 1 6 0.24 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 12.3 TRUE home
8 1 7 0.28 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 12.0 TRUE home
9 1 8 0.32 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 11.8 TRUE home
10 1 9 0.36 <df[,3] [11 x 3]> <df[,3] [11 x 3]> <dbl [3]> 11.5 TRUE home
# below it is the structure of the sub dataframe inside result
str(result$ball)
'data.frame': 30 obs. of 2 variables:
$ xyz :List of 30
..$ : num -0.08 0.17 0
..$ : num -0.18 0.16 0
..$ : num -0.46 0.16 0
...
$ speed: num 12.6 12.8 12.8 12.8 12.7 ...
# Tried this with no luck because i need to unnest/unlist xyz
df <- result %>%
select(period,frameIdx,gameClock,live,lastTouch,ball$xyz, ball$speed)
Upvotes: 0
Views: 493
Reputation: 1438
The simple fix for this would be to add a flatten argument like so: stream_in(path, flatten = T)
More thoroughly:
library(tidyverse)
library(jsonlite)
library(janitor)
path <- file("match.jsonl")
result <- stream_in(path, flatten = T) %>%
as_tibble() %>%
clean_names()
result
#> # A tibble: 30 x 9
#> period frame_idx game_clock home_players away_players live last_touch
#> <int> <int> <dbl> <list> <list> <lgl> <chr>
#> 1 1 0 0 <df[,3] [11~ <df[,3] [11~ FALSE home
#> 2 1 1 0.04 <df[,3] [11~ <df[,3] [11~ TRUE home
#> 3 1 2 0.08 <df[,3] [11~ <df[,3] [11~ TRUE home
#> 4 1 3 0.12 <df[,3] [11~ <df[,3] [11~ TRUE home
#> 5 1 4 0.16 <df[,3] [11~ <df[,3] [11~ TRUE home
#> 6 1 5 0.2 <df[,3] [11~ <df[,3] [11~ TRUE home
#> 7 1 6 0.24 <df[,3] [11~ <df[,3] [11~ TRUE home
#> 8 1 7 0.28 <df[,3] [11~ <df[,3] [11~ TRUE home
#> 9 1 8 0.32 <df[,3] [11~ <df[,3] [11~ TRUE home
#> 10 1 9 0.36 <df[,3] [11~ <df[,3] [11~ TRUE home
#> # ... with 20 more rows, and 2 more variables: ball_xyz <list>,
#> # ball_speed <dbl>
Note that ball_xyz
and ball_speed
have been split or flattened depending on how you look at it.
We get your desired select()
statement as such:
result %>%
select(period, frame_idx, game_clock, live, last_touch, ball_xyz, ball_speed)
#> # A tibble: 30 x 7
#> period frame_idx game_clock live last_touch ball_xyz ball_speed
#> <int> <int> <dbl> <lgl> <chr> <list> <dbl>
#> 1 1 0 0 FALSE home <dbl [3]> 12.6
#> 2 1 1 0.04 TRUE home <dbl [3]> 12.8
#> 3 1 2 0.08 TRUE home <dbl [3]> 12.8
#> 4 1 3 0.12 TRUE home <dbl [3]> 12.8
#> 5 1 4 0.16 TRUE home <dbl [3]> 12.7
#> 6 1 5 0.2 TRUE home <dbl [3]> 12.5
#> 7 1 6 0.24 TRUE home <dbl [3]> 12.3
#> 8 1 7 0.28 TRUE home <dbl [3]> 12.0
#> 9 1 8 0.32 TRUE home <dbl [3]> 11.8
#> 10 1 9 0.36 TRUE home <dbl [3]> 11.5
#> # ... with 20 more rows
Taking things a bit further, we can use unnest_wider()
to expand the ball_xyz
values into their own columns.
result %>%
select(period, frame_idx, game_clock, live, last_touch, ball = ball_xyz, ball_speed) %>%
mutate(ball = map(ball, ~ setNames(.x, c("x", "y", "z")))) %>%
unnest_wider(ball, names_sep = "_", simplify = T)
#> # A tibble: 30 x 9
#> period frame_idx game_clock live last_touch ball_x ball_y ball_z
#> <int> <int> <dbl> <lgl> <chr> <dbl> <dbl> <dbl>
#> 1 1 0 0 FALSE home -0.08 0.17 0
#> 2 1 1 0.04 TRUE home -0.18 0.16 0
#> 3 1 2 0.08 TRUE home -0.46 0.16 0
#> 4 1 3 0.12 TRUE home -0.89 0.16 0
#> 5 1 4 0.16 TRUE home -1.3 0.16 0
#> 6 1 5 0.2 TRUE home -1.78 0.16 0
#> 7 1 6 0.24 TRUE home -2.29 0.17 0
#> 8 1 7 0.28 TRUE home -2.82 0.17 0
#> 9 1 8 0.32 TRUE home -3.36 0.18 0
#> 10 1 9 0.36 TRUE home -3.91 0.2 0
#> # ... with 20 more rows, and 1 more variable: ball_speed <dbl>
Created on 2019-10-11 by the reprex package (v0.3.0)
Hope this helps!
Upvotes: 1