Reputation: 4230
I read small text files from many folders into a list. Hence, I have a list of length n with 2 data.frames
.
Here's an example of element 3 of the list (dput at the end of question)
ip_list[[3]]
$`dc:a6:32:2d:b6:c4`
# A tibble: 2 x 1
X1
<chr>
1 MAC address is: dc:a6:32:2d:b6:c4
2 IP is: 18.21.162.74
$`dc:a6:32:2d:b6:c4_running`
# A tibble: 1 x 1
datetime
<dttm>
1 2020-03-13 19:11:07
My goal is to reshape the list into a data frame with n machines and 3 columns (mac, ip, datetime). I have done so in what I believe is somewhat cumbersome way using:
n_machines <- length(ip_list)
# first element will be the mac and ip
df <- lapply(1:n_machines,
function (xx) as.data.frame(t(ip_list[[xx]][[1]]),
stringsAsFactors = FALSE)) %>%
bind_rows() %>%
# now clean
rename(mac = V1, ip = V2) %>%
mutate(mac = str_remove(mac, "MAC address is: "),
ip = str_remove(ip, "IP is: "))
# second element will be running time
running_time <- lapply(1:n_machines,
function (xx) as.data.frame(t(ip_list[[xx]][[2]]),
stringsAsFactors = FALSE)) %>%
bind_rows() %>%
rename(datetime = V1)
# join stuff (order should be kept)
df <- bind_cols(df, running_time)
Which produces the expected outcome:
df
mac ip datetime
1 dc:a6:32:21:59:2b 18.21.129.94
2 dc:a6:32:2d:8c:ca 18.21.171.210
3 dc:a6:32:2d:b6:c4 18.21.162.74 2020-03-13 19:11:07
4 dc:a6:32:2d:b8:62 18.21.178.96
Question: Is there a better way? I feel like there should be a way to do this. Particularly:
n_machines = 0
)as.data.frame(t(...))
is cumbersome and I lose the names (which I have to reasign later) but I couldn't figure out a way to pivot_wider
or similar. Here's a small dput
dput(ip_list[1:4])
list(list(`dc:a6:32:21:59:2b` = structure(list(X1 = c("MAC address is: dc:a6:32:21:59:2b",
"IP is: 18.21.129.94")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(X1 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 0), class = "col_spec")), `dc:a6:32:21:59:2b_running` = structure(list(
datetime = ""), class = "data.frame", row.names = c(NA, -1L
))), list(`dc:a6:32:2d:8c:ca` = structure(list(X1 = c("MAC address is: dc:a6:32:2d:8c:ca",
"IP is: 18.21.171.210")), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(X1 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 0), class = "col_spec")), `dc:a6:32:2d:8c:ca_running` = structure(list(
datetime = ""), class = "data.frame", row.names = c(NA, -1L
))), list(`dc:a6:32:2d:b6:c4` = structure(list(X1 = c("MAC address is: dc:a6:32:2d:b6:c4",
"IP is: 18.21.162.74")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(X1 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 0), class = "col_spec")), `dc:a6:32:2d:b6:c4_running` = structure(list(
datetime = structure(1584126667.65542, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -1L), spec = structure(list(
cols = list(datetime = structure(list(format = ""), class = c("collector_datetime",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 0), class = "col_spec"))), list(`dc:a6:32:2d:b8:62` = structure(list(
X1 = c("MAC address is: dc:a6:32:2d:b8:62", "IP is: 18.21.178.96"
)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), spec = structure(list(cols = list(
X1 = structure(list(), class = c("collector_character", "collector"
))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 0), class = "col_spec")), `dc:a6:32:2d:b8:62_running` = structure(list(
datetime = ""), class = "data.frame", row.names = c(NA, -1L
))))
It was a dplyr
issue that got resolved when I went for development version (currently 0.8.99.9002).
Both answers provide what's expected and I think are overall improvements. I think the accepted answer is easier on readability but this is highly subjective. My only worry is that my previous lapply
option would be quite stable overtime while purrr
/dplyr
get cannibalized quite often.
Upvotes: 3
Views: 55
Reputation: 887128
One option would be to loop over the 'ip_list' with map
, bind the columns (bind_cols
), convert the 'datetime' column to DateTime
class as some elements have blank only values, then separate
the 'X1' column (after converting to 'data.frame' with map_dfr
), into two, and reshape into 'wide' format with pivot_wider
library(dplyr)
library(purrr)
library(stringr)
library(lubridate)
map_dfr(ip_list, ~
.x %>%
bind_cols %>%
mutate(datetime = ymd_hms(datetime)), .id = 'grp') %>%
separate(X1, into = c('grp1','val'), sep=" is: ") %>%
mutate(grp1 = word(grp1, 1)) %>%
pivot_wider(names_from = grp1, values_from = val) %>%
select(mac = MAC, ip = IP, datetime)
# A tibble: 4 x 3
# mac ip datetime
# <chr> <chr> <dttm>
#1 dc:a6:32:21:59:2b 18.21.129.94 NA
#2 dc:a6:32:2d:8c:ca 18.21.171.210 NA
#3 dc:a6:32:2d:b6:c4 18.21.162.74 2020-03-13 19:11:07
#4 dc:a6:32:2d:b8:62 18.21.178.96 NA
Upvotes: 1
Reputation: 39858
One option involving dplyr
, tidyr
, purrr
and stringr
could be:
map_dfr(.x = ip_list, ~ .x %>%
bind_rows() %>%
mutate_all(as.character) %>%
pivot_longer(everything(), values_drop_na = TRUE), .id = "ID") %>%
mutate(name = if_else(name == "datetime", name, str_extract(value, "^MAC|^IP")),
value = str_remove(value, ".*: ")) %>%
pivot_wider(names_from = "name", values_from = "value")
ID MAC IP datetime
<chr> <chr> <chr> <chr>
1 1 dc:a6:32:21:59:2b 18.21.129.94 ""
2 2 dc:a6:32:2d:8c:ca 18.21.171.210 ""
3 3 dc:a6:32:2d:b6:c4 18.21.162.74 2020-03-13 19:11:07
4 4 dc:a6:32:2d:b8:62 18.21.178.96 ""
Then, if you need datetime as an actual date-time, with the addition of lubridate
:
map_dfr(.x = ip_list, ~ .x %>%
bind_rows() %>%
mutate_all(as.character) %>%
pivot_longer(everything(), values_drop_na = TRUE), .id = "ID") %>%
mutate(name = if_else(name == "datetime", name, str_extract(value, "^MAC|^IP")),
value = str_remove(value, ".*: ")) %>%
pivot_wider(names_from = "name", values_from = "value") %>%
mutate(datetime = ymd_hms(datetime))
Upvotes: 2