Reputation: 59
I have a large list of dataframes of the following structure:
foo <- 1:5
lorem1968 <- c(6, NA, NA, 8, NA)
lorem1969 <- c(NA, 17, NA, 19, 20)
df1 <- data.frame(foo, lorem1968, lorem1969)
ipsum <- 11:15
lorem1970 <- c(22, NA, 24, NA, NA)
df2 <- data.frame(ipsum, lorem1969, lorem1970)
df.list <- list(df1, df2)
[[1]]
foo lorem1968 lorem1969
1 1 6 NA
2 2 NA 17
3 3 NA NA
4 4 8 19
5 5 NA 20
[[2]]
ipsum lorem1969 lorem1970
1 11 NA 22
2 12 17 NA
3 13 NA 24
4 14 19 NA
5 15 20 NA
I would like now to iterate over all columns named loremxxxx and replace all NA's there with 0. Then, I would like to create a new column in each df which contains the average of all loremxxxx columns contained in that specific df.
The problem is that these are overlapping panels in the original data, so any df1 contains lorem1968, lorem1969, lorem1970. df2 contains lorem1969, 1970, 1971. And so on.
I tried to select the columns like this:
lorem.cols <- purrr::map(panels.list, function(x)
select(x, starts_with("lorem"))
)
and also:
lorem.cols <- purrr::map(df.list, function(data)
data %>% select(data, starts_with("lorem"))
)
but both threw an error of either not finding the function or of giving me "Selection:" and waiting for input. Just tried to copy from the help page of the select()
function.
After I planned on replacing NAs like so:
df.list <- purrr::map(df.list, function(data)
data %>% mutate(lorem.cols = replace(is.na(lorem.cols), 0))
)
Thanks guys!
Upvotes: 4
Views: 150
Reputation: 25225
Another option is to use rowSums
to save some time on the conversion of NAs to 0:
lapply(df.list, function(x) {
i1 <- grep("^lorem\\d+$", names(x))
transform(x, avg = rowSums(x[i1], na.rm=TRUE) / ncol(x[i1]))
})
timing code:
set.seed(0L)
ndf <- 1e4
nr <- 1e4
nc <- 2
df.list <- replicate(ndf,
data.frame(id=1:nr, matrix(sample(c(1, NA_real_), nr*nc, TRUE), ncol=nc)),
simplify=FALSE)
mtd0 <- function() {
lapply(df.list, function(x) {
i1 <- grep("^X\\d+$", names(x))
x[i1] <- replace(x[i1], is.na(x[i1]), 0)
transform(x, avg = rowMeans(x[i1], na.rm = TRUE))
})
}
mtd2 <- function() {
lapply(df.list, function(x) {
i1 <- grep("^X\\d+$", names(x))
transform(x, avg = rowSums(x[i1], na.rm=TRUE) / ncol(x[i1]))
})
}
bench::mark(mtd0(), mtd2(), check=FALSE)
timings:
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 35.51s 35.51s 0.0282 7.83GB 0.422 1 15 35.51s <list [10,000]> <df[,3] [151,107 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd2() 8.91s 8.91s 0.112 2.98GB 1.12 1 10 8.91s <list [10,000]> <df[,3] [30,314 x 3]> <bch:tm> <tibble [1 x 3]>
Upvotes: 2
Reputation: 552
Assuming that there are no NAs in any other columns than those beginning with lorem, you could do the following
lapply(df.list, function(df) {
df[is.na(df)] <- 0
df$mean <- apply(df[, grep("lorem", names(df))], 1, mean)
return (df)
})
# [[1]]
# foo lorem1968 lorem1969 mean
# 1 1 6 0 3.0
# 2 2 0 17 8.5
# 3 3 0 0 0.0
# 4 4 8 19 13.5
# 5 5 0 20 10.0
#
# [[2]]
# ipsum lorem1969 lorem1970 mean
# 1 11 0 22 11.0
# 2 12 17 0 8.5
# 3 13 0 24 12.0
# 4 14 19 0 9.5
# 5 15 20 0 10.0
Following @akrun answer you can use rowMeans
instead of apply(df[, grep("lorem", names(df))], 1, mean)
, i.e.
lapply(df.list, function(df) {
df[is.na(df)] <- 0
df$mean <- rowMeans(df[, grep("lorem", names(df))])
return (df)
})
Upvotes: 1
Reputation: 887158
We can use base R
. Loop through the list
with lapply
, use grep
to find the index of the column names that match 'lorem' followed by one or more digits, replace
the NA
s in those columns with 0, and transform
the original dataset in the list
to create a new column 'avg' by getting the mean
of those 'lorem' columns
lapply(df.list, function(x) {
i1 <- grep("^lorem\\d+$", names(x))
x[i1] <- replace(x[i1], is.na(x[i1]), 0)
transform(x, avg = rowMeans(x[i1], na.rm = TRUE))
})
#[[1]]
# foo lorem1968 lorem1969 avg
#1 1 6 0 3.0
#2 2 0 17 8.5
#3 3 0 0 0.0
#4 4 8 19 13.5
#5 5 0 20 10.0
#[[2]]
# ipsum lorem1969 lorem1970 avg
#1 11 0 22 11.0
#2 12 17 0 8.5
#3 13 0 24 12.0
#4 14 19 0 9.5
#5 15 20 0 10.0
Upvotes: 2
Reputation: 11255
Here's a data.table approach that relies on data.table
update-by-reference that holds true in lapply()
calls as well.
library(data.table)
lapply(df.list, setDT)
lapply(df.list,
function(dt) {
cols <- grep('^lorem', names(dt))
setnafill(dt, fill = 0L, cols = cols)
dt[, mean_lorem := rowMeans(.SD), .SDcols = cols]
})
#> [[1]]
#> foo lorem1968 lorem1969 mean_lorem
#> 1: 1 6 0 3.0
#> 2: 2 0 17 8.5
#> 3: 3 0 0 0.0
#> 4: 4 8 19 13.5
#> 5: 5 0 20 10.0
#>
#> [[2]]
#> ipsum lorem1969 lorem1970 mean_lorem
#> 1: 11 0 22 11.0
#> 2: 12 17 0 8.5
#> 3: 13 0 24 12.0
#> 4: 14 19 0 9.5
#> 5: 15 20 0 10.0
Upvotes: 2
Reputation: 39858
With dplyr
, tidyr
and purrr
, you can do:
map(df.list, ~ select_at(.x, vars(contains("lorem"))) %>%
mutate_all(~ replace_na(., 0)) %>%
mutate(avg = rowMeans(.)))
[[1]]
lorem1968 lorem1969 avg
1 6 0 3.0
2 0 17 8.5
3 0 0 0.0
4 8 19 13.5
5 0 20 10.0
[[2]]
lorem1969 lorem1970 avg
1 0 22 11.0
2 17 0 8.5
3 0 24 12.0
4 19 0 9.5
5 20 0 10.0
If you actually want to keep also the other columns:
map(df.list, ~ mutate_at(.x, vars(contains("lorem")), ~ replace_na(., 0)) %>%
mutate(avg = rowMeans(select(., starts_with("lorem")))))
Upvotes: 1
Reputation: 39595
You could try something like this:
foo <- 1:5
lorem1968 <- c(6, NA, NA, 8, NA)
lorem1969 <- c(NA, 17, NA, 19, 20)
df1 <- data.frame(foo, lorem1968, lorem1969)
ipsum <- 11:15
lorem1970 <- c(22, NA, 24, NA, NA)
df2 <- data.frame(ipsum, lorem1969, lorem1970)
df.list <- list(df1, df2)
#Create function
replace_f <- function(x)
{
#Replace NA by 0
x[is.na(x)] <- 0
#Compute mean
#Variable selection
index <- which(grepl("lorem",names(x)))
x$Avg <- apply(x[,index],1,mean)
return(x)
}
df.list2 <- lapply(df.list,replace_f)
df.list2
[[1]]
foo lorem1968 lorem1969 Avg
1 1 6 0 3.0
2 2 0 17 8.5
3 3 0 0 0.0
4 4 8 19 13.5
5 5 0 20 10.0
[[2]]
ipsum lorem1969 lorem1970 Avg
1 11 0 22 11.0
2 12 17 0 8.5
3 13 0 24 12.0
4 14 19 0 9.5
5 15 20 0 10.0
Upvotes: 0