Reputation: 515
Suppose I have a data.frame or tibble. This object has several columns. Some columns are (A
, B
, C
) are means, and other columns are standard deviations (A.sd
, B.sd
, C.sd
).
df <-
data.frame(
A=c(1,2,3),
A.sd=c(0.3, 0.2, 0.1),
B=c(20,2,34),
B.sd=c(2.1, 5.2, 5.1),
C=c(14,26,13),
C.sd=c(1.3, 0.7, 4.5)
)
Now I want to calculate, the coefficient of variation (sd/mean) (this would be df$A.cv = df$A.sd/df$A
, and so on). I could do it one by one. But I would like to know if maybe the tidyverse
provides a more automatic way to do this. Some way to match the "mean" columns with the "sd" columns to calculate the "cv" columns.
Upvotes: 0
Views: 154
Reputation: 47320
using tidyverse
and split.default
:
df %>%
split.default(substr(names(.),1,1)) %>%
map_dfc(~mutate(., !!paste0(names(.)[1],".cv") := .[[2]]/.[[1]]))
# A A.sd A.cv B B.sd B.cv C C.sd C.cv
# 1 1 0.3 0.30000000 20 2.1 0.105 14 1.3 0.09285714
# 2 2 0.2 0.10000000 2 5.2 2.600 26 0.7 0.02692308
# 3 3 0.1 0.03333333 34 5.1 0.150 13 4.5 0.34615385
paste0(names(.)[1],".cv")
(A.cv
etc) for each data frame and binds everything together.In base R :
df_list <- unname(split.default(df,substr(names(df),1,1)))
add_cv <- function(x) `[[<-`(x, paste0(names(x)[1], ".cv"), value = x[[2]] / x[[1]])
do.call(cbind, lapply(df_list, add_cv))
# A A.sd A.cv B B.sd B.cv C C.sd C.cv
# 1 1 0.3 0.30000000 20 2.1 0.105 14 1.3 0.09285714
# 2 2 0.2 0.10000000 2 5.2 2.600 26 0.7 0.02692308
# 3 3 0.1 0.03333333 34 5.1 0.150 13 4.5 0.34615385
base R again splitting differently :
df_list <- split.default(df, endsWith(names(df),".sd"))
cbind(df, setNames(df_list[[2]] / df_list[[1]], paste0(names(df_list[[1]]), ".cv")))
# A A.sd B B.sd C C.sd A.cv B.cv C.cv
# 1 1 0.3 20 2.1 14 1.3 0.30000000 0.105 0.09285714
# 2 2 0.2 2 5.2 26 0.7 0.10000000 2.600 0.02692308
# 3 3 0.1 34 5.1 13 4.5 0.03333333 0.150 0.34615385
Upvotes: 1
Reputation: 11480
Simply do:
IND <- rep(seq(1:(ncol(df1)/2)),each=2)
df1[paste0(names(df1)[!duplicated(IND,F)], ".cv")] <- lapply(split(as.data.frame(t(df1)), IND), function(x)c(t(x)[,2]/t(x)[,1]))
# A A.sd B B.sd C C.sd A.cv B.cv C.cv
#1 1 0.3 20 2.1 14 1.3 0.30000000 0.105 0.09285714
#2 2 0.2 2 5.2 26 0.7 0.10000000 2.600 0.02692308
#3 3 0.1 34 5.1 13 4.5 0.03333333 0.150 0.34615385
please note:
Base
solution - no third party package needed.If you want to be dependant on the names, you can use a simple for loop:
# name_vec <- LETTERS[1:3]
name_vec <- names(df1)[grepl("^[^.]+$",names(df1))]
for( name_el in name_vec) {
df1[paste0(name_el, ".cv")] <- df1[[paste0(name_el, ".sd")]]/df1[[name_el]]
}
Upvotes: 0
Reputation: 50678
A canonical & parsimonious approach would be to reshape from wide to long, calculate the CVs and reshape again from long to wide (if necessary).
library(tidyverse)
df %>%
rowid_to_column("row") %>%
gather(key, value, -row) %>%
mutate(key = str_replace(key, "^([A-Z])$", "\\1.mean")) %>%
separate(key, c("var", "col")) %>%
spread(col, value) %>%
transmute(row, var = paste0(var, ".cv"), cv = sd / mean) %>%
spread(var, cv)
# row A.cv B.cv C.cv
#1 1 0.30000000 0.105 0.09285714
#2 2 0.10000000 2.600 0.02692308
#3 3 0.03333333 0.150 0.34615385
This approach is also independent of the order the mean/sd columns are in.
EDIT BY OP:
df %>%
rowid_to_column("row") %>%
gather(key, value, -row) %>%
mutate(key = str_replace(key, "^([A-Z])$", "\\1.mean")) %>%
separate(key, c("var", "col")) %>%
spread(col, value) %>%
transmute(row, var = paste0(var, ".cv"), cv = sd / mean) %>%
spread(var, cv) %>%
bind_cols(df, .) %>%
select(-row)
That way the result is in the same data frame and without the "row" column.
Upvotes: 0
Reputation: 4989
> cv
A.cv B.cv C.cv
1 0.30000000 0.105 0.09285714
2 0.10000000 2.600 0.02692308
3 0.03333333 0.150 0.34615385
Obviously super-hacky and lots of room for optimization, but probably achieves your goal.
cv <- data.frame()
counter <- 0
for (i in 1:ncol(df))(
if (grepl("sd$", colnames(df)[i]) == TRUE){
counter <- counter + 1
for (j in 1:nrow(df))(
cv[j, counter] <- df[j, i]/df[j, i-1]
)
names(cv)[counter] <- paste0(colnames(df)[i-1],".cv")
}
)
Upvotes: 0
Reputation: 3926
You can do something like this:
library(tidyverse)
df %<>% mutate(A.cv=A.sd/A,
B.cv=B.sd/B,
C.cv=C.sd/C)
A better solution is posed below.
Upvotes: 0
Reputation: 26343
You could split your data column-wise (split.default
) by the first letter of names(df)
and then use imap
to generate the cv
column.
library(tidyverse)
split.default(df, f = substr(names(df), 1, 1)) %>%
imap(.x = ., ~ mutate(., cv = .x[, paste0(.y, ".sd")] / .x[, .y])) %>%
imap(., ~ set_names(., nm = paste0(.y, c("", ".sd", ".cv")))) %>% # rename the columns
bind_cols()
# A A.sd A.cv B B.sd B.cv C C.sd C.cv
#1 1 0.3 0.30000000 20 2.1 0.105 14 1.3 0.09285714
#2 2 0.2 0.10000000 2 5.2 2.600 26 0.7 0.02692308
#3 3 0.1 0.03333333 34 5.1 0.150 13 4.5 0.34615385
imap
is handy here because it lets you iterate of the list and over the names of that list (the .y
in the code) easily.
The second imap
call is needed here because strangly this gives an error
split.default(df, f = substr(names(df), 1, 1)) %>%
imap(.x = ., ~ mutate(., paste0(.y, ".cv") = .x[, paste0(.y, ".sd")] / .x[, .y]))
Same idea but in base R
lst <- split.default(df, f = substr(names(df), 1, 1))
Reduce(cbind, Map(
function(x, y)
`[<-`(x, paste0(y, ".cv"), value = x[, paste0(y, ".sd")] / x[, y]),
x = lst,
y = names(lst)
))
Upvotes: 1
Reputation: 26
I suggest the following transformations:
df %>%
# Adding counter
mutate(n = 1:n()) %>%
# Converting to long format
gather("key", "value", -n) %>%
# Adding variable that distinguishes SD and mean
mutate(type = ifelse(grepl("\\.sd$", key), "SD", "mean"),
item = sub("(\\w).*", "\\1", key), # A, B, or C
case = paste(item, n)) %>% # e.g., A 1, B 2, etc.
select(n, value, type, case) %>%
# Conversion back to wide format
spread("type", "value") %>%
# Calculating COV
mutate(COV = mean / SD)
Upvotes: 0
Reputation: 1205
Using your data df
you can use the dplyr
function ends_with()
to split the dataset into two, convert to long and bind again:
library(tidyverse)
df <-
data.frame(
A=c(1,2,3),
A.sd=c(0.3, 0.2, 0.1),
B=c(20,2,34),
B.sd=c(2.1, 5.2, 5.1),
C=c(14,26,13),
C.sd=c(1.3, 0.7, 4.5)
)
sds <- select(df, ends_with(".sd")) %>%
gather() %>%
rename(sd = value) %>%
select(sd)
means <- select(df, -ends_with(".sd")) %>%
gather() %>%
rename(mean = value)
df_n <- bind_cols(means, sds)
df_n <- mutate(df_n, cv = sd/mean)
Upvotes: 0
Reputation: 16121
Here's an alternative tidyverse
version:
df <-
data.frame(
A=c(1,2,3),
A.sd=c(0.3, 0.2, 0.1),
B=c(20,2,34),
B.sd=c(2.1, 5.2, 5.1),
C=c(14,26,13),
C.sd=c(1.3, 0.7, 4.5)
)
library(tidyverse)
{df %>% select(matches("sd")) / df %>% select(-matches("sd"))} %>%
setNames(gsub("sd", "cv", names(.))) %>%
bind_cols(df, .)
# A A.sd B B.sd C C.sd A.cv B.cv C.cv
# 1 1 0.3 20 2.1 14 1.3 0.30000000 0.105 0.09285714
# 2 2 0.2 2 5.2 26 0.7 0.10000000 2.600 0.02692308
# 3 3 0.1 34 5.1 13 4.5 0.03333333 0.150 0.34615385
Note that you have to make sure your columns are in the right order in your original dataset.
Upvotes: 0
Reputation: 429
If you turn it into a long DF Something like this is relatively easy:
library(tidyverse)
df <- data.frame(
groups = rep(c("A", "B", "C"), each = 3),
means = c(1, 2, 3, 20, 2, 34, 14, 26, 13),
sd = c(0.3, 0.2, 0.1, 2.1, 5.2, 5.1, 1.3, 0.7, 4.5)
)
df <- df %>% mutate(
cv = (sd / means)
)
Upvotes: 0