Reputation: 89
I need quite a simple thing. To iterate over columns of a dataset to create percentil versions of said columns. I tried with dplyr and data.table but none seem to do what I need. Particulary, I need to exclude de NA values when creating the percentile versions of the columns.
Reproducible example below:
values<-c(19,
6,
27,
63,
50,
59,
97,
89,
NA,
9,
31,
58,
83,
2,
1,
31,
3,
1,
27,
40,
32,
42,
99,
NA,
12,
16,
23,
98,
44,
25,
13,
70,
64,
NA,
37,
75,
73,
59,
21,
3,
76,
43,
6,
96,
55,
48,
70,
90,
18,
58,
22,
19,
26,
49,
59,
94,
31,
45,
20,
8,
26,
56,
7,
11,
98,
50,
41,
38,
86,
0,
37,
NA,
40,
7,
88,
38,
41,
41,
19,
34,
21,
64,
87,
22,
54,
39,
75,
72,
91,
78)
values2<- c(98,
60,
9,
98,
NA,
88,
NA,
54,
92,
90,
NA,
83,
92,
65,
44,
NA,
98,
40,
26,
40,
54,
56,
15,
90,
15,
63,
57,
NA,
85,
69,
73,
43,
24,
27,
82,
75,
29,
98,
29,
5,
91,
88,
28,
12,
53,
NA,
2,
42,
86,
2,
78,
20,
50,
73,
77,
NA,
4,
39,
90,
NA,
29,
14,
98,
88,
77,
79,
30,
9,
74,
93,
NA,
16,
27,
16,
18,
40,
NA,
2,
66,
71,
82,
10,
62,
84,
25,
NA,
15,
12,
85,
50)
groups<-c(1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2)
df<-as.data.frame(cbind(groups,values,values2))
library(dplyr)
for (i in c("values","values2")) {
df<-df %>%
group_by(groups) %>%
mutate(!!sym(paste( i,"_percentile", sep="")) := percent_rank(na.omit(i)))
}
for (i in c("values","values2")) {
df<-df %>%
group_by(groups) %>%
mutate(!!sym(paste( i,"_percentile", sep="")) := rank(i)/length(i) )
}
library(data.table)
df<- as.data.table(df)
for (i in c("values","values2")) {
df[, paste(i,"_percentile",sep="") := rank(get(i))/length( get(i)), by = groups ]
}
for (i in c("values","values2")) {
df[!is.na(i), paste(i,"_percentile",sep="") := rank(get(i))/length( get(i)), by = groups ]
}
Upvotes: 1
Views: 277
Reputation: 1831
My tidyverse
answer has the same structure as @akrun's -- using mutate_at
to add multiple columns, starts_with
to select the columns. A few things worth pointing out with the more minimal example:
percent_rank
function already removes NA's when it calculates, so you don't have to do the additional work to filter them out of the calc.percent_rank
can return a NaN value there because it's scaling the min_rank
. Inside the direct mutate_at
, that issue seems to be avoided. (It's unclear what value that should be assigned to in your case).percent_rank
's are accordingly not 1.0.library(tidyverse)
df <- tribble(
~groups, ~values1, ~values2,
"a", 1, 10,
"a", 2, 10,
"a", NA, 8,
"a", 3, 9,
"a", 4, 7,
"b", NA, 10,
"b", 2, NA,
"b", NA, 8
)
df %>%
group_by(groups) %>%
mutate_at(
vars(starts_with("values")),
list(percentile = ~ percent_rank(.)))
#> # A tibble: 8 x 5
#> # Groups: groups [2]
#> groups values1 values2 values1_percentile values2_percentile
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 1 10 0 0.75
#> 2 a 2 10 0.333 0.75
#> 3 a NA 8 NA 0.25
#> 4 a 3 9 0.667 0.5
#> 5 a 4 7 1 0
#> 6 b NA 10 NA 1
#> 7 b 2 NA 0 NA
#> 8 b NA 8 NA 0
Upvotes: 0
Reputation: 887108
An option is mutate_at
. After grouping by 'groups', use mutate_at
to loop over the columns that starts_with
('values') as column name, replace
, the values where the values are not NA with the percent_rank
of the non-NA elements
library(dplyr)
df %>%
group_by(groups) %>%
mutate_at(vars(starts_with('values')),
list(percentile = ~ replace(., !is.na(.), percent_rank(.[!is.na(.)]))))
Or with data.table
library(data.table)
nm1 <- paste(names(df1)[2:3], "_percentile")
setDT(df)[, (nm1) := lapply(.SD, function(x) replace(x, !is.na(x),
frank(x[!is.na(x)])/sum(!is.na(x)))), .SDcols = 2:3, by = groups]
Upvotes: 1