Saurabh
Saurabh

Reputation: 1626

R mean of unique values in data.table

I have a data.table as follows

structure(list(group = c("A", "A", "A", "A", "A", 
                          "A", "A", "A", "A", "A", "A", "A", "A", 
                          "A", "A", "A", "A", "A", "A", "A", "A", 
                          "A", "A", "A", "A", "A", "A", "A", "A", 
                          "A", "A", "A", "A", "A", "A", "A", "A", 
                          "A", "A", "A", "A", "A", "A", "A", "A", 
                          "A", "A", "A", "A", "A", "A", "A", "A", 
                          "A", "A", "A", "A", "A", "A", "A", "A", 
                          "A", "A", "A", "A", "A", "A", "A", "A", 
                          "B", "B", "B", "B", "B", "B", "B", "B", 
                          "B", "B", "B", "B", "B", "B", "B", "B", 
                          "B", "B", "B", "B", "B", "B", "B", "B", 
                          "B", "B", "B", "B", "B", "B", "B", "B", 
                          "B", "B", "B", "B", "B", "B", "B", "B", 
                          "B", "B", "B", "B", "B", "B", "B", "B", 
                          "B", "B", "B", "B", "B", "B", "B", "B", 
                          "B", "B", "B", "B", "B", "B", "B", "B", 
                          "B", "B", "B", "B", "B"), 
               V1 = c(6.38, 6.38, 6.38, 6.38, -1.53, -24.93, -24.93, -24.93, -24.93, -24.93, 
                      -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, 
                      -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, 
                      -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, 
                      -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, 
                      -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, 
                      -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, 
                      -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, 
                      -24.93, -24.93, -24.93, -6.8, -6.8, -6.8, -6.8, -6.8, -1.71, 
                      -1.71, -1.71, -1.71, -1.71, -1.06, -1.06, -1.06, -1.06, -1.06, 
                      -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, 
                      -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, 
                      -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, 
                      -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, 8.42, 8.42, 8.42, 4.34, 
                      4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 
                      4.34, 4.34, 4.34, 4.34, 4.34, 4.34), 
               V2 = c(0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, -0.11, -0.11, -11, -11, -11, -11, -11, 
                      -11, -11, -11, -11, -11, 1.6, 1.6, 1.6, 1.6, -0.55, -0.55, -0.55, 
                      -2.15, -2.15, -2.15, -2.15, -2.15, -2.15, -2.15, -2.15, -2.15, 
                      -0.19, -0.19, -0.19, -0.19, -0.19, 2.63, 2.63, 2.63, 2.63, 2.63, 
                      2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 
                      -3.86, -3.86, 0.48, 0.48, 0.48, 0.48, 0.48, 0.48, 0.48, -1.38, 
                      -1.38, -1.38, -1.38, -5.15, -11.58, -11.58, -11.58, -11.58, -11.58, 
                      -11.58, -11.58, -11.58, -11.58, -0.46, -7.32, -7.32, -7.32, -7.32, 
                      -7.32, -7.32, -7.32, -7.32, 2.67, 4.88, 4.88, 4.88, 4.88, 4.88, 
                      4.88, 4.88, 4.88, -11.57, -11.57, -11.57, 1.67, 1.55, 1.55, 2.3, 
                      2.3, 2.3, 2.3, 2.3, 2.3, 2.3, 2.3, -1.42, 21.88, 21.88, 21.88, 
                      21.88, 21.88, 21.88, 21.88, 21.88, 21.88, 21.88, 21.88, 21.88, 
                      21.88, 21.88, 21.88, 21.88, 21.88, 21.88, -0.59, -0.59, -0.59, 
                      -0.59, -0.59, -1.87, -1.87, -1.87)), row.names = c(NA, -138L), 
          class = c("data.table", "data.frame"))

I want to find the sum of unique values in each column by the group.

I have tried the following which is giving me the sum of all values in each column (but not the unique values).

library(data.table)
dt[, lapply(.SD, sum, na.rm = T), by=group, .SDcols = c("V1", "V2")]

   group       V1     V2
1:     A -1571.53 -88.67
2:     B    20.55 245.64

However, I only want to find the sum of unique values.

The answer should look like this -

group       V1     V2
1:     A  -269.38 -12.43
2:     B    -4.47  27.17

Thanks!

Upvotes: 3

Views: 476

Answers (1)

pseudospin
pseudospin

Reputation: 2767

The sum of unique values (independently for each column) can be calculated like this

dt[, .(sum(unique(V1)), sum(unique(V2))), group]
#>    group     V1     V2
#> 1:     A -20.08 -13.83
#> 2:     B   3.19  -5.01

The answer you give is the sum of each column where each row is unique, i.e.

unique(dt)[, .(sum(V1), sum(V2)), group]
#>    group      V1     V2
#> 1:     A -269.38 -12.43
#> 2:     B   -4.47  27.17

and for a named list of columns

unique(dt)[, lapply(.SD, sum), group, .SDcols = c('V1', 'V2')]

or if it is the first version of uniqueing you want

dt[, lapply(.SD, function(x) sum(unique(x))), group, .SDcols = c('V1', 'V2')]

Upvotes: 3

Related Questions