Reputation: 1626
I have a data.table as follows
structure(list(group = c("A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B"),
V1 = c(6.38, 6.38, 6.38, 6.38, -1.53, -24.93, -24.93, -24.93, -24.93, -24.93,
-24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93,
-24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93,
-24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93,
-24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93,
-24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93,
-24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93,
-24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93, -24.93,
-24.93, -24.93, -24.93, -6.8, -6.8, -6.8, -6.8, -6.8, -1.71,
-1.71, -1.71, -1.71, -1.71, -1.06, -1.06, -1.06, -1.06, -1.06,
-1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06,
-1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06,
-1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06, -1.06,
-1.06, -1.06, -1.06, -1.06, -1.06, -1.06, 8.42, 8.42, 8.42, 4.34,
4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34,
4.34, 4.34, 4.34, 4.34, 4.34, 4.34),
V2 = c(0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, -0.11, -0.11, -11, -11, -11, -11, -11,
-11, -11, -11, -11, -11, 1.6, 1.6, 1.6, 1.6, -0.55, -0.55, -0.55,
-2.15, -2.15, -2.15, -2.15, -2.15, -2.15, -2.15, -2.15, -2.15,
-0.19, -0.19, -0.19, -0.19, -0.19, 2.63, 2.63, 2.63, 2.63, 2.63,
2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63, 2.63,
-3.86, -3.86, 0.48, 0.48, 0.48, 0.48, 0.48, 0.48, 0.48, -1.38,
-1.38, -1.38, -1.38, -5.15, -11.58, -11.58, -11.58, -11.58, -11.58,
-11.58, -11.58, -11.58, -11.58, -0.46, -7.32, -7.32, -7.32, -7.32,
-7.32, -7.32, -7.32, -7.32, 2.67, 4.88, 4.88, 4.88, 4.88, 4.88,
4.88, 4.88, 4.88, -11.57, -11.57, -11.57, 1.67, 1.55, 1.55, 2.3,
2.3, 2.3, 2.3, 2.3, 2.3, 2.3, 2.3, -1.42, 21.88, 21.88, 21.88,
21.88, 21.88, 21.88, 21.88, 21.88, 21.88, 21.88, 21.88, 21.88,
21.88, 21.88, 21.88, 21.88, 21.88, 21.88, -0.59, -0.59, -0.59,
-0.59, -0.59, -1.87, -1.87, -1.87)), row.names = c(NA, -138L),
class = c("data.table", "data.frame"))
I want to find the sum of unique values in each column by the group.
I have tried the following which is giving me the sum of all values in each column (but not the unique values).
library(data.table)
dt[, lapply(.SD, sum, na.rm = T), by=group, .SDcols = c("V1", "V2")]
group V1 V2
1: A -1571.53 -88.67
2: B 20.55 245.64
However, I only want to find the sum of unique values.
The answer should look like this -
group V1 V2
1: A -269.38 -12.43
2: B -4.47 27.17
Thanks!
Upvotes: 3
Views: 476
Reputation: 2767
The sum of unique values (independently for each column) can be calculated like this
dt[, .(sum(unique(V1)), sum(unique(V2))), group]
#> group V1 V2
#> 1: A -20.08 -13.83
#> 2: B 3.19 -5.01
The answer you give is the sum of each column where each row is unique, i.e.
unique(dt)[, .(sum(V1), sum(V2)), group]
#> group V1 V2
#> 1: A -269.38 -12.43
#> 2: B -4.47 27.17
and for a named list of columns
unique(dt)[, lapply(.SD, sum), group, .SDcols = c('V1', 'V2')]
or if it is the first version of uniqueing you want
dt[, lapply(.SD, function(x) sum(unique(x))), group, .SDcols = c('V1', 'V2')]
Upvotes: 3