ail

Reputation: 149

Append first row by group to original data

I have data with a grouping variable "ID" and some values:

ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6 
2, 7
2, 8

Within each group, I want to append the first row after the last row. Desired result:

ID, Value
1, 1
1, 2
1, 3
1, 4
1, 1 # First row of ID 1 inserted as last row in the group
2, 5
2, 6 
2, 7
2, 8
2, 5 # First row of ID 2 inserted as last row in the group

I have 5000 rows of this.
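
For reference, this example can be built directly in R (a small sketch; the data frame name is just a placeholder):

df <- data.frame(ID = rep(1:2, each = 4), Value = 1:8)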

Upvotes: 4

Views: 226

Answers (5)

Henrik

Reputation: 67778

Use !duplicated to get the first row of each "ID" (more efficient than "by group" operations). rbind with the original data and order the result:

df = data.frame(ID = rep(1:2, each = 4), Value = 1:8)

d2 = rbind(df, df[!duplicated(df$ID), ])
d2[order(d2$ID), ]
#   ID Value
# 1   1     1
# 2   1     2
# 3   1     3
# 4   1     4
# 11  1     1
# 5   2     5
# 6   2     6
# 7   2     7
# 8   2     8
# 51  2     5

Same idea with data.table::duplicated:

library(data.table)
d = as.data.table(df)
d2 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
setorder(d2, ID)

More straightforward with data.table::unique:

d2 = rbindlist(list(d, unique(d, by = "ID")))
setorder(d2, ID)

Also data.table::rowid:

d2 = rbindlist(list(d, d[rowid(ID) == 1]))
setorder(d2, ID)

By avoiding "by group" operations, the !duplicated, unique and rowid alternatives are all faster than the by-based data.table solution (the clear winner in the benchmark in another answer) on a 5000-row data set (see OP):

df = data.frame(ID = rep(1:1250, each = 4), Value = 1:5e3)
d = as.data.table(df)
library(microbenchmark)

microbenchmark(
  f_by = {
    d1 = d[ , .(Value = c(Value, first(Value))), by = ID]
  },
  f_dupl_df = {
    d2 = rbind(df, df[!duplicated(df$ID), ])
    d2 = d2[order(d2$ID), ]
  },
  f_dupl_dt = {
    d3 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
    setorder(d3, ID)
  },
  f_uniq_dt = {
    d4 = rbindlist(list(d, unique(d, by = "ID")))
    setorder(d4, ID)
  },
  f_rowid = {
    d5 = rbindlist(list(d, d[rowid(ID) == 1]))
    setorder(d5, ID)
  },
  times = 10L)
# Unit: milliseconds
#       expr    min     lq     mean  median      uq     max
#      f_by 8.5167 9.1397 11.01410 9.90925 12.3327 15.9134
# f_dupl_df 6.8337 7.0901  8.31057 7.56810  8.4899 13.9278
# f_dupl_dt 2.4742 2.6687  3.24932 3.18670  3.7993  4.3318
# f_uniq_dt 2.2059 2.4225  3.50756 3.36250  4.4590  5.6632
#   f_rowid 2.2963 2.4295  3.43876 2.74345  4.8035  5.9278

all.equal(d1, as.data.table(d2))
all.equal(d1, d3)
all.equal(d1, d4)
all.equal(d1, d5)
# [1] TRUE

However, a sub-second benchmark isn't very informative, so rerun the same benchmark on larger data with many groups. The base solution loses ground, while the data.table::duplicated, unique and rowid alternatives scale better and are now about 20 times faster than the by solution, with data.table::unique the fastest.

df = data.frame(ID = rep(1:1250000, each = 4), Value = 1:5e6)
d = as.data.table(df)

# Unit: milliseconds
#      expr        min         lq       mean     median         uq        max neval
#      f_by  6834.5959  7157.1686 12273.2399  7775.3919  8850.5324 35339.0262    10
# f_dupl_df 10732.1536 11035.4886 19440.4964 11691.5347 37956.6961 38387.4927    10
# f_dupl_dt   174.5640   183.8399   391.8605   381.8920   401.4929   962.4948    10
# f_uniq_dt   156.1267   161.9555   212.3472   180.7912   209.3905   406.7780    10
#   f_rowid   192.1106   197.1564   380.0023   234.5851   474.5024  1172.6529    10

For completeness, a binary search, with mult = "first" to select the first match:

d[.(unique(ID)), on = .(ID), mult = "first"]
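
To get the full appended result from this, the selected first rows can be combined with the original data using the same rbindlist/setorder pattern as above (a sketch):

d2 = rbindlist(list(d, d[.(unique(ID)), on = .(ID), mult = "first"]))
setorder(d2, ID)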

However, in both timings above, it took about twice as long as the unique alternative.

Upvotes: 4

harre

Reputation: 7277

Within the tidyverse, you could use add_row with do (now deprecated) or group_modify (experimental):

dat |>
  group_by(ID) |>
  do(add_row(., ID = unique(.$ID), Value = first(.$Value))) |>
  ungroup()

dat |>
  group_by(ID) |>
  group_modify(~ add_row(., Value = first(.$Value))) |>
  ungroup()

Or bind_rows with summarize (my variation of @Gregor Thomas, thanks):

dat |> 
  group_by(ID) |>
  summarize(bind_rows(cur_data(), head(cur_data(), 1))) |>
  ungroup()

Or, applying the same logic as @Henrik, using bind_rows, filter, and arrange:

dat |>
  bind_rows(dat |> filter(!duplicated(ID))) |>
  arrange(ID)

Output:

# A tibble: 10 × 2
      ID Value
   <int> <dbl>
 1     1     1
 2     1     2
 3     1     3
 4     1     4
 5     1     1
 6     2     5
 7     2     6
 8     2     7
 9     2     8
10     2     5

Thanks to @SamR for the data.

Upvotes: 4

jblood94

Reputation: 16981

And with data.table:

library(data.table)

dt <- data.table(ID = rep(1:2, each = 4), Value = 1:8)
dt[,.(Value = c(Value, first(Value))), ID]
#>     ID Value
#>  1:  1     1
#>  2:  1     2
#>  3:  1     3
#>  4:  1     4
#>  5:  1     1
#>  6:  2     5
#>  7:  2     6
#>  8:  2     7
#>  9:  2     8
#> 10:  2     5

Benchmarking with a 5000-row table:

library(dplyr)
library(tibble) # for add_row()

dt <- data.table(ID = rep(1:1250, each = 4), Value = 1:5e3)

f1 <- function(dt) dt[,.(Value = c(Value, first(Value))), ID]
# base R
f2 <- function(dt) do.call(rbind, lapply(split(dt, by = "ID"), function(x) rbind(x, x[1,])))
# tidyverse
f3 <- function(dt) {
  dt %>%
    group_by(ID) %>%
    do(add_row(., ID = unique(.$ID), Value = first(.$Value))) %>%
    ungroup()
}
f4 <- function(dt) {
  dt %>%
    group_by(ID) %>%
    group_modify(~ add_row(., Value = first(.$Value))) %>%
    ungroup()
}

microbenchmark::microbenchmark(data.table = f1(dt),
                               "base R" = f2(dt),
                               tidyverse1 = f3(dt),
                               tidyverse2 = f4(dt),
                               times = 10)
#> Unit: milliseconds
#>        expr      min       lq      mean   median       uq      max neval
#>  data.table   3.4989   3.6844   4.16619   4.3085   4.4623   5.3020    10
#>      base R 245.1397 263.1636 283.61131 284.8053 307.7105 310.3901    10
#>  tidyverse1 761.7097 773.3705 791.05115 787.9463 808.5416 821.5321    10
#>  tidyverse2 711.9593 716.4959 752.20273 728.2170 782.6474 837.1926    10

If speed is really important, this simple Rcpp function provides a very fast solution. It takes the grouping vector and returns the row indices in order, with each group's first index repeated after that group's last row:

Rcpp::cppFunction(
  "IntegerVector firstLast(const IntegerVector& x) {
    // x is the (grouped) ID vector; the result holds 1-based row indices,
    // with each group's first index repeated after the group's last row
    const int n = x.size();
    IntegerVector idxOut(2*n);
    int i0 = 1;      // 1-based index of the current group's first row
    int idx = 0;
    idxOut(0) = 1;
    
    for (int i = 1; i < n; i++) {
      if (x(i) != x(i - 1)) { // new group: append the previous group's first row
        idxOut(++idx) = i0;
        i0 = i + 1;
      }
      idxOut(++idx) = i + 1;
    }
    idxOut(++idx) = i0;       // append the last group's first row
    return idxOut[Rcpp::Range(0, idx)];
  }"
)
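
For illustration, applied to the ID column of the small example table at the top of this answer, the returned indices select every row of each group followed by that group's first row again (a quick sketch, output traced from the function logic):

dt <- data.table(ID = rep(1:2, each = 4), Value = 1:8)
firstLast(dt$ID)
#> [1] 1 2 3 4 1 5 6 7 8 5
dt[firstLast(dt$ID)]  # same 10-row result as the grouped call above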

Benchmarking against the fastest solution from @Henrik's answer (on a much larger dataset):

dt = data.table(ID = rep(1:125e4, each = 4), Value = 1:5e6)

microbenchmark::microbenchmark(
  f_uniq_dt = setorder(rbindlist(list(dt, unique(dt, by = "ID"))), ID),
  f_Rcpp = dt[firstLast(dt$ID)],
  check = "equal"
)
#> Unit: milliseconds
#>      expr     min       lq     mean   median       uq      max neval
#> f_uniq_dt 78.6056 83.71345 95.42876 85.80720 90.03685 175.8867   100
#>    f_Rcpp 49.1485 53.38275 60.96322 55.44925 58.01485 121.3637   100

Upvotes: 4

ThomasIsCoding

Reputation: 101064

You can try the following data.table option:

> setDT(df)[, .SD[(seq(1 + .N) - 1) %% .N + 1], ID]
    ID Value
 1:  1     1
 2:  1     2
 3:  1     3
 4:  1     4
 5:  1     1
 6:  2     5
 7:  2     6
 8:  2     7
 9:  2     8
10:  2     5
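
The index expression builds, for each group of size .N, the sequence 1, 2, ..., .N, 1, i.e. all rows of the group followed by its first row again. For a group of 4 rows:

> (seq(1 + 4) - 1) %% 4 + 1
[1] 1 2 3 4 1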

Upvotes: 1

SamR

Reputation: 20240

Here is a base R method:

do.call(rbind, 
    lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))
) 
#      ID Value
# 1.1   1     1
# 1.2   1     2
# 1.3   1     3
# 1.4   1     4
# 1.5   1     1
# 2.5   2     5
# 2.6   2     6
# 2.7   2     7
# 2.8   2     8
# 2.51  2     5

It does give you slightly strange row names - if you care about that you can wrap it in (or pipe it to) tibble::as_tibble(), which removes row names altogether.
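
A small sketch of both clean-ups (res is just a placeholder name; resetting the row names in base R is an alternative to as_tibble()):

res <- do.call(rbind, 
    lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))
)
rownames(res) <- NULL   # reset to the default 1..n row names
tibble::as_tibble(res)  # or drop row names altogether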

Alternatively you could do data.table::rbindlist(lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))), because data.table also does not use row names.

Data

dat  <- read.csv(text = "ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6 
2, 7
2, 8", h=T)

Upvotes: 1
