ghs101
ghs101

Reputation: 113

Transforming a dataframe R by colum start and end

I have a dataframe as follows.

> df
  condition duration start end
1         A        2     3   4
2         B        3     8  10
3         A        2     7   8

I want to transform the dataframe into a table that gives me a 0 or 1 for each position for each condition. The conditions can overlap at the start and end and occur multiple times; however, the multiple occurrences of a type of condition don't.

It should look like something as follows

> df2
   count A B
1      1 0 0
2      2 0 0
3      3 1 0
4      4 1 0
5      5 0 0
6      6 0 0
7      7 1 0
8      8 1 1
9      9 0 1
10    10 0 1

Upvotes: 2

Views: 74

Answers (3)

Maël
Maël

Reputation: 52069

Using tabulate:

with(df, mapply(seq, start, end)) |>
  tapply(df$condition, unlist) |>
  purrr::map_dfc(tabulate, nbins = m) |>
  transform(count = seq(m))

output

   count     A     B
 1     1     0     0
 2     2     0     0
 3     3     1     0
 4     4     1     0
 5     5     0     0
 6     6     0     0
 7     7     1     0
 8     8     1     1
 9     9     0     1
10    10     0     1

Another tidyverse solution:

library(dplyr)
library(tidyr)
df %>% 
  mutate(count = mapply(`:`, start, end), .keep = "unused") %>% 
  unnest(count) %>% 
  pivot_wider(-duration, names_from = condition, values_from = condition, values_fn = length, values_fill = 0) %>% 
  complete(count = seq(count), fill = list(A = 0, B = 0)) %>% 
  arrange(count)

Upvotes: 3

zx8754
zx8754

Reputation: 56179

Using base: split, lapply, seq:

m <- max(df[, c("start", "end")])

data.frame(c(
  count = list(1:m),
  lapply(split(df, df$condition), function(i){
    out <- rep(0, m)
    out[ unlist(Map(seq, i$start, i$end)) ] <- 1
    out
    })
  ))

#    count A B
# 1      1 0 0
# 2      2 0 0
# 3      3 1 0
# 4      4 1 0
# 5      5 0 0
# 6      6 0 0
# 7      7 1 0
# 8      8 1 1
# 9      9 0 1
# 10    10 0 1

Upvotes: 2

Jon Spring
Jon Spring

Reputation: 66520

library(dplyr); library(tidyr)
df |>
  uncount(duration, .id = "copy") |>
  mutate(row = start + copy - 1) |>
  count(condition, row) |>
  complete(condition, row = 1:max(row), fill = list(n = 0)) |>
  pivot_wider(names_from = condition, values_from = n) 

Result

# A tibble: 10 × 3
     row     A     B
   <dbl> <int> <int>
 1     1     0     0
 2     2     0     0
 3     3     1     0
 4     4     1     0
 5     5     0     0
 6     6     0     0
 7     7     1     0
 8     8     1     1
 9     9     0     1
10    10     0     1

Upvotes: 3

Related Questions