Reputation: 427

Within rows of data frame, find first occurrence and longest sequence of value

Consider this data frame, which provides the scored responses on a 15-item test for 10 individuals:

library(tidyverse)
input <- tribble(
  ~ID, ~i1, ~i2, ~i3, ~i4, ~i5, ~i6, ~i7, ~i8, ~i9, ~i10, ~i11, ~i12, ~i13, ~i14, ~i15,
  "A", 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
  "B", 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
  "C", 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  "D", 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
  "E", 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
  "F", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  "G", 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
  "H", 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
  "I", 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  "J", 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1
)

I want R to go row-by-row, and scan the cells in each row from left to right, in order to create these new columns:

first_0_name: returns the column name of the cell containing the first occurrence of the value 0

first_0_loc: returns the column location of the cell containing the first occurrence of the value 0

streak_1: starting from the first occurrence of 0, find the next occurrence of 1, and then count how many consecutive 1 before the next occurrence of 0.

The new columns should appear as below

new_cols <- tribble(
  ~first_0_name, ~first_0_loc, ~streak_1,
  "i9", 10, 5,
  "i4", 5, 4,
  "i6", 7, 8,
  "i8", 9, 4,
  "i9", 10, 5,
  NA, NA, NA,
  "i1", 2, 5,
  "i3", 4, 8,
  "i2", 3, NA,
  "i1", 2, 1
)

Thanks in advance for any help!

Upvotes: 3

Answers (3)

akrun

Reputation: 887621

An option using melt from data.table

library(data.table)
melt(setDT(input), id.var = 'ID')[, .(first_o_name = first(variable[value == 0]), 
    first_o_loc = which(value == 0)[1] +1,
    streak_1 = sum(cumsum(c(TRUE, diff(value == 0) < 0)) == 2) - 1 ),  ID
     ][streak_1 < 0, streak_1 := NA_real_][]

A base R option can also be with apply and rle

do.call(rbind, apply(input[-1], 1, function(x) {
       first_o_loc <- unname(which(x == 0)[1] + 1)
       first_o_name <- names(x)[first_o_loc-1]
       rl <- rle(x)
       rl1 <- within.list(rl,  {
             i1 <- cumsum(values == 0) == 1
             values <- values[i1]
             lengths <- lengths[i1]})
        streak_1 <- unname(rl1$lengths[2])
      data.frame(first_o_name, first_o_loc, streak_1)}))
#   first_o_name first_o_loc streak_1
#1            i9          10        5
#2            i4           5        4
#3            i6           7        8
#4            i8           9        4
#5            i9          10        5
#6          <NA>          NA       NA
#7            i1           2        5
#8            i3           4        8
#9            i2           3       NA
#10           i3           4        2

Upvotes: 0

Jon Spring

Reputation: 66835

Edit #2: Rewrote as combination of two summarizations.

input_tidy <- input %>%
  gather(col, val, -ID) %>%
  group_by(ID) %>%
  arrange(ID) %>%
  mutate(col_num = row_number() + 1) 

input[,1] %>% 
  # Combine with summary of each ID's first zero
  left_join(input_tidy %>% filter(val == 0) %>%
              summarize(first_0_name = first(col),
                        first_0_loc = first(col_num))) %>%
  # Combine with length of each ID's first post-0 streak of 1's
  left_join(input_tidy %>%
              filter(val == 1 & cumsum(val == 1 & lag(val, default = 1) == 0) == 1) %>% 
              summarize(streak_1 = n()))


# A tibble: 10 x 4
   ID    first_0_name first_0_loc streak_1
   <chr> <chr>              <dbl>    <int>
 1 A     i9                    10        5
 2 B     i4                     5        4
 3 C     i6                     7        8
 4 D     i8                     9        4
 5 E     i9                    10        5
 6 F     NA                    NA       NA
 7 G     i1                     2        5
 8 H     i3                     4        8
 9 I     i2                     3       NA
10 J     i3                     4        2

Upvotes: 0

jdobres

Reputation: 11957

If you wanted to use base R a little more directly and avoid the cost of transforming the whole data frame. This solution also retains the order of rows without having to create extra ordering columns (unlike the tidyverse solution).

results <- apply(input, 1, function(x) {

  # get indices of all zeros
  zeros <- which(x == 0)

  # exit early if no zeros are found
  if (length(zeros) == 0) {
    return(data.frame(first_0_name = NA, first_0_loc = NA, streak_1 = NA))
  }

  first.name <- names(zeros[1])         # name of first 0 column
  first.idx <- zeros[1]                 # location of first zero
  longest.streak <- diff(zeros)[1] - 1  # length of first 0-0 streak

  return(data.frame(first_0_name = first.name, 
                    first_0_loc = first.idx, 
                    streak_1 = ifelse(longest.streak == 0, NA, longest.streak))
         )

})

output <- do.call(rbind, results)

    first_0_name first_0_loc streak_1
i9            i9          10        5
i4            i4           5        4
i6            i6           7        8
i8            i8           9       NA
i91           i9          10        5
1           <NA>          NA       NA
i1            i1           2        5
i3            i3           4        8
i2            i2           3       NA
i31           i3           4        2

Upvotes: 1

Within rows of data frame, find first occurrence and longest sequence of value

Answers (3)

Related Questions