Reputation: 427
Consider this data frame, which provides the scored responses on a 15-item test for 10 individuals:
library(tidyverse)
input <- tribble(
~ID, ~i1, ~i2, ~i3, ~i4, ~i5, ~i6, ~i7, ~i8, ~i9, ~i10, ~i11, ~i12, ~i13, ~i14, ~i15,
"A", 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
"B", 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
"C", 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
"D", 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
"E", 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
"F", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
"G", 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
"H", 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
"I", 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
"J", 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1
)
I want R to go row-by-row, and scan the cells in each row from left to right, in order to create these new columns:
first_0_name
: returns the column name of the cell containing the first occurrence of the value 0
first_0_loc
: returns the column location of the cell containing the first occurrence of the value 0
streak_1
: starting from the first occurrence of 0
, find the next occurrence of 1
, and then count how many consecutive 1
before the next occurrence of 0
.
The new columns should appear as below
new_cols <- tribble(
~first_0_name, ~first_0_loc, ~streak_1,
"i9", 10, 5,
"i4", 5, 4,
"i6", 7, 8,
"i8", 9, 4,
"i9", 10, 5,
NA, NA, NA,
"i1", 2, 5,
"i3", 4, 8,
"i2", 3, NA,
"i1", 2, 1
)
Thanks in advance for any help!
Upvotes: 3
Views: 610
Reputation: 887621
An option using melt
from data.table
library(data.table)
melt(setDT(input), id.var = 'ID')[, .(first_o_name = first(variable[value == 0]),
first_o_loc = which(value == 0)[1] +1,
streak_1 = sum(cumsum(c(TRUE, diff(value == 0) < 0)) == 2) - 1 ), ID
][streak_1 < 0, streak_1 := NA_real_][]
A base R
option can also be with apply
and rle
do.call(rbind, apply(input[-1], 1, function(x) {
first_o_loc <- unname(which(x == 0)[1] + 1)
first_o_name <- names(x)[first_o_loc-1]
rl <- rle(x)
rl1 <- within.list(rl, {
i1 <- cumsum(values == 0) == 1
values <- values[i1]
lengths <- lengths[i1]})
streak_1 <- unname(rl1$lengths[2])
data.frame(first_o_name, first_o_loc, streak_1)}))
# first_o_name first_o_loc streak_1
#1 i9 10 5
#2 i4 5 4
#3 i6 7 8
#4 i8 9 4
#5 i9 10 5
#6 <NA> NA NA
#7 i1 2 5
#8 i3 4 8
#9 i2 3 NA
#10 i3 4 2
Upvotes: 0
Reputation: 66835
Edit #2: Rewrote as combination of two summarizations.
input_tidy <- input %>%
gather(col, val, -ID) %>%
group_by(ID) %>%
arrange(ID) %>%
mutate(col_num = row_number() + 1)
input[,1] %>%
# Combine with summary of each ID's first zero
left_join(input_tidy %>% filter(val == 0) %>%
summarize(first_0_name = first(col),
first_0_loc = first(col_num))) %>%
# Combine with length of each ID's first post-0 streak of 1's
left_join(input_tidy %>%
filter(val == 1 & cumsum(val == 1 & lag(val, default = 1) == 0) == 1) %>%
summarize(streak_1 = n()))
# A tibble: 10 x 4
ID first_0_name first_0_loc streak_1
<chr> <chr> <dbl> <int>
1 A i9 10 5
2 B i4 5 4
3 C i6 7 8
4 D i8 9 4
5 E i9 10 5
6 F NA NA NA
7 G i1 2 5
8 H i3 4 8
9 I i2 3 NA
10 J i3 4 2
Upvotes: 0
Reputation: 11957
If you wanted to use base R a little more directly and avoid the cost of transforming the whole data frame. This solution also retains the order of rows without having to create extra ordering columns (unlike the tidyverse solution).
results <- apply(input, 1, function(x) {
# get indices of all zeros
zeros <- which(x == 0)
# exit early if no zeros are found
if (length(zeros) == 0) {
return(data.frame(first_0_name = NA, first_0_loc = NA, streak_1 = NA))
}
first.name <- names(zeros[1]) # name of first 0 column
first.idx <- zeros[1] # location of first zero
longest.streak <- diff(zeros)[1] - 1 # length of first 0-0 streak
return(data.frame(first_0_name = first.name,
first_0_loc = first.idx,
streak_1 = ifelse(longest.streak == 0, NA, longest.streak))
)
})
output <- do.call(rbind, results)
first_0_name first_0_loc streak_1
i9 i9 10 5
i4 i4 5 4
i6 i6 7 8
i8 i8 9 NA
i91 i9 10 5
1 <NA> NA NA
i1 i1 2 5
i3 i3 4 8
i2 i2 3 NA
i31 i3 4 2
Upvotes: 1