Reputation: 37
Given a dataframe like this:
# Example data: eight rows, an ID column and a fruit label with repeats.
df <- data.frame(
  ID = seq(1, 8),
  fruit = c(
    "apple", "orange", "kiwi", "pear",
    "orange", "kiwi", "apple", "apple"
  )
)
ID fruit
1 1 apple
2 2 orange
3 3 kiwi
4 4 pear
5 5 orange
6 6 kiwi
7 7 apple
8 8 apple
How can I create a new column that identifies whether a value has appeared in the previous n rows? For example, if n = 3, I would like an output like this:
ID fruit previous_3
1 1 apple FALSE
2 2 orange FALSE
3 3 kiwi FALSE
4 4 pear FALSE
5 5 orange TRUE
6 6 kiwi TRUE
7 7 apple FALSE
8 8 apple TRUE
Upvotes: 1
Views: 123
Reputation: 35554
You can use rollapplyr() from the zoo package. (rollapplyr() is a wrapper around rollapply() that uses a default of align = "right".)
Custom function
# Flag, for each element of x, whether the same value occurs among the
# (up to) n elements immediately before it.
#
# x: vector of values to scan.
# n: number of preceding elements to look back over.
#
# Returns one TRUE/FALSE per element of x.
prev_n <- function(x, n) {
  zoo::rollapplyr(
    x,
    # The window holds the current element plus the n elements before it.
    width = n + 1,
    # duplicated() is TRUE for an element that equals an earlier one in the
    # window, so the flag of the last (current) element is the answer.
    FUN = function(window) duplicated(window)[length(window)],
    # partial = TRUE lets the first n elements use shorter windows.
    partial = TRUE
  )
}
Scenario 1
library(dplyr)

# Rebuild the example data from the question.
df <- data.frame(
  ID = seq(1, 8),
  fruit = c(
    "apple", "orange", "kiwi", "pear",
    "orange", "kiwi", "apple", "apple"
  )
)

# Add the flag column using a look-back of 3 rows and print the result.
df %>%
  mutate(previous_3 = prev_n(fruit, 3))
# ID fruit previous_3
# 1 1 apple FALSE
# 2 2 orange FALSE
# 3 3 kiwi FALSE
# 4 4 pear FALSE
# 5 5 orange TRUE
# 6 6 kiwi TRUE
# 7 7 apple FALSE
# 8 8 apple TRUE
Scenario 2
# Edge case: every row holds the same value.
df2 <- data.frame(
  ID = seq(1, 8),
  fruit = rep("apple", 8)
)

# Only the first row has no earlier match; print the flagged data.
df2 %>%
  mutate(previous_3 = prev_n(fruit, 3))
# ID fruit previous_3
# 1 1 apple FALSE
# 2 2 apple TRUE
# 3 3 apple TRUE
# 4 4 apple TRUE
# 5 5 apple TRUE
# 6 6 apple TRUE
# 7 7 apple TRUE
# 8 8 apple TRUE
Upvotes: 1
Reputation: 5788
Base R solution:
# Flag rows whose fruit also appears in the (up to) 3 rows before them.
df$previous_3 <- vapply(
  seq_len(nrow(df)),
  function(i) {
    # Window = the current row plus at most 3 preceding rows; the start is
    # clamped to row 1 for the first few rows.
    window <- df$fruit[max(1L, i - 3L):i]
    # duplicated() marks an element TRUE when an equal value occurs earlier
    # in the window. Only the flag of the last element (row i itself) is
    # wanted: any(duplicated(window)) would also fire when two *other* rows
    # in the window happen to match each other.
    duplicated(window)[length(window)]
  },
  logical(1)
)
Upvotes: 1
Reputation: 244
# dplyr supplies %>%, mutate() and lag(); it must be attached so that lag()
# is dplyr's vector lag and not stats::lag().
library(dplyr)
# purrr supplies map2_lgl().
library(purrr)
library(slider)

df %>%
  mutate(
    previous_3 = map2_lgl(
      fruit,
      # For each row: the lagged value at that row plus the two lagged values
      # before it, i.e. the fruits of the 3 preceding rows (NA where there is
      # no earlier row).
      slide(lag(fruit), ~ .x, .before = 2),
      # Is the current fruit among those preceding values?
      ~ .x %in% .y
    )
  )
# # A tibble: 8 x 3
# ID fruit previous_3
# <int> <chr> <lgl>
# 1 1 apple FALSE
# 2 2 orange FALSE
# 3 3 kiwi FALSE
# 4 4 pear FALSE
# 5 5 orange TRUE
# 6 6 kiwi TRUE
# 7 7 apple FALSE
# 8 8 apple TRUE
Upvotes: 1
Reputation: 886998
An option with shift() from data.table:
library(data.table)
# Flag, for each row of dat, whether the value in column colnm also occurs
# in any of the n rows immediately before it.
#
# dat:   data frame (or list-like) holding the column.
# colnm: name of the column to check, as a string.
# n:     number of preceding rows to look back over.
#
# Returns a logical vector with one element per row of dat.
f_prev_n <- function(dat, colnm, n) {
  # Read the column from the argument, not from a global object.
  x <- dat[[colnm]]
  if (n < 1L) {
    # Nothing to look back over.
    return(rep(FALSE, length(x)))
  }
  # One lagged copy of the column per look-back distance 1..n.
  lagged <- data.table::shift(x, n = seq_len(n))
  # shift() returns a bare vector instead of a list when n has length 1.
  if (!is.list(lagged)) {
    lagged <- list(lagged)
  }
  # Compare the column with each lagged copy; `%in% TRUE` turns the NA
  # produced by the padded leading positions into FALSE.
  hits <- lapply(lagged, function(prev) (x == prev) %in% TRUE)
  # A row is flagged when it matches at any look-back distance.
  Reduce(`|`, hits, logical(length(x)))
}
# Look back 3 rows, store the flags as a new column, and show them.
df[["previous_3"]] <- f_prev_n(dat = df, colnm = "fruit", n = 3)
df[["previous_3"]]
#[1] FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE
# Same check with a 6-row look-back.
df[["previous_6"]] <- f_prev_n(dat = df, colnm = "fruit", n = 6)
df[["previous_6"]]
#[1] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
Upvotes: 2
Reputation: 599
We can define a function like this (vec is the vector of values, n is the number of elements to look back).
# Flag, for each element of vec, whether the same value occurs among the
# (up to) n elements immediately before it.
#
# vec: vector of values to scan.
# n:   number of preceding elements to look back over; a single
#      non-negative number (fractions are rounded down).
#
# Returns a logical vector the same length as vec.
repeated_previously <- function(vec, n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)
  n <- floor(n)
  vec_len <- length(vec)
  # preallocate the vector that will hold the TRUE/FALSE values
  bool_vec <- logical(vec_len)
  # for each index in 1 up to the length of the vector (no iterations when
  # the vector is empty)
  for (vec_ind in seq_len(vec_len)) {
    # how many previous positions are available, capped at n
    n_prev <- min(n, vec_ind - 1L)
    # indices of those previous positions; seq.int() with length.out = 0
    # yields an empty index, which covers the first element and n = 0
    prev_idx <- seq.int(vec_ind - n_prev, length.out = n_prev)
    # TRUE if the current value has already occurred in that window
    bool_vec[vec_ind] <- vec[vec_ind] %in% vec[prev_idx]
  }
  bool_vec
}
Now we can use the function.
# Example data from the question.
df <- data.frame(
  ID = seq(1, 8),
  fruit = c(
    "apple", "orange", "kiwi", "pear",
    "orange", "kiwi", "apple", "apple"
  )
)
# Add one flag column per look-back distance.
df[["previous_3"]] <- repeated_previously(vec = df[["fruit"]], n = 3)
df[["previous_6"]] <- repeated_previously(vec = df[["fruit"]], n = 6)
Here's the output.
> print(df)
ID fruit previous_3 previous_6
1 1 apple FALSE FALSE
2 2 orange FALSE FALSE
3 3 kiwi FALSE FALSE
4 4 pear FALSE FALSE
5 5 orange TRUE TRUE
6 6 kiwi TRUE TRUE
7 7 apple FALSE TRUE
8 8 apple TRUE TRUE
Upvotes: 1
Reputation: 299
This should work in base R with a simple loop that goes through each row and checks the previous n rows; n is currently set to 3 but can be changed.
# Example data; fruit is kept as character so %in% compares plain strings.
df <- data.frame(
  ID = seq(1, 8),
  fruit = c(
    "apple", "orange", "kiwi", "pear",
    "orange", "kiwi", "apple", "apple"
  ),
  stringsAsFactors = FALSE
)
# Create column to store TRUE/FALSE values
df$previous_n <- FALSE
# Define how far to look back (expected to be at least 1)
n <- 3
# Loop through each row of dataframe df
for (i in seq_len(nrow(df))) {
  # Only calculate for the second row on; row 1 has no previous rows and
  # keeps the default FALSE
  if (i > 1) {
    # Find the starting index for the previous values, clamped to row 1
    start_value <- max(1, i - n)
    # Identify the current value in the row
    c_value <- df$fruit[i]
    # Identify the previous values. The parentheses matter: `:` binds
    # tighter than `-`, so `start_value:i - 1` would shift the whole range
    # down by one and reach back n + 1 rows.
    previous_values <- df$fruit[start_value:(i - 1)]
    # Check if the current value is among the previous values, and store it
    df$previous_n[i] <- c_value %in% previous_values
  }
}
df
ID fruit previous_n
1 1 apple FALSE
2 2 orange FALSE
3 3 kiwi FALSE
4 4 pear FALSE
5 5 orange TRUE
6 6 kiwi TRUE
7 7 apple FALSE
8 8 apple TRUE
Upvotes: 1