Cenlle
Cenlle

Reputation: 37

Compare value to previous n values in dataframe

Given a dataframe like this:

# Example data: eight rows, each with an ID and a fruit name.
df <- data.frame(
  ID = 1:8,
  fruit = c(
    "apple", "orange", "kiwi", "pear",
    "orange", "kiwi", "apple", "apple"
  )
)

  ID  fruit
1  1  apple
2  2 orange
3  3   kiwi
4  4   pear
5  5 orange
6  6   kiwi
7  7  apple
8  8  apple

How can I create a new column that identifies whether a value has appeared in the previous n rows? For example, if n = 3, I would like an output like this:

  ID  fruit previous_3
1  1  apple      FALSE
2  2 orange      FALSE
3  3   kiwi      FALSE
4  4   pear      FALSE
5  5 orange       TRUE
6  6   kiwi       TRUE
7  7  apple      FALSE
8  8  apple       TRUE

Upvotes: 1

Views: 123

Answers (6)

Darren Tsai
Darren Tsai

Reputation: 35554

You can use rollapplyr() from the zoo package.
(rollapplyr() is a wrapper around rollapply() that defaults to align = "right".)

Custom function

# Flag each element of `x` that also occurs among the `n` elements before it.
#
# x: vector of values to scan.
# n: number of preceding elements to compare against.
# Returns one logical value per element of `x`.
prev_n <- function(x, n) {
  # Each right-aligned window of width n + 1 ends at the current element, so
  # duplicated(window)[length(window)] is TRUE exactly when the current
  # element matches an earlier element of that window. partial = TRUE lets
  # the first few positions use shorter windows.
  zoo::rollapplyr(
    x,
    n + 1,
    function(window) duplicated(window)[length(window)],
    partial = TRUE
  )
}

Scenario 1

library(dplyr)

df <- data.frame(
  ID = 1:8,
  fruit = c(
    "apple", "orange", "kiwi", "pear",
    "orange", "kiwi", "apple", "apple"
  )
)

# Add the flag column for a look-back of three rows and print the result.
mutate(df, previous_3 = prev_n(fruit, 3))

#   ID  fruit previous_3
# 1  1  apple      FALSE
# 2  2 orange      FALSE
# 3  3   kiwi      FALSE
# 4  4   pear      FALSE
# 5  5 orange       TRUE
# 6  6   kiwi       TRUE
# 7  7  apple      FALSE
# 8  8  apple       TRUE

Scenario 2

# Every row holds the same value, so each row after the first is flagged.
df2 <- data.frame(ID = 1:8, fruit = rep("apple", times = 8))
mutate(df2, previous_3 = prev_n(fruit, 3))

#   ID fruit previous_3
# 1  1 apple      FALSE
# 2  2 apple       TRUE
# 3  3 apple       TRUE
# 4  4 apple       TRUE
# 5  5 apple       TRUE
# 6  6 apple       TRUE
# 7  7 apple       TRUE
# 8  8 apple       TRUE

Upvotes: 1

hello_friend
hello_friend

Reputation: 5788

Base R solution:

# Flag each row whose fruit also appears in the (up to) 3 rows before it.
df$previous_3 <- vapply(seq_len(nrow(df)), function(i) {
  # Indices i-1, i-2, i-3, truncated near the top of the data frame
  # (empty for the first row).
  look_back <- i - seq_len(min(3L, i - 1L))
  # Test only the current value: any(duplicated(window)) would also fire
  # when two of the *earlier* rows repeat each other (e.g. a, a, b).
  df$fruit[i] %in% df$fruit[look_back]
}, logical(1))

Upvotes: 1

Jingxin Zhang
Jingxin Zhang

Reputation: 244

library(dplyr)   # %>%, mutate(), lag()
library(purrr)   # map2_lgl()
library(slider)  # slide()

# slide() over lag(fruit) with .before = 2 collects, for each row, the values
# of the three preceding rows (padded with NA at the top); map2_lgl() then
# checks whether the current fruit is among them.
# NOTE: because of the NA padding, an NA in one of the first rows of `fruit`
# would be reported as TRUE.
df %>%
  mutate(
    previous_3 = map2_lgl(
      fruit,
      slide(lag(fruit), ~ .x, .before = 2),
      ~ .x %in% .y
    )
  )

# # A tibble: 8 x 3
#      ID fruit  previous_3
#   <int> <chr>  <lgl>     
# 1     1 apple  FALSE     
# 2     2 orange FALSE     
# 3     3 kiwi   FALSE     
# 4     4 pear   FALSE     
# 5     5 orange TRUE      
# 6     6 kiwi   TRUE      
# 7     7 apple  FALSE     
# 8     8 apple  TRUE   

Upvotes: 1

akrun
akrun

Reputation: 886998

An option with shift from data.table

library(data.table)    
# Flag each row of `dat` whose value in column `colnm` also occurs in one of
# the `n` rows before it.
#
# dat:   data.frame / data.table holding the column.
# colnm: name of the column to inspect (character scalar).
# n:     number of preceding rows to compare against.
# Returns a logical vector with one element per row of `dat`.
f_prev_n <- function(dat, colnm, n) {
  # Read from the `dat` argument, not from a global `df`.
  vals <- dat[[colnm]]
  lagged <- data.table::shift(vals, n = seq_len(n))
  # shift() returns a bare vector (not a list) when a single lag is requested
  # on a vector; wrap it so each lag is always handled as one whole column.
  if (!is.list(lagged)) {
    lagged <- list(lagged)
  }
  # `%in% TRUE` turns the NA comparisons (shift padding, missing values) into
  # FALSE, matching the na.rm = TRUE treatment of a row sum.
  hits <- lapply(lagged, function(prev) (vals == prev) %in% TRUE)
  # A row is flagged when any of its lags matched.
  Reduce(`|`, hits, logical(length(vals)))
}
# Look back over the 3 preceding rows.
df[["previous_3"]] <- f_prev_n(df, "fruit", 3)
df[["previous_3"]]
#[1] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE

# Look back over the 6 preceding rows.
df[["previous_6"]] <- f_prev_n(df, "fruit", 6)
df[["previous_6"]]
#[1] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE

Upvotes: 2

datalowe
datalowe

Reputation: 599

We can define a function like this (vec is the vector of values, n is the number of elements to look back).

# For each element of `vec`, report whether the same value occurs among the
# `n` elements immediately before it.
#
# vec: vector of values to scan.
# n:   number of preceding elements to look back over.
# Returns a logical vector of the same length as `vec` (empty for empty input).
repeated_previously <- function(vec, n) {
  vec_len <- length(vec)
  # preallocate the result instead of growing it on every iteration
  bool_vec <- logical(vec_len)
  # seq_len() yields no iterations for an empty vector (1:0 would give 1, 0)
  for (vec_ind in seq_len(vec_len)) {
    # near the start of the vector fewer than n predecessors exist
    look_back <- min(n, vec_ind - 1L)
    # indices vec_ind - 1, vec_ind - 2, ...; empty when look_back is 0, so the
    # window never wraps around or picks up NA padding (also for n = 1)
    prev_ind <- vec_ind - seq_len(look_back)
    bool_vec[vec_ind] <- vec[vec_ind] %in% vec[prev_ind]
  }
  bool_vec
}

Now we can use the function.

df <- data.frame(
  ID = 1:8,
  fruit = c(
    "apple", "orange", "kiwi", "pear",
    "orange", "kiwi", "apple", "apple"
  )
)
# One flag column per look-back distance.
df[["previous_3"]] <- repeated_previously(df[["fruit"]], 3)
df[["previous_6"]] <- repeated_previously(df[["fruit"]], 6)

Here's the output.

> print(df)
  ID  fruit previous_3 previous_6
1  1  apple      FALSE      FALSE
2  2 orange      FALSE      FALSE
3  3   kiwi      FALSE      FALSE
4  4   pear      FALSE      FALSE
5  5 orange       TRUE       TRUE
6  6   kiwi       TRUE       TRUE
7  7  apple      FALSE       TRUE
8  8  apple       TRUE       TRUE

Upvotes: 1

Jamie_B
Jamie_B

Reputation: 299

This should work in base R with a simple loop that goes through each row and checks the previous n rows. n is currently set to 3 but can be changed.

df <- data.frame(
  ID = seq(1, 8),
  fruit = c(
    "apple", "orange", "kiwi", "pear",
    "orange", "kiwi", "apple", "apple"
  ),
  stringsAsFactors = FALSE
)

# Create column to store TRUE/FALSE values
df$previous_n <- FALSE

# How many preceding rows to look back over (assumed to be at least 1)
n <- 3

# Loop through the rows of df, starting at the second one: the first row has
# no predecessors and keeps its FALSE
for (i in seq_len(nrow(df))[-1]) {
  # First row of the look-back window, clamped to the top of the data frame
  start_value <- max(1, i - n)
  # Identify the current value in the row
  c_value <- df$fruit[i]

  # Identify the previous values. The parentheses matter: `start_value:i - 1`
  # parses as `(start_value:i) - 1`, which shifts the whole window down by one
  # and so looks back n + 1 rows once i > n + 1.
  previous_values <- df$fruit[start_value:(i - 1)]
  # Check if the current value is among the previous values, and store it
  df$previous_n[i] <- c_value %in% previous_values
}

df
 ID  fruit previous_n
1  1  apple      FALSE
2  2 orange      FALSE
3  3   kiwi      FALSE
4  4   pear      FALSE
5  5 orange       TRUE
6  6   kiwi       TRUE
7  7  apple      FALSE
8  8  apple       TRUE

Upvotes: 1

Related Questions