Docconcoct
Docconcoct

Reputation: 2050

Find first example of unique values and return row number

I have this data frame:

# Reproducible example data: a 79-row tibble with two character columns
# (Name, StimulusName) and an integer Fixation column that contains NAs.
df <- structure(list(Name = c("Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
                          "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
                          "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", 
                          "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", 
                          "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2"), 
                 StimulusName = c("Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", 
                                  "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", 
                                  "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", 
                                  "Stim2", "Stim2", "Stim2", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", 
                                  "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim2", 
                                  "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", 
                                  "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2"), 
                 Fixation = c(NA, NA, 1L, 1L, NA, NA, 2L, 2L, 3L, 3L, NA, NA, NA, NA, NA, 4L, 4L, 5L, 5L, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
                              2L, NA, NA, NA, 3L, 3L, 3L, NA, NA, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 2L, 2L, NA, NA, 3L, 3L, 3L, 4L, 4L, 4L, NA, NA, 1L, 1L, NA, 
                              NA, 2L, 2L, 3L, 3L, NA, NA, NA, NA, NA, 4L, 4L, 5L, 5L, NA)), 
            row.names = c(NA, -79L), class = c("tbl_df", "tbl", "data.frame"))

There are 3 columns: Name, StimulusName, and Fixation.

I'd like to return the row number of the first occurrence of each unique value in column Fixation, computed separately within each group defined by Name and StimulusName.

Here's what I've tried so far (based on a partial solution found elsewhere):

# Return the positions at which each distinct value first appears.
#
# Values: a vector; NA is treated as a value in its own right.
# Returns an integer vector of 1-based positions in increasing order.
Unique_Indices <- function(Values) {
  which(!duplicated(Values))
}

But when I use it in a dplyr chain, it doesn't return the original row numbers; instead, the row count restarts within each group:

library(tidyr)

# This doesn't work: summarise() calls Unique_Indices() once per
# (Name, StimulusName) group, so the function only ever sees that group's
# slice of Fixation and the positions it returns restart at 1 in every group.
# NOTE(review): group_by()/summarise() are dplyr functions, but only tidyr is
# loaded above — presumably dplyr is already attached; confirm.
Unique_Index <- df %>%
  group_by(Name, StimulusName) %>%
  summarise(Indices = list(Unique_Indices(Fixation))) %>%
  unnest()

The incorrect output looks like this:

enter image description here

You can see that Indices doesn't contain the original row numbers once it moves on to the next StimulusName, due to the group_by instruction. Is there any way I can group_by as I desire while retaining the original row numbers from df?

Upvotes: 2

Views: 1514

Answers (3)

akrun
akrun

Reputation: 887148

Here is an option with base R

# !duplicated(df) flags the first occurrence of every distinct
# (Name, StimulusName, Fixation) row. Keep the two grouping columns for those
# rows and attach their original row numbers under the name "Fixation".
cbind(
  df[!duplicated(df), c("Name", "StimulusName")],
  Fixation = which(!duplicated(df))
)
#   Name StimulusName Fixation
#1  Sub1        Stim1        1
#2  Sub1        Stim1        3
#3  Sub1        Stim1        7
#4  Sub1        Stim1        9
#5  Sub1        Stim1       16
#6  Sub1        Stim1       18
#7  Sub1        Stim2       20
#8  Sub1        Stim2       24
#9  Sub1        Stim2       28
#10 Sub1        Stim2       37
#11 Sub2        Stim1       40
#12 Sub2        Stim1       46
#13 Sub2        Stim1       50
#14 Sub2        Stim1       54
#15 Sub2        Stim1       57
#16 Sub2        Stim2       60
#17 Sub2        Stim2       62
#18 Sub2        Stim2       66
#19 Sub2        Stim2       68
#20 Sub2        Stim2       75
#21 Sub2        Stim2       77

Upvotes: 0

kath
kath

Reputation: 7724

You can first turn the row names into a proper column to keep the original indices, and then filter directly for the non-duplicated values of Fixation within each group.

library(dplyr)
library(tibble)

# rownames_to_column() stores the row names in a "rowname" column before
# grouping, so each surviving row still carries its position in df.
df %>% 
  rownames_to_column() %>% 
  group_by(Name, StimulusName) %>%
  # keep only the first row for each distinct Fixation value (NA included)
  # within a group
  filter(!duplicated(Fixation))

# A tibble: 21 x 4
# Groups:   Name, StimulusName [4]
#    rowname Name  StimulusName Fixation
#    <chr>   <chr> <chr>           <int>
#  1 1       Sub1  Stim1              NA
#  2 3       Sub1  Stim1               1
#  3 7       Sub1  Stim1               2
#  4 9       Sub1  Stim1               3
#  5 16      Sub1  Stim1               4
#  6 18      Sub1  Stim1               5
#  7 20      Sub1  Stim2              NA
#  8 24      Sub1  Stim2               1
#  9 28      Sub1  Stim2               2
# 10 37      Sub1  Stim2               3
# ... with 11 more rows

With Ronak Shah's suggestion a dplyr-only solution can look like this:

# row_number() is evaluated before group_by(), so Index records each row's
# position in the whole data frame rather than within its group.
df %>% 
  mutate(Index = row_number()) %>% 
  group_by(Name, StimulusName) %>%
  # keep only the first row for each distinct Fixation value within a group
  filter(!duplicated(Fixation))

Upvotes: 3

IceCreamToucan
IceCreamToucan

Reputation: 28685

In data.table there is a variable .I, which is the row number, so you can just subset .I. Indices in my output should be the same as rowname in the output from the code in @kath's comment.

library(data.table)
# Convert df to a data.table by reference so that .I can be used below.
setDT(df)

# .I is the row position within the whole table, so the indices kept for each
# (Name, StimulusName) group are the original row numbers.
df[,
   list(Indices = .I[!duplicated(Fixation)]),
   by = list(Name, StimulusName)]


#     Name StimulusName Indices
#  1: Sub1        Stim1       1
#  2: Sub1        Stim1       3
#  3: Sub1        Stim1       7
#  4: Sub1        Stim1       9
#  5: Sub1        Stim1      16
#  6: Sub1        Stim1      18
#  7: Sub1        Stim2      20
#  8: Sub1        Stim2      24
#  9: Sub1        Stim2      28
# 10: Sub1        Stim2      37
# 11: Sub2        Stim1      40
# 12: Sub2        Stim1      46
# 13: Sub2        Stim1      50
# 14: Sub2        Stim1      54
# 15: Sub2        Stim1      57
# 16: Sub2        Stim2      60
# 17: Sub2        Stim2      62
# 18: Sub2        Stim2      66
# 19: Sub2        Stim2      68
# 20: Sub2        Stim2      75
# 21: Sub2        Stim2      77
#     Name StimulusName Indices

Upvotes: 3

Related Questions